Anki/anki/find.py
Damien Elmes 9c247f45bd remove q/a cache, tags in fields, rewrite remaining ids, more
Anki used random 64bit IDs for cards, facts and fields. This had some nice
properties:
- merging data in syncs and imports was simply a matter of copying each way,
  as conflicts were astronomically unlikely
- it made it easy to identify identical cards and prevent them from being
  reimported
But there were some negatives too:
- they're more expensive to store
- javascript can't handle numbers > 2**53, which means AnkiMobile, iAnki and
  so on have to treat the ids as strings, which is slow
- simply copying data in a sync or import can lead to corruption, as while a
  duplicate id indicates the data was originally the same, it may have
  diverged. A more intelligent approach is necessary.
- sqlite was sorting the fields table based on the id, which meant the fields
  were spread across the table, and costly to fetch

So instead, we'll move to incremental ids. In the case of model changes we'll
declare that a schema change and force a full sync to avoid having to deal
with conflicts, and in the case of cards and facts, we'll need to update the
ids on one end to merge. Identical cards can be detected by checking to see if
their id is the same and their creation time is the same.

Creation time has been added back to cards and facts because it's necessary
for sync conflict merging. That means facts.pos is not required.

The graves table has been removed. It's not necessary for schema related
changes, and dead cards/facts can be represented as a card with queue=-4 and
created=0. Because we will record schema modification time and can ensure a
full sync propagates to all endpoints, it means we can remove the dead
cards/facts on schema change.

Tags have been removed from the facts table and are represented as a field
with ord=-1 and fmid=0. Combined with the locality improvement for fields, it
means that fetching fields is not much more expensive than using the q/a
cache.

Because of the above, removing the q/a cache is a possibility now. The q and a
columns on cards has been dropped. It will still be necessary to render the
q/a on fact add/edit, since we need to record media references. It would be
nice to avoid this in the future. Perhaps one way would be the ability to
assign a type to fields, like "image", "audio", or "latex". LaTeX needs
special consider anyway, as it was being rendered into the q/a cache.
2011-04-28 09:23:53 +09:00

688 lines
26 KiB
Python

# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
# A lot of findCards() and related functions was contributed by
# Marcus.
import re
from anki.utils import ids2str
SEARCH_TAG = 0
SEARCH_TYPE = 1
SEARCH_PHRASE = 2
SEARCH_FID = 3
SEARCH_CARD = 4
SEARCH_DISTINCT = 5
SEARCH_FIELD = 6
SEARCH_FIELD_EXISTS = 7
SEARCH_QA = 8
SEARCH_PHRASE_WB = 9
# Find
##########################################################################
def findCards(deck, query):
(q, cmquery, showdistinct, filters, args) = findCardsWhere(deck, query)
(fidList, cardIdList) = findCardsMatchingFilters(deck, filters)
query = "select id from cards"
hasWhere = False
if q:
query += " where " + q
hasWhere = True
if cmquery['pos'] or cmquery['neg']:
if hasWhere is False:
query += " where "
hasWhere = True
else: query += " and "
if cmquery['pos']:
query += (" fid in(select distinct fid from cards "+
"where id in (" + cmquery['pos'] + ")) ")
query += " and id in(" + cmquery['pos'] + ") "
if cmquery['neg']:
query += (" fid not in(select distinct fid from "+
"cards where id in (" + cmquery['neg'] + ")) ")
if fidList is not None:
if hasWhere is False:
query += " where "
hasWhere = True
else: query += " and "
query += " fid IN %s" % ids2str(fidList)
if cardIdList is not None:
if hasWhere is False:
query += " where "
hasWhere = True
else: query += " and "
query += " id IN %s" % ids2str(cardIdList)
if showdistinct:
query += " group by fid"
#print query, args
return deck.db.list(query, **args)
def findCardsWhere(deck, query):
(tquery, fquery, qquery, fidquery, cmquery, sfquery, qaquery,
showdistinct, filters, args) = _findCards(deck, query)
q = ""
x = []
if tquery:
x.append(" fid in (%s)" % tquery)
if fquery:
x.append(" fid in (%s)" % fquery)
if qquery:
x.append(" id in (%s)" % qquery)
if fidquery:
x.append(" id in (%s)" % fidquery)
if sfquery:
x.append(" fid in (%s)" % sfquery)
if qaquery:
x.append(" id in (%s)" % qaquery)
if x:
q += " and ".join(x)
return q, cmquery, showdistinct, filters, args
def allFMFields(deck, tolower=False):
fields = []
try:
fields = deck.db.list(
"select distinct name from fieldmodels order by name")
except:
fields = []
if tolower is True:
for i, v in enumerate(fields):
fields[i] = v.lower()
return fields
def _parseQuery(deck, query):
tokens = []
res = []
allowedfields = allFMFields(deck, True)
def addSearchFieldToken(field, value, isNeg, filter):
if field.lower() in allowedfields:
res.append((field + ':' + value, isNeg, SEARCH_FIELD, filter))
elif field in ['question', 'answer']:
res.append((field + ':' + value, isNeg, SEARCH_QA, filter))
else:
for p in phraselog:
res.append((p['value'], p['is_neg'], p['type'], p['filter']))
# break query into words or phraselog
# an extra space is added so the loop never ends in the middle
# completing a token
for match in re.findall(
r'(-)?\'(([^\'\\]|\\.)*)\'|(-)?"(([^"\\]|\\.)*)"|(-)?([^ ]+)|([ ]+)',
query + ' '):
type = ' '
if match[1]: type = "'"
elif match[4]: type = '"'
value = (match[1] or match[4] or match[7])
isNeg = (match[0] == '-' or match[3] == '-' or match[6] == '-')
tokens.append({'type': type, 'value': value, 'is_neg': isNeg,
'filter': ('wb' if type == "'" else 'none')})
intoken = isNeg = False
field = '' #name of the field for field related commands
phraselog = [] #log of phrases in case potential command is not a commad
for c, token in enumerate(tokens):
doprocess = True # only look for commands when this is true
#prevent cases such as "field" : value as being processed as a command
if len(token['value']) == 0:
if intoken is True and type == SEARCH_FIELD and field:
#case: fieldname: any thing here check for existance of fieldname
addSearchFieldToken(field, '*', isNeg, 'none')
phraselog = [] # reset phrases since command is completed
intoken = doprocess = False
if intoken is True:
if type == SEARCH_FIELD_EXISTS:
#case: field:"value"
res.append((token['value'], isNeg, type, 'none'))
intoken = doprocess = False
elif type == SEARCH_FIELD and field:
#case: fieldname:"value"
addSearchFieldToken(
field, token['value'], isNeg, token['filter'])
intoken = doprocess = False
elif type == SEARCH_FIELD and not field:
#case: "fieldname":"name" or "field" anything
if token['value'].startswith(":") and len(phraselog) == 1:
#we now know a colon is next, so mark it as field
# and keep looking for the value
field = phraselog[0]['value']
parts = token['value'].split(':', 1)
phraselog.append(
{'value': token['value'], 'is_neg': False,
'type': SEARCH_PHRASE, 'filter': token['filter']})
if parts[1]:
#value is included with the :, so wrap it up
addSearchFieldToken(field, parts[1], isNeg, 'none')
intoken = doprocess = False
doprocess = False
else:
#case: "fieldname"string/"fieldname"tag:name
intoken = False
if intoken is False and doprocess is False:
#command has been fully processed
phraselog = [] # reset phraselog, since we used it for a command
if intoken is False:
#include any non-command related phrases in the query
for p in phraselog: res.append(
(p['value'], p['is_neg'], p['type'], p['filter']))
phraselog = []
if intoken is False and doprocess is True:
field = ''
isNeg = token['is_neg']
if token['value'].startswith("tag:"):
token['value'] = token['value'][4:]
type = SEARCH_TAG
elif token['value'].startswith("is:"):
token['value'] = token['value'][3:].lower()
type = SEARCH_TYPE
elif token['value'].startswith("fid:") and len(token['value']) > 4:
dec = token['value'][4:]
try:
int(dec)
token['value'] = token['value'][4:]
except:
try:
for d in dec.split(","):
int(d)
token['value'] = token['value'][4:]
except:
token['value'] = "0"
type = SEARCH_FID
elif token['value'].startswith("card:"):
token['value'] = token['value'][5:]
type = SEARCH_CARD
elif token['value'].startswith("show:"):
token['value'] = token['value'][5:].lower()
type = SEARCH_DISTINCT
elif token['value'].startswith("field:"):
type = SEARCH_FIELD_EXISTS
parts = token['value'][6:].split(':', 1)
field = parts[0]
if len(parts) == 1 and parts[0]:
token['value'] = parts[0]
elif len(parts) == 1 and not parts[0]:
intoken = True
else:
type = SEARCH_FIELD
intoken = True
parts = token['value'].split(':', 1)
phraselog.append(
{'value': token['value'], 'is_neg': isNeg,
'type': SEARCH_PHRASE, 'filter': token['filter']})
if len(parts) == 2 and parts[0]:
field = parts[0]
if parts[1]:
#simple fieldname:value case - no need to look for more data
addSearchFieldToken(field, parts[1], isNeg, 'none')
intoken = doprocess = False
if intoken is False: phraselog = []
if intoken is False and doprocess is True:
res.append((token['value'], isNeg, type, token['filter']))
return res
def findCardsMatchingFilters(deck, filters):
factFilters = []
fieldFilters = {}
cardFilters = {}
factFilterMatches = []
fieldFilterMatches = []
cardFilterMatches = []
if (len(filters) > 0):
for filter in filters:
if filter['scope'] == 'fact':
regexp = re.compile(
r'\b' + re.escape(filter['value']) + r'\b', flags=re.I)
factFilters.append(
{'value': filter['value'], 'regexp': regexp,
'is_neg': filter['is_neg']})
if filter['scope'] == 'field':
fieldName = filter['field'].lower()
if (fieldName in fieldFilters) is False:
fieldFilters[fieldName] = []
regexp = re.compile(
r'\b' + re.escape(filter['value']) + r'\b', flags=re.I)
fieldFilters[fieldName].append(
{'value': filter['value'], 'regexp': regexp,
'is_neg': filter['is_neg']})
if filter['scope'] == 'card':
fieldName = filter['field'].lower()
if (fieldName in cardFilters) is False:
cardFilters[fieldName] = []
regexp = re.compile(r'\b' + re.escape(filter['value']) +
r'\b', flags=re.I)
cardFilters[fieldName].append(
{'value': filter['value'], 'regexp': regexp,
'is_neg': filter['is_neg']})
if len(factFilters) > 0:
fquery = ''
args = {}
for filter in factFilters:
c = len(args)
if fquery:
if filter['is_neg']: fquery += " except "
else: fquery += " intersect "
elif filter['is_neg']: fquery += "select id from fdata except "
value = filter['value'].replace("*", "%")
args["_ff_%d" % c] = "%"+value+"%"
fquery += (
"select id from fdata where value like "+
":_ff_%d escape '\\'" % c)
rows = deck.db.execute(
'select fid, value from fdata where id in (' +
fquery + ')', args)
while (1):
row = rows.fetchone()
if row is None: break
doesMatch = False
for filter in factFilters:
res = filter['regexp'].search(row[1])
if ((filter['is_neg'] is False and res) or
(filter['is_neg'] is True and res is None)):
factFilterMatches.append(row[0])
if len(fieldFilters) > 0:
sfquery = ''
args = {}
for field, filters in fieldFilters.iteritems():
for filter in filters:
c = len(args)
if sfquery:
if filter['is_neg']: sfquery += " except "
else: sfquery += " intersect "
elif filter['is_neg']: sfquery += "select id from fdata except "
field = field.replace("*", "%")
value = filter['value'].replace("*", "%")
args["_ff_%d" % c] = "%"+value+"%"
ids = deck.db.list(
"select id from fieldmodels where name like "+
":field escape '\\'", field=field)
sfquery += ("select id from fdata where "+
"fmid in %s and value like "+
":_ff_%d escape '\\'") % (ids2str(ids), c)
rows = deck.db.execute(
'select f.fid, f.value, fm.name from fdata as f '+
'left join fieldmodels as fm ON (f.fmid = '+
'fm.id) where f.id in (' + sfquery + ')', args)
while (1):
row = rows.fetchone()
if row is None: break
field = row[2].lower()
doesMatch = False
if field in fieldFilters:
for filter in fieldFilters[field]:
res = filter['regexp'].search(row[1])
if ((filter['is_neg'] is False and res) or
(filter['is_neg'] is True and res is None)):
fieldFilterMatches.append(row[0])
if len(cardFilters) > 0:
qaquery = ''
args = {}
for field, filters in cardFilters.iteritems():
for filter in filters:
c = len(args)
if qaquery:
if filter['is_neg']: qaquery += " except "
else: qaquery += " intersect "
elif filter['is_neg']: qaquery += "select id from cards except "
value = value.replace("*", "%")
args["_ff_%d" % c] = "%"+value+"%"
if field == 'question':
qaquery += "select id from cards where question "
qaquery += "like :_ff_%d escape '\\'" % c
else:
qaquery += "select id from cards where answer "
qaquery += "like :_ff_%d escape '\\'" % c
rows = deck.db.execute(
'select id, question, answer from cards where id IN (' +
qaquery + ')', args)
while (1):
row = rows.fetchone()
if row is None: break
doesMatch = False
if field in cardFilters:
rowValue = row[1] if field == 'question' else row[2]
for filter in cardFilters[field]:
res = filter['regexp'].search(rowValue)
if ((filter['is_neg'] is False and res) or
(filter['is_neg'] is True and res is None)):
cardFilterMatches.append(row[0])
fids = None
if len(factFilters) > 0 or len(fieldFilters) > 0:
fids = []
fids.extend(factFilterMatches)
fids.extend(fieldFilterMatches)
cardIds = None
if len(cardFilters) > 0:
cardIds = []
cardIds.extend(cardFilterMatches)
return (fids, cardIds)
def _findCards(deck, query):
"Find facts matching QUERY."
tquery = ""
fquery = ""
qquery = ""
fidquery = ""
cmquery = { 'pos': '', 'neg': '' }
sfquery = qaquery = ""
showdistinct = False
filters = []
args = {}
for c, (token, isNeg, type, filter) in enumerate(_parseQuery(deck, query)):
if type == SEARCH_TAG:
# a tag
if tquery:
if isNeg:
tquery += " except "
else:
tquery += " intersect "
elif isNeg:
tquery += "select id from facts except "
if token == "none":
tquery += """
select id from cards where fid in (select fid from fdata where ord = -1 and
val = ''"""
else:
token = token.replace("*", "%")
if not token.startswith("%"):
token = "% " + token
if not token.endswith("%"):
token += " %"
args["_tag_%d" % c] = token
tquery += """
select fid from fdata where ord = -1 and val like :_tag_%d""" % c
elif type == SEARCH_TYPE:
if qquery:
if isNeg:
qquery += " except "
else:
qquery += " intersect "
elif isNeg:
qquery += "select id from cards except "
if token in ("rev", "new", "lrn"):
if token == "rev":
n = 1
elif token == "new":
n = 2
else:
n = 0
qquery += "select id from cards where type = %d" % n
elif token == "delayed":
print "delayed"
qquery += ("select id from cards where "
"due < %d and due > %d and "
"type in (0,1,2)") % (
deck.dayCutoff, deck.dayCutoff)
elif token == "suspended":
qquery += ("select id from cards where "
"queue = -1")
elif token == "leech":
qquery += (
"select id from cards where noCount >= (select value "
"from deckvars where key = 'leechFails')")
else: # due
qquery += ("select id from cards where "
"queue between 0 and 1 and due < %d") % deck.dayCutoff
elif type == SEARCH_FID:
if fidquery:
if isNeg:
fidquery += " except "
else:
fidquery += " intersect "
elif isNeg:
fidquery += "select id from cards except "
fidquery += "select id from cards where fid in (%s)" % token
elif type == SEARCH_CARD:
print "search_card broken"
token = token.replace("*", "%")
ids = deck.db.list("""
select id from tags where name like :tag escape '\\'""", tag=token)
if isNeg:
if cmquery['neg']:
cmquery['neg'] += " intersect "
cmquery['neg'] += """
select cardId from cardTags where src = 2 and cardTags.tagId in %s""" % ids2str(ids)
else:
if cmquery['pos']:
cmquery['pos'] += " intersect "
cmquery['pos'] += """
select cardId from cardTags where src = 2 and cardTags.tagId in %s""" % ids2str(ids)
elif type == SEARCH_FIELD or type == SEARCH_FIELD_EXISTS:
field = value = ''
if type == SEARCH_FIELD:
parts = token.split(':', 1);
if len(parts) == 2:
field = parts[0]
value = parts[1]
elif type == SEARCH_FIELD_EXISTS:
field = token
value = '*'
if (type == SEARCH_FIELD and filter != 'none'):
if field and value:
filters.append(
{'scope': 'field', 'type': filter,
'field': field, 'value': value, 'is_neg': isNeg})
else:
if field and value:
if sfquery:
if isNeg:
sfquery += " except "
else:
sfquery += " intersect "
elif isNeg:
sfquery += "select id from facts except "
field = field.replace("*", "%")
value = value.replace("*", "%")
args["_ff_%d" % c] = "%"+value+"%"
ids = deck.db.list("""
select id from fieldmodels where name like :field escape '\\'""", field=field)
sfquery += """
select fid from fdata where fmid in %s and
value like :_ff_%d escape '\\'""" % (ids2str(ids), c)
elif type == SEARCH_QA:
field = value = ''
parts = token.split(':', 1);
if len(parts) == 2:
field = parts[0]
value = parts[1]
if (filter != 'none'):
if field and value:
filters.append(
{'scope': 'card', 'type': filter, 'field': field,
'value': value, 'is_neg': isNeg})
else:
if field and value:
if qaquery:
if isNeg:
qaquery += " except "
else:
qaquery += " intersect "
elif isNeg:
qaquery += "select id from cards except "
value = value.replace("*", "%")
args["_ff_%d" % c] = "%"+value+"%"
if field == 'question':
qaquery += """
select id from cards where question like :_ff_%d escape '\\'""" % c
else:
qaquery += """
select id from cards where answer like :_ff_%d escape '\\'""" % c
elif type == SEARCH_DISTINCT:
if isNeg is False:
showdistinct = True if token == "one" else False
else:
showdistinct = False if token == "one" else True
else:
if (filter != 'none'):
filters.append(
{'scope': 'fact', 'type': filter,
'value': token, 'is_neg': isNeg})
else:
if fquery:
if isNeg:
fquery += " except "
else:
fquery += " intersect "
elif isNeg:
fquery += "select id from facts except "
token = token.replace("*", "%")
args["_ff_%d" % c] = "%"+token+"%"
fquery += """
select id from facts where cache like :_ff_%d escape '\\'""" % c
return (tquery, fquery, qquery, fidquery, cmquery, sfquery,
qaquery, showdistinct, filters, args)
# Find and replace
##########################################################################
def findReplace(deck, fids, src, dst, isRe=False, field=None):
"Find and replace fields in a fact."
# find
s = "select id, fid, value from fdata where fid in %s"
if isRe:
isRe = re.compile(src)
else:
s += " and value like :v"
if field:
s += " and fmid = :fmid"
rows = deck.db.all(s % ids2str(fids),
v="%"+src.replace("%", "%%")+"%",
fmid=field)
modded = []
if isRe:
modded = [
{'id': id, 'fid': fid, 'val': re.sub(isRe, dst, val)}
for (id, fid, val) in rows
if isRe.search(val)]
else:
modded = [
{'id': id, 'fid': fid, 'val': val.replace(src, dst)}
for (id, fid, val) in rows
if val.find(src) != -1]
# update
if modded:
deck.db.executemany(
'update fdata set value = :val where id = :id', modded)
deck.updateCardQACacheFromIds([f['fid'] for f in modded],
type="facts")
if field:
deck.updateFieldChecksums(field)
else:
deck.updateAllFieldChecksums()
return len(set([f['fid'] for f in modded]))
# Find duplicates
##########################################################################
def findDuplicates(deck, fmids):
data = deck.db.all(
"select fid, value from fdata where fmid in %s" %
ids2str(fmids))
vals = {}
for (fid, val) in data:
if not val.strip():
continue
if val not in vals:
vals[val] = [fid]
else:
vals[val].append(fid)
return [(k,v) for (k,v) in vals.items() if len(v) > 1]
# Find & sort
##########################################################################
# copied from ankiqt and trivially changed; will not work at the moment
# if idx == 0:
# self.sortKey = "question"
# elif idx == 1:
# self.sortKey = "answer"
# elif idx == 2:
# self.sortKey = "created"
# elif idx == 3:
# self.sortKey = "modified"
# elif idx == 4:
# self.sortKey = "combinedDue"
# elif idx == 5:
# self.sortKey = "interval"
# elif idx == 6:
# self.sortKey = "reps"
# elif idx == 7:
# self.sortKey = "factor"
# elif idx == 8:
# self.sortKey = "fact"
# elif idx == 9:
# self.sortKey = "noCount"
# elif idx == 10:
# self.sortKey = "firstAnswered"
# else:
# self.sortKey = ("field", self.sortFields[idx-11])
def findSorted(deck, query, sortKey):
# sorting
if not query.strip():
ads = ""
else:
ids = self.deck.findCards(query)
ads = "cards.id in %s" % ids2str(ids)
sort = ""
if isinstance(sortKey, types.StringType):
# card property
if sortKey == "fact":
sort = "order by facts.created, cards.created"
else:
sort = "order by cards." + sortKey
if sortKey in ("question", "answer"):
sort += " collate nocase"
if sortKey == "fact":
query = """
select cards.id from cards, facts
where cards.fid = facts.id """
if ads:
query += "and " + ads + " "
else:
query = "select id from cards "
if ads:
query += "where %s " % ads
query += sort
else:
# field value
ret = self.deck.db.all(
"select id, numeric from fields where name = :name",
name=sortKey[1])
fields = ",".join([str(x[0]) for x in ret])
# if multiple models have the same field, use the first numeric bool
numeric = ret[0][1]
if numeric:
order = "cast(fdata.value as real)"
else:
order = "fdata.value collate nocase"
if ads:
ads = " and " + ads
query = ("select cards.id "
"from fdata, cards where fdata.fmid in (%s) "
"and fdata.fid = cards.fid" + ads +
" order by cards.ordinal, %s") % (fields, order)
# run the query
self.cards = self.deck.db.all(query)