more checksum work

- convert checksums to int
- add bulk update & update on upgrade
- add indices pending performance testing. The fsum table & indices add about
  2MB to a deck with 50k unique fields
This commit is contained in:
Damien Elmes 2011-03-11 02:21:29 +09:00
parent 4becd8399c
commit f5b326c753
5 changed files with 34 additions and 31 deletions

View file

@@ -10,7 +10,7 @@ from itertools import groupby
from anki.lang import _, ngettext from anki.lang import _, ngettext
from anki.utils import parseTags, tidyHTML, ids2str, hexifyID, \ from anki.utils import parseTags, tidyHTML, ids2str, hexifyID, \
canonifyTags, joinTags, addTags, deleteTags, checksum, fieldChecksum, \ canonifyTags, joinTags, addTags, deleteTags, checksum, fieldChecksum, \
stripHTML, intTime stripHTML, intTime, splitFields
from anki.fonts import toPlatformFont from anki.fonts import toPlatformFont
from anki.hooks import runHook, hookEmpty, runFilter from anki.hooks import runHook, hookEmpty, runFilter
@@ -508,9 +508,9 @@ due > :now and due < :now""", now=time.time())
# [cid, fid, mid, tid, gid, tags, flds, data] # [cid, fid, mid, tid, gid, tags, flds, data]
data = [1, 1, fact.model.id, template.id, 1, data = [1, 1, fact.model.id, template.id, 1,
"", fact.joinedFields(), ""] "", fact.joinedFields(), ""]
now = self.formatQA(fact.model, template, "", data) now = self._formatQA(fact.model, template, "", data)
data[6] = "\x1f".join([""]*len(fact._fields)) data[6] = "\x1f".join([""]*len(fact._fields))
empty = self.formatQA(fact.model, template, "", data) empty = self._formatQA(fact.model, template, "", data)
if now['q'] == empty['q']: if now['q'] == empty['q']:
continue continue
if not template.conf['allowEmptyAns']: if not template.conf['allowEmptyAns']:
@@ -929,12 +929,10 @@ where tid in %s""" % strids, now=time.time())
for t in m.templates: for t in m.templates:
templs[t.id] = t templs[t.id] = t
groups = dict(self.db.all("select id, name from groups")) groups = dict(self.db.all("select id, name from groups"))
return [self.formatQA(mods[row[2]], templs[row[3]], groups[row[4]], row) return [self._formatQA(mods[row[2]], templs[row[3]], groups[row[4]], row)
for row in self._qaData(where)] for row in self._qaData(where)]
# # and checksum
# self._updateFieldChecksums(facts)
def formatQA(self, model, template, gname, data, filters=True): def _formatQA(self, model, template, gname, data, filters=True):
"Returns hash of id, question, answer." "Returns hash of id, question, answer."
# data is [cid, fid, mid, tid, gid, tags, flds, data] # data is [cid, fid, mid, tid, gid, tags, flds, data]
# unpack fields and create dict # unpack fields and create dict
@@ -975,23 +973,24 @@ where c.fid == f.id and f.mid == m.id and
c.tid = t.id and c.gid = g.id c.tid = t.id and c.gid = g.id
%s""" % where) %s""" % where)
def _updateFieldChecksums(self, facts): # Field checksum bulk update
print "benchmark updatefieldchecksums" ##########################################################################
confs = {}
def updateFieldChecksums(self, fids):
"Update all field checksums, after find&replace, etc."
sfids = ids2str(fids)
mods = {}
for m in self.allModels():
mods[m.id] = m
r = [] r = []
for (fid, map) in facts.items(): for row in self._qaData(where="and f.id in "+sfids):
for (fmid, val) in map.values(): fields = splitFields(row[6])
if fmid and fmid not in confs: model = mods[row[2]]
confs[fmid] = simplejson.loads(self.db.scalar( for c, f in enumerate(model.fields):
"select conf from fields where id = ?", if f['uniq'] and fields[c]:
fmid)) r.append((row[1], model.id, fieldChecksum(fields[c])))
# if unique checking has been turned off, don't bother to self.db.execute("delete from fsums where fid in "+sfids)
# zero out old values self.db.executemany("insert into fsums values (?,?,?)", r)
if confs[fmid]['unique']:
csum = fieldChecksum(val)
r.append((csum, fid, fmid))
self.db.executemany(
"update fdata set csum=? where fid=? and fmid=?", r)
# Tags # Tags
########################################################################## ##########################################################################

View file

@@ -58,10 +58,10 @@ insert or replace into facts values (?, ?, ?, ?, ?, ?, ?, ?)""",
for (ord, conf) in self._fmap.values(): for (ord, conf) in self._fmap.values():
if not conf['uniq']: if not conf['uniq']:
continue continue
val = fieldChecksum(self._fields[ord]) val = self._fields[ord]
if not val: if not val:
continue continue
d.append((self.id, self.mid, val)) d.append((self.id, self.mid, fieldChecksum(val)))
self.deck.db.executemany("insert into fsums values (?, ?, ?)", d) self.deck.db.executemany("insert into fsums values (?, ?, ?)", d)
def cards(self): def cards(self):

View file

@@ -187,6 +187,9 @@ create index if not exists ix_cards_fid on cards (fid);
create index if not exists ix_revlog_cid on revlog (cid); create index if not exists ix_revlog_cid on revlog (cid);
-- media -- media
create index if not exists ix_media_csum on media (csum); create index if not exists ix_media_csum on media (csum);
-- unique checking
create index if not exists ix_fsums_fid on fsums (fid);
create index if not exists ix_fsums_csum on fsums (csum);
""") """)
# 2.0 schema migration # 2.0 schema migration
@@ -460,7 +463,10 @@ def _rewriteModelIds(deck):
def _postSchemaUpgrade(deck): def _postSchemaUpgrade(deck):
"Handle the rest of the upgrade to 2.0." "Handle the rest of the upgrade to 2.0."
import anki.deck import anki.deck
# fix up model/template ids
_rewriteModelIds(deck) _rewriteModelIds(deck)
# update uniq cache
deck.updateFieldChecksums(deck.db.list("select id from facts"))
# remove old views # remove old views
for v in ("failedCards", "revCardsOld", "revCardsNew", for v in ("failedCards", "revCardsOld", "revCardsNew",
"revCardsDue", "revCardsRandom", "acqCardsRandom", "revCardsDue", "revCardsRandom", "acqCardsRandom",

View file

@@ -272,10 +272,8 @@ def checksum(data):
return md5(data).hexdigest() return md5(data).hexdigest()
def fieldChecksum(data): def fieldChecksum(data):
# 8 digit md5 hash of utf8 string, or empty string if empty value # 32 bit unsigned number from first 8 digits of md5 hash
if not data: return int(checksum(data.encode("utf-8"))[:8], 16)
return ""
return checksum(data.encode("utf-8"))[:8]
def call(argv, wait=True, **kwargs): def call(argv, wait=True, **kwargs):
try: try:

View file

@@ -88,7 +88,7 @@ def test_fieldChecksum():
f['Front'] = u"new"; f['Back'] = u"new2" f['Front'] = u"new"; f['Back'] = u"new2"
deck.addFact(f) deck.addFact(f)
assert deck.db.scalar( assert deck.db.scalar(
"select csum from fsums") == "22af645d" "select csum from fsums") == int("22af645d", 16)
# empty field should have no checksum # empty field should have no checksum
f['Front'] = u"" f['Front'] = u""
f.flush() f.flush()
@@ -98,7 +98,7 @@ def test_fieldChecksum():
f['Front'] = u"newx" f['Front'] = u"newx"
f.flush() f.flush()
assert deck.db.scalar( assert deck.db.scalar(
"select csum from fsums") == "4b0e5a4c" "select csum from fsums") == int("4b0e5a4c", 16)
# turning off unique and modifying the fact should delete the sum # turning off unique and modifying the fact should delete the sum
f.model.fields[0]['uniq'] = False f.model.fields[0]['uniq'] = False
f.model.flush() f.model.flush()