more checksum work

- convert checksums to int (sketched below)
- add bulk update & update on upgrade
- add indices pending performance testing. The fsum table & indices add about
  2MB to a deck with 50k unique fields
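
For reference, the int conversion amounts to: take the first 8 hex digits of the md5 of the UTF-8 encoded field value and parse them as a base-16 integer, so the checksum always fits in 32 unsigned bits and can be stored and indexed as an integer column. A minimal standalone sketch (the function name below is illustrative; the real helper is anki.utils.fieldChecksum, changed in the diff):

    from hashlib import md5

    def field_checksum(data):
        # 32-bit unsigned int from the first 8 hex digits of the md5
        # of the UTF-8 encoded field text
        return int(md5(data.encode("utf-8")).hexdigest()[:8], 16)

    # matches the value asserted in the updated test below
    assert field_checksum(u"new") == int("22af645d", 16)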
Damien Elmes 2011-03-11 02:21:29 +09:00
parent 4becd8399c
commit f5b326c753
5 changed files with 34 additions and 31 deletions

View file

@@ -10,7 +10,7 @@ from itertools import groupby
 from anki.lang import _, ngettext
 from anki.utils import parseTags, tidyHTML, ids2str, hexifyID, \
     canonifyTags, joinTags, addTags, deleteTags, checksum, fieldChecksum, \
-    stripHTML, intTime
+    stripHTML, intTime, splitFields
 from anki.fonts import toPlatformFont
 from anki.hooks import runHook, hookEmpty, runFilter
@@ -508,9 +508,9 @@ due > :now and due < :now""", now=time.time())
             # [cid, fid, mid, tid, gid, tags, flds, data]
             data = [1, 1, fact.model.id, template.id, 1,
                     "", fact.joinedFields(), ""]
-            now = self.formatQA(fact.model, template, "", data)
+            now = self._formatQA(fact.model, template, "", data)
             data[6] = "\x1f".join([""]*len(fact._fields))
-            empty = self.formatQA(fact.model, template, "", data)
+            empty = self._formatQA(fact.model, template, "", data)
             if now['q'] == empty['q']:
                 continue
             if not template.conf['allowEmptyAns']:
@@ -929,12 +929,10 @@ where tid in %s""" % strids, now=time.time())
         for t in m.templates:
             templs[t.id] = t
         groups = dict(self.db.all("select id, name from groups"))
-        return [self.formatQA(mods[row[2]], templs[row[3]], groups[row[4]], row)
+        return [self._formatQA(mods[row[2]], templs[row[3]], groups[row[4]], row)
                 for row in self._qaData(where)]
-        # # and checksum
-        # self._updateFieldChecksums(facts)
-    def formatQA(self, model, template, gname, data, filters=True):
+    def _formatQA(self, model, template, gname, data, filters=True):
         "Returns hash of id, question, answer."
         # data is [cid, fid, mid, tid, gid, tags, flds, data]
         # unpack fields and create dict
@@ -975,23 +973,24 @@ where c.fid == f.id and f.mid == m.id and
 c.tid = t.id and c.gid = g.id
 %s""" % where)
-    def _updateFieldChecksums(self, facts):
-        print "benchmark updatefieldchecksums"
-        confs = {}
+    # Field checksum bulk update
+    ##########################################################################
+    def updateFieldChecksums(self, fids):
+        "Update all field checksums, after find&replace, etc."
+        sfids = ids2str(fids)
+        mods = {}
+        for m in self.allModels():
+            mods[m.id] = m
         r = []
-        for (fid, map) in facts.items():
-            for (fmid, val) in map.values():
-                if fmid and fmid not in confs:
-                    confs[fmid] = simplejson.loads(self.db.scalar(
-                        "select conf from fields where id = ?",
-                        fmid))
-                # if unique checking has been turned off, don't bother to
-                # zero out old values
-                if confs[fmid]['unique']:
-                    csum = fieldChecksum(val)
-                    r.append((csum, fid, fmid))
-        self.db.executemany(
-            "update fdata set csum=? where fid=? and fmid=?", r)
+        for row in self._qaData(where="and f.id in "+sfids):
+            fields = splitFields(row[6])
+            model = mods[row[2]]
+            for c, f in enumerate(model.fields):
+                if f['uniq'] and fields[c]:
+                    r.append((row[1], model.id, fieldChecksum(fields[c])))
+        self.db.execute("delete from fsums where fid in "+sfids)
+        self.db.executemany("insert into fsums values (?,?,?)", r)
     # Tags
     ##########################################################################

View file

@@ -58,10 +58,10 @@ insert or replace into facts values (?, ?, ?, ?, ?, ?, ?, ?)""",
         for (ord, conf) in self._fmap.values():
             if not conf['uniq']:
                 continue
-            val = fieldChecksum(self._fields[ord])
+            val = self._fields[ord]
             if not val:
                 continue
-            d.append((self.id, self.mid, val))
+            d.append((self.id, self.mid, fieldChecksum(val)))
         self.deck.db.executemany("insert into fsums values (?, ?, ?)", d)
     def cards(self):

View file

@@ -187,6 +187,9 @@ create index if not exists ix_cards_fid on cards (fid);
 create index if not exists ix_revlog_cid on revlog (cid);
 -- media
 create index if not exists ix_media_csum on media (csum);
+-- unique checking
+create index if not exists ix_fsums_fid on fsums (fid);
+create index if not exists ix_fsums_csum on fsums (csum);
 """)
 # 2.0 schema migration
@@ -460,7 +463,10 @@ def _rewriteModelIds(deck):
 def _postSchemaUpgrade(deck):
     "Handle the rest of the upgrade to 2.0."
     import anki.deck
+    # fix up model/template ids
     _rewriteModelIds(deck)
+    # update uniq cache
+    deck.updateFieldChecksums(deck.db.list("select id from facts"))
     # remove old views
     for v in ("failedCards", "revCardsOld", "revCardsNew",
               "revCardsDue", "revCardsRandom", "acqCardsRandom",

View file

@@ -272,10 +272,8 @@ def checksum(data):
     return md5(data).hexdigest()
 def fieldChecksum(data):
-    # 8 digit md5 hash of utf8 string, or empty string if empty value
-    if not data:
-        return ""
-    return checksum(data.encode("utf-8"))[:8]
+    # 32 bit unsigned number from first 8 digits of md5 hash
+    return int(checksum(data.encode("utf-8"))[:8], 16)
 def call(argv, wait=True, **kwargs):
     try:

View file

@@ -88,7 +88,7 @@ def test_fieldChecksum():
     f['Front'] = u"new"; f['Back'] = u"new2"
     deck.addFact(f)
     assert deck.db.scalar(
-        "select csum from fsums") == "22af645d"
+        "select csum from fsums") == int("22af645d", 16)
     # empty field should have no checksum
     f['Front'] = u""
     f.flush()
@@ -98,7 +98,7 @@ def test_fieldChecksum():
     f['Front'] = u"newx"
     f.flush()
     assert deck.db.scalar(
-        "select csum from fsums") == "4b0e5a4c"
+        "select csum from fsums") == int("4b0e5a4c", 16)
     # turning off unique and modifying the fact should delete the sum
     f.model.fields[0]['uniq'] = False
     f.model.flush()