use a checksum for field values; fixed import->update number

Previously we had an index on the value field, which was very expensive for
long fields. Instead, we now store the first 8 characters of the field value's
md5sum in a separate column and index that column. In decks with lots of text
in fields, this can cut the deck size by 30% or more, and many decks shrink by
10-20%. Decks with only a few characters in fields may increase in size
slightly, but this is offset by the fact that we only generate a checksum for
fields that have uniqueness checking on.

Also, fixed import->update reporting the total # of available facts instead of
the number of facts that were imported.
This commit is contained in:
Damien Elmes 2011-02-12 08:09:13 +09:00
parent da48eb1e55
commit 4302306fe9
7 changed files with 147 additions and 32 deletions

View file

@ -16,7 +16,7 @@ from anki.lang import _, ngettext
from anki.errors import DeckAccessError
from anki.stdmodels import BasicModel
from anki.utils import parseTags, tidyHTML, genID, ids2str, hexifyID, \
canonifyTags, joinTags, addTags, checksum
canonifyTags, joinTags, addTags, checksum, fieldChecksum
from anki.history import CardHistoryEntry
from anki.models import Model, CardModel, formatQA
from anki.stats import dailyStats, globalStats, genToday
@ -56,7 +56,7 @@ SEARCH_FIELD = 6
SEARCH_FIELD_EXISTS = 7
SEARCH_QA = 8
SEARCH_PHRASE_WB = 9
DECK_VERSION = 70
DECK_VERSION = 71
deckVarsTable = Table(
'deckVars', metadata,
@ -393,6 +393,32 @@ type = (case
when type >= 0 then relativeDelay else relativeDelay - 3 end)
""")
def updateAllFieldChecksums(self):
    "Blank every field checksum, then rebuild them one field model at a time."
    # zero out, so no stale checksum survives the rebuild
    self.s.statement("update fields set chksum = ''")
    # add back for unique fields; updateFieldChecksums() checks the flag.
    # the generator is lazy, so ids are read in the same order as before
    fieldModelIds = (fieldModel.id
                     for model in self.models
                     for fieldModel in model.fieldModels)
    for fmid in fieldModelIds:
        self.updateFieldChecksums(fmid)
def updateFieldChecksums(self, fmid):
    """Recompute the stored checksums for all fields of one field model.

    fmid is the id of the field model. If its "unique" flag is set, every
    field belonging to it gets fieldChecksum() of its value; otherwise the
    checksums of those fields are reset to the empty string.

    Flushes the session and marks the schema as modified before touching
    the fields table.
    """
    self.s.flush()
    self.setSchemaModified()
    unique = self.s.scalar(
        "select \"unique\" from fieldModels where id = :id", id=fmid)
    if not unique:
        # checksums are only kept for fields with uniqueness checking on
        self.s.statement(
            "update fields set chksum = '' where fieldModelId=:id",
            id=fmid)
        return
    rows = self.s.all(
        "select id, value from fields where fieldModelId = :id",
        id=fmid)
    updates = [{'id': fieldId, 'chk': fieldChecksum(value)}
               for (fieldId, value) in rows]
    # a field model without any fields yields an empty batch; there is
    # nothing to bind in that case, so skip the call entirely
    if updates:
        self.s.statements(
            "update fields set chksum = :chk where id = :id", updates)
def _cardQueue(self, card):
return self.cardType(card)
@ -2697,10 +2723,15 @@ select id from facts where spaceUntil like :_ff_%d escape '\\'""" % c
for (id, fid, val) in rows
if val.find(src) != -1]
# update
self.s.statements(
'update fields set value = :val where id = :id', modded)
self.updateCardQACacheFromIds([f['fid'] for f in modded],
if modded:
self.s.statements(
'update fields set value = :val where id = :id', modded)
self.updateCardQACacheFromIds([f['fid'] for f in modded],
type="facts")
if field:
self.updateFieldChecksums(field)
else:
self.updateAllFieldChecksums()
return len(set([f['fid'] for f in modded]))
# Find duplicates
@ -2989,7 +3020,8 @@ Return new path, relative to media dir."""
self.modified = newTime or time.time()
def setSchemaModified(self):
    "Store the current time in the deck variable 'schemaMod'."
    # we might be called during an upgrade, so avoid bumping modtime
    self.setVar("schemaMod", time.time(), mod=False)
def flushMod(self):
"Mark modified and flush to DB."
@ -3113,10 +3145,10 @@ where id = :id""", fid=f.id, cmid=m.cardModels[0].id, id=id)
if quick:
num = 4
else:
num = 9
num = 10
oldSize = os.stat(self.path)[stat.ST_SIZE]
self.startProgress(num)
self.updateProgress(_("Checking integrity..."))
self.updateProgress(_("Checking database..."))
if self.s.scalar("pragma integrity_check") != "ok":
self.finishProgress()
return _("Database file is damaged.\n"
@ -3125,7 +3157,7 @@ where id = :id""", fid=f.id, cmid=m.cardModels[0].id, id=id)
self.updateProgress()
DeckStorage._addIndices(self)
# does the user have a model?
self.updateProgress(_("Checking schema..."))
self.updateProgress()
if not self.s.scalar("select count(id) from models"):
self.addModel(BasicModel())
problems.append(_("Deck was missing a model"))
@ -3209,10 +3241,10 @@ select id from fields where factId not in (select id from facts)""")
"update cardModels set allowEmptyAnswer = 1, typeAnswer = '' "
"where allowEmptyAnswer is null or typeAnswer is null")
# fix tags
self.updateProgress(_("Rebuilding tag cache..."))
self.updateProgress()
self.updateCardTags()
# make sure ordinals are correct
self.updateProgress(_("Updating ordinals..."))
self.updateProgress()
self.s.statement("""
update fields set ordinal = (select ordinal from fieldModels
where id = fieldModelId)""")
@ -3220,7 +3252,7 @@ where id = fieldModelId)""")
update cards set ordinal = (select ordinal from cardModels
where cards.cardModelId = cardModels.id)""")
# fix problems with stripping html
self.updateProgress(_("Rebuilding QA cache..."))
self.updateProgress()
fields = self.s.all("select id, value from fields")
newFields = []
for (id, value) in fields:
@ -3228,11 +3260,14 @@ where cards.cardModelId = cardModels.id)""")
self.s.statements(
"update fields set value=:value where id=:id",
newFields)
# and field checksums
self.updateProgress()
self.updateAllFieldChecksums()
# regenerate question/answer cache
for m in self.models:
self.updateCardsFromModel(m, dirty=False)
# rebuild
self.updateProgress(_("Rebuilding types..."))
self.updateProgress()
self.rebuildTypes()
# since we can ensure the updated version will be propagated to
# all locations, we can forget old tombstones
@ -3241,7 +3276,7 @@ where cards.cardModelId = cardModels.id)""")
# force a full sync
self.setSchemaModified()
# and finally, optimize
self.updateProgress(_("Optimizing..."))
self.updateProgress()
self.optimize()
newSize = os.stat(self.path)[stat.ST_SIZE]
save = (oldSize - newSize)/1024
@ -3521,8 +3556,16 @@ class DeckStorage(object):
metadata.create_all(engine)
deck = DeckStorage._init(s)
else:
# add any possibly new tables if we're upgrading
ver = s.scalar("select version from decks limit 1")
# add a checksum to fields
if ver < 71:
try:
s.execute(
"alter table fields add column chksum text "+
"not null default ''")
except:
pass
# add any possibly new tables if we're upgrading
if ver < DECK_VERSION:
metadata.create_all(engine)
deck = s.query(Deck).get(1)
@ -3695,7 +3738,7 @@ create index if not exists ix_fields_factId on fields (factId)""")
deck.s.statement("""
create index if not exists ix_fields_fieldModelId on fields (fieldModelId)""")
deck.s.statement("""
create index if not exists ix_fields_value on fields (value)""")
create index if not exists ix_fields_chksum on fields (chksum)""")
# media
deck.s.statement("""
create unique index if not exists ix_media_filename on media (filename)""")
@ -3860,9 +3903,17 @@ this message. (ERR-0101)""") % {
"revCardsDue", "revCardsRandom", "acqCardsRandom",
"acqCardsOld", "acqCardsNew"):
deck.s.statement("drop view if exists %s" % v)
deck.version = 70
deck.s.commit()
if deck.version < 71:
# remove the expensive value cache
deck.s.statement("drop index if exists ix_fields_value")
# add checksums and index
deck.updateAllFieldChecksums()
DeckStorage._addIndices(deck)
deck.s.execute("vacuum")
deck.s.execute("analyze")
deck.version = 70
deck.version = 71
deck.s.commit()
# executing a pragma here is very slow on large decks, so we store
# our own record

View file

@ -12,7 +12,7 @@ import time
from anki.db import *
from anki.errors import *
from anki.models import Model, FieldModel, fieldModelsTable
from anki.utils import genID, stripHTMLMedia
from anki.utils import genID, stripHTMLMedia, fieldChecksum
from anki.hooks import runHook
# Fields in a fact
@ -25,7 +25,8 @@ fieldsTable = Table(
Column('fieldModelId', Integer, ForeignKey("fieldModels.id"),
nullable=False),
Column('ordinal', Integer, nullable=False),
Column('value', UnicodeText, nullable=False))
Column('value', UnicodeText, nullable=False),
Column('chksum', String, nullable=False, default=""))
class Field(object):
"A field in a fact."
@ -90,9 +91,14 @@ class Fact(object):
def __setitem__(self, key, value):
    """Set the value of the field named key and refresh its checksum.

    Raises KeyError if the fact has no field with that name.
    """
    try:
        item = [f for f in self.fields if f.name == key][0]
    except IndexError:
        # report which field name was missing
        raise KeyError(key)
    item.value = value
    # only fields whose model has uniqueness checking on carry a checksum
    if item.fieldModel.unique:
        item.chksum = fieldChecksum(value)
    else:
        item.chksum = ""
def get(self, key, default):
try:
@ -121,10 +127,11 @@ class Fact(object):
if not field.fieldModel.unique:
return True
req = ("select value from fields "
"where fieldModelId = :fmid and value = :val")
"where fieldModelId = :fmid and value = :val and chksum = :chk")
if field.id:
req += " and id != %s" % field.id
return not s.scalar(req, val=field.value, fmid=field.fieldModel.id)
return not s.scalar(req, val=field.value, fmid=field.fieldModel.id,
chk=fieldChecksum(field.value))
def focusLost(self, field):
runHook('fact.focusLost', self, field)

View file

@ -17,7 +17,7 @@ import time
from anki.cards import cardsTable
from anki.facts import factsTable, fieldsTable
from anki.lang import _
from anki.utils import genID, canonifyTags
from anki.utils import genID, canonifyTags, fieldChecksum
from anki.utils import canonifyTags, ids2str
from anki.errors import *
from anki.deck import NEW_CARDS_RANDOM
@ -122,11 +122,12 @@ and value != ''""",
continue
data = [{'fid': fid,
'fmid': fm.id,
'v': c.fields[index]}
'v': c.fields[index],
'chk': self.maybeChecksum(c.fields[index], fm.unique)}
for (fid, c) in upcards]
self.deck.s.execute("""
update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
data)
update fields set value = :v, chksum = :chk where factId = :fid
and fieldModelId = :fmid""", data)
# update tags
self.deck.updateProgress()
if tagsIdx is not None:
@ -144,7 +145,7 @@ update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
self.deck.updateCardTags(cids)
self.deck.updateProgress()
self.deck.updateCardsFromFactIds(fids)
self.total = len(fids)
self.total = len(cards)
self.deck.setModified()
self.deck.finishProgress()
@ -152,6 +153,11 @@ update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
"The number of fields."
return 0
def maybeChecksum(self, data, unique):
    "Return fieldChecksum(data) when unique is set, otherwise an empty string."
    if unique:
        return fieldChecksum(data)
    return ""
def foreignCards(self):
"Return a list of foreign cards for importing."
assert 0
@ -254,7 +260,11 @@ where factId in (%s)""" % ",".join([str(s) for s in factIds]))
'ordinal': fm.ordinal,
'id': genID(),
'value': (index is not None and
cards[m].fields[index] or u"")}
cards[m].fields[index] or u""),
'chksum': self.maybeChecksum(
index is not None and
cards[m].fields[index] or u"", fm.unique)
}
for m in range(len(cards))]
self.deck.s.execute(fieldsTable.insert(),
data)

View file

@ -43,6 +43,7 @@ fieldModelsTable = Table(
# reused as RTL marker
Column('features', UnicodeText, nullable=False, default=u""),
Column('required', Boolean, nullable=False, default=True),
# if code changes this, it should call deck.updateFieldChecksums()
Column('unique', Boolean, nullable=False, default=True), # sqlite keyword
Column('numeric', Boolean, nullable=False, default=False),
# display

View file

@ -430,7 +430,7 @@ class SyncTools(object):
select id, modelId, created, %s, tags, spaceUntil, lastCardId from facts
where id in %s""" % (modified, factIds))),
'fields': self.realLists(self.deck.s.all("""
select id, factId, fieldModelId, ordinal, value from fields
select id, factId, fieldModelId, ordinal, value, chksum from fields
where factId in %s""" % factIds))
}
@ -455,12 +455,17 @@ insert or replace into facts
values
(:id, :modelId, :created, :modified, :tags, :spaceUntil, :lastCardId)""", dlist)
# now fields
def chksum(f):
    "Return the checksum for field row f, computing one if the row has none."
    if len(f) > 5:
        return f[5]
    # five-column rows carry no checksum (presumably sent by a peer that
    # predates the chksum column -- confirm), so derive it from the value.
    # the row does not say whether the field model is unique, so every
    # non-empty value gets a checksum here.
    # fieldChecksum is a helper in anki.utils; import it locally so this
    # block does not depend on the module's import list
    from anki.utils import fieldChecksum
    return fieldChecksum(f[4])
dlist = [{
    'id': f[0],
    'factId': f[1],
    'fieldModelId': f[2],
    'ordinal': f[3],
    'value': f[4],
    # go through chksum() so five-column rows do not raise IndexError
    'chksum': chksum(f)
    } for f in fields]
# delete local fields since ids may have changed
self.deck.s.execute(
@ -469,9 +474,9 @@ values
# then update
self.deck.s.execute("""
insert into fields
(id, factId, fieldModelId, ordinal, value)
(id, factId, fieldModelId, ordinal, value, chksum)
values
(:id, :factId, :fieldModelId, :ordinal, :value)""", dlist)
(:id, :factId, :fieldModelId, :ordinal, :value, :chksum)""", dlist)
self.deck.s.statement(
"delete from factsDeleted where factId in %s" %
ids2str([f[0] for f in facts]))

View file

@ -277,6 +277,12 @@ def deleteTags(tagstr, tags):
def checksum(data):
    "Return the md5 digest of data as a hexadecimal string."
    digest = md5(data)
    return digest.hexdigest()
def fieldChecksum(data):
    "Return the short checksum stored alongside a field value."
    # 8 digit md5 hash of utf8 string, or empty string if empty value
    if data:
        encoded = data.encode("utf-8")
        return checksum(encoded)[:8]
    return ""
def call(argv, wait=True, **kwargs):
try:
o = subprocess.Popen(argv, **kwargs)

View file

@ -129,6 +129,41 @@ def test_factAddDelete():
# and the second should clear the fact
deck.deleteCard(id2)
def test_fieldChecksum():
    # checksums must follow both the field value and the field model's
    # unique flag
    deck = DeckStorage.Deck()
    deck.addModel(BasicModel())
    fact = deck.newFact()
    fact['Front'] = u"new"
    fact['Back'] = u"new2"
    deck.addFact(fact)

    def storedChecksum(fieldId):
        # read the checksum straight from the database
        return deck.s.scalar(
            "select chksum from fields where id = :id", id=fieldId)

    (frontId, frontSum) = deck.s.first(
        "select id, chksum from fields where value = 'new'")
    assert frontSum == "22af645d"
    # empty field should have no checksum
    fact['Front'] = u""
    deck.s.flush()
    assert storedChecksum(frontId) == ""
    # changing the value should change the checksum
    fact['Front'] = u"newx"
    deck.s.flush()
    assert storedChecksum(frontId) == "4b0e5a4c"
    # back should have no checksum, because it's not set to be unique
    (backId, backSum) = deck.s.first(
        "select id, chksum from fields where value = 'new2'")
    assert backSum == ""
    # if we turn on unique, it should get a checksum
    backModel = fact.model.fieldModels[1]
    backModel.unique = True
    deck.updateFieldChecksums(backModel.id)
    assert storedChecksum(backId) == "82f2ec5f"
    # and turning it off should zero the checksum again
    backModel.unique = False
    deck.updateFieldChecksums(backModel.id)
    assert storedChecksum(backId) == ""
def test_modelAddDelete():
deck = DeckStorage.Deck()
deck.addModel(BasicModel())