use a checksum for field values; fixed import->update number

Previously we had an index on the value field, which was very expensive for
long fields. Instead, we now store the first 8 characters of the md5 hex digest
of each field value in a separate chksum column, and index that column. In
decks with lots of text in fields, this can cut the deck size by 30% or more,
and many decks shrink by 10-20%. Decks with only a few characters in each field
may grow slightly, but this is offset by the fact that we only generate a
checksum for fields that have uniqueness checking turned on.

Also fixed import->update reporting the total number of available facts instead
of the number of facts that were actually imported.
This commit is contained in:
Damien Elmes 2011-02-12 08:09:13 +09:00
parent da48eb1e55
commit 4302306fe9
7 changed files with 147 additions and 32 deletions

View file

@ -16,7 +16,7 @@ from anki.lang import _, ngettext
from anki.errors import DeckAccessError from anki.errors import DeckAccessError
from anki.stdmodels import BasicModel from anki.stdmodels import BasicModel
from anki.utils import parseTags, tidyHTML, genID, ids2str, hexifyID, \ from anki.utils import parseTags, tidyHTML, genID, ids2str, hexifyID, \
canonifyTags, joinTags, addTags, checksum canonifyTags, joinTags, addTags, checksum, fieldChecksum
from anki.history import CardHistoryEntry from anki.history import CardHistoryEntry
from anki.models import Model, CardModel, formatQA from anki.models import Model, CardModel, formatQA
from anki.stats import dailyStats, globalStats, genToday from anki.stats import dailyStats, globalStats, genToday
@ -56,7 +56,7 @@ SEARCH_FIELD = 6
SEARCH_FIELD_EXISTS = 7 SEARCH_FIELD_EXISTS = 7
SEARCH_QA = 8 SEARCH_QA = 8
SEARCH_PHRASE_WB = 9 SEARCH_PHRASE_WB = 9
DECK_VERSION = 70 DECK_VERSION = 71
deckVarsTable = Table( deckVarsTable = Table(
'deckVars', metadata, 'deckVars', metadata,
@ -393,6 +393,32 @@ type = (case
when type >= 0 then relativeDelay else relativeDelay - 3 end) when type >= 0 then relativeDelay else relativeDelay - 3 end)
""") """)
def updateAllFieldChecksums(self):
    "Blank every field checksum, then regenerate them one field model at a time."
    # start from a clean slate: no field has a checksum
    self.s.statement("update fields set chksum = ''")
    # updateFieldChecksums() decides per field model whether its fields
    # get a checksum back; the generator keeps the walk over the models lazy
    fieldModelIds = (fieldModel.id
                     for model in self.models
                     for fieldModel in model.fieldModels)
    for fmid in fieldModelIds:
        self.updateFieldChecksums(fmid)
def updateFieldChecksums(self, fmid):
    """Bring the chksum column up to date for the fields of one field model.

    fmid is the id of the field model. If that field model is flagged as
    unique, every one of its fields gets fieldChecksum(value) stored in
    chksum; otherwise chksum is reset to '' for all of its fields.
    Pending session changes are flushed first and the schema is marked
    as modified."""
    self.s.flush()
    self.setSchemaModified()
    unique = self.s.scalar(
        "select \"unique\" from fieldModels where id = :id", id=fmid)
    if not unique:
        # uniqueness checking is off: no checksum is kept for these fields
        self.s.statement(
            "update fields set chksum = '' where fieldModelId=:id",
            id=fmid)
        return
    updates = [
        {'id': fieldId, 'chk': fieldChecksum(value)}
        for (fieldId, value) in self.s.all(
            "select id, value from fields where fieldModelId = :id",
            id=fmid)]
    # a field model with no fields yet has nothing to update, so don't
    # issue a batch statement with an empty parameter list
    if updates:
        self.s.statements(
            "update fields set chksum = :chk where id = :id", updates)
def _cardQueue(self, card): def _cardQueue(self, card):
return self.cardType(card) return self.cardType(card)
@ -2697,10 +2723,15 @@ select id from facts where spaceUntil like :_ff_%d escape '\\'""" % c
for (id, fid, val) in rows for (id, fid, val) in rows
if val.find(src) != -1] if val.find(src) != -1]
# update # update
self.s.statements( if modded:
'update fields set value = :val where id = :id', modded) self.s.statements(
self.updateCardQACacheFromIds([f['fid'] for f in modded], 'update fields set value = :val where id = :id', modded)
self.updateCardQACacheFromIds([f['fid'] for f in modded],
type="facts") type="facts")
if field:
self.updateFieldChecksums(field)
else:
self.updateAllFieldChecksums()
return len(set([f['fid'] for f in modded])) return len(set([f['fid'] for f in modded]))
# Find duplicates # Find duplicates
@ -2989,7 +3020,8 @@ Return new path, relative to media dir."""
self.modified = newTime or time.time() self.modified = newTime or time.time()
def setSchemaModified(self): def setSchemaModified(self):
self.setVar("schemaMod", time.time()) # we might be called during an upgrade, so avoid bumping modtime
self.setVar("schemaMod", time.time(), mod=False)
def flushMod(self): def flushMod(self):
"Mark modified and flush to DB." "Mark modified and flush to DB."
@ -3113,10 +3145,10 @@ where id = :id""", fid=f.id, cmid=m.cardModels[0].id, id=id)
if quick: if quick:
num = 4 num = 4
else: else:
num = 9 num = 10
oldSize = os.stat(self.path)[stat.ST_SIZE] oldSize = os.stat(self.path)[stat.ST_SIZE]
self.startProgress(num) self.startProgress(num)
self.updateProgress(_("Checking integrity...")) self.updateProgress(_("Checking database..."))
if self.s.scalar("pragma integrity_check") != "ok": if self.s.scalar("pragma integrity_check") != "ok":
self.finishProgress() self.finishProgress()
return _("Database file is damaged.\n" return _("Database file is damaged.\n"
@ -3125,7 +3157,7 @@ where id = :id""", fid=f.id, cmid=m.cardModels[0].id, id=id)
self.updateProgress() self.updateProgress()
DeckStorage._addIndices(self) DeckStorage._addIndices(self)
# does the user have a model? # does the user have a model?
self.updateProgress(_("Checking schema...")) self.updateProgress()
if not self.s.scalar("select count(id) from models"): if not self.s.scalar("select count(id) from models"):
self.addModel(BasicModel()) self.addModel(BasicModel())
problems.append(_("Deck was missing a model")) problems.append(_("Deck was missing a model"))
@ -3209,10 +3241,10 @@ select id from fields where factId not in (select id from facts)""")
"update cardModels set allowEmptyAnswer = 1, typeAnswer = '' " "update cardModels set allowEmptyAnswer = 1, typeAnswer = '' "
"where allowEmptyAnswer is null or typeAnswer is null") "where allowEmptyAnswer is null or typeAnswer is null")
# fix tags # fix tags
self.updateProgress(_("Rebuilding tag cache...")) self.updateProgress()
self.updateCardTags() self.updateCardTags()
# make sure ordinals are correct # make sure ordinals are correct
self.updateProgress(_("Updating ordinals...")) self.updateProgress()
self.s.statement(""" self.s.statement("""
update fields set ordinal = (select ordinal from fieldModels update fields set ordinal = (select ordinal from fieldModels
where id = fieldModelId)""") where id = fieldModelId)""")
@ -3220,7 +3252,7 @@ where id = fieldModelId)""")
update cards set ordinal = (select ordinal from cardModels update cards set ordinal = (select ordinal from cardModels
where cards.cardModelId = cardModels.id)""") where cards.cardModelId = cardModels.id)""")
# fix problems with stripping html # fix problems with stripping html
self.updateProgress(_("Rebuilding QA cache...")) self.updateProgress()
fields = self.s.all("select id, value from fields") fields = self.s.all("select id, value from fields")
newFields = [] newFields = []
for (id, value) in fields: for (id, value) in fields:
@ -3228,11 +3260,14 @@ where cards.cardModelId = cardModels.id)""")
self.s.statements( self.s.statements(
"update fields set value=:value where id=:id", "update fields set value=:value where id=:id",
newFields) newFields)
# and field checksums
self.updateProgress()
self.updateAllFieldChecksums()
# regenerate question/answer cache # regenerate question/answer cache
for m in self.models: for m in self.models:
self.updateCardsFromModel(m, dirty=False) self.updateCardsFromModel(m, dirty=False)
# rebuild # rebuild
self.updateProgress(_("Rebuilding types...")) self.updateProgress()
self.rebuildTypes() self.rebuildTypes()
# since we can ensure the updated version will be propagated to # since we can ensure the updated version will be propagated to
# all locations, we can forget old tombstones # all locations, we can forget old tombstones
@ -3241,7 +3276,7 @@ where cards.cardModelId = cardModels.id)""")
# force a full sync # force a full sync
self.setSchemaModified() self.setSchemaModified()
# and finally, optimize # and finally, optimize
self.updateProgress(_("Optimizing...")) self.updateProgress()
self.optimize() self.optimize()
newSize = os.stat(self.path)[stat.ST_SIZE] newSize = os.stat(self.path)[stat.ST_SIZE]
save = (oldSize - newSize)/1024 save = (oldSize - newSize)/1024
@ -3521,8 +3556,16 @@ class DeckStorage(object):
metadata.create_all(engine) metadata.create_all(engine)
deck = DeckStorage._init(s) deck = DeckStorage._init(s)
else: else:
# add any possibly new tables if we're upgrading
ver = s.scalar("select version from decks limit 1") ver = s.scalar("select version from decks limit 1")
# add a checksum to fields
if ver < 71:
try:
s.execute(
"alter table fields add column chksum text "+
"not null default ''")
except:
pass
# add any possibly new tables if we're upgrading
if ver < DECK_VERSION: if ver < DECK_VERSION:
metadata.create_all(engine) metadata.create_all(engine)
deck = s.query(Deck).get(1) deck = s.query(Deck).get(1)
@ -3695,7 +3738,7 @@ create index if not exists ix_fields_factId on fields (factId)""")
deck.s.statement(""" deck.s.statement("""
create index if not exists ix_fields_fieldModelId on fields (fieldModelId)""") create index if not exists ix_fields_fieldModelId on fields (fieldModelId)""")
deck.s.statement(""" deck.s.statement("""
create index if not exists ix_fields_value on fields (value)""") create index if not exists ix_fields_chksum on fields (chksum)""")
# media # media
deck.s.statement(""" deck.s.statement("""
create unique index if not exists ix_media_filename on media (filename)""") create unique index if not exists ix_media_filename on media (filename)""")
@ -3860,9 +3903,17 @@ this message. (ERR-0101)""") % {
"revCardsDue", "revCardsRandom", "acqCardsRandom", "revCardsDue", "revCardsRandom", "acqCardsRandom",
"acqCardsOld", "acqCardsNew"): "acqCardsOld", "acqCardsNew"):
deck.s.statement("drop view if exists %s" % v) deck.s.statement("drop view if exists %s" % v)
deck.version = 70
deck.s.commit()
if deck.version < 71:
# remove the expensive value cache
deck.s.statement("drop index if exists ix_fields_value")
# add checksums and index
deck.updateAllFieldChecksums()
DeckStorage._addIndices(deck)
deck.s.execute("vacuum") deck.s.execute("vacuum")
deck.s.execute("analyze") deck.s.execute("analyze")
deck.version = 70 deck.version = 71
deck.s.commit() deck.s.commit()
# executing a pragma here is very slow on large decks, so we store # executing a pragma here is very slow on large decks, so we store
# our own record # our own record

View file

@ -12,7 +12,7 @@ import time
from anki.db import * from anki.db import *
from anki.errors import * from anki.errors import *
from anki.models import Model, FieldModel, fieldModelsTable from anki.models import Model, FieldModel, fieldModelsTable
from anki.utils import genID, stripHTMLMedia from anki.utils import genID, stripHTMLMedia, fieldChecksum
from anki.hooks import runHook from anki.hooks import runHook
# Fields in a fact # Fields in a fact
@ -25,7 +25,8 @@ fieldsTable = Table(
Column('fieldModelId', Integer, ForeignKey("fieldModels.id"), Column('fieldModelId', Integer, ForeignKey("fieldModels.id"),
nullable=False), nullable=False),
Column('ordinal', Integer, nullable=False), Column('ordinal', Integer, nullable=False),
Column('value', UnicodeText, nullable=False)) Column('value', UnicodeText, nullable=False),
Column('chksum', String, nullable=False, default=""))
class Field(object): class Field(object):
"A field in a fact." "A field in a fact."
@ -90,9 +91,14 @@ class Fact(object):
def __setitem__(self, key, value): def __setitem__(self, key, value):
try: try:
[f for f in self.fields if f.name == key][0].value = value item = [f for f in self.fields if f.name == key][0]
except IndexError: except IndexError:
raise KeyError raise KeyError
item.value = value
if item.fieldModel.unique:
item.chksum = fieldChecksum(value)
else:
item.chksum = ""
def get(self, key, default): def get(self, key, default):
try: try:
@ -121,10 +127,11 @@ class Fact(object):
if not field.fieldModel.unique: if not field.fieldModel.unique:
return True return True
req = ("select value from fields " req = ("select value from fields "
"where fieldModelId = :fmid and value = :val") "where fieldModelId = :fmid and value = :val and chksum = :chk")
if field.id: if field.id:
req += " and id != %s" % field.id req += " and id != %s" % field.id
return not s.scalar(req, val=field.value, fmid=field.fieldModel.id) return not s.scalar(req, val=field.value, fmid=field.fieldModel.id,
chk=fieldChecksum(field.value))
def focusLost(self, field): def focusLost(self, field):
runHook('fact.focusLost', self, field) runHook('fact.focusLost', self, field)

View file

@ -17,7 +17,7 @@ import time
from anki.cards import cardsTable from anki.cards import cardsTable
from anki.facts import factsTable, fieldsTable from anki.facts import factsTable, fieldsTable
from anki.lang import _ from anki.lang import _
from anki.utils import genID, canonifyTags from anki.utils import genID, canonifyTags, fieldChecksum
from anki.utils import canonifyTags, ids2str from anki.utils import canonifyTags, ids2str
from anki.errors import * from anki.errors import *
from anki.deck import NEW_CARDS_RANDOM from anki.deck import NEW_CARDS_RANDOM
@ -122,11 +122,12 @@ and value != ''""",
continue continue
data = [{'fid': fid, data = [{'fid': fid,
'fmid': fm.id, 'fmid': fm.id,
'v': c.fields[index]} 'v': c.fields[index],
'chk': self.maybeChecksum(c.fields[index], fm.unique)}
for (fid, c) in upcards] for (fid, c) in upcards]
self.deck.s.execute(""" self.deck.s.execute("""
update fields set value = :v where factId = :fid and fieldModelId = :fmid""", update fields set value = :v, chksum = :chk where factId = :fid
data) and fieldModelId = :fmid""", data)
# update tags # update tags
self.deck.updateProgress() self.deck.updateProgress()
if tagsIdx is not None: if tagsIdx is not None:
@ -144,7 +145,7 @@ update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
self.deck.updateCardTags(cids) self.deck.updateCardTags(cids)
self.deck.updateProgress() self.deck.updateProgress()
self.deck.updateCardsFromFactIds(fids) self.deck.updateCardsFromFactIds(fids)
self.total = len(fids) self.total = len(cards)
self.deck.setModified() self.deck.setModified()
self.deck.finishProgress() self.deck.finishProgress()
@ -152,6 +153,11 @@ update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
"The number of fields." "The number of fields."
return 0 return 0
def maybeChecksum(self, data, unique):
    "Checksum of data when unique is set, otherwise an empty string."
    return fieldChecksum(data) if unique else ""
def foreignCards(self): def foreignCards(self):
"Return a list of foreign cards for importing." "Return a list of foreign cards for importing."
assert 0 assert 0
@ -254,7 +260,11 @@ where factId in (%s)""" % ",".join([str(s) for s in factIds]))
'ordinal': fm.ordinal, 'ordinal': fm.ordinal,
'id': genID(), 'id': genID(),
'value': (index is not None and 'value': (index is not None and
cards[m].fields[index] or u"")} cards[m].fields[index] or u""),
'chksum': self.maybeChecksum(
index is not None and
cards[m].fields[index] or u"", fm.unique)
}
for m in range(len(cards))] for m in range(len(cards))]
self.deck.s.execute(fieldsTable.insert(), self.deck.s.execute(fieldsTable.insert(),
data) data)

View file

@ -43,6 +43,7 @@ fieldModelsTable = Table(
# reused as RTL marker # reused as RTL marker
Column('features', UnicodeText, nullable=False, default=u""), Column('features', UnicodeText, nullable=False, default=u""),
Column('required', Boolean, nullable=False, default=True), Column('required', Boolean, nullable=False, default=True),
# if code changes this, it should call deck.updateFieldChecksums()
Column('unique', Boolean, nullable=False, default=True), # sqlite keyword Column('unique', Boolean, nullable=False, default=True), # sqlite keyword
Column('numeric', Boolean, nullable=False, default=False), Column('numeric', Boolean, nullable=False, default=False),
# display # display

View file

@ -430,7 +430,7 @@ class SyncTools(object):
select id, modelId, created, %s, tags, spaceUntil, lastCardId from facts select id, modelId, created, %s, tags, spaceUntil, lastCardId from facts
where id in %s""" % (modified, factIds))), where id in %s""" % (modified, factIds))),
'fields': self.realLists(self.deck.s.all(""" 'fields': self.realLists(self.deck.s.all("""
select id, factId, fieldModelId, ordinal, value from fields select id, factId, fieldModelId, ordinal, value, chksum from fields
where factId in %s""" % factIds)) where factId in %s""" % factIds))
} }
@ -455,12 +455,17 @@ insert or replace into facts
values values
(:id, :modelId, :created, :modified, :tags, :spaceUntil, :lastCardId)""", dlist) (:id, :modelId, :created, :modified, :tags, :spaceUntil, :lastCardId)""", dlist)
# now fields # now fields
# Pick the checksum for one incoming field row f: use the entry at index 5
# when the row has one, otherwise compute it from the value at index 4.
# NOTE(review): the dict built right after this helper reads f[5] directly
# and does not appear to call chksum(f) -- confirm whether 'chksum' was meant
# to be chksum(f), since a row with only five entries would otherwise raise
# IndexError.
# NOTE(review): the only fieldChecksum visible in this change is the function
# in anki.utils -- confirm that the deck object really exposes fieldChecksum().
def chksum(f):
if len(f) > 5:
return f[5]
return self.deck.fieldChecksum(f[4])
dlist = [{ dlist = [{
'id': f[0], 'id': f[0],
'factId': f[1], 'factId': f[1],
'fieldModelId': f[2], 'fieldModelId': f[2],
'ordinal': f[3], 'ordinal': f[3],
'value': f[4] 'value': f[4],
'chksum': f[5]
} for f in fields] } for f in fields]
# delete local fields since ids may have changed # delete local fields since ids may have changed
self.deck.s.execute( self.deck.s.execute(
@ -469,9 +474,9 @@ values
# then update # then update
self.deck.s.execute(""" self.deck.s.execute("""
insert into fields insert into fields
(id, factId, fieldModelId, ordinal, value) (id, factId, fieldModelId, ordinal, value, chksum)
values values
(:id, :factId, :fieldModelId, :ordinal, :value)""", dlist) (:id, :factId, :fieldModelId, :ordinal, :value, :chksum)""", dlist)
self.deck.s.statement( self.deck.s.statement(
"delete from factsDeleted where factId in %s" % "delete from factsDeleted where factId in %s" %
ids2str([f[0] for f in facts])) ids2str([f[0] for f in facts]))

View file

@ -277,6 +277,12 @@ def deleteTags(tagstr, tags):
def checksum(data): def checksum(data):
return md5(data).hexdigest() return md5(data).hexdigest()
def fieldChecksum(data):
    "First 8 hex digits of the md5 of data encoded as utf-8; '' if data is empty."
    if data:
        return checksum(data.encode("utf-8"))[:8]
    # empty values never get a checksum
    return ""
def call(argv, wait=True, **kwargs): def call(argv, wait=True, **kwargs):
try: try:
o = subprocess.Popen(argv, **kwargs) o = subprocess.Popen(argv, **kwargs)

View file

@ -129,6 +129,41 @@ def test_factAddDelete():
# and the second should clear the fact # and the second should clear the fact
deck.deleteCard(id2) deck.deleteCard(id2)
def test_fieldChecksum():
    "Checksums follow the field value and the field model's unique flag."
    deck = DeckStorage.Deck()
    deck.addModel(BasicModel())
    fact = deck.newFact()
    fact['Front'] = u"new"
    fact['Back'] = u"new2"
    deck.addFact(fact)
    # the front field is stored with a checksum of its value
    (frontId, frontSum) = deck.s.first(
        "select id, chksum from fields where value = 'new'")
    assert frontSum == "22af645d"
    # an empty value means an empty checksum
    fact['Front'] = u""
    deck.s.flush()
    assert deck.s.scalar(
        "select chksum from fields where id = :id", id=frontId) == ""
    # a new value means a new checksum
    fact['Front'] = u"newx"
    deck.s.flush()
    assert deck.s.scalar(
        "select chksum from fields where id = :id", id=frontId) == "4b0e5a4c"
    # the back field is not set to be unique, so it has no checksum
    (backId, backSum) = deck.s.first(
        "select id, chksum from fields where value = 'new2'")
    assert backSum == ""
    # switching unique on for the back field gives it a checksum
    backModel = fact.model.fieldModels[1]
    backModel.unique = True
    deck.updateFieldChecksums(backModel.id)
    assert deck.s.scalar(
        "select chksum from fields where id = :id", id=backId) == "82f2ec5f"
    # and switching it off again blanks the checksum
    backModel.unique = False
    deck.updateFieldChecksums(backModel.id)
    assert deck.s.scalar(
        "select chksum from fields where id = :id", id=backId) == ""
def test_modelAddDelete(): def test_modelAddDelete():
deck = DeckStorage.Deck() deck = DeckStorage.Deck()
deck.addModel(BasicModel()) deck.addModel(BasicModel())