mirror of
https://github.com/ankitects/anki.git
synced 2025-12-10 13:26:56 -05:00
use a checksum for field values; fixed import->update number
Previously we had an index on the value field, which was very expensive for long fields. Instead we use a separate column and take the first 8 characters of the field value's md5sum, and index that. In decks with lots of text in fields, it can cut the deck size by 30% or more, and many decks improve by 10-20%. Decks with only a few characters in fields may increase in size slightly, but this is offset by the fact that we only generate a checksum for fields that have uniqueness checking on. Also, fixed import->update reporting the total # of available facts instead of the number of facts that were imported.
This commit is contained in:
parent
da48eb1e55
commit
4302306fe9
7 changed files with 147 additions and 32 deletions
85
anki/deck.py
85
anki/deck.py
|
|
@ -16,7 +16,7 @@ from anki.lang import _, ngettext
|
|||
from anki.errors import DeckAccessError
|
||||
from anki.stdmodels import BasicModel
|
||||
from anki.utils import parseTags, tidyHTML, genID, ids2str, hexifyID, \
|
||||
canonifyTags, joinTags, addTags, checksum
|
||||
canonifyTags, joinTags, addTags, checksum, fieldChecksum
|
||||
from anki.history import CardHistoryEntry
|
||||
from anki.models import Model, CardModel, formatQA
|
||||
from anki.stats import dailyStats, globalStats, genToday
|
||||
|
|
@ -56,7 +56,7 @@ SEARCH_FIELD = 6
|
|||
SEARCH_FIELD_EXISTS = 7
|
||||
SEARCH_QA = 8
|
||||
SEARCH_PHRASE_WB = 9
|
||||
DECK_VERSION = 70
|
||||
DECK_VERSION = 71
|
||||
|
||||
deckVarsTable = Table(
|
||||
'deckVars', metadata,
|
||||
|
|
@ -393,6 +393,32 @@ type = (case
|
|||
when type >= 0 then relativeDelay else relativeDelay - 3 end)
|
||||
""")
|
||||
|
||||
def updateAllFieldChecksums(self):
    "Blank every field checksum, then rebuild them one field model at a time."
    # start from a clean slate
    self.s.statement("update fields set chksum = ''")
    # updateFieldChecksums() decides per field model whether a checksum
    # is stored; the generator keeps the lookups lazy and in model order
    fieldModelIds = (fieldModel.id
                     for model in self.models
                     for fieldModel in model.fieldModels)
    for fmid in fieldModelIds:
        self.updateFieldChecksums(fmid)
|
||||
|
||||
def updateFieldChecksums(self, fmid):
    """Recompute the stored checksum of every field using field model FMID.

    Pending changes are flushed and the schema is marked modified first.
    If the field model's "unique" flag is set, each of its fields gets
    fieldChecksum(value); otherwise the checksum of its fields is blanked.
    """
    self.s.flush()
    self.setSchemaModified()
    unique = self.s.scalar(
        "select \"unique\" from fieldModels where id = :id", id=fmid)
    if not unique:
        self.s.statement(
            "update fields set chksum = '' where fieldModelId=:id",
            id=fmid)
        return
    rows = self.s.all(
        "select id, value from fields where fieldModelId = :id",
        id=fmid)
    updates = [{'id': fieldId, 'chk': fieldChecksum(value)}
               for (fieldId, value) in rows]
    # nothing to write when no field uses this model; find & replace
    # guards its bulk update the same way
    if updates:
        self.s.statements(
            "update fields set chksum = :chk where id = :id", updates)
|
||||
|
||||
def _cardQueue(self, card):
|
||||
return self.cardType(card)
|
||||
|
||||
|
|
@ -2697,10 +2723,15 @@ select id from facts where spaceUntil like :_ff_%d escape '\\'""" % c
|
|||
for (id, fid, val) in rows
|
||||
if val.find(src) != -1]
|
||||
# update
|
||||
self.s.statements(
|
||||
'update fields set value = :val where id = :id', modded)
|
||||
self.updateCardQACacheFromIds([f['fid'] for f in modded],
|
||||
if modded:
|
||||
self.s.statements(
|
||||
'update fields set value = :val where id = :id', modded)
|
||||
self.updateCardQACacheFromIds([f['fid'] for f in modded],
|
||||
type="facts")
|
||||
if field:
|
||||
self.updateFieldChecksums(field)
|
||||
else:
|
||||
self.updateAllFieldChecksums()
|
||||
return len(set([f['fid'] for f in modded]))
|
||||
|
||||
# Find duplicates
|
||||
|
|
@ -2989,7 +3020,8 @@ Return new path, relative to media dir."""
|
|||
self.modified = newTime or time.time()
|
||||
|
||||
def setSchemaModified(self):
    "Store the current time in the schemaMod deck variable."
    # we might be called during an upgrade, so avoid bumping modtime
    self.setVar("schemaMod", time.time(), mod=False)
|
||||
|
||||
def flushMod(self):
|
||||
"Mark modified and flush to DB."
|
||||
|
|
@ -3113,10 +3145,10 @@ where id = :id""", fid=f.id, cmid=m.cardModels[0].id, id=id)
|
|||
if quick:
|
||||
num = 4
|
||||
else:
|
||||
num = 9
|
||||
num = 10
|
||||
oldSize = os.stat(self.path)[stat.ST_SIZE]
|
||||
self.startProgress(num)
|
||||
self.updateProgress(_("Checking integrity..."))
|
||||
self.updateProgress(_("Checking database..."))
|
||||
if self.s.scalar("pragma integrity_check") != "ok":
|
||||
self.finishProgress()
|
||||
return _("Database file is damaged.\n"
|
||||
|
|
@ -3125,7 +3157,7 @@ where id = :id""", fid=f.id, cmid=m.cardModels[0].id, id=id)
|
|||
self.updateProgress()
|
||||
DeckStorage._addIndices(self)
|
||||
# does the user have a model?
|
||||
self.updateProgress(_("Checking schema..."))
|
||||
self.updateProgress()
|
||||
if not self.s.scalar("select count(id) from models"):
|
||||
self.addModel(BasicModel())
|
||||
problems.append(_("Deck was missing a model"))
|
||||
|
|
@ -3209,10 +3241,10 @@ select id from fields where factId not in (select id from facts)""")
|
|||
"update cardModels set allowEmptyAnswer = 1, typeAnswer = '' "
|
||||
"where allowEmptyAnswer is null or typeAnswer is null")
|
||||
# fix tags
|
||||
self.updateProgress(_("Rebuilding tag cache..."))
|
||||
self.updateProgress()
|
||||
self.updateCardTags()
|
||||
# make sure ordinals are correct
|
||||
self.updateProgress(_("Updating ordinals..."))
|
||||
self.updateProgress()
|
||||
self.s.statement("""
|
||||
update fields set ordinal = (select ordinal from fieldModels
|
||||
where id = fieldModelId)""")
|
||||
|
|
@ -3220,7 +3252,7 @@ where id = fieldModelId)""")
|
|||
update cards set ordinal = (select ordinal from cardModels
|
||||
where cards.cardModelId = cardModels.id)""")
|
||||
# fix problems with stripping html
|
||||
self.updateProgress(_("Rebuilding QA cache..."))
|
||||
self.updateProgress()
|
||||
fields = self.s.all("select id, value from fields")
|
||||
newFields = []
|
||||
for (id, value) in fields:
|
||||
|
|
@ -3228,11 +3260,14 @@ where cards.cardModelId = cardModels.id)""")
|
|||
self.s.statements(
|
||||
"update fields set value=:value where id=:id",
|
||||
newFields)
|
||||
# and field checksums
|
||||
self.updateProgress()
|
||||
self.updateAllFieldChecksums()
|
||||
# regenerate question/answer cache
|
||||
for m in self.models:
|
||||
self.updateCardsFromModel(m, dirty=False)
|
||||
# rebuild
|
||||
self.updateProgress(_("Rebuilding types..."))
|
||||
self.updateProgress()
|
||||
self.rebuildTypes()
|
||||
# since we can ensure the updated version will be propagated to
|
||||
# all locations, we can forget old tombstones
|
||||
|
|
@ -3241,7 +3276,7 @@ where cards.cardModelId = cardModels.id)""")
|
|||
# force a full sync
|
||||
self.setSchemaModified()
|
||||
# and finally, optimize
|
||||
self.updateProgress(_("Optimizing..."))
|
||||
self.updateProgress()
|
||||
self.optimize()
|
||||
newSize = os.stat(self.path)[stat.ST_SIZE]
|
||||
save = (oldSize - newSize)/1024
|
||||
|
|
@ -3521,8 +3556,16 @@ class DeckStorage(object):
|
|||
metadata.create_all(engine)
|
||||
deck = DeckStorage._init(s)
|
||||
else:
|
||||
# add any possibly new tables if we're upgrading
|
||||
ver = s.scalar("select version from decks limit 1")
|
||||
# add a checksum to fields
|
||||
if ver < 71:
|
||||
try:
|
||||
s.execute(
|
||||
"alter table fields add column chksum text "+
|
||||
"not null default ''")
|
||||
except:
|
||||
pass
|
||||
# add any possibly new tables if we're upgrading
|
||||
if ver < DECK_VERSION:
|
||||
metadata.create_all(engine)
|
||||
deck = s.query(Deck).get(1)
|
||||
|
|
@ -3695,7 +3738,7 @@ create index if not exists ix_fields_factId on fields (factId)""")
|
|||
deck.s.statement("""
|
||||
create index if not exists ix_fields_fieldModelId on fields (fieldModelId)""")
|
||||
deck.s.statement("""
|
||||
create index if not exists ix_fields_value on fields (value)""")
|
||||
create index if not exists ix_fields_chksum on fields (chksum)""")
|
||||
# media
|
||||
deck.s.statement("""
|
||||
create unique index if not exists ix_media_filename on media (filename)""")
|
||||
|
|
@ -3860,9 +3903,17 @@ this message. (ERR-0101)""") % {
|
|||
"revCardsDue", "revCardsRandom", "acqCardsRandom",
|
||||
"acqCardsOld", "acqCardsNew"):
|
||||
deck.s.statement("drop view if exists %s" % v)
|
||||
deck.version = 70
|
||||
deck.s.commit()
|
||||
if deck.version < 71:
|
||||
# remove the expensive value cache
|
||||
deck.s.statement("drop index if exists ix_fields_value")
|
||||
# add checksums and index
|
||||
deck.updateAllFieldChecksums()
|
||||
DeckStorage._addIndices(deck)
|
||||
deck.s.execute("vacuum")
|
||||
deck.s.execute("analyze")
|
||||
deck.version = 70
|
||||
deck.version = 71
|
||||
deck.s.commit()
|
||||
# executing a pragma here is very slow on large decks, so we store
|
||||
# our own record
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import time
|
|||
from anki.db import *
|
||||
from anki.errors import *
|
||||
from anki.models import Model, FieldModel, fieldModelsTable
|
||||
from anki.utils import genID, stripHTMLMedia
|
||||
from anki.utils import genID, stripHTMLMedia, fieldChecksum
|
||||
from anki.hooks import runHook
|
||||
|
||||
# Fields in a fact
|
||||
|
|
@ -25,7 +25,8 @@ fieldsTable = Table(
|
|||
Column('fieldModelId', Integer, ForeignKey("fieldModels.id"),
|
||||
nullable=False),
|
||||
Column('ordinal', Integer, nullable=False),
|
||||
Column('value', UnicodeText, nullable=False))
|
||||
Column('value', UnicodeText, nullable=False),
|
||||
Column('chksum', String, nullable=False, default=""))
|
||||
|
||||
class Field(object):
|
||||
"A field in a fact."
|
||||
|
|
@ -90,9 +91,14 @@ class Fact(object):
|
|||
|
||||
def __setitem__(self, key, value):
    """Set the value of the field named KEY and refresh its checksum.

    Raises KeyError if the fact has no field with that name.
    """
    # single pass over the fields; the first one with a matching name wins
    for item in self.fields:
        if item.name == key:
            break
    else:
        raise KeyError(key)
    item.value = value
    # only fields whose field model is flagged unique carry a checksum
    if item.fieldModel.unique:
        item.chksum = fieldChecksum(value)
    else:
        item.chksum = ""
|
||||
|
||||
def get(self, key, default):
|
||||
try:
|
||||
|
|
@ -121,10 +127,11 @@ class Fact(object):
|
|||
if not field.fieldModel.unique:
|
||||
return True
|
||||
req = ("select value from fields "
|
||||
"where fieldModelId = :fmid and value = :val")
|
||||
"where fieldModelId = :fmid and value = :val and chksum = :chk")
|
||||
if field.id:
|
||||
req += " and id != %s" % field.id
|
||||
return not s.scalar(req, val=field.value, fmid=field.fieldModel.id)
|
||||
return not s.scalar(req, val=field.value, fmid=field.fieldModel.id,
|
||||
chk=fieldChecksum(field.value))
|
||||
|
||||
def focusLost(self, field):
|
||||
runHook('fact.focusLost', self, field)
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ import time
|
|||
from anki.cards import cardsTable
|
||||
from anki.facts import factsTable, fieldsTable
|
||||
from anki.lang import _
|
||||
from anki.utils import genID, canonifyTags
|
||||
from anki.utils import genID, canonifyTags, fieldChecksum
|
||||
from anki.utils import canonifyTags, ids2str
|
||||
from anki.errors import *
|
||||
from anki.deck import NEW_CARDS_RANDOM
|
||||
|
|
@ -122,11 +122,12 @@ and value != ''""",
|
|||
continue
|
||||
data = [{'fid': fid,
|
||||
'fmid': fm.id,
|
||||
'v': c.fields[index]}
|
||||
'v': c.fields[index],
|
||||
'chk': self.maybeChecksum(c.fields[index], fm.unique)}
|
||||
for (fid, c) in upcards]
|
||||
self.deck.s.execute("""
|
||||
update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
|
||||
data)
|
||||
update fields set value = :v, chksum = :chk where factId = :fid
|
||||
and fieldModelId = :fmid""", data)
|
||||
# update tags
|
||||
self.deck.updateProgress()
|
||||
if tagsIdx is not None:
|
||||
|
|
@ -144,7 +145,7 @@ update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
|
|||
self.deck.updateCardTags(cids)
|
||||
self.deck.updateProgress()
|
||||
self.deck.updateCardsFromFactIds(fids)
|
||||
self.total = len(fids)
|
||||
self.total = len(cards)
|
||||
self.deck.setModified()
|
||||
self.deck.finishProgress()
|
||||
|
||||
|
|
@ -152,6 +153,11 @@ update fields set value = :v where factId = :fid and fieldModelId = :fmid""",
|
|||
"The number of fields."
|
||||
return 0
|
||||
|
||||
def maybeChecksum(self, data, unique):
    "Return the field checksum of DATA when UNIQUE is set, else ''."
    if unique:
        return fieldChecksum(data)
    return ""
|
||||
|
||||
def foreignCards(self):
|
||||
"Return a list of foreign cards for importing."
|
||||
assert 0
|
||||
|
|
@ -254,7 +260,11 @@ where factId in (%s)""" % ",".join([str(s) for s in factIds]))
|
|||
'ordinal': fm.ordinal,
|
||||
'id': genID(),
|
||||
'value': (index is not None and
|
||||
cards[m].fields[index] or u"")}
|
||||
cards[m].fields[index] or u""),
|
||||
'chksum': self.maybeChecksum(
|
||||
index is not None and
|
||||
cards[m].fields[index] or u"", fm.unique)
|
||||
}
|
||||
for m in range(len(cards))]
|
||||
self.deck.s.execute(fieldsTable.insert(),
|
||||
data)
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ fieldModelsTable = Table(
|
|||
# reused as RTL marker
|
||||
Column('features', UnicodeText, nullable=False, default=u""),
|
||||
Column('required', Boolean, nullable=False, default=True),
|
||||
# if code changes this, it should call deck.updateFieldChecksums()
|
||||
Column('unique', Boolean, nullable=False, default=True), # sqlite keyword
|
||||
Column('numeric', Boolean, nullable=False, default=False),
|
||||
# display
|
||||
|
|
|
|||
13
anki/sync.py
13
anki/sync.py
|
|
@ -430,7 +430,7 @@ class SyncTools(object):
|
|||
select id, modelId, created, %s, tags, spaceUntil, lastCardId from facts
|
||||
where id in %s""" % (modified, factIds))),
|
||||
'fields': self.realLists(self.deck.s.all("""
|
||||
select id, factId, fieldModelId, ordinal, value from fields
|
||||
select id, factId, fieldModelId, ordinal, value, chksum from fields
|
||||
where factId in %s""" % factIds))
|
||||
}
|
||||
|
||||
|
|
@ -455,12 +455,17 @@ insert or replace into facts
|
|||
values
|
||||
(:id, :modelId, :created, :modified, :tags, :spaceUntil, :lastCardId)""", dlist)
|
||||
# now fields
|
||||
def chksum(f):
    "Checksum for field row F: the sixth value if sent, else computed."
    # rows that carry only five values have no chksum entry
    if len(f) > 5:
        return f[5]
    # fieldChecksum is a function in anki.utils, not a method of the deck
    from anki.utils import fieldChecksum
    return fieldChecksum(f[4])
dlist = [{
    'id': f[0],
    'factId': f[1],
    'fieldModelId': f[2],
    'ordinal': f[3],
    'value': f[4],
    # go through the helper so five-value rows do not raise IndexError
    'chksum': chksum(f)
    } for f in fields]
|
||||
# delete local fields since ids may have changed
|
||||
self.deck.s.execute(
|
||||
|
|
@ -469,9 +474,9 @@ values
|
|||
# then update
|
||||
self.deck.s.execute("""
|
||||
insert into fields
|
||||
(id, factId, fieldModelId, ordinal, value)
|
||||
(id, factId, fieldModelId, ordinal, value, chksum)
|
||||
values
|
||||
(:id, :factId, :fieldModelId, :ordinal, :value)""", dlist)
|
||||
(:id, :factId, :fieldModelId, :ordinal, :value, :chksum)""", dlist)
|
||||
self.deck.s.statement(
|
||||
"delete from factsDeleted where factId in %s" %
|
||||
ids2str([f[0] for f in facts]))
|
||||
|
|
|
|||
|
|
@ -277,6 +277,12 @@ def deleteTags(tagstr, tags):
|
|||
def checksum(data):
    "Return the hexadecimal md5 digest of DATA."
    digest = md5(data)
    return digest.hexdigest()
|
||||
|
||||
def fieldChecksum(data):
    "First 8 hex digits of the md5 of DATA encoded as utf-8; '' if empty."
    if data:
        # same digest checksum() produces, truncated to 8 characters
        return md5(data.encode("utf-8")).hexdigest()[:8]
    return ""
|
||||
|
||||
def call(argv, wait=True, **kwargs):
|
||||
try:
|
||||
o = subprocess.Popen(argv, **kwargs)
|
||||
|
|
|
|||
|
|
@ -129,6 +129,41 @@ def test_factAddDelete():
|
|||
# and the second should clear the fact
|
||||
deck.deleteCard(id2)
|
||||
|
||||
def test_fieldChecksum():
    "Stored checksums follow the field value and the unique flag."
    deck = DeckStorage.Deck()
    deck.addModel(BasicModel())
    fact = deck.newFact()
    fact['Front'] = u"new"
    fact['Back'] = u"new2"
    deck.addFact(fact)

    def storedChecksum(fieldId):
        # read the checksum column straight from the database
        return deck.s.scalar(
            "select chksum from fields where id = :id", id=fieldId)

    (frontId, frontSum) = deck.s.first(
        "select id, chksum from fields where value = 'new'")
    assert frontSum == "22af645d"
    # empty field should have no checksum
    fact['Front'] = u""
    deck.s.flush()
    assert storedChecksum(frontId) == ""
    # changing the value should change the checksum
    fact['Front'] = u"newx"
    deck.s.flush()
    assert storedChecksum(frontId) == "4b0e5a4c"
    # back should have no checksum, because it's not set to be unique
    (backId, backSum) = deck.s.first(
        "select id, chksum from fields where value = 'new2'")
    assert backSum == ""
    # if we turn on unique, it should get a checksum
    backModel = fact.model.fieldModels[1]
    backModel.unique = True
    deck.updateFieldChecksums(backModel.id)
    assert storedChecksum(backId) == "82f2ec5f"
    # and turning it off should zero the checksum again
    backModel.unique = False
    deck.updateFieldChecksums(backModel.id)
    assert storedChecksum(backId) == ""
|
||||
|
||||
def test_modelAddDelete():
|
||||
deck = DeckStorage.Deck()
|
||||
deck.addModel(BasicModel())
|
||||
|
|
|
|||
Loading…
Reference in a new issue