initial work on sync refactor

Ported the sync code to the latest libanki structure. Key points:

No summary:

The old style had each side fetch ids and mod times, and required the client to
diff them and then request or bundle up the appropriate objects. Now each side
sends all of its changed objects, and it's the responsibility of the other side
to decide what needs to be merged and what needs to be discarded. This lets us
skip the separate summary step, which saves scanning the tables twice, and
reduces the number of server requests from 4 to 3.
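
In outline, the new exchange looks something like the sketch below. The method
names (changesSince, applyChanges, mergeChanges, finish) are illustrative only,
not the actual API:

    def sync(client, server, lastSync):
        # each side gathers everything it has modified since the last sync
        localChanges = client.changesSince(lastSync)
        # one request: push our changes, receive the server's in the reply
        remoteChanges = server.applyChanges(localChanges)
        # the receiver decides per object whether to merge or discard,
        # typically by comparing mod times
        client.mergeChanges(remoteChanges)
        # final request: agree on the new sync point
        server.finish()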

Schema changes:

Certain operations that are difficult to merge (such as changing the number of
fields in a model, or deleting models or groups) result in a full sync. The
user is warned in the GUI before such a schema-changing operation executes.
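
The check itself is cheap: schema-changing operations bump a schema
modification time (scm), and the two sides compare it before attempting an
incremental sync. A minimal sketch; the helper name is hypothetical, and only
the scm field and the "fullSync" result appear in the code below:

    def _checkSchema(localDeck, remoteDeck):
        # if the schema mod times disagree, incremental merging isn't
        # safe, so fall back to sending the whole deck
        if localDeck.scm != remoteDeck.scm:
            return "fullSync"
        return "ok"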

Sync size:

For now, we don't try to deal with large incremental syncs. Because the cards,
facts and revlog can be large in memory (hundreds of megabytes in some cases),
they would have to be chunked for the benefit of devices with limited memory.

Currently findChanges() uses the full fact/card objects which we're planning
to send to the server. It could be rewritten to fetch a summary (just the id,
mod & rep columns), which would save some memory, and then compare against
blocks of a few hundred remote objects at a time; a sketch follows the list
below. However, it's a bit more complicated than that:

- If the local summary is huge it could exceed memory limits. Without a local
  summary we'd have to query the db for each record, which could be a lot
  slower.

- We currently accumulate a list of remote records we need to add locally.
  This list also has the potential to get too big. We would need to
  periodically commit the changes as we accumulate them.

- Merging a large number of changes is also potentially slow on mobile
  devices.
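
To make the block-wise idea concrete, here is a rough sketch of a
summary-based merge. Everything in it is hypothetical: the helper name, the
block size, the row layout, and the assumption that comparing (id, mod) pairs
is enough to decide a merge:

    def mergeRemoteFacts(deck, remoteFacts, blockSize=200):
        # local summary: fact id -> mod time; much smaller than full rows
        localMod = dict(deck.db.all("select id, mod from facts"))
        pending = []
        for row in remoteFacts:
            # assume each row starts with (id, mod, ...)
            fid, mod = row[0], row[1]
            if fid not in localMod or mod > localMod[fid]:
                pending.append(row)
            if len(pending) >= blockSize:
                # commit periodically so the accumulated list stays small
                deck.db.executemany(
                    "insert or replace into facts values (?,?,?,?,?,?,?)",
                    pending)
                pending = []
        if pending:
            deck.db.executemany(
                "insert or replace into facts values (?,?,?,?,?,?,?)", pending)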

Given that certain schema-changing operations require a full sync anyway, I
think it's probably best to concentrate on a chunked full sync for now
instead: provided the user syncs periodically, the size limits that force a
full sync should rarely be hit except after bulk editing operations.
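
A chunked full sync could then stream each large table in fixed-size pieces,
roughly as below. The chunk size and the sendChunk/sendDone transport calls
are made up for the sketch:

    CHUNK_ROWS = 5000  # arbitrary; sized for low-memory devices

    def fullSyncUpload(deck, conn):
        # stream each large table in pieces instead of loading it whole
        for table in ("facts", "cards", "revlog"):
            offset = 0
            while True:
                rows = deck.db.all(
                    "select * from %s limit %d offset %d"
                    % (table, CHUNK_ROWS, offset))
                if not rows:
                    break
                conn.sendChunk(table, rows)
                offset += CHUNK_ROWS
        conn.sendDone()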

Chunked partial syncing should be possible to add in the future without any
changes to the deck format.

Still to do:
- deck conf merging
- full syncing
- new http proxy
Damien Elmes 2011-09-08 12:47:53 +09:00
parent 7034c1ed29
commit 362ae3eee2
9 changed files with 474 additions and 916 deletions


@@ -11,3 +11,7 @@ class AnkiError(Exception):
         if self.data:
             m += ": %s" % repr(self.data)
         return m
+
+
+class SyncTooLarge(Exception):
+    pass


@@ -38,8 +38,8 @@ select mid, gid, mod, tags, flds, data from facts where id = ?""", self.id)
         self._model = self.deck.models.get(self.mid)
         self._fmap = self.deck.models.fieldMap(self._model)
 
-    def flush(self):
-        self.mod = intTime()
+    def flush(self, mod=None):
+        self.mod = mod if mod else intTime()
         sfld = stripHTML(self.fields[self.deck.models.sortIdx(self._model)])
         tags = self.stringTags()
         res = self.deck.db.execute("""


@@ -72,9 +72,10 @@ class GroupManager(object):
         self.gconf = simplejson.loads(gconf)
         self.changed = False
 
-    def save(self, g):
+    def save(self, g=None):
         "Can be called with either a group or a group configuration."
-        g['mod'] = intTime()
+        if g:
+            g['mod'] = intTime()
         self.changed = True
 
     def flush(self):
@@ -126,6 +127,10 @@ class GroupManager(object):
         "A list of all groups."
         return self.groups.values()
 
+    def allConf(self):
+        "A list of all group config."
+        return self.gconf.values()
+
     def _ensureParents(self, name):
         path = name.split("::")
         s = ""
@@ -146,12 +151,24 @@ class GroupManager(object):
         return self.gconf[str(self.groups[str(gid)]['conf'])]
 
     def get(self, gid):
-        return self.groups[str(gid)]
+        id = str(gid)
+        if id in self.groups:
+            return self.groups[id]
 
     def setGroup(self, cids, gid):
         self.db.execute(
             "update cards set gid = ? where id in "+ids2str(cids), gid)
 
+    def update(self, g):
+        "Add or update an existing group. Used for syncing and merging."
+        self.groups[str(g['id'])] = g
+        # mark registry changed, but don't bump mod time
+        self.save()
+
+    def updateConf(self, g):
+        self.gconf[str(g['id'])] = g
+        self.save()
+
     # Group selection
     #############################################################


@@ -19,6 +19,9 @@ from anki.utils import fieldChecksum, ids2str
 from anki.errors import *
 
 #from anki.deck import NEW_CARDS_RANDOM
 
+# FIXME: when importing an anki file, if any revlog entries are less than the
+# last sync time, we need to bump the deck schema
+
 # Base importer
 ##########################################################################


@@ -4,7 +4,6 @@
 from anki import Deck
 from anki.importing import Importer
-from anki.sync import SyncClient, SyncServer, copyLocalMedia
 from anki.lang import _
 from anki.utils import ids2str
 
 #from anki.deck import NEW_CARDS_RANDOM


@@ -98,8 +98,10 @@ class ModelManager(object):
         self.deck.groups.top()['curModel'] = m['id']
 
     def get(self, id):
-        "Get model with ID."
-        return self.models[str(id)]
+        "Get model with ID, or None."
+        id = str(id)
+        if id in self.models:
+            return self.models[id]
 
     def all(self):
         "Get all models."
@@ -139,11 +141,16 @@ select id from cards where fid in (select id from facts where mid = ?)""",
 
     def _add(self, m):
         self._setID(m)
         self.models[m['id']] = m
-        self.save(m)
+        self.update(m)
         self.setCurrent(m)
         return m
 
+    def update(self, m):
+        "Add or update an existing model. Used for syncing and merging."
+        self.models[str(m['id'])] = m
+        # mark registry changed, but don't bump mod time
+        self.save()
+
     def _setID(self, m):
         while 1:
             id = str(intTime(1000))

File diff suppressed because it is too large.


@@ -57,6 +57,9 @@ class TagManager(object):
         self.register(set(self.split(
             " ".join(self.deck.db.list("select distinct tags from facts"+lim)))))
 
+    def allSince(self, mod):
+        return [k for k,v in self.tags.items() if v > mod]
+
     # Bulk addition/removal from facts
     #############################################################


@@ -6,8 +6,7 @@ from tests.shared import assertException
 from anki.errors import *
 from anki import Deck
 from anki.utils import intTime
-from anki.sync import SyncClient, SyncServer, HttpSyncServer, HttpSyncServerProxy
-from anki.sync import copyLocalMedia
+from anki.sync import Syncer, LocalServer
 from anki.facts import Fact
 from anki.cards import Card
 from tests.shared import getEmptyDeck
@@ -22,306 +21,108 @@ deck2=None
 client=None
 server=None
 
-def setup_local(loadDecks=None):
+def setup_basic(loadDecks=None):
     global deck1, deck2, client, server
     if loadDecks:
         deck1 = Deck(loadDecks[0], backup=False)
         deck2 = Deck(loadDecks[1], backup=False)
     else:
         deck1 = getEmptyDeck()
         # add a fact to deck 1
         f = deck1.newFact()
         f['Front'] = u"foo"; f['Back'] = u"bar"; f.tags = [u"foo"]
         deck1.addFact(f)
-        deck1.syncName = "abc"
+        # answer it
+        deck1.reset(); deck1.sched.answerCard(deck1.sched.getCard(), 4)
+        # repeat for deck2; sleep a tick so we have different ids
         deck2 = getEmptyDeck()
         f = deck2.newFact()
-        f['Front'] = u"foo"; f['Back'] = u"bar"; f.tags = [u"foo"]
+        f['Front'] = u"bar"; f['Back'] = u"bar"; f.tags = [u"bar"]
         deck2.addFact(f)
-        deck2.syncName = "abc"
-        deck1.lastSync = deck2.lastSync = intTime()
+        deck2.reset(); deck2.sched.answerCard(deck2.sched.getCard(), 4)
+        # start with same schema and sync time
+        deck1.lastSync = deck2.lastSync = intTime() - 1
+        deck1.scm = deck2.scm = 0
         time.sleep(1)
-        # now add another fact to deck1 that hasn't been synced yet
-        f = deck1.newFact()
-        f['Front'] = u"bar"; f['Back'] = u"baz"
-        deck1.addFact(f)
-        # and another to deck2
-        f = deck2.newFact()
-        f['Front'] = u"qux"; f['Back'] = u"baz"
-        deck2.addFact(f)
-        deck2.reset()
-        c = deck2.sched.getCard()
-        deck2.sched.answerCard(c, 3)
-        # change deck1's model
-        deck1.currentModel().flush()
+        # and same mod time, so sync does nothing
         deck1.save(); deck2.save()
-    client = SyncClient(deck1)
-    server = SyncServer(deck2)
-    print "deck1", client.deck.db.all("select * from facts")
-    print "deck2", server.deck.db.all("select * from facts")
-    client.setServer(server)
+    server = LocalServer(deck2)
+    client = Syncer(deck1, server)
 
 def teardown():
     pass
 
+def setup_modified():
+    setup_basic()
+    # mark deck1 as changed
+    deck1.save(mod=intTime()+1)
+
-@nose.with_setup(setup_local, teardown)
-def _test_changes():
-    deck2.scm = 0
-    dels = client.deletions(deck1.lastSync)
-    rem = server.changes(deck1.lastSync, dels)
-    client.delete(rem['deletions'])
-    assert rem
-    client.rewriteIds(rem)
-    loc = client.changes(deck1.lastSync)
-    assert loc
-    l, r = client.diff(loc, rem, "facts", 3)
-    # local id is larger
-    assert l[0][0] == 3
-    assert r[0][0] == 2
+@nose.with_setup(setup_basic)
+def test_nochange():
+    assert client.sync() == "noChanges"
 
-    keys = ("models", "groups", "gconf", "facts", "cards")
-    keys2 = ("revlog", "tags")
+@nose.with_setup(setup_modified)
+def test_changedSchema():
+    deck1.scm += 1
+    assert client.sync() == "fullSync"
 
-    proc = {}
-    resp = {}
-    for type in keys:
-        l, r = getattr(client, 'diff'+type.capitalize())(loc, rem)
-        proc[type] = r
-        resp[type] = l
-    for type in keys2:
-        l = loc[type]; r = rem[type]
-        proc[type] = r
-        resp[type] = l
+@nose.with_setup(setup_modified)
+def test_sync():
+    def check(num):
+        for d in deck1, deck2:
+            for t in ("revlog", "facts", "cards", "fsums"):
+                assert d.db.scalar("select count() from %s" % t) == num
+            assert len(d.models.all()) == num*2
+            # the default group and config have an id of 1, so always 1
+            assert len(d.groups.all()) == 1
+            assert len(d.groups.gconf) == 1
+            assert len(d.tags.all()) == num
+    check(1)
+    origLs = deck1.lastSync
+    assert client.sync() == "success"
+    # last sync times and mod times should agree
+    assert deck1.mod == deck2.mod
+    assert deck1.lastSync == deck2.lastSync
+    assert deck1.lastSync != origLs
+    # because everything was created separately it will be merged in. in
+    # actual use we use a full sync to ensure initial a common starting point.
+    check(2)
+    # repeating it does nothing
+    assert client.sync() == "noChanges"
+    # if we bump mod time, everything is copied across again because of the
+    # 600 second sync leeway. but the decks should remain the same.
+    deck1.save(mod=intTime()+2)
+    assert client.sync() == "success"
+    check(2)
 
-    for type in keys + keys2:
-        getattr(client, 'update'+type.capitalize())(proc[type])
+@nose.with_setup(setup_modified)
+def test_models():
+    test_sync()
+    # update model one
+    cm = deck1.models.current()
+    cm['name'] = "new"
+    cm['mod'] = intTime() + 1
+    deck1.save(mod=intTime()+1)
+    assert deck2.models.get(cm['id'])['name'] == "Basic"
+    assert client.sync() == "success"
+    assert deck2.models.get(cm['id'])['name'] == "new"
+    # deleting triggers a full sync
+    deck1.scm = deck2.scm = 0
+    deck1.models.rem(cm)
+    deck1.save(mod=intTime()+1)
+    assert client.sync() == "fullSync"
 
-    for type in keys + keys2:
-        getattr(server, 'update'+type.capitalize())(resp[type])
-    print "deck1", client.deck.db.all("select * from revlog")
-    print "deck2", server.deck.db.all("select * from revlog")
-    #client.process(loc, rem)
-
-# @nose.with_setup(setup_local, teardown)
-# def test_localsync_deck():
-#     # deck two was modified last
-#     assert deck2.modified > deck1.modified
-#     d2mod = deck2.modified
-#     assert deck1.lastSync == 0 and deck2.lastSync == 0
-#     client.sync()
-#     assert deck1.modified == deck2.modified
-#     assert deck1.modified <= deck1.lastSync
-#     assert deck1.lastSync == deck2.lastSync
-#     # ensure values are being synced
-#     deck1.lowPriority += u",foo"
-#     deck1.setModified()
-#     client.sync()
-#     assert "foo" in deck2.lowPriority
-#     assert deck1.modified == deck2.modified
-#     assert deck1.lastSync == deck2.lastSync
-#     deck2.description = u"newname"
-#     deck2.setModified()
-#     client.sync()
-#     assert deck1.description == u"newname"
-#     # the most recent change should take precedence
-#     deck1.description = u"foo"
-#     deck1.setModified()
-#     deck2.description = u"bar"
-#     deck2.setModified()
-#     client.sync()
-#     assert deck1.description == "bar"
-#     # answer a card to ensure stats & history are copied
-#     c = deck1.getCard()
-#     deck1.answerCard(c, 4)
-#     client.sync()
-#     assert deck2.db.scalar("select count(*) from revlog") == 1
-#     # make sure meta data is synced
-#     deck1.setVar("foo", 1)
-#     assert deck1.getInt("foo") == 1
-#     assert deck2.getInt("foo") is None
-#     client.sync()
-#     assert deck1.getInt("foo") == 1
-#     assert deck2.getInt("foo") == 1
-
-# @nose.with_setup(setup_local, teardown)
-# def test_localsync_models():
-#     client.sync()
-#     # add a model
-#     deck1.addModel(BasicModel())
-#     assert len(deck1.models) == 3
-#     assert len(deck2.models) == 2
-#     deck1.setVar("schemaMod", 0)
-#     client.sync()
-#     assert len(deck2.models) == 3
-#     assert deck1.currentModel.id == deck2.currentModel.id
-#     # delete the recently added model
-#     deck2.deleteModel(deck2.currentModel)
-#     assert len(deck2.models) == 2
-#     deck2.setVar("schemaMod", 0)
-#     client.sync()
-#     assert len(deck1.models) == 2
-#     assert deck1.currentModel.id == deck2.currentModel.id
-#     # make a card model inactive
-#     assert deck1.currentModel.cardModels[1].active == True
-#     deck2.currentModel.cardModels[1].active = False
-#     deck2.currentModel.setModified()
-#     deck2.flushMod()
-#     client.sync()
-#     assert deck1.currentModel.cardModels[1].active == False
-#     # remove a card model
-#     deck1.deleteCardModel(deck1.currentModel,
-#                           deck1.currentModel.cardModels[1])
-#     deck1.currentModel.setModified()
-#     deck1.setModified()
-#     assert len(deck1.currentModel.cardModels) == 1
-#     deck1.setVar("schemaMod", 0)
-#     client.sync()
-#     assert len(deck2.currentModel.cardModels) == 1
-#     # rename a field
-#     c = deck1.getCard()
-#     assert u"Front" in c.fact.keys()
-#     deck1.renameFieldModel(deck1.currentModel,
-#                            deck1.currentModel.fieldModels[0],
-#                            u"Sideways")
-#     client.sync()
-#     assert deck2.currentModel.fieldModels[0].name == u"Sideways"
-
-# @nose.with_setup(setup_local, teardown)
-# def test_localsync_factsandcards():
-#     assert deck1.factCount() == 1 and deck1.cardCount() == 2
-#     assert deck2.factCount() == 1 and deck2.cardCount() == 2
-#     client.sync()
-#     deck1.reset(); deck2.reset()
-#     assert deck1.factCount() == 2 and deck1.cardCount() == 4
-#     assert deck2.factCount() == 2 and deck2.cardCount() == 4
-#     # ensure the fact was copied across
-#     f1 = deck1.db.query(Fact).first()
-#     f2 = deck1.db.query(Fact).get(f1.id)
-#     f1['Front'] = u"myfront"
-#     f1.setModified()
-#     deck1.setModified()
-#     client.sync()
-#     deck1.rebuildCounts()
-#     deck2.rebuildCounts()
-#     f2 = deck1.db.query(Fact).get(f1.id)
-#     assert f2['Front'] == u"myfront"
-#     c1 = deck1.getCard()
-#     c2 = deck2.getCard()
-#     assert c1.id == c2.id
-
-# @nose.with_setup(setup_local, teardown)
-# def test_localsync_threeway():
-#     # deck1 (client) <-> deck2 (server) <-> deck3 (client)
-#     deck3 = Deck()
-#     client2 = SyncClient(deck3)
-#     server2 = SyncServer(deck2)
-#     client2.setServer(server2)
-#     client.sync()
-#     client2.sync()
-#     # add a new question
-#     f = deck1.newFact()
-#     f['Front'] = u"a"; f['Back'] = u"b"
-#     f = deck1.addFact(f)
-#     card = f.cards[0]
-#     client.sync()
-#     assert deck1.cardCount() == 6
-#     assert deck2.cardCount() == 6
-#     # check it propagates from server to deck3
-#     client2.sync()
-#     assert deck3.cardCount() == 6
-#     # delete a card on deck1
-#     deck1.deleteCard(card.id)
-#     client.sync()
-#     deck1.reset(); deck2.reset()
-#     assert deck1.cardCount() == 5
-#     assert deck2.cardCount() == 5
-#     # make sure the delete is now propagated from the server to deck3
-#     client2.sync()
-#     assert deck3.cardCount() == 5
-
-# def test_localsync_media():
-#     tmpdir = "/tmp/media-tests"
-#     try:
-#         shutil.rmtree(tmpdir)
-#     except OSError:
-#         pass
-#     shutil.copytree(os.path.join(os.path.dirname(__file__), "..",
-#                                  "tests/syncing/media-tests"),
-#                     tmpdir)
-#     deck1anki = os.path.join(tmpdir, "1.anki")
-#     deck2anki = os.path.join(tmpdir, "2.anki")
-#     deck1media = os.path.join(tmpdir, "1.media")
-#     deck2media = os.path.join(tmpdir, "2.media")
-#     setup_local((deck1anki, deck2anki))
-#     assert len(os.listdir(deck1media)) == 2
-#     assert len(os.listdir(deck2media)) == 1
-#     client.sync()
-#     # metadata should have been copied
-#     assert deck1.db.scalar("select count(1) from media") == 3
-#     assert deck2.db.scalar("select count(1) from media") == 3
-#     # copy local files
-#     copyLocalMedia(deck1, deck2)
-#     assert len(os.listdir(deck1media)) == 2
-#     assert len(os.listdir(deck2media)) == 3
-#     copyLocalMedia(deck2, deck1)
-#     assert len(os.listdir(deck1media)) == 3
-#     assert len(os.listdir(deck2media)) == 3
-#     # check delete
-#     os.unlink(os.path.join(deck1media, "22161b29b0c18e068038021f54eee1ee.png"))
-#     rebuildMediaDir(deck1)
-#     client.sync()
-#     assert deck1.db.scalar("select count(1) from media") == 3
-#     assert deck2.db.scalar("select count(1) from media") == 3
-
-# # Remote tests
-# ##########################################################################
-
-# # a replacement runCmd which just calls our server directly
-# def runCmd(action, *args, **kargs):
-#     #print action, kargs
-#     return server.unstuff(apply(getattr(server, action), tuple(args) +
-#                                 tuple(kargs.values())))
-
-# def setup_remote():
-#     setup_local()
-#     global client, server
-#     proxy = HttpSyncServerProxy("test", "foo")
-#     client = SyncClient(deck1)
-#     client.setServer(proxy)
-#     proxy.deckName = "test"
-#     proxy.runCmd = runCmd
-#     server = HttpSyncServer()
-#     server.deck = deck2
-#     server.decks = {"test": (deck2.modified, 0)}
-
-# @nose.with_setup(setup_remote, teardown)
-# def test_remotesync_fromserver():
-#     # deck two was modified last
-#     assert deck2.modified > deck1.modified
-#     client.sync()
-#     assert deck2.modified == deck1.modified
-#     # test deck vars
-#     deck1.setVar("foo", 1)
-#     client.sync()
-
-# @nose.with_setup(setup_remote, teardown)
-# def test_remotesync_toserver():
-#     deck1.setModified()
-#     client.sync()
-#     assert deck2.modified == deck1.modified
-
-# # Full sync
-# ##########################################################################
-
-# @nose.with_setup(setup_remote, teardown)
-# def test_formdata():
-#     global deck1
-#     (fd, name) = tempfile.mkstemp()
-#     deck1 = deck1.saveAs(name)
-#     deck1.setModified()
-#     client.deck = deck1
-#     client.prepareSync(0)
-#     client.prepareFullSync()
+@nose.with_setup(setup_modified)
+def test_facts():
+    test_sync()
+    # modifications should be synced
+    fid = deck1.db.scalar("select id from facts")
+    fact = deck1.getFact(fid)
+    assert fact['Front'] != "abc"
+    fact['Front'] = "abc"
+    fact.flush(mod=intTime()+1)
+    deck1.save(mod=intTime()+1)
+    assert client.sync() == "success"
+    assert deck2.getFact(fid)['Front'] == "abc"
+    # deletions too
+    deck1.remFacts([fid])
+    deck1.save(mod=intTime()+1)
+    assert client.sync() == "success"
+    assert not deck1.db.scalar("select 1 from facts where id = ?", fid)
+    assert not deck2.db.scalar("select 1 from facts where id = ?", fid)