add change detection

I removed the media database in an earlier commit, but it's now necessary
again as I decided to add native media syncing to AnkiWeb.

This time, the DB is stored in the media folder rather than with the deck.
This means we avoid sending it in a full sync, and it makes deck backups faster.
The DB is a cache of file modtimes and checksums. When findChanges() is
called, the code checks to see which files were added, changed or deleted
since the last time, and updates the log of changes. Because the scanning step
and log retrieval are separate, it's possible to do the scanning in the
background if the need arises.

If the DB is deleted by the user, Anki will forget any deletions, and add all
the files back to the DB the next time it's accessed.

File changes are recorded as a delete + add.

media.addFile() could be optimized in the future to log media added manually
by the user, allowing us to skip the full directory scan in cases where the
only changes were manually added media.
This commit is contained in:
Damien Elmes 2011-09-12 03:11:06 +09:00
parent 7e1df75cc2
commit c59dd854fb
4 changed files with 160 additions and 17 deletions

View file

@ -6,6 +6,7 @@ import os, shutil, re, urllib, urllib2, time, unicodedata, \
urllib, sys, shutil
from anki.utils import checksum, intTime, namedtmp, isWin
from anki.lang import _
from anki.db import DB
class MediaManager(object):
@ -15,14 +16,19 @@ class MediaManager(object):
def __init__(self, deck):
# Set up media handling for DECK: locate (and create if missing) the media
# folder, make it the working directory, then open the change database.
# NOTE(review): this span is a diff hunk shown without +/- markers or
# indentation, so removed and added lines may be interleaved -- confirm
# against the real file before editing.
self.deck = deck
self._dir = None
# media directory
# the deck path with a trailing ".anki" (any case) swapped for ".media"
self._dir = re.sub("(?i)\.(anki)$", ".media", self.deck.path)
if not os.path.exists(self._dir):
os.makedirs(self._dir)
os.chdir(self._dir)
# change database
path = os.path.join(self.dir(), "media.db")
# a media.db that does not exist yet needs its tables created once opened
create = not os.path.exists(path)
self.db = DB(path)
if create:
self._initDB()
def dir(self):
# Return the media folder path held in self._dir.
# NOTE(review): this span is a diff hunk shown without +/- markers or
# indentation; the lazy set-up lines below repeat what __init__ does and
# may be the removed side of the diff -- confirm against the real file.
if not self._dir:
# derive the folder from the deck path, create it if needed and make it
# the working directory
self._dir = re.sub("(?i)\.(anki)$", ".media", self.deck.path)
if not os.path.exists(self._dir):
os.makedirs(self._dir)
os.chdir(self._dir)
return self._dir
# Adding media
@ -118,7 +124,7 @@ If the same name exists, compare checksums."""
# loop through directory and find unused & missing media
unused = []
for file in os.listdir(mdir):
if file.startswith("latex-"):
if file.startswith("latex-") or file.startswith("media.db"):
continue
path = os.path.join(mdir, file)
if not os.path.isfile(path):
@ -145,3 +151,107 @@ If the same name exists, compare checksums."""
for f in self.mediaFiles(p[type]):
files.add(f)
return files
# Tracking changes
##########################################################################
def _initDB(self):
    "Create the media, meta and log tables in a freshly opened change DB."
    # a log row whose mod time is zero records a deletion
    schema = """
create table media (fname text primary key, csum text, mod int);
create table meta (dirMod int);
insert into meta values (0);
create table log (id int, fname text, mod int);
create index ix_log_id on log (id);
"""
    self.db.executescript(schema)
def _mtime(self, path):
    "Return the modification time of PATH, truncated to whole seconds."
    modified = os.path.getmtime(path)
    return int(modified)
def _checksum(self, path):
    "Return the checksum() of the contents of the file at PATH."
    # close the handle as soon as the data is read instead of leaving it
    # to the garbage collector
    with open(path, "rb") as f:
        data = f.read()
    return checksum(data)
def changed(self):
    """Return the media folder's mtime if it differs from the value stored
    by the last scan, or False if it is unchanged."""
    # editing an existing file is not detected this way; adding or removing
    # a file is what makes the folder count as changed
    recorded = self.db.scalar("select dirMod from meta")
    current = self._mtime(self.dir())
    if not recorded or recorded != current:
        return current
    return False
def findChanges(self):
    "Log any changes to the media folder, scanning only if changed() says so."
    if not self.changed():
        return
    self._logChanges()
def changesSince(self, mod):
    """Return (added, removed) for log entries whose id is greater than MOD.

    The folder is checked for changes first. ADDED holds (fname, mod time)
    pairs and REMOVED holds file names."""
    self.findChanges()
    added, removed = {}, {}
    rows = self.db.all("select fname, mod from log where id > ?", mod)
    # keying on the file name collapses repeated entries for one file
    for fname, logged in rows:
        # a mod time of zero marks a deletion
        bucket = added if logged else removed
        bucket[fname] = logged
    return added.items(), removed.keys()
def _changes(self):
    """Compare the files in the media folder with the cached state.

    Returns (added, changed, removed), three lists of file names: files on
    disk that have no cache entry, cached files whose mtime and checksum
    both differ from the cached values, and cached files no longer on disk.
    """
    # fname -> [csum, mod, seen on disk during this scan]
    self.cache = {}
    for (name, csum, mod) in self.db.execute(
        "select * from media"):
        self.cache[name] = [csum, mod, False]
    added = []
    changed = []
    mdir = self.dir()
    # loop through on-disk files
    for f in os.listdir(mdir):
        # listdir returns bare names; resolve them against the media folder
        # rather than relying on it still being the working directory
        path = os.path.join(mdir, f)
        # ignore our db and folders
        if f.startswith("media.db") or os.path.isdir(path):
            continue
        entry = self.cache.get(f)
        # newly added?
        if entry is None:
            added.append(f)
            continue
        # modified since last time, and has a different checksum? the
        # checksum is only computed when the mtime differs
        if (self._mtime(path) != entry[1] and
            self._checksum(path) != entry[0]):
            changed.append(f)
        # mark as used
        entry[2] = True
    # any entries in the cache that were not seen no longer exist on disk
    removed = [k for (k, v) in self.cache.items() if not v[2]]
    return added, changed, removed
def _logChanges(self):
    """Scan the media folder, then update the cache table and the log.

    Added files get a media row and a log row carrying their mtime; changed
    files are logged as a delete followed by an add; removed files lose
    their media row and get a log row with a mod time of zero. Finally the
    folder's current mtime is stored in the meta table.
    """
    (added, changed, removed) = self._changes()
    mdir = self.dir()
    log = []
    media = []
    t = intTime()
    for f in added:
        # resolve against the media folder rather than relying on it still
        # being the working directory
        path = os.path.join(mdir, f)
        mt = self._mtime(path)
        media.append((f, self._checksum(path), mt))
        log.append((t, f, mt))
    for f in changed:
        path = os.path.join(mdir, f)
        mt = self._mtime(path)
        media.append((f, self._checksum(path), mt))
        # a change is recorded as a delete + add
        log.append((t, f, 0))
        log.append((t, f, mt))
    for f in removed:
        log.append((t, f, 0))
    mediaRem = [(f,) for f in removed]
    # update db
    self.db.executemany("insert or replace into media values (?,?,?)",
                        media)
    self.db.executemany("insert into log values (?,?,?)", log)
    if mediaRem:
        self.db.executemany("delete from media where fname = ?",
                            mediaRem)
    self.db.execute("update meta set dirMod = ?", self._mtime(mdir))

View file

@ -4,14 +4,7 @@
import re, os, random, time, types, math, htmlentitydefs, subprocess, \
tempfile, shutil
try:
import hashlib
md5 = hashlib.md5
except ImportError:
import md5
md5 = md5.new
from hashlib import md5, sha1
from anki.lang import _, ngettext
import locale, sys
@ -204,6 +197,9 @@ def splitFields(string):
def checksum(data):
    "Return the md5 digest of DATA as a hex string."
    digest = md5(data)
    return digest.hexdigest()
def sha1sum(data):
    "Return the sha1 digest of DATA as a hex string."
    digest = sha1(data)
    return digest.hexdigest()
def fieldChecksum(data):
    "Return a 32 bit unsigned number derived from DATA's checksum."
    # hash the utf-8 bytes, then read the first 8 hex digits as a number
    hexdigest = checksum(data.encode("utf-8"))
    return int(hexdigest[:8], 16)

View file

@ -14,8 +14,8 @@ def test_latex():
f = d.newFact()
f['Front'] = u"[latex]hello[/latex]"
d.addFact(f)
# but since latex couldn't run, it will be empty
assert len(os.listdir(d.media.dir())) == 0
# but since latex couldn't run, it will only have the media.db
assert len(os.listdir(d.media.dir())) == 1
# check the error message
msg = f.cards()[0].q()
assert "executing latex" in msg

View file

@ -59,3 +59,40 @@ def test_deckIntegration():
ret = d.media.check()
assert ret[0] == ["fake2.png"]
assert ret[1] == ["foo.jpg"]
def test_changes():
    "Check that added, edited and deleted media files end up in the log."
    d = getEmptyDeck()
    assert d.media.changed()
    add, rem = d.media.changesSince(0)
    assert not add; assert not rem
    assert not d.media.changed()
    # add a file
    tmpdir = tempfile.mkdtemp(prefix="anki")
    path = os.path.join(tmpdir, "foo.jpg")
    # close each file explicitly so the data is on disk before it is read
    with open(path, "w") as f:
        f.write("hello")
    time.sleep(1)
    path = d.media.addFile(path)
    # should have been logged
    add, rem = d.media.changesSince(0)
    assert add; assert not rem
    mod = add[0][1]
    # if we modify it, the cache won't notice
    time.sleep(1)
    with open(path, "w") as f:
        f.write("world")
    add, rem = d.media.changesSince(0)
    assert len(add) == 1
    # but if we add another file, it will
    time.sleep(1)
    with open(path+"2", "w") as f:
        f.write("yo")
    add, rem = d.media.changesSince(0)
    assert len(add) == 2
    assert len(rem) == 1
    # look the file up by name: the order of the returned entries is not
    # something changesSince() promises
    assert dict(add)["foo.jpg"] != mod
    # deletions should get noticed too
    time.sleep(1)
    os.unlink(path+"2")
    add, rem = d.media.changesSince(0)
    assert len(add) == 2
    assert len(rem) == 2