From c59dd854fb36524c8c2a962fafc85b34d1fea25d Mon Sep 17 00:00:00 2001 From: Damien Elmes Date: Mon, 12 Sep 2011 03:11:06 +0900 Subject: [PATCH] add change detection I removed the media database in an earlier commit, but it's now necessary again as I decided to add native media syncing to AnkiWeb. This time, the DB is stored in the media folder rather than with the deck. This means we avoid sending it in a full sync, and it makes deck backups faster. The DB is a cache of file modtimes and checksums. When findChanges() is called, the code checks to see which files were added, changed or deleted since the last time, and updates the log of changes. Because the scanning step and log retrieval are separate, it's possible to do the scanning in the background if the need arises. If the DB is deleted by the user, Anki will forget any deletions, and add all the files back to the DB the next time it's accessed. File changes are recorded as a delete + add. media.addFile() could be optimized in the future to log media added manually by the user, allowing us to skip the full directory scan in cases where the only changes were manually added media. 
--- anki/media.py | 124 +++++++++++++++++++++++++++++++++++++++++--- anki/utils.py | 12 ++--- tests/test_latex.py | 4 +- tests/test_media.py | 37 +++++++++++++ 4 files changed, 160 insertions(+), 17 deletions(-) diff --git a/anki/media.py b/anki/media.py index b1bd1fd8d..d3800ba63 100644 --- a/anki/media.py +++ b/anki/media.py @@ -6,6 +6,7 @@ import os, shutil, re, urllib, urllib2, time, unicodedata, \ urllib, sys, shutil from anki.utils import checksum, intTime, namedtmp, isWin from anki.lang import _ +from anki.db import DB class MediaManager(object): @@ -15,14 +16,19 @@ class MediaManager(object): def __init__(self, deck): self.deck = deck - self._dir = None + # media directory + self._dir = re.sub("(?i)\.(anki)$", ".media", self.deck.path) + if not os.path.exists(self._dir): + os.makedirs(self._dir) + os.chdir(self._dir) + # change database + path = os.path.join(self.dir(), "media.db") + create = not os.path.exists(path) + self.db = DB(path) + if create: + self._initDB() def dir(self): - if not self._dir: - self._dir = re.sub("(?i)\.(anki)$", ".media", self.deck.path) - if not os.path.exists(self._dir): - os.makedirs(self._dir) - os.chdir(self._dir) return self._dir # Adding media @@ -118,7 +124,7 @@ If the same name exists, compare checksums.""" # loop through directory and find unused & missing media unused = [] for file in os.listdir(mdir): - if file.startswith("latex-"): + if file.startswith("latex-") or file.startswith("media.db"): continue path = os.path.join(mdir, file) if not os.path.isfile(path): @@ -145,3 +151,107 @@ If the same name exists, compare checksums.""" for f in self.mediaFiles(p[type]): files.add(f) return files + + # Tracking changes + ########################################################################## + + def _initDB(self): + # in the log, a mod time of zero indicates a delete + self.db.executescript(""" +create table media (fname text primary key, csum text, mod int); +create table meta (dirMod int); +insert into meta values (0); 
+create table log (id int, fname text, mod int); +create index ix_log_id on log (id); +""") + + def _mtime(self, path): + return int(os.stat(path).st_mtime) + + def _checksum(self, path): + return checksum(open(path, "rb").read()) + + def changed(self): + "Return dir mtime if it has changed since the last findChanges()" + # doesn't track edits, but user can add or remove a file to update + mod = self.db.scalar("select dirMod from meta") + mtime = self._mtime(self.dir()) + if mod and mod == mtime: + return False + return mtime + + def findChanges(self): + "Scan the media folder if it's changed, and note any changes." + if self.changed(): + self._logChanges() + + def changesSince(self, mod): + "Return a list of added and removed files since MOD time." + self.findChanges() + added = {} + removed = {} + # loop through and collect changes, removing duplicates + for fname, mod in self.db.all( + "select fname, mod from log where id > ?", mod): + if mod: + added[fname] = mod + else: + removed[fname] = mod + return added.items(), removed.keys() + + def _changes(self): + self.cache = {} + for (name, csum, mod) in self.db.execute( + "select * from media"): + self.cache[name] = [csum, mod, False] + added = [] + removed = [] + changed = [] + # loop through on-disk files + for f in os.listdir(self.dir()): + # ignore our db and folders + if f.startswith("media.db") or os.path.isdir(f): + continue + # newly added? + if f not in self.cache: + added.append(f) + else: + # modified since last time? + if self._mtime(f) != self.cache[f][1]: + # and has different checksum? 
+ if self._checksum(f) != self.cache[f][0]: + changed.append(f) + # mark as used + self.cache[f][2] = True + # look for any entries in the cache that no longer exist on disk + for (k, v) in self.cache.items(): + if not v[2]: + removed.append(k) + return added, changed, removed + + def _logChanges(self): + (added, changed, removed) = self._changes() + log = [] + media = [] + mediaRem = [] + t = intTime() + for f in added: + mt = self._mtime(f) + media.append((f, self._checksum(f), mt)) + log.append((t, f, mt)) + for f in changed: + mt = self._mtime(f) + media.append((f, self._checksum(f), mt)) + log.append((t, f, 0)) + log.append((t, f, mt)) + for f in removed: + mediaRem.append((f,)) + log.append((t, f, 0)) + # update db + self.db.executemany("insert or replace into media values (?,?,?)", + media) + self.db.executemany("insert into log values (?,?,?)", log) + if mediaRem: + self.db.executemany("delete from media where fname = ?", + mediaRem) + self.db.execute("update meta set dirMod = ?", self._mtime(self.dir())) diff --git a/anki/utils.py b/anki/utils.py index c74f1f8c3..c99d89597 100644 --- a/anki/utils.py +++ b/anki/utils.py @@ -4,14 +4,7 @@ import re, os, random, time, types, math, htmlentitydefs, subprocess, \ tempfile, shutil - -try: - import hashlib - md5 = hashlib.md5 -except ImportError: - import md5 - md5 = md5.new - +from hashlib import md5, sha1 from anki.lang import _, ngettext import locale, sys @@ -204,6 +197,9 @@ def splitFields(string): def checksum(data): return md5(data).hexdigest() +def sha1sum(data): + return sha1(data).hexdigest() + def fieldChecksum(data): # 32 bit unsigned number from first 8 digits of md5 hash return int(checksum(data.encode("utf-8"))[:8], 16) diff --git a/tests/test_latex.py b/tests/test_latex.py index 9e47fd348..d3e8c4457 100644 --- a/tests/test_latex.py +++ b/tests/test_latex.py @@ -14,8 +14,8 @@ def test_latex(): f = d.newFact() f['Front'] = u"[latex]hello[/latex]" d.addFact(f) - # but since latex couldn't run, it will 
be empty - assert len(os.listdir(d.media.dir())) == 0 + # but since latex couldn't run, it will only have the media.db + assert len(os.listdir(d.media.dir())) == 1 # check the error message msg = f.cards()[0].q() assert "executing latex" in msg diff --git a/tests/test_media.py b/tests/test_media.py index 2c5e183fb..d84afa4fa 100644 --- a/tests/test_media.py +++ b/tests/test_media.py @@ -59,3 +59,40 @@ def test_deckIntegration(): ret = d.media.check() assert ret[0] == ["fake2.png"] assert ret[1] == ["foo.jpg"] + +def test_changes(): + d = getEmptyDeck() + assert d.media.changed() + add, rem = d.media.changesSince(0) + assert not add; assert not rem + assert not d.media.changed() + # add a file + dir = tempfile.mkdtemp(prefix="anki") + path = os.path.join(dir, "foo.jpg") + open(path, "w").write("hello") + time.sleep(1) + path = d.media.addFile(path) + # should have been logged + add, rem = d.media.changesSince(0) + assert add; assert not rem + mod = add[0][1] + # if we modify it, the cache won't notice + time.sleep(1) + open(path, "w").write("world") + add, rem = d.media.changesSince(0) + assert len(add) == 1 + # but if we add another file, it will + time.sleep(1) + open(path+"2", "w").write("yo") + add, rem = d.media.changesSince(0) + assert len(add) == 2 + assert len(rem) == 1 + assert add[0][1] != mod + assert add[0][0] == "foo.jpg" + # deletions should get noticed too + time.sleep(1) + os.unlink(path+"2") + add, rem = d.media.changesSince(0) + assert len(add) == 2 + assert len(rem) == 2 +