From be045d451c7c517a498b27fde9ea911ce9d9f61c Mon Sep 17 00:00:00 2001 From: Damien Elmes Date: Sat, 19 Mar 2011 10:45:48 +0900 Subject: [PATCH] remove the media table The media table was originally introduced when Anki hashed media filenames, and so needed a way to remember the original filename. It also helped with: 1) getting a quick list of all media used in the deck, or the media added since the last sync, for mobile clients 2) merging identical files with different names But it had some drawbacks: - every operation that modified templates, models or facts meant generating the q/a and checking whether any new media had appeared - each entry is about 70 bytes, and some decks have 100k+ media files So we remove the media table. We address 1) by being more intelligent about media downloads on the mobile platform. We ask the user after a full sync if they want to look for missing media, and they can choose not to if they know they haven't added any. And on a partial sync, we can scan the contents of the incoming facts for media references, and download any references we find. This also avoids all the issues people had with media not downloading because it was in their media folder but not in the media database. For 2), when copying media to the media folder, if we have a duplicate filename, we check if that file has the same md5, and avoid copying if so. This won't merge identical content that has separate names, but instances where users need that are rare. 
--- anki/deck.py | 1 - anki/latex.py | 2 +- anki/media.py | 150 ++++++++++++++------------------------ anki/storage.py | 16 +--- tests/support/anki12.anki | Bin 196608 -> 196608 bytes tests/test_media.py | 112 +++++++++++----------------- 6 files changed, 98 insertions(+), 183 deletions(-) diff --git a/anki/deck.py b/anki/deck.py index 1e22ec18e..444f88115 100644 --- a/anki/deck.py +++ b/anki/deck.py @@ -450,7 +450,6 @@ select id from cards where fid in (select id from facts where mid = ?)""", html = anki.template.render(format, fields) # if filters: # d[type] = runFilter("renderQA.post", html, fields, meta, self) - self.media.registerText(html) d[type] = html return d diff --git a/anki/latex.py b/anki/latex.py index dabbadfb0..97a847593 100644 --- a/anki/latex.py +++ b/anki/latex.py @@ -69,7 +69,7 @@ def buildImg(deck, latex): texfile.write(latex) texfile.close() # make sure we have a valid mediaDir - mdir = deck.mediaDir(create=True) + mdir = deck.media.dir(create=True) oldcwd = os.getcwd() if sys.platform == "win32": si = subprocess.STARTUPINFO() diff --git a/anki/media.py b/anki/media.py index 0c49f213f..740ec6b00 100644 --- a/anki/media.py +++ b/anki/media.py @@ -15,17 +15,17 @@ class MediaRegistry(object): def __init__(self, deck): self.deck = deck self.mediaPrefix = "" - self._mediaDir = None - self._updateMediaDir() + self._dir = None + self._updateDir() - def mediaDir(self, create=False): - if self._mediaDir: - return self._mediaDir + def dir(self, create=False): + if self._dir: + return self._dir elif create: - self._updateMediaDir(True) - return self._mediaDir + self._updateDir(True) + return self._dir - def _updateMediaDir(self, create=False): + def _updateDir(self, create=False): if self.mediaPrefix: dir = os.path.join( self.mediaPrefix, os.path.basename(self.deck.path)) @@ -42,88 +42,32 @@ class MediaRegistry(object): os.makedirs(dir) # change to the current dir os.chdir(dir) - self._mediaDir = dir + self._dir = dir - # Adding and registering media + 
# Adding media ########################################################################## - def addFile(self, path): + def addFile(self, opath): """Copy PATH to MEDIADIR, and return new filename. -If a file with the same md5sum exists in the DB, return that. -If a file with the same name exists, return a unique name.""" - # see if have duplicate contents - csum = self.mediaChecksum(path) - if not csum: - # file was unreadable or didn't exist - return None - file = self.deck.db.scalar( - "select file from media where csum = :cs", - cs=csum) - if not file: - base = os.path.basename(path) - mdir = self.mediaDir(create=True) - file = self.uniquePath(mdir, base) - shutil.copy2(path, file) - self.registerFile(base) - return os.path.basename(file) - - def registerFile(self, file): - "Add a single file to the media database." - if self.mediaDir(): - csum = self.mediaChecksum(os.path.join(self.mediaDir(), file)) - else: - csum = "" - self.deck.db.execute( - "insert or replace into media values (?, ?, ?)", - file, intTime(), csum) - - def registerText(self, string): - "Add all media in string to the media database." - for f in self.mediaFiles(string): - self.registerFile(f) - - def removeUnusedMedia(deck): - ids = deck.s.list("select id from media where size = 0") - for id in ids: - deck.s.statement("insert into mediaDeleted values (:id, :t)", - id=id, t=time.time()) - deck.s.statement("delete from media where size = 0") - - # Moving media - ########################################################################## - - def renameMediaDir(self, oldPath): - "Copy oldPath to our current media dir. 
" - assert os.path.exists(oldPath) - newPath = self.mediaDir(create=None) - # copytree doesn't want the dir to exist - try: - shutil.copytree(oldPath, newPath) - except: - # FIXME: should really remove everything in old dir instead of - # giving up - pass - - # Tools - ########################################################################## - - def mediaChecksum(self, path): - "Return checksum of PATH, or empty string." - try: - return checksum(open(path, "rb").read()) - except: - return "" - - def uniquePath(self, dir, base): +If the same name exists, compare checksums.""" + mdir = self.dir(create=True) # remove any dangerous characters - base = re.sub(r"[][<>:/\\&]", "", base) - # find a unique name + base = re.sub(r"[][<>:/\\&]", "", os.path.basename(opath)) + dst = os.path.join(mdir, base) + # if it doesn't exist, copy it directly + if not os.path.exists(dst): + shutil.copy2(opath, dst) + return base + # if it's identical, reuse + if self.filesIdentical(opath, dst): + return base + # otherwise, find a unique name (root, ext) = os.path.splitext(base) def repl(match): n = int(match.group(1)) return " (%d)" % (n+1) while True: - path = os.path.join(dir, root + ext) + path = os.path.join(mdir, root + ext) if not os.path.exists(path): break reg = " \((\d+)\)$" @@ -131,7 +75,14 @@ If a file with the same name exists, return a unique name.""" root = root + " (1)" else: root = re.sub(reg, repl, root) - return path + # copy and return + shutil.copy2(opath, path) + return os.path.basename(os.path.basename(path)) + + def filesIdentical(self, path1, path2): + "True if files are the same." 
+ return (checksum(open(path1, "rb").read()) == + checksum(open(path2, "rb").read())) # String manipulation ########################################################################## @@ -163,25 +114,20 @@ If a file with the same name exists, return a unique name.""" # Rebuilding DB ########################################################################## - def rebuildMediaDir(self, delete=False): - mdir = self.mediaDir() + def check(self, delete=False): + "Return (missingFiles, unusedFiles)." + mdir = self.dir() if not mdir: return (0, 0) - # delete all media entries in database - self.deck.db.execute("delete from media") - # look through cards for media references + # generate card q/a and look through all references normrefs = {} def norm(s): if isinstance(s, unicode): return unicodedata.normalize('NFD', s) return s - # generate q/a and look through all references - for p in self.deck.renderQA(type="all"): - for type in ("q", "a"): - for f in self.mediaFiles(p[type]): - normrefs[norm(f)] = True - self.registerFile(f) - # find unused media + for f in self.allMedia(): + normrefs[norm(f)] = True + # loop through directory and find unused & missing media unused = [] for file in os.listdir(mdir): path = os.path.join(mdir, file) @@ -191,15 +137,25 @@ If a file with the same name exists, return a unique name.""" nfile = norm(file) if nfile not in normrefs: unused.append(file) + else: + del normrefs[nfile] # optionally delete if delete: for f in unused: path = os.path.join(mdir, f) os.unlink(path) - nohave = self.deck.db.list( - "select file from media where csum = ''") + nohave = normrefs.keys() return (nohave, unused) + def allMedia(self): + "Return a set of all referenced filenames." 
+ files = set() + for p in self.deck.renderQA(type="all"): + for type in ("q", "a"): + for f in self.mediaFiles(p[type]): + files.add(f) + return files + # Download missing ########################################################################## @@ -207,7 +163,7 @@ If a file with the same name exists, return a unique name.""" urlbase = self.deck.getVar("mediaURL") if not urlbase: return None - mdir = self.deck.mediaDir(create=True) + mdir = self.deck.dir(create=True) missing = 0 grabbed = 0 for c, (f, sum) in enumerate(self.deck.db.all( @@ -233,7 +189,7 @@ If a file with the same name exists, return a unique name.""" ########################################################################## def downloadRemote(self): - mdir = self.deck.mediaDir(create=True) + mdir = self.deck.dir(create=True) refs = {} for (question, answer) in self.deck.db.all( "select question, answer from cards"): diff --git a/anki/storage.py b/anki/storage.py index b51b30daf..55cf57d4a 100644 --- a/anki/storage.py +++ b/anki/storage.py @@ -130,12 +130,6 @@ create table if not exists gconf ( conf text not null ); -create table if not exists media ( - file text primary key, - mod integer not null, - csum text not null -); - create table if not exists revlog ( time integer primary key, cid integer not null, @@ -182,9 +176,7 @@ create index if not exists ix_facts_mod on facts (mod); create index if not exists ix_cards_fid on cards (fid); -- revlog by card create index if not exists ix_revlog_cid on revlog (cid); --- media -create index if not exists ix_media_csum on media (csum); --- unique checking +-- field uniqueness check create index if not exists ix_fsums_fid on fsums (fid); create index if not exists ix_fsums_csum on fsums (csum); """) @@ -312,11 +304,7 @@ from facts order by created""") # media ########### - _moveTable(db, "media") - db.execute(""" -insert or ignore into media select filename, cast(created as int), -originalPath from media2""") - db.execute("drop table media2") + 
db.execute("drop table media") # models ########### diff --git a/tests/support/anki12.anki b/tests/support/anki12.anki index 24d02755682ca35d4fc28676d1b84cab12b9ca07..7fd7be092867e304b055d784c0160e8cfa558331 100644 GIT binary patch delta 25 gcmZo@;Av>!nIO%`I#I@%k+m_QHGy$!0@H&30AWiAJpcdz delta 25 gcmZo@;Av>!nIO%`GEv5vk)<)AHGy$!0@H&30AVl)I{*Lx diff --git a/tests/test_media.py b/tests/test_media.py index e9d147f38..83184f867 100644 --- a/tests/test_media.py +++ b/tests/test_media.py @@ -5,85 +5,57 @@ from anki import Deck from anki.utils import checksum from shared import getEmptyDeck, testDir -# uniqueness check -def test_unique(): - d = getEmptyDeck() - dir = tempfile.mkdtemp(prefix="anki") - # new file - n = "foo.jpg" - new = os.path.basename(d.media.uniquePath(dir, n)) - assert new == n - # duplicate file - open(os.path.join(dir, n), "w").write("hello") - n = "foo.jpg" - new = os.path.basename(d.media.uniquePath(dir, n)) - assert new == "foo (1).jpg" - # another duplicate - open(os.path.join(dir, "foo (1).jpg"), "w").write("hello") - n = "foo.jpg" - new = os.path.basename(d.media.uniquePath(dir, n)) - assert new == "foo (2).jpg" - # copying files to media folder -def test_copy(): +def test_add(): d = getEmptyDeck() dir = tempfile.mkdtemp(prefix="anki") path = os.path.join(dir, "foo.jpg") open(path, "w").write("hello") - # new file + # new file, should preserve name assert d.media.addFile(path) == "foo.jpg" - # dupe md5 - path = os.path.join(dir, "bar.jpg") - open(path, "w").write("hello") + # adding the same file again should not create a duplicate assert d.media.addFile(path) == "foo.jpg" - -# media db -def test_db(): - deck = getEmptyDeck() - dir = tempfile.mkdtemp(prefix="anki") - path = os.path.join(dir, "foo.jpg") - open(path, "w").write("hello") - # add a new fact that references it twice - f = deck.newFact() - f['Front'] = u"" - f['Back'] = u"back [sound:foo.jpg]" - deck.addFact(f) - # 1 entry in the media db, and no checksum - assert deck.db.scalar("select 
count() from media") == 1 - assert not deck.db.scalar("select group_concat(csum, '') from media") - # copy to media folder - path = deck.media.addFile(path) - # md5 should be set now - assert deck.db.scalar("select count() from media") == 1 - assert deck.db.scalar("select group_concat(csum, '') from media") - # detect file modifications - oldsum = deck.db.scalar("select csum from media") + # but if it has a different md5, it should open(path, "w").write("world") - deck.media.rebuildMediaDir() - newsum = deck.db.scalar("select csum from media") - assert newsum and newsum != oldsum - # delete underlying file and check db - os.unlink(path) - deck.media.rebuildMediaDir() - # md5 should be gone again - assert deck.db.scalar("select count() from media") == 1 - assert deck.db.scalar("select not csum from media") - # media db should pick up media defined via templates & bulk update - f['Back'] = u"bar.jpg" - f.flush() - # modify template & regenerate - assert deck.db.scalar("select count() from media") == 1 - m = deck.currentModel() - m.templates[0]['afmt']=u'' - m.flush() - deck.renderQA(type="all") - assert deck.db.scalar("select count() from media") == 2 + assert d.media.addFile(path) == "foo (1).jpg" + +def test_strings(): + d = getEmptyDeck() + mf = d.media.mediaFiles + assert mf("aoeu") == [] + assert mf("aoeuao") == ["foo.jpg"] + assert mf("aoeuao") == ["foo bar.jpg"] + assert mf("aoeuao") == ["foo.jpg"] + assert mf("aoeuao") == [ + "foo.jpg", "fo"] + assert mf("aou[sound:foo.mp3]aou") == ["foo.mp3"] + sp = d.media.stripMedia + assert sp("aoeu") == "aoeu" + assert sp("aoeu[sound:foo.mp3]aoeu") == "aoeuaoeu" + assert sp("aoeu") == "aoeu" + es = d.media.escapeImages + assert es("aoeu") == "aoeu" + assert es("") == "" + assert es('') == '' def test_deckIntegration(): - deck = getEmptyDeck() + d = getEmptyDeck() # create a media dir - deck.media.mediaDir(create=True) + d.media.dir(create=True) # put a file into it - file = unicode(os.path.join(testDir, "deck/fake.png")) 
- deck.media.addFile(file) - print "todo: check media copied on rename" + file = unicode(os.path.join(testDir, "support/fake.png")) + d.media.addFile(file) + # add a fact which references it + f = d.newFact() + f['Front'] = u"one"; f['Back'] = u"" + d.addFact(f) + # and one which references a non-existent file + f = d.newFact() + f['Front'] = u"one"; f['Back'] = u"" + d.addFact(f) + # and add another file which isn't used + open(os.path.join(d.media.dir(), "foo.jpg"), "wb").write("test") + # check media + ret = d.media.check() + assert ret[0] == ["fake2.png"] + assert ret[1] == ["foo.jpg"]