remove the media table

The media table was originally introduced when Anki hashed media filenames,
and needed a way to remember the original filename. It also helped with:
1) getting a quick list of all media used in the deck, or the media added
   since the last sync, for mobile clients
2) merging identical files with different names

But it had some drawbacks:
- every operation that modifies templates, models or facts meant generating
  the q/a and checking if any new media had appeared
- each entry is about 70 bytes, and some decks have 100k+ media files

So we remove the media table. We address 1) by being more intelligent about
media downloads on the mobile platform. We ask the user after a full sync if
they want to look for missing media, and they can choose not to if they know
they haven't added any. And on a partial sync, we can scan the contents of the
incoming facts for media references, and download any references we find. This
also avoids all the issues people had with media not downloading because it
was in their media folder but not in the media database.

For 2), when copying media to the media folder, if we have a duplicate
filename, we check if that file has the same md5, and avoid copying if so.
This won't merge identical content that has separate names, but instances
where users need that are rare.
This commit is contained in:
Damien Elmes 2011-03-19 10:45:48 +09:00
parent bd477de1a9
commit be045d451c
6 changed files with 98 additions and 183 deletions

View file

@ -450,7 +450,6 @@ select id from cards where fid in (select id from facts where mid = ?)""",
html = anki.template.render(format, fields)
# if filters:
# d[type] = runFilter("renderQA.post", html, fields, meta, self)
self.media.registerText(html)
d[type] = html
return d

View file

@ -69,7 +69,7 @@ def buildImg(deck, latex):
texfile.write(latex)
texfile.close()
# make sure we have a valid mediaDir
mdir = deck.mediaDir(create=True)
mdir = deck.media.dir(create=True)
oldcwd = os.getcwd()
if sys.platform == "win32":
si = subprocess.STARTUPINFO()

View file

@ -15,17 +15,17 @@ class MediaRegistry(object):
def __init__(self, deck):
self.deck = deck
self.mediaPrefix = ""
self._mediaDir = None
self._updateMediaDir()
self._dir = None
self._updateDir()
def mediaDir(self, create=False):
if self._mediaDir:
return self._mediaDir
def dir(self, create=False):
if self._dir:
return self._dir
elif create:
self._updateMediaDir(True)
return self._mediaDir
self._updateDir(True)
return self._dir
def _updateMediaDir(self, create=False):
def _updateDir(self, create=False):
if self.mediaPrefix:
dir = os.path.join(
self.mediaPrefix, os.path.basename(self.deck.path))
@ -42,88 +42,32 @@ class MediaRegistry(object):
os.makedirs(dir)
# change to the current dir
os.chdir(dir)
self._mediaDir = dir
self._dir = dir
# Adding and registering media
# Adding media
##########################################################################
def addFile(self, path):
def addFile(self, opath):
"""Copy PATH to MEDIADIR, and return new filename.
If a file with the same md5sum exists in the DB, return that.
If a file with the same name exists, return a unique name."""
# see if have duplicate contents
csum = self.mediaChecksum(path)
if not csum:
# file was unreadable or didn't exist
return None
file = self.deck.db.scalar(
"select file from media where csum = :cs",
cs=csum)
if not file:
base = os.path.basename(path)
mdir = self.mediaDir(create=True)
file = self.uniquePath(mdir, base)
shutil.copy2(path, file)
self.registerFile(base)
return os.path.basename(file)
def registerFile(self, file):
"Add a single file to the media database."
if self.mediaDir():
csum = self.mediaChecksum(os.path.join(self.mediaDir(), file))
else:
csum = ""
self.deck.db.execute(
"insert or replace into media values (?, ?, ?)",
file, intTime(), csum)
def registerText(self, string):
"Add all media in string to the media database."
for f in self.mediaFiles(string):
self.registerFile(f)
def removeUnusedMedia(deck):
ids = deck.s.list("select id from media where size = 0")
for id in ids:
deck.s.statement("insert into mediaDeleted values (:id, :t)",
id=id, t=time.time())
deck.s.statement("delete from media where size = 0")
# Moving media
##########################################################################
def renameMediaDir(self, oldPath):
"Copy oldPath to our current media dir. "
assert os.path.exists(oldPath)
newPath = self.mediaDir(create=None)
# copytree doesn't want the dir to exist
try:
shutil.copytree(oldPath, newPath)
except:
# FIXME: should really remove everything in old dir instead of
# giving up
pass
# Tools
##########################################################################
def mediaChecksum(self, path):
"Return checksum of PATH, or empty string."
try:
return checksum(open(path, "rb").read())
except:
return ""
def uniquePath(self, dir, base):
If the same name exists, compare checksums."""
mdir = self.dir(create=True)
# remove any dangerous characters
base = re.sub(r"[][<>:/\\&]", "", base)
# find a unique name
base = re.sub(r"[][<>:/\\&]", "", os.path.basename(opath))
dst = os.path.join(mdir, base)
# if it doesn't exist, copy it directly
if not os.path.exists(dst):
shutil.copy2(opath, dst)
return base
# if it's identical, reuse
if self.filesIdentical(opath, dst):
return base
# otherwise, find a unique name
(root, ext) = os.path.splitext(base)
def repl(match):
n = int(match.group(1))
return " (%d)" % (n+1)
while True:
path = os.path.join(dir, root + ext)
path = os.path.join(mdir, root + ext)
if not os.path.exists(path):
break
reg = " \((\d+)\)$"
@ -131,7 +75,14 @@ If a file with the same name exists, return a unique name."""
root = root + " (1)"
else:
root = re.sub(reg, repl, root)
return path
# copy and return
shutil.copy2(opath, path)
return os.path.basename(os.path.basename(path))
def filesIdentical(self, path1, path2):
    """True if files are the same.

    Both files are read in full in binary mode and their checksums are
    compared. Any error raised while opening or reading either file
    propagates to the caller.
    """
    def digest(path):
        # close the handle explicitly rather than relying on garbage
        # collection, so we don't leak descriptors (or keep the file
        # locked on Windows) when many files are added in a row
        f = open(path, "rb")
        try:
            return checksum(f.read())
        finally:
            f.close()
    return digest(path1) == digest(path2)
# String manipulation
##########################################################################
@ -163,25 +114,20 @@ If a file with the same name exists, return a unique name."""
# Rebuilding DB
##########################################################################
def rebuildMediaDir(self, delete=False):
mdir = self.mediaDir()
def check(self, delete=False):
"Return (missingFiles, unusedFiles)."
mdir = self.dir()
if not mdir:
return (0, 0)
# delete all media entries in database
self.deck.db.execute("delete from media")
# look through cards for media references
# generate card q/a and look through all references
normrefs = {}
def norm(s):
if isinstance(s, unicode):
return unicodedata.normalize('NFD', s)
return s
# generate q/a and look through all references
for p in self.deck.renderQA(type="all"):
for type in ("q", "a"):
for f in self.mediaFiles(p[type]):
normrefs[norm(f)] = True
self.registerFile(f)
# find unused media
for f in self.allMedia():
normrefs[norm(f)] = True
# loop through directory and find unused & missing media
unused = []
for file in os.listdir(mdir):
path = os.path.join(mdir, file)
@ -191,15 +137,25 @@ If a file with the same name exists, return a unique name."""
nfile = norm(file)
if nfile not in normrefs:
unused.append(file)
else:
del normrefs[nfile]
# optionally delete
if delete:
for f in unused:
path = os.path.join(mdir, f)
os.unlink(path)
nohave = self.deck.db.list(
"select file from media where csum = ''")
nohave = normrefs.keys()
return (nohave, unused)
def allMedia(self):
    """Return a set of all referenced filenames.

    Renders the question and answer of every card and gathers each
    filename that mediaFiles() finds in the rendered text.
    """
    referenced = set()
    for rendered in self.deck.renderQA(type="all"):
        for side in ("q", "a"):
            # duplicates across cards/sides collapse in the set
            referenced.update(self.mediaFiles(rendered[side]))
    return referenced
# Download missing
##########################################################################
@ -207,7 +163,7 @@ If a file with the same name exists, return a unique name."""
urlbase = self.deck.getVar("mediaURL")
if not urlbase:
return None
mdir = self.deck.mediaDir(create=True)
mdir = self.deck.dir(create=True)
missing = 0
grabbed = 0
for c, (f, sum) in enumerate(self.deck.db.all(
@ -233,7 +189,7 @@ If a file with the same name exists, return a unique name."""
##########################################################################
def downloadRemote(self):
mdir = self.deck.mediaDir(create=True)
mdir = self.deck.dir(create=True)
refs = {}
for (question, answer) in self.deck.db.all(
"select question, answer from cards"):

View file

@ -130,12 +130,6 @@ create table if not exists gconf (
conf text not null
);
create table if not exists media (
file text primary key,
mod integer not null,
csum text not null
);
create table if not exists revlog (
time integer primary key,
cid integer not null,
@ -182,9 +176,7 @@ create index if not exists ix_facts_mod on facts (mod);
create index if not exists ix_cards_fid on cards (fid);
-- revlog by card
create index if not exists ix_revlog_cid on revlog (cid);
-- media
create index if not exists ix_media_csum on media (csum);
-- unique checking
-- field uniqueness check
create index if not exists ix_fsums_fid on fsums (fid);
create index if not exists ix_fsums_csum on fsums (csum);
""")
@ -312,11 +304,7 @@ from facts order by created""")
# media
###########
_moveTable(db, "media")
db.execute("""
insert or ignore into media select filename, cast(created as int),
originalPath from media2""")
db.execute("drop table media2")
db.execute("drop table media")
# models
###########

Binary file not shown.

View file

@ -5,85 +5,57 @@ from anki import Deck
from anki.utils import checksum
from shared import getEmptyDeck, testDir
# uniqueness check
def test_unique():
d = getEmptyDeck()
dir = tempfile.mkdtemp(prefix="anki")
# new file
n = "foo.jpg"
new = os.path.basename(d.media.uniquePath(dir, n))
assert new == n
# duplicate file
open(os.path.join(dir, n), "w").write("hello")
n = "foo.jpg"
new = os.path.basename(d.media.uniquePath(dir, n))
assert new == "foo (1).jpg"
# another duplicate
open(os.path.join(dir, "foo (1).jpg"), "w").write("hello")
n = "foo.jpg"
new = os.path.basename(d.media.uniquePath(dir, n))
assert new == "foo (2).jpg"
# copying files to media folder
def test_copy():
def test_add():
d = getEmptyDeck()
dir = tempfile.mkdtemp(prefix="anki")
path = os.path.join(dir, "foo.jpg")
open(path, "w").write("hello")
# new file
# new file, should preserve name
assert d.media.addFile(path) == "foo.jpg"
# dupe md5
path = os.path.join(dir, "bar.jpg")
open(path, "w").write("hello")
# adding the same file again should not create a duplicate
assert d.media.addFile(path) == "foo.jpg"
# media db
def test_db():
deck = getEmptyDeck()
dir = tempfile.mkdtemp(prefix="anki")
path = os.path.join(dir, "foo.jpg")
open(path, "w").write("hello")
# add a new fact that references it twice
f = deck.newFact()
f['Front'] = u"<img src='foo.jpg'>"
f['Back'] = u"back [sound:foo.jpg]"
deck.addFact(f)
# 1 entry in the media db, and no checksum
assert deck.db.scalar("select count() from media") == 1
assert not deck.db.scalar("select group_concat(csum, '') from media")
# copy to media folder
path = deck.media.addFile(path)
# md5 should be set now
assert deck.db.scalar("select count() from media") == 1
assert deck.db.scalar("select group_concat(csum, '') from media")
# detect file modifications
oldsum = deck.db.scalar("select csum from media")
# but if it has a different md5, it should
open(path, "w").write("world")
deck.media.rebuildMediaDir()
newsum = deck.db.scalar("select csum from media")
assert newsum and newsum != oldsum
# delete underlying file and check db
os.unlink(path)
deck.media.rebuildMediaDir()
# md5 should be gone again
assert deck.db.scalar("select count() from media") == 1
assert deck.db.scalar("select not csum from media")
# media db should pick up media defined via templates & bulk update
f['Back'] = u"bar.jpg"
f.flush()
# modify template & regenerate
assert deck.db.scalar("select count() from media") == 1
m = deck.currentModel()
m.templates[0]['afmt']=u'<img src="{{{Back}}}">'
m.flush()
deck.renderQA(type="all")
assert deck.db.scalar("select count() from media") == 2
assert d.media.addFile(path) == "foo (1).jpg"
def test_strings():
    """Exercise the string helpers on the deck's media object.

    Covers mediaFiles (extracting referenced filenames), stripMedia
    (removing media references from text) and escapeImages (escaping
    local image filenames while leaving remote URLs alone).
    """
    deck = getEmptyDeck()
    # filenames are pulled out of <img src=...> and [sound:...] references,
    # whether the src is single-quoted, double-quoted or unquoted
    extract = deck.media.mediaFiles
    for text, expected in (
        ("aoeu", []),
        ("aoeu<img src='foo.jpg'>ao", ["foo.jpg"]),
        ("aoeu<img src=foo bar.jpg>ao", ["foo bar.jpg"]),
        ("aoeu<img src=\"foo.jpg\">ao", ["foo.jpg"]),
        ("aoeu<img src=\"foo.jpg\"><img class=yo src=fo>ao",
         ["foo.jpg", "fo"]),
        ("aou[sound:foo.mp3]aou", ["foo.mp3"]),
    ):
        assert extract(text) == expected
    # media references are removed, surrounding text is kept
    strip = deck.media.stripMedia
    for text, expected in (
        ("aoeu", "aoeu"),
        ("aoeu[sound:foo.mp3]aoeu", "aoeuaoeu"),
        ("a<img src=yo>oeu", "aoeu"),
    ):
        assert strip(text) == expected
    # spaces in local image names are escaped; remote URLs are untouched
    escape = deck.media.escapeImages
    for text, expected in (
        ("aoeu", "aoeu"),
        ("<img src='http://foo.com'>", "<img src='http://foo.com'>"),
        ('<img src="foo bar.jpg">', '<img src="foo%20bar.jpg">'),
    ):
        assert escape(text) == expected
def test_deckIntegration():
deck = getEmptyDeck()
d = getEmptyDeck()
# create a media dir
deck.media.mediaDir(create=True)
d.media.dir(create=True)
# put a file into it
file = unicode(os.path.join(testDir, "deck/fake.png"))
deck.media.addFile(file)
print "todo: check media copied on rename"
file = unicode(os.path.join(testDir, "support/fake.png"))
d.media.addFile(file)
# add a fact which references it
f = d.newFact()
f['Front'] = u"one"; f['Back'] = u"<img src='fake.png'>"
d.addFact(f)
# and one which references a non-existent file
f = d.newFact()
f['Front'] = u"one"; f['Back'] = u"<img src='fake2.png'>"
d.addFact(f)
# and add another file which isn't used
open(os.path.join(d.media.dir(), "foo.jpg"), "wb").write("test")
# check media
ret = d.media.check()
assert ret[0] == ["fake2.png"]
assert ret[1] == ["foo.jpg"]