Anki/anki/media.py
Damien Elmes c59dd854fb add change detection
I removed the media database in an earlier commit, but it's now necessary
again as I decided to add native media syncing to AnkiWeb.

This time, the DB is stored in the media folder rather than with the deck.
This means we avoid sending it in a full sync, and it makes deck backups faster.
The DB is a cache of file modtimes and checksums. When findChanges() is
called, the code checks to see which files were added, changed or deleted
since the last time, and updates the log of changes. Because the scanning step
and log retrieval are separate, it's possible to do the scanning in the
background if the need arises.

If the DB is deleted by the user, Anki will forget any deletions, and add all
the files back to the DB the next time it's accessed.

File changes are recorded as a delete + add.

media.addFile() could be optimized in the future to log media added manually
by the user, allowing us to skip the full directory scan in cases where the
only changes were manually added media.
2011-09-12 03:11:06 +09:00
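A minimal usage sketch of the change-tracking API in the file below. `deck` (an already-open deck object) and `lastSync` (the time of the previous sync) are placeholders; MediaManager and changesSince() are the real names from this file:

# hypothetical caller: `deck` is an open deck object, `lastSync` the last sync time
mm = MediaManager(deck)
added, removed = mm.changesSince(lastSync)   # rescans the folder first if it changed
for fname, mod in added:
    print "upload", fname
for fname in removed:
    print "remove remotely", fname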


# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html

import os, shutil, re, urllib, urllib2, time, unicodedata, sys
from anki.utils import checksum, intTime, namedtmp, isWin
from anki.lang import _
from anki.db import DB

class MediaManager(object):

    # other code depends on this order, so don't reorder
    regexps = ("(?i)(\[sound:([^]]+)\])",
               "(?i)(<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>)")

    def __init__(self, deck):
        self.deck = deck
        # media directory
        self._dir = re.sub("(?i)\.(anki)$", ".media", self.deck.path)
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        # the rest of the code assumes the cwd is the media folder, so that
        # files can be referred to by bare name
        os.chdir(self._dir)
        # change database
        path = os.path.join(self.dir(), "media.db")
        create = not os.path.exists(path)
        self.db = DB(path)
        if create:
            self._initDB()

    def dir(self):
        return self._dir

    # Adding media
    ##########################################################################

    def addFile(self, opath):
        """Copy PATH to MEDIADIR, and return new filename.
        If the same name exists, compare checksums."""
        mdir = self.dir()
        # remove any dangerous characters
        base = re.sub(r"[][<>:/\\&]", "", os.path.basename(opath))
        dst = os.path.join(mdir, base)
        # if it doesn't exist, copy it directly
        if not os.path.exists(dst):
            shutil.copy2(opath, dst)
            return base
        # if it's identical, reuse
        if self.filesIdentical(opath, dst):
            return base
        # otherwise, find a unique name
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n+1)
        while True:
            path = os.path.join(mdir, root + ext)
            if not os.path.exists(path):
                break
            reg = " \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)
        # copy and return
        shutil.copy2(opath, path)
        return os.path.basename(path)

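    # Illustration (hypothetical paths, assuming `mm` is an instance of this
    # class): adding a second, different file under an already-used name picks
    # a numbered alternative instead of overwriting:
    #   mm.addFile(u"/home/user/dog.jpg")   # -> u"dog.jpg"
    #   mm.addFile(u"/tmp/other/dog.jpg")   # -> u"dog (1).jpg" if contents differ
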
    def filesIdentical(self, path1, path2):
        "True if files are the same."
        return (checksum(open(path1, "rb").read()) ==
                checksum(open(path2, "rb").read()))

    # String manipulation
    ##########################################################################

    def mediaFiles(self, string, includeRemote=False):
        l = []
        for reg in self.regexps:
            for (full, fname) in re.findall(reg, string):
                isLocal = not re.match("(https?|ftp)://", fname.lower())
                if isLocal or includeRemote:
                    l.append(fname)
        return l

    def strip(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

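    # Illustration of the two helpers above on a hypothetical field; sound
    # references are collected before image references because of regexp order:
    #   mediaFiles(u'hi <img src="a.jpg"> [sound:b.mp3]')  ->  [u"b.mp3", u"a.jpg"]
    #   strip(u'hi <img src="a.jpg"> [sound:b.mp3]')       ->  u"hi  "
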
    def escapeImages(self, string):
        # Feeding webkit unicode can result in it not finding images, so on
        # linux/osx we percent escape the image paths as utf8. On Windows the
        # problem is more complicated - if we percent-escape as utf8 it fixes
        # some images but breaks others. When filenames are normalized by
        # dropbox they become unreadable if we escape them.
        if isWin:
            return string
        def repl(match):
            tag = match.group(1)
            fname = match.group(2)
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(
                fname, urllib.quote(fname.encode("utf-8")))
        return re.sub(self.regexps[1], repl, string)

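    # Illustration (hypothetical input, on linux/osx): local image paths are
    # percent-escaped, remote URLs are left alone, and on Windows the whole
    # string is returned untouched:
    #   escapeImages(u'<img src="my pic.jpg">')              -> u'<img src="my%20pic.jpg">'
    #   escapeImages(u'<img src="http://host/a b.jpg">')     -> unchanged
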
    # Rebuilding DB
    ##########################################################################

    def check(self, delete=False):
        "Return (missingFiles, unusedFiles)."
        mdir = self.dir()
        if not mdir:
            return (0, 0)
        # generate card q/a and look through all references
        normrefs = {}
        def norm(s):
            if isinstance(s, unicode):
                return unicodedata.normalize('NFD', s)
            return s
        for f in self.allMedia():
            normrefs[norm(f)] = True
        # loop through directory and find unused & missing media
        unused = []
        for file in os.listdir(mdir):
            if file.startswith("latex-") or file.startswith("media.db"):
                continue
            path = os.path.join(mdir, file)
            if not os.path.isfile(path):
                # ignore directories
                continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
            else:
                del normrefs[nfile]
        # optionally delete
        if delete:
            for f in unused:
                path = os.path.join(mdir, f)
                os.unlink(path)
        nohave = normrefs.keys()
        return (nohave, unused)

    def allMedia(self):
        "Return a set of all referenced filenames."
        files = set()
        for p in self.deck.renderQA(type="all"):
            for type in ("q", "a"):
                for f in self.mediaFiles(p[type]):
                    files.add(f)
        return files

    # Tracking changes
    ##########################################################################

    def _initDB(self):
        # in the log, a mod time of zero indicates a delete
        self.db.executescript("""
create table media (fname text primary key, csum text, mod int);
create table meta (dirMod int);
insert into meta values (0);
create table log (id int, fname text, mod int);
create index ix_log_id on log (id);
""")

    def _mtime(self, path):
        return int(os.stat(path).st_mtime)

    def _checksum(self, path):
        return checksum(open(path, "rb").read())

    def changed(self):
        "Return dir mtime if it has changed since the last findChanges()."
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if mod and mod == mtime:
            return False
        return mtime

    def findChanges(self):
        "Scan the media folder if it's changed, and note any changes."
        if self.changed():
            self._logChanges()

    def changesSince(self, mod):
        "Return a list of added and removed files since MOD time."
        self.findChanges()
        added = {}
        removed = {}
        # loop through and collect changes, removing duplicates
        for fname, mod in self.db.all(
                "select fname, mod from log where id > ?", mod):
            if mod:
                added[fname] = mod
            else:
                removed[fname] = mod
        return added.items(), removed.keys()

    def _changes(self):
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
                "select * from media"):
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        changed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore our db and folders
            if f.startswith("media.db") or os.path.isdir(f):
                continue
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time?
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        changed.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in self.cache.items():
            if not v[2]:
                removed.append(k)
        return added, changed, removed

    def _logChanges(self):
        (added, changed, removed) = self._changes()
        log = []
        media = []
        mediaRem = []
        t = intTime()
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            log.append((t, f, mt))
        for f in changed:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            # an edit is logged as a delete followed by an add
            log.append((t, f, 0))
            log.append((t, f, mt))
        for f in removed:
            mediaRem.append((f,))
            log.append((t, f, 0))
        # update db
        self.db.executemany("insert or replace into media values (?,?,?)",
                            media)
        self.db.executemany("insert into log values (?,?,?)", log)
        if mediaRem:
            self.db.executemany("delete from media where fname = ?",
                                mediaRem)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))