Mirror of https://github.com/ankitects/anki.git (synced 2025-09-19 14:32:22 -04:00).

I removed the media database in an earlier commit, but it's now necessary again as I decided to add native media syncing to AnkiWeb. This time, the DB is stored in the media folder rather than with the deck. This means we avoid sending it in a full sync, and makes deck backups faster. The DB is a cache of file modtimes and checksums. When findChanges() is called, the code checks to see which files were added, changed or deleted since the last time, and updates the log of changes. Because the scanning step and log retrieval is separate, it's possible to do the scanning in the background if the need arises. If the DB is deleted by the user, Anki will forget any deletions, and add all the files back to the DB the next time it's accessed. File changes are recorded as a delete + add. media.addFile() could be optimized in the future to log media added manually by the user, allowing us to skip the full directory scan in cases where the only changes were manually added media.
257 lines · 8.8 KiB · Python
# -*- coding: utf-8 -*-
|
|
# Copyright: Damien Elmes <anki@ichi2.net>
|
|
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
|
|
|
|
import os, shutil, re, urllib, urllib2, time, unicodedata, \
|
|
urllib, sys, shutil
|
|
from anki.utils import checksum, intTime, namedtmp, isWin
|
|
from anki.lang import _
|
|
from anki.db import DB
|
|
|
|
class MediaManager(object):

    # Regular expressions that locate media references in fields:
    #   0: [sound:...] tags; 1: <img src=...> tags.
    # Other code depends on this order, so don't reorder.
    # Pattern 0 is a raw string: "\[" is not a valid string escape and
    # triggers warnings/errors on modern Python when left non-raw.
    regexps = (r"(?i)(\[sound:([^]]+)\])",
               "(?i)(<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>)")
|
|
|
|
def __init__(self, deck):
|
|
self.deck = deck
|
|
# media directory
|
|
self._dir = re.sub("(?i)\.(anki)$", ".media", self.deck.path)
|
|
if not os.path.exists(self._dir):
|
|
os.makedirs(self._dir)
|
|
os.chdir(self._dir)
|
|
# change database
|
|
path = os.path.join(self.dir(), "media.db")
|
|
create = not os.path.exists(path)
|
|
self.db = DB(path)
|
|
if create:
|
|
self._initDB()
|
|
|
|
def dir(self):
|
|
return self._dir
|
|
|
|
# Adding media
|
|
##########################################################################
|
|
|
|
def addFile(self, opath):
|
|
"""Copy PATH to MEDIADIR, and return new filename.
|
|
If the same name exists, compare checksums."""
|
|
mdir = self.dir()
|
|
# remove any dangerous characters
|
|
base = re.sub(r"[][<>:/\\&]", "", os.path.basename(opath))
|
|
dst = os.path.join(mdir, base)
|
|
# if it doesn't exist, copy it directly
|
|
if not os.path.exists(dst):
|
|
shutil.copy2(opath, dst)
|
|
return base
|
|
# if it's identical, reuse
|
|
if self.filesIdentical(opath, dst):
|
|
return base
|
|
# otherwise, find a unique name
|
|
(root, ext) = os.path.splitext(base)
|
|
def repl(match):
|
|
n = int(match.group(1))
|
|
return " (%d)" % (n+1)
|
|
while True:
|
|
path = os.path.join(mdir, root + ext)
|
|
if not os.path.exists(path):
|
|
break
|
|
reg = " \((\d+)\)$"
|
|
if not re.search(reg, root):
|
|
root = root + " (1)"
|
|
else:
|
|
root = re.sub(reg, repl, root)
|
|
# copy and return
|
|
shutil.copy2(opath, path)
|
|
return os.path.basename(os.path.basename(path))
|
|
|
|
def filesIdentical(self, path1, path2):
|
|
"True if files are the same."
|
|
return (checksum(open(path1, "rb").read()) ==
|
|
checksum(open(path2, "rb").read()))
|
|
|
|
# String manipulation
|
|
##########################################################################
|
|
|
|
def mediaFiles(self, string, includeRemote=False):
|
|
l = []
|
|
for reg in self.regexps:
|
|
for (full, fname) in re.findall(reg, string):
|
|
isLocal = not re.match("(https?|ftp)://", fname.lower())
|
|
if isLocal or includeRemote:
|
|
l.append(fname)
|
|
return l
|
|
|
|
def strip(self, txt):
|
|
for reg in self.regexps:
|
|
txt = re.sub(reg, "", txt)
|
|
return txt
|
|
|
|
def escapeImages(self, string):
|
|
# Feeding webkit unicode can result in it not finding images, so on
|
|
# linux/osx we percent escape the image paths as utf8. On Windows the
|
|
# problem is more complicated - if we percent-escape as utf8 it fixes
|
|
# some images but breaks others. When filenames are normalized by
|
|
# dropbox they become unreadable if we escape them.
|
|
if isWin:
|
|
return string
|
|
def repl(match):
|
|
tag = match.group(1)
|
|
fname = match.group(2)
|
|
if re.match("(https?|ftp)://", fname):
|
|
return tag
|
|
return tag.replace(
|
|
fname, urllib.quote(fname.encode("utf-8")))
|
|
return re.sub(self.regexps[1], repl, string)
|
|
|
|
# Rebuilding DB
|
|
##########################################################################
|
|
|
|
    def check(self, delete=False):
        """Return (missingFiles, unusedFiles); optionally delete unused ones.

        NOTE(review): when there is no media dir this returns (0, 0) rather
        than two lists — callers presumably treat that as a sentinel; confirm.
        """
        mdir = self.dir()
        if not mdir:
            return (0, 0)
        # generate card q/a and look through all references
        normrefs = {}
        # Normalize unicode names to NFD before comparing, so references
        # match files regardless of normalization form (Python 2 `unicode`).
        def norm(s):
            if isinstance(s, unicode):
                return unicodedata.normalize('NFD', s)
            return s
        for f in self.allMedia():
            normrefs[norm(f)] = True
        # loop through directory and find unused & missing media
        unused = []
        for file in os.listdir(mdir):
            # skip LaTeX-generated files and our own cache DB
            if file.startswith("latex-") or file.startswith("media.db"):
                continue
            path = os.path.join(mdir, file)
            if not os.path.isfile(path):
                # ignore directories
                continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
            else:
                # referenced and present; remove so only missing refs remain
                del normrefs[nfile]
        # optionally delete
        if delete:
            for f in unused:
                path = os.path.join(mdir, f)
                os.unlink(path)
        # whatever is left in normrefs was referenced but not found on disk
        nohave = normrefs.keys()
        return (nohave, unused)
|
|
|
|
def allMedia(self):
|
|
"Return a set of all referenced filenames."
|
|
files = set()
|
|
for p in self.deck.renderQA(type="all"):
|
|
for type in ("q", "a"):
|
|
for f in self.mediaFiles(p[type]):
|
|
files.add(f)
|
|
return files
|
|
|
|
# Tracking changes
|
|
##########################################################################
|
|
|
|
    def _initDB(self):
        # Create the media DB schema:
        # - media: one row per file in the media dir; csum and mod cache the
        #   file's checksum and mtime so scans can skip unchanged files.
        # - meta: dirMod holds the media dir's mtime at the last scan; the
        #   initial 0 forces a first scan.
        # - log: append-only change log read by changesSince(); in the log,
        #   a mod time of zero indicates a delete.
        self.db.executescript("""
create table media (fname text primary key, csum text, mod int);
create table meta (dirMod int);
insert into meta values (0);
create table log (id int, fname text, mod int);
create index ix_log_id on log (id);
""")
|
|
|
|
def _mtime(self, path):
|
|
return int(os.stat(path).st_mtime)
|
|
|
|
def _checksum(self, path):
|
|
return checksum(open(path, "rb").read())
|
|
|
|
def changed(self):
|
|
"Return dir mtime if it has changed since the last findChanges()"
|
|
# doesn't track edits, but user can add or remove a file to update
|
|
mod = self.db.scalar("select dirMod from meta")
|
|
mtime = self._mtime(self.dir())
|
|
if mod and mod == mtime:
|
|
return False
|
|
return mtime
|
|
|
|
def findChanges(self):
|
|
"Scan the media folder if it's changed, and note any changes."
|
|
if self.changed():
|
|
self._logChanges()
|
|
|
|
def changesSince(self, mod):
|
|
"Return a list of added and removed files since MOD time."
|
|
self.findChanges()
|
|
added = {}
|
|
removed = {}
|
|
# loop through and collect changes, removing duplicates
|
|
for fname, mod in self.db.all(
|
|
"select fname, mod from log where id > ?", mod):
|
|
if mod:
|
|
added[fname] = mod
|
|
else:
|
|
removed[fname] = mod
|
|
return added.items(), removed.keys()
|
|
|
|
    def _changes(self):
        # Compare the on-disk media dir against the cached DB state and
        # return (added, changed, removed) lists of filenames.
        # NOTE: passes bare filenames to os.path.isdir/_mtime/_checksum, so
        # it relies on cwd being the media dir (set by __init__ via os.chdir).
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
            "select * from media"):
            # cache value: [checksum, mtime, seen-on-disk flag]
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        changed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore our db and folders
            if f.startswith("media.db") or os.path.isdir(f):
                continue
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time? (cheap mtime check first)
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        changed.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in self.cache.items():
            if not v[2]:
                removed.append(k)
        return added, changed, removed
|
|
|
|
    def _logChanges(self):
        # Persist the results of _changes() to the media table and change
        # log, then store the dir mtime so changed() can skip future scans.
        # NOTE: like _changes(), this calls _mtime/_checksum with bare
        # filenames and so relies on cwd being the media dir.
        (added, changed, removed) = self._changes()
        log = []
        media = []
        mediaRem = []
        # one timestamp for the whole batch of log entries
        t = intTime()
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            log.append((t, f, mt))
        for f in changed:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt))
            # a changed file is logged as a delete (mod=0) followed by an add
            log.append((t, f, 0))
            log.append((t, f, mt))
        for f in removed:
            mediaRem.append((f,))
            # in the log, a mod time of zero indicates a delete
            log.append((t, f, 0))
        # update db
        self.db.executemany("insert or replace into media values (?,?,?)",
                            media)
        self.db.executemany("insert into log values (?,?,?)", log)
        if mediaRem:
            self.db.executemany("delete from media where fname = ?",
                                mediaRem)
        # record the dir mtime last, after all writes
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
|