Anki/anki/media.py
dae 2aa7714f87 new media syncing protocol
- media syncing no longer locks the account, so it can be done
  in the background in the future, and multiple clients can safely
  sync media at the same time
- all operations are now idempotent, so they can be repeated safely
  in the event of a connection error
- whether it's a normal incremental sync, an initial sync,
  or the media database has been deleted, no files will be uploaded
  or downloaded if they already exist on the other side
- file removals are now chunked like additions & updates, preventing
  timeouts due to large requests
- if the server can't process a chunk in time, it will return a count
  of what it did process, so the client can retry the rest (see the
  sketch below)
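
A minimal sketch of the client-side upload loop this enables. The
server.uploadChanges() call and its (processedCount, lastUsn) return
value are assumptions for illustration, not the actual sync API;
mediaChangesZip(), markClean() and setLastUsn() are real methods of
MediaManager below:

    def uploadDirtyMedia(media, server):
        while True:
            zipData, fnames = media.mediaChangesZip()
            if not fnames:
                # nothing dirty left to send
                break
            processedCount, lastUsn = server.uploadChanges(zipData)
            # only files the server confirmed are marked clean, so a
            # chunk that timed out part-way is resent on the next pass
            media.markClean(fnames[0:processedCount])
            media.setLastUsn(lastUsn)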

Notes for AnkiDroid:

- when porting this, we recommend picking a different name for the
  .media.db2 file, so users don't accidentally copy the AnkiDroid
  version to the desktop or vice versa
- please make sure filenames are added to the zip in NFC form (see
  the sketch below)
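
To illustrate the NFC point, a hedged sketch (it mirrors what
mediaChangesZip() below already does; addToZip is just a name made
up for the example):

    import unicodedata

    def addToZip(z, fname, zipname):
        # the file goes in under a short zipname, but the _meta entry
        # must carry the NFC form of the filename (OSX hands back NFD)
        normname = unicodedata.normalize("NFC", fname)
        z.write(fname, zipname)
        return (normname, zipname)
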
2014-06-26 09:00:38 +09:00


# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html

import os
import re
import urllib
import unicodedata
import sys
import zipfile
from cStringIO import StringIO
from anki.utils import checksum, isWin, isMac, json
from anki.db import DB
from anki.consts import *
from anki.latex import mungeQA

class MediaManager(object):

    soundRegexps = ["(?i)(\[sound:(?P<fname>[^]]+)\])"]
    imgRegexps = [
        # src element quoted case
        "(?i)(<img[^>]* src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)",
        # unquoted case
        "(?i)(<img[^>]* src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)",
    ]
    regexps = soundRegexps + imgRegexps

    def __init__(self, col, server):
        self.col = col
        if server:
            self._dir = None
            return
        # media directory
        self._dir = re.sub("(?i)\.(anki2)$", ".media", self.col.path)
        # convert dir to unicode if it's not already
        if isinstance(self._dir, str):
            self._dir = unicode(self._dir, sys.getfilesystemencoding())
        if not os.path.exists(self._dir):
            os.makedirs(self._dir)
        try:
            self._oldcwd = os.getcwd()
        except OSError:
            # cwd doesn't exist
            self._oldcwd = None
        try:
            os.chdir(self._dir)
        except OSError:
            raise Exception("invalidTempFolder")
        # change database
        self.connect()

    def connect(self):
        if self.col.server:
            return
        path = self.dir()+".db2"
        create = not os.path.exists(path)
        os.chdir(self._dir)
        self.db = DB(path)
        if create:
            self._initDB()
        self.maybeUpgrade()

    def _initDB(self):
        self.db.executescript("""
create table media (
 fname text not null primary key,
 csum text,           -- null indicates deleted file
 mtime int not null,  -- zero if deleted
 dirty int not null
);
create index idx_media_dirty on media (dirty);
create table meta (dirMod int, lastUsn int);
insert into meta values (0, 0);
""")

    def maybeUpgrade(self):
        oldpath = self.dir()+".db"
        if os.path.exists(oldpath):
            self.db.execute('attach "../collection.media.db" as old')
            self.db.execute("""
insert into media
 select m.fname, csum, mod,
  ifnull((select 1 from log l2 where l2.fname=m.fname), 0) as dirty
 from old.media m
 left outer join old.log l using (fname)
 union
 select fname, null, 0, 1 from old.log where type=1;""")
            self.db.execute("delete from meta")
            self.db.execute("""
insert into meta select dirMod, usn from old.meta""")
            self.db.execute("detach old")
            self.db.commit()
            # sqlite has no combined "vacuum analyze"; run them separately
            self.db.execute("vacuum")
            self.db.execute("analyze")
            os.rename("../collection.media.db", "../collection.media.db.old")

    def close(self):
        if self.col.server:
            return
        self.db.close()
        self.db = None
        # change cwd back to old location
        if self._oldcwd:
            try:
                os.chdir(self._oldcwd)
            except:
                # may have been deleted
                pass

    def dir(self):
        return self._dir

    def _isFAT32(self):
        if not isWin:
            return
        import win32api, win32file
        try:
            name = win32file.GetVolumeNameForVolumeMountPoint(self._dir[:3])
        except:
            # mapped & unmapped network drive; pray that it's not vfat
            return
        if win32api.GetVolumeInformation(name)[4].lower().startswith("fat"):
            return True

    # Adding media
    ##########################################################################
    # opath must be in unicode

    def addFile(self, opath):
        return self.writeData(opath, open(opath, "rb").read())

    def writeData(self, opath, data):
        # if fname is a full path, use only the basename
        fname = os.path.basename(opath)
        # make sure we write it in NFC form (on mac will autoconvert to NFD),
        # and return an NFC-encoded reference
        fname = unicodedata.normalize("NFC", fname)
        # remove any dangerous characters
        base = self.stripIllegal(fname)
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n+1)
        # find the first available name
        csum = checksum(data)
        while True:
            fname = root + ext
            path = os.path.join(self.dir(), fname)
            # if it doesn't exist, copy it directly
            if not os.path.exists(path):
                open(path, "wb").write(data)
                return fname
            # if it's identical, reuse
            if checksum(open(path, "rb").read()) == csum:
                return fname
            # otherwise, increment the index in the filename
            reg = " \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)

    # String manipulation
    ##########################################################################

    def filesInStr(self, mid, string, includeRemote=False):
        l = []
        model = self.col.models.get(mid)
        strings = []
        if model['type'] == MODEL_CLOZE and "{{c" in string:
            # if the field has clozes in it, we'll need to expand the
            # possibilities so we can render latex
            strings = self._expandClozes(string)
        else:
            strings = [string]
        for string in strings:
            # handle latex
            string = mungeQA(string, None, None, model, None, self.col)
            # extract filenames
            for reg in self.regexps:
                for match in re.finditer(reg, string):
                    fname = match.group("fname")
                    isLocal = not re.match("(https?|ftp)://", fname.lower())
                    if isLocal or includeRemote:
                        l.append(fname)
        return l

    def _expandClozes(self, string):
        ords = set(re.findall("{{c(\d+)::.+?}}", string))
        strings = []
        from anki.template.template import clozeReg
        def qrepl(m):
            if m.group(3):
                return "[%s]" % m.group(3)
            else:
                return "[...]"
        def arepl(m):
            return m.group(1)
        for ord in ords:
            s = re.sub(clozeReg%ord, qrepl, string)
            s = re.sub(clozeReg%".+?", "\\1", s)
            strings.append(s)
        strings.append(re.sub(clozeReg%".+?", arepl, string))
        return strings

    def transformNames(self, txt, func):
        for reg in self.regexps:
            txt = re.sub(reg, func, txt)
        return txt

    def strip(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string):
        def repl(match):
            tag = match.group(0)
            fname = match.group("fname")
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(
                fname, urllib.quote(fname.encode("utf-8")))
        for reg in self.imgRegexps:
            string = re.sub(reg, repl, string)
        return string

    # Rebuilding DB
    ##########################################################################

    def check(self, local=None):
        "Return (missingFiles, unusedFiles, invalidFiles)."
        mdir = self.dir()
        # gather all media references in NFC form
        allRefs = set()
        for nid, mid, flds in self.col.db.execute(
            "select id, mid, flds from notes"):
            noteRefs = self.filesInStr(mid, flds)
            # check the refs are in NFC
            for f in noteRefs:
                # if they're not, we'll need to fix them first
                if f != unicodedata.normalize("NFC", f):
                    self._normalizeNoteRefs(nid)
                    noteRefs = self.filesInStr(mid, flds)
                    break
            allRefs.update(noteRefs)
        # loop through media folder
        unused = []
        invalid = []
        if local is None:
            files = os.listdir(mdir)
        else:
            files = local
        renamedFiles = False
        for file in files:
            if not local:
                if not os.path.isfile(file):
                    # ignore directories
                    continue
            if file.startswith("_"):
                # leading _ says to ignore file
                continue
            if not isinstance(file, unicode):
                invalid.append(
                    unicode(file, sys.getfilesystemencoding(), "replace"))
                continue
            nfcFile = unicodedata.normalize("NFC", file)
            # we enforce NFC fs encoding on non-macs; on macs we'll have gotten
            # NFD so we use the above variable for comparing references
            if not isMac and not local:
                if file != nfcFile:
                    # delete if we already have the NFC form, otherwise rename
                    if os.path.exists(nfcFile):
                        os.unlink(file)
                        renamedFiles = True
                    else:
                        os.rename(file, nfcFile)
                        renamedFiles = True
                    file = nfcFile
            # compare
            if nfcFile not in allRefs:
                unused.append(file)
            else:
                allRefs.discard(nfcFile)
        # if we renamed any files to nfc format, we must rerun the check
        # to make sure the renamed files are not marked as unused
        if renamedFiles:
            return self.check(local=local)
        nohave = [x for x in allRefs if not x.startswith("_")]
        return (nohave, unused, invalid)

    def _normalizeNoteRefs(self, nid):
        note = self.col.getNote(nid)
        for c, fld in enumerate(note.fields):
            nfc = unicodedata.normalize("NFC", fld)
            if nfc != fld:
                note.fields[c] = nfc
        note.flush()

    # Copying on import
    ##########################################################################

    def have(self, fname):
        return os.path.exists(os.path.join(self.dir(), fname))

    # Illegal characters
    ##########################################################################

    _illegalCharReg = re.compile(r'[][><:"/?*^\\|\0\r\n]')

    def stripIllegal(self, str):
        return re.sub(self._illegalCharReg, "", str)

    def hasIllegal(self, str):
        # a file that couldn't be decoded to unicode is considered invalid
        if not isinstance(str, unicode):
            return True
        return not not re.search(self._illegalCharReg, str)

    # Tracking changes
    ##########################################################################

    def findChanges(self):
        "Scan the media folder if it's changed, and note any changes."
        if self._changed():
            self._logChanges()

    def haveDirty(self):
        return self.db.scalar("select 1 from media where dirty=1 limit 1")

    def _mtime(self, path):
        return int(os.stat(path).st_mtime)

    def _checksum(self, path):
        return checksum(open(path, "rb").read())

    def _changed(self):
        "Return dir mtime if it has changed since the last findChanges()."
        # doesn't track edits, but user can add or remove a file to update
        mod = self.db.scalar("select dirMod from meta")
        mtime = self._mtime(self.dir())
        if not self._isFAT32() and mod and mod == mtime:
            return False
        return mtime

    def _logChanges(self):
        (added, removed) = self._changes()
        media = []
        for f in added:
            mt = self._mtime(f)
            media.append((f, self._checksum(f), mt, 1))
        for f in removed:
            media.append((f, None, 0, 1))
        # update media db
        self.db.executemany("insert or replace into media values (?,?,?,?)",
                            media)
        self.db.execute("update meta set dirMod = ?", self._mtime(self.dir()))
        self.db.commit()

    def _changes(self):
        self.cache = {}
        for (name, csum, mod) in self.db.execute(
            "select fname, csum, mtime from media"):
            self.cache[name] = [csum, mod, False]
        added = []
        removed = []
        # loop through on-disk files
        for f in os.listdir(self.dir()):
            # ignore folders and thumbs.db
            if os.path.isdir(f):
                continue
            if f.lower() == "thumbs.db":
                continue
            # and files with invalid chars
            if self.hasIllegal(f):
                continue
            # empty files are invalid; clean them up and continue
            if not os.path.getsize(f):
                os.unlink(f)
                continue
            # newly added?
            if f not in self.cache:
                added.append(f)
            else:
                # modified since last time?
                if self._mtime(f) != self.cache[f][1]:
                    # and has different checksum?
                    if self._checksum(f) != self.cache[f][0]:
                        added.append(f)
                # mark as used
                self.cache[f][2] = True
        # look for any entries in the cache that no longer exist on disk
        for (k, v) in self.cache.items():
            if not v[2]:
                removed.append(k)
        return added, removed

    # Syncing-related
    ##########################################################################

    def lastUsn(self):
        return self.db.scalar("select lastUsn from meta")

    def setLastUsn(self, usn):
        self.db.execute("update meta set lastUsn = ?", usn)
        self.db.commit()

    def syncInfo(self, fname):
        ret = self.db.first(
            "select csum, dirty from media where fname=?", fname)
        return ret or (None, 0)

    def markClean(self, fnames):
        for fname in fnames:
            self.db.execute(
                "update media set dirty=0 where fname=?", fname)

    def syncDelete(self, fname):
        if os.path.exists(fname):
            os.unlink(fname)
        self.db.execute("delete from media where fname=?", fname)

    def mediaCount(self):
        return self.db.scalar(
            "select count() from media where csum is not null")

    def forceResync(self):
        self.db.execute("delete from media")
        # sqlite has no combined "vacuum analyze"; run them separately
        self.db.execute("vacuum")
        self.db.execute("analyze")
        self.db.commit()

    # Media syncing: zips
    ##########################################################################

    def mediaChangesZip(self):
        f = StringIO()
        z = zipfile.ZipFile(f, "w", compression=zipfile.ZIP_DEFLATED)
        fnames = []
        # meta is a list of (fname, zipname); an empty zipname signals
        # a deleted file
        meta = []
        sz = 0
        for c, (fname, csum) in enumerate(self.db.execute(
                "select fname, csum from media where dirty=1"
                " limit %d" % SYNC_ZIP_COUNT)):
            fnames.append(fname)
            normname = unicodedata.normalize("NFC", fname)
            if csum:
                z.write(fname, str(c))
                meta.append((normname, str(c)))
                sz += os.path.getsize(fname)
            else:
                meta.append((normname, ""))
            if sz >= SYNC_ZIP_SIZE:
                break
        z.writestr("_meta", json.dumps(meta))
        z.close()
        return f.getvalue(), fnames

    def addFilesFromZip(self, zipData):
        "Extract zip data; returns the number of files extracted."
        f = StringIO(zipData)
        z = zipfile.ZipFile(f, "r")
        media = []
        # get meta info first; it maps zip member names to real filenames
        meta = json.loads(z.read("_meta"))
        # then loop through all files
        cnt = 0
        for i in z.infolist():
            if i.filename == "_meta":
                # ignore previously-retrieved meta
                continue
            else:
                data = z.read(i)
                csum = checksum(data)
                name = meta[i.filename]
                if not isinstance(name, unicode):
                    name = unicode(name, "utf8")
                # normalize name for platform
                if isMac:
                    name = unicodedata.normalize("NFD", name)
                else:
                    name = unicodedata.normalize("NFC", name)
                # save file
                open(name, "wb").write(data)
                # update db
                media.append((name, csum, self._mtime(name), 0))
                cnt += 1
        if media:
            self.db.executemany(
                "insert or replace into media values (?,?,?,?)", media)
        return cnt