Anki/anki/media.py
Damien Elmes 9c247f45bd remove q/a cache, tags in fields, rewrite remaining ids, more
Anki used random 64-bit IDs for cards, facts and fields. This had some nice
properties:
- merging data in syncs and imports was simply a matter of copying each way,
  as conflicts were astronomically unlikely
- it made it easy to identify identical cards and prevent them from being
  reimported
But there were some negatives too:
- they're more expensive to store
- JavaScript can't exactly represent integers above 2**53, which means
  AnkiMobile, iAnki and so on have to treat the ids as strings, which is slow
- simply copying data in a sync or import can lead to corruption, as while a
  duplicate id indicates the data was originally the same, it may have
  diverged. A more intelligent approach is necessary.
- sqlite was sorting the fields table based on the id, which meant the fields
  were spread across the table, and costly to fetch

So instead, we'll move to incremental ids. In the case of model changes we'll
treat that as a schema change and force a full sync, to avoid having to deal
with conflicts; in the case of cards and facts, we'll need to update the ids
on one end to merge. Identical cards can be detected by checking that both
their id and their creation time match, as in the sketch below.
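
A minimal sketch of that duplicate check during a merge (the helper and
attribute names here are hypothetical; the actual sync code is not part of
this commit):

    def isSameCard(local, remote):
        # with incremental ids, an id match alone is not proof of
        # identity; the creation time must match too
        return local.id == remote.id and local.created == remote.created

    def mergeCard(deck, local, remote):
        if local and isSameCard(local, remote):
            return  # genuinely the same card; nothing to copy
        if local:
            # id collision with a different card: give the incoming
            # card a fresh incremental id before inserting it
            remote.id = deck.nextCardId()
        deck.addCard(remote)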

Creation time has been added back to cards and facts because it's necessary
for sync conflict merging. That means facts.pos is not required.

The graves table has been removed. It's not necessary for schema-related
changes, and dead cards/facts can be represented as a card with queue=-4 and
created=0. Because we record the schema modification time and can ensure a
full sync propagates to all endpoints, we can remove the dead cards/facts on
schema change.
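
An illustrative sketch of that representation (the card columns are as
described above; the deletion helpers are hypothetical):

    def deleteCard(deck, id):
        # a dead card is just a card with queue=-4 and created=0;
        # no separate graves entry is needed any more
        deck.db.execute(
            "update cards set queue = -4, created = 0 where id = ?", id)

    def removeDeadCards(deck):
        # safe to run on schema change, because the recorded schema
        # modification time forces a full sync to all endpoints
        deck.db.execute("delete from cards where queue = -4 and created = 0")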

Tags have been removed from the facts table and are represented as a field
with ord=-1 and fmid=0. Combined with the locality improvement for fields, it
means that fetching fields is not much more expensive than using the q/a
cache.
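
For example, a fact's fields and its tags can now be fetched with a single
scan of the fields table (a sketch; the exact column names are assumptions):

    def factFieldsAndTags(deck, factId):
        tags = None
        fields = {}
        for ord, val in deck.db.all(
            "select ord, value from fields where factId = ?", factId):
            if ord == -1:
                tags = val  # the tags pseudo-field (fmid=0)
            else:
                fields[ord] = val
        return fields, tags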

Because of the above, removing the q/a cache is now possible. The q and a
columns on cards have been dropped. It will still be necessary to render the
q/a on fact add/edit, since we need to record media references. It would be
nice to avoid this in the future. Perhaps one way would be the ability to
assign a type to fields, like "image", "audio", or "latex". LaTeX needs
special consideration anyway, as it was being rendered into the q/a cache.
2011-04-28 09:23:53 +09:00


# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
import os, shutil, re, urllib2, time, tempfile, unicodedata, urllib
from anki.utils import checksum, intTime
from anki.lang import _
class MediaRegistry(object):
    # other code depends on this order, so don't reorder
    regexps = ("(?i)(\[sound:([^]]+)\])",
               "(?i)(<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>)")

    def __init__(self, deck):
        self.deck = deck
        self.mediaPrefix = ""
        self._mediaDir = None
        self._updateMediaDir()

    def mediaDir(self, create=False):
        if self._mediaDir:
            return self._mediaDir
        elif create:
            self._updateMediaDir(True)
            return self._mediaDir

    def _updateMediaDir(self, create=False):
        if self.mediaPrefix:
            dir = os.path.join(
                self.mediaPrefix, os.path.basename(self.deck.path))
        else:
            dir = self.deck.path
        dir = re.sub("(?i)\.(anki)$", ".media", dir)
        if create == None:
            # don't create, but return dir
            return dir
        if not os.path.exists(dir):
            if not create:
                return
            # will raise error if we can't create
            os.makedirs(dir)
        # change to the media dir
        os.chdir(dir)
        self._mediaDir = dir

    # Adding and registering media
    ##########################################################################

    def addFile(self, path):
        """Copy PATH to MEDIADIR, and return new filename.
        If a file with the same md5sum exists in the DB, return that.
        If a file with the same name exists, return a unique name."""
        # see if have duplicate contents
        csum = self.mediaChecksum(path)
        if not csum:
            # file was unreadable or didn't exist
            return None
        file = self.deck.db.scalar(
            "select file from media where csum = :cs",
            cs=csum)
        if not file:
            base = os.path.basename(path)
            mdir = self.mediaDir(create=True)
            file = self.uniquePath(mdir, base)
            shutil.copy2(path, file)
            # register the name actually written, which may have been
            # uniquified, rather than the original basename
            self.registerFile(os.path.basename(file))
        return os.path.basename(file)
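    # Usage sketch (illustrative): addFile("/tmp/cat.jpg") copies the file
    # into the media dir and returns "cat.jpg" (or "cat (1).jpg" on a name
    # clash), or the name of an existing copy if a file with the same
    # checksum is already registered.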

    def registerFile(self, file):
        "Add a single file to the media database."
        if self.mediaDir():
            csum = self.mediaChecksum(os.path.join(self.mediaDir(), file))
        else:
            csum = ""
        self.deck.db.execute(
            "insert or replace into media values (?, ?, ?)",
            file, intTime(), csum)

    def registerText(self, string):
        "Add all media in string to the media database."
        for f in self.mediaFiles(string):
            self.registerFile(f)

    # FIXME: legacy helper: it still takes the deck as its first argument
    # and uses the old session API (deck.s) and old media schema (size
    # column); it needs porting to self.deck.db and the new table layout.
    def removeUnusedMedia(deck):
        ids = deck.s.list("select id from media where size = 0")
        for id in ids:
            deck.s.statement("insert into mediaDeleted values (:id, :t)",
                             id=id, t=time.time())
        deck.s.statement("delete from media where size = 0")

    # Moving media
    ##########################################################################

    def renameMediaDir(self, oldPath):
        "Copy oldPath to our current media dir."
        assert os.path.exists(oldPath)
        # compute the target dir without creating it
        newPath = self._updateMediaDir(create=None)
        # copytree doesn't want the dir to exist
        try:
            shutil.copytree(oldPath, newPath)
        except:
            # FIXME: should really remove everything in old dir instead of
            # giving up
            pass

    # Tools
    ##########################################################################

    def mediaChecksum(self, path):
        "Return checksum of PATH, or empty string."
        try:
            return checksum(open(path, "rb").read())
        except:
            return ""

    def uniquePath(self, dir, base):
        # remove any dangerous characters
        base = re.sub(r"[][<>:/\\&]", "", base)
        # find a unique name
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n+1)
        while True:
            path = os.path.join(dir, root + ext)
            if not os.path.exists(path):
                break
            reg = " \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)
        return path
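    # For example (illustrative): if "img.jpg" and "img (1).jpg" already
    # exist in DIR, uniquePath(DIR, "img.jpg") returns ".../img (2).jpg".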

    # String manipulation
    ##########################################################################

    def mediaFiles(self, string, includeRemote=False):
        l = []
        for reg in self.regexps:
            for (full, fname) in re.findall(reg, string):
                isLocal = not re.match("(https?|ftp)://", fname.lower())
                if isLocal or includeRemote:
                    l.append(fname)
        return l
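    # For example (illustrative):
    #   mediaFiles('<img src="a.jpg"> [sound:b.mp3]') -> ["b.mp3", "a.jpg"]
    # (sound references come first because of the regexp order above)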

    def stripMedia(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string):
        def repl(match):
            tag = match.group(1)
            fname = match.group(2)
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(
                fname, urllib.quote(fname.encode("utf-8")))
        return re.sub(self.regexps[1], repl, string)

    # Rebuilding DB
    ##########################################################################

    def rebuildMediaDir(self, delete=False):
        mdir = self.mediaDir()
        if not mdir:
            return (0, 0)
        self.deck.startProgress()
        # delete all media entries in database
        self.deck.db.execute("delete from media")
        # look through cards for media references
        normrefs = {}
        def norm(s):
            if isinstance(s, unicode):
                return unicodedata.normalize('NFD', s)
            return s
        # generate q/a and look through all references
        (cids, fids, meta) = self.deck._cacheMeta()
        facts = self.deck._cacheFacts(fids)
        pend = [self.deck.formatQA(cids[n], facts[fids[n]], meta[cids[n]])
                for n in range(len(cids))]
        for p in pend:
            for type in ("q", "a"):
                for f in self.mediaFiles(p[type]):
                    normrefs[norm(f)] = True
                    self.registerFile(f)
        # find unused media
        unused = []
        for file in os.listdir(mdir):
            path = os.path.join(mdir, file)
            if not os.path.isfile(path):
                # ignore directories
                continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
        # optionally delete
        if delete:
            for f in unused:
                path = os.path.join(mdir, f)
                os.unlink(path)
        nohave = self.deck.db.list(
            "select file from media where csum = ''")
        self.deck.finishProgress()
        return (nohave, unused)

    # Download missing
    ##########################################################################

    def downloadMissing(self):
        urlbase = self.deck.getVar("mediaURL")
        if not urlbase:
            return None
        # mediaDir() lives on this registry, not on the deck
        mdir = self.mediaDir(create=True)
        self.deck.startProgress()
        missing = 0
        grabbed = 0
        for c, (f, sum) in enumerate(self.deck.db.all(
            "select file, csum from media")):
            path = os.path.join(mdir, f)
            if not os.path.exists(path):
                try:
                    rpath = urlbase + f
                    url = urllib2.urlopen(rpath)
                    # write to the full path, not the bare filename
                    open(path, "wb").write(url.read())
                    grabbed += 1
                except:
                    if sum:
                        # the file is supposed to exist
                        self.deck.finishProgress()
                        return (False, rpath)
                    else:
                        # ignore and keep going
                        missing += 1
                self.deck.updateProgress(label=_("File %d...") % (grabbed+missing))
        self.deck.finishProgress()
        return (True, grabbed, missing)

    # Convert remote links to local ones
    ##########################################################################

    # FIXME: this still reads the old q/a columns and rebuilds the q/a
    # cache, both of which this commit removes; it needs updating for the
    # new schema.
    def downloadRemote(self):
        mdir = self.mediaDir(create=True)
        refs = {}
        self.deck.startProgress()
        for (question, answer) in self.deck.db.all(
            "select question, answer from cards"):
            for txt in (question, answer):
                # mediaFiles is a method on this registry, and its keyword
                # argument is includeRemote
                for f in self.mediaFiles(txt, includeRemote=True):
                    refs[f] = True
        tmpdir = tempfile.mkdtemp(prefix="anki")
        failed = []
        passed = []
        for c, link in enumerate(refs.keys()):
            try:
                path = os.path.join(tmpdir, os.path.basename(link))
                url = urllib2.urlopen(link)
                open(path, "wb").write(url.read())
                # addFile() copies into the media dir and registers it
                newpath = self.addFile(path)
                passed.append([link, newpath])
            except:
                failed.append(link)
            self.deck.updateProgress(label=_("Download %d...") % c)
        self.deck.updateProgress(label=_("Updating references..."))
        for (url, name) in passed:
            self.deck.db.execute(
                "update fields set value = replace(value, :url, :name)",
                url=url, name=name)
        self.deck.updateProgress(label=_("Updating cards..."))
        # rebuild entire q/a cache
        for m in self.deck.models:
            self.deck.updateCardsFromModel(m, dirty=True)
        self.deck.finishProgress()
        return (passed, failed)