
The media table was originally introduced when Anki hashed media filenames and needed a way to remember the original filename. It also helped with:

1) getting a quick list of all media used in the deck, or the media added since the last sync, for mobile clients
2) merging identical files with different names

But it had some drawbacks:

- every operation that modified templates, models or facts meant generating the q/a and checking if any new media had appeared
- each entry is about 70 bytes, and some decks have 100k+ media files

So we remove the media table. We address 1) by being more intelligent about media downloads on the mobile platform. After a full sync we ask the user if they want to look for missing media, and they can decline if they know they haven't added any. On a partial sync, we can scan the contents of the incoming facts for media references, and download any references we find. This also avoids all the issues people had with media not downloading because a file was in their media folder but not in the media database.

For 2), when copying media to the media folder, if we hit a duplicate filename we check whether the existing file has the same md5, and skip the copy if so. This won't merge identical content stored under separate names, but users rarely need that.
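A minimal sketch of the partial-sync idea described above (the function name, fieldTexts, and urlbase are illustrative assumptions, not part of this commit): scan incoming field text for media references and fetch whatever is missing locally.

    import os, urllib, urllib2

    def fetchMissingMedia(registry, fieldTexts, urlbase):
        # registry is a MediaRegistry; fieldTexts holds the incoming
        # facts' field contents; urlbase is a hypothetical server URL
        mdir = registry.dir(create=True)
        for txt in fieldTexts:
            for fname in registry.mediaFiles(txt):
                path = os.path.join(mdir, fname)
                if not os.path.exists(path):
                    # fetch and store under the referenced name
                    data = urllib2.urlopen(
                        urlbase + urllib.quote(fname.encode("utf-8"))).read()
                    open(path, "wb").write(data)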
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html

import os, shutil, re, urllib2, time, tempfile, unicodedata, urllib

from anki.utils import checksum, intTime
from anki.lang import _

class MediaRegistry(object):

    # other code depends on this order, so don't reorder
    regexps = (r"(?i)(\[sound:([^]]+)\])",
               r"(?i)(<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>)")

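    # Each regexp captures (full tag, filename), e.g.
    #   "[sound:bell.mp3]"    -> ("[sound:bell.mp3]", "bell.mp3")
    #   '<img src="cat.jpg">' -> ('<img src="cat.jpg">', "cat.jpg")
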
    def __init__(self, deck):
        self.deck = deck
        self.mediaPrefix = ""
        self._dir = None
        self._updateDir()

    def dir(self, create=False):
        if self._dir:
            return self._dir
        elif create:
            self._updateDir(True)
            return self._dir

    def _updateDir(self, create=False):
        if self.mediaPrefix:
            dir = os.path.join(
                self.mediaPrefix, os.path.basename(self.deck.path))
        else:
            dir = self.deck.path
        dir = re.sub(r"(?i)\.(anki)$", ".media", dir)
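        # e.g. a deck saved as "French.anki" keeps its media in a
        # sibling "French.media" folder (or under mediaPrefix, if set)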
        if create is None:
            # don't create, but return dir
            return dir
        if not os.path.exists(dir):
            if not create:
                return
            # will raise error if we can't create
            os.makedirs(dir)
        # change the working directory to the media dir
        os.chdir(dir)
        self._dir = dir

    # Adding media
    ##########################################################################

    def addFile(self, opath):
        """Copy PATH to MEDIADIR, and return new filename.
        If the same name exists, compare checksums."""
        mdir = self.dir(create=True)
        # remove any dangerous characters
        base = re.sub(r"[][<>:/\\&]", "", os.path.basename(opath))
        dst = os.path.join(mdir, base)
        # if it doesn't exist, copy it directly
        if not os.path.exists(dst):
            shutil.copy2(opath, dst)
            return base
        # if it's identical, reuse
        if self.filesIdentical(opath, dst):
            return base
        # otherwise, find a unique name
        (root, ext) = os.path.splitext(base)
        def repl(match):
            n = int(match.group(1))
            return " (%d)" % (n+1)
        while True:
            path = os.path.join(mdir, root + ext)
            if not os.path.exists(path):
                break
            reg = r" \((\d+)\)$"
            if not re.search(reg, root):
                root = root + " (1)"
            else:
                root = re.sub(reg, repl, root)
        # copy and return
        shutil.copy2(opath, path)
        return os.path.basename(path)

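    # Example for addFile (hypothetical names): if "photo.jpg" already
    # exists in the media folder with the same checksum, the existing
    # copy is reused and "photo.jpg" is returned; a different file with
    # the same name comes back as "photo (1).jpg", then "photo (2).jpg",
    # and so on.
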
    def filesIdentical(self, path1, path2):
        "True if files are the same."
        return (checksum(open(path1, "rb").read()) ==
                checksum(open(path2, "rb").read()))

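    # checksum() (from anki.utils) provides the md5 comparison described
    # in the commit message, so duplicate names with identical content
    # can be detected without a media table.
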
    # String manipulation
    ##########################################################################

    def mediaFiles(self, string, includeRemote=False):
        l = []
        for reg in self.regexps:
            for (full, fname) in re.findall(reg, string):
                isLocal = not re.match("(https?|ftp)://", fname.lower())
                if isLocal or includeRemote:
                    l.append(fname)
        return l

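    # e.g. mediaFiles(u'<img src="a.jpg"> [sound:b.mp3]') returns
    # [u"b.mp3", u"a.jpg"]: sound references come first because the
    # sound regexp is scanned before the image one.
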
    def stripMedia(self, txt):
        for reg in self.regexps:
            txt = re.sub(reg, "", txt)
        return txt

    def escapeImages(self, string):
        def repl(match):
            tag = match.group(1)
            fname = match.group(2)
            if re.match("(https?|ftp)://", fname):
                return tag
            return tag.replace(
                fname, urllib.quote(fname.encode("utf-8")))
        return re.sub(self.regexps[1], repl, string)

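    # e.g. escapeImages(u'<img src="my photo.jpg">') yields
    # u'<img src="my%20photo.jpg">'; remote links are left untouched.
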
    # Rebuilding DB
    ##########################################################################

    def check(self, delete=False):
        "Return (missingFiles, unusedFiles)."
        mdir = self.dir()
        if not mdir:
            return ([], [])
        # generate card q/a and look through all references
        normrefs = {}
        def norm(s):
            if isinstance(s, unicode):
                return unicodedata.normalize('NFD', s)
            return s
        for f in self.allMedia():
            normrefs[norm(f)] = True
        # loop through the directory and find unused & missing media
        unused = []
        for file in os.listdir(mdir):
            path = os.path.join(mdir, file)
            if not os.path.isfile(path):
                # ignore directories
                continue
            nfile = norm(file)
            if nfile not in normrefs:
                unused.append(file)
            else:
                del normrefs[nfile]
        # optionally delete unused files
        if delete:
            for f in unused:
                path = os.path.join(mdir, f)
                os.unlink(path)
        nohave = normrefs.keys()
        return (nohave, unused)

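    # check() pairs with allMedia() below: the first list holds files
    # referenced by cards but missing from the media folder, the second
    # files on disk that no card references.
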
    def allMedia(self):
        "Return a set of all referenced filenames."
        files = set()
        for p in self.deck.renderQA(type="all"):
            for type in ("q", "a"):
                for f in self.mediaFiles(p[type]):
                    files.add(f)
        return files

    # Download missing
    ##########################################################################

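    # downloadMissing returns None when the deck has no mediaURL set,
    # (False, url) when a file that is supposed to exist can't be
    # fetched, and (True, grabbed, missing) on completion.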
    def downloadMissing(self):
        urlbase = self.deck.getVar("mediaURL")
        if not urlbase:
            return None
        mdir = self.dir(create=True)
        missing = 0
        grabbed = 0
        for c, (f, sum) in enumerate(self.deck.db.all(
            "select file, csum from media")):
            path = os.path.join(mdir, f)
            if not os.path.exists(path):
                try:
                    rpath = urlbase + f
                    url = urllib2.urlopen(rpath)
                    open(path, "wb").write(url.read())
                    grabbed += 1
                except:
                    if sum:
                        # the file is supposed to exist
                        return (False, rpath)
                    else:
                        # ignore and keep going
                        missing += 1
            #self.deck.updateProgress(label=_("File %d...") % (grabbed+missing))
        return (True, grabbed, missing)

    # Convert remote links to local ones
    ##########################################################################

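    # downloadRemote copies remotely-referenced files into the media
    # folder and rewrites field values to point at the local copies,
    # returning (passed, failed), where passed pairs each link with its
    # new local name.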
    def downloadRemote(self):
        mdir = self.dir(create=True)
        refs = {}
        for (question, answer) in self.deck.db.all(
            "select question, answer from cards"):
            for txt in (question, answer):
                for f in self.mediaFiles(txt, includeRemote=True):
                    # only remote links need downloading
                    if re.match("(https?|ftp)://", f.lower()):
                        refs[f] = True

        tmpdir = tempfile.mkdtemp(prefix="anki")
        failed = []
        passed = []
        for c, link in enumerate(refs.keys()):
            try:
                path = os.path.join(tmpdir, os.path.basename(link))
                url = urllib2.urlopen(link)
                open(path, "wb").write(url.read())
                newpath = self.addFile(path)
                passed.append([link, newpath])
            except:
                failed.append(link)
            #self.deck.updateProgress(label=_("Download %d...") % c)
        for (url, name) in passed:
            self.deck.db.execute(
                "update fields set value = replace(value, :url, :name)",
                url=url, name=name)
        #self.deck.updateProgress(label=_("Updating references..."))
        #self.deck.updateProgress(label=_("Updating cards..."))
        # rebuild entire q/a cache
        for m in self.deck.models:
            self.deck.updateCardsFromModel(m, dirty=True)
        return (passed, failed)