define standard encoding for unicode (#893)

- always store media references in fields in NFC form
- always encode filenames on disk in NFC form on machines other than Macs
  (a Mac automatically converts filenames to NFD)
- use the appropriate encoding when placing files in the media folder:
  NFD on Macs and NFC elsewhere during syncs, and NFC during apkg imports
- rename 'unused media' back to 'check media'
- 'check media' can now automatically convert media references and
  on-disk filename encodings to the correct form
This commit is contained in:
Damien Elmes 2013-09-20 18:06:41 +09:00
parent 4d42282b7b
commit 0d1d8c5bf9
6 changed files with 60 additions and 31 deletions

View file

@ -3,6 +3,7 @@
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import os import os
import unicodedata
from anki import Collection from anki import Collection
from anki.utils import intTime, splitFields, joinFields, incGuid from anki.utils import intTime, splitFields, joinFields, incGuid
from anki.importing.base import Importer from anki.importing.base import Importer
@ -349,7 +350,8 @@ insert or ignore into revlog values (?,?,?,?,?,?,?,?,?)""", revlog)
return self._mediaData(fname, self.dst.media.dir()) return self._mediaData(fname, self.dst.media.dir())
def _writeDstMedia(self, fname, data): def _writeDstMedia(self, fname, data):
path = os.path.join(self.dst.media.dir(), fname) path = os.path.join(self.dst.media.dir(),
unicodedata.normalize("NFC", fname))
try: try:
open(path, "wb").write(data) open(path, "wb").write(data)
except (OSError, IOError): except (OSError, IOError):

View file

@ -3,6 +3,7 @@
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import zipfile, os import zipfile, os
import unicodedata
from anki.utils import tmpfile, json from anki.utils import tmpfile, json
from anki.importing.anki2 import Anki2Importer from anki.importing.anki2 import Anki2Importer
@ -26,7 +27,8 @@ class AnkiPackageImporter(Anki2Importer):
for file, c in self.nameToNum.items(): for file, c in self.nameToNum.items():
if not file.startswith("_") and not file.startswith("latex-"): if not file.startswith("_") and not file.startswith("latex-"):
continue continue
path = os.path.join(self.col.media.dir(), file) path = os.path.join(self.col.media.dir(),
unicodedata.normalize("NFC", file))
if not os.path.exists(path): if not os.path.exists(path):
open(path, "wb").write(z.read(c)) open(path, "wb").write(z.read(c))

View file

@ -83,6 +83,7 @@ class MediaManager(object):
# Adding media # Adding media
########################################################################## ##########################################################################
# opath must be in unicode
def addFile(self, opath): def addFile(self, opath):
return self.writeData(opath, open(opath, "rb").read()) return self.writeData(opath, open(opath, "rb").read())
@ -90,6 +91,9 @@ class MediaManager(object):
def writeData(self, opath, data): def writeData(self, opath, data):
# if fname is a full path, use only the basename # if fname is a full path, use only the basename
fname = os.path.basename(opath) fname = os.path.basename(opath)
# make sure we write it in NFC form (on mac will autoconvert to NFD),
# and return an NFC-encoded reference
fname = unicodedata.normalize("NFC", fname)
# remove any dangerous characters # remove any dangerous characters
base = self.stripIllegal(fname) base = self.stripIllegal(fname)
(root, ext) = os.path.splitext(base) (root, ext) = os.path.splitext(base)
@ -186,15 +190,19 @@ class MediaManager(object):
def check(self, local=None): def check(self, local=None):
"Return (missingFiles, unusedFiles)." "Return (missingFiles, unusedFiles)."
mdir = self.dir() mdir = self.dir()
# generate card q/a and look through all references # gather all media references in NFC form
normrefs = {} allRefs = set()
def norm(s): for nid, mid, flds in self.col.db.execute("select id, mid, flds from notes"):
if isinstance(s, unicode) and isMac: noteRefs = self.filesInStr(mid, flds)
return unicodedata.normalize('NFD', s) # check the refs are in NFC
return s for f in noteRefs:
for f in self.allMedia(): # if they're not, we'll need to fix them first
normrefs[norm(f)] = True if f != unicodedata.normalize("NFC", f):
# loop through directory and find unused & missing media self._normalizeNoteRefs(nid)
noteRefs = self.filesInStr(mid, flds)
break
allRefs.update(noteRefs)
# loop through media folder
unused = [] unused = []
if local is None: if local is None:
files = os.listdir(mdir) files = os.listdir(mdir)
@ -202,28 +210,38 @@ class MediaManager(object):
files = local files = local
for file in files: for file in files:
if not local: if not local:
path = os.path.join(mdir, file) if not os.path.isfile(file):
if not os.path.isfile(path):
# ignore directories # ignore directories
continue continue
if file.startswith("_"): if file.startswith("_"):
# leading _ says to ignore file # leading _ says to ignore file
continue continue
nfile = norm(file) nfcFile = unicodedata.normalize("NFC", file)
if nfile not in normrefs: # we enforce NFC fs encoding on non-macs; on macs we'll have gotten
# NFD so we use the above variable for comparing references
if not isMac:
if file != nfcFile:
# delete if we already have the NFC form, otherwise rename
if os.path.exists(nfcFile):
os.unlink(file)
else:
os.rename(file, nfcFile)
file = nfcFile
# compare
if nfcFile not in allRefs:
unused.append(file) unused.append(file)
else: else:
del normrefs[nfile] allRefs.discard(nfcFile)
nohave = [x for x in normrefs.keys() if not x.startswith("_")] nohave = [x for x in allRefs if not x.startswith("_")]
return (nohave, unused) return (nohave, unused)
def allMedia(self): def _normalizeNoteRefs(self, nid):
"Return a set of all referenced filenames." note = self.col.getNote(nid)
files = set() for c, fld in enumerate(note.fields):
for mid, flds in self.col.db.execute("select mid, flds from notes"): nfc = unicodedata.normalize("NFC", fld)
for f in self.filesInStr(mid, flds): if nfc != fld:
files.add(f) note.fields[c] = nfc
return files note.flush()
# Copying on import # Copying on import
########################################################################## ##########################################################################
@ -276,6 +294,11 @@ class MediaManager(object):
data = z.read(i) data = z.read(i)
csum = checksum(data) csum = checksum(data)
name = meta[i.filename] name = meta[i.filename]
# normalize name for platform
if isMac:
name = unicodedata.normalize("NFD", name)
else:
name = unicodedata.normalize("NFC", name)
# save file # save file
open(name, "wb").write(data) open(name, "wb").write(data)
# update db # update db
@ -327,6 +350,8 @@ class MediaManager(object):
z.writestr("_finished", "") z.writestr("_finished", "")
break break
fname = fname[0] fname = fname[0]
# we add it as a one-element array simply to make
# the later forgetAdded() call easier
fnames.append([fname]) fnames.append([fname])
z.write(fname, str(cnt)) z.write(fname, str(cnt))
files[str(cnt)] = fname files[str(cnt)] = fname

View file

@ -151,7 +151,7 @@
</action> </action>
<action name="actionCheckMediaDatabase"> <action name="actionCheckMediaDatabase">
<property name="text"> <property name="text">
<string>&amp;Unused Media...</string> <string>Check &amp;Media...</string>
</property> </property>
<property name="statusTip"> <property name="statusTip">
<string>Check the files in the media directory</string> <string>Check the files in the media directory</string>

View file

@ -18,6 +18,6 @@ def getUpgradeDeckPath(name="anki12.anki"):
src = os.path.join(testDir, "support", name) src = os.path.join(testDir, "support", name)
(fd, dst) = tempfile.mkstemp(suffix=".anki2") (fd, dst) = tempfile.mkstemp(suffix=".anki2")
shutil.copy(src, dst) shutil.copy(src, dst)
return dst return unicode(dst, "utf8")
testDir = os.path.dirname(__file__) testDir = os.path.dirname(__file__)

View file

@ -7,7 +7,7 @@ from shared import getEmptyDeck, testDir
def test_add(): def test_add():
d = getEmptyDeck() d = getEmptyDeck()
dir = tempfile.mkdtemp(prefix="anki") dir = tempfile.mkdtemp(prefix="anki")
path = os.path.join(dir, "foo.jpg") path = os.path.join(dir, u"foo.jpg")
open(path, "w").write("hello") open(path, "w").write("hello")
# new file, should preserve name # new file, should preserve name
assert d.media.addFile(path) == "foo.jpg" assert d.media.addFile(path) == "foo.jpg"
@ -72,7 +72,7 @@ def test_changes():
assert not list(d.media.removed()) assert not list(d.media.removed())
# add a file # add a file
dir = tempfile.mkdtemp(prefix="anki") dir = tempfile.mkdtemp(prefix="anki")
path = os.path.join(dir, "foo.jpg") path = os.path.join(dir, u"foo.jpg")
open(path, "w").write("hello") open(path, "w").write("hello")
time.sleep(1) time.sleep(1)
path = d.media.addFile(path) path = d.media.addFile(path)