refactor media downloading on paste/drop

- use html as the first choice for incoming drops/pastes - when filtering incoming html, automatically localize any remote image references - add a special case for pasting/dropping from google images when html stripping is on - move filtering code into editor
2025-09-25 01:06:35 -04:00 · 2013-07-11 17:21:16 +09:00 · 2013-07-11 17:21:16 +09:00 · e2d2b759a4
commit e2d2b759a4
parent a538e29480
1 changed files with 157 additions and 150 deletions
--- a/aqt/editor.py
+++ b/aqt/editor.py
@ -1,11 +1,12 @@
 # -*- coding: utf-8 -*-
 # Copyright: Damien Elmes <anki@ichi2.net>
 # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
 from anki.lang import _
 from aqt.qt import *
 import re, os, urllib2, ctypes
 from anki.utils import stripHTML, isWin, isMac, namedtmp, json, stripHTMLMedia
-from anki.sound import play
+import anki.sound
 from anki.hooks import runHook, runFilter
 from aqt.sound import getAudio
 from aqt.webview import AnkiWebView
@ -16,12 +17,6 @@ import anki.js
 from BeautifulSoup import BeautifulSoup
 import urllib
 # fixme: when tab order returns to the webview, the previously focused field
 # is focused, which is not good when the user is tabbing through the dialog
 # fixme: set rtl in div css
 # fixme: commit from tag area causes error
 pics = ("jpg", "jpeg", "png", "tif", "tiff", "gif", "svg")
 audio =  ("wav", "mp3", "ogg", "flac", "mp4", "swf", "mov", "mpeg", "mkv")
@ -264,72 +259,6 @@ document.onclick = function (evt) {
 </body></html>
 """
 def _filterHTML(html):
    """Return `html` as unicode with implicit WebKit formatting removed.

    The first span loses "font-style: normal" / "font-weight: normal"
    entries from its style.  Spans and font tags with the Apple-style-span
    class are reduced to the colour/weight/style they carry, or unwrapped
    when they carry none.  Image tags keep only their src attribute, with
    file:// sources reduced to a basename, and html/head/body/meta wrappers
    are unwrapped.
    """
    doc = BeautifulSoup(html)
    # remove implicit regular font style from the first span
    if doc.span:
        try:
            attrs = doc.span['style'].split(";")
        except (KeyError, TypeError):
            attrs = []
        if attrs:
            new = []
            for attr in attrs:
                sattr = attr.strip()
                if sattr and sattr not in ("font-style: normal", "font-weight: normal"):
                    new.append(sattr)
            doc.span['style'] = ";".join(new)
    # filter out implicit formatting from webkit
    for tag in doc("span", "Apple-style-span"):
        preserve = ""
        # the class can be present without an inline style; treat that as
        # "nothing to preserve" instead of raising KeyError
        for item in (tag.get('style') or "").split(";"):
            try:
                k, v = item.split(":")
            except ValueError:
                continue
            if k.strip() == "color" and not v.strip() == "rgb(0, 0, 0)":
                preserve += "color:%s;" % v
            if k.strip() in ("font-weight", "font-style"):
                preserve += item + ";"
        if preserve:
            # preserve colour attribute, delete implicit class
            tag['style'] = preserve
            del tag['class']
        else:
            # strip completely
            tag.replaceWithChildren()
    for tag in doc("font", "Apple-style-span"):
        # strip all but colour attr from implicit font tags
        if 'color' in dict(tag.attrs):
            # tag.attrs is a list of (name, value) pairs; take a snapshot of
            # the names because deleting mutates that list.  This removes
            # the apple class as well.
            for name in [pair[0] for pair in tag.attrs]:
                if name != "color":
                    del tag[name]
        else:
            # remove completely
            tag.replaceWithChildren()
    # now images
    for tag in doc("img"):
        # turn file:/// links into relative ones
        try:
            if tag['src'].lower().startswith("file://"):
                tag['src'] = os.path.basename(tag['src'])
        except KeyError:
            # for some bizarre reason, mnemosyne removes src elements
            # from missing media
            pass
        # strip all other attributes, including implicit max-width; iterate
        # over a snapshot so no attribute is skipped while deleting
        for name in [pair[0] for pair in tag.attrs]:
            if name != "src":
                del tag[name]
    # strip superfluous elements
    for elem in "html", "head", "body", "meta":
        for tag in doc(elem):
            tag.replaceWithChildren()
    return unicode(doc)
 # caller is responsible for resetting note on reset
 class Editor(object):
    def __init__(self, mw, widget, parentWindow, addMode=False):
@ -551,7 +480,7 @@ class Editor(object):
    def mungeHTML(self, txt):
        """Return txt passed through self._filterHTML without localizing media.

        A value of exactly "<br>" is treated as an empty string first.
        """
        if txt == "<br>":
            txt = ""
        return self._filterHTML(txt, localize=False)
    # Setting/unsetting the current note
    ######################################################################
@ -824,24 +753,18 @@ to a cloze type first, via Edit>Change Note Type."""))
        self.web.eval("setFormat('inserthtml', %s);" % json.dumps(html))
    def _addMedia(self, path, canDelete=False):
        """Add to media folder and return local img or sound tag.

        The file at `path` is handed to self.mw.col.media.addFile().  When
        `canDelete` is true and the profile's 'deleteMedia' option is set,
        the original file is removed, unless it resolves to the same path
        as the name addFile() returned.
        """
        # copy to media folder
        fname = self.mw.col.media.addFile(path)
        # remove original?
        if canDelete and self.mw.pm.profile['deleteMedia']:
            if os.path.abspath(fname) != os.path.abspath(path):
                try:
                    os.unlink(path)
                except Exception:
                    # best effort: failing to delete the original must not
                    # stop the media from being inserted
                    pass
        # return a local html link
        return self.fnameToLink(fname)
    def onRecSound(self):
        try:
@ -853,6 +776,127 @@ to a cloze type first, via Edit>Change Note Type."""))
            return
        self.addMedia(file)
    # Media downloads
    ######################################################################
    def urlToLink(self, url):
        """Return a local link for url, or "" if no file was obtained.

        The url is passed to self.urlToFile(); a truthy result is turned
        into a link with self.fnameToLink().
        """
        local = self.urlToFile(url)
        return self.fnameToLink(local) if local else ""
    def fnameToLink(self, fname):
        """Return the field markup for a file name.

        Names whose extension is listed in `pics` become an <img> tag with
        the utf8-encoded name URL-quoted.  Anything else is played via
        anki.sound.play() and returned as a [sound:...] tag.
        """
        extension = fname.rsplit(".", 1)[-1].lower()
        if extension not in pics:
            anki.sound.play(fname)
            return '[sound:%s]' % fname
        quoted = urllib.quote(fname.encode("utf8"))
        return '<img src="%s">' % quoted
    def urlToFile(self, url):
        """Fetch url if it ends with a supported media suffix.

        Returns the result of self._retrieveURL(url) when the lower-cased
        url ends with one of the `pics` or `audio` suffixes, and None
        otherwise.
        """
        if url.lower().endswith(pics + audio):
            return self._retrieveURL(url)
        # not a supported type; nothing is downloaded
        return None
    def _retrieveURL(self, url):
        "Download file into media folder and return local filename or None."
        # urllib is picky with local file links
        if url.lower().startswith("file://"):
            url = url.replace("%", "%25")
            url = url.replace("#", "%23")
            # fetch it into a temporary folder
        self.mw.progress.start(
            immediate=True, parent=self.parentWindow)
        try:
            req = urllib2.Request(url, None, {
                'User-Agent': 'Mozilla/5.0 (compatible; Anki)'})
            filecontents = urllib2.urlopen(req).read()
        except urllib2.URLError, e:
            showWarning(_("An error occurred while opening %s") % e)
            return
        finally:
            self.mw.progress.finish()
        path = unicode(urllib2.unquote(url.encode("utf8")), "utf8")
        return self.mw.col.media.writeData(path, filecontents)
    # HTML filtering
    ######################################################################
    def _filterHTML(self, html, localize=False):
        """Return `html` as unicode with implicit WebKit formatting removed.

        The first span loses "font-style: normal" / "font-weight: normal"
        entries from its style.  Spans and font tags with the
        Apple-style-span class are reduced to the colour/weight/style they
        carry, or unwrapped when they carry none.  Image tags keep only
        their src attribute, and html/head/body/meta wrappers are unwrapped.

        If `localize` is true, image sources that are http://, https:// or
        file:// URLs are passed to self.urlToFile() and rewritten to the
        filename it returns.  file:// sources that were not localized are
        reduced to their basename.
        """
        doc = BeautifulSoup(html)
        # remove implicit regular font style from the first span
        if doc.span:
            try:
                attrs = doc.span['style'].split(";")
            except (KeyError, TypeError):
                attrs = []
            if attrs:
                new = []
                for attr in attrs:
                    sattr = attr.strip()
                    if sattr and sattr not in ("font-style: normal", "font-weight: normal"):
                        new.append(sattr)
                doc.span['style'] = ";".join(new)
        # filter out implicit formatting from webkit
        for tag in doc("span", "Apple-style-span"):
            preserve = ""
            # the class can be present without an inline style; treat that
            # as "nothing to preserve" instead of raising KeyError
            for item in (tag.get('style') or "").split(";"):
                try:
                    k, v = item.split(":")
                except ValueError:
                    continue
                if k.strip() == "color" and not v.strip() == "rgb(0, 0, 0)":
                    preserve += "color:%s;" % v
                if k.strip() in ("font-weight", "font-style"):
                    preserve += item + ";"
            if preserve:
                # preserve colour attribute, delete implicit class
                tag['style'] = preserve
                del tag['class']
            else:
                # strip completely
                tag.replaceWithChildren()
        for tag in doc("font", "Apple-style-span"):
            # strip all but colour attr from implicit font tags
            if 'color' in dict(tag.attrs):
                # tag.attrs is a list of (name, value) pairs; take a
                # snapshot of the names because deleting mutates that list.
                # This removes the apple class as well.
                for name in [pair[0] for pair in tag.attrs]:
                    if name != "color":
                        del tag[name]
            else:
                # remove completely
                tag.replaceWithChildren()
        # now images
        for tag in doc("img"):
            # for some bizarre reason, mnemosyne removes src elements
            # from missing media, so src may be absent
            src = tag.get('src')
            if src:
                lsrc = src.lower()
                fname = None
                # only hand real URLs to the downloader: relative names are
                # already local, and urllib2 raises ValueError (which the
                # downloader does not catch) for strings without a scheme
                if localize and lsrc.startswith(
                        ("http://", "https://", "file://")):
                    # convert remote image links to local ones
                    fname = self.urlToFile(src)
                if fname:
                    tag['src'] = fname
                elif lsrc.startswith("file://"):
                    # turn file:/// links into relative ones
                    tag['src'] = os.path.basename(src)
            # strip all other attributes, including implicit max-width;
            # iterate over a snapshot so no attribute is skipped
            for name in [pair[0] for pair in tag.attrs]:
                if name != "src":
                    del tag[name]
        # strip superfluous elements
        for elem in "html", "head", "body", "meta":
            for tag in doc(elem):
                tag.replaceWithChildren()
        return unicode(doc)
    # Advanced menu
    ######################################################################
@ -922,7 +966,6 @@ class EditorWebView(AnkiWebView):
    def __init__(self, parent, editor):
        """Create the web view used by `editor`.

        `parent` is accepted but not used in this method.  The warning
        template and the profile's 'stripHTML' setting are stored on the
        instance.
        """
        AnkiWebView.__init__(self)
        self.editor = editor
        # translated template for download warnings
        self.errtxt = _("An error occured while opening %s")
        # profile option read once at construction time
        profile = editor.mw.pm.profile
        self.strip = profile['stripHTML']
    def keyPressEvent(self, evt):
@ -981,7 +1024,7 @@ class EditorWebView(AnkiWebView):
        if evt.source():
            if oldmime.hasHtml():
                mime = QMimeData()
-                mime.setHtml(_filterHTML(oldmime.html()))
+                mime.setHtml(self.editor._filterHTML(oldmime.html()))
            else:
                # old qt on linux won't give us html when dragging an image;
                # in that case just do the default action (which is to ignore
@ -1001,12 +1044,6 @@ class EditorWebView(AnkiWebView):
    def prepareClip(self, mode=QClipboard.Clipboard):
        clip = self.editor.mw.app.clipboard()
        mime = clip.mimeData(mode=mode)
        if mime.hasHtml() and mime.html().startswith("<!--anki-->"):
            # pasting from another field, filter extraneous webkit formatting
            html = mime.html()[11:]
            html = _filterHTML(html)
            mime.setHtml(html)
            return
        self.saveClip(mode=mode)
        mime = self._processMime(mime)
        clip.setMimeData(mime, mode=mode)
@ -1037,17 +1074,14 @@ class EditorWebView(AnkiWebView):
        # print "html", mime.html()
        # print "urls", mime.urls()
        # print "text", mime.text()
-        if mime.hasUrls():
+        if mime.hasHtml():
-            return self._processUrls(mime)
+            return self._processHtml(mime)
-        elif mime.hasText() and (self.strip or not mime.hasHtml()):
+        elif mime.hasText():
            return self._processText(mime)
-        # we currently aren't able to extract images from html, so we prioritize
+        elif mime.hasUrls():
-        # images over html in cases where we have both. this is a hack until
+            return self._processUrls(mime)
        # issue 92 is implemented
        elif mime.hasImage():
            return self._processImage(mime)
        elif mime.hasHtml():
            return self._processHtml(mime)
        else:
            # nothing
            return QMimeData()
@ -1056,20 +1090,12 @@ class EditorWebView(AnkiWebView):
        url = mime.urls()[0].toString()
        # chrome likes to give us the URL twice with a \n
        url = url.splitlines()[0]
        link = self._localizedMediaLink(url)
        mime = QMimeData()
        link = self.editor.urlToLink(url)
        if link:
            mime.setHtml(link)
        return mime
    def _localizedMediaLink(self, url):
        """Return a localized link for url, or url itself if unsupported.

        When the lower-cased url ends with one of the `pics` or `audio`
        suffixes the result of self._retrieveURL(url) is returned.
        """
        if url.lower().endswith(pics + audio):
            return self._retrieveURL(url)
        # not a supported type; return link verbatim
        return url
    def _processText(self, mime):
        txt = unicode(mime.text())
        l = txt.lower()
@ -1077,12 +1103,7 @@ class EditorWebView(AnkiWebView):
        # if the user is pasting an image or sound link, convert it to local
        if l.startswith("http://") or l.startswith("https://") or l.startswith("file://"):
            txt = txt.split("\r\n")[0]
-            html = self._localizedMediaLink(txt)
+            html = self.editor.urlToLink(txt)
            if not html:
                return QMimeData()
            if html == txt:
                # wasn't of a supported media type; don't change
                html = None
        new = QMimeData()
        if html:
            new.setHtml(html)
@ -1092,13 +1113,28 @@ class EditorWebView(AnkiWebView):
    def _processHtml(self, mime):
        """Build the QMimeData to paste/drop from mime's HTML.

        With HTML stripping enabled, and unless the HTML starts with the
        "<!--anki-->" marker:
        - if nothing but markup remains after stripping, each media
          reference reported by col.media.filesInStr() is converted with
          self.editor.urlToLink() and the results are pasted as HTML;
        - otherwise plain text is used, via self._processText() when the
          mime data has text, else by stripping the HTML.
        Without stripping, the HTML is filtered with localize=True.
        """
        html = mime.html()
        newMime = QMimeData()
        if self.strip and not html.startswith("<!--anki-->"):
            # special case for google images: if after stripping there's no text
            # and there are image links, we'll paste those as html instead
            if not stripHTML(html).strip():
                mid = self.editor.note.mid
                links = [
                    self.editor.urlToLink(url)
                    for url in self.editor.mw.col.media.filesInStr(
                        mid, html, includeRemote=True)]
                newMime.setHtml("".join(links))
            else:
                # use .text() if available so newlines are preserved; otherwise strip
                if mime.hasText():
                    return self._processText(mime)
                else:
                    # there is no text in this branch, so strip the html
                    newMime.setText(stripHTML(html))
        else:
            # no stripping
            html = self.editor._filterHTML(html, localize=True)
            newMime.setHtml(html)
        return newMime
    def _processImage(self, mime):
        im = QImage(mime.imageData())
@ -1116,35 +1152,6 @@ class EditorWebView(AnkiWebView):
        mime.setHtml(self.editor._addMedia(uname+ext))
        return mime
    def _retrieveURL(self, url):
        # is it media?
        ext = url.split(".")[-1].lower()
        if ext not in pics and ext not in audio:
            return
        if url.lower().startswith("file://"):
            url = url.replace("%", "%25")
            url = url.replace("#", "%23")
        # fetch it into a temporary folder
        self.editor.mw.progress.start(
            immediate=True, parent=self.editor.parentWindow)
        try:
            req = urllib2.Request(url, None, {
                'User-Agent': 'Mozilla/5.0 (compatible; Anki)'})
            filecontents = urllib2.urlopen(req).read()
        except urllib2.URLError, e:
            showWarning(self.errtxt % e)
            return
        finally:
            self.editor.mw.progress.finish()
        path = unicode(urllib2.unquote(url.encode("utf8")), "utf8")
        for badChar in "#%\"":
            path = path.replace(badChar, "")
        path = namedtmp(os.path.basename(path))
        file = open(path, "wb")
        file.write(filecontents)
        file.close()
        return self.editor._addMedia(path)
    def _flagAnkiText(self):
        # add a comment in the clipboard html so we can tell text is copied
        # from us and doesn't need to be stripped