refactor media downloading on paste/drop

- use html as the first choice for incoming drops/pastes
- when filtering incoming html, automatically localize any remote image
  references
- add a special case for pasting/dropping from google images when html
  stripping is on
- move filtering code into editor
This commit is contained in:
Damien Elmes 2013-07-11 17:21:16 +09:00
parent a538e29480
commit e2d2b759a4

View file

@ -1,11 +1,12 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net> # Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
from anki.lang import _
from aqt.qt import * from aqt.qt import *
import re, os, urllib2, ctypes import re, os, urllib2, ctypes
from anki.utils import stripHTML, isWin, isMac, namedtmp, json, stripHTMLMedia from anki.utils import stripHTML, isWin, isMac, namedtmp, json, stripHTMLMedia
from anki.sound import play import anki.sound
from anki.hooks import runHook, runFilter from anki.hooks import runHook, runFilter
from aqt.sound import getAudio from aqt.sound import getAudio
from aqt.webview import AnkiWebView from aqt.webview import AnkiWebView
@ -16,12 +17,6 @@ import anki.js
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup
import urllib import urllib
# fixme: when tab order returns to the webview, the previously focused field
# is focused, which is not good when the user is tabbing through the dialog
# fixme: set rtl in div css
# fixme: commit from tag area causes error
pics = ("jpg", "jpeg", "png", "tif", "tiff", "gif", "svg") pics = ("jpg", "jpeg", "png", "tif", "tiff", "gif", "svg")
audio = ("wav", "mp3", "ogg", "flac", "mp4", "swf", "mov", "mpeg", "mkv") audio = ("wav", "mp3", "ogg", "flac", "mp4", "swf", "mov", "mpeg", "mkv")
@ -264,72 +259,6 @@ document.onclick = function (evt) {
</body></html> </body></html>
""" """
def _filterHTML(html):
    """Strip implicit webkit formatting from html and return it as unicode.

    - drops "font-style: normal" / "font-weight: normal" from the style of
      the first span in the document
    - for span.Apple-style-span, keeps only a non-black colour and any
      font-weight/font-style declarations; spans with nothing worth keeping
      are replaced by their children
    - for font.Apple-style-span, keeps only the color attribute; tags
      without one are replaced by their children
    - for img tags, turns file:// sources into bare filenames and removes
      every attribute except src
    - unwraps html/head/body/meta elements
    """
    doc = BeautifulSoup(html)
    # remove implicit regular font style from outermost element
    if doc.span:
        try:
            attrs = doc.span['style'].split(";")
        except (KeyError, TypeError):
            attrs = []
        if attrs:
            new = []
            for attr in attrs:
                sattr = attr.strip()
                if sattr and sattr not in ("font-style: normal", "font-weight: normal"):
                    new.append(sattr)
            doc.span['style'] = ";".join(new)
    # filter out implicit formatting from webkit
    for tag in doc("span", "Apple-style-span"):
        preserve = ""
        # the span may carry the class without any style attribute; treat
        # that as an empty style instead of raising KeyError
        for item in (tag.get('style') or "").split(";"):
            try:
                k, v = item.split(":")
            except ValueError:
                continue
            if k.strip() == "color" and not v.strip() == "rgb(0, 0, 0)":
                preserve += "color:%s;" % v
            if k.strip() in ("font-weight", "font-style"):
                preserve += item + ";"
        if preserve:
            # preserve colour attribute, delete implicit class
            tag['style'] = preserve
            del tag['class']
        else:
            # strip completely
            tag.replaceWithChildren()
    for tag in doc("font", "Apple-style-span"):
        # strip all but colour attr from implicit font tags
        if 'color' in dict(tag.attrs):
            # tag.attrs holds (name, value) pairs, and deleting an attribute
            # mutates it, so walk a copy and compare on the name only.
            # this also removes the apple class
            for name, val in list(tag.attrs):
                if name != "color":
                    del tag[name]
        else:
            # remove completely
            tag.replaceWithChildren()
    # now images
    for tag in doc("img"):
        # for some bizarre reason, mnemosyne removes src elements
        # from missing media, so src may be absent
        src = tag.get('src')
        # turn file:/// links into relative ones
        if src is not None and src.lower().startswith("file://"):
            tag['src'] = os.path.basename(src)
        # strip all other attributes, including implicit max-width
        # (iterate over a copy: deleting while iterating skips entries)
        for name, val in list(tag.attrs):
            if name != "src":
                del tag[name]
    # strip superfluous elements
    for elem in "html", "head", "body", "meta":
        for tag in doc(elem):
            tag.replaceWithChildren()
    html = unicode(doc)
    return html
# caller is responsible for resetting note on reset # caller is responsible for resetting note on reset
class Editor(object): class Editor(object):
def __init__(self, mw, widget, parentWindow, addMode=False): def __init__(self, mw, widget, parentWindow, addMode=False):
@ -551,7 +480,7 @@ class Editor(object):
def mungeHTML(self, txt): def mungeHTML(self, txt):
if txt == "<br>": if txt == "<br>":
txt = "" txt = ""
return _filterHTML(txt) return self._filterHTML(txt, localize=False)
# Setting/unsetting the current note # Setting/unsetting the current note
###################################################################### ######################################################################
@ -824,24 +753,18 @@ to a cloze type first, via Edit>Change Note Type."""))
self.web.eval("setFormat('inserthtml', %s);" % json.dumps(html)) self.web.eval("setFormat('inserthtml', %s);" % json.dumps(html))
def _addMedia(self, path, canDelete=False): def _addMedia(self, path, canDelete=False):
"Add to media folder and return basename." "Add to media folder and return local img or sound tag."
# copy to media folder # copy to media folder
name = self.mw.col.media.addFile(path) fname = self.mw.col.media.addFile(path)
# remove original? # remove original?
if canDelete and self.mw.pm.profile['deleteMedia']: if canDelete and self.mw.pm.profile['deleteMedia']:
if os.path.abspath(name) != os.path.abspath(path): if os.path.abspath(fname) != os.path.abspath(path):
try: try:
os.unlink(path) os.unlink(path)
except: except:
pass pass
# return a local html link # return a local html link
ext = name.split(".")[-1].lower() return self.fnameToLink(fname)
if ext in pics:
name = urllib.quote(name.encode("utf8"))
return '<img src="%s">' % name
else:
anki.sound.play(name)
return '[sound:%s]' % name
def onRecSound(self): def onRecSound(self):
try: try:
@ -853,6 +776,127 @@ to a cloze type first, via Edit>Change Note Type."""))
return return
self.addMedia(file) self.addMedia(file)
# Media downloads
######################################################################
def urlToLink(self, url):
    "Localize url via urlToFile and return its html/sound link, or '' if no file was obtained."
    local = self.urlToFile(url)
    return self.fnameToLink(local) if local else ""
def fnameToLink(self, fname):
    "Return an img tag for picture files; otherwise play the file and return a [sound:] tag."
    suffix = fname.rsplit(".", 1)[-1].lower()
    if suffix not in pics:
        # anything that is not a known picture extension is handled as sound
        anki.sound.play(fname)
        return '[sound:%s]' % fname
    # pictures are referenced by their url-quoted utf8 filename
    quoted = urllib.quote(fname.encode("utf8"))
    return '<img src="%s">' % quoted
def urlToFile(self, url):
    "Download url if it ends with a known picture/audio suffix; return _retrieveURL's result, else None."
    lowered = url.lower()
    if any(lowered.endswith(suffix) for suffix in pics + audio):
        return self._retrieveURL(url)
    # not a supported type; nothing is downloaded
    return None
def _retrieveURL(self, url):
    "Download file into media folder and return local filename or None."
    if url.lower().startswith("file://"):
        # urllib is picky with local file links, so escape % first and then #
        for raw, escaped in (("%", "%25"), ("#", "%23")):
            url = url.replace(raw, escaped)
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; Anki)'}
    # keep a progress window up for the duration of the download
    self.mw.progress.start(immediate=True, parent=self.parentWindow)
    try:
        request = urllib2.Request(url, None, headers)
        data = urllib2.urlopen(request).read()
    except urllib2.URLError as err:
        showWarning(_("An error occurred while opening %s") % err)
        return None
    finally:
        self.mw.progress.finish()
    # hand the unquoted url and the downloaded bytes to the media manager
    name = unicode(urllib2.unquote(url.encode("utf8")), "utf8")
    return self.mw.col.media.writeData(name, data)
# HTML filtering
######################################################################
def _filterHTML(self, html, localize=False):
    """Strip implicit webkit formatting from html and return it as unicode.

    - drops "font-style: normal" / "font-weight: normal" from the style of
      the first span in the document
    - for span.Apple-style-span, keeps only a non-black colour and any
      font-weight/font-style declarations; spans with nothing worth keeping
      are replaced by their children
    - for font.Apple-style-span, keeps only the color attribute; tags
      without one are replaced by their children
    - for img tags, turns file:// sources into bare filenames and removes
      every attribute except src
    - unwraps html/head/body/meta elements

    If localize is true, each img src is also passed to self.urlToFile()
    and replaced with the returned filename when one is returned.
    """
    doc = BeautifulSoup(html)
    # remove implicit regular font style from outermost element
    if doc.span:
        try:
            attrs = doc.span['style'].split(";")
        except (KeyError, TypeError):
            attrs = []
        if attrs:
            new = []
            for attr in attrs:
                sattr = attr.strip()
                if sattr and sattr not in ("font-style: normal", "font-weight: normal"):
                    new.append(sattr)
            doc.span['style'] = ";".join(new)
    # filter out implicit formatting from webkit
    for tag in doc("span", "Apple-style-span"):
        preserve = ""
        # the span may carry the class without any style attribute; treat
        # that as an empty style instead of raising KeyError
        for item in (tag.get('style') or "").split(";"):
            try:
                k, v = item.split(":")
            except ValueError:
                continue
            if k.strip() == "color" and not v.strip() == "rgb(0, 0, 0)":
                preserve += "color:%s;" % v
            if k.strip() in ("font-weight", "font-style"):
                preserve += item + ";"
        if preserve:
            # preserve colour attribute, delete implicit class
            tag['style'] = preserve
            del tag['class']
        else:
            # strip completely
            tag.replaceWithChildren()
    for tag in doc("font", "Apple-style-span"):
        # strip all but colour attr from implicit font tags
        if 'color' in dict(tag.attrs):
            # tag.attrs holds (name, value) pairs, and deleting an attribute
            # mutates it, so walk a copy and compare on the name only.
            # this also removes the apple class
            for name, val in list(tag.attrs):
                if name != "color":
                    del tag[name]
        else:
            # remove completely
            tag.replaceWithChildren()
    # now images
    for tag in doc("img"):
        # for some bizarre reason, mnemosyne removes src elements
        # from missing media, so src may be absent
        src = tag.get('src')
        if src is not None:
            # turn file:/// links into relative ones
            if src.lower().startswith("file://"):
                src = os.path.basename(src)
                tag['src'] = src
            if localize:
                # convert remote image links to local ones
                fname = self.urlToFile(src)
                if fname:
                    tag['src'] = fname
        # strip all other attributes, including implicit max-width
        # (iterate over a copy: deleting while iterating skips entries)
        for name, val in list(tag.attrs):
            if name != "src":
                del tag[name]
    # strip superfluous elements
    for elem in "html", "head", "body", "meta":
        for tag in doc(elem):
            tag.replaceWithChildren()
    html = unicode(doc)
    return html
# Advanced menu # Advanced menu
###################################################################### ######################################################################
@ -922,7 +966,6 @@ class EditorWebView(AnkiWebView):
def __init__(self, parent, editor): def __init__(self, parent, editor):
AnkiWebView.__init__(self) AnkiWebView.__init__(self)
self.editor = editor self.editor = editor
self.errtxt = _("An error occured while opening %s")
self.strip = self.editor.mw.pm.profile['stripHTML'] self.strip = self.editor.mw.pm.profile['stripHTML']
def keyPressEvent(self, evt): def keyPressEvent(self, evt):
@ -981,7 +1024,7 @@ class EditorWebView(AnkiWebView):
if evt.source(): if evt.source():
if oldmime.hasHtml(): if oldmime.hasHtml():
mime = QMimeData() mime = QMimeData()
mime.setHtml(_filterHTML(oldmime.html())) mime.setHtml(self.editor._filterHTML(oldmime.html()))
else: else:
# old qt on linux won't give us html when dragging an image; # old qt on linux won't give us html when dragging an image;
# in that case just do the default action (which is to ignore # in that case just do the default action (which is to ignore
@ -1001,12 +1044,6 @@ class EditorWebView(AnkiWebView):
def prepareClip(self, mode=QClipboard.Clipboard): def prepareClip(self, mode=QClipboard.Clipboard):
clip = self.editor.mw.app.clipboard() clip = self.editor.mw.app.clipboard()
mime = clip.mimeData(mode=mode) mime = clip.mimeData(mode=mode)
if mime.hasHtml() and mime.html().startswith("<!--anki-->"):
# pasting from another field, filter extraneous webkit formatting
html = mime.html()[11:]
html = _filterHTML(html)
mime.setHtml(html)
return
self.saveClip(mode=mode) self.saveClip(mode=mode)
mime = self._processMime(mime) mime = self._processMime(mime)
clip.setMimeData(mime, mode=mode) clip.setMimeData(mime, mode=mode)
@ -1037,17 +1074,14 @@ class EditorWebView(AnkiWebView):
# print "html", mime.html() # print "html", mime.html()
# print "urls", mime.urls() # print "urls", mime.urls()
# print "text", mime.text() # print "text", mime.text()
if mime.hasUrls(): if mime.hasHtml():
return self._processUrls(mime) return self._processHtml(mime)
elif mime.hasText() and (self.strip or not mime.hasHtml()): elif mime.hasText():
return self._processText(mime) return self._processText(mime)
# we currently aren't able to extract images from html, so we prioritize elif mime.hasUrls():
# images over html in cases where we have both. this is a hack until return self._processUrls(mime)
# issue 92 is implemented
elif mime.hasImage(): elif mime.hasImage():
return self._processImage(mime) return self._processImage(mime)
elif mime.hasHtml():
return self._processHtml(mime)
else: else:
# nothing # nothing
return QMimeData() return QMimeData()
@ -1056,20 +1090,12 @@ class EditorWebView(AnkiWebView):
url = mime.urls()[0].toString() url = mime.urls()[0].toString()
# chrome likes to give us the URL twice with a \n # chrome likes to give us the URL twice with a \n
url = url.splitlines()[0] url = url.splitlines()[0]
link = self._localizedMediaLink(url)
mime = QMimeData() mime = QMimeData()
link = self.editor.urlToLink(url)
if link: if link:
mime.setHtml(link) mime.setHtml(link)
return mime return mime
def _localizedMediaLink(self, url):
    "Return _retrieveURL's result for urls ending in a picture/audio suffix; any other url unchanged."
    lowered = url.lower()
    if any(lowered.endswith(suffix) for suffix in pics + audio):
        return self._retrieveURL(url)
    # not a supported type; return link verbatim
    return url
def _processText(self, mime): def _processText(self, mime):
txt = unicode(mime.text()) txt = unicode(mime.text())
l = txt.lower() l = txt.lower()
@ -1077,12 +1103,7 @@ class EditorWebView(AnkiWebView):
# if the user is pasting an image or sound link, convert it to local # if the user is pasting an image or sound link, convert it to local
if l.startswith("http://") or l.startswith("https://") or l.startswith("file://"): if l.startswith("http://") or l.startswith("https://") or l.startswith("file://"):
txt = txt.split("\r\n")[0] txt = txt.split("\r\n")[0]
html = self._localizedMediaLink(txt) html = self.editor.urlToLink(txt)
if not html:
return QMimeData()
if html == txt:
# wasn't of a supported media type; don't change
html = None
new = QMimeData() new = QMimeData()
if html: if html:
new.setHtml(html) new.setHtml(html)
@ -1092,13 +1113,28 @@ class EditorWebView(AnkiWebView):
def _processHtml(self, mime): def _processHtml(self, mime):
html = mime.html() html = mime.html()
if self.strip: newMime = QMimeData()
html = stripHTML(html) if self.strip and not html.startswith("<!--anki-->"):
# special case for google images: if after stripping there's no text
# and there are image links, we'll paste those as html instead
if not stripHTML(html).strip():
newHtml = ""
mid = self.editor.note.mid
for url in self.editor.mw.col.media.filesInStr(
mid, html, includeRemote=True):
newHtml += self.editor.urlToLink(url)
newMime.setHtml(newHtml)
else: else:
html = _filterHTML(html) # use .text() if available so newlines are preserved; otherwise strip
mime = QMimeData() if mime.hasText():
mime.setHtml(html) return self._processText(mime)
return mime else:
newMime.setText(stripHTML(mime.text()))
else:
# no stripping
html = self.editor._filterHTML(html, localize=True)
newMime.setHtml(html)
return newMime
def _processImage(self, mime): def _processImage(self, mime):
im = QImage(mime.imageData()) im = QImage(mime.imageData())
@ -1116,35 +1152,6 @@ class EditorWebView(AnkiWebView):
mime.setHtml(self.editor._addMedia(uname+ext)) mime.setHtml(self.editor._addMedia(uname+ext))
return mime return mime
def _retrieveURL(self, url):
    "Download a picture/audio url to a temp file and return self.editor._addMedia()'s link for it, or None."
    # is it media?
    ext = url.split(".")[-1].lower()
    if ext not in pics and ext not in audio:
        return
    if url.lower().startswith("file://"):
        # escape characters urllib would otherwise interpret in local paths
        url = url.replace("%", "%25")
        url = url.replace("#", "%23")
    # fetch it into a temporary folder
    self.editor.mw.progress.start(
        immediate=True, parent=self.editor.parentWindow)
    try:
        req = urllib2.Request(url, None, {
            'User-Agent': 'Mozilla/5.0 (compatible; Anki)'})
        filecontents = urllib2.urlopen(req).read()
    except urllib2.URLError as e:
        showWarning(self.errtxt % e)
        return
    finally:
        self.editor.mw.progress.finish()
    path = unicode(urllib2.unquote(url.encode("utf8")), "utf8")
    # drop characters that are unwanted in the temporary filename
    for badChar in "#%\"":
        path = path.replace(badChar, "")
    path = namedtmp(os.path.basename(path))
    # the context manager closes the handle even if the write fails
    with open(path, "wb") as out:
        out.write(filecontents)
    return self.editor._addMedia(path)
def _flagAnkiText(self): def _flagAnkiText(self):
# add a comment in the clipboard html so we can tell text is copied # add a comment in the clipboard html so we can tell text is copied
# from us and doesn't need to be stripped # from us and doesn't need to be stripped