refactor media downloading on paste/drop

- use html as the first choice for incoming drops/pastes
- when filtering incoming html, automatically localize any remote image
  references
- add a special case for pasting/dropping from google images when html
  stripping is on
- move filtering code into editor
This commit is contained in:
Damien Elmes 2013-07-11 17:21:16 +09:00
parent a538e29480
commit e2d2b759a4

View file

@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
from anki.lang import _
from aqt.qt import *
import re, os, urllib2, ctypes
from anki.utils import stripHTML, isWin, isMac, namedtmp, json, stripHTMLMedia
from anki.sound import play
import anki.sound
from anki.hooks import runHook, runFilter
from aqt.sound import getAudio
from aqt.webview import AnkiWebView
@ -16,12 +17,6 @@ import anki.js
from BeautifulSoup import BeautifulSoup
import urllib
# fixme: when tab order returns to the webview, the previously focused field
# is focused, which is not good when the user is tabbing through the dialog
# fixme: set rtl in div css
# fixme: commit from tag area causes error
pics = ("jpg", "jpeg", "png", "tif", "tiff", "gif", "svg")
audio = ("wav", "mp3", "ogg", "flac", "mp4", "swf", "mov", "mpeg", "mkv")
@ -264,72 +259,6 @@ document.onclick = function (evt) {
</body></html>
"""
# Clean up an HTML fragment and return it as a unicode string.
# The markup is parsed with BeautifulSoup, then processed in this order:
#   1. "font-style: normal" / "font-weight: normal" entries are dropped from
#      the style of doc.span
#   2. span/font tags carrying the "Apple-style-span" class are either trimmed
#      down to their colour/weight/style information or replaced by their
#      children
#   3. <img> tags get file:// sources reduced to a basename and their other
#      attributes deleted
#   4. html/head/body/meta elements are replaced by their children
def _filterHTML(html):
doc = BeautifulSoup(html)
# remove implicit regular font style from outermost element
if doc.span:
try:
attrs = doc.span['style'].split(";")
except (KeyError, TypeError):
attrs = []
if attrs:
new = []
for attr in attrs:
sattr = attr.strip()
if sattr and sattr not in ("font-style: normal", "font-weight: normal"):
new.append(sattr)
doc.span['style'] = ";".join(new)
# filter out implicit formatting from webkit
for tag in doc("span", "Apple-style-span"):
# css declarations worth keeping, rebuilt as a new style string
preserve = ""
# NOTE(review): presumably raises KeyError when the span has no style
# attribute; nothing here catches it -- confirm
for item in tag['style'].split(";"):
try:
k, v = item.split(":")
except ValueError:
# not exactly one "key:value" pair (empty chunk, or extra colons)
continue
# keep any colour other than plain black
if k.strip() == "color" and not v.strip() == "rgb(0, 0, 0)":
preserve += "color:%s;" % v
if k.strip() in ("font-weight", "font-style"):
preserve += item + ";"
if preserve:
# preserve colour attribute, delete implicit class
tag['style'] = preserve
del tag['class']
else:
# strip completely
tag.replaceWithChildren()
for tag in doc("font", "Apple-style-span"):
# strip all but colour attr from implicit font tags
if 'color' in dict(tag.attrs):
# NOTE(review): tag.attrs is unpacked as (attr, val) pairs in the img loop
# below, so `attr` here is presumably a pair and never equal to "color";
# verify whether this loop actually removes anything
for attr in tag.attrs:
if attr != "color":
del tag[attr]
# and apple class
del tag['class']
else:
# remove completely
tag.replaceWithChildren()
# now images
for tag in doc("img"):
# turn file:/// links into relative ones
try:
if tag['src'].lower().startswith("file://"):
tag['src'] = os.path.basename(tag['src'])
except KeyError:
# for some bizarre reason, mnemosyne removes src elements
# from missing media
pass
# strip all other attributes, including implicit max-width
# NOTE(review): this deletes from the tag while iterating tag.attrs;
# verify that no attribute gets skipped as a result
for attr, val in tag.attrs:
if attr != "src":
del tag[attr]
# strip superfluous elements
for elem in "html", "head", "body", "meta":
for tag in doc(elem):
tag.replaceWithChildren()
# serialize the modified tree back to a unicode string
html = unicode(doc)
return html
# caller is responsible for resetting note on reset
class Editor(object):
def __init__(self, mw, widget, parentWindow, addMode=False):
@ -551,7 +480,7 @@ class Editor(object):
def mungeHTML(self, txt):
if txt == "<br>":
txt = ""
return _filterHTML(txt)
return self._filterHTML(txt, localize=False)
# Setting/unsetting the current note
######################################################################
@ -824,24 +753,18 @@ to a cloze type first, via Edit>Change Note Type."""))
self.web.eval("setFormat('inserthtml', %s);" % json.dumps(html))
def _addMedia(self, path, canDelete=False):
"Add to media folder and return basename."
"Add to media folder and return local img or sound tag."
# copy to media folder
name = self.mw.col.media.addFile(path)
fname = self.mw.col.media.addFile(path)
# remove original?
if canDelete and self.mw.pm.profile['deleteMedia']:
if os.path.abspath(name) != os.path.abspath(path):
if os.path.abspath(fname) != os.path.abspath(path):
try:
os.unlink(path)
except:
pass
# return a local html link
ext = name.split(".")[-1].lower()
if ext in pics:
name = urllib.quote(name.encode("utf8"))
return '<img src="%s">' % name
else:
anki.sound.play(name)
return '[sound:%s]' % name
return self.fnameToLink(fname)
def onRecSound(self):
try:
@ -853,6 +776,127 @@ to a cloze type first, via Edit>Change Note Type."""))
return
self.addMedia(file)
# Media downloads
######################################################################
def urlToLink(self, url):
    "Pass url through urlToFile() and return fnameToLink() of the result, or '' if there is none."
    fname = self.urlToFile(url)
    if fname:
        return self.fnameToLink(fname)
    # urlToFile() gave us nothing to link to
    return ""
def fnameToLink(self, fname):
    "Return an <img> tag if fname has an extension in pics; otherwise play it and return a [sound:] tag."
    ext = fname.split(".")[-1].lower()
    if ext not in pics:
        # anything that isn't a picture is handed to the sound player
        anki.sound.play(fname)
        return '[sound:%s]' % fname
    # the filename is url-quoted for use in the src attribute
    return '<img src="%s">' % urllib.quote(fname.encode("utf8"))
def urlToFile(self, url):
    """Retrieve url via _retrieveURL() if it ends in a supported extension.

    The url is compared case-insensitively against the extensions listed
    in pics and audio. Returns whatever _retrieveURL() returns for a
    match, and None for any other url.
    """
    # require the dot, so that a url which merely ends in the letters of an
    # extension (".../notajpg") is not mistaken for media
    suffixes = tuple("." + ext for ext in pics + audio)
    if url.lower().endswith(suffixes):
        return self._retrieveURL(url)
    # not a supported type
    return None
def _retrieveURL(self, url):
"Download file into media folder and return local filename or None."
# urllib is picky with local file links
if url.lower().startswith("file://"):
url = url.replace("%", "%25")
url = url.replace("#", "%23")
# fetch it into a temporary folder
self.mw.progress.start(
immediate=True, parent=self.parentWindow)
try:
req = urllib2.Request(url, None, {
'User-Agent': 'Mozilla/5.0 (compatible; Anki)'})
filecontents = urllib2.urlopen(req).read()
except urllib2.URLError, e:
showWarning(_("An error occurred while opening %s") % e)
return
finally:
self.mw.progress.finish()
path = unicode(urllib2.unquote(url.encode("utf8")), "utf8")
return self.mw.col.media.writeData(path, filecontents)
# HTML filtering
######################################################################
def _filterHTML(self, html, localize=False):
    """Clean up an HTML fragment and return it as a unicode string.

    html is parsed with BeautifulSoup and then:
      - implicit "normal" font styles are dropped from doc.span
      - span/font tags with the Apple-style-span class are reduced to
        their colour/weight/style information, or replaced by their
        children when they carry none
      - <img> tags have file:// sources turned into a bare filename, and
        all attributes other than src removed
      - html/head/body/meta wrappers are replaced by their children

    If localize is true, http/https/ftp image sources are passed to
    self.urlToFile(), and src is rewritten when it returns a filename.
    """
    doc = BeautifulSoup(html)
    # remove implicit regular font style from outermost element
    if doc.span:
        try:
            attrs = doc.span['style'].split(";")
        except (KeyError, TypeError):
            attrs = []
        if attrs:
            new = []
            for attr in attrs:
                sattr = attr.strip()
                if sattr and sattr not in ("font-style: normal", "font-weight: normal"):
                    new.append(sattr)
            doc.span['style'] = ";".join(new)
    # filter out implicit formatting from webkit
    for tag in doc("span", "Apple-style-span"):
        preserve = ""
        try:
            items = tag['style'].split(";")
        except (KeyError, TypeError):
            # no style attribute, so there is nothing to preserve
            items = []
        for item in items:
            try:
                k, v = item.split(":")
            except ValueError:
                continue
            if k.strip() == "color" and not v.strip() == "rgb(0, 0, 0)":
                preserve += "color:%s;" % v
            if k.strip() in ("font-weight", "font-style"):
                preserve += item + ";"
        if preserve:
            # preserve colour attribute, delete implicit class
            tag['style'] = preserve
            del tag['class']
        else:
            # strip completely
            tag.replaceWithChildren()
    for tag in doc("font", "Apple-style-span"):
        # tag.attrs holds (name, value) pairs; take a snapshot of the names
        # so that deleting attributes doesn't disturb the iteration
        names = [name for name, val in tag.attrs]
        if 'color' in names:
            # strip all but colour attr (including the apple class)
            for name in names:
                if name != "color":
                    del tag[name]
        else:
            # remove completely
            tag.replaceWithChildren()
    # now images
    remote = ("http://", "https://", "ftp://")
    for tag in doc("img"):
        try:
            src = tag['src']
            if src.lower().startswith("file://"):
                # turn file:/// links into relative ones
                tag['src'] = os.path.basename(src)
            elif localize and src.lower().startswith(remote):
                # convert remote image links to local ones; relative
                # names can't be downloaded, so they are left alone
                fname = self.urlToFile(src)
                if fname:
                    tag['src'] = fname
        except KeyError:
            # for some bizarre reason, mnemosyne removes src elements
            # from missing media
            pass
        # strip all other attributes, including implicit max-width
        for name in [name for name, val in tag.attrs]:
            if name != "src":
                del tag[name]
    # strip superfluous elements
    for elem in "html", "head", "body", "meta":
        for tag in doc(elem):
            tag.replaceWithChildren()
    html = unicode(doc)
    return html
# Advanced menu
######################################################################
@ -922,7 +966,6 @@ class EditorWebView(AnkiWebView):
def __init__(self, parent, editor):
AnkiWebView.__init__(self)
self.editor = editor
self.errtxt = _("An error occured while opening %s")
self.strip = self.editor.mw.pm.profile['stripHTML']
def keyPressEvent(self, evt):
@ -981,7 +1024,7 @@ class EditorWebView(AnkiWebView):
if evt.source():
if oldmime.hasHtml():
mime = QMimeData()
mime.setHtml(_filterHTML(oldmime.html()))
mime.setHtml(self.editor._filterHTML(oldmime.html()))
else:
# old qt on linux won't give us html when dragging an image;
# in that case just do the default action (which is to ignore
@ -1001,12 +1044,6 @@ class EditorWebView(AnkiWebView):
def prepareClip(self, mode=QClipboard.Clipboard):
clip = self.editor.mw.app.clipboard()
mime = clip.mimeData(mode=mode)
if mime.hasHtml() and mime.html().startswith("<!--anki-->"):
# pasting from another field, filter extraneous webkit formatting
html = mime.html()[11:]
html = _filterHTML(html)
mime.setHtml(html)
return
self.saveClip(mode=mode)
mime = self._processMime(mime)
clip.setMimeData(mime, mode=mode)
@ -1037,17 +1074,14 @@ class EditorWebView(AnkiWebView):
# print "html", mime.html()
# print "urls", mime.urls()
# print "text", mime.text()
if mime.hasUrls():
return self._processUrls(mime)
elif mime.hasText() and (self.strip or not mime.hasHtml()):
if mime.hasHtml():
return self._processHtml(mime)
elif mime.hasText():
return self._processText(mime)
# we currently aren't able to extract images from html, so we prioritize
# images over html in cases where we have both. this is a hack until
# issue 92 is implemented
elif mime.hasUrls():
return self._processUrls(mime)
elif mime.hasImage():
return self._processImage(mime)
elif mime.hasHtml():
return self._processHtml(mime)
else:
# nothing
return QMimeData()
@ -1056,20 +1090,12 @@ class EditorWebView(AnkiWebView):
url = mime.urls()[0].toString()
# chrome likes to give us the URL twice with a \n
url = url.splitlines()[0]
link = self._localizedMediaLink(url)
mime = QMimeData()
link = self.editor.urlToLink(url)
if link:
mime.setHtml(link)
return mime
def _localizedMediaLink(self, url):
l = url.lower()
for suffix in pics+audio:
if l.endswith(suffix):
return self._retrieveURL(url)
# not a supported type; return link verbatim
return url
def _processText(self, mime):
txt = unicode(mime.text())
l = txt.lower()
@ -1077,12 +1103,7 @@ class EditorWebView(AnkiWebView):
# if the user is pasting an image or sound link, convert it to local
if l.startswith("http://") or l.startswith("https://") or l.startswith("file://"):
txt = txt.split("\r\n")[0]
html = self._localizedMediaLink(txt)
if not html:
return QMimeData()
if html == txt:
# wasn't of a supported media type; don't change
html = None
html = self.editor.urlToLink(txt)
new = QMimeData()
if html:
new.setHtml(html)
@ -1092,13 +1113,28 @@ class EditorWebView(AnkiWebView):
def _processHtml(self, mime):
html = mime.html()
if self.strip:
html = stripHTML(html)
newMime = QMimeData()
if self.strip and not html.startswith("<!--anki-->"):
# special case for google images: if after stripping there's no text
# and there are image links, we'll paste those as html instead
if not stripHTML(html).strip():
newHtml = ""
mid = self.editor.note.mid
for url in self.editor.mw.col.media.filesInStr(
mid, html, includeRemote=True):
newHtml += self.editor.urlToLink(url)
newMime.setHtml(newHtml)
else:
# use .text() if available so newlines are preserved; otherwise strip
if mime.hasText():
return self._processText(mime)
else:
newMime.setText(stripHTML(mime.text()))
else:
html = _filterHTML(html)
mime = QMimeData()
mime.setHtml(html)
return mime
# no stripping
html = self.editor._filterHTML(html, localize=True)
newMime.setHtml(html)
return newMime
def _processImage(self, mime):
im = QImage(mime.imageData())
@ -1116,35 +1152,6 @@ class EditorWebView(AnkiWebView):
mime.setHtml(self.editor._addMedia(uname+ext))
return mime
def _retrieveURL(self, url):
# is it media?
ext = url.split(".")[-1].lower()
if ext not in pics and ext not in audio:
return
if url.lower().startswith("file://"):
url = url.replace("%", "%25")
url = url.replace("#", "%23")
# fetch it into a temporary folder
self.editor.mw.progress.start(
immediate=True, parent=self.editor.parentWindow)
try:
req = urllib2.Request(url, None, {
'User-Agent': 'Mozilla/5.0 (compatible; Anki)'})
filecontents = urllib2.urlopen(req).read()
except urllib2.URLError, e:
showWarning(self.errtxt % e)
return
finally:
self.editor.mw.progress.finish()
path = unicode(urllib2.unquote(url.encode("utf8")), "utf8")
for badChar in "#%\"":
path = path.replace(badChar, "")
path = namedtmp(os.path.basename(path))
file = open(path, "wb")
file.write(filecontents)
file.close()
return self.editor._addMedia(path)
def _flagAnkiText(self):
# add a comment in the clipboard html so we can tell text is copied
# from us and doesn't need to be stripped