more media regexp fixes

- split the quoted and unquoted image cases into separate regexps, since we
  can't include a group backreference inside a character set
- disallow spaces in the filename in the non-quoted case
- this should fix matching on images with other attributes again
This commit is contained in:
Damien Elmes 2013-05-22 09:45:58 +09:00
parent 35764757eb
commit 918694a096
2 changed files with 16 additions and 6 deletions

View file

@ -12,10 +12,14 @@ from anki.latex import mungeQA
class MediaManager(object):
# other code depends on this order, so don't reorder
regexps = ("(?i)(\[sound:(?P<fname>[^]]+)\])",
"(?i)(<img[^>]+src=(?P<str>[\"']?)"+
"(?P<fname>[^>]+)(?P=str)[^>]*>)")
soundRegexps = ["(?i)(\[sound:(?P<fname>[^]]+)\])"]
imgRegexps = [
# src element quoted case
"(?i)(<img[^>]+src=(?P<str>[\"'])(?P<fname>[^>]+?)(?P=str)[^>]*>)",
# unquoted case
"(?i)(<img[^>]+src=(?!['\"])(?P<fname>[^ >]+)[^>]*?>)",
]
regexps = soundRegexps + imgRegexps
def __init__(self, col, server):
self.col = col
@ -175,7 +179,9 @@ If the same name exists, compare checksums."""
return tag
return tag.replace(
fname, urllib.quote(fname.encode("utf-8")))
return re.sub(self.regexps[1], repl, string)
for reg in self.imgRegexps:
string = re.sub(reg, repl, string)
return string
# Rebuilding DB
##########################################################################

View file

@ -23,7 +23,11 @@ def test_strings():
mid = d.models.models.keys()[0]
assert mf(mid, "aoeu") == []
assert mf(mid, "aoeu<img src='foo.jpg'>ao") == ["foo.jpg"]
assert mf(mid, "aoeu<img src=foo bar.jpg>ao") == ["foo bar.jpg"]
assert mf(mid, "aoeu<img src='foo.jpg' style='test'>ao") == ["foo.jpg"]
assert mf(mid, "aoeu<img src='foo.jpg'><img src=\"bar.jpg\">ao") == [
"foo.jpg", "bar.jpg"]
assert mf(mid, "aoeu<img src=foo.jpg style=bar>ao") == ["foo.jpg"]
assert mf(mid, "<img src=one><img src=two>") == ["one", "two"]
assert mf(mid, "aoeu<img src=\"foo.jpg\">ao") == ["foo.jpg"]
assert mf(mid, "aoeu<img src=\"foo.jpg\"><img class=yo src=fo>ao") == [
"foo.jpg", "fo"]