Use an HTML parser for text export instead of a fragile regex

2026-01-01 00:02:57 -05:00 · 2010-01-23 08:13:43 +09:00 · 2010-01-23 08:13:43 +09:00 · b4f1d6622a
commit b4f1d6622a
parent 28ff71a031
1 changed file with 12 additions and 5 deletions
--- a/anki/exporting.py
+++ b/anki/exporting.py
@ -17,6 +17,7 @@ from anki.lang import _
 from anki.utils import findTag, parseTags, stripHTML, ids2str
 from anki.tags import tagIds
 from anki.db import *
+from BeautifulSoup import BeautifulSoup as BS

 class Exporter(object):
    def __init__(self, deck):
@ -29,11 +30,16 @@ class Exporter(object):
        self.doExport(file)
        file.close()

-    def escapeText(self, text):
+    def escapeText(self, text, removeFields=False):
        "Escape newlines and tabs, and strip Anki HTML."
        text = text.replace("\n", "<br>")
        text = text.replace("\t", " " * 8)
-        text = re.sub('<span class="fm.*?">(.*?)</span>', '\\1', text)
+        if removeFields:
+            s = BS(text)
+            all = s('span', {'class': re.compile("fm.*")})
+            for e in all:
+                e.replaceWith("".join([unicode(x) for x in e.contents]))
+            text = unicode(s)
        return text

    def cardIds(self):
@ -180,8 +186,9 @@ select cards.id, cards.tags || "," || facts.tags from cards, facts
 where cards.factId = facts.id
 and cards.id in %s
 order by cards.created""" % strids))
-        out = u"\n".join(["%s\t%s%s" % (self.escapeText(c[0]),
-                                        self.escapeText(c[1]),
+        out = u"\n".join(["%s\t%s%s" % (
+            self.escapeText(c[0], removeFields=True),
+            self.escapeText(c[1], removeFields=True),
            self.tags(c[2]))
                          for c in cards])
        if out: