From 331781cf453d0e6511e2bf340829eabe1ab6b56f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= <agentydragon@gmail.com>
Date: Sun, 22 Dec 2019 04:28:29 +0100
Subject: [PATCH 1/5] Document newly found bug in _removeFormattingFromMathjax

Also adds some comments I wrote to help me understand what's going
on in the code.

I hope to fix this bug myself, but I think it might be beyond what
you can do with Python regexes and might require writing a proper
parser.

So, as step 1, I'm adding in a couple comments explaining that the
bug exists and how to reproduce it.
---
 anki/template/template.py | 25 ++++++++++++++++++++++++-
 tests/test_models.py      | 13 +++++++++++++
 tests/test_template.py    | 16 ++++++++++++++++
 3 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_template.py
diff --git a/anki/template/template.py b/anki/template/template.py
index e5f780df6..baa2c34dd 100644
--- a/anki/template/template.py
+++ b/anki/template/template.py
@@ -4,6 +4,9 @@ from typing import Any, Callable, Dict, Pattern
 from anki.hooks import runFilter
 from anki.utils import stripHTML, stripHTMLMedia
 
+# The (?si) flags make the regex match case-insensitively and make . match any
+# character including newlines.
+# See: https://docs.python.org/3/howto/regex.html#compilation-flags
 clozeReg = r"(?si)\{\{(c)%s::(.*?)(::(.*?))?\}\}"
 
 modifiers: Dict[str, Callable] = {}
@@ -34,6 +37,7 @@ def get_or_attr(obj, name, default=None) -> Any:
             return default
 
 
+
 class Template:
     # The regular expression used to find a #section
     section_re: Pattern = None
@@ -197,6 +201,7 @@ class Template:
     def clozeText(self, txt, ord, type) -> str:
         reg = clozeReg
         if not re.search(reg%ord, txt):
+            # No Cloze deletion was found in txt.
             return ""
         txt = self._removeFormattingFromMathjax(txt, ord)
         def repl(m):
@@ -216,13 +221,31 @@ class Template:
         # and display other clozes normally
         return re.sub(reg%r"\d+", "\\2", txt)
 
-    # look for clozes wrapped in mathjax, and change {{cx to {{Cx
     def _removeFormattingFromMathjax(self, txt, ord) -> str:
+        """Marks all clozes within MathJax to prevent formatting them.
+
+        Active Cloze deletions within MathJax should not be wrapped inside
+        a Cloze <span>, as that would interfere with MathJax.
+
+        This method finds all Cloze deletions number `ord` in `txt` which are
+        inside MathJax inline or display formulas, and replaces their opening
+        '{{c123' with a '{{C123'. The clozeText method interprets the upper-case
+        C as "don't wrap this Cloze in a <span>".
+        """
+        # TODO: There is a bug in this method.
+        # Say txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord = 1.
+        #
+        # This method should return: '\(a\) {{c1::b}} \[ {{C1::c}} \]'.
+        # Since the {{c1::c}} occurs within a MathJax display formula.
+        # However, it returns '\(a\) {{c1::b}} \[ {{c1::c}} \]'.
+        # This causes the Cloze within the MathJax display formula
+        # to be erroneously formatted with a <span>.
         opening = ["\\(", "\\["]
         closing = ["\\)", "\\]"]
         # flags in middle of expression deprecated
         creg = clozeReg.replace("(?si)", "")
         regex = r"(?si)(\\[([])(.*?)"+(creg%ord)+r"(.*?)(\\[\])])"
+
         def repl(m):
             enclosed = True
             for s in closing:
diff --git a/tests/test_models.py b/tests/test_models.py
index b1cfbce5f..a85230531 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -199,6 +199,19 @@ def test_cloze_mathjax():
     assert "class=cloze" in f.cards()[3].q()
     assert "class=cloze" in f.cards()[4].q()
 
+def test_cloze_mathjax_bug():
+    d = getEmptyCol()
+    d.models.setCurrent(d.models.byName("Cloze"))
+
+    f = d.newNote()
+    f['Text'] = r'\(a\) {{c1::b}} \[ {{c1::c}} \]'
+    assert d.addNote(f)
+    assert len(f.cards()) == 1
+
+    # TODO: The following assertion should work, but currently fails due
+    # to a bug in _removeFormatingFromMathjax.
+    #   assert f.cards()[0].q() == '\(a\) <span class=cloze>[...]</span> \[ [...] \]'
+
 def test_chained_mods():
     d = getEmptyCol()
     d.models.setCurrent(d.models.byName("Cloze"))
diff --git a/tests/test_template.py b/tests/test_template.py
new file mode 100644
index 000000000..59f935736
--- /dev/null
+++ b/tests/test_template.py
@@ -0,0 +1,16 @@
+from anki.template import Template
+
+
+def test_remove_formatting_from_mathjax():
+    t = Template('')
+    assert t._removeFormattingFromMathjax(r'\(2^{{c3::2}}\)', 3) == r'\(2^{{C3::2}}\)'
+
+    txt = (r'{{c1::ok}} \(2^2\) {{c2::not ok}} \(2^{{c3::2}}\) \(x^3\) '
+           r'{{c4::blah}} {{c5::text with \(x^2\) jax}}')
+    # Cloze 2 is not in MathJax, so it should not get protected against
+    # formatting.
+    assert t._removeFormattingFromMathjax(txt, 2) == txt
+
+    # TODO: r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord=1 should return
+    # r'\(a\) {{c1::b}} \[ {{C1::c}} \]', but actually fails to mark the cloze
+    # as not-to-be-formatted.

From 2d2f21bfe3fdcd3e6da101903adabc2ee1fd5068 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= <agentydragon@gmail.com>
Date: Sun, 22 Dec 2019 05:47:45 +0100
Subject: [PATCH 2/5] Fix bug in _removeFormattingFromMathjax

---
 anki/template/template.py | 63 +++++++++++++++++++++++----------------
 tests/test_models.py      |  8 +----
 tests/test_template.py    |  6 ++--
 3 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/anki/template/template.py b/anki/template/template.py
index baa2c34dd..f867db33b 100644
--- a/anki/template/template.py
+++ b/anki/template/template.py
@@ -232,34 +232,45 @@ class Template:
         '{{c123' with a '{{C123'. The clozeText method interprets the upper-case
         C as "don't wrap this Cloze in a <span>".
         """
-        # TODO: There is a bug in this method.
-        # Say txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord = 1.
-        #
-        # This method should return: '\(a\) {{c1::b}} \[ {{C1::c}} \]'.
-        # Since the {{c1::c}} occurs within a MathJax display formula.
-        # However, it returns '\(a\) {{c1::b}} \[ {{c1::c}} \]'.
-        # This causes the Cloze within the MathJax display formula
-        # to be erroneously formatted with a <span>.
-        opening = ["\\(", "\\["]
-        closing = ["\\)", "\\]"]
-        # flags in middle of expression deprecated
         creg = clozeReg.replace("(?si)", "")
-        regex = r"(?si)(\\[([])(.*?)"+(creg%ord)+r"(.*?)(\\[\])])"
 
-        def repl(m):
-            enclosed = True
-            for s in closing:
-                if s in m.group(1):
-                    enclosed = False
-            for s in opening:
-                if s in m.group(7):
-                    enclosed = False
-            if not enclosed:
-                return m.group(0)
-            # remove formatting
-            return m.group(0).replace("{{c", "{{C")
-        txt = re.sub(regex, repl, txt)
-        return txt
+        # Scan the string left to right.
+        # After a MathJax opening - \( or \[ - flip in_mathjax to True.
+        # After a MathJax closing - \) or \] - flip in_mathjax to False.
+        # When a Cloze pattern number `ord` is found and we are in MathJax,
+        # replace its '{{c' with '{{C'.
+        #
+        # TODO: Report mismatching opens/closes - e.g. '\(\]'
+        # TODO: Report errors in this method better than printing to stdout.
+        # flags in middle of expression deprecated
+        in_mathjax = False
+        def replace(match):
+            nonlocal in_mathjax
+            if match.group('mathjax_open'):
+                if in_mathjax:
+                    print("MathJax opening found while already in MathJax")
+                in_mathjax = True
+            elif match.group('mathjax_close'):
+                if not in_mathjax:
+                    print("MathJax close found while not in MathJax")
+                in_mathjax = False
+            elif match.group('cloze'):
+                if in_mathjax:
+                    return match.group(0).replace(
+                        '{{c{}::'.format(ord),
+                        '{{C{}::'.format(ord))
+            else:
+                print("Unexpected: no expected capture group is present")
+            return match.group(0)
+        # The following regex matches one of:
+        #  -  MathJax opening
+        #  -  MathJax close
+        #  -  Cloze deletion number `ord`
+        return re.sub(
+            r"(?si)"
+            r"(?P<mathjax_open>\\[([])|"
+            r"(?P<mathjax_close>\\[\])])|"
+            r"(?P<cloze>" + (creg%ord) + ")", replace, txt)
 
     @modifier('=')
     def render_delimiter(self, tag_name=None, context=None) -> str:
diff --git a/tests/test_models.py b/tests/test_models.py
index a85230531..971ec02ca 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -199,18 +199,12 @@ def test_cloze_mathjax():
     assert "class=cloze" in f.cards()[3].q()
     assert "class=cloze" in f.cards()[4].q()
 
-def test_cloze_mathjax_bug():
-    d = getEmptyCol()
-    d.models.setCurrent(d.models.byName("Cloze"))
-
     f = d.newNote()
     f['Text'] = r'\(a\) {{c1::b}} \[ {{c1::c}} \]'
     assert d.addNote(f)
     assert len(f.cards()) == 1
+    assert f.cards()[0].q().endswith('\(a\) <span class=cloze>[...]</span> \[ [...] \]')
 
-    # TODO: The following assertion should work, but currently fails due
-    # to a bug in _removeFormatingFromMathjax.
-    #   assert f.cards()[0].q() == '\(a\) <span class=cloze>[...]</span> \[ [...] \]'
 
 def test_chained_mods():
     d = getEmptyCol()
diff --git a/tests/test_template.py b/tests/test_template.py
index 59f935736..7132c1240 100644
--- a/tests/test_template.py
+++ b/tests/test_template.py
@@ -11,6 +11,6 @@ def test_remove_formatting_from_mathjax():
     # formatting.
     assert t._removeFormattingFromMathjax(txt, 2) == txt
 
-    # TODO: r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord=1 should return
-    # r'\(a\) {{c1::b}} \[ {{C1::c}} \]', but actually fails to mark the cloze
-    # as not-to-be-formatted.
+    txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]'
+    assert t._removeFormattingFromMathjax(txt, 1) == (
+        r'\(a\) {{c1::b}} \[ {{C1::c}} \]')

From 5ff05471104f44fd4edc1f2e7eac4481871df9d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= <agentydragon@gmail.com>
Date: Sun, 22 Dec 2019 12:43:15 +0100
Subject: [PATCH 3/5] Don't repeat Python regex docs

---
 anki/template/template.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/anki/template/template.py b/anki/template/template.py
index f867db33b..3c4d4b407 100644
--- a/anki/template/template.py
+++ b/anki/template/template.py
@@ -4,9 +4,7 @@ from typing import Any, Callable, Dict, Pattern
 from anki.hooks import runFilter
 from anki.utils import stripHTML, stripHTMLMedia
 
-# The (?si) flags make the regex match case-insensitively and make . match any
-# character including newlines.
-# See: https://docs.python.org/3/howto/regex.html#compilation-flags
+# Matches a {{c123::clozed-out text::hint}} Cloze deletion, case-insensitively.
 clozeReg = r"(?si)\{\{(c)%s::(.*?)(::(.*?))?\}\}"
 
 modifiers: Dict[str, Callable] = {}

From 2ae342592cfb863e6b1aad6a6233ea799d083042 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= <agentydragon@gmail.com>
Date: Sun, 22 Dec 2019 13:59:24 +0100
Subject: [PATCH 4/5] Deduplicate media extension filter with existing list of
 media extensions

---
 aqt/editor.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/aqt/editor.py b/aqt/editor.py
index cf500c4c2..37c0253f9 100644
--- a/aqt/editor.py
+++ b/aqt/editor.py
@@ -3,6 +3,7 @@
 # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
 import base64
 import html
+import itertools
 import json
 import mimetypes
 import re
@@ -553,10 +554,10 @@ to a cloze type first, via Edit>Change Note Type."""))
     ######################################################################
 
     def onAddMedia(self):
-        key = (_("Media") +
-               " (*.jpg *.png *.gif *.tiff *.svg *.tif *.jpeg "+
-               "*.mp3 *.ogg *.wav *.avi *.ogv *.mpg *.mpeg *.mov *.mp4 " +
-               "*.mkv *.ogx *.ogv *.oga *.flv *.swf *.flac *.webp *.m4a)")
+        extension_filter = ' '.join(
+            '*.' + extension
+            for extension in sorted(itertools.chain(pics, audio)))
+        key = (_("Media") + " (" + extension_filter + ")")
         def accept(file):
             self.addMedia(file, canDelete=True)
         file = getFile(self.widget, _("Add Media"), accept, key, key="media")

From 36bdb4ebe0dd99e20b2336e011d0f29b2bf3a3bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= <agentydragon@gmail.com>
Date: Sun, 22 Dec 2019 14:50:42 +0100
Subject: [PATCH 5/5] Add some type declarations in tags.py

---
 anki/tags.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/anki/tags.py b/anki/tags.py
index 373dc4d20..86eb34064 100644
--- a/anki/tags.py
+++ b/anki/tags.py
@@ -12,7 +12,7 @@ This module manages the tag cache and tags for notes.
 
 import json
 import re
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Callable, Dict, List, Tuple
 
 from anki.hooks import runHook
 from anki.utils import ids2str, intTime
@@ -66,13 +66,13 @@ class TagManager:
         self.register(set(self.split(
             " ".join(self.col.db.list("select distinct tags from notes"+lim)))))
 
-    def allItems(self) -> List[Tuple[Any, Any]]:
+    def allItems(self) -> List[Tuple[str, int]]:
         return list(self.tags.items())
 
     def save(self) -> None:
         self.changed = True
 
-    def byDeck(self, did, children=False) -> List:
+    def byDeck(self, did, children=False) -> List[str]:
         basequery = "select n.tags from cards c, notes n WHERE c.nid = n.id"
         if not children:
             query = basequery + " AND c.did=?"
@@ -127,7 +127,7 @@ class TagManager:
     # String-based utilities
     ##########################################################################
 
-    def split(self, tags) -> List:
+    def split(self, tags) -> List[str]:
         "Parse a string and return a list of tags."
         return [t for t in tags.replace('\u3000', ' ').split(" ") if t]
 
@@ -165,7 +165,7 @@ class TagManager:
     # List-based utilities
     ##########################################################################
 
-    def canonify(self, tagList) -> List:
+    def canonify(self, tagList) -> List[str]:
         "Strip duplicates, adjust case to match existing tags, and sort."
         strippedTags = []
         for t in tagList: