From 331781cf453d0e6511e2bf340829eabe1ab6b56f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 04:28:29 +0100 Subject: [PATCH 1/3] Document newly found bug in _removeFormattingFromMathjax Also adds some comments I wrote to help me understand what's going on in the code. I hope to fix this bug myself, but I think it might be beyond what you can do with Python regexes and might require writing a proper parser. So, as step 1, I'm adding in a couple comments explaining that the bug exists and how to reproduce it. --- anki/template/template.py | 25 ++++++++++++++++++++++++- tests/test_models.py | 13 +++++++++++++ tests/test_template.py | 16 ++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/test_template.py diff --git a/anki/template/template.py b/anki/template/template.py index e5f780df6..baa2c34dd 100644 --- a/anki/template/template.py +++ b/anki/template/template.py @@ -4,6 +4,9 @@ from typing import Any, Callable, Dict, Pattern from anki.hooks import runFilter from anki.utils import stripHTML, stripHTMLMedia +# The (?si) flags make the regex match case-insensitively and make . match any +# character including newlines. +# See: https://docs.python.org/3/howto/regex.html#compilation-flags clozeReg = r"(?si)\{\{(c)%s::(.*?)(::(.*?))?\}\}" modifiers: Dict[str, Callable] = {} @@ -34,6 +37,7 @@ def get_or_attr(obj, name, default=None) -> Any: return default + class Template: # The regular expression used to find a #section section_re: Pattern = None @@ -197,6 +201,7 @@ class Template: def clozeText(self, txt, ord, type) -> str: reg = clozeReg if not re.search(reg%ord, txt): + # No Cloze deletion was found in txt. return "" txt = self._removeFormattingFromMathjax(txt, ord) def repl(m): @@ -216,13 +221,31 @@ class Template: # and display other clozes normally return re.sub(reg%r"\d+", "\\2", txt) - # look for clozes wrapped in mathjax, and change {{cx to {{Cx def _removeFormattingFromMathjax(self, txt, ord) -> str: + """Marks all clozes within MathJax to prevent formatting them. + + Active Cloze deletions within MathJax should not be wrapped inside + a Cloze , as that would interfere with MathJax. + + This method finds all Cloze deletions number `ord` in `txt` which are + inside MathJax inline or display formulas, and replaces their opening + '{{c123' with a '{{C123'. The clozeText method interprets the upper-case + C as "don't wrap this Cloze in a ". + """ + # TODO: There is a bug in this method. + # Say txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord = 1. + # + # This method should return: '\(a\) {{c1::b}} \[ {{C1::c}} \]'. + # Since the {{c1::c}} occurs within a MathJax display formula. + # However, it returns '\(a\) {{c1::b}} \[ {{c1::c}} \]'. + # This causes the Cloze within the MathJax display formula + # to be erroneously formatted with a . opening = ["\\(", "\\["] closing = ["\\)", "\\]"] # flags in middle of expression deprecated creg = clozeReg.replace("(?si)", "") regex = r"(?si)(\\[([])(.*?)"+(creg%ord)+r"(.*?)(\\[\])])" + def repl(m): enclosed = True for s in closing: diff --git a/tests/test_models.py b/tests/test_models.py index b1cfbce5f..a85230531 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -199,6 +199,19 @@ def test_cloze_mathjax(): assert "class=cloze" in f.cards()[3].q() assert "class=cloze" in f.cards()[4].q() +def test_cloze_mathjax_bug(): + d = getEmptyCol() + d.models.setCurrent(d.models.byName("Cloze")) + + f = d.newNote() + f['Text'] = r'\(a\) {{c1::b}} \[ {{c1::c}} \]' + assert d.addNote(f) + assert len(f.cards()) == 1 + + # TODO: The following assertion should work, but currently fails due + # to a bug in _removeFormatingFromMathjax. + # assert f.cards()[0].q() == '\(a\) [...] \[ [...] \]' + def test_chained_mods(): d = getEmptyCol() d.models.setCurrent(d.models.byName("Cloze")) diff --git a/tests/test_template.py b/tests/test_template.py new file mode 100644 index 000000000..59f935736 --- /dev/null +++ b/tests/test_template.py @@ -0,0 +1,16 @@ +from anki.template import Template + + +def test_remove_formatting_from_mathjax(): + t = Template('') + assert t._removeFormattingFromMathjax(r'\(2^{{c3::2}}\)', 3) == r'\(2^{{C3::2}}\)' + + txt = (r'{{c1::ok}} \(2^2\) {{c2::not ok}} \(2^{{c3::2}}\) \(x^3\) ' + r'{{c4::blah}} {{c5::text with \(x^2\) jax}}') + # Cloze 2 is not in MathJax, so it should not get protected against + # formatting. + assert t._removeFormattingFromMathjax(txt, 2) == txt + + # TODO: r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord=1 should return + # r'\(a\) {{c1::b}} \[ {{C1::c}} \]', but actually fails to mark the cloze + # as not-to-be-formatted. From 2d2f21bfe3fdcd3e6da101903adabc2ee1fd5068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 05:47:45 +0100 Subject: [PATCH 2/3] Fix bug in _removeFormattingFromMathjax --- anki/template/template.py | 63 +++++++++++++++++++++++---------------- tests/test_models.py | 8 +---- tests/test_template.py | 6 ++-- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/anki/template/template.py b/anki/template/template.py index baa2c34dd..f867db33b 100644 --- a/anki/template/template.py +++ b/anki/template/template.py @@ -232,34 +232,45 @@ class Template: '{{c123' with a '{{C123'. The clozeText method interprets the upper-case C as "don't wrap this Cloze in a ". """ - # TODO: There is a bug in this method. - # Say txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord = 1. - # - # This method should return: '\(a\) {{c1::b}} \[ {{C1::c}} \]'. - # Since the {{c1::c}} occurs within a MathJax display formula. - # However, it returns '\(a\) {{c1::b}} \[ {{c1::c}} \]'. - # This causes the Cloze within the MathJax display formula - # to be erroneously formatted with a . - opening = ["\\(", "\\["] - closing = ["\\)", "\\]"] - # flags in middle of expression deprecated creg = clozeReg.replace("(?si)", "") - regex = r"(?si)(\\[([])(.*?)"+(creg%ord)+r"(.*?)(\\[\])])" - def repl(m): - enclosed = True - for s in closing: - if s in m.group(1): - enclosed = False - for s in opening: - if s in m.group(7): - enclosed = False - if not enclosed: - return m.group(0) - # remove formatting - return m.group(0).replace("{{c", "{{C") - txt = re.sub(regex, repl, txt) - return txt + # Scan the string left to right. + # After a MathJax opening - \( or \[ - flip in_mathjax to True. + # After a MathJax closing - \) or \] - flip in_mathjax to False. + # When a Cloze pattern number `ord` is found and we are in MathJax, + # replace its '{{c' with '{{C'. + # + # TODO: Report mismatching opens/closes - e.g. '\(\]' + # TODO: Report errors in this method better than printing to stdout. + # flags in middle of expression deprecated + in_mathjax = False + def replace(match): + nonlocal in_mathjax + if match.group('mathjax_open'): + if in_mathjax: + print("MathJax opening found while already in MathJax") + in_mathjax = True + elif match.group('mathjax_close'): + if not in_mathjax: + print("MathJax close found while not in MathJax") + in_mathjax = False + elif match.group('cloze'): + if in_mathjax: + return match.group(0).replace( + '{{c{}::'.format(ord), + '{{C{}::'.format(ord)) + else: + print("Unexpected: no expected capture group is present") + return match.group(0) + # The following regex matches one of: + # - MathJax opening + # - MathJax close + # - Cloze deletion number `ord` + return re.sub( + r"(?si)" + r"(?P\\[([])|" + r"(?P\\[\])])|" + r"(?P" + (creg%ord) + ")", replace, txt) @modifier('=') def render_delimiter(self, tag_name=None, context=None) -> str: diff --git a/tests/test_models.py b/tests/test_models.py index a85230531..971ec02ca 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -199,18 +199,12 @@ def test_cloze_mathjax(): assert "class=cloze" in f.cards()[3].q() assert "class=cloze" in f.cards()[4].q() -def test_cloze_mathjax_bug(): - d = getEmptyCol() - d.models.setCurrent(d.models.byName("Cloze")) - f = d.newNote() f['Text'] = r'\(a\) {{c1::b}} \[ {{c1::c}} \]' assert d.addNote(f) assert len(f.cards()) == 1 + assert f.cards()[0].q().endswith('\(a\) [...] \[ [...] \]') - # TODO: The following assertion should work, but currently fails due - # to a bug in _removeFormatingFromMathjax. - # assert f.cards()[0].q() == '\(a\) [...] \[ [...] \]' def test_chained_mods(): d = getEmptyCol() diff --git a/tests/test_template.py b/tests/test_template.py index 59f935736..7132c1240 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -11,6 +11,6 @@ def test_remove_formatting_from_mathjax(): # formatting. assert t._removeFormattingFromMathjax(txt, 2) == txt - # TODO: r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord=1 should return - # r'\(a\) {{c1::b}} \[ {{C1::c}} \]', but actually fails to mark the cloze - # as not-to-be-formatted. + txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]' + assert t._removeFormattingFromMathjax(txt, 1) == ( + r'\(a\) {{c1::b}} \[ {{C1::c}} \]') From 5ff05471104f44fd4edc1f2e7eac4481871df9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 12:43:15 +0100 Subject: [PATCH 3/3] Don't repeat Python regex docs --- anki/template/template.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/anki/template/template.py b/anki/template/template.py index f867db33b..3c4d4b407 100644 --- a/anki/template/template.py +++ b/anki/template/template.py @@ -4,9 +4,7 @@ from typing import Any, Callable, Dict, Pattern from anki.hooks import runFilter from anki.utils import stripHTML, stripHTMLMedia -# The (?si) flags make the regex match case-insensitively and make . match any -# character including newlines. -# See: https://docs.python.org/3/howto/regex.html#compilation-flags +# Matches a {{c123::clozed-out text::hint}} Cloze deletion, case-insensitively. clozeReg = r"(?si)\{\{(c)%s::(.*?)(::(.*?))?\}\}" modifiers: Dict[str, Callable] = {}