From 331781cf453d0e6511e2bf340829eabe1ab6b56f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 04:28:29 +0100 Subject: [PATCH 1/5] Document newly found bug in _removeFormattingFromMathjax Also adds some comments I wrote to help me understand what's going on in the code. I hope to fix this bug myself, but I think it might be beyond what you can do with Python regexes and might require writing a proper parser. So, as step 1, I'm adding in a couple comments explaining that the bug exists and how to reproduce it. --- anki/template/template.py | 25 ++++++++++++++++++++++++- tests/test_models.py | 13 +++++++++++++ tests/test_template.py | 16 ++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 tests/test_template.py diff --git a/anki/template/template.py b/anki/template/template.py index e5f780df6..baa2c34dd 100644 --- a/anki/template/template.py +++ b/anki/template/template.py @@ -4,6 +4,9 @@ from typing import Any, Callable, Dict, Pattern from anki.hooks import runFilter from anki.utils import stripHTML, stripHTMLMedia +# The (?si) flags make the regex match case-insensitively and make . match any +# character including newlines. +# See: https://docs.python.org/3/howto/regex.html#compilation-flags clozeReg = r"(?si)\{\{(c)%s::(.*?)(::(.*?))?\}\}" modifiers: Dict[str, Callable] = {} @@ -34,6 +37,7 @@ def get_or_attr(obj, name, default=None) -> Any: return default + class Template: # The regular expression used to find a #section section_re: Pattern = None @@ -197,6 +201,7 @@ class Template: def clozeText(self, txt, ord, type) -> str: reg = clozeReg if not re.search(reg%ord, txt): + # No Cloze deletion was found in txt. return "" txt = self._removeFormattingFromMathjax(txt, ord) def repl(m): @@ -216,13 +221,31 @@ class Template: # and display other clozes normally return re.sub(reg%r"\d+", "\\2", txt) - # look for clozes wrapped in mathjax, and change {{cx to {{Cx def _removeFormattingFromMathjax(self, txt, ord) -> str: + """Marks all clozes within MathJax to prevent formatting them. + + Active Cloze deletions within MathJax should not be wrapped inside + a Cloze , as that would interfere with MathJax. + + This method finds all Cloze deletions number `ord` in `txt` which are + inside MathJax inline or display formulas, and replaces their opening + '{{c123' with a '{{C123'. The clozeText method interprets the upper-case + C as "don't wrap this Cloze in a ". + """ + # TODO: There is a bug in this method. + # Say txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord = 1. + # + # This method should return: '\(a\) {{c1::b}} \[ {{C1::c}} \]'. + # Since the {{c1::c}} occurs within a MathJax display formula. + # However, it returns '\(a\) {{c1::b}} \[ {{c1::c}} \]'. + # This causes the Cloze within the MathJax display formula + # to be erroneously formatted with a . opening = ["\\(", "\\["] closing = ["\\)", "\\]"] # flags in middle of expression deprecated creg = clozeReg.replace("(?si)", "") regex = r"(?si)(\\[([])(.*?)"+(creg%ord)+r"(.*?)(\\[\])])" + def repl(m): enclosed = True for s in closing: diff --git a/tests/test_models.py b/tests/test_models.py index b1cfbce5f..a85230531 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -199,6 +199,19 @@ def test_cloze_mathjax(): assert "class=cloze" in f.cards()[3].q() assert "class=cloze" in f.cards()[4].q() +def test_cloze_mathjax_bug(): + d = getEmptyCol() + d.models.setCurrent(d.models.byName("Cloze")) + + f = d.newNote() + f['Text'] = r'\(a\) {{c1::b}} \[ {{c1::c}} \]' + assert d.addNote(f) + assert len(f.cards()) == 1 + + # TODO: The following assertion should work, but currently fails due + # to a bug in _removeFormatingFromMathjax. + # assert f.cards()[0].q() == '\(a\) [...] \[ [...] \]' + def test_chained_mods(): d = getEmptyCol() d.models.setCurrent(d.models.byName("Cloze")) diff --git a/tests/test_template.py b/tests/test_template.py new file mode 100644 index 000000000..59f935736 --- /dev/null +++ b/tests/test_template.py @@ -0,0 +1,16 @@ +from anki.template import Template + + +def test_remove_formatting_from_mathjax(): + t = Template('') + assert t._removeFormattingFromMathjax(r'\(2^{{c3::2}}\)', 3) == r'\(2^{{C3::2}}\)' + + txt = (r'{{c1::ok}} \(2^2\) {{c2::not ok}} \(2^{{c3::2}}\) \(x^3\) ' + r'{{c4::blah}} {{c5::text with \(x^2\) jax}}') + # Cloze 2 is not in MathJax, so it should not get protected against + # formatting. + assert t._removeFormattingFromMathjax(txt, 2) == txt + + # TODO: r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord=1 should return + # r'\(a\) {{c1::b}} \[ {{C1::c}} \]', but actually fails to mark the cloze + # as not-to-be-formatted. From 2d2f21bfe3fdcd3e6da101903adabc2ee1fd5068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 05:47:45 +0100 Subject: [PATCH 2/5] Fix bug in _removeFormattingFromMathjax --- anki/template/template.py | 63 +++++++++++++++++++++++---------------- tests/test_models.py | 8 +---- tests/test_template.py | 6 ++-- 3 files changed, 41 insertions(+), 36 deletions(-) diff --git a/anki/template/template.py b/anki/template/template.py index baa2c34dd..f867db33b 100644 --- a/anki/template/template.py +++ b/anki/template/template.py @@ -232,34 +232,45 @@ class Template: '{{c123' with a '{{C123'. The clozeText method interprets the upper-case C as "don't wrap this Cloze in a ". """ - # TODO: There is a bug in this method. - # Say txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord = 1. - # - # This method should return: '\(a\) {{c1::b}} \[ {{C1::c}} \]'. - # Since the {{c1::c}} occurs within a MathJax display formula. - # However, it returns '\(a\) {{c1::b}} \[ {{c1::c}} \]'. - # This causes the Cloze within the MathJax display formula - # to be erroneously formatted with a . - opening = ["\\(", "\\["] - closing = ["\\)", "\\]"] - # flags in middle of expression deprecated creg = clozeReg.replace("(?si)", "") - regex = r"(?si)(\\[([])(.*?)"+(creg%ord)+r"(.*?)(\\[\])])" - def repl(m): - enclosed = True - for s in closing: - if s in m.group(1): - enclosed = False - for s in opening: - if s in m.group(7): - enclosed = False - if not enclosed: - return m.group(0) - # remove formatting - return m.group(0).replace("{{c", "{{C") - txt = re.sub(regex, repl, txt) - return txt + # Scan the string left to right. + # After a MathJax opening - \( or \[ - flip in_mathjax to True. + # After a MathJax closing - \) or \] - flip in_mathjax to False. + # When a Cloze pattern number `ord` is found and we are in MathJax, + # replace its '{{c' with '{{C'. + # + # TODO: Report mismatching opens/closes - e.g. '\(\]' + # TODO: Report errors in this method better than printing to stdout. + # flags in middle of expression deprecated + in_mathjax = False + def replace(match): + nonlocal in_mathjax + if match.group('mathjax_open'): + if in_mathjax: + print("MathJax opening found while already in MathJax") + in_mathjax = True + elif match.group('mathjax_close'): + if not in_mathjax: + print("MathJax close found while not in MathJax") + in_mathjax = False + elif match.group('cloze'): + if in_mathjax: + return match.group(0).replace( + '{{c{}::'.format(ord), + '{{C{}::'.format(ord)) + else: + print("Unexpected: no expected capture group is present") + return match.group(0) + # The following regex matches one of: + # - MathJax opening + # - MathJax close + # - Cloze deletion number `ord` + return re.sub( + r"(?si)" + r"(?P\\[([])|" + r"(?P\\[\])])|" + r"(?P" + (creg%ord) + ")", replace, txt) @modifier('=') def render_delimiter(self, tag_name=None, context=None) -> str: diff --git a/tests/test_models.py b/tests/test_models.py index a85230531..971ec02ca 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -199,18 +199,12 @@ def test_cloze_mathjax(): assert "class=cloze" in f.cards()[3].q() assert "class=cloze" in f.cards()[4].q() -def test_cloze_mathjax_bug(): - d = getEmptyCol() - d.models.setCurrent(d.models.byName("Cloze")) - f = d.newNote() f['Text'] = r'\(a\) {{c1::b}} \[ {{c1::c}} \]' assert d.addNote(f) assert len(f.cards()) == 1 + assert f.cards()[0].q().endswith('\(a\) [...] \[ [...] \]') - # TODO: The following assertion should work, but currently fails due - # to a bug in _removeFormatingFromMathjax. - # assert f.cards()[0].q() == '\(a\) [...] \[ [...] \]' def test_chained_mods(): d = getEmptyCol() diff --git a/tests/test_template.py b/tests/test_template.py index 59f935736..7132c1240 100644 --- a/tests/test_template.py +++ b/tests/test_template.py @@ -11,6 +11,6 @@ def test_remove_formatting_from_mathjax(): # formatting. assert t._removeFormattingFromMathjax(txt, 2) == txt - # TODO: r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord=1 should return - # r'\(a\) {{c1::b}} \[ {{C1::c}} \]', but actually fails to mark the cloze - # as not-to-be-formatted. + txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]' + assert t._removeFormattingFromMathjax(txt, 1) == ( + r'\(a\) {{c1::b}} \[ {{C1::c}} \]') From 5ff05471104f44fd4edc1f2e7eac4481871df9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 12:43:15 +0100 Subject: [PATCH 3/5] Don't repeat Python regex docs --- anki/template/template.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/anki/template/template.py b/anki/template/template.py index f867db33b..3c4d4b407 100644 --- a/anki/template/template.py +++ b/anki/template/template.py @@ -4,9 +4,7 @@ from typing import Any, Callable, Dict, Pattern from anki.hooks import runFilter from anki.utils import stripHTML, stripHTMLMedia -# The (?si) flags make the regex match case-insensitively and make . match any -# character including newlines. -# See: https://docs.python.org/3/howto/regex.html#compilation-flags +# Matches a {{c123::clozed-out text::hint}} Cloze deletion, case-insensitively. clozeReg = r"(?si)\{\{(c)%s::(.*?)(::(.*?))?\}\}" modifiers: Dict[str, Callable] = {} From 2ae342592cfb863e6b1aad6a6233ea799d083042 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 13:59:24 +0100 Subject: [PATCH 4/5] Deduplicate media extension filter with existing list of media extensions --- aqt/editor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/aqt/editor.py b/aqt/editor.py index cf500c4c2..37c0253f9 100644 --- a/aqt/editor.py +++ b/aqt/editor.py @@ -3,6 +3,7 @@ # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html import base64 import html +import itertools import json import mimetypes import re @@ -553,10 +554,10 @@ to a cloze type first, via Edit>Change Note Type.""")) ###################################################################### def onAddMedia(self): - key = (_("Media") + - " (*.jpg *.png *.gif *.tiff *.svg *.tif *.jpeg "+ - "*.mp3 *.ogg *.wav *.avi *.ogv *.mpg *.mpeg *.mov *.mp4 " + - "*.mkv *.ogx *.ogv *.oga *.flv *.swf *.flac *.webp *.m4a)") + extension_filter = ' '.join( + '*.' + extension + for extension in sorted(itertools.chain(pics, audio))) + key = (_("Media") + " (" + extension_filter + ")") def accept(file): self.addMedia(file, canDelete=True) file = getFile(self.widget, _("Add Media"), accept, key, key="media") From 36bdb4ebe0dd99e20b2336e011d0f29b2bf3a3bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Pokorn=C3=BD=20=28Rai=29?= Date: Sun, 22 Dec 2019 14:50:42 +0100 Subject: [PATCH 5/5] Add some type declarations in tags.py --- anki/tags.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/anki/tags.py b/anki/tags.py index 373dc4d20..86eb34064 100644 --- a/anki/tags.py +++ b/anki/tags.py @@ -12,7 +12,7 @@ This module manages the tag cache and tags for notes. import json import re -from typing import Any, Callable, Dict, List, Tuple +from typing import Callable, Dict, List, Tuple from anki.hooks import runHook from anki.utils import ids2str, intTime @@ -66,13 +66,13 @@ class TagManager: self.register(set(self.split( " ".join(self.col.db.list("select distinct tags from notes"+lim))))) - def allItems(self) -> List[Tuple[Any, Any]]: + def allItems(self) -> List[Tuple[str, int]]: return list(self.tags.items()) def save(self) -> None: self.changed = True - def byDeck(self, did, children=False) -> List: + def byDeck(self, did, children=False) -> List[str]: basequery = "select n.tags from cards c, notes n WHERE c.nid = n.id" if not children: query = basequery + " AND c.did=?" @@ -127,7 +127,7 @@ class TagManager: # String-based utilities ########################################################################## - def split(self, tags) -> List: + def split(self, tags) -> List[str]: "Parse a string and return a list of tags." return [t for t in tags.replace('\u3000', ' ').split(" ") if t] @@ -165,7 +165,7 @@ class TagManager: # List-based utilities ########################################################################## - def canonify(self, tagList) -> List: + def canonify(self, tagList) -> List[str]: "Strip duplicates, adjust case to match existing tags, and sort." strippedTags = [] for t in tagList: