Document newly found bug in _removeFormattingFromMathjax

Also adds some comments I wrote to help me understand what's going
on in the code.

I hope to fix this bug myself, but I think it might be beyond what
you can do with Python regexes and might require writing a proper
parser.

So, as step 1, I'm adding in a couple comments explaining that the
bug exists and how to reproduce it.
This commit is contained in:
Michal Pokorný (Rai) 2019-12-22 04:28:29 +01:00
parent 59ce08bc4e
commit 331781cf45
3 changed files with 53 additions and 1 deletions

View file

@ -4,6 +4,9 @@ from typing import Any, Callable, Dict, Pattern
from anki.hooks import runFilter
from anki.utils import stripHTML, stripHTMLMedia
# Template for the regex that matches a single Cloze deletion such as
# {{c1::text}} or {{c1::text::extra}}.
#
# The %s placeholder must be filled in with the % operator before use: either
# with a Cloze number, or with a pattern such as r"\d+" to match any number.
#
# Capture groups:
#   1: the letter after "{{" ("c", or "C" since matching is case-insensitive)
#   2: the text of the deletion (lazy match up to the next "::" or "}}")
#   3: the optional "::..." suffix, including its leading "::"
#   4: the text after the second "::" (presumably the hint -- TODO confirm)
#
# The (?si) flags make the regex match case-insensitively and make . match any
# character including newlines.
# See: https://docs.python.org/3/howto/regex.html#compilation-flags
clozeReg = r"(?si)\{\{(c)%s::(.*?)(::(.*?))?\}\}"
modifiers: Dict[str, Callable] = {}
@ -34,6 +37,7 @@ def get_or_attr(obj, name, default=None) -> Any:
return default
class Template:
# The regular expression used to find a #section
section_re: Pattern = None
@ -197,6 +201,7 @@ class Template:
def clozeText(self, txt, ord, type) -> str:
reg = clozeReg
if not re.search(reg%ord, txt):
# No Cloze deletion was found in txt.
return ""
txt = self._removeFormattingFromMathjax(txt, ord)
def repl(m):
@ -216,13 +221,31 @@ class Template:
# and display other clozes normally
return re.sub(reg%r"\d+", "\\2", txt)
# look for clozes wrapped in mathjax, and change {{cx to {{Cx
def _removeFormattingFromMathjax(self, txt, ord) -> str:
"""Marks all clozes within MathJax to prevent formatting them.
Active Cloze deletions within MathJax should not be wrapped inside
a Cloze <span>, as that would interfere with MathJax.
This method finds all Cloze deletions number `ord` in `txt` which are
inside MathJax inline or display formulas, and replaces their opening
'{{c123' with a '{{C123'. The clozeText method interprets the upper-case
C as "don't wrap this Cloze in a <span>".
"""
# TODO: There is a bug in this method.
# Say txt = r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord = 1.
#
# This method should return: '\(a\) {{c1::b}} \[ {{C1::c}} \]'.
# Since the {{c1::c}} occurs within a MathJax display formula.
# However, it returns '\(a\) {{c1::b}} \[ {{c1::c}} \]'.
# This causes the Cloze within the MathJax display formula
# to be erroneously formatted with a <span>.
opening = ["\\(", "\\["]
closing = ["\\)", "\\]"]
# flags in middle of expression deprecated
creg = clozeReg.replace("(?si)", "")
regex = r"(?si)(\\[([])(.*?)"+(creg%ord)+r"(.*?)(\\[\])])"
def repl(m):
enclosed = True
for s in closing:

View file

@ -199,6 +199,19 @@ def test_cloze_mathjax():
assert "class=cloze" in f.cards()[3].q()
assert "class=cloze" in f.cards()[4].q()
def test_cloze_mathjax_bug():
    """Set up the input that triggers the known MathJax cloze bug.

    The note text has an inline formula, then a cloze outside MathJax, then
    a cloze inside a display formula. Only the note and card creation are
    checked for now; the assertion on the rendered question stays disabled
    until the bug is fixed.
    """
    col = getEmptyCol()
    cloze_model = col.models.byName("Cloze")
    col.models.setCurrent(cloze_model)

    note = col.newNote()
    note['Text'] = r'\(a\) {{c1::b}} \[ {{c1::c}} \]'
    assert col.addNote(note)

    # Both deletions share the number 1, so exactly one card is expected.
    cards = note.cards()
    assert len(cards) == 1

    # TODO: The following assertion should work, but currently fails due
    # to a bug in _removeFormattingFromMathjax: the cloze inside the display
    # formula is wrapped in a <span> even though it sits inside MathJax.
    # assert cards[0].q() == r'\(a\) <span class=cloze>[...]</span> \[ [...] \]'
def test_chained_mods():
d = getEmptyCol()
d.models.setCurrent(d.models.byName("Cloze"))

16
tests/test_template.py Normal file
View file

@ -0,0 +1,16 @@
from anki.template import Template
def test_remove_formatting_from_mathjax():
    """Check which clozes _removeFormattingFromMathjax marks as unformatted."""
    template = Template('')

    # A cloze inside an inline formula gets its opening '{{c3' turned into
    # '{{C3'.
    inline_formula = r'\(2^{{c3::2}}\)'
    marked = template._removeFormattingFromMathjax(inline_formula, 3)
    assert marked == r'\(2^{{C3::2}}\)'

    mixed_text = (r'{{c1::ok}} \(2^2\) {{c2::not ok}} \(2^{{c3::2}}\) \(x^3\) '
                  r'{{c4::blah}} {{c5::text with \(x^2\) jax}}')
    # Cloze 2 is not in MathJax, so it should not get protected against
    # formatting: the text must come back unchanged.
    unchanged = template._removeFormattingFromMathjax(mixed_text, 2)
    assert unchanged == mixed_text

    # TODO: r'\(a\) {{c1::b}} \[ {{c1::c}} \]', ord=1 should return
    # r'\(a\) {{c1::b}} \[ {{C1::c}} \]', but actually fails to mark the cloze
    # as not-to-be-formatted.