ensure fields normalized before checksumming

https://forums.ankiweb.net/t/python-checksum-rust-checksum/8195
This commit is contained in:
Damien Elmes 2021-03-13 10:23:32 +10:00
parent bd959731d7
commit 1ab085dfab

View file

@@ -15,6 +15,7 @@ import sys
 import tempfile
 import time
 import traceback
+import unicodedata
 from contextlib import contextmanager
 from hashlib import sha1
 from html.entities import name2codepoint
@@ -201,8 +202,11 @@ def checksum(data: Union[bytes, str]) -> str:
 def fieldChecksum(data: str) -> int:
+    without_html = stripHTMLMedia(data)
+    normalized = unicodedata.normalize("NFC", without_html)
+    utf8_text = normalized.encode("utf-8")
     # 32 bit unsigned number from first 8 digits of sha1 hash
-    return int(checksum(stripHTMLMedia(data).encode("utf-8"))[:8], 16)
+    return int(checksum(utf8_text)[:8], 16)
 # Temp files