diff --git a/anki/importing/__init__.py b/anki/importing/__init__.py index 3e13f967f..264c0052b 100644 --- a/anki/importing/__init__.py +++ b/anki/importing/__init__.py @@ -253,7 +253,7 @@ where factId in (%s)""" % ",".join([str(s) for s in factIds])) # Export modules ########################################################################## -from anki.importing.csv import TextImporter +from anki.importing.csvfile import TextImporter from anki.importing.anki10 import Anki10Importer from anki.importing.mnemosyne10 import Mnemosyne10Importer from anki.importing.wcu import WCUImporter diff --git a/anki/importing/csv.py b/anki/importing/csv.py deleted file mode 100644 index 4cbb9752d..000000000 --- a/anki/importing/csv.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright: Damien Elmes -# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html - -"""\ -Importing CSV/TSV files -======================== -""" -__docformat__ = 'restructuredtext' - -import codecs -from anki.importing import Importer, ForeignCard -from anki.lang import _ -from anki.errors import * -from anki.utils import tidyHTML - -class TextImporter(Importer): - - patterns = ("\t", ";") - - def __init__(self, *args): - Importer.__init__(self, *args) - self.lines = None - - def foreignCards(self): - self.parseTopLine() - # process all lines - log = [] - cards = [] - lineNum = 0 - ignored = 0 - for line in self.lines: - lineNum += 1 - if not line.strip(): - # ignore blank lines - continue - try: - fields = self.parseLine(line) - except ValueError: - log.append(_("Line %(line)d doesn't match pattern '%(pat)s'") - % { - 'line': lineNum, - 'pat': pattern, - }) - ignored += 1 - continue - if len(fields) != self.numFields: - log.append(_( - "Line %(line)d had %(num1)d fields," - " expected %(num2)d") % { - "line": lineNum, - "num1": len(fields), - "num2": self.numFields, - }) - ignored += 1 - continue - card = self.cardFromFields(fields) - cards.append(card) - self.log = log - self.ignored = ignored - return cards - - def parseTopLine(self): - "Parse the top line and determine the pattern and number of fields." - # load & look for the right pattern - self.cacheFile() - # look for the first non-blank line - l = None - for line in self.lines: - ret = line.strip() - if ret: - l = line - break - if not l: - raise ImportFormatError(type="emptyFile", - info=_("The file had no non-empty lines.")) - found = False - for p in self.patterns: - if p in l: - pattern = p - fields = l.split(p) - numFields = len(fields) - found = True - break - if not found: - fmtError = _( - "Couldn't find pattern. The file should be a series " - "of lines separated by tabs or semicolons.") - raise ImportFormatError(type="invalidPattern", - info=fmtError) - self.pattern = pattern - self.setNumFields(line) - - def cacheFile(self): - "Read file into self.lines if not already there." - if not self.lines: - self.lines = self.readFile() - - def readFile(self): - f = codecs.open(self.file, encoding="utf-8") - try: - data = f.readlines() - except UnicodeDecodeError, e: - raise ImportFormatError(type="encodingError", - info=_("The file was not in UTF8 format.")) - if not data: - return [] - if data[0].startswith(unicode(codecs.BOM_UTF8, "utf8")): - data[0] = data[0][1:] - # remove comment char - lines = [l for l in data if not l.lstrip().startswith("#")] - return lines - - def fields(self): - "Number of fields." - self.parseTopLine() - return self.numFields - - def setNumFields(self, line): - self.numFields = len(self.parseLine(line)) - - def parseLine(self, line): - fields = line.split(self.pattern) - fields = [tidyHTML(f.strip()) for f in fields] - return fields - - def cardFromFields(self, fields): - card = ForeignCard() - card.fields.extend(fields) - return card diff --git a/anki/importing/csvfile.py b/anki/importing/csvfile.py new file mode 100644 index 000000000..92ac34bfb --- /dev/null +++ b/anki/importing/csvfile.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- +# Copyright: Damien Elmes +# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html + +"""\ +Importing CSV/TSV files +======================== +""" +__docformat__ = 'restructuredtext' + +import codecs, csv, re +from anki.importing import Importer, ForeignCard +from anki.lang import _ +from anki.errors import * +from anki.utils import tidyHTML + +class TextImporter(Importer): + + patterns = ("\t", ";") + + def __init__(self, *args): + Importer.__init__(self, *args) + self.lines = None + self.fileobj = None + + def foreignCards(self): + self.sniff() + # process all lines + log = [] + cards = [] + lineNum = 0 + ignored = 0 + reader = csv.reader(self.data, self.dialect) + for row in reader: + row = [unicode(x, "utf-8") for x in row] + if len(row) != self.numFields: + log.append(_( + "'%(row)s' had %(num1)d fields, " + "expected %(num2)d") % { + "row": u" ".join(row), + "num1": len(row), + "num2": self.numFields, + }) + ignored += 1 + continue + card = self.cardFromFields(row) + cards.append(card) + self.log = log + self.ignored = ignored + return cards + + def sniff(self): + "Parse the top line and determine the pattern and number of fields." + # load & look for the right pattern + self.cacheFile() + + def cacheFile(self): + "Read file into self.lines if not already there." + if not self.fileobj: + self.openFile() + + def openFile(self): + self.dialect = None + self.fileobj = open(self.file, "rb") + try: + self.data = self.fileobj.read() + self.data = re.sub("^ *#.*", "", self.data) + self.data = [x for x in self.data.split("\n") if x] + if self.data: + # strip out comments and blank lines + try: + self.dialect = csv.Sniffer().sniff("\n".join(self.data[:10])) + except: + self.dialect = csv.Sniffer().sniff(self.data[0]) + reader = csv.reader(self.data, self.dialect) + self.numFields = len(reader.next()) + else: + self.dialect = None + except UnicodeDecodeError, e: + raise ImportFormatError( + type="encodingError", + info=_("The file was not in UTF8 format.")) + if not self.dialect: + raise ImportFormatError( + type="encodingError", + info=_("Couldn't determine format of file.")) + + def fields(self): + "Number of fields." + self.sniff() + return self.numFields + + def setNumFields(self, line): + self.numFields = len(self.parseLine(line)) + + def parseLine(self, line): + fields = line.split(self.pattern) + fields = [tidyHTML(f.strip()) for f in fields] + return fields + + def cardFromFields(self, fields): + card = ForeignCard() + card.fields.extend(fields) + return card diff --git a/tests/importing/text-2fields.txt b/tests/importing/text-2fields.txt index 199b2faef..b72afcaba 100644 --- a/tests/importing/text-2fields.txt +++ b/tests/importing/text-2fields.txt @@ -5,5 +5,7 @@ テスト test to eat 食べる 飲む to drink +多すぎる too many fields +not, enough, fields 遊ぶ to play diff --git a/tests/importing/text-tags.txt b/tests/importing/text-tags.txt index c088e1500..c7f68a8ce 100644 --- a/tests/importing/text-tags.txt +++ b/tests/importing/text-tags.txt @@ -1 +1,2 @@ foo bar baz,qux +foo2 bar2 baz2 diff --git a/tests/test_importing.py b/tests/test_importing.py index 002e127fc..7731f0616 100644 --- a/tests/test_importing.py +++ b/tests/test_importing.py @@ -5,7 +5,7 @@ from tests.shared import assertException from anki.errors import * from anki import DeckStorage -from anki.importing import anki10, csv, mnemosyne10 +from anki.importing import anki10, csvfile, mnemosyne10 from anki.stdmodels import BasicModel from anki.facts import Fact from anki.sync import SyncClient, SyncServer @@ -18,10 +18,10 @@ def test_csv(): deck = DeckStorage.Deck() deck.addModel(BasicModel()) file = unicode(os.path.join(testDir, "importing/text-2fields.txt")) - i = csv.TextImporter(deck, file) + i = csvfile.TextImporter(deck, file) i.doImport() - # two problems - missing front, dupe front - assert len(i.log) == 2 + # four problems - missing front, dupe front, wrong num of fields + assert len(i.log) == 4 assert i.total == 5 deck.s.close() @@ -29,11 +29,11 @@ def test_csv_tags(): deck = DeckStorage.Deck() deck.addModel(BasicModel()) file = unicode(os.path.join(testDir, "importing/text-tags.txt")) - i = csv.TextImporter(deck, file) + i = csvfile.TextImporter(deck, file) i.doImport() facts = deck.s.query(Fact).all() - assert len(facts) == 1 - assert facts[0].tags == "baz qux" + assert len(facts) == 2 + assert facts[0].tags == "baz qux" or facts[1].tags == "baz qux" deck.s.close() def test_mnemosyne10():