switch to python csv

This commit is contained in:
Damien Elmes 2009-06-18 05:21:47 +09:00
parent b6a50db1c2
commit e62967ecb1
6 changed files with 115 additions and 138 deletions

View file

@ -253,7 +253,7 @@ where factId in (%s)""" % ",".join([str(s) for s in factIds]))
# Export modules
##########################################################################
from anki.importing.csv import TextImporter
from anki.importing.csvfile import TextImporter
from anki.importing.anki10 import Anki10Importer
from anki.importing.mnemosyne10 import Mnemosyne10Importer
from anki.importing.wcu import WCUImporter

View file

@ -1,130 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
"""\
Importing CSV/TSV files
========================
"""
__docformat__ = 'restructuredtext'
import codecs
from anki.importing import Importer, ForeignCard
from anki.lang import _
from anki.errors import *
from anki.utils import tidyHTML
class TextImporter(Importer):
    """Import cards from a text file whose fields are split by a separator.

    The separator is the first entry of ``patterns`` that occurs in the first
    non-blank line of the file. That same line determines how many fields
    every other line is expected to have.
    """

    # Candidate field separators, tried in this order.
    patterns = ("\t", ";")

    def __init__(self, *args):
        Importer.__init__(self, *args)
        # Cached, comment-stripped lines of the file; filled by cacheFile().
        self.lines = None

    def foreignCards(self):
        """Return a list of ForeignCard objects, one per well-formed line.

        Blank lines are skipped silently. Lines that fail to parse, or that
        have the wrong number of fields, are counted in ``self.ignored`` and
        described in ``self.log``. Line numbers in the log refer to the
        cached lines, i.e. after comment lines have been removed.
        """
        self.parseTopLine()
        log = []
        cards = []
        ignored = 0
        for idx, line in enumerate(self.lines):
            lineNum = idx + 1
            if not line.strip():
                # ignore blank lines
                continue
            try:
                fields = self.parseLine(line)
            except ValueError:
                # The separator lives on the instance; the previous code
                # referenced an undefined local here and raised NameError.
                log.append(_("Line %(line)d doesn't match pattern '%(pat)s'")
                           % {
                    'line': lineNum,
                    'pat': self.pattern,
                    })
                ignored += 1
                continue
            if len(fields) != self.numFields:
                log.append(_(
                    "Line %(line)d had %(num1)d fields,"
                    " expected %(num2)d") % {
                    "line": lineNum,
                    "num1": len(fields),
                    "num2": self.numFields,
                    })
                ignored += 1
                continue
            cards.append(self.cardFromFields(fields))
        self.log = log
        self.ignored = ignored
        return cards

    def parseTopLine(self):
        """Determine the separator and field count from the first non-blank line.

        Sets ``self.pattern`` and ``self.numFields``. Raises ImportFormatError
        when the file has no non-blank line or when none of ``patterns``
        occurs in that line.
        """
        self.cacheFile()
        # look for the first non-blank line
        top = None
        for line in self.lines:
            if line.strip():
                top = line
                break
        if not top:
            raise ImportFormatError(type="emptyFile",
                                    info=_("The file had no non-empty lines."))
        # pick the first candidate separator present in that line
        pattern = None
        for p in self.patterns:
            if p in top:
                pattern = p
                break
        if pattern is None:
            fmtError = _(
                "Couldn't find pattern. The file should be a series "
                "of lines separated by tabs or semicolons.")
            raise ImportFormatError(type="invalidPattern",
                                    info=fmtError)
        self.pattern = pattern
        self.setNumFields(top)

    def cacheFile(self):
        "Read file into self.lines if not already there."
        if not self.lines:
            self.lines = self.readFile()

    def readFile(self):
        """Return the file's lines as unicode strings, minus comment lines.

        A leading UTF-8 byte order mark is dropped from the first line.
        Raises ImportFormatError if the file cannot be decoded as UTF-8.
        """
        f = codecs.open(self.file, encoding="utf-8")
        try:
            try:
                data = f.readlines()
            except UnicodeDecodeError:
                raise ImportFormatError(
                    type="encodingError",
                    info=_("The file was not in UTF8 format."))
        finally:
            # always release the handle, including on decode errors
            f.close()
        if not data:
            return []
        if data[0].startswith(unicode(codecs.BOM_UTF8, "utf8")):
            # the decoded BOM is a single character
            data[0] = data[0][1:]
        # remove comment lines (first non-whitespace character is '#')
        return [l for l in data if not l.lstrip().startswith("#")]

    def fields(self):
        "Number of fields."
        self.parseTopLine()
        return self.numFields

    def setNumFields(self, line):
        "Set self.numFields to the number of fields parsed from `line`."
        self.numFields = len(self.parseLine(line))

    def parseLine(self, line):
        "Split `line` on self.pattern; strip and tidyHTML() each field."
        return [tidyHTML(f.strip()) for f in line.split(self.pattern)]

    def cardFromFields(self, fields):
        "Return a new ForeignCard whose fields list holds `fields`."
        card = ForeignCard()
        card.fields.extend(fields)
        return card

104
anki/importing/csvfile.py Normal file
View file

@ -0,0 +1,104 @@
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
"""\
Importing CSV/TSV files
========================
"""
__docformat__ = 'restructuredtext'
import codecs, csv, re
from anki.importing import Importer, ForeignCard
from anki.lang import _
from anki.errors import *
from anki.utils import tidyHTML
class TextImporter(Importer):
    """Import cards from a delimited text file using the csv module.

    The file is read as raw bytes, comment and empty lines are dropped, and
    csv.Sniffer guesses the dialect. The first remaining row determines how
    many fields each row is expected to have. Field values are decoded from
    UTF-8 when the rows are turned into cards.
    """

    # Separator candidates kept from the previous importer. Nothing in this
    # class consults them; the csv sniffer chooses the delimiter.
    patterns = ("\t", ";")

    def __init__(self, *args):
        Importer.__init__(self, *args)
        self.lines = None
        # Set once the file has been opened; doubles as the "already loaded"
        # marker checked by cacheFile().
        self.fileobj = None

    def foreignCards(self):
        """Return a list of ForeignCard objects, one per well-formed row.

        Rows with the wrong number of fields are counted in ``self.ignored``
        and described in ``self.log``. Raises ImportFormatError if a row is
        not valid UTF-8.
        """
        self.sniff()
        log = []
        cards = []
        ignored = 0
        reader = csv.reader(self.data, self.dialect)
        for row in reader:
            try:
                row = [unicode(x, "utf-8") for x in row]
            except UnicodeDecodeError:
                # The data is read as bytes, so this is the first point at
                # which a bad encoding can show up.
                raise ImportFormatError(
                    type="encodingError",
                    info=_("The file was not in UTF8 format."))
            if len(row) != self.numFields:
                log.append(_(
                    "'%(row)s' had %(num1)d fields, "
                    "expected %(num2)d") % {
                    "row": u" ".join(row),
                    "num1": len(row),
                    "num2": self.numFields,
                    })
                ignored += 1
                continue
            cards.append(self.cardFromFields(row))
        self.log = log
        self.ignored = ignored
        return cards

    def sniff(self):
        "Load the file if needed, detecting the dialect and number of fields."
        self.cacheFile()

    def cacheFile(self):
        "Read and analyse the file unless that has already been done."
        if not self.fileobj:
            self.openFile()

    def openFile(self):
        """Read the file and work out its csv dialect and field count.

        Sets ``self.data`` (list of non-empty, non-comment byte-string
        lines), ``self.dialect`` and ``self.numFields``. Raises
        ImportFormatError when there is no usable line or the dialect cannot
        be determined.
        """
        self.dialect = None
        self.fileobj = open(self.file, "rb")
        try:
            raw = self.fileobj.read()
        finally:
            # Everything is in memory now. A closed file object is still
            # truthy, so cacheFile() keeps treating the file as loaded.
            self.fileobj.close()
        # drop a leading UTF-8 byte order mark so it doesn't end up in the
        # first field
        if raw.startswith(codecs.BOM_UTF8):
            raw = raw[len(codecs.BOM_UTF8):]
        # strip out comments and blank lines; a comment is any line whose
        # first non-space character is '#', wherever it sits in the file
        self.data = [x for x in raw.split("\n")
                     if x and not x.lstrip(" ").startswith("#")]
        if self.data:
            sniffer = csv.Sniffer()
            try:
                self.dialect = sniffer.sniff("\n".join(self.data[:10]))
            except Exception:
                # the sample may be inconsistent; retry with the top line only
                try:
                    self.dialect = sniffer.sniff(self.data[0])
                except Exception:
                    # reported below as an ImportFormatError
                    self.dialect = None
        if not self.dialect:
            raise ImportFormatError(
                type="encodingError",
                info=_("Couldn't determine format of file."))
        reader = csv.reader(self.data, self.dialect)
        self.numFields = len(reader.next())

    def fields(self):
        "Number of fields."
        self.sniff()
        return self.numFields

    def setNumFields(self, line):
        "Set self.numFields to the number of fields parsed from `line`."
        self.numFields = len(self.parseLine(line))

    def parseLine(self, line):
        """Split `line` on self.pattern; strip and tidyHTML() each field.

        NOTE(review): self.pattern is not assigned anywhere in this class, so
        this looks like a leftover from the split-based importer; confirm
        before relying on it.
        """
        return [tidyHTML(f.strip()) for f in line.split(self.pattern)]

    def cardFromFields(self, fields):
        "Return a new ForeignCard whose fields list holds `fields`."
        card = ForeignCard()
        card.fields.extend(fields)
        return card

View file

@ -5,5 +5,7 @@
テスト test
to eat 食べる
飲む to drink
多すぎる too many fields
not, enough, fields
遊ぶ
to play

View file

@ -1 +1,2 @@
foo bar baz,qux
foo2 bar2 baz2

View file

@ -5,7 +5,7 @@ from tests.shared import assertException
from anki.errors import *
from anki import DeckStorage
from anki.importing import anki10, csv, mnemosyne10
from anki.importing import anki10, csvfile, mnemosyne10
from anki.stdmodels import BasicModel
from anki.facts import Fact
from anki.sync import SyncClient, SyncServer
@ -18,10 +18,10 @@ def test_csv():
deck = DeckStorage.Deck()
deck.addModel(BasicModel())
file = unicode(os.path.join(testDir, "importing/text-2fields.txt"))
i = csv.TextImporter(deck, file)
i = csvfile.TextImporter(deck, file)
i.doImport()
# two problems - missing front, dupe front
assert len(i.log) == 2
# four problems - missing front, dupe front, wrong num of fields
assert len(i.log) == 4
assert i.total == 5
deck.s.close()
@ -29,11 +29,11 @@ def test_csv_tags():
deck = DeckStorage.Deck()
deck.addModel(BasicModel())
file = unicode(os.path.join(testDir, "importing/text-tags.txt"))
i = csv.TextImporter(deck, file)
i = csvfile.TextImporter(deck, file)
i.doImport()
facts = deck.s.query(Fact).all()
assert len(facts) == 1
assert facts[0].tags == "baz qux"
assert len(facts) == 2
assert facts[0].tags == "baz qux" or facts[1].tags == "baz qux"
deck.s.close()
def test_mnemosyne10():