csv importing basics

This commit is contained in:
Damien Elmes 2012-02-29 12:39:35 +09:00
parent 54b8ee059f
commit 7189e57e80
6 changed files with 203 additions and 347 deletions

View file

@ -1,321 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import time
from anki.lang import _
from anki.utils import fieldChecksum, ids2str
from anki.errors import *
from anki.importing.base import Importer
#from anki.deck import NEW_CARDS_RANDOM
# Stores a list of fields, tags, and optionally properties like 'ivl'
######################################################################
class ForeignCard(object):
    """A temporary carrier for one card's field values and tags during import."""

    def __init__(self):
        # Field values and a space-separated tag string, filled in by importers.
        self.fields, self.tags = [], u""
# Base class for csv/supermemo/etc importers
######################################################################
class CardImporter(Importer):
    """Base class for csv/supermemo/etc importers.

    Subclasses implement foreignCards() (and usually fields()); run()
    drives either a plain import or, when updateKey is set, an in-place
    update of existing notes keyed on one field.
    """
    needMapper = True
    # when True, duplicates are tagged instead of being dropped
    tagDuplicates = False
    # if set, update instead of regular importing
    # (foreignCardFieldIndex, fieldModelId)
    updateKey = None
    needDelimiter = False

    def __init__(self, col, file):
        Importer.__init__(self, col, file)
        # backing stores for the model/mapping properties below
        self._model = col.currentModel
        self.tagsToAdd = u""
        self._mapping = None

    def run(self):
        "Import."
        if self.updateKey is not None:
            # update existing notes rather than adding new ones
            return self.doUpdate()
        # NOTE(review): NEW_CARDS_RANDOM is not defined in this file (its
        # import is commented out above) - this raises NameError; confirm.
        random = self.col.newCardOrder == NEW_CARDS_RANDOM
        # NOTE(review): 'num' is computed but never used afterwards
        num = 6
        if random:
            num += 1
        c = self.foreignCards()
        if self.importCards(c):
            self.col.updateCardTags(self.cardIds)
            if random:
                self.col.randomizeNewCards(self.cardIds)
        if c:
            self.col.setModified()

    def doUpdate(self):
        """Update existing notes in place instead of importing.

        Matches foreign cards to notes via the field named by updateKey,
        then bulk-rewrites all other mapped fields and the tags.
        """
        # grab the data from the external file
        cards = self.foreignCards()
        # grab data from db
        fields = self.col.db.all("""
select noteId, value from fields where fieldModelId = :id
and value != ''""",
                                 id=self.updateKey[1])
        # hash it
        vhash = {}
        nids = []
        for (nid, val) in fields:
            nids.append(nid)
            vhash[val] = nid
        # prepare tags
        tagsIdx = None
        try:
            # 0 in the mapping marks the tags column
            tagsIdx = self.mapping.index(0)
            # NOTE(review): canonifyTags is not imported in this file
            for c in cards:
                c.tags = canonifyTags(self.tagsToAdd + " " + c.fields[tagsIdx])
        except ValueError:
            pass
        # look for matches
        upcards = []
        newcards = []
        for c in cards:
            v = c.fields[self.updateKey[0]]
            if v in vhash:
                # ignore empty keys
                if v:
                    # nid, card
                    upcards.append((vhash[v], c))
            else:
                # NOTE(review): newcards is collected but never added anywhere
                newcards.append(c)
        # update fields
        for fm in self.model.fieldModels:
            if fm.id == self.updateKey[1]:
                # don't update key
                continue
            try:
                index = self.mapping.index(fm)
            except ValueError:
                # not mapped
                continue
            data = [{'nid': nid,
                     'fmid': fm.id,
                     'v': c.fields[index],
                     'chk': self.maybeChecksum(c.fields[index], fm.unique)}
                    for (nid, c) in upcards]
            self.col.db.execute("""
update fields set value = :v, chksum = :chk where noteId = :nid
and fieldModelId = :fmid""", data)
        # update tags
        if tagsIdx is not None:
            data = [{'nid': nid,
                     't': c.fields[tagsIdx]}
                    for (nid, c) in upcards]
            self.col.db.execute(
                "update notes set tags = :t where id = :nid",
                data)
        # rebuild caches
        cids = self.col.db.column0(
            "select id from cards where noteId in %s" %
            ids2str(nids))
        self.col.updateCardTags(cids)
        self.col.updateCardsFromNoteIds(nids)
        self.total = len(cards)
        self.col.setModified()

    def fields(self):
        "The number of fields."
        return 0

    def maybeChecksum(self, data, unique):
        # only unique fields carry a checksum; others store ""
        if not unique:
            return ""
        return fieldChecksum(data)

    def foreignCards(self):
        "Return a list of foreign cards for importing."
        assert 0

    def resetMapping(self):
        "Reset mapping to default."
        numFields = self.fields()
        m = [f for f in self.model.fieldModels]
        # 0 is the placeholder entry for the tags column
        m.append(0)
        # pad with None (= discard column) if the file has extra columns
        rem = max(0, self.fields() - len(m))
        m += [None] * rem
        # truncate to the number of columns actually in the file
        del m[numFields:]
        self._mapping = m

    def getMapping(self):
        # lazily build the default mapping on first access
        if not self._mapping:
            self.resetMapping()
        return self._mapping

    def setMapping(self, mapping):
        self._mapping = mapping

    mapping = property(getMapping, setMapping)

    def getModel(self):
        return self._model

    def setModel(self, model):
        self._model = model
        # update the mapping for the new model
        self._mapping = None
        self.getMapping()

    model = property(getModel, setModel)

    def importCards(self, cards):
        "Convert each card into a note, apply attributes and add to col."
        # ensure all unique and required fields are mapped
        for fm in self.model.fieldModels:
            if fm.required or fm.unique:
                if fm not in self.mapping:
                    raise ImportFormatError(
                        type="missingRequiredUnique",
                        info=_("Missing required/unique field '%(field)s'") %
                        {'field': fm.name})
        # NOTE(review): 'active' is computed here but never used
        active = 0
        for cm in self.model.cardModels:
            if cm.active: active += 1
        # strip invalid cards
        cards = self.stripInvalid(cards)
        cards = self.stripOrTagDupes(cards)
        self.cardIds = []
        if cards:
            self.addCards(cards)
        return cards

    def addCards(self, cards):
        "Add notes in bulk from foreign cards."
        # map tags field to attr
        try:
            idx = self.mapping.index(0)
            for c in cards:
                c.tags += " " + c.fields[idx]
        except ValueError:
            pass
        # add notes
        # NOTE(review): genID/notesTable/fieldsTable/cardsTable/canonifyTags
        # are not imported in this file - confirm against the full module.
        noteIds = [genID() for n in range(len(cards))]
        noteCreated = {}
        def fudgeCreated(d, tmp=[]):
            # deliberate mutable default: tmp persists across calls within
            # one addCards() invocation, spacing creation times 0.1ms apart
            # so each note gets a distinct, ordered timestamp
            if not tmp:
                tmp.append(time.time())
            else:
                tmp[0] += 0.0001
            d['created'] = tmp[0]
            noteCreated[d['id']] = d['created']
            return d
        self.col.db.execute(notesTable.insert(),
            [fudgeCreated({'modelId': self.model.id,
                           'tags': canonifyTags(self.tagsToAdd + " " + cards[n].tags),
                           'id': noteIds[n]}) for n in range(len(cards))])
        self.col.db.execute("""
delete from notesDeleted
where noteId in (%s)""" % ",".join([str(s) for s in noteIds]))
        # add all the fields
        for fm in self.model.fieldModels:
            try:
                index = self.mapping.index(fm)
            except ValueError:
                index = None
            data = [{'noteId': noteIds[m],
                     'fieldModelId': fm.id,
                     'ordinal': fm.ordinal,
                     'id': genID(),
                     'value': (index is not None and
                               cards[m].fields[index] or u""),
                     'chksum': self.maybeChecksum(
                         index is not None and
                         cards[m].fields[index] or u"", fm.unique)
                     }
                    for m in range(len(cards))]
            self.col.db.execute(fieldsTable.insert(),
                                data)
        # and cards
        active = 0
        for cm in self.model.cardModels:
            if cm.active:
                active += 1
                data = [self.addMeta({
                    'id': genID(),
                    'noteId': noteIds[m],
                    'noteCreated': noteCreated[noteIds[m]],
                    'cardModelId': cm.id,
                    'ordinal': cm.ordinal,
                    'question': u"",
                    'answer': u""
                    },cards[m]) for m in range(len(cards))]
                self.col.db.execute(cardsTable.insert(),
                                    data)
        self.col.updateCardsFromNoteIds(noteIds)
        self.total = len(noteIds)

    def addMeta(self, data, card):
        "Add any scheduling metadata to cards"
        # remaining attributes on the foreign card (ivl, reps, ...) are
        # copied onto the card row below; fields are note-level only
        if 'fields' in card.__dict__:
            del card.fields
        # stagger per-template creation times slightly below the note's
        t = data['noteCreated'] + data['ordinal'] * 0.00001
        data['created'] = t
        data['modified'] = t
        data['due'] = t
        data.update(card.__dict__)
        data['tags'] = u""
        self.cardIds.append(data['id'])
        data['combinedDue'] = data['due']
        # classify the card: 1=review, 0=learning, 2=new
        if data.get('successive', 0):
            t = 1
        elif data.get('reps', 0):
            t = 0
        else:
            t = 2
        data['type'] = t
        data['queue'] = t
        return data

    def stripInvalid(self, cards):
        # drop cards missing any required mapped field
        return [c for c in cards if self.cardIsValid(c)]

    def cardIsValid(self, card):
        # a card is valid when every required mapped column exists and
        # is non-blank; failures are logged
        fieldNum = len(card.fields)
        for n in range(len(self.mapping)):
            if self.mapping[n] and self.mapping[n].required:
                if fieldNum <= n or not card.fields[n].strip():
                    self.log.append("Note is missing field '%s': %s" %
                                    (self.mapping[n].name,
                                     ", ".join(card.fields)))
                    return False
        return True

    def stripOrTagDupes(self, cards):
        # drop (or, with tagDuplicates, keep-and-tag) cards whose unique
        # fields already exist in the collection
        # build a cache of items
        self.uniqueCache = {}
        for field in self.mapping:
            if field and field.unique:
                self.uniqueCache[field.id] = self.getUniqueCache(field)
        return [c for c in cards if self.cardIsUnique(c)]

    def getUniqueCache(self, field):
        "Return a dict with all fields, to test for uniqueness."
        return dict(self.col.db.all(
            "select value, 1 from fields where fieldModelId = :fmid",
            fmid=field.id))

    def cardIsUnique(self, card):
        fieldsAsTags = []
        for n in range(len(self.mapping)):
            if self.mapping[n] and self.mapping[n].unique:
                if card.fields[n] in self.uniqueCache[self.mapping[n].id]:
                    if not self.tagDuplicates:
                        self.log.append("Note has duplicate '%s': %s" %
                                        (self.mapping[n].name,
                                         ", ".join(card.fields)))
                        return False
                    # keep the card, but remember which fields clashed
                    fieldsAsTags.append(self.mapping[n].name.replace(" ", "-"))
                else:
                    # record the value so later in-file repeats are caught
                    self.uniqueCache[self.mapping[n].id][card.fields[n]] = 1
        if fieldsAsTags:
            card.tags += u" Duplicate:" + (
                "+".join(fieldsAsTags))
            card.tags = canonifyTags(card.tags)
        return True

View file

@ -3,26 +3,26 @@
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import codecs, csv, re
from anki.importing.cardimp import CardImporter, ForeignCard
from anki.importing.noteimp import NoteImporter, ForeignNote
from anki.lang import _
from anki.errors import *
class TextImporter(CardImporter):
class TextImporter(NoteImporter):
needDelimiter = True
patterns = ("\t", ";")
def __init__(self, *args):
Importer.__init__(self, *args)
NoteImporter.__init__(self, *args)
self.lines = None
self.fileobj = None
self.delimiter = None
def foreignCards(self):
def foreignNotes(self):
self.sniff()
# process all lines
log = []
cards = []
notes = []
lineNum = 0
ignored = 0
if self.delimiter:
@ -46,12 +46,12 @@ class TextImporter(CardImporter):
})
ignored += 1
continue
card = self.cardFromFields(row)
cards.append(card)
note = self.noteFromFields(row)
notes.append(note)
self.log = log
self.ignored = ignored
self.fileobj.close()
return cards
return notes
def sniff(self):
"Parse the top line and determine the pattern and number of fields."
@ -77,7 +77,7 @@ class TextImporter(CardImporter):
self.data = [sub(x) for x in self.data.split("\n") if sub(x)]
if self.data:
if self.data[0].startswith("tags:"):
self.tagsToAdd = self.data[0][5:]
self.tagsToAdd = self.data[0][5:].split(" ")
del self.data[0]
self.updateDelimiter()
if not self.dialect and not self.delimiter:
@ -128,7 +128,8 @@ class TextImporter(CardImporter):
self.sniff()
return self.numFields
def cardFromFields(self, fields):
card = ForeignCard()
card.fields.extend([x.strip() for x in fields])
return card
def noteFromFields(self, fields):
note = ForeignNote()
note.fields.extend([x.strip() for x in fields])
print "fixme - add tagsToAdd to note tags"
return note

175
anki/importing/noteimp.py Normal file
View file

@ -0,0 +1,175 @@
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
import time
from anki.lang import _
from anki.utils import fieldChecksum, ids2str, guid64, timestampID, \
    joinFields, splitFields, intTime
from anki.errors import *
from anki.importing.base import Importer
#from anki.deck import NEW_CARDS_RANDOM
# Stores a list of fields, tags and deck
######################################################################
class ForeignNote(object):
    """A temporary carrier for one note parsed from an external file."""

    def __init__(self):
        # Field values, tag list and target deck, filled in by importers.
        self.fields = []
        self.tags = []
        self.deck = None
# Base class for csv/supermemo/etc importers
######################################################################
# - instead of specifying an update key, updating is done by default using
#   the first field
# The mapping is a list of input fields, like:
# ['Expression', 'Reading', '_tags', None]
# - None means that the input column should be discarded
# - _tags maps to the note's tags
# - _deck maps to the card's deck
# If the first field of the model is not in the map, the map is invalid.
class NoteImporter(Importer):
needMapper = True
needDelimiter = False
update = True
def __init__(self, col, file):
Importer.__init__(self, col, file)
self.model = col.models.current()
self.mapping = None
self._deckMap = {}
def run(self):
"Import."
print "fixme: randomize"
assert self.mapping
c = self.foreignNotes()
self.importNotes(c)
def fields(self):
"The number of fields."
return 0
def maybeChecksum(self, data, unique):
if not unique:
return ""
return fieldChecksum(data)
def foreignNotes(self):
"Return a list of foreign notes for importing."
assert 0
def importNotes(self, notes):
"Convert each card into a note, apply attributes and add to col."
# gather checks for duplicate comparison
csums = {}
for csum, id in self.col.db.execute(
"select csum, id from notes where mid = ?", self.model['id']):
if csum in csums:
csums[csum].append(id)
else:
csums[csum] = [id]
fld0idx = self.mapping.index(self.model['flds'][0]['name'])
self._fmap = self.col.models.fieldMap(self.model)
self._nextID = timestampID(self.col.db, "notes")
# loop through the notes
updates = []
new = []
self._ids = []
for n in notes:
fld0 = n.fields[fld0idx]
csum = fieldChecksum(fld0)
# first field must exist
if not fld0:
self.log.append(_("Empty first field: %s") %
" ".join(n.fields))
continue
# already exists?
if csum in csums:
if csums[csum] == -1:
# duplicates in source file; log and ignore
self.log.append(_("Appeared twice in file: %s") %
fld0)
continue
# csum is not a guarantee; have to check
for id in csums[csum]:
flds = self.col.db.scalar(
"select flds from notes where id = ?", id)
if fld0 == splitFields(flds)[0]:
# duplicate
data = self.updateData(n, id)
if data:
updates.append(data)
break
# newly add
else:
data = self.newData(n)
if data:
new.append(data)
# note that we've seen this note once already
csums[fieldChecksum(n.fields[0])] = -1
self.addNew(new)
self.addUpdates(updates)
self.col.updateFieldCache(self._ids)
assert not self.col.genCards(self._ids)
# make sure to update sflds, etc
self.total = len(self._ids)
def newData(self, n):
id = self._nextID
self._nextID += 1
self._ids.append(id)
if not self.processFields(n):
print "no cards generated"
return
return [id, guid64(), self.model['id'], self.didForNote(n),
intTime(), self.col.usn(), self.col.tags.join(n.tags),
n.fieldsStr, "", "", 0, ""]
def addNew(self, rows):
self.col.db.executemany(
"insert or replace into notes values (?,?,?,?,?,?,?,?,?,?,?,?)",
rows)
# need to document that deck is ignored in this case
def updateData(self, n, id):
self._ids.append(id)
if not self.processFields(n):
print "no cards generated"
return
tags = self.col.tags.join(n.tags)
return [intTime(), self.col.usn(), n.fieldsStr, tags,
id, n.fieldsStr, tags]
def addUpdates(self, rows):
self.col.db.executemany("""
update notes set mod = ?, usn = ?, flds = ?, tags = ?
where id = ? and (flds != ? or tags != ?)""", rows)
def didForNote(self, n):
if not n.deck:
n.deck = _("Imported")
if n.deck not in self._deckMap:
self._deckMap[n.deck] = self.col.decks.id(n.deck)
return self._deckMap[n.deck]
def processFields(self, note):
fields = [""]*len(self.model['flds'])
for c, f in enumerate(self.mapping):
if not f:
continue
elif f == "_tags":
note.tags.extend(self.col.tags.split(note.fields[c]))
elif f == "_deck":
note.deck = note.fields[c]
else:
sidx = self._fmap[f][0]
fields[sidx] = note.fields[c]
note.fieldsStr = joinFields(fields)
return self.col.models.availOrds(self.model, note.fieldsStr)

View file

@ -4,7 +4,7 @@
import sys
from anki.importing.cardimp import CardImporter, ForeignCard
from anki.importing.noteimp import NoteImporter, ForeignNote
from anki.lang import _
from anki.errors import *
@ -63,7 +63,7 @@ class SuperMemoElement(SmartDict):
# This is an AnkiImporter
class SupermemoXmlImporter(CardImporter):
class SupermemoXmlImporter(NoteImporter):
"""
Supermemo XML export's to Anki parser.
Goes through a SM collection and fetch all elements.

View file

@ -225,7 +225,7 @@ def checksum(data):
return sha1(data).hexdigest()
def fieldChecksum(data):
# 32 bit unsigned number from first 8 digits of md5 hash
# 32 bit unsigned number from first 8 digits of sha1 hash
return int(checksum(data.encode("utf-8"))[:8], 16)
# Temp files

View file

@ -77,23 +77,24 @@ def test_anki1():
check()
def test_csv():
print "disabled"; return
deck = Deck()
deck.addModel(BasicModel())
file = unicode(os.path.join(testDir, "importing/text-2fields.txt"))
i = csvfile.TextImporter(deck, file)
deck = getEmptyDeck()
file = unicode(os.path.join(testDir, "support/text-2fields.txt"))
i = TextImporter(deck, file)
i.mapping = ['Front', 'Back']
i.run()
# four problems - missing front, dupe front, wrong num of fields
print i.log
# four problems - too many & too few fields, a missing front, and a
# duplicate entry
assert len(i.log) == 4
assert i.total == 5
print deck.db.all("select * from notes")
deck.close()
def test_csv_tags():
print "disabled"; return
deck = Deck()
deck.addModel(BasicModel())
file = unicode(os.path.join(testDir, "importing/text-tags.txt"))
i = csvfile.TextImporter(deck, file)
deck = getEmptyDeck()
file = unicode(os.path.join(testDir, "support/text-tags.txt"))
i = TextImporter(deck, file)
i.run()
notes = deck.db.query(Note).all()
assert len(notes) == 2