Anki/anki/features/chinese/save_unihan.py
2008-09-27 23:50:03 +09:00

101 lines
3.1 KiB
Python

# -*- coding: utf-8 -*-
# Read Unihan.txt and save each character's Mandarin reading, Cantonese
# reading and grade level into an SQLite database (unihan.db).
import psyco; psyco.full()
from sqlalchemy import (Table, Integer, Float, Unicode, Column, MetaData,
ForeignKey, Boolean, String, Date, UniqueConstraint,
UnicodeText)
from sqlalchemy import (create_engine)
from sqlalchemy.orm import mapper, sessionmaker, relation, backref, \
object_session as _object_session
from sqlalchemy.sql import select, text, and_
from sqlalchemy.exceptions import DBAPIError
import re
# Schema: one 'unihan' table with an integer primary key, two text columns
# for the readings and an integer grade column.
metadata = MetaData()
unihanTable = Table(
    'unihan',
    metadata,
    Column("id", Integer, primary_key=True),
    Column("mandarin", UnicodeText),
    Column("cantonese", UnicodeText),
    Column("grade", Integer),
)

# SQLite database file in the current working directory.
engine = create_engine(
    "sqlite:///unihan.db",
    strategy='threadlocal',
    echo=False,
)

# 'session' is the session factory bound to the engine; 's' is the single
# session instance the rest of the script writes through.
session = sessionmaker(
    transactional=True,
    autoflush=False,
    bind=engine,
)

# Create the table on the engine before any session is opened.
metadata.create_all(engine)
s = session()
# Convert codes to accents
##########################################################################
# code from Donald Chai
# Maps a plain vowel ('v' stands for u-umlaut) to its spellings indexed by
# tone number: 0 and 5 are unmarked, 1-4 carry the corresponding tone mark.
accenttable = {
    'a' : [u'a', u'ā', u'á', u'ǎ', u'à', u'a'],
    'e' : [u'e', u'ē', u'é', u'ě', u'è', u'e'],
    'i' : [u'i', u'ī', u'í', u'ǐ', u'ì', u'i'],
    'o' : [u'o', u'ō', u'ó', u'ǒ', u'ò', u'o'],
    'u' : [u'u', u'ū', u'ú', u'ǔ', u'ù', u'u'],
    'v' : [u'ü', u'ǖ', u'ǘ', u'ǚ', u'ǜ', u'ü'],
}

# u-umlaut, either as the real character or as its UTF-8 bytes mis-decoded
# as Latin-1 (the two-character sequence \xc3\xbc).
_UMLAUT_RE = re.compile(u'\xfc|\xc3\xbc')

# initial consonants, vowel cluster, remaining non-digits, optional tone digit
_SYLLABLE_RE = re.compile(
    r'([qwrtypsdfghjklzxcbnm]*)([aeiouv]*)(\D*)(\d?)')


def convert(word):
    '''Converts a numbered pinyin syllable to its tone-marked unicode form.

    word is a syllable such as "hao3" or "lve4"; u-umlaut may be written
    as "v" or as the real character.  The input is lower-cased first.

    Returns the syllable with the tone mark placed on the proper vowel and
    the tone digit removed.  If the syllable has no vowel, has no tone
    digit, or the digit is outside the tones known to accenttable, the
    lower-cased input is returned unchanged.
    '''
    word = word.lower()
    orig = word
    # convert u-umlaut to v for now so every vowel is one ASCII letter
    word = _UMLAUT_RE.sub('v', word)
    # every group is optional, so the match itself can never fail
    init, vowel, final, tone = _SYLLABLE_RE.match(word).groups()
    # do nothing if no vowel or tone
    if not vowel or not tone:
        return orig
    tone = int(tone)
    if len(vowel) == 1:
        pos = 0
    elif vowel[-2] in 'iuv':
        # i, u and u-umlaut act as medials here: the mark goes on the
        # vowel that follows them (liu2, xue2, lve4)
        pos = len(vowel) - 1
    else:
        # otherwise the mark goes on the second to last vowel (hao3, xiao3)
        pos = len(vowel) - 2
    forms = accenttable[vowel[pos]]
    if tone >= len(forms):
        # digits such as 6-9 are not Mandarin tones; leave the word alone
        # rather than failing with an IndexError
        return orig
    vowel = vowel[:pos] + forms[tone] + vowel[pos + 1:]
    # any placeholder 'v' that did not receive the mark is still a u-umlaut
    vowel = vowel.replace('v', accenttable['v'][0])
    return init + vowel + final
##########################################################################
# Parse Unihan.txt into {code point: {column name: value}} and bulk-insert
# every code point that has at least one of the tracked fields.
import codecs

kanji = {}
unihan_file = codecs.open("Unihan.txt", encoding="utf-8")
try:
    for line in unihan_file:
        # Only lines made of exactly three tab-separated parts whose first
        # part starts with "U+" are data; everything else is skipped.
        parts = line.strip().split("\t")
        if len(parts) != 3:
            continue
        codepoint, field, value = parts
        if not codepoint.startswith("U+"):
            continue
        n = int(codepoint.replace("U+", ""), 16)
        if n not in kanji:
            kanji[n] = {}
        if field == "kMandarin":
            # store the readings with tone marks instead of tone digits
            kanji[n]['mandarin'] = " ".join(
                [convert(w) for w in value.split()])
        elif field == "kCantonese":
            kanji[n]['cantonese'] = value
        elif field == "kGradeLevel":
            kanji[n]['grade'] = int(value)
finally:
    # release the file handle even if a line fails to parse
    unihan_file.close()

# One parameter dict per row; code points with none of the three fields
# set are left out.
rows = [{'id': k,
         'mandarin': v.get('mandarin'),
         'cantonese': v.get('cantonese'),
         'grade': v.get('grade')}
        for (k, v) in kanji.items()
        if v.get('mandarin') or v.get('cantonese') or v.get('grade')]

# Skip the insert when nothing was collected instead of executing it with
# an empty parameter list.
if rows:
    s.execute(unihanTable.insert(), rows)
s.commit()