mirror of
https://github.com/ankitects/anki.git
synced 2025-09-21 07:22:23 -04:00
101 lines
3.1 KiB
Python
101 lines
3.1 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Read Unihan.txt and save the Mandarin, Cantonese and grade-level fields
# into an SQLite database (unihan.db).
|
|
|
|
import psyco; psyco.full()
|
|
|
|
from sqlalchemy import (Table, Integer, Float, Unicode, Column, MetaData,
|
|
ForeignKey, Boolean, String, Date, UniqueConstraint,
|
|
UnicodeText)
|
|
from sqlalchemy import (create_engine)
|
|
from sqlalchemy.orm import mapper, sessionmaker, relation, backref, \
|
|
object_session as _object_session
|
|
from sqlalchemy.sql import select, text, and_
|
|
from sqlalchemy.exceptions import DBAPIError
|
|
import re
|
|
|
|
# Schema: a single 'unihan' table keyed by the integer "id" column, with
# optional Mandarin and Cantonese readings and an integer grade.
metadata = MetaData()

unihanTable = Table(
    'unihan',
    metadata,
    Column("id", Integer, primary_key=True),
    Column("mandarin", UnicodeText),
    Column("cantonese", UnicodeText),
    Column("grade", Integer),
)

# SQLite database file in the current working directory.
engine = create_engine(
    "sqlite:///unihan.db",
    echo=False,
    strategy='threadlocal',
)

# Create the table on the engine if it is not there yet.
metadata.create_all(engine)

# Session factory bound to the engine; 's' is the session instance that the
# rest of the script executes statements on and commits.
session = sessionmaker(
    bind=engine,
    autoflush=False,
    transactional=True,
)

s = session()
|
|
|
|
# Convert codes to accents
|
|
##########################################################################
|
|
# code from Donald Chai
|
|
|
|
# Accented forms of each pinyin vowel, indexed by tone number: indexes 0 and
# 5 hold the unmarked vowel, indexes 1-4 the four tone marks.  The key 'v' is
# the ASCII stand-in used inside convert() for u-umlaut.
accenttable = {
    'a': [u'a', u'ā', u'á', u'ǎ', u'à', u'a'],
    'e': [u'e', u'ē', u'é', u'ě', u'è', u'e'],
    'i': [u'i', u'ī', u'í', u'ǐ', u'ì', u'i'],
    'o': [u'o', u'ō', u'ó', u'ǒ', u'ò', u'o'],
    'u': [u'u', u'ū', u'ú', u'ǔ', u'ù', u'u'],
    'v': [u'ü', u'ǖ', u'ǘ', u'ǚ', u'ǜ', u'ü'],
}


def convert(word):
    '''Convert a tone-numbered pinyin syllable to its accented form.

    The syllable is lower-cased and split into leading consonants, a vowel
    cluster, trailing non-digit characters and an optional single tone
    digit.  The tone mark is placed on one vowel of the cluster and the
    digit is dropped, e.g. "zhong1" becomes "zhōng".  Only the leading
    syllable of the input is examined.

    word -- one pinyin syllable; u-umlaut may be written as "v" or "ü".

    Returns the accented syllable.  If the syllable has no vowel, no tone
    digit, or a tone digit above 5, the lower-cased input is returned
    unchanged.
    '''
    word = word.lower()
    orig = word
    # convert ü to v for now to make life easier
    word = re.sub(u'\xfc|\xc3\xbc', 'v', word)
    # extract fields; every group is optional, so the match always succeeds
    mo = re.match(r'([qwrtypsdfghjklzxcbnm]*)([aeiouv]*)(\D*)(\d?)', word)
    init, vowel, final, tone = mo.groups()
    # do nothing if no vowel or tone
    if vowel == '' or tone == '':
        return orig
    tone = int(tone)
    if tone > 5:
        # accenttable only has entries for tones 0-5; pass anything else
        # through untouched instead of raising IndexError
        return orig
    if len(vowel) == 1 or vowel[-2] in 'iuv':
        # a lone vowel, or one following a medial i/u/ü, carries the mark
        pos = len(vowel) - 1
    else:
        # otherwise the mark goes on the second-to-last vowel
        pos = len(vowel) - 2
    vowel = vowel[:pos] + accenttable[vowel[pos]][tone] + vowel[pos + 1:]
    # any 'v' that did not receive the mark is still a placeholder for ü
    return init + vowel.replace('v', u'ü') + final
|
|
|
|
##########################################################################
|
|
|
|
# Parse Unihan.txt into {code point: {field name: value}}, keeping only the
# kMandarin, kCantonese and kGradeLevel fields, then bulk-insert the result
# into the unihan table and commit.
kanji = {}

import codecs

unihan_file = codecs.open("Unihan.txt", encoding="utf-8")
try:
    for line in unihan_file:
        try:
            (u, f, v) = line.strip().split("\t")
        except ValueError:
            # not a three-column "code<TAB>field<TAB>value" line
            continue
        if not u.startswith("U+"):
            continue
        n = int(u.replace("U+", ""), 16)
        # every code point seen gets an entry, even if none of its fields
        # are ones we keep; empty entries are filtered out below
        entry = kanji.setdefault(n, {})
        if f == "kMandarin":
            # each whitespace-separated reading is converted on its own
            entry['mandarin'] = " ".join([convert(w) for w in v.split()])
        elif f == "kCantonese":
            entry['cantonese'] = v
        elif f == "kGradeLevel":
            entry['grade'] = int(v)
finally:
    # close the input file even if a line fails to parse
    unihan_file.close()

# One parameter dict per code point that has at least one truthy field.
# Named 'rows' so the builtin dict type is not shadowed.
rows = [{'id': k,
         'mandarin': v.get('mandarin'),
         'cantonese': v.get('cantonese'),
         'grade': v.get('grade')}
        for (k, v) in kanji.items()
        if v.get('mandarin') or v.get('cantonese') or v.get('grade')]

# Skip the INSERT entirely when nothing was parsed, rather than executing
# the statement with an empty parameter list.
if rows:
    s.execute(unihanTable.insert(), rows)

s.commit()
|