Anki/anki/features/chinese/save_unihan.py

# -*- coding: utf-8 -*-
# read unihan.txt and save it as a db

import psyco; psyco.full()

from sqlalchemy import (Table, Integer, Float, Unicode, Column, MetaData,
                        ForeignKey, Boolean, String, Date, UniqueConstraint,
                        UnicodeText)
from sqlalchemy import (create_engine)
from sqlalchemy.orm import mapper, sessionmaker, relation, backref, \
     object_session as _object_session
from sqlalchemy.sql import select, text, and_
from sqlalchemy.exceptions import DBAPIError
import re

metadata = MetaData()

unihanTable = Table(
    'unihan', metadata,
    Column("id", Integer, primary_key=True),
    Column("mandarin", UnicodeText),
    Column("cantonese", UnicodeText),
    Column("grade", Integer),
    )

engine = create_engine("sqlite:///unihan.db",
                       echo=False, strategy='threadlocal')
session = sessionmaker(bind=engine,
                       autoflush=False,
                       transactional=True)
metadata.create_all(engine)

s = session()

# Convert codes to accents
##########################################################################
# code from Donald Chai

accenttable = {
    'a' : [u'a', u'ā', u'á', u'ǎ', u'à', u'a'],
    'e' : [u'e', u'ē', u'é', u'ě', u'è', u'e'],
    'i' : [u'i', u'ī', u'í', u'ǐ', u'ì', u'i'],
    'o' : [u'o', u'ō', u'ó', u'ǒ', u'ò', u'o'],
    'u' : [u'u', u'ū', u'ú', u'ǔ', u'ù', u'u'],
    'v' : [u'ü', u'ǖ', u'ǘ', u'ǚ', u'ǜ', u'ü'],
}
def convert(word):
    '''Converts a pinyin word to unicode'''
    word = word.lower()
    orig = word
    # convert ü to v for now to make life easier
    word = re.sub(u'\xfc|\xc3\xbc', 'v', word)
    # extract fields
    mo = re.match('([qwrtypsdfghjklzxcbnm]*)([aeiouv]*)(\D*)(\d?)', word)
    init  = mo.group(1)
    vowel = mo.group(2)
    final = mo.group(3)
    tone  = mo.group(4)
    # do nothing if no vowel or tone
    if vowel=='' or tone=='':
        return orig
    tone  = int(tone)
    if len(vowel)==1:
        vowel = accenttable[vowel][tone]
    elif vowel[-2]=='i' or vowel[-2]=='u':
        # put over last
        vowel = vowel[:-1] + accenttable[vowel[-1]][tone]
    else:
        # put over second to last
        vowel = vowel[:-2] + accenttable[vowel[-2]][tone] + vowel[-1]
    return init+vowel+final

##########################################################################

kanji = {}
import codecs
for line in codecs.open("Unihan.txt", encoding="utf-8"):
    try:
        (u, f, v) = line.strip().split("\t")
    except ValueError:
        continue
    if not u.startswith("U+"):
        continue
    n = int(u.replace("U+",""), 16)
    if not n in kanji:
        kanji[n] = {}
    if f == "kMandarin":
        kanji[n]['mandarin'] = " ".join([convert(w) for w in v.split()])
    elif f == "kCantonese":
        kanji[n]['cantonese'] = v
    elif f == "kGradeLevel":
        kanji[n]['grade'] = int(v)

dict = [{'id': k,
         'mandarin': v.get('mandarin'),
         'cantonese': v.get('cantonese'),
         'grade': v.get('grade') } for (k,v) in kanji.items()
        if v.get('mandarin') or v.get('cantonese') or v.get('grade')]
s.execute(unihanTable.insert(), dict)

s.commit()