always keep as unicode

This commit is contained in:
Damien Elmes 2008-11-21 23:41:14 +09:00
parent fbcdae4c51
commit 8f983d5bcc
2 changed files with 12 additions and 22 deletions

View file

@ -40,16 +40,6 @@ except ImportError:
from sqlalchemy import Unicode from sqlalchemy import Unicode
UnicodeText = Unicode UnicodeText = Unicode
# dump location of non-unicode string
from sqlalchemy import util
if getattr(util, 'warn', None):
import traceback
oldWarn = util.warn
def newWarn(*args, **kwargs):
traceback.print_stack()
oldWarn(*args, **kwargs)
util.warn = newWarn
# shared metadata # shared metadata
metadata = MetaData() metadata = MetaData()

View file

@ -146,25 +146,25 @@ def stripHTML(s):
def tidyHTML(html):
    """Remove cruft like body tags and return just the important part.

    The input is expected to be the rich-text HTML produced by a Qt
    editor. The clean-up runs as a fixed sequence of regex substitutions:

    1. drop newlines and keep only what sits between ``<body ...>`` and
       ``</body></html>`` (input without that wrapper passes through),
    2. strip the default Qt margin/indent declarations and the
       ``-qt-paragraph-type:empty;`` marker,
    3. collapse runs of spaces, tidy ``style="..."`` attributes and drop
       the ones left empty,
    4. turn ``<p>`` elements into ``<span>`` and/or ``<br>`` and remove a
       single trailing ``<br>``,
    5. trim leading and trailing spaces.

    All patterns and replacements are unicode literals so the text is
    always kept as unicode.

    :param html: the HTML text to clean up.
    :return: the cleaned-up HTML fragment.
    """
    # contents of body - no head or html tags. newlines are removed first
    # because "." does not match them, so the pattern could otherwise not
    # span a multi-line document.
    html = re.sub(u".*<body.*?>(.*)</body></html>",
                  u"\\1", html.replace(u"\n", u""))
    # The order of this table matters: each rule relies on the result of
    # the previous ones (e.g. empty style attributes only appear once the
    # Qt formatting has been stripped and spaces collapsed).
    # Backslashes are doubled instead of using raw strings: a bare "\d" in
    # a normal literal is an invalid escape sequence on Python 3, and the
    # ur"" prefix does not exist there.
    substitutions = (
        # strip superfluous Qt formatting
        (u"margin-top:\\d+px; margin-bottom:\\d+px; margin-left:\\d+px; "
         u"margin-right:\\d+px; -qt-block-indent:0; "
         u"text-indent:0px;", u""),
        (u"-qt-paragraph-type:empty;", u""),
        # collapse multiple spaces into one
        (u" +", u" "),
        # strip leading space in style statements, and remove if no contents
        (u'style=" ', u'style="'),
        (u' style=""', u""),
        # convert P tags into SPAN and/or BR
        (u'<p( style=.+?)>(.*?)</p>', u'<span\\1>\\2</span><br>'),
        (u'<p>(.*?)</p>', u'\\1<br>'),
        (u'<br>$', u''),
        # remove leading or trailing whitespace
        (u'^ +', u''),
        (u' +$', u''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    return html
def genID(static=[]): def genID(static=[]):