Better handling of encodings. User is asked only once, if possible.

This commit is contained in:
Mattias Põldaru 2012-06-25 09:44:11 +03:00
parent 75ae9065d0
commit 25dc4fe36c

View File

@ -473,15 +473,16 @@ class StripRtf():
u'fcharset204': u'cp1251', u'fcharset204': u'cp1251',
u'fcharset222': u'cp874', u'fcharset222': u'cp874',
u'fcharset238': u'cp1250'} u'fcharset238': u'cp1250'}
# Once the user has been asked for an encoding, it is reused from then on.
user_encoding = []
def strip_rtf(self, text, default_encoding=None): def strip_rtf(self, text, default_encoding=None):
self.default_encoding = default_encoding
# Current font is the font tag we last met. # Current font is the font tag we last met.
font = u'' font = u''
# Character encoding is defined inside fonttable. # Character encoding is defined inside fonttable.
# font_table could contain eg u'0': u'cp1252' # font_table could contain eg u'0': u'cp1252'
font_table = {u'': default_encoding} font_table = {u'': default_encoding}
# Whether we are inside the font table.
inside_font_table = False
# Stack of things to keep track of when entering/leaving groups. # Stack of things to keep track of when entering/leaving groups.
stack = [] stack = []
# Whether this group (and all inside it) are "ignorable". # Whether this group (and all inside it) are "ignorable".
@ -498,10 +499,10 @@ class StripRtf():
curskip = 0 curskip = 0
if brace == u'{': if brace == u'{':
# Push state # Push state
stack.append((ucskip, ignorable, font, inside_font_table)) stack.append((ucskip, ignorable, font))
elif brace == u'}': elif brace == u'}':
# Pop state # Pop state
ucskip, ignorable, font, inside_font_table = stack.pop() ucskip, ignorable, font = stack.pop()
# \x (not a letter) # \x (not a letter)
elif char: elif char:
curskip = 0 curskip = 0
@ -533,29 +534,19 @@ class StripRtf():
ignorable = True ignorable = True
elif word == u'f': elif word == u'f':
font = arg font = arg
if not inside_font_table:
if arg in font_table.keys():
encoding = font_table[arg]
else:
encoding = default_encoding
elif word == u'ansicpg': elif word == u'ansicpg':
if font == u'': font_table[font] = 'cp' + arg
if inside_font_table or font == u'':
font_table[font] = 'cp' + arg
elif word == u'fcharset': elif word == u'fcharset':
charset_reference = word + arg charset_reference = word + arg
if charset_reference in self.CHARSET_MAPPING: if charset_reference in self.CHARSET_MAPPING:
charset = self.CHARSET_MAPPING[charset_reference] charset = self.CHARSET_MAPPING[charset_reference]
if not charset:
charset = default_encoding
else: else:
charset = None
log.error(u"Charset '%s' not in CHARSET_MAPPING " log.error(u"Charset '%s' not in CHARSET_MAPPING "
u"dictionary in " u"dictionary in "
u"openlp/plugins/songs/lib/__init__.py" u"openlp/plugins/songs/lib/__init__.py"
% charset_reference) % charset_reference)
charset = default_encoding if font not in font_table:
if font == u'':
if inside_font_table or font == u'':
font_table[font] = charset font_table[font] = charset
# \'xx # \'xx
elif hex: elif hex:
@ -563,14 +554,13 @@ class StripRtf():
curskip -= 1 curskip -= 1
elif not ignorable: elif not ignorable:
charcode = int(hex, 16) charcode = int(hex, 16)
encoding = self.get_encoding(font, font_table)
while True: while True:
try: try:
out.append(chr(charcode).decode(encoding)) out.append(chr(charcode).decode(encoding))
except UnicodeDecodeError: except UnicodeDecodeError:
encoding = \ encoding = self.get_encoding(font, font_table,
retrieve_windows_encoding(default_encoding) failed=True)
if font:
font_table[font] = encoding
else: else:
break break
elif tchar: elif tchar:
@ -580,6 +570,23 @@ class StripRtf():
out.append(tchar) out.append(tchar)
return u''.join(out) return u''.join(out)
def get_encoding(self, font, font_table, failed=False):
    """
    Choose the encoding used to decode characters of ``font``.

    Candidates are tried in this order: the entry for ``font`` in
    ``font_table``, the encoding most recently remembered in
    ``self.user_encoding``, then ``self.default_encoding``. The user is
    asked (through ``retrieve_windows_encoding``) only when none of these
    yields an encoding, or when the previous attempt failed and there is
    no untried remembered choice left to fall back on.

    ``font``
        Key of the currently active font in ``font_table``.

    ``font_table``
        Dictionary mapping fonts to encodings. The chosen encoding is
        stored back under ``font``.

    ``failed``
        True when decoding with the previously returned encoding failed.

    Returns the chosen encoding, which is also written to
    ``font_table[font]``.
    """
    encoding = font_table.get(font)
    # The most recent answer the user gave, if any. ``user_encoding`` is a
    # list, so it must never be compared with or returned as an encoding.
    remembered = self.user_encoding[-1] if self.user_encoding else None
    if not encoding:
        encoding = remembered or self.default_encoding
    ask_user = not encoding
    if failed and encoding:
        if remembered and remembered != encoding:
            # The failing encoding was not the user's choice: try that next.
            encoding = remembered
        else:
            # The user's own choice failed (or there is none): ask again.
            ask_user = True
    if ask_user:
        encoding = retrieve_windows_encoding(self.default_encoding)
        # Only remember real answers; an empty result (presumably a
        # cancelled dialog -- verify against retrieve_windows_encoding)
        # must not become the "last user choice".
        if encoding and encoding not in self.user_encoding:
            self.user_encoding.append(encoding)
    font_table[font] = encoding
    return encoding
from xml import OpenLyrics, SongXML from xml import OpenLyrics, SongXML
from songstab import SongsTab from songstab import SongsTab
from mediaitem import SongMediaItem from mediaitem import SongMediaItem