Fix strip_rtf to handle CJK encodings

bzr-revno: 2297
This commit is contained in:
Jeffrey Smith 2013-09-13 17:13:12 +02:00 committed by Andreas Preikschat
commit 2511332a9e
2 changed files with 105 additions and 50 deletions

View File

@ -46,7 +46,13 @@ log = logging.getLogger(__name__)
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile('[\'`ʻ]', re.UNICODE) APOSTROPHE = re.compile('[\'`ʻ]', re.UNICODE)
PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) # PATTERN will look for the next occurrence of one of these symbols:
# \controlword - optionally preceded by \*, optionally followed by a number
# \'## - where ## is a pair of hex digits, representing a single character
# \# - where # is a single non-alpha character, representing a special symbol
# { or } - marking the beginning/end of a group
# a run of characters without any \ { } or end-of-line
PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
# RTF control words which specify a "destination" to be ignored. # RTF control words which specify a "destination" to be ignored.
DESTINATIONS = frozenset(( DESTINATIONS = frozenset((
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', 'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@ -57,8 +63,8 @@ DESTINATIONS = frozenset((
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm', 'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field', 'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname', 'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr', 'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
'generator', 'gridtbl', 'header', 'headerf', 'headerl', 'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@ -106,6 +112,11 @@ DESTINATIONS = frozenset((
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen')) 'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
# Translation of some special characters. # Translation of some special characters.
SPECIAL_CHARS = { SPECIAL_CHARS = {
'\n': '\n',
'\r': '\n',
'~': '\u00A0',
'-': '\u00AD',
'_': '\u2011',
'par': '\n', 'par': '\n',
'sect': '\n\n', 'sect': '\n\n',
# Required page and column break. # Required page and column break.
@ -132,16 +143,19 @@ SPECIAL_CHARS = {
'zwj': '\u200D', 'zwj': '\u200D',
'zwnj': '\u200C'} 'zwnj': '\u200C'}
CHARSET_MAPPING = { CHARSET_MAPPING = {
'fcharset0': 'cp1252', '0': 'cp1252',
'fcharset161': 'cp1253', '128': 'cp932',
'fcharset162': 'cp1254', '129': 'cp949',
'fcharset163': 'cp1258', '134': 'cp936',
'fcharset177': 'cp1255', '161': 'cp1253',
'fcharset178': 'cp1256', '162': 'cp1254',
'fcharset186': 'cp1257', '163': 'cp1258',
'fcharset204': 'cp1251', '177': 'cp1255',
'fcharset222': 'cp874', '178': 'cp1256',
'fcharset238': 'cp1250'} '186': 'cp1257',
'204': 'cp1251',
'222': 'cp874',
'238': 'cp1250'}
class VerseType(object): class VerseType(object):
@ -351,7 +365,7 @@ def retrieve_windows_encoding(recommendation=None):
if recommendation == encodings[index][0]: if recommendation == encodings[index][0]:
recommended_index = index recommended_index = index
break break
if recommended_index > 0: if recommended_index > -1:
choice = QtGui.QInputDialog.getItem(None, choice = QtGui.QInputDialog.getItem(None,
translate('SongsPlugin', 'Character Encoding'), translate('SongsPlugin', 'Character Encoding'),
translate('SongsPlugin', 'The codepage setting is responsible\n' translate('SongsPlugin', 'The codepage setting is responsible\n'
@ -365,7 +379,7 @@ def retrieve_windows_encoding(recommendation=None):
[pair[1] for pair in encodings], 0, False) [pair[1] for pair in encodings], 0, False)
if not choice[1]: if not choice[1]:
return None return None
return filter(lambda item: item[1] == choice[0], encodings)[0][0] return next(filter(lambda item: item[1] == choice[0], encodings))[0]
def clean_string(string): def clean_string(string):
@ -521,43 +535,59 @@ def strip_rtf(text, default_encoding=None):
curskip = 0 curskip = 0
# Output buffer. # Output buffer.
out = [] out = []
# Encoded buffer.
ebytes = bytearray()
for match in PATTERN.finditer(text): for match in PATTERN.finditer(text):
word, arg, hex, char, brace, tchar = match.groups() iinu, word, arg, hex, char, brace, tchar = match.groups()
# \x (non-alpha character)
if char:
if char in '\\{}':
tchar = char
else:
word = char
# Flush encoded buffer to output buffer
if ebytes and not hex and not tchar:
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
dbytes = ebytes.decode(encoding)
# Code 5C is a peculiar case with Windows Codepage 932
if encoding == 'cp932' and '\\' in dbytes:
dbytes = dbytes.replace('\\', '\u00A5')
out.append(dbytes)
ebytes.clear()
except UnicodeDecodeError:
failed = True
else:
break
# {}
if brace: if brace:
curskip = 0 curskip = 0
if brace == '{': if brace == '{':
# Push state # Push state
stack.append((ucskip, ignorable, font)) stack.append((ucskip, ignorable, font))
elif brace == '}': elif brace == '}' and len(stack) > 0:
# Pop state # Pop state
ucskip, ignorable, font = stack.pop() ucskip, ignorable, font = stack.pop()
# \x (not a letter)
elif char:
curskip = 0
if char == '~' and not ignorable:
out.append('\xA0')
elif char in '{}\\' and not ignorable:
out.append(char)
elif char == '-' and not ignorable:
out.append('\u00AD')
elif char == '_' and not ignorable:
out.append('\u2011')
elif char == '*':
ignorable = True
# \command # \command
elif word: elif word:
curskip = 0 curskip = 0
if word in DESTINATIONS: if word in DESTINATIONS:
ignorable = True ignorable = True
elif word in SPECIAL_CHARS: elif word in SPECIAL_CHARS:
out.append(SPECIAL_CHARS[word]) if not ignorable:
out.append(SPECIAL_CHARS[word])
elif word == 'uc': elif word == 'uc':
ucskip = int(arg) ucskip = int(arg)
elif word == ' ': elif word == 'u':
c = int(arg) c = int(arg)
if c < 0: if c < 0:
c += 0x10000 c += 0x10000
out.append(chr(c)) if not ignorable:
out.append(chr(c))
curskip = ucskip curskip = ucskip
elif word == 'fonttbl': elif word == 'fonttbl':
ignorable = True ignorable = True
@ -565,31 +595,24 @@ def strip_rtf(text, default_encoding=None):
font = arg font = arg
elif word == 'ansicpg': elif word == 'ansicpg':
font_table[font] = 'cp' + arg font_table[font] = 'cp' + arg
elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING: elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
# \ansicpg overrides \fcharset, if present. font_table[font] = CHARSET_MAPPING[arg]
font_table[font] = CHARSET_MAPPING[word + arg] elif word == 'fldrslt':
pass
# \* 'Ignore if not understood' marker
elif iinu:
ignorable = True
# \'xx # \'xx
elif hex: elif hex:
if curskip > 0: if curskip > 0:
curskip -= 1 curskip -= 1
elif not ignorable: elif not ignorable:
charcode = int(hex, 16) ebytes.append(int(hex, 16))
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
out.append(chr(charcode).decode(encoding))
except UnicodeDecodeError:
failed = True
else:
break
elif tchar: elif tchar:
if curskip > 0: if curskip > 0:
curskip -= 1 curskip -= 1
elif not ignorable: elif not ignorable:
out.append(tchar) ebytes += tchar.encode()
text = ''.join(out) text = ''.join(out)
return text, default_encoding return text, default_encoding

View File

@ -6,7 +6,7 @@ from unittest import TestCase
from mock import patch, MagicMock from mock import patch, MagicMock
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
@ -215,6 +215,38 @@ class TestLib(TestCase):
# THEN: The maximum length should be returned. # THEN: The maximum length should be returned.
assert result == 10, 'The length should be 10.' assert result == 10, 'The length should be 10.'
def strip_rtf_charsets_test(self):
"""
Test that the strip_rtf() method properly decodes the supported charsets.
"""
test_charset_table = [
('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
'\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'),
('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
]
# GIVEN: For each character set and input
for charset, input, exp_result in test_charset_table:
# WHEN: We call strip_rtf on the input RTF
result, result_enc = strip_rtf(
'{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
'{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
# THEN: The stripped text matches the expected result
assert result == exp_result, 'The result should be %s' % exp_result
class TestVerseType(TestCase): class TestVerseType(TestCase):
""" """