Fix strip_rtf to handle CJK encodings

This commit is contained in:
Jeffrey S. Smith 2013-09-07 22:50:51 -05:00
parent 880a548eb8
commit 53ac150337
2 changed files with 99 additions and 50 deletions

View File

@ -46,7 +46,7 @@ log = logging.getLogger(__name__)
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile('[\'`ʻ]', re.UNICODE) APOSTROPHE = re.compile('[\'`ʻ]', re.UNICODE)
PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
# RTF control words which specify a "destination" to be ignored. # RTF control words which specify a "destination" to be ignored.
DESTINATIONS = frozenset(( DESTINATIONS = frozenset((
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', 'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@ -57,8 +57,8 @@ DESTINATIONS = frozenset((
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm', 'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field', 'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname', 'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr', 'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
'generator', 'gridtbl', 'header', 'headerf', 'headerl', 'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@ -106,6 +106,11 @@ DESTINATIONS = frozenset((
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen')) 'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
# Translation of some special characters. # Translation of some special characters.
SPECIAL_CHARS = { SPECIAL_CHARS = {
'\n': '\n',
'\r': '\n',
'~': '\u00A0',
'-': '\u00AD',
'_': '\u2011',
'par': '\n', 'par': '\n',
'sect': '\n\n', 'sect': '\n\n',
# Required page and column break. # Required page and column break.
@ -132,16 +137,19 @@ SPECIAL_CHARS = {
'zwj': '\u200D', 'zwj': '\u200D',
'zwnj': '\u200C'} 'zwnj': '\u200C'}
CHARSET_MAPPING = { CHARSET_MAPPING = {
'fcharset0': 'cp1252', '0': 'cp1252',
'fcharset161': 'cp1253', '128': 'cp932',
'fcharset162': 'cp1254', '129': 'cp949',
'fcharset163': 'cp1258', '134': 'cp936',
'fcharset177': 'cp1255', '161': 'cp1253',
'fcharset178': 'cp1256', '162': 'cp1254',
'fcharset186': 'cp1257', '163': 'cp1258',
'fcharset204': 'cp1251', '177': 'cp1255',
'fcharset222': 'cp874', '178': 'cp1256',
'fcharset238': 'cp1250'} '186': 'cp1257',
'204': 'cp1251',
'222': 'cp874',
'238': 'cp1250'}
class VerseType(object): class VerseType(object):
@ -351,7 +359,7 @@ def retrieve_windows_encoding(recommendation=None):
if recommendation == encodings[index][0]: if recommendation == encodings[index][0]:
recommended_index = index recommended_index = index
break break
if recommended_index > 0: if recommended_index > -1:
choice = QtGui.QInputDialog.getItem(None, choice = QtGui.QInputDialog.getItem(None,
translate('SongsPlugin', 'Character Encoding'), translate('SongsPlugin', 'Character Encoding'),
translate('SongsPlugin', 'The codepage setting is responsible\n' translate('SongsPlugin', 'The codepage setting is responsible\n'
@ -365,7 +373,7 @@ def retrieve_windows_encoding(recommendation=None):
[pair[1] for pair in encodings], 0, False) [pair[1] for pair in encodings], 0, False)
if not choice[1]: if not choice[1]:
return None return None
return filter(lambda item: item[1] == choice[0], encodings)[0][0] return next(filter(lambda item: item[1] == choice[0], encodings))[0]
def clean_string(string): def clean_string(string):
@ -521,43 +529,59 @@ def strip_rtf(text, default_encoding=None):
curskip = 0 curskip = 0
# Output buffer. # Output buffer.
out = [] out = []
# Encoded buffer.
ebytes = bytearray()
for match in PATTERN.finditer(text): for match in PATTERN.finditer(text):
word, arg, hex, char, brace, tchar = match.groups() iinu, word, arg, hex, char, brace, tchar = match.groups()
# \x (non-alpha character)
if char:
if char in '\\{}':
tchar = char
else:
word = char
# Flush encoded buffer to output buffer
if ebytes and not hex and not tchar:
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
dbytes = ebytes.decode(encoding)
# Code 5C is a peculiar case with Windows Codepage 932
if encoding == 'cp932' and '\\' in dbytes:
dbytes = dbytes.replace('\\', '\u00A5')
out.append(dbytes)
ebytes.clear()
except UnicodeDecodeError:
failed = True
else:
break
# {}
if brace: if brace:
curskip = 0 curskip = 0
if brace == '{': if brace == '{':
# Push state # Push state
stack.append((ucskip, ignorable, font)) stack.append((ucskip, ignorable, font))
elif brace == '}': elif brace == '}' and len(stack) > 0:
# Pop state # Pop state
ucskip, ignorable, font = stack.pop() ucskip, ignorable, font = stack.pop()
# \x (not a letter)
elif char:
curskip = 0
if char == '~' and not ignorable:
out.append('\xA0')
elif char in '{}\\' and not ignorable:
out.append(char)
elif char == '-' and not ignorable:
out.append('\u00AD')
elif char == '_' and not ignorable:
out.append('\u2011')
elif char == '*':
ignorable = True
# \command # \command
elif word: elif word:
curskip = 0 curskip = 0
if word in DESTINATIONS: if word in DESTINATIONS:
ignorable = True ignorable = True
elif word in SPECIAL_CHARS: elif word in SPECIAL_CHARS:
out.append(SPECIAL_CHARS[word]) if not ignorable:
out.append(SPECIAL_CHARS[word])
elif word == 'uc': elif word == 'uc':
ucskip = int(arg) ucskip = int(arg)
elif word == ' ': elif word == 'u':
c = int(arg) c = int(arg)
if c < 0: if c < 0:
c += 0x10000 c += 0x10000
out.append(chr(c)) if not ignorable:
out.append(chr(c))
curskip = ucskip curskip = ucskip
elif word == 'fonttbl': elif word == 'fonttbl':
ignorable = True ignorable = True
@ -565,31 +589,24 @@ def strip_rtf(text, default_encoding=None):
font = arg font = arg
elif word == 'ansicpg': elif word == 'ansicpg':
font_table[font] = 'cp' + arg font_table[font] = 'cp' + arg
elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING: elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
# \ansicpg overrides \fcharset, if present. font_table[font] = CHARSET_MAPPING[arg]
font_table[font] = CHARSET_MAPPING[word + arg] elif word == 'fldrslt':
pass
# \* 'Ignore if not understood' marker
elif iinu:
ignorable = True
# \'xx # \'xx
elif hex: elif hex:
if curskip > 0: if curskip > 0:
curskip -= 1 curskip -= 1
elif not ignorable: elif not ignorable:
charcode = int(hex, 16) ebytes.append(int(hex, 16))
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
out.append(chr(charcode).decode(encoding))
except UnicodeDecodeError:
failed = True
else:
break
elif tchar: elif tchar:
if curskip > 0: if curskip > 0:
curskip -= 1 curskip -= 1
elif not ignorable: elif not ignorable:
out.append(tchar) ebytes += tchar.encode()
text = ''.join(out) text = ''.join(out)
return text, default_encoding return text, default_encoding

View File

@ -6,7 +6,7 @@ from unittest import TestCase
from mock import patch, MagicMock from mock import patch, MagicMock
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
@ -215,6 +215,38 @@ class TestLib(TestCase):
# THEN: The maximum length should be returned. # THEN: The maximum length should be returned.
assert result == 10, 'The length should be 10.' assert result == 10, 'The length should be 10.'
def strip_rtf_charsets_test(self):
"""
Test that strip_rtf() decodes hex-escaped bytes correctly for each supported fcharset value.

Every row of the table is a tuple of (fcharset number, RTF body fragment, expected plain text). The fragment
is wrapped in a minimal RTF document whose font table declares font f0 with that fcharset, and only the
stripped text returned by strip_rtf() is compared against the expectation.
"""
test_charset_table = [
('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
# The charset 128 row also pairs lead bytes with an escaped backslash and escaped braces (\\, \} and \{),
# and contains a standalone escaped backslash that is expected to come out as the yen sign.
('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
'\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'),
('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
]
# GIVEN: Each character set number together with its RTF input and expected text
for charset, input, exp_result in test_charset_table:
# WHEN: We call strip_rtf on the input RTF, wrapped in a document that selects the charset for font f0
# (the encoding returned as the second element is not checked by this test)
result, result_enc = strip_rtf(
'{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
'{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
# THEN: The stripped text matches the expected result
assert result == exp_result, 'The result should be %s' % exp_result
class TestVerseType(TestCase): class TestVerseType(TestCase):
""" """