Fix strip_rtf to handle CJK encodings

This commit is contained in:
Jeffrey S. Smith 2013-09-07 22:50:51 -05:00
parent 880a548eb8
commit 53ac150337
2 changed files with 99 additions and 50 deletions

View File

@ -46,7 +46,7 @@ log = logging.getLogger(__name__)
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile('[\'`ʻ]', re.UNICODE)
PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
# RTF control words which specify a "destination" to be ignored.
DESTINATIONS = frozenset((
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@ -57,8 +57,8 @@ DESTINATIONS = frozenset((
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@ -106,6 +106,11 @@ DESTINATIONS = frozenset((
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
# Translation of some special characters.
SPECIAL_CHARS = {
'\n': '\n',
'\r': '\n',
'~': '\u00A0',
'-': '\u00AD',
'_': '\u2011',
'par': '\n',
'sect': '\n\n',
# Required page and column break.
@ -132,16 +137,19 @@ SPECIAL_CHARS = {
'zwj': '\u200D',
'zwnj': '\u200C'}
CHARSET_MAPPING = {
'fcharset0': 'cp1252',
'fcharset161': 'cp1253',
'fcharset162': 'cp1254',
'fcharset163': 'cp1258',
'fcharset177': 'cp1255',
'fcharset178': 'cp1256',
'fcharset186': 'cp1257',
'fcharset204': 'cp1251',
'fcharset222': 'cp874',
'fcharset238': 'cp1250'}
'0': 'cp1252',
'128': 'cp932',
'129': 'cp949',
'134': 'cp936',
'161': 'cp1253',
'162': 'cp1254',
'163': 'cp1258',
'177': 'cp1255',
'178': 'cp1256',
'186': 'cp1257',
'204': 'cp1251',
'222': 'cp874',
'238': 'cp1250'}
class VerseType(object):
@ -351,7 +359,7 @@ def retrieve_windows_encoding(recommendation=None):
if recommendation == encodings[index][0]:
recommended_index = index
break
if recommended_index > 0:
if recommended_index > -1:
choice = QtGui.QInputDialog.getItem(None,
translate('SongsPlugin', 'Character Encoding'),
translate('SongsPlugin', 'The codepage setting is responsible\n'
@ -365,7 +373,7 @@ def retrieve_windows_encoding(recommendation=None):
[pair[1] for pair in encodings], 0, False)
if not choice[1]:
return None
return filter(lambda item: item[1] == choice[0], encodings)[0][0]
return next(filter(lambda item: item[1] == choice[0], encodings))[0]
def clean_string(string):
@ -521,43 +529,59 @@ def strip_rtf(text, default_encoding=None):
curskip = 0
# Output buffer.
out = []
# Encoded buffer.
ebytes = bytearray()
for match in PATTERN.finditer(text):
word, arg, hex, char, brace, tchar = match.groups()
iinu, word, arg, hex, char, brace, tchar = match.groups()
# \x (non-alpha character)
if char:
if char in '\\{}':
tchar = char
else:
word = char
# Flush encoded buffer to output buffer
if ebytes and not hex and not tchar:
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
dbytes = ebytes.decode(encoding)
# Code 5C is a peculiar case with Windows Codepage 932
if encoding == 'cp932' and '\\' in dbytes:
dbytes = dbytes.replace('\\', '\u00A5')
out.append(dbytes)
ebytes.clear()
except UnicodeDecodeError:
failed = True
else:
break
# {}
if brace:
curskip = 0
if brace == '{':
# Push state
stack.append((ucskip, ignorable, font))
elif brace == '}':
elif brace == '}' and len(stack) > 0:
# Pop state
ucskip, ignorable, font = stack.pop()
# \x (not a letter)
elif char:
curskip = 0
if char == '~' and not ignorable:
out.append('\xA0')
elif char in '{}\\' and not ignorable:
out.append(char)
elif char == '-' and not ignorable:
out.append('\u00AD')
elif char == '_' and not ignorable:
out.append('\u2011')
elif char == '*':
ignorable = True
# \command
elif word:
curskip = 0
if word in DESTINATIONS:
ignorable = True
elif word in SPECIAL_CHARS:
out.append(SPECIAL_CHARS[word])
if not ignorable:
out.append(SPECIAL_CHARS[word])
elif word == 'uc':
ucskip = int(arg)
elif word == ' ':
elif word == 'u':
c = int(arg)
if c < 0:
c += 0x10000
out.append(chr(c))
if not ignorable:
out.append(chr(c))
curskip = ucskip
elif word == 'fonttbl':
ignorable = True
@ -565,31 +589,24 @@ def strip_rtf(text, default_encoding=None):
font = arg
elif word == 'ansicpg':
font_table[font] = 'cp' + arg
elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
# \ansicpg overrides \fcharset, if present.
font_table[font] = CHARSET_MAPPING[word + arg]
elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
font_table[font] = CHARSET_MAPPING[arg]
elif word == 'fldrslt':
pass
# \* 'Ignore if not understood' marker
elif iinu:
ignorable = True
# \'xx
elif hex:
if curskip > 0:
curskip -= 1
elif not ignorable:
charcode = int(hex, 16)
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
out.append(chr(charcode).decode(encoding))
except UnicodeDecodeError:
failed = True
else:
break
ebytes.append(int(hex, 16))
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
out.append(tchar)
ebytes += tchar.encode()
text = ''.join(out)
return text, default_encoding

View File

@ -6,7 +6,7 @@ from unittest import TestCase
from mock import patch, MagicMock
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
@ -215,6 +215,38 @@ class TestLib(TestCase):
# THEN: The maximum length should be returned.
assert result == 10, 'The length should be 10.'
def strip_rtf_charsets_test(self):
"""
Test that the strip_rtf() method properly decodes the supported charsets.
"""
test_charset_table = [
('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
'\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'),
('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
]
# GIVEN: For each character set and input
for charset, input, exp_result in test_charset_table:
# WHEN: We call strip_rtf on the input RTF
result, result_enc = strip_rtf(
'{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
'{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
# THEN: The stripped text matches thed expected result
assert result == exp_result, 'The result should be %s' % exp_result
class TestVerseType(TestCase):
"""