Fix strip_rtf to handle CJK encodings

bzr-revno: 2297
This commit is contained in:
Jeffrey Smith 2013-09-13 17:13:12 +02:00 committed by Andreas Preikschat
commit 2511332a9e
2 changed files with 105 additions and 50 deletions

View File

@ -46,7 +46,13 @@ log = logging.getLogger(__name__)
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile('[\'`ʻ]', re.UNICODE)
PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
# PATTERN will look for the next occurence of one of these symbols:
# \controlword - optionally preceded by \*, optionally followed by a number
# \'## - where ## is a pair of hex digits, representing a single character
# \# - where # is a single non-alpha character, representing a special symbol
# { or } - marking the beginning/end of a group
# a run of characters without any \ { } or end-of-line
PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
# RTF control words which specify a "destination" to be ignored.
DESTINATIONS = frozenset((
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@ -57,8 +63,8 @@ DESTINATIONS = frozenset((
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@ -106,6 +112,11 @@ DESTINATIONS = frozenset((
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
# Translation of some special characters.
SPECIAL_CHARS = {
'\n': '\n',
'\r': '\n',
'~': '\u00A0',
'-': '\u00AD',
'_': '\u2011',
'par': '\n',
'sect': '\n\n',
# Required page and column break.
@ -132,16 +143,19 @@ SPECIAL_CHARS = {
'zwj': '\u200D',
'zwnj': '\u200C'}
CHARSET_MAPPING = {
'fcharset0': 'cp1252',
'fcharset161': 'cp1253',
'fcharset162': 'cp1254',
'fcharset163': 'cp1258',
'fcharset177': 'cp1255',
'fcharset178': 'cp1256',
'fcharset186': 'cp1257',
'fcharset204': 'cp1251',
'fcharset222': 'cp874',
'fcharset238': 'cp1250'}
'0': 'cp1252',
'128': 'cp932',
'129': 'cp949',
'134': 'cp936',
'161': 'cp1253',
'162': 'cp1254',
'163': 'cp1258',
'177': 'cp1255',
'178': 'cp1256',
'186': 'cp1257',
'204': 'cp1251',
'222': 'cp874',
'238': 'cp1250'}
class VerseType(object):
@ -351,7 +365,7 @@ def retrieve_windows_encoding(recommendation=None):
if recommendation == encodings[index][0]:
recommended_index = index
break
if recommended_index > 0:
if recommended_index > -1:
choice = QtGui.QInputDialog.getItem(None,
translate('SongsPlugin', 'Character Encoding'),
translate('SongsPlugin', 'The codepage setting is responsible\n'
@ -365,7 +379,7 @@ def retrieve_windows_encoding(recommendation=None):
[pair[1] for pair in encodings], 0, False)
if not choice[1]:
return None
return filter(lambda item: item[1] == choice[0], encodings)[0][0]
return next(filter(lambda item: item[1] == choice[0], encodings))[0]
def clean_string(string):
@ -521,43 +535,59 @@ def strip_rtf(text, default_encoding=None):
curskip = 0
# Output buffer.
out = []
# Encoded buffer.
ebytes = bytearray()
for match in PATTERN.finditer(text):
word, arg, hex, char, brace, tchar = match.groups()
iinu, word, arg, hex, char, brace, tchar = match.groups()
# \x (non-alpha character)
if char:
if char in '\\{}':
tchar = char
else:
word = char
# Flush encoded buffer to output buffer
if ebytes and not hex and not tchar:
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
dbytes = ebytes.decode(encoding)
# Code 5C is a peculiar case with Windows Codepage 932
if encoding == 'cp932' and '\\' in dbytes:
dbytes = dbytes.replace('\\', '\u00A5')
out.append(dbytes)
ebytes.clear()
except UnicodeDecodeError:
failed = True
else:
break
# {}
if brace:
curskip = 0
if brace == '{':
# Push state
stack.append((ucskip, ignorable, font))
elif brace == '}':
elif brace == '}' and len(stack) > 0:
# Pop state
ucskip, ignorable, font = stack.pop()
# \x (not a letter)
elif char:
curskip = 0
if char == '~' and not ignorable:
out.append('\xA0')
elif char in '{}\\' and not ignorable:
out.append(char)
elif char == '-' and not ignorable:
out.append('\u00AD')
elif char == '_' and not ignorable:
out.append('\u2011')
elif char == '*':
ignorable = True
# \command
elif word:
curskip = 0
if word in DESTINATIONS:
ignorable = True
elif word in SPECIAL_CHARS:
out.append(SPECIAL_CHARS[word])
if not ignorable:
out.append(SPECIAL_CHARS[word])
elif word == 'uc':
ucskip = int(arg)
elif word == ' ':
elif word == 'u':
c = int(arg)
if c < 0:
c += 0x10000
out.append(chr(c))
if not ignorable:
out.append(chr(c))
curskip = ucskip
elif word == 'fonttbl':
ignorable = True
@ -565,31 +595,24 @@ def strip_rtf(text, default_encoding=None):
font = arg
elif word == 'ansicpg':
font_table[font] = 'cp' + arg
elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
# \ansicpg overrides \fcharset, if present.
font_table[font] = CHARSET_MAPPING[word + arg]
elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
font_table[font] = CHARSET_MAPPING[arg]
elif word == 'fldrslt':
pass
# \* 'Ignore if not understood' marker
elif iinu:
ignorable = True
# \'xx
elif hex:
if curskip > 0:
curskip -= 1
elif not ignorable:
charcode = int(hex, 16)
failed = False
while True:
try:
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
if not encoding:
return None
out.append(chr(charcode).decode(encoding))
except UnicodeDecodeError:
failed = True
else:
break
ebytes.append(int(hex, 16))
elif tchar:
if curskip > 0:
curskip -= 1
elif not ignorable:
out.append(tchar)
ebytes += tchar.encode()
text = ''.join(out)
return text, default_encoding

View File

@ -6,7 +6,7 @@ from unittest import TestCase
from mock import patch, MagicMock
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
@ -215,6 +215,38 @@ class TestLib(TestCase):
# THEN: The maximum length should be returned.
assert result == 10, 'The length should be 10.'
def strip_rtf_charsets_test(self):
"""
Test that the strip_rtf() method properly decodes the supported charsets.
"""
test_charset_table = [
('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
'\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'),
('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
]
# GIVEN: For each character set and input
for charset, input, exp_result in test_charset_table:
# WHEN: We call strip_rtf on the input RTF
result, result_enc = strip_rtf(
'{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
'{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
# THEN: The stripped text matches thed expected result
assert result == exp_result, 'The result should be %s' % exp_result
class TestVerseType(TestCase):
"""