forked from openlp/openlp
Fix strip_rtf to handle CJK encodings
bzr-revno: 2297
This commit is contained in:
commit
2511332a9e
@ -46,7 +46,13 @@ log = logging.getLogger(__name__)
|
||||
|
||||
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
|
||||
APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
|
||||
PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
|
||||
# PATTERN will look for the next occurrence of one of these symbols:
|
||||
# \controlword - optionally preceded by \*, optionally followed by a number
|
||||
# \'## - where ## is a pair of hex digits, representing a single character
|
||||
# \# - where # is a single non-alpha character, representing a special symbol
|
||||
# { or } - marking the beginning/end of a group
|
||||
# a run of characters without any \ { } or end-of-line
|
||||
PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
|
||||
# RTF control words which specify a "destination" to be ignored.
|
||||
DESTINATIONS = frozenset((
|
||||
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
|
||||
@ -57,8 +63,8 @@ DESTINATIONS = frozenset((
|
||||
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
|
||||
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
|
||||
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
|
||||
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
|
||||
'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
|
||||
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
|
||||
'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
|
||||
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
|
||||
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
|
||||
'generator', 'gridtbl', 'header', 'headerf', 'headerl',
|
||||
@ -106,6 +112,11 @@ DESTINATIONS = frozenset((
|
||||
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
|
||||
# Translation of some special characters.
|
||||
SPECIAL_CHARS = {
|
||||
'\n': '\n',
|
||||
'\r': '\n',
|
||||
'~': '\u00A0',
|
||||
'-': '\u00AD',
|
||||
'_': '\u2011',
|
||||
'par': '\n',
|
||||
'sect': '\n\n',
|
||||
# Required page and column break.
|
||||
@ -132,16 +143,19 @@ SPECIAL_CHARS = {
|
||||
'zwj': '\u200D',
|
||||
'zwnj': '\u200C'}
|
||||
CHARSET_MAPPING = {
|
||||
'fcharset0': 'cp1252',
|
||||
'fcharset161': 'cp1253',
|
||||
'fcharset162': 'cp1254',
|
||||
'fcharset163': 'cp1258',
|
||||
'fcharset177': 'cp1255',
|
||||
'fcharset178': 'cp1256',
|
||||
'fcharset186': 'cp1257',
|
||||
'fcharset204': 'cp1251',
|
||||
'fcharset222': 'cp874',
|
||||
'fcharset238': 'cp1250'}
|
||||
'0': 'cp1252',
|
||||
'128': 'cp932',
|
||||
'129': 'cp949',
|
||||
'134': 'cp936',
|
||||
'161': 'cp1253',
|
||||
'162': 'cp1254',
|
||||
'163': 'cp1258',
|
||||
'177': 'cp1255',
|
||||
'178': 'cp1256',
|
||||
'186': 'cp1257',
|
||||
'204': 'cp1251',
|
||||
'222': 'cp874',
|
||||
'238': 'cp1250'}
|
||||
|
||||
|
||||
class VerseType(object):
|
||||
@ -351,7 +365,7 @@ def retrieve_windows_encoding(recommendation=None):
|
||||
if recommendation == encodings[index][0]:
|
||||
recommended_index = index
|
||||
break
|
||||
if recommended_index > 0:
|
||||
if recommended_index > -1:
|
||||
choice = QtGui.QInputDialog.getItem(None,
|
||||
translate('SongsPlugin', 'Character Encoding'),
|
||||
translate('SongsPlugin', 'The codepage setting is responsible\n'
|
||||
@ -365,7 +379,7 @@ def retrieve_windows_encoding(recommendation=None):
|
||||
[pair[1] for pair in encodings], 0, False)
|
||||
if not choice[1]:
|
||||
return None
|
||||
return filter(lambda item: item[1] == choice[0], encodings)[0][0]
|
||||
return next(filter(lambda item: item[1] == choice[0], encodings))[0]
|
||||
|
||||
|
||||
def clean_string(string):
|
||||
@ -521,43 +535,59 @@ def strip_rtf(text, default_encoding=None):
|
||||
curskip = 0
|
||||
# Output buffer.
|
||||
out = []
|
||||
# Encoded buffer.
|
||||
ebytes = bytearray()
|
||||
for match in PATTERN.finditer(text):
|
||||
word, arg, hex, char, brace, tchar = match.groups()
|
||||
iinu, word, arg, hex, char, brace, tchar = match.groups()
|
||||
# \x (non-alpha character)
|
||||
if char:
|
||||
if char in '\\{}':
|
||||
tchar = char
|
||||
else:
|
||||
word = char
|
||||
# Flush encoded buffer to output buffer
|
||||
if ebytes and not hex and not tchar:
|
||||
failed = False
|
||||
while True:
|
||||
try:
|
||||
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
|
||||
if not encoding:
|
||||
return None
|
||||
dbytes = ebytes.decode(encoding)
|
||||
# Code 5C is a peculiar case with Windows Codepage 932
|
||||
if encoding == 'cp932' and '\\' in dbytes:
|
||||
dbytes = dbytes.replace('\\', '\u00A5')
|
||||
out.append(dbytes)
|
||||
ebytes.clear()
|
||||
except UnicodeDecodeError:
|
||||
failed = True
|
||||
else:
|
||||
break
|
||||
# {}
|
||||
if brace:
|
||||
curskip = 0
|
||||
if brace == '{':
|
||||
# Push state
|
||||
stack.append((ucskip, ignorable, font))
|
||||
elif brace == '}':
|
||||
elif brace == '}' and len(stack) > 0:
|
||||
# Pop state
|
||||
ucskip, ignorable, font = stack.pop()
|
||||
# \x (not a letter)
|
||||
elif char:
|
||||
curskip = 0
|
||||
if char == '~' and not ignorable:
|
||||
out.append('\xA0')
|
||||
elif char in '{}\\' and not ignorable:
|
||||
out.append(char)
|
||||
elif char == '-' and not ignorable:
|
||||
out.append('\u00AD')
|
||||
elif char == '_' and not ignorable:
|
||||
out.append('\u2011')
|
||||
elif char == '*':
|
||||
ignorable = True
|
||||
# \command
|
||||
elif word:
|
||||
curskip = 0
|
||||
if word in DESTINATIONS:
|
||||
ignorable = True
|
||||
elif word in SPECIAL_CHARS:
|
||||
out.append(SPECIAL_CHARS[word])
|
||||
if not ignorable:
|
||||
out.append(SPECIAL_CHARS[word])
|
||||
elif word == 'uc':
|
||||
ucskip = int(arg)
|
||||
elif word == ' ':
|
||||
elif word == 'u':
|
||||
c = int(arg)
|
||||
if c < 0:
|
||||
c += 0x10000
|
||||
out.append(chr(c))
|
||||
if not ignorable:
|
||||
out.append(chr(c))
|
||||
curskip = ucskip
|
||||
elif word == 'fonttbl':
|
||||
ignorable = True
|
||||
@ -565,31 +595,24 @@ def strip_rtf(text, default_encoding=None):
|
||||
font = arg
|
||||
elif word == 'ansicpg':
|
||||
font_table[font] = 'cp' + arg
|
||||
elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
|
||||
# \ansicpg overrides \fcharset, if present.
|
||||
font_table[font] = CHARSET_MAPPING[word + arg]
|
||||
elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
|
||||
font_table[font] = CHARSET_MAPPING[arg]
|
||||
elif word == 'fldrslt':
|
||||
pass
|
||||
# \* 'Ignore if not understood' marker
|
||||
elif iinu:
|
||||
ignorable = True
|
||||
# \'xx
|
||||
elif hex:
|
||||
if curskip > 0:
|
||||
curskip -= 1
|
||||
elif not ignorable:
|
||||
charcode = int(hex, 16)
|
||||
failed = False
|
||||
while True:
|
||||
try:
|
||||
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
|
||||
if not encoding:
|
||||
return None
|
||||
out.append(chr(charcode).decode(encoding))
|
||||
except UnicodeDecodeError:
|
||||
failed = True
|
||||
else:
|
||||
break
|
||||
ebytes.append(int(hex, 16))
|
||||
elif tchar:
|
||||
if curskip > 0:
|
||||
curskip -= 1
|
||||
elif not ignorable:
|
||||
out.append(tchar)
|
||||
ebytes += tchar.encode()
|
||||
text = ''.join(out)
|
||||
return text, default_encoding
|
||||
|
||||
|
@ -6,7 +6,7 @@ from unittest import TestCase
|
||||
|
||||
from mock import patch, MagicMock
|
||||
|
||||
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
|
||||
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
|
||||
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
|
||||
|
||||
|
||||
@ -215,6 +215,38 @@ class TestLib(TestCase):
|
||||
# THEN: The maximum length should be returned.
|
||||
assert result == 10, 'The length should be 10.'
|
||||
|
||||
def strip_rtf_charsets_test(self):
    """
    Test that the strip_rtf() function properly decodes the supported charsets.

    Each table row is (``\\fcharset`` number, RTF body fragment, expected plain
    text). The fragment is embedded in a minimal RTF document whose only font
    (``\\f0``) declares that charset, so the ``\\'xx`` escapes in the fragment
    are expected to be decoded with the matching code page.
    """
    # Minimal RTF document; the two %s slots take the charset number and the
    # body fragment, in that order.
    rtf_template = (
        '{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}'
        '{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}')
    test_charset_table = [
        ('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
        # The 128 row also covers 0x5C/0x7B/0x7D trail bytes written as the
        # RTF escapes \\\\, \\{ and \\}.
        ('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
                '\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'),
        ('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
        ('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
        ('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
        ('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
        ('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
        ('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
        ('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
        ('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
        ('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
        ('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
        ('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n'),
    ]

    # GIVEN: For each character set and input
    for charset, rtf_fragment, exp_result in test_charset_table:

        # WHEN: We call strip_rtf on the input RTF
        # The second element (the encoding reported back) is not checked here.
        result, _ = strip_rtf(rtf_template % (charset, rtf_fragment))

        # THEN: The stripped text matches the expected result
        assert result == exp_result, \
            'The result for charset %s should be %r, got %r' % (charset, exp_result, result)
|
||||
|
||||
|
||||
class TestVerseType(TestCase):
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user