forked from openlp/openlp
Fix strip_rtf to handle CJK encodings
This commit is contained in:
parent
880a548eb8
commit
53ac150337
@ -46,7 +46,7 @@ log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
|
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
|
||||||
APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
|
APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
|
||||||
PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
|
PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
|
||||||
# RTF control words which specify a "destination" to be ignored.
|
# RTF control words which specify a "destination" to be ignored.
|
||||||
DESTINATIONS = frozenset((
|
DESTINATIONS = frozenset((
|
||||||
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
|
'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
|
||||||
@ -57,8 +57,8 @@ DESTINATIONS = frozenset((
|
|||||||
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
|
'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
|
||||||
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
|
'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
|
||||||
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
|
'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
|
||||||
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
|
'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
|
||||||
'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
|
'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
|
||||||
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
|
'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
|
||||||
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
|
'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
|
||||||
'generator', 'gridtbl', 'header', 'headerf', 'headerl',
|
'generator', 'gridtbl', 'header', 'headerf', 'headerl',
|
||||||
@ -106,6 +106,11 @@ DESTINATIONS = frozenset((
|
|||||||
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
|
'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
|
||||||
# Translation of some special characters.
|
# Translation of some special characters.
|
||||||
SPECIAL_CHARS = {
|
SPECIAL_CHARS = {
|
||||||
|
'\n': '\n',
|
||||||
|
'\r': '\n',
|
||||||
|
'~': '\u00A0',
|
||||||
|
'-': '\u00AD',
|
||||||
|
'_': '\u2011',
|
||||||
'par': '\n',
|
'par': '\n',
|
||||||
'sect': '\n\n',
|
'sect': '\n\n',
|
||||||
# Required page and column break.
|
# Required page and column break.
|
||||||
@ -132,16 +137,19 @@ SPECIAL_CHARS = {
|
|||||||
'zwj': '\u200D',
|
'zwj': '\u200D',
|
||||||
'zwnj': '\u200C'}
|
'zwnj': '\u200C'}
|
||||||
CHARSET_MAPPING = {
|
CHARSET_MAPPING = {
|
||||||
'fcharset0': 'cp1252',
|
'0': 'cp1252',
|
||||||
'fcharset161': 'cp1253',
|
'128': 'cp932',
|
||||||
'fcharset162': 'cp1254',
|
'129': 'cp949',
|
||||||
'fcharset163': 'cp1258',
|
'134': 'cp936',
|
||||||
'fcharset177': 'cp1255',
|
'161': 'cp1253',
|
||||||
'fcharset178': 'cp1256',
|
'162': 'cp1254',
|
||||||
'fcharset186': 'cp1257',
|
'163': 'cp1258',
|
||||||
'fcharset204': 'cp1251',
|
'177': 'cp1255',
|
||||||
'fcharset222': 'cp874',
|
'178': 'cp1256',
|
||||||
'fcharset238': 'cp1250'}
|
'186': 'cp1257',
|
||||||
|
'204': 'cp1251',
|
||||||
|
'222': 'cp874',
|
||||||
|
'238': 'cp1250'}
|
||||||
|
|
||||||
|
|
||||||
class VerseType(object):
|
class VerseType(object):
|
||||||
@ -351,7 +359,7 @@ def retrieve_windows_encoding(recommendation=None):
|
|||||||
if recommendation == encodings[index][0]:
|
if recommendation == encodings[index][0]:
|
||||||
recommended_index = index
|
recommended_index = index
|
||||||
break
|
break
|
||||||
if recommended_index > 0:
|
if recommended_index > -1:
|
||||||
choice = QtGui.QInputDialog.getItem(None,
|
choice = QtGui.QInputDialog.getItem(None,
|
||||||
translate('SongsPlugin', 'Character Encoding'),
|
translate('SongsPlugin', 'Character Encoding'),
|
||||||
translate('SongsPlugin', 'The codepage setting is responsible\n'
|
translate('SongsPlugin', 'The codepage setting is responsible\n'
|
||||||
@ -365,7 +373,7 @@ def retrieve_windows_encoding(recommendation=None):
|
|||||||
[pair[1] for pair in encodings], 0, False)
|
[pair[1] for pair in encodings], 0, False)
|
||||||
if not choice[1]:
|
if not choice[1]:
|
||||||
return None
|
return None
|
||||||
return filter(lambda item: item[1] == choice[0], encodings)[0][0]
|
return next(filter(lambda item: item[1] == choice[0], encodings))[0]
|
||||||
|
|
||||||
|
|
||||||
def clean_string(string):
|
def clean_string(string):
|
||||||
@ -521,43 +529,59 @@ def strip_rtf(text, default_encoding=None):
|
|||||||
curskip = 0
|
curskip = 0
|
||||||
# Output buffer.
|
# Output buffer.
|
||||||
out = []
|
out = []
|
||||||
|
# Encoded buffer.
|
||||||
|
ebytes = bytearray()
|
||||||
for match in PATTERN.finditer(text):
|
for match in PATTERN.finditer(text):
|
||||||
word, arg, hex, char, brace, tchar = match.groups()
|
iinu, word, arg, hex, char, brace, tchar = match.groups()
|
||||||
|
# \x (non-alpha character)
|
||||||
|
if char:
|
||||||
|
if char in '\\{}':
|
||||||
|
tchar = char
|
||||||
|
else:
|
||||||
|
word = char
|
||||||
|
# Flush encoded buffer to output buffer
|
||||||
|
if ebytes and not hex and not tchar:
|
||||||
|
failed = False
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
|
||||||
|
if not encoding:
|
||||||
|
return None
|
||||||
|
dbytes = ebytes.decode(encoding)
|
||||||
|
# Code 5C is a peculiar case with Windows Codepage 932
|
||||||
|
if encoding == 'cp932' and '\\' in dbytes:
|
||||||
|
dbytes = dbytes.replace('\\', '\u00A5')
|
||||||
|
out.append(dbytes)
|
||||||
|
ebytes.clear()
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
failed = True
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
# {}
|
||||||
if brace:
|
if brace:
|
||||||
curskip = 0
|
curskip = 0
|
||||||
if brace == '{':
|
if brace == '{':
|
||||||
# Push state
|
# Push state
|
||||||
stack.append((ucskip, ignorable, font))
|
stack.append((ucskip, ignorable, font))
|
||||||
elif brace == '}':
|
elif brace == '}' and len(stack) > 0:
|
||||||
# Pop state
|
# Pop state
|
||||||
ucskip, ignorable, font = stack.pop()
|
ucskip, ignorable, font = stack.pop()
|
||||||
# \x (not a letter)
|
|
||||||
elif char:
|
|
||||||
curskip = 0
|
|
||||||
if char == '~' and not ignorable:
|
|
||||||
out.append('\xA0')
|
|
||||||
elif char in '{}\\' and not ignorable:
|
|
||||||
out.append(char)
|
|
||||||
elif char == '-' and not ignorable:
|
|
||||||
out.append('\u00AD')
|
|
||||||
elif char == '_' and not ignorable:
|
|
||||||
out.append('\u2011')
|
|
||||||
elif char == '*':
|
|
||||||
ignorable = True
|
|
||||||
# \command
|
# \command
|
||||||
elif word:
|
elif word:
|
||||||
curskip = 0
|
curskip = 0
|
||||||
if word in DESTINATIONS:
|
if word in DESTINATIONS:
|
||||||
ignorable = True
|
ignorable = True
|
||||||
elif word in SPECIAL_CHARS:
|
elif word in SPECIAL_CHARS:
|
||||||
out.append(SPECIAL_CHARS[word])
|
if not ignorable:
|
||||||
|
out.append(SPECIAL_CHARS[word])
|
||||||
elif word == 'uc':
|
elif word == 'uc':
|
||||||
ucskip = int(arg)
|
ucskip = int(arg)
|
||||||
elif word == ' ':
|
elif word == 'u':
|
||||||
c = int(arg)
|
c = int(arg)
|
||||||
if c < 0:
|
if c < 0:
|
||||||
c += 0x10000
|
c += 0x10000
|
||||||
out.append(chr(c))
|
if not ignorable:
|
||||||
|
out.append(chr(c))
|
||||||
curskip = ucskip
|
curskip = ucskip
|
||||||
elif word == 'fonttbl':
|
elif word == 'fonttbl':
|
||||||
ignorable = True
|
ignorable = True
|
||||||
@ -565,31 +589,24 @@ def strip_rtf(text, default_encoding=None):
|
|||||||
font = arg
|
font = arg
|
||||||
elif word == 'ansicpg':
|
elif word == 'ansicpg':
|
||||||
font_table[font] = 'cp' + arg
|
font_table[font] = 'cp' + arg
|
||||||
elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
|
elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
|
||||||
# \ansicpg overrides \fcharset, if present.
|
font_table[font] = CHARSET_MAPPING[arg]
|
||||||
font_table[font] = CHARSET_MAPPING[word + arg]
|
elif word == 'fldrslt':
|
||||||
|
pass
|
||||||
|
# \* 'Ignore if not understood' marker
|
||||||
|
elif iinu:
|
||||||
|
ignorable = True
|
||||||
# \'xx
|
# \'xx
|
||||||
elif hex:
|
elif hex:
|
||||||
if curskip > 0:
|
if curskip > 0:
|
||||||
curskip -= 1
|
curskip -= 1
|
||||||
elif not ignorable:
|
elif not ignorable:
|
||||||
charcode = int(hex, 16)
|
ebytes.append(int(hex, 16))
|
||||||
failed = False
|
|
||||||
while True:
|
|
||||||
try:
|
|
||||||
encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
|
|
||||||
if not encoding:
|
|
||||||
return None
|
|
||||||
out.append(chr(charcode).decode(encoding))
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
failed = True
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
elif tchar:
|
elif tchar:
|
||||||
if curskip > 0:
|
if curskip > 0:
|
||||||
curskip -= 1
|
curskip -= 1
|
||||||
elif not ignorable:
|
elif not ignorable:
|
||||||
out.append(tchar)
|
ebytes += tchar.encode()
|
||||||
text = ''.join(out)
|
text = ''.join(out)
|
||||||
return text, default_encoding
|
return text, default_encoding
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ from unittest import TestCase
|
|||||||
|
|
||||||
from mock import patch, MagicMock
|
from mock import patch, MagicMock
|
||||||
|
|
||||||
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
|
from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
|
||||||
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
|
from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
|
||||||
|
|
||||||
|
|
||||||
@ -215,6 +215,38 @@ class TestLib(TestCase):
|
|||||||
# THEN: The maximum length should be returned.
|
# THEN: The maximum length should be returned.
|
||||||
assert result == 10, 'The length should be 10.'
|
assert result == 10, 'The length should be 10.'
|
||||||
|
|
||||||
|
def strip_rtf_charsets_test(self):
|
||||||
|
"""
|
||||||
|
Test that the strip_rtf() method properly decodes the supported charsets.
|
||||||
|
"""
|
||||||
|
test_charset_table = [
|
||||||
|
('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
|
||||||
|
('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
|
||||||
|
'\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'),
|
||||||
|
('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
|
||||||
|
('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
|
||||||
|
('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
|
||||||
|
('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
|
||||||
|
('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
|
||||||
|
('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
|
||||||
|
('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
|
||||||
|
('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
|
||||||
|
('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
|
||||||
|
('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
|
||||||
|
('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
|
||||||
|
]
|
||||||
|
|
||||||
|
# GIVEN: For each character set and input
|
||||||
|
for charset, input, exp_result in test_charset_table:
|
||||||
|
|
||||||
|
# WHEN: We call strip_rtf on the input RTF
|
||||||
|
result, result_enc = strip_rtf(
|
||||||
|
'{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
|
||||||
|
'{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
|
||||||
|
|
||||||
|
# THEN: The stripped text matches the expected result
|
||||||
|
assert result == exp_result, 'The result should be %s' % exp_result
|
||||||
|
|
||||||
|
|
||||||
class TestVerseType(TestCase):
|
class TestVerseType(TestCase):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user