diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index 271a94710..ccbe35fd9 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -46,7 +46,7 @@ log = logging.getLogger(__name__) WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE) -PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) +PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I) # RTF control words which specify a "destination" to be ignored. DESTINATIONS = frozenset(( 'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', @@ -57,8 +57,8 @@ DESTINATIONS = frozenset(( 'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm', 'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr', - 'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field', - 'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname', + 'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', + 'file', 'filetbl', 'fldinst', 'fldtype', 'fname', 'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr', 'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'generator', 'gridtbl', 'header', 'headerf', 'headerl', @@ -106,6 +106,11 @@ DESTINATIONS = frozenset(( 'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen')) # Translation of some special characters. SPECIAL_CHARS = { + '\n': '\n', + '\r': '\n', + '~': '\u00A0', + '-': '\u00AD', + '_': '\u2011', 'par': '\n', 'sect': '\n\n', # Required page and column break. 
@@ -132,16 +137,19 @@ SPECIAL_CHARS = { 'zwj': '\u200D', 'zwnj': '\u200C'} CHARSET_MAPPING = { - 'fcharset0': 'cp1252', - 'fcharset161': 'cp1253', - 'fcharset162': 'cp1254', - 'fcharset163': 'cp1258', - 'fcharset177': 'cp1255', - 'fcharset178': 'cp1256', - 'fcharset186': 'cp1257', - 'fcharset204': 'cp1251', - 'fcharset222': 'cp874', - 'fcharset238': 'cp1250'} + '0': 'cp1252', + '128': 'cp932', + '129': 'cp949', + '134': 'cp936', + '161': 'cp1253', + '162': 'cp1254', + '163': 'cp1258', + '177': 'cp1255', + '178': 'cp1256', + '186': 'cp1257', + '204': 'cp1251', + '222': 'cp874', + '238': 'cp1250'} class VerseType(object): @@ -351,7 +359,7 @@ def retrieve_windows_encoding(recommendation=None): if recommendation == encodings[index][0]: recommended_index = index break - if recommended_index > 0: + if recommended_index > -1: choice = QtGui.QInputDialog.getItem(None, translate('SongsPlugin', 'Character Encoding'), translate('SongsPlugin', 'The codepage setting is responsible\n' @@ -365,7 +373,7 @@ def retrieve_windows_encoding(recommendation=None): [pair[1] for pair in encodings], 0, False) if not choice[1]: return None - return filter(lambda item: item[1] == choice[0], encodings)[0][0] + return next(filter(lambda item: item[1] == choice[0], encodings))[0] def clean_string(string): @@ -521,43 +529,59 @@ def strip_rtf(text, default_encoding=None): curskip = 0 # Output buffer. out = [] + # Encoded buffer. 
+ ebytes = bytearray() for match in PATTERN.finditer(text): - word, arg, hex, char, brace, tchar = match.groups() + iinu, word, arg, hex, char, brace, tchar = match.groups() + # \x (non-alpha character) + if char: + if char in '\\{}': + tchar = char + else: + word = char + # Flush encoded buffer to output buffer + if ebytes and not hex and not tchar: + failed = False + while True: + try: + encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed) + if not encoding: + return None + dbytes = ebytes.decode(encoding) + # Code 5C is a peculiar case with Windows Codepage 932 + if encoding == 'cp932' and '\\' in dbytes: + dbytes = dbytes.replace('\\', '\u00A5') + out.append(dbytes) + ebytes.clear() + except UnicodeDecodeError: + failed = True + else: + break + # {} if brace: curskip = 0 if brace == '{': # Push state stack.append((ucskip, ignorable, font)) - elif brace == '}': + elif brace == '}' and len(stack) > 0: # Pop state ucskip, ignorable, font = stack.pop() - # \x (not a letter) - elif char: - curskip = 0 - if char == '~' and not ignorable: - out.append('\xA0') - elif char in '{}\\' and not ignorable: - out.append(char) - elif char == '-' and not ignorable: - out.append('\u00AD') - elif char == '_' and not ignorable: - out.append('\u2011') - elif char == '*': - ignorable = True # \command elif word: curskip = 0 if word in DESTINATIONS: ignorable = True elif word in SPECIAL_CHARS: - out.append(SPECIAL_CHARS[word]) + if not ignorable: + out.append(SPECIAL_CHARS[word]) elif word == 'uc': ucskip = int(arg) - elif word == ' ': + elif word == 'u': c = int(arg) if c < 0: c += 0x10000 - out.append(chr(c)) + if not ignorable: + out.append(chr(c)) curskip = ucskip elif word == 'fonttbl': ignorable = True @@ -565,31 +589,24 @@ def strip_rtf(text, default_encoding=None): font = arg elif word == 'ansicpg': font_table[font] = 'cp' + arg - elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING: - # \ansicpg 
overrides \fcharset, if present. - font_table[font] = CHARSET_MAPPING[word + arg] + elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING: + font_table[font] = CHARSET_MAPPING[arg] + elif word == 'fldrslt': + pass + # \* 'Ignore if not understood' marker + elif iinu: + ignorable = True # \'xx elif hex: if curskip > 0: curskip -= 1 elif not ignorable: - charcode = int(hex, 16) - failed = False - while True: - try: - encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed) - if not encoding: - return None - out.append(chr(charcode).decode(encoding)) - except UnicodeDecodeError: - failed = True - else: - break + ebytes.append(int(hex, 16)) elif tchar: if curskip > 0: curskip -= 1 elif not ignorable: - out.append(tchar) + ebytes += tchar.encode() text = ''.join(out) return text, default_encoding diff --git a/tests/functional/openlp_plugins/songs/test_lib.py b/tests/functional/openlp_plugins/songs/test_lib.py index ac22ae1ef..a9e64b5c9 100644 --- a/tests/functional/openlp_plugins/songs/test_lib.py +++ b/tests/functional/openlp_plugins/songs/test_lib.py @@ -6,7 +6,7 @@ from unittest import TestCase from mock import patch, MagicMock -from openlp.plugins.songs.lib import VerseType, clean_string, clean_title +from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length @@ -215,6 +215,38 @@ class TestLib(TestCase): # THEN: The maximum length should be returned. assert result == 10, 'The length should be 10.' + def strip_rtf_charsets_test(self): + """ + Test that the strip_rtf() method properly decodes the supported charsets. 
+ """ + test_charset_table = [ + ('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'), + ('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 ' + '\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス・キリスト。 ¥ 表 枝 施 。\n'), + ('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'), + ('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'), + ('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'), + ('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'), + ('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'), + ('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'), + ('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'), + ('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'), + ('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'), + ('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'), + ('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n') + ] + + # GIVEN: For each character set and input + for charset, input, exp_result in test_charset_table: + + # WHEN: We call strip_rtf on the input RTF + result, result_enc = strip_rtf( + '{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \ + '{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input)) + + # THEN: The stripped text matches the expected result + assert result == exp_result, 'The result should be %s' % exp_result + class TestVerseType(TestCase): """