Fix strip_rtf to handle CJK encodings

bzr-revno: 2297
2013-09-13 17:13:12 +02:00 · 2013-09-13 17:13:12 +02:00 · 2511332a9e
commit 2511332a9e
parent 1ef5d00d45 30618ad60c
2 changed files with 105 additions and 50 deletions
--- a/openlp/plugins/songs/lib/init.py
+++ b/openlp/plugins/songs/lib/init.py
@ -46,7 +46,13 @@ log = logging.getLogger(__name__)
 WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
 APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
-PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
+# PATTERN will look for the next occurrence of one of these symbols:
 #   \controlword - optionally preceded by \*, optionally followed by a number
 #   \'## - where ## is a pair of hex digits, representing a single character
 #   \# - where # is a single non-alpha character, representing a special symbol
 #   { or } - marking the beginning/end of a group
 #   a run of characters without any \ { } or end-of-line
 PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
 # RTF control words which specify a "destination" to be ignored.
 DESTINATIONS = frozenset((
    'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@ -57,8 +63,8 @@ DESTINATIONS = frozenset((
    'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
    'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
    'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
-    'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
+    'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
-    'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
+    'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
    'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
    'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
    'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@ -106,6 +112,11 @@ DESTINATIONS = frozenset((
    'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
 # Translation of some special characters.
 SPECIAL_CHARS = {
    '\n': '\n',
    '\r': '\n',
    '~': '\u00A0',
    '-': '\u00AD',
    '_': '\u2011',
    'par': '\n',
    'sect': '\n\n',
    # Required page and column break.
@ -132,16 +143,19 @@ SPECIAL_CHARS = {
    'zwj': '\u200D',
    'zwnj': '\u200C'}
 CHARSET_MAPPING = {
-    'fcharset0': 'cp1252',
+    '0': 'cp1252',
-    'fcharset161': 'cp1253',
+    '128': 'cp932',
-    'fcharset162': 'cp1254',
+    '129': 'cp949',
-    'fcharset163': 'cp1258',
+    '134': 'cp936',
-    'fcharset177': 'cp1255',
+    '161': 'cp1253',
-    'fcharset178': 'cp1256',
+    '162': 'cp1254',
-    'fcharset186': 'cp1257',
+    '163': 'cp1258',
-    'fcharset204': 'cp1251',
+    '177': 'cp1255',
-    'fcharset222': 'cp874',
+    '178': 'cp1256',
-    'fcharset238': 'cp1250'}
+    '186': 'cp1257',
    '204': 'cp1251',
    '222': 'cp874',
    '238': 'cp1250'}
 class VerseType(object):
@ -351,7 +365,7 @@ def retrieve_windows_encoding(recommendation=None):
            if recommendation == encodings[index][0]:
                recommended_index = index
                break
-    if recommended_index > 0:
+    if recommended_index > -1:
        choice = QtGui.QInputDialog.getItem(None,
            translate('SongsPlugin', 'Character Encoding'),
            translate('SongsPlugin', 'The codepage setting is responsible\n'
@ -365,7 +379,7 @@ def retrieve_windows_encoding(recommendation=None):
                [pair[1] for pair in encodings], 0, False)
    if not choice[1]:
        return None
-    return filter(lambda item: item[1] == choice[0], encodings)[0][0]
+    return next(filter(lambda item: item[1] == choice[0], encodings))[0]
 def clean_string(string):
@ -521,43 +535,59 @@ def strip_rtf(text, default_encoding=None):
    curskip = 0
    # Output buffer.
    out = []
    # Encoded buffer.
    ebytes = bytearray()
    for match in PATTERN.finditer(text):
-        word, arg, hex, char, brace, tchar = match.groups()
+        iinu, word, arg, hex, char, brace, tchar = match.groups()
        # \x (non-alpha character)
        if char:
            if char in '\\{}':
                tchar = char
            else:
                word = char
        # Flush encoded buffer to output buffer
        if ebytes and not hex and not tchar:
            failed = False
            while True:
                try:
                    encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
                    if not encoding:
                        return None
                    dbytes = ebytes.decode(encoding)
                    # Code 5C is a peculiar case with Windows Codepage 932
                    if encoding == 'cp932' and '\\' in dbytes:
                        dbytes = dbytes.replace('\\', '\u00A5')
                    out.append(dbytes)
                    ebytes.clear()
                except UnicodeDecodeError:
                    failed = True
                else:
                    break
        # {}
        if brace:
            curskip = 0
            if brace == '{':
                # Push state
                stack.append((ucskip, ignorable, font))
-            elif brace == '}':
+            elif brace == '}' and len(stack) > 0:
                # Pop state
                ucskip, ignorable, font = stack.pop()
        # \x (not a letter)
        elif char:
            curskip = 0
            if char == '~' and not ignorable:
                out.append('\xA0')
            elif char in '{}\\' and not ignorable:
                out.append(char)
            elif char == '-' and not ignorable:
                out.append('\u00AD')
            elif char == '_' and not ignorable:
                out.append('\u2011')
            elif char == '*':
                ignorable = True
        # \command
        elif word:
            curskip = 0
            if word in DESTINATIONS:
                ignorable = True
            elif word in SPECIAL_CHARS:
-                out.append(SPECIAL_CHARS[word])
+                if not ignorable:
                    out.append(SPECIAL_CHARS[word])
            elif word == 'uc':
                ucskip = int(arg)
-            elif word == ' ':
+            elif word == 'u':
                c = int(arg)
                if c < 0:
                    c += 0x10000
-                out.append(chr(c))
+                if not ignorable:
                    out.append(chr(c))
                curskip = ucskip
            elif word == 'fonttbl':
                ignorable = True
@ -565,31 +595,24 @@ def strip_rtf(text, default_encoding=None):
                font = arg
            elif word == 'ansicpg':
                font_table[font] = 'cp' + arg
-            elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
+            elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
-                # \ansicpg overrides \fcharset, if present.
+                font_table[font] = CHARSET_MAPPING[arg]
-                font_table[font] = CHARSET_MAPPING[word + arg]
+            elif word == 'fldrslt':
                pass
            # \* 'Ignore if not understood' marker
            elif iinu:
                ignorable = True
        # \'xx
        elif hex:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
-                charcode = int(hex, 16)
+                ebytes.append(int(hex, 16))
                failed = False
                while True:
                    try:
                        encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
                        if not encoding:
                            return None
                        out.append(chr(charcode).decode(encoding))
                    except UnicodeDecodeError:
                        failed = True
                    else:
                        break
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
-                out.append(tchar)
+                ebytes += tchar.encode()
    text = ''.join(out)
    return text, default_encoding
--- a/tests/functional/openlp_plugins/songs/test_lib.py
+++ b/tests/functional/openlp_plugins/songs/test_lib.py
@ -6,7 +6,7 @@ from unittest import TestCase
 from mock import patch, MagicMock
-from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
+from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
 from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length
@ -215,6 +215,38 @@ class TestLib(TestCase):
        # THEN: The maximum length should be returned.
        assert result == 10, 'The length should be 10.'
    def strip_rtf_charsets_test(self):
        """
        Test that the strip_rtf() method properly decodes the supported charsets.
        """
        test_charset_table = [
            ('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
            ('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
                    '\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス･キリスト｡ ¥ 表 枝 施 ｡\n'),
            ('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
            ('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
            ('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
            ('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
            ('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
            ('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
            ('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
            ('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
            ('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
            ('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
            ('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
        ]
        # GIVEN: For each character set and input
        for charset, input, exp_result in test_charset_table:
            # WHEN: We call strip_rtf on the input RTF
            result, result_enc = strip_rtf(
               '{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
               '{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
            # THEN: The stripped text matches the expected result
            assert result == exp_result, 'The result should be %s' % exp_result
 class TestVerseType(TestCase):
    """