Fix strip_rtf to handle CJK encodings

bzr-revno: 2297
2013-09-13 17:13:12 +02:00 · 2013-09-13 17:13:12 +02:00 · 2511332a9e
commit 2511332a9e
parent 1ef5d00d45 30618ad60c
2 changed files with 105 additions and 50 deletions
--- a/openlp/plugins/songs/lib/__init__.py
+++ b/openlp/plugins/songs/lib/__init__.py
@@ -46,7 +46,13 @@ log = logging.getLogger(__name__)

 WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
 APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
-PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
+# PATTERN will look for the next occurrence of one of these symbols:
+#   \controlword - optionally preceded by \*, optionally followed by a number
+#   \'## - where ## is a pair of hex digits, representing a single character
+#   \# - where # is a single non-alpha character, representing a special symbol
+#   { or } - marking the beginning/end of a group
+#   a run of characters without any \ { } or end-of-line
+PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
 # RTF control words which specify a "destination" to be ignored.
 DESTINATIONS = frozenset((
    'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
@@ -57,8 +63,8 @@ DESTINATIONS = frozenset((
    'datafield', 'datastore', 'defchp', 'defpap', 'do', 'doccomm',
    'docvar', 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname',
    'falt', 'fchars', 'ffdeftext', 'ffentrymcr', 'ffexitmcr',
-    'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext', 'field',
-    'file', 'filetbl', 'fldinst', 'fldrslt', 'fldtype', 'fname',
+    'ffformat', 'ffhelptext', 'ffl', 'ffname', 'ffstattext',
+    'file', 'filetbl', 'fldinst', 'fldtype', 'fname',
    'fontemb', 'fontfile', 'footer', 'footerf', 'footerl', 'footerr',
    'footnote', 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g',
    'generator', 'gridtbl', 'header', 'headerf', 'headerl',
@@ -106,6 +112,11 @@ DESTINATIONS = frozenset((
    'xmlclose', 'xmlname', 'xmlnstbl', 'xmlopen'))
 # Translation of some special characters.
 SPECIAL_CHARS = {
+    '\n': '\n',
+    '\r': '\n',
+    '~': '\u00A0',
+    '-': '\u00AD',
+    '_': '\u2011',
    'par': '\n',
    'sect': '\n\n',
    # Required page and column break.
@@ -132,16 +143,19 @@ SPECIAL_CHARS = {
    'zwj': '\u200D',
    'zwnj': '\u200C'}
 CHARSET_MAPPING = {
-    'fcharset0': 'cp1252',
-    'fcharset161': 'cp1253',
-    'fcharset162': 'cp1254',
-    'fcharset163': 'cp1258',
-    'fcharset177': 'cp1255',
-    'fcharset178': 'cp1256',
-    'fcharset186': 'cp1257',
-    'fcharset204': 'cp1251',
-    'fcharset222': 'cp874',
-    'fcharset238': 'cp1250'}
+    '0': 'cp1252',
+    '128': 'cp932',
+    '129': 'cp949',
+    '134': 'cp936',
+    '161': 'cp1253',
+    '162': 'cp1254',
+    '163': 'cp1258',
+    '177': 'cp1255',
+    '178': 'cp1256',
+    '186': 'cp1257',
+    '204': 'cp1251',
+    '222': 'cp874',
+    '238': 'cp1250'}


 class VerseType(object):
@@ -351,7 +365,7 @@ def retrieve_windows_encoding(recommendation=None):
            if recommendation == encodings[index][0]:
                recommended_index = index
                break
-    if recommended_index > 0:
+    if recommended_index > -1:
        choice = QtGui.QInputDialog.getItem(None,
            translate('SongsPlugin', 'Character Encoding'),
            translate('SongsPlugin', 'The codepage setting is responsible\n'
@@ -365,7 +379,7 @@ def retrieve_windows_encoding(recommendation=None):
                [pair[1] for pair in encodings], 0, False)
    if not choice[1]:
        return None
-    return filter(lambda item: item[1] == choice[0], encodings)[0][0]
+    return next(filter(lambda item: item[1] == choice[0], encodings))[0]


 def clean_string(string):
@@ -521,43 +535,59 @@ def strip_rtf(text, default_encoding=None):
    curskip = 0
    # Output buffer.
    out = []
+    # Encoded buffer.
+    ebytes = bytearray()
    for match in PATTERN.finditer(text):
-        word, arg, hex, char, brace, tchar = match.groups()
+        iinu, word, arg, hex, char, brace, tchar = match.groups()
+        # \x (non-alpha character)
+        if char:
+            if char in '\\{}':
+                tchar = char
+            else:
+                word = char
+        # Flush encoded buffer to output buffer
+        if ebytes and not hex and not tchar:
+            failed = False
+            while True:
+                try:
+                    encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
+                    if not encoding:
+                        return None
+                    dbytes = ebytes.decode(encoding)
+                    # Code 5C is a peculiar case with Windows Codepage 932
+                    if encoding == 'cp932' and '\\' in dbytes:
+                        dbytes = dbytes.replace('\\', '\u00A5')
+                    out.append(dbytes)
+                    ebytes.clear()
+                except UnicodeDecodeError:
+                    failed = True
+                else:
+                    break
+        # {}
        if brace:
            curskip = 0
            if brace == '{':
                # Push state
                stack.append((ucskip, ignorable, font))
-            elif brace == '}':
+            elif brace == '}' and len(stack) > 0:
                # Pop state
                ucskip, ignorable, font = stack.pop()
-        # \x (not a letter)
-        elif char:
-            curskip = 0
-            if char == '~' and not ignorable:
-                out.append('\xA0')
-            elif char in '{}\\' and not ignorable:
-                out.append(char)
-            elif char == '-' and not ignorable:
-                out.append('\u00AD')
-            elif char == '_' and not ignorable:
-                out.append('\u2011')
-            elif char == '*':
-                ignorable = True
        # \command
        elif word:
            curskip = 0
            if word in DESTINATIONS:
                ignorable = True
            elif word in SPECIAL_CHARS:
-                out.append(SPECIAL_CHARS[word])
+                if not ignorable:
+                    out.append(SPECIAL_CHARS[word])
            elif word == 'uc':
                ucskip = int(arg)
-            elif word == ' ':
+            elif word == 'u':
                c = int(arg)
                if c < 0:
                    c += 0x10000
-                out.append(chr(c))
+                if not ignorable:
+                    out.append(chr(c))
                curskip = ucskip
            elif word == 'fonttbl':
                ignorable = True
@@ -565,31 +595,24 @@ def strip_rtf(text, default_encoding=None):
                font = arg
            elif word == 'ansicpg':
                font_table[font] = 'cp' + arg
-            elif word == 'fcharset' and font not in font_table and word + arg in CHARSET_MAPPING:
-                # \ansicpg overrides \fcharset, if present.
-                font_table[font] = CHARSET_MAPPING[word + arg]
+            elif word == 'fcharset' and font not in font_table and arg in CHARSET_MAPPING:
+                font_table[font] = CHARSET_MAPPING[arg]
+            elif word == 'fldrslt':
+                pass
+            # \* 'Ignore if not understood' marker
+            elif iinu:
+                ignorable = True
        # \'xx
        elif hex:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
-                charcode = int(hex, 16)
-                failed = False
-                while True:
-                    try:
-                        encoding, default_encoding = get_encoding(font, font_table, default_encoding, failed=failed)
-                        if not encoding:
-                            return None
-                        out.append(chr(charcode).decode(encoding))
-                    except UnicodeDecodeError:
-                        failed = True
-                    else:
-                        break
+                ebytes.append(int(hex, 16))
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
-                out.append(tchar)
+                ebytes += tchar.encode()
    text = ''.join(out)
    return text, default_encoding

--- a/tests/functional/openlp_plugins/songs/test_lib.py
+++ b/tests/functional/openlp_plugins/songs/test_lib.py
@@ -6,7 +6,7 @@ from unittest import TestCase

 from mock import patch, MagicMock

-from openlp.plugins.songs.lib import VerseType, clean_string, clean_title
+from openlp.plugins.songs.lib import VerseType, clean_string, clean_title, strip_rtf
 from openlp.plugins.songs.lib.songcompare import songs_probably_equal, _remove_typos, _op_length


@@ -215,6 +215,38 @@ class TestLib(TestCase):
        # THEN: The maximum length should be returned.
        assert result == 10, 'The length should be 10.'

+    def strip_rtf_charsets_test(self):
+        """
+        Test that the strip_rtf() method properly decodes the supported charsets.
+        """
+        test_charset_table = [
+            ('0', 'weor\\\'F0-myndum \\\'FEah\\par ', 'weorð-myndum þah\n'),
+            ('128', '\\\'83C\\\'83G\\\'83X\\\'A5\\\'83L\\\'83\\\'8A\\\'83X\\\'83g\\\'A1 '
+                    '\\\\ \\\'95\\\\ \\\'8E\\} \\\'8E\\{ \\\'A1\\par ', 'イエス･キリスト｡ ¥ 表 枝 施 ｡\n'),
+            ('129', '\\\'BF\\\'B9\\\'BC\\\'F6 \\\'B1\\\'D7\\\'B8\\\'AE\\\'BD\\\'BA\\\'B5\\\'B5\\par ', '예수 그리스도\n'),
+            ('134', '\\\'D2\\\'AE\\\'F6\\\'D5\\\'BB\\\'F9\\\'B6\\\'BD\\\'CA\\\'C7\\\'D6\\\'F7\\par ', '耶稣基督是主\n'),
+            ('161', '\\\'D7\\\'F1\\\'E9\\\'F3\\\'F4\\\'FC\\\'F2\\par ', 'Χριστός\n'),
+            ('162', 'Hazreti \\\'DDsa\\par ', 'Hazreti İsa\n'),
+            ('163', 'ph\\\'FD\\\'F5ng\\par ', 'phương\n'),
+            ('177', '\\\'E1\\\'F8\\\'E0\\\'F9\\\'E9\\\'FA\\par ', 'בראשית\n'),
+            ('178', '\\\'ED\\\'D3\\\'E6\\\'DA \\\'C7\\\'E1\\\'E3\\\'D3\\\'ED\\\'CD\\par ', 'يسوع المسيح\n'),
+            ('186', 'J\\\'EBzus Kristus yra Vie\\\'F0pats\\par ', 'Jėzus Kristus yra Viešpats\n'),
+            ('204', '\\\'D0\\\'EE\\\'F1\\\'F1\\\'E8\\\'FF\\par ', 'Россия\n'),
+            ('222', '\\\'A4\\\'C3\\\'D4\\\'CA\\\'B5\\\'EC\\par ', 'คริสต์\n'),
+            ('238', 'Z\\\'E1v\\\'ECre\\\'E8n\\\'E1 zkou\\\'9Aka\\par ', 'Závěrečná zkouška\n')
+        ]
+
+        # GIVEN: For each character set and input
+        for charset, input, exp_result in test_charset_table:
+
+            # WHEN: We call strip_rtf on the input RTF
+            result, result_enc = strip_rtf(
+               '{\\rtf1 \\ansi \\ansicpg1252 {\\fonttbl \\f0 \\fswiss \\fcharset%s Helvetica;}' \
+               '{\\colortbl ;\\red0 \\green0 \\blue0 ;}\\pard \\f0 %s}' % (charset, input))
+
+            # THEN: The stripped text matches the expected result
+            assert result == exp_result, 'The result should be %s' % exp_result
+

 class TestVerseType(TestCase):
    """