This is a fix for bug #659673

bzr-revno: 1087
2010-10-14 21:41:43 +02:00 · 2010-10-14 21:41:43 +02:00 · b89b8983f8
commit b89b8983f8
parent c4e6b4aaac b58eecdf63
1 changed files with 61 additions and 7 deletions
--- a/openlp/plugins/songs/lib/ewimport.py
+++ b/openlp/plugins/songs/lib/ewimport.py
@ -35,7 +35,7 @@ import struct
 from openlp.core.lib import translate
 from songimport import SongImport

-def strip_rtf(blob):
+def strip_rtf(blob, encoding):
    depth = 0
    control = False
    clear_text = []
@ -69,12 +69,42 @@ def strip_rtf(blob):
                    if control_str == 'par' or control_str == 'line':
                        clear_text.append(u'\n')
                    elif control_str == 'tab':
-                        clear_text.append(u'\n')
+                        clear_text.append(u'\t')
+                    # Prefer the encoding specified by the RTF data to that
+                    #  specified by the Paradox table header
+                    # West European encoding
+                    elif control_str == 'fcharset0':
+                        encoding = u'cp1252'
+                    # Greek encoding
+                    elif control_str == 'fcharset161':
+                        encoding = u'cp1253'
+                    # Turkish encoding
+                    elif control_str == 'fcharset162':
+                        encoding = u'cp1254'
+                    # Vietnamese encoding
+                    elif control_str == 'fcharset163':
+                        encoding = u'cp1258'
+                    # Hebrew encoding
+                    elif control_str == 'fcharset177':
+                        encoding = u'cp1255'
+                    # Arabic encoding
+                    elif control_str == 'fcharset178':
+                        encoding = u'cp1256'
+                    # Baltic encoding
+                    elif control_str == 'fcharset186':
+                        encoding = u'cp1257'
+                    # Cyrillic encoding
+                    elif control_str == 'fcharset204':
+                        encoding = u'cp1251'
+                    # Thai encoding
+                    elif control_str == 'fcharset222':
+                        encoding = u'cp874'
+                    # Central+East European encoding
+                    elif control_str == 'fcharset238':
+                        encoding = u'cp1250'
                    elif control_str[0] == '\'':
-                        # Really should take RTF character set into account but
-                        # for now assume ANSI (Windows-1252) and call it good
                        s = chr(int(control_str[1:3], 16))
-                        clear_text.append(s.decode(u'windows-1252'))
+                        clear_text.append(s.decode(encoding))
                    del control_word[:]
            if c == '\\' and new_control:
                control = True
@ -126,6 +156,30 @@ class EasyWorshipSongImport(SongImport):
            db_file.close()
            self.memo_file.close()
            return False
+        # Take a stab at how text is encoded
+        self.encoding = u'cp1252'
+        db_file.seek(106)
+        code_page, = struct.unpack('<h', db_file.read(2))
+        if code_page == 852:
+            self.encoding = u'cp1250'
+        # The following codepage to actual encoding mappings have not been
+        #  observed, but merely guessed.  Actual example files are needed.
+        #if code_page == 737:
+        #    self.encoding = u'cp1253'
+        #if code_page == 775:
+        #    self.encoding = u'cp1257'
+        #if code_page == 855:
+        #    self.encoding = u'cp1251'
+        #if code_page == 857:
+        #    self.encoding = u'cp1254'
+        #if code_page == 866:
+        #    self.encoding = u'cp1251'
+        #if code_page == 869:
+        #    self.encoding = u'cp1253'
+        #if code_page == 862:
+        #    self.encoding = u'cp1255'
+        #if code_page == 874:
+        #    self.encoding = u'cp874'
        # There does not appear to be a _reliable_ way of getting the number
        # of songs/records, so let's use file blocks for measuring progress.
        total_blocks = (db_size - header_size) / (block_size * 1024)
@ -204,7 +258,7 @@ class EasyWorshipSongImport(SongImport):
                        self.add_author(author_name.strip())
                if words:
                    # Format the lyrics
-                    words = strip_rtf(words)
+                    words = strip_rtf(words, self.encoding)
                    for verse in words.split(u'\n\n'):
                        self.add_verse(verse.strip(), u'V')
                if self.stop_import_flag:
@ -263,7 +317,7 @@ class EasyWorshipSongImport(SongImport):
        # Format the field depending on the field type
        if field_desc.type == 1:
            # string
-            return field.rstrip('\0').decode(u'windows-1252')
+            return field.rstrip('\0').decode(self.encoding)
        elif field_desc.type == 3:
            # 16-bit int
            return field ^ 0x8000