In EasyWorship song importer, add initial support for non-latin1 encodings

This commit is contained in:
Jeffrey S. Smith 2010-10-14 13:15:02 -05:00
parent 161a1e444a
commit b58eecdf63

View File

@ -35,7 +35,7 @@ import struct
from openlp.core.lib import translate
from songimport import SongImport
def strip_rtf(blob):
def strip_rtf(blob, encoding):
depth = 0
control = False
clear_text = []
@ -69,12 +69,42 @@ def strip_rtf(blob):
if control_str == 'par' or control_str == 'line':
clear_text.append(u'\n')
elif control_str == 'tab':
clear_text.append(u'\n')
clear_text.append(u'\t')
# Prefer the encoding specified by the RTF data to that
# specified by the Paradox table header
# West European encoding
elif control_str == 'fcharset0':
encoding = u'cp1252'
# Greek encoding
elif control_str == 'fcharset161':
encoding = u'cp1253'
# Turkish encoding
elif control_str == 'fcharset162':
encoding = u'cp1254'
# Vietnamese encoding
elif control_str == 'fcharset163':
encoding = u'cp1258'
# Hebrew encoding
elif control_str == 'fcharset177':
encoding = u'cp1255'
# Arabic encoding
elif control_str == 'fcharset178':
encoding = u'cp1256'
# Baltic encoding
elif control_str == 'fcharset186':
encoding = u'cp1257'
# Cyrillic encoding
elif control_str == 'fcharset204':
encoding = u'cp1251'
# Thai encoding
elif control_str == 'fcharset222':
encoding = u'cp874'
# Central+East European encoding
elif control_str == 'fcharset238':
encoding = u'cp1250'
elif control_str[0] == '\'':
# Really should take RTF character set into account but
# for now assume ANSI (Windows-1252) and call it good
s = chr(int(control_str[1:3], 16))
clear_text.append(s.decode(u'windows-1252'))
clear_text.append(s.decode(encoding))
del control_word[:]
if c == '\\' and new_control:
control = True
@ -126,6 +156,30 @@ class EasyWorshipSongImport(SongImport):
db_file.close()
self.memo_file.close()
return False
# Take a stab at how text is encoded
self.encoding = u'cp1252'
db_file.seek(106)
code_page, = struct.unpack('<h', db_file.read(2))
if code_page == 852:
self.encoding = u'cp1250'
# The following codepage to actual encoding mappings have not been
# observed, but merely guessed. Actual example files are needed.
#if code_page == 737:
# self.encoding = u'cp1253'
#if code_page == 775:
# self.encoding = u'cp1257'
#if code_page == 855:
# self.encoding = u'cp1251'
#if code_page == 857:
# self.encoding = u'cp1254'
#if code_page == 866:
# self.encoding = u'cp1251'
#if code_page == 869:
# self.encoding = u'cp1253'
#if code_page == 862:
# self.encoding = u'cp1255'
#if code_page == 874:
# self.encoding = u'cp874'
# There does not appear to be a _reliable_ way of getting the number
# of songs/records, so let's use file blocks for measuring progress.
total_blocks = (db_size - header_size) / (block_size * 1024)
@ -204,7 +258,7 @@ class EasyWorshipSongImport(SongImport):
self.add_author(author_name.strip())
if words:
# Format the lyrics
words = strip_rtf(words)
words = strip_rtf(words, self.encoding)
for verse in words.split(u'\n\n'):
self.add_verse(verse.strip(), u'V')
if self.stop_import_flag:
@ -263,7 +317,7 @@ class EasyWorshipSongImport(SongImport):
# Format the field depending on the field type
if field_desc.type == 1:
# string
return field.rstrip('\0').decode(u'windows-1252')
return field.rstrip('\0').decode(self.encoding)
elif field_desc.type == 3:
# 16-bit int
return field ^ 0x8000