Adds SundayPlus importer with new strip_rtf function which keeps track of encodings.

bzr-revno: 2016
This commit is contained in:
Mattias Põldaru 2012-07-05 17:54:41 +02:00 committed by Andreas Preikschat
commit dc8496fa71
4 changed files with 438 additions and 100 deletions

View File

@ -36,6 +36,104 @@ from ui import SongStrings
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile(u'[\'`ʻ]', re.UNICODE) APOSTROPHE = re.compile(u'[\'`ʻ]', re.UNICODE)
# Tokenizer for RTF text, matched case-insensitively. The capture groups are:
#   1. the name of a control word (backslash followed by 1-32 letters),
#   2. the optional numeric argument of that control word (may be negative);
#      one optional space after the control word is consumed as its delimiter,
#   3. the two hex digits of a backslash-apostrophe escape,
#   4. a control symbol (backslash followed by one non-letter character),
#   5. a group brace, "{" or "}",
#   6. any other single character (plain text).
# Runs of CR/LF match without setting any group.
PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'"
r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
# RTF control words which specify a "destination" to be ignored.
# strip_rtf() marks the current group as ignorable as soon as it meets one of
# these words. The membership test is case-sensitive, so the mixed-case names
# below only match when the RTF spells them exactly the same way.
DESTINATIONS = frozenset((
u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor',
u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime',
u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend',
u'bkmkstart', u'blipuid', u'buptim', u'category',
u'colorschememapping', u'colortbl', u'comment', u'company', u'creatim',
u'datafield', u'datastore', u'defchp', u'defpap', u'do', u'doccomm',
u'docvar', u'dptxbxtext', u'ebcend', u'ebcstart', u'factoidname',
u'falt', u'fchars', u'ffdeftext', u'ffentrymcr', u'ffexitmcr',
u'ffformat', u'ffhelptext', u'ffl', u'ffname', u'ffstattext', u'field',
u'file', u'filetbl', u'fldinst', u'fldrslt', u'fldtype', u'fname',
u'fontemb', u'fontfile', u'footer', u'footerf', u'footerl', u'footerr',
u'footnote', u'formfield', u'ftncn', u'ftnsep', u'ftnsepc', u'g',
u'generator', u'gridtbl', u'header', u'headerf', u'headerl',
u'headerr', u'hl', u'hlfr', u'hlinkbase', u'hlloc', u'hlsrc', u'hsv',
u'htmltag', u'info', u'keycode', u'keywords', u'latentstyles',
u'lchars', u'levelnumbers', u'leveltext', u'lfolevel', u'linkval',
u'list', u'listlevel', u'listname', u'listoverride',
u'listoverridetable', u'listpicture', u'liststylename', u'listtable',
u'listtext', u'lsdlockedexcept', u'macc', u'maccPr', u'mailmerge',
u'maln', u'malnScr', u'manager', u'margPr', u'mbar', u'mbarPr',
u'mbaseJc', u'mbegChr', u'mborderBox', u'mborderBoxPr', u'mbox',
u'mboxPr', u'mchr', u'mcount', u'mctrlPr', u'md', u'mdeg', u'mdegHide',
u'mden', u'mdiff', u'mdPr', u'me', u'mendChr', u'meqArr', u'meqArrPr',
u'mf', u'mfName', u'mfPr', u'mfunc', u'mfuncPr', u'mgroupChr',
u'mgroupChrPr', u'mgrow', u'mhideBot', u'mhideLeft', u'mhideRight',
u'mhideTop', u'mhtmltag', u'mlim', u'mlimloc', u'mlimlow',
u'mlimlowPr', u'mlimupp', u'mlimuppPr', u'mm', u'mmaddfieldname',
u'mmath', u'mmathPict', u'mmathPr', u'mmaxdist', u'mmc', u'mmcJc',
u'mmconnectstr', u'mmconnectstrdata', u'mmcPr', u'mmcs',
u'mmdatasource', u'mmheadersource', u'mmmailsubject', u'mmodso',
u'mmodsofilter', u'mmodsofldmpdata', u'mmodsomappedname',
u'mmodsoname', u'mmodsorecipdata', u'mmodsosort', u'mmodsosrc',
u'mmodsotable', u'mmodsoudl', u'mmodsoudldata', u'mmodsouniquetag',
u'mmPr', u'mmquery', u'mmr', u'mnary', u'mnaryPr', u'mnoBreak',
u'mnum', u'mobjDist', u'moMath', u'moMathPara', u'moMathParaPr',
u'mopEmu', u'mphant', u'mphantPr', u'mplcHide', u'mpos', u'mr',
u'mrad', u'mradPr', u'mrPr', u'msepChr', u'mshow', u'mshp', u'msPre',
u'msPrePr', u'msSub', u'msSubPr', u'msSubSup', u'msSubSupPr', u'msSup',
u'msSupPr', u'mstrikeBLTR', u'mstrikeH', u'mstrikeTLBR', u'mstrikeV',
u'msub', u'msubHide', u'msup', u'msupHide', u'mtransp', u'mtype',
u'mvertJc', u'mvfmf', u'mvfml', u'mvtof', u'mvtol', u'mzeroAsc',
u'mzeroDesc', u'mzeroWid', u'nesttableprops', u'nextfile',
u'nonesttables', u'objalias', u'objclass', u'objdata', u'object',
u'objname', u'objsect', u'objtime', u'oldcprops', u'oldpprops',
u'oldsprops', u'oldtprops', u'oleclsid', u'operator', u'panose',
u'password', u'passwordhash', u'pgp', u'pgptbl', u'picprop', u'pict',
u'pn', u'pnseclvl', u'pntext', u'pntxta', u'pntxtb', u'printim',
u'private', u'propname', u'protend', u'protstart', u'protusertbl',
u'pxe', u'result', u'revtbl', u'revtim', u'rsidtbl', u'rxe', u'shp',
u'shpgrp', u'shpinst', u'shppict', u'shprslt', u'shptxt', u'sn', u'sp',
u'staticval', u'stylesheet', u'subject', u'sv', u'svb', u'tc',
u'template', u'themedata', u'title', u'txe', u'ud', u'upr',
u'userprops', u'wgrffmtfilter', u'windowcaption', u'writereservation',
u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue',
u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen'))
# Translation of some special characters.
# Maps an RTF control word (without its backslash) to the text that
# strip_rtf() appends to its output when it meets that control word.
SPECIAL_CHARS = {
u'par': u'\n',
u'sect': u'\n\n',
# Required page and column break.
# Would be good if we could split verse into subverses here.
u'page': u'\n\n',
u'column': u'\n\n',
# Soft breaks.
u'softpage': u'[---]',
u'softcol': u'[---]',
u'line': u'\n',
u'tab': u'\t',
# Typographic characters, given as Unicode code points.
u'emdash': u'\u2014',
u'endash': u'\u2013',
u'emspace': u'\u2003',
u'enspace': u'\u2002',
u'qmspace': u'\u2005',
u'bullet': u'\u2022',
u'lquote': u'\u2018',
u'rquote': u'\u2019',
u'ldblquote': u'\u201C',
u'rdblquote': u'\u201D',
# Invisible directional marks and joiners.
u'ltrmark': u'\u200E',
u'rtlmark': u'\u200F',
u'zwj': u'\u200D',
u'zwnj': u'\u200C'}
# Maps an RTF font charset control word, written as the word and its numeric
# argument joined together, to the name of the Windows code page codec that
# strip_rtf() records for the current font.
CHARSET_MAPPING = {
u'fcharset0': u'cp1252',  # West European
u'fcharset161': u'cp1253',  # Greek
u'fcharset162': u'cp1254',  # Turkish
u'fcharset163': u'cp1258',  # Vietnamese
u'fcharset177': u'cp1255',  # Hebrew
u'fcharset178': u'cp1256',  # Arabic
u'fcharset186': u'cp1257',  # Baltic
u'fcharset204': u'cp1251',  # Cyrillic
u'fcharset222': u'cp874',  # Thai
u'fcharset238': u'cp1250'}  # Central and East European
class VerseType(object): class VerseType(object):
""" """
@ -366,6 +464,136 @@ def clean_song(manager, song):
if song.copyright: if song.copyright:
song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip() song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
def get_encoding(font, font_table, default_encoding, failed=False):
    """
    Pick the encoding to decode text of the given font with, asking the user
    when nothing usable is known.

    ``font``
        The number of the currently active font.

    ``font_table``
        Dictionary mapping fonts to their encodings. The encoding chosen for
        ``font`` is stored back into it.

    ``default_encoding``
        The default encoding to use when ``font_table`` has no (non-empty)
        entry for ``font``.

    ``failed``
        A boolean indicating that the previously returned encoding did not
        work, which forces a new encoding to be retrieved.

    Returns an ``(encoding, default_encoding)`` tuple. ``default_encoding``
    is replaced by the retrieved encoding whenever one had to be retrieved.
    """
    chosen = font_table.get(font)
    if default_encoding and not chosen:
        chosen = default_encoding
    if failed or not chosen:
        # Nothing known, or the last attempt failed: retrieve a new encoding
        # and make it the default for the rest of the text.
        chosen = retrieve_windows_encoding()
        default_encoding = chosen
    font_table[font] = chosen
    return chosen, default_encoding
def strip_rtf(text, default_encoding=None):
    """
    This function strips RTF control structures and returns an unicode string.

    Thanks to Markus Jarderot (MizardX) for this code, used by permission.
    http://stackoverflow.com/questions/188545

    ``text``
        RTF-encoded text, a string.

    ``default_encoding``
        Default encoding to use when no encoding is specified for the font
        that is active where a hex-escaped character occurs.

    Returns a ``(text, default_encoding)`` tuple: the stripped unicode text
    and the default encoding as handed back by ``get_encoding()``, which may
    differ from the argument when a new encoding had to be retrieved.
    """
    # Current font is the font tag we last met.
    font = u''
    # Character encoding is defined inside fonttable.
    # font_table could contain eg u'0': u'cp1252'
    font_table = {u'': u''}
    # Stack of things to keep track of when entering/leaving groups.
    stack = []
    # Whether this group (and all inside it) are "ignorable".
    ignorable = False
    # Number of ASCII characters to skip after an unicode character.
    ucskip = 1
    # Number of ASCII characters left to skip.
    curskip = 0
    # Output buffer.
    out = []
    for match in PATTERN.finditer(text):
        word, arg, hex_digits, char, brace, tchar = match.groups()
        if brace:
            curskip = 0
            if brace == u'{':
                # Push state
                stack.append((ucskip, ignorable, font))
            elif stack:
                # Pop state. A closing brace without a matching opening one
                # is skipped instead of raising IndexError.
                ucskip, ignorable, font = stack.pop()
        # \x (not a letter)
        elif char:
            curskip = 0
            if char == u'*':
                ignorable = True
            elif not ignorable:
                if char == u'~':
                    out.append(u'\xA0')
                elif char in u'{}\\':
                    out.append(char)
                elif char == u'-':
                    out.append(u'\u00AD')
                elif char == u'_':
                    out.append(u'\u2011')
        # \command
        elif word:
            curskip = 0
            if word in DESTINATIONS:
                ignorable = True
            elif word in SPECIAL_CHARS:
                # Like every other kind of output, special characters inside
                # an ignorable group must not leak into the result.
                if not ignorable:
                    out.append(SPECIAL_CHARS[word])
            elif word == u'uc':
                if arg is not None:
                    ucskip = int(arg)
            elif word == u'u':
                # Unicode character given as a (possibly negative) 16 bit
                # decimal number.
                if arg is not None:
                    codepoint = int(arg)
                    if codepoint < 0:
                        codepoint += 0x10000
                    if not ignorable:
                        out.append(unichr(codepoint))
                    # Skip the fallback characters that follow the escape.
                    curskip = ucskip
            elif word == u'fonttbl':
                # The font table itself produces no text, but the font and
                # charset control words inside it are still processed below.
                ignorable = True
            elif word == u'f':
                font = arg
            elif word == u'ansicpg':
                if arg is not None:
                    font_table[font] = 'cp' + arg
            elif word == u'fcharset' and arg is not None and \
                    font not in font_table and \
                    word + arg in CHARSET_MAPPING:
                # \ansicpg overrides \fcharset, if present.
                font_table[font] = CHARSET_MAPPING[word + arg]
        # \'xx
        elif hex_digits:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                charcode = int(hex_digits, 16)
                failed = False
                # Keep asking for another encoding until the byte decodes.
                while True:
                    try:
                        encoding, default_encoding = get_encoding(font,
                            font_table, default_encoding, failed=failed)
                        out.append(chr(charcode).decode(encoding))
                    except UnicodeDecodeError:
                        failed = True
                    else:
                        break
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)
    text = u''.join(out)
    return text, default_encoding
from xml import OpenLyrics, SongXML from xml import OpenLyrics, SongXML
from songstab import SongsTab from songstab import SongsTab
from mediaitem import SongMediaItem from mediaitem import SongMediaItem

View File

@ -36,7 +36,7 @@ import re
from openlp.core.lib import translate from openlp.core.lib import translate
from openlp.plugins.songs.lib import VerseType from openlp.plugins.songs.lib import VerseType
from openlp.plugins.songs.lib import retrieve_windows_encoding from openlp.plugins.songs.lib import retrieve_windows_encoding, strip_rtf
from songimport import SongImport from songimport import SongImport
RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}') RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')
@ -45,101 +45,6 @@ SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*')
NUMBER_REGEX = re.compile(r'[0-9]+') NUMBER_REGEX = re.compile(r'[0-9]+')
NOTE_REGEX = re.compile(r'\(.*?\)') NOTE_REGEX = re.compile(r'\(.*?\)')
def strip_rtf(blob, encoding):
"""
Strip the RTF markup from ``blob`` and return the remaining text as one
unicode string.

``blob``
RTF data, a string. It is scanned one character at a time.

``encoding``
Codec used to decode hex-escaped characters. It is replaced whenever one
of the fcharset control words handled below is met.
"""
# Number of currently open groups (curly braces).
depth = 0
# True while the characters of a control word are being collected.
control = False
# Output buffer.
clear_text = []
# Characters of the control word collected so far (without the backslash).
control_word = []
# workaround for \tx bug: remove one pair of curly braces
# if \tx is encountered
match = RTF_STRIPPING_REGEX.search(blob)
if match:
# start and end indices of match are curly braces - filter them out
blob = ''.join([blob[i] for i in xrange(len(blob))
if i != match.start() and i !=match.end()])
for c in blob:
if control:
# for delimiters, set control to False
if c == '{':
if control_word:
depth += 1
control = False
elif c == '}':
if control_word:
depth -= 1
control = False
elif c == '\\':
# Remember whether this backslash terminated a non-empty control word.
new_control = bool(control_word)
control = False
elif c.isspace():
control = False
else:
control_word.append(c)
# An apostrophe followed by two characters is a complete hex escape.
if len(control_word) == 3 and control_word[0] == '\'':
control = False
if not control:
if not control_word:
# Backslash directly followed by a brace or backslash: a literal character.
if c == '{' or c == '}' or c == '\\':
clear_text.append(c)
else:
control_str = ''.join(control_word)
if control_str == 'par' or control_str == 'line':
clear_text.append(u'\n')
elif control_str == 'tab':
clear_text.append(u'\t')
# Prefer the encoding specified by the RTF data to that
# specified by the Paradox table header
# West European encoding
elif control_str == 'fcharset0':
encoding = u'cp1252'
# Greek encoding
elif control_str == 'fcharset161':
encoding = u'cp1253'
# Turkish encoding
elif control_str == 'fcharset162':
encoding = u'cp1254'
# Vietnamese encoding
elif control_str == 'fcharset163':
encoding = u'cp1258'
# Hebrew encoding
elif control_str == 'fcharset177':
encoding = u'cp1255'
# Arabic encoding
elif control_str == 'fcharset178':
encoding = u'cp1256'
# Baltic encoding
elif control_str == 'fcharset186':
encoding = u'cp1257'
# Cyrillic encoding
elif control_str == 'fcharset204':
encoding = u'cp1251'
# Thai encoding
elif control_str == 'fcharset222':
encoding = u'cp874'
# Central+East European encoding
elif control_str == 'fcharset238':
encoding = u'cp1250'
# Hex escape: decode the single byte with the current encoding.
elif control_str[0] == '\'':
s = chr(int(control_str[1:3], 16))
clear_text.append(s.decode(encoding))
del control_word[:]
# A backslash that ended a control word immediately starts the next one.
if c == '\\' and new_control:
control = True
elif c == '{':
depth += 1
elif c == '}':
depth -= 1
# Plain characters met while more than two groups are open are dropped.
elif depth > 2:
continue
# Raw line breaks in the RTF data are not part of the text.
elif c == '\n' or c == '\r':
continue
elif c == '\\':
control = True
else:
clear_text.append(c)
return u''.join(clear_text)
class FieldDescEntry: class FieldDescEntry:
def __init__(self, name, type, size): def __init__(self, name, type, size):
@ -274,7 +179,7 @@ class EasyWorshipSongImport(SongImport):
self.addAuthor(author_name.strip()) self.addAuthor(author_name.strip())
if words: if words:
# Format the lyrics # Format the lyrics
words = strip_rtf(words, self.encoding) words, self.encoding = strip_rtf(words, self.encoding)
verse_type = VerseType.Tags[VerseType.Verse] verse_type = VerseType.Tags[VerseType.Verse]
for verse in SLIDE_BREAK_REGEX.split(words): for verse in SLIDE_BREAK_REGEX.split(words):
verse = verse.strip() verse = verse.strip()

View File

@ -44,6 +44,7 @@ from powersongimport import PowerSongImport
from ewimport import EasyWorshipSongImport from ewimport import EasyWorshipSongImport
from songbeamerimport import SongBeamerImport from songbeamerimport import SongBeamerImport
from songshowplusimport import SongShowPlusImport from songshowplusimport import SongShowPlusImport
from sundayplusimport import SundayPlusImport
from foilpresenterimport import FoilPresenterImport from foilpresenterimport import FoilPresenterImport
from zionworximport import ZionWorxImport from zionworximport import ZionWorxImport
# Imports that might fail # Imports that might fail
@ -145,9 +146,10 @@ class SongFormat(object):
SongBeamer = 11 SongBeamer = 11
SongShowPlus = 12 SongShowPlus = 12
SongsOfFellowship = 13 SongsOfFellowship = 13
WordsOfWorship = 14 SundayPlus = 14
ZionWorx = 15 WordsOfWorship = 15
#CSV = 16 ZionWorx = 16
#CSV = 17
# Set optional attribute defaults # Set optional attribute defaults
__defaults__ = { __defaults__ = {
@ -275,6 +277,13 @@ class SongFormat(object):
'The Songs of Fellowship importer has been disabled because ' 'The Songs of Fellowship importer has been disabled because '
'OpenLP cannot access OpenOffice or LibreOffice.') 'OpenLP cannot access OpenOffice or LibreOffice.')
}, },
SundayPlus: {
u'class': SundayPlusImport,
u'name': u'SundayPlus',
u'prefix': u'sundayPlus',
u'filter': u'%s (*.ptf)' % translate(
'SongsPlugin.ImportWizardForm', 'SundayPlus Song Files')
},
WordsOfWorship: { WordsOfWorship: {
u'class': WowImport, u'class': WowImport,
u'name': u'Words of Worship', u'name': u'Words of Worship',
@ -322,6 +331,7 @@ class SongFormat(object):
SongFormat.SongBeamer, SongFormat.SongBeamer,
SongFormat.SongShowPlus, SongFormat.SongShowPlus,
SongFormat.SongsOfFellowship, SongFormat.SongsOfFellowship,
SongFormat.SundayPlus,
SongFormat.WordsOfWorship, SongFormat.WordsOfWorship,
SongFormat.ZionWorx SongFormat.ZionWorx
] ]

View File

@ -0,0 +1,195 @@
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=80 tabstop=4 softtabstop=4
###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2012 Raoul Snyman #
# Portions copyright (c) 2008-2012 Tim Bentley, Gerald Britton, Jonathan #
# Corwin, Samuel Findlay, Michael Gorven, Scott Guerrieri, Matthias Hub, #
# Meinert Jordan, Armin Köhler, Edwin Lunando, Joshua Miller, Stevan Pettit, #
# Andreas Preikschat, Mattias Põldaru, Christian Richter, Philip Ridout, #
# Simon Scudder, Jeffrey Smith, Maikel Stuivenberg, Martin Thompson, Jon #
# Tibble, Dave Warnock, Frode Woldsund #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
import os
import re
from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding
from openlp.plugins.songs.lib import strip_rtf
from openlp.plugins.songs.lib.songimport import SongImport
# Maps the value of a cell's "Hotkey" field to the verse tag that
# SundayPlusImport.parse() passes to addVerse() for that cell.
# NOTE(review): u'c', u'b' and u'o' presumably stand for chorus, bridge and
# other - confirm against VerseType.
HOTKEY_TO_VERSE_TYPE = {
u'1': u'v1',
u'2': u'v2',
u'3': u'v3',
u'4': u'v4',
u'5': u'v5',
u'6': u'v6',
u'7': u'v7',
u'8': u'v8',
u'9': u'v9',
u'C': u'c',
u'+': u'b',
u'Z': u'o'}
class SundayPlusImport(SongImport):
    """
    Import Sunday Plus songs

    The format examples can be found attached to bug report at
    <http://support.openlp.org/issues/395>
    """

    def __init__(self, manager, **kwargs):
        """
        Initialise the class.
        """
        SongImport.__init__(self, manager, **kwargs)
        # Encoding used to decode titles, authors and RTF text. It is
        # replaced as soon as decoding with it fails.
        self.encoding = u'us-ascii'

    def doImport(self):
        """
        Import every file listed in ``self.importSource``, stopping early
        when ``self.stopImportFlag`` is set.
        """
        self.importWizard.progressBar.setMaximum(len(self.importSource))
        for filename in self.importSource:
            if self.stopImportFlag:
                return
            song_file = open(filename)
            try:
                self.doImportFile(song_file)
            finally:
                # Close the file even when the import of this song fails.
                song_file.close()

    def doImportFile(self, file):
        """
        Process the Sunday Plus file object.
        """
        self.setDefaults()
        if not self.parse(file.read()):
            self.logError(file.name)
            return
        if not self.title:
            self.title = self.titleFromFilename(file.name)
        if not self.finish():
            self.logError(file.name)

    def parse(self, data, cell=False):
        """
        Parse one group of Sunday Plus data and store what it describes on
        this importer.

        ``data``
            The text of a group, including its enclosing square brackets.

        ``cell``
            ``False`` for the main group of a file. For a verse group, the
            part of the group's name that follows ``CELL``.

        Returns ``False`` when ``data`` is not enclosed in square brackets,
        ``True`` otherwise.
        """
        if len(data) == 0 or data[0:1] != '[' or data[-1] != ']':
            self.logError(u'File is malformed')
            return False
        pos = 1
        verse_type = VerseType.Tags[VerseType.Verse]
        while pos < len(data):
            # Data is held as #name: value pairs inside groups marked as [].
            # Now we are looking for the name.
            if data[pos:pos + 1] == '#':
                name_end = data.find(':', pos + 1)
                if name_end == -1:
                    # A name without a value: nothing more can be parsed, and
                    # continuing would rewind the scan position.
                    break
                name = data[pos + 1:name_end]
                pos = name_end + 1
                while data[pos:pos + 1] == ' ':
                    pos += 1
                if data[pos:pos + 1] == '"':
                    # Quoted value: runs up to the next quote.
                    end = data.find('"', pos + 1)
                    value = data[pos + 1:end]
                elif data[pos:pos + 1] == '[':
                    # Nested group: runs up to the first closing bracket
                    # which is not inside quotes.
                    end = -1
                    inside_quotes = False
                    for j in xrange(pos, len(data)):
                        char = data[j:j + 1]
                        if char == '"':
                            inside_quotes = not inside_quotes
                        elif not inside_quotes and char == ']':
                            end = j + 1
                            break
                    value = data[pos:end]
                else:
                    # Bare value: runs up to the next comma, or up to the
                    # closing parenthesis if it contains an opening one.
                    end = data.find(',', pos + 1)
                    if data.find('(', pos, end) != -1:
                        closing = data.find(')', pos)
                        # A missing ")" ends the parsing instead of sending
                        # the scan position back to the start of the data.
                        end = closing + 1 if closing != -1 else -1
                    value = data[pos:end]
                # If we are in the main group.
                if cell is False:
                    if name == 'title':
                        self.title = self.decode(self.unescape(value))
                    elif name == 'Author':
                        author = self.decode(self.unescape(value))
                        if len(author):
                            self.addAuthor(author)
                    elif name == 'Copyright':
                        self.copyright = self.decode(self.unescape(value))
                    elif name[0:4] == 'CELL':
                        self.parse(value, cell=name[4:])
                # We are in a verse group.
                else:
                    if name == 'MARKER_NAME':
                        value = value.strip()
                        if len(value):
                            verse_type = VerseType.Tags[
                                VerseType.from_loose_input(value[0])]
                            # A trailing digit is the verse number.
                            if len(value) >= 2 and value[-1] in '0123456789':
                                verse_type = "%s%s" % (verse_type, value[-1])
                    elif name == 'Hotkey':
                        # Hotkey always appears after MARKER_NAME, so it
                        # effectively overrides MARKER_NAME, if present.
                        if len(value) and value in HOTKEY_TO_VERSE_TYPE:
                            verse_type = HOTKEY_TO_VERSE_TYPE[value]
                    if name == 'rtf':
                        self._addRtfVerse(value, verse_type)
                if end == -1:
                    break
                pos = end + 1
            pos += 1
        return True

    def _addRtfVerse(self, value, verse_type):
        """
        Strip the RTF in ``value`` and add the text as a verse of type
        ``verse_type``.

        If any line inside the verse starts with CCLI and contains a number,
        or consists only of "Public Domain", we treat this as special data:
        we leave that line out and add the data to the specific field.
        """
        value = self.unescape(value)
        verse, self.encoding = strip_rtf(value, self.encoding)
        # Collect the lines to keep in a new list; removing items from the
        # list being iterated over would skip lines and run past its end.
        lines = []
        for line in verse.strip().split('\n'):
            line = line.strip()
            if line[:4].lower() == u'ccli':
                found = re.search(r'[0-9]+', line)
                if found:
                    self.ccliNumber = int(found.group(0))
                    continue
            elif line.lower() == u'public domain':
                self.copyright = u'Public Domain'
                continue
            lines.append(line)
        self.addVerse('\n'.join(lines).strip(), verse_type)

    def titleFromFilename(self, filename):
        """
        Derive a song title from ``filename``: the file name without its
        directory, ``.ptf`` extension and trailing ``1-7``, with underscores
        replaced by spaces.
        """
        title = os.path.split(filename)[1]
        if title.endswith(u'.ptf'):
            title = title[:-4]
        # For some strange reason all example files names ended with 1-7.
        if title.endswith(u'1-7'):
            title = title[:-3]
        return title.replace(u'_', u' ')

    def decode(self, blob):
        """
        Decode ``blob`` to unicode using ``self.encoding``. Whenever that
        fails, a new encoding is retrieved, stored in ``self.encoding`` and
        tried next.
        """
        while True:
            try:
                return unicode(blob, self.encoding)
            except Exception:
                # Not a bare except, so that KeyboardInterrupt and SystemExit
                # can still end this retry loop.
                self.encoding = retrieve_windows_encoding()

    def unescape(self, text):
        """
        Replace the escapes used for quotes (``^^`` for a double quote, ``^``
        for an apostrophe) in ``text`` and strip surrounding whitespace.
        """
        text = text.replace('^^', '"')
        text = text.replace('^', '\'')
        return text.strip()