Adds SundayPlus importer with new strip_rtf function which keeps track of encodings.
bzr-revno: 2016
commit dc8496fa71
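
A quick usage sketch, not part of the commit: the new strip_rtf returns both the stripped text and the encoding it settled on, so an importer can carry the detected encoding from one RTF field to the next (this is what the EasyWorship and SundayPlus changes below rely on). Assumes an OpenLP checkout on the Python path; the RTF blob is illustrative only.

    from openlp.plugins.songs.lib import strip_rtf

    # Declares font 0 as \fcharset204 (cp1251), then emits the byte \'c4.
    blob = "{\\rtf1\\ansi{\\fonttbl{\\f0\\fcharset204 Arial;}}\\f0\\'c4\\par}"
    encoding = None  # nothing known yet; strip_rtf may fall back to asking the user

    # strip_rtf() returns (text, encoding); reuse the encoding for later fields.
    text, encoding = strip_rtf(blob, encoding)
    print("%r decoded using %s" % (text, encoding))  # u'\u0414\n' decoded using cp1251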
@@ -36,6 +36,104 @@ from ui import SongStrings
 WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
 APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE)
+PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'"
+    r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
+# RTF control words which specify a "destination" to be ignored.
+DESTINATIONS = frozenset((
+    u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor',
+    u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime',
+    u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend',
+    u'bkmkstart', u'blipuid', u'buptim', u'category',
+    u'colorschememapping', u'colortbl', u'comment', u'company', u'creatim',
+    u'datafield', u'datastore', u'defchp', u'defpap', u'do', u'doccomm',
+    u'docvar', u'dptxbxtext', u'ebcend', u'ebcstart', u'factoidname',
+    u'falt', u'fchars', u'ffdeftext', u'ffentrymcr', u'ffexitmcr',
+    u'ffformat', u'ffhelptext', u'ffl', u'ffname', u'ffstattext', u'field',
+    u'file', u'filetbl', u'fldinst', u'fldrslt', u'fldtype', u'fname',
+    u'fontemb', u'fontfile', u'footer', u'footerf', u'footerl', u'footerr',
+    u'footnote', u'formfield', u'ftncn', u'ftnsep', u'ftnsepc', u'g',
+    u'generator', u'gridtbl', u'header', u'headerf', u'headerl',
+    u'headerr', u'hl', u'hlfr', u'hlinkbase', u'hlloc', u'hlsrc', u'hsv',
+    u'htmltag', u'info', u'keycode', u'keywords', u'latentstyles',
+    u'lchars', u'levelnumbers', u'leveltext', u'lfolevel', u'linkval',
+    u'list', u'listlevel', u'listname', u'listoverride',
+    u'listoverridetable', u'listpicture', u'liststylename', u'listtable',
+    u'listtext', u'lsdlockedexcept', u'macc', u'maccPr', u'mailmerge',
+    u'maln', u'malnScr', u'manager', u'margPr', u'mbar', u'mbarPr',
+    u'mbaseJc', u'mbegChr', u'mborderBox', u'mborderBoxPr', u'mbox',
+    u'mboxPr', u'mchr', u'mcount', u'mctrlPr', u'md', u'mdeg', u'mdegHide',
+    u'mden', u'mdiff', u'mdPr', u'me', u'mendChr', u'meqArr', u'meqArrPr',
+    u'mf', u'mfName', u'mfPr', u'mfunc', u'mfuncPr', u'mgroupChr',
+    u'mgroupChrPr', u'mgrow', u'mhideBot', u'mhideLeft', u'mhideRight',
+    u'mhideTop', u'mhtmltag', u'mlim', u'mlimloc', u'mlimlow',
+    u'mlimlowPr', u'mlimupp', u'mlimuppPr', u'mm', u'mmaddfieldname',
+    u'mmath', u'mmathPict', u'mmathPr', u'mmaxdist', u'mmc', u'mmcJc',
+    u'mmconnectstr', u'mmconnectstrdata', u'mmcPr', u'mmcs',
+    u'mmdatasource', u'mmheadersource', u'mmmailsubject', u'mmodso',
+    u'mmodsofilter', u'mmodsofldmpdata', u'mmodsomappedname',
+    u'mmodsoname', u'mmodsorecipdata', u'mmodsosort', u'mmodsosrc',
+    u'mmodsotable', u'mmodsoudl', u'mmodsoudldata', u'mmodsouniquetag',
+    u'mmPr', u'mmquery', u'mmr', u'mnary', u'mnaryPr', u'mnoBreak',
+    u'mnum', u'mobjDist', u'moMath', u'moMathPara', u'moMathParaPr',
+    u'mopEmu', u'mphant', u'mphantPr', u'mplcHide', u'mpos', u'mr',
+    u'mrad', u'mradPr', u'mrPr', u'msepChr', u'mshow', u'mshp', u'msPre',
+    u'msPrePr', u'msSub', u'msSubPr', u'msSubSup', u'msSubSupPr', u'msSup',
+    u'msSupPr', u'mstrikeBLTR', u'mstrikeH', u'mstrikeTLBR', u'mstrikeV',
+    u'msub', u'msubHide', u'msup', u'msupHide', u'mtransp', u'mtype',
+    u'mvertJc', u'mvfmf', u'mvfml', u'mvtof', u'mvtol', u'mzeroAsc',
+    u'mzeroDesc', u'mzeroWid', u'nesttableprops', u'nextfile',
+    u'nonesttables', u'objalias', u'objclass', u'objdata', u'object',
+    u'objname', u'objsect', u'objtime', u'oldcprops', u'oldpprops',
+    u'oldsprops', u'oldtprops', u'oleclsid', u'operator', u'panose',
+    u'password', u'passwordhash', u'pgp', u'pgptbl', u'picprop', u'pict',
+    u'pn', u'pnseclvl', u'pntext', u'pntxta', u'pntxtb', u'printim',
+    u'private', u'propname', u'protend', u'protstart', u'protusertbl',
+    u'pxe', u'result', u'revtbl', u'revtim', u'rsidtbl', u'rxe', u'shp',
+    u'shpgrp', u'shpinst', u'shppict', u'shprslt', u'shptxt', u'sn', u'sp',
+    u'staticval', u'stylesheet', u'subject', u'sv', u'svb', u'tc',
+    u'template', u'themedata', u'title', u'txe', u'ud', u'upr',
+    u'userprops', u'wgrffmtfilter', u'windowcaption', u'writereservation',
+    u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue',
+    u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen'))
+# Translation of some special characters.
+SPECIAL_CHARS = {
+    u'par': u'\n',
+    u'sect': u'\n\n',
+    # Required page and column break.
+    # Would be good if we could split verse into subverses here.
+    u'page': u'\n\n',
+    u'column': u'\n\n',
+    # Soft breaks.
+    u'softpage': u'[---]',
+    u'softcol': u'[---]',
+    u'line': u'\n',
+    u'tab': u'\t',
+    u'emdash': u'\u2014',
+    u'endash': u'\u2013',
+    u'emspace': u'\u2003',
+    u'enspace': u'\u2002',
+    u'qmspace': u'\u2005',
+    u'bullet': u'\u2022',
+    u'lquote': u'\u2018',
+    u'rquote': u'\u2019',
+    u'ldblquote': u'\u201C',
+    u'rdblquote': u'\u201D',
+    u'ltrmark': u'\u200E',
+    u'rtlmark': u'\u200F',
+    u'zwj': u'\u200D',
+    u'zwnj': u'\u200C'}
+CHARSET_MAPPING = {
+    u'fcharset0': u'cp1252',
+    u'fcharset161': u'cp1253',
+    u'fcharset162': u'cp1254',
+    u'fcharset163': u'cp1258',
+    u'fcharset177': u'cp1255',
+    u'fcharset178': u'cp1256',
+    u'fcharset186': u'cp1257',
+    u'fcharset204': u'cp1251',
+    u'fcharset222': u'cp874',
+    u'fcharset238': u'cp1250'}
+
+
 class VerseType(object):
     """
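
A small illustration, not part of the commit, of what the six groups in PATTERN capture; each match is exactly one RTF token, and the sample string is made up.

    import re

    PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'"
        r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)

    for match in PATTERN.finditer(r"{\f1\'c4 Hi\par}"):
        # groups: (control word, argument, hex byte, escaped char, brace, plain char)
        print("%r -> %r" % (match.group(0), match.groups()))

Here \f1 comes out as the control word f with argument 1, \'c4 as the hex byte c4, the braces via the brace group, and the space plus H and i as plain text characters.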
@@ -366,6 +464,136 @@ def clean_song(manager, song):
     if song.copyright:
         song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
+
+
+def get_encoding(font, font_table, default_encoding, failed=False):
+    """
+    Finds an encoding to use. Asks the user, if necessary.
+
+    ``font``
+        The number of the currently active font.
+
+    ``font_table``
+        Dictionary of fonts and respective encodings.
+
+    ``default_encoding``
+        The default encoding to use when font_table is empty or no font is
+        used.
+
+    ``failed``
+        A boolean indicating whether the previous encoding didn't work.
+    """
+    encoding = None
+    if font in font_table:
+        encoding = font_table[font]
+    if not encoding and default_encoding:
+        encoding = default_encoding
+    if not encoding or failed:
+        encoding = retrieve_windows_encoding()
+    default_encoding = encoding
+    font_table[font] = encoding
+    return encoding, default_encoding
+
+
+def strip_rtf(text, default_encoding=None):
+    """
+    This function strips RTF control structures and returns a unicode string.
+
+    Thanks to Markus Jarderot (MizardX) for this code, used by permission.
+    http://stackoverflow.com/questions/188545
+
+    ``text``
+        RTF-encoded text, a string.
+
+    ``default_encoding``
+        Default encoding to use when no encoding is specified.
+    """
+    # Current font is the font tag we last met.
+    font = u''
+    # Character encoding is defined inside fonttable.
+    # font_table could contain e.g. u'0': u'cp1252'
+    font_table = {u'': u''}
+    # Stack of things to keep track of when entering/leaving groups.
+    stack = []
+    # Whether this group (and all inside it) are "ignorable".
+    ignorable = False
+    # Number of ASCII characters to skip after a unicode character.
+    ucskip = 1
+    # Number of ASCII characters left to skip.
+    curskip = 0
+    # Output buffer.
+    out = []
+    for match in PATTERN.finditer(text):
+        word, arg, hex, char, brace, tchar = match.groups()
+        if brace:
+            curskip = 0
+            if brace == u'{':
+                # Push state
+                stack.append((ucskip, ignorable, font))
+            elif brace == u'}':
+                # Pop state
+                ucskip, ignorable, font = stack.pop()
+        # \x (not a letter)
+        elif char:
+            curskip = 0
+            if char == u'~' and not ignorable:
+                out.append(u'\xA0')
+            elif char in u'{}\\' and not ignorable:
+                out.append(char)
+            elif char == u'-' and not ignorable:
+                out.append(u'\u00AD')
+            elif char == u'_' and not ignorable:
+                out.append(u'\u2011')
+            elif char == u'*':
+                ignorable = True
+        # \command
+        elif word:
+            curskip = 0
+            if word in DESTINATIONS:
+                ignorable = True
+            elif word in SPECIAL_CHARS:
+                out.append(SPECIAL_CHARS[word])
+            elif word == u'uc':
+                ucskip = int(arg)
+            elif word == u'u':
+                c = int(arg)
+                if c < 0:
+                    c += 0x10000
+                out.append(unichr(c))
+                curskip = ucskip
+            elif word == u'fonttbl':
+                ignorable = True
+            elif word == u'f':
+                font = arg
+            elif word == u'ansicpg':
+                font_table[font] = 'cp' + arg
+            elif word == u'fcharset' and font not in font_table and \
+                    word + arg in CHARSET_MAPPING:
+                # \ansicpg overrides \fcharset, if present.
+                font_table[font] = CHARSET_MAPPING[word + arg]
+        # \'xx
+        elif hex:
+            if curskip > 0:
+                curskip -= 1
+            elif not ignorable:
+                charcode = int(hex, 16)
+                failed = False
+                while True:
+                    try:
+                        encoding, default_encoding = get_encoding(font,
+                            font_table, default_encoding, failed=failed)
+                        out.append(chr(charcode).decode(encoding))
+                    except UnicodeDecodeError:
+                        failed = True
+                    else:
+                        break
+        elif tchar:
+            if curskip > 0:
+                curskip -= 1
+            elif not ignorable:
+                out.append(tchar)
+    text = u''.join(out)
+    return text, default_encoding
+
+
 from xml import OpenLyrics, SongXML
 from songstab import SongsTab
 from mediaitem import SongMediaItem
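
A standalone sketch, not part of the commit, of the preference order the new get_encoding() implements: the charset registered for the current font wins, then the running default, and only then is the user asked. The helper and table below are illustrative only, not OpenLP API, and the user prompt is replaced here by a cp1252 fallback.

    # Illustrative mirror of get_encoding()'s lookup order, minus the prompt.
    font_table = {u'': u'', u'0': u'cp1252', u'1': u'cp1251'}

    def decode_hex_byte(hex_pair, font, default_encoding):
        encoding = font_table.get(font) or default_encoding or u'cp1252'
        return chr(int(hex_pair, 16)).decode(encoding), encoding

    # \'c4 under font 1 (declared \fcharset204, i.e. cp1251) gives u'\u0414'.
    print(decode_hex_byte(u'c4', u'1', None))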
@@ -36,7 +36,7 @@ import re

 from openlp.core.lib import translate
 from openlp.plugins.songs.lib import VerseType
-from openlp.plugins.songs.lib import retrieve_windows_encoding
+from openlp.plugins.songs.lib import retrieve_windows_encoding, strip_rtf
 from songimport import SongImport

 RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')
@@ -45,101 +45,6 @@ SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*')
 NUMBER_REGEX = re.compile(r'[0-9]+')
 NOTE_REGEX = re.compile(r'\(.*?\)')
-
-
-def strip_rtf(blob, encoding):
-    depth = 0
-    control = False
-    clear_text = []
-    control_word = []
-
-    # workaround for \tx bug: remove one pair of curly braces
-    # if \tx is encountered
-    match = RTF_STRIPPING_REGEX.search(blob)
-    if match:
-        # start and end indices of match are curly braces - filter them out
-        blob = ''.join([blob[i] for i in xrange(len(blob))
-            if i != match.start() and i != match.end()])
-
-    for c in blob:
-        if control:
-            # for delimiters, set control to False
-            if c == '{':
-                if control_word:
-                    depth += 1
-                control = False
-            elif c == '}':
-                if control_word:
-                    depth -= 1
-                control = False
-            elif c == '\\':
-                new_control = bool(control_word)
-                control = False
-            elif c.isspace():
-                control = False
-            else:
-                control_word.append(c)
-                if len(control_word) == 3 and control_word[0] == '\'':
-                    control = False
-            if not control:
-                if not control_word:
-                    if c == '{' or c == '}' or c == '\\':
-                        clear_text.append(c)
-                else:
-                    control_str = ''.join(control_word)
-                    if control_str == 'par' or control_str == 'line':
-                        clear_text.append(u'\n')
-                    elif control_str == 'tab':
-                        clear_text.append(u'\t')
-                    # Prefer the encoding specified by the RTF data to that
-                    # specified by the Paradox table header
-                    # West European encoding
-                    elif control_str == 'fcharset0':
-                        encoding = u'cp1252'
-                    # Greek encoding
-                    elif control_str == 'fcharset161':
-                        encoding = u'cp1253'
-                    # Turkish encoding
-                    elif control_str == 'fcharset162':
-                        encoding = u'cp1254'
-                    # Vietnamese encoding
-                    elif control_str == 'fcharset163':
-                        encoding = u'cp1258'
-                    # Hebrew encoding
-                    elif control_str == 'fcharset177':
-                        encoding = u'cp1255'
-                    # Arabic encoding
-                    elif control_str == 'fcharset178':
-                        encoding = u'cp1256'
-                    # Baltic encoding
-                    elif control_str == 'fcharset186':
-                        encoding = u'cp1257'
-                    # Cyrillic encoding
-                    elif control_str == 'fcharset204':
-                        encoding = u'cp1251'
-                    # Thai encoding
-                    elif control_str == 'fcharset222':
-                        encoding = u'cp874'
-                    # Central+East European encoding
-                    elif control_str == 'fcharset238':
-                        encoding = u'cp1250'
-                    elif control_str[0] == '\'':
-                        s = chr(int(control_str[1:3], 16))
-                        clear_text.append(s.decode(encoding))
-                del control_word[:]
-                if c == '\\' and new_control:
-                    control = True
-                elif c == '{':
-                    depth += 1
-                elif c == '}':
-                    depth -= 1
-        elif depth > 2:
-            continue
-        elif c == '\n' or c == '\r':
-            continue
-        elif c == '\\':
-            control = True
-        else:
-            clear_text.append(c)
-    return u''.join(clear_text)
-
-
 class FieldDescEntry:
     def __init__(self, name, type, size):
@@ -274,7 +179,7 @@ class EasyWorshipSongImport(SongImport):
             self.addAuthor(author_name.strip())
         if words:
             # Format the lyrics
-            words = strip_rtf(words, self.encoding)
+            words, self.encoding = strip_rtf(words, self.encoding)
             verse_type = VerseType.Tags[VerseType.Verse]
             for verse in SLIDE_BREAK_REGEX.split(words):
                 verse = verse.strip()
@@ -44,6 +44,7 @@ from powersongimport import PowerSongImport
 from ewimport import EasyWorshipSongImport
 from songbeamerimport import SongBeamerImport
 from songshowplusimport import SongShowPlusImport
+from sundayplusimport import SundayPlusImport
 from foilpresenterimport import FoilPresenterImport
 from zionworximport import ZionWorxImport
 # Imports that might fail
@@ -145,9 +146,10 @@ class SongFormat(object):
     SongBeamer = 11
     SongShowPlus = 12
     SongsOfFellowship = 13
-    WordsOfWorship = 14
-    ZionWorx = 15
-    #CSV = 16
+    SundayPlus = 14
+    WordsOfWorship = 15
+    ZionWorx = 16
+    #CSV = 17

     # Set optional attribute defaults
     __defaults__ = {
@@ -275,6 +277,13 @@ class SongFormat(object):
                 'The Songs of Fellowship importer has been disabled because '
                 'OpenLP cannot access OpenOffice or LibreOffice.')
         },
+        SundayPlus: {
+            u'class': SundayPlusImport,
+            u'name': u'SundayPlus',
+            u'prefix': u'sundayPlus',
+            u'filter': u'%s (*.ptf)' % translate(
+                'SongsPlugin.ImportWizardForm', 'SundayPlus Song Files')
+        },
         WordsOfWorship: {
             u'class': WowImport,
             u'name': u'Words of Worship',
@@ -322,6 +331,7 @@ class SongFormat(object):
         SongFormat.SongBeamer,
         SongFormat.SongShowPlus,
         SongFormat.SongsOfFellowship,
+        SongFormat.SundayPlus,
         SongFormat.WordsOfWorship,
         SongFormat.ZionWorx
     ]
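
For context, not part of the commit: the fields a SongFormat entry supplies to the import wizard, mirroring the SundayPlus block added above. The comments are editorial; in particular, reading u'prefix' as the seed for the wizard's widget and settings names is an assumption, not something stated in this diff.

    # Assumes it runs inside openlp/plugins/songs/, like importer.py itself.
    from sundayplusimport import SundayPlusImport

    SUNDAY_PLUS_ENTRY = {
        u'class': SundayPlusImport,  # importer class the wizard instantiates
        u'name': u'SundayPlus',      # label shown to the user
        u'prefix': u'sundayPlus',    # assumed: prefix for widget/settings names
        u'filter': u'%s (*.ptf)' % u'SundayPlus Song Files'  # file-open filter
    }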
openlp/plugins/songs/lib/sundayplusimport.py (new file, 195 lines)
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+# vim: autoindent shiftwidth=4 expandtab textwidth=80 tabstop=4 softtabstop=4
+
+###############################################################################
+# OpenLP - Open Source Lyrics Projection #
+# --------------------------------------------------------------------------- #
+# Copyright (c) 2008-2012 Raoul Snyman #
+# Portions copyright (c) 2008-2012 Tim Bentley, Gerald Britton, Jonathan #
+# Corwin, Samuel Findlay, Michael Gorven, Scott Guerrieri, Matthias Hub, #
+# Meinert Jordan, Armin Köhler, Edwin Lunando, Joshua Miller, Stevan Pettit, #
+# Andreas Preikschat, Mattias Põldaru, Christian Richter, Philip Ridout, #
+# Simon Scudder, Jeffrey Smith, Maikel Stuivenberg, Martin Thompson, Jon #
+# Tibble, Dave Warnock, Frode Woldsund #
+# --------------------------------------------------------------------------- #
+# This program is free software; you can redistribute it and/or modify it #
+# under the terms of the GNU General Public License as published by the Free #
+# Software Foundation; version 2 of the License. #
+# #
+# This program is distributed in the hope that it will be useful, but WITHOUT #
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
+# more details. #
+# #
+# You should have received a copy of the GNU General Public License along #
+# with this program; if not, write to the Free Software Foundation, Inc., 59 #
+# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
+###############################################################################
+
+import os
+import re
+
+from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding
+from openlp.plugins.songs.lib import strip_rtf
+from openlp.plugins.songs.lib.songimport import SongImport
+
+HOTKEY_TO_VERSE_TYPE = {
+    u'1': u'v1',
+    u'2': u'v2',
+    u'3': u'v3',
+    u'4': u'v4',
+    u'5': u'v5',
+    u'6': u'v6',
+    u'7': u'v7',
+    u'8': u'v8',
+    u'9': u'v9',
+    u'C': u'c',
+    u'+': u'b',
+    u'Z': u'o'}
+
+
+class SundayPlusImport(SongImport):
+    """
+    Import Sunday Plus songs
+
+    The format examples can be found attached to the bug report at
+    <http://support.openlp.org/issues/395>
+    """
+
+    def __init__(self, manager, **kwargs):
+        """
+        Initialise the class.
+        """
+        SongImport.__init__(self, manager, **kwargs)
+        self.encoding = u'us-ascii'
+
+    def doImport(self):
+        self.importWizard.progressBar.setMaximum(len(self.importSource))
+        for filename in self.importSource:
+            if self.stopImportFlag:
+                return
+            song_file = open(filename)
+            self.doImportFile(song_file)
+            song_file.close()
+
+    def doImportFile(self, file):
+        """
+        Process the Sunday Plus file object.
+        """
+        self.setDefaults()
+        if not self.parse(file.read()):
+            self.logError(file.name)
+            return
+        if not self.title:
+            self.title = self.titleFromFilename(file.name)
+        if not self.finish():
+            self.logError(file.name)
+
+    def parse(self, data, cell=False):
+        if len(data) == 0 or data[0:1] != '[' or data[-1] != ']':
+            self.logError(u'File is malformed')
+            return False
+        i = 1
+        verse_type = VerseType.Tags[VerseType.Verse]
+        while i < len(data):
+            # Data is held as #name: value pairs inside groups marked as [].
+            # Now we are looking for the name.
+            if data[i:i + 1] == '#':
+                name_end = data.find(':', i + 1)
+                name = data[i + 1:name_end]
+                i = name_end + 1
+                while data[i:i + 1] == ' ':
+                    i += 1
+                if data[i:i + 1] == '"':
+                    end = data.find('"', i + 1)
+                    value = data[i + 1:end]
+                elif data[i:i + 1] == '[':
+                    j = i
+                    inside_quotes = False
+                    while j < len(data):
+                        char = data[j:j + 1]
+                        if char == '"':
+                            inside_quotes = not inside_quotes
+                        elif not inside_quotes and char == ']':
+                            end = j + 1
+                            break
+                        j += 1
+                    value = data[i:end]
+                else:
+                    end = data.find(',', i + 1)
+                    if data.find('(', i, end) != -1:
+                        end = data.find(')', i) + 1
+                    value = data[i:end]
+                # If we are in the main group.
+                if cell == False:
+                    if name == 'title':
+                        self.title = self.decode(self.unescape(value))
+                    elif name == 'Author':
+                        author = self.decode(self.unescape(value))
+                        if len(author):
+                            self.addAuthor(author)
+                    elif name == 'Copyright':
+                        self.copyright = self.decode(self.unescape(value))
+                    elif name[0:4] == 'CELL':
+                        self.parse(value, cell=name[4:])
+                # We are in a verse group.
+                else:
+                    if name == 'MARKER_NAME':
+                        value = value.strip()
+                        if len(value):
+                            verse_type = VerseType.Tags[
+                                VerseType.from_loose_input(value[0])]
+                            if len(value) >= 2 and value[-1] in ['0', '1', '2',
+                                '3', '4', '5', '6', '7', '8', '9']:
+                                verse_type = "%s%s" % (verse_type, value[-1])
+                    elif name == 'Hotkey':
+                        # Hotkey always appears after MARKER_NAME, so it
+                        # effectively overrides MARKER_NAME, if present.
+                        if len(value) and \
+                            value in HOTKEY_TO_VERSE_TYPE.keys():
+                            verse_type = HOTKEY_TO_VERSE_TYPE[value]
+                    if name == 'rtf':
+                        value = self.unescape(value)
+                        verse, self.encoding = strip_rtf(value, self.encoding)
+                        lines = verse.strip().split('\n')
+                        # If any line inside any verse contains CCLI or
+                        # only Public Domain, we treat this as special data:
+                        # we remove that line and add data to specific field.
+                        for i in xrange(len(lines)):
+                            lines[i] = lines[i].strip()
+                            line = lines[i]
+                            if line[:4].lower() == u'ccli':
+                                m = re.search(r'[0-9]+', line)
+                                if m:
+                                    self.ccliNumber = int(m.group(0))
+                                lines.pop(i)
+                            elif line.lower() == u'public domain':
+                                self.copyright = u'Public Domain'
+                                lines.pop(i)
+                        self.addVerse('\n'.join(lines).strip(), verse_type)
+                if end == -1:
+                    break
+                i = end + 1
+            i += 1
+        return True
+
+    def titleFromFilename(self, filename):
+        title = os.path.split(filename)[1]
+        if title.endswith(u'.ptf'):
+            title = title[:-4]
+        # For some strange reason all example file names ended with 1-7.
+        if title.endswith(u'1-7'):
+            title = title[:-3]
+        return title.replace(u'_', u' ')
+
+    def decode(self, blob):
+        while True:
+            try:
+                return unicode(blob, self.encoding)
+            except:
+                self.encoding = retrieve_windows_encoding()
+
+    def unescape(self, text):
+        text = text.replace('^^', '"')
+        text = text.replace('^', '\'')
+        return text.strip()
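
Finally, a small illustration, not part of the commit, of the caret escaping that SundayPlusImport.unescape() undoes; the sample values are made up, and the function is copied here only so the snippet runs standalone.

    def unescape(text):
        # Sunday Plus stores double quotes as ^^ and apostrophes as ^.
        text = text.replace('^^', '"')
        text = text.replace('^', '\'')
        return text.strip()

    print(unescape('#title: ^^Amazing Grace^^'))  # #title: "Amazing Grace"
    print(unescape('God^s love never fails'))     # God's love never fails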