Rewrote PowerSongImport class to read variable-length strings directly from file, rather than searching for them. Other minor fixes.

2012-05-01 23:51:46 +10:00 · 2012-05-01 23:51:46 +10:00 · 63b71802ab
commit 63b71802ab
parent 1184e9219d
2 changed files with 77 additions and 80 deletions
--- a/openlp/plugins/songs/lib/powersongimport.py
+++ b/openlp/plugins/songs/lib/powersongimport.py
@ -29,7 +29,6 @@ The :mod:`powersongimport` module provides the functionality for importing
 PowerSong songs into the OpenLP database.
 """
 import logging
 import re
 from openlp.core.lib import translate
 from openlp.plugins.songs.lib.songimport import SongImport
@ -43,34 +42,27 @@ class PowerSongImport(SongImport):
    **PowerSong Song File Format:**
-    * Encoded as UTF-8.
+    The file has a number of label-field pairs of variable length.
    * The file has a number of fields, with the song metadata fields first,
      followed by the lyrics fields.
-    Fields:
+    Labels and Fields:
-        Each field begins with one of four labels, each of which begin with one
+        * Every label and field is preceded by an integer which specifies its
-        non-printing byte:
+          byte-length.
-
+        * If the length < 128 bytes, only one byte is used to encode
-        * ``ENQ`` (0x05) ``TITLE``
+          the length integer.
-        * ``ACK`` (0x06) ``AUTHOR``
+        * But if it's 128 or greater, as many bytes are used as necessary:
-        * ``CR`` (0x0d) ``COPYRIGHTLINE``
+            * the first byte = (length % 128) + 128
-        * ``EOT`` (0x04) ``PART``
+            * the next byte = length / 128 (integer division)
-
+            * another byte is only used if (length / 128) >= 128
-        The field label is separated from the field contents by one random byte.
+            * and so on (3 bytes needed iff length > 16383)
        Each field ends at the next field label, or at the end of the file.
    Metadata fields:
        * Every PowerSong file begins with a TITLE field.
        * This is followed by zero or more AUTHOR fields.
-        * The next field is always COPYRIGHTLINE, but it may be empty (in which
+        * The next label is always COPYRIGHTLINE, but its field may be empty.
-          case the byte following the label is the null byte 0x00).
+          This field may also contain a CCLI number: e.g. "CCLI 176263".
          When the field contents are not empty, the first byte is 0xc2 and
          should be discarded.
          This field may contain a CCLI number at the end: e.g. "CCLI 176263"
    Lyrics fields:
-        * The COPYRIGHTLINE field is followed by zero or more PART fields, each
+        * Each verse is contained in a PART field.
          of which contains one verse.
        * Lines have Windows line endings ``CRLF`` (0x0d, 0x0a).
        * There is no concept of verse types.
@ -78,12 +70,6 @@ class PowerSongImport(SongImport):
        * .song
    """
    def __init__(self, manager, **kwargs):
        """
        Initialise the PowerSong importer.
        """
        SongImport.__init__(self, manager, **kwargs)
    def doImport(self):
        """
        Receive a single file or a list of files to import.
@ -94,67 +80,78 @@ class PowerSongImport(SongImport):
                if self.stopImportFlag:
                    return
                self.setDefaults()
-                with open(file, 'rb') as song_file:
+                with open(file, 'rb') as self.song_file:
-                    # Check file is valid PowerSong song format
+                    # Get title and check file is valid PowerSong song format
-                    if song_file.read(6) != u'\x05TITLE':
+                    label, field = self.readLabelField()
                    if label != u'TITLE':
                        self.logError(file, unicode(
                            translate('SongsPlugin.PowerSongSongImport',
                                ('Invalid PowerSong song file. Missing '
-                                '"\x05TITLE" header.'))))
+                                 '"TITLE" header.'))))
                        continue
-                    song_data = unicode(song_file.read(), u'utf-8', u'replace')
+                    else:
-                    # Extract title and author fields
+                        self.title = field.replace(u'\n', u' ')
-                    first_part, sep, song_data = song_data.partition(
+                    while label:
-                        u'\x0DCOPYRIGHTLINE')
+                        label, field = self.readLabelField()
-                    if not sep:
+                        # Get the author(s)
-                        self.logError(file, unicode(
+                        if label == u'AUTHOR':
-                            translate('SongsPlugin.PowerSongSongImport',
+                            self.parseAuthor(field)
-                                ('Invalid PowerSong song file. Missing '
+                        # Get copyright and look for CCLI number
-                                 '"\x0DCOPYRIGHTLINE" string.'))))
+                        elif label == u'COPYRIGHTLINE':
-                        continue
+                            found_copyright = True
-                    title_authors = first_part.split(u'\x06AUTHOR')
+                            copyright, sep, ccli_no = field.rpartition(u'CCLI')
                    # Get the song title
                    self.title = self.stripControlChars(title_authors[0][1:])
                    # Extract the author(s)
                    for author in title_authors[1:]:
                        self.parseAuthor(self.stripControlChars(author[1:]))
                    # Get copyright and CCLI number
                    copyright, sep, song_data = song_data.partition(
                        u'\x04PART')
                    if not sep:
                        self.logError(file, unicode(
                            translate('SongsPlugin.PowerSongSongImport',
                                ('No verses found. Missing '
                                 '"\x04PART" string.'))))
                        continue
                    copyright, sep, ccli_no = copyright[1:].rpartition(u'CCLI ')
                            if not sep:
                                copyright = ccli_no
                                ccli_no = u''
                            if copyright:
-                        if copyright[0] == u'\u00c2':
+                                self.addCopyright(copyright.rstrip(
-                            copyright = copyright[1:]
+                                    u'\n').replace(u'\n', u' '))
                        self.addCopyright(self.stripControlChars(
                            copyright.rstrip(u'\n')))
                            if ccli_no:
-                        ccli_no = ccli_no.strip()
+                                ccli_no = ccli_no.strip(u' :')
                                if ccli_no.isdigit():
-                            self.ccliNumber = self.stripControlChars(ccli_no)
+                                    self.ccliNumber = ccli_no
-                    # Get the verse(s)
+                        # Get verse(s)
-                    verses = song_data.split(u'\x04PART')
+                        elif label == u'PART':
-                    for verse in verses:
+                            self.addVerse(field)
-                        self.addVerse(self.stripControlChars(verse[1:]))
+                    # Check for copyright label
                    if not found_copyright:
                        self.logError(file, unicode(
                            translate('SongsPlugin.PowerSongSongImport',
                                ('"%s" Invalid PowerSong song file. Missing '
                                 '"COPYRIGHTLINE" string.' % self.title))))
                        continue
                    # Check for at least one verse
                    if not self.verses:
                        self.logError(file, unicode(
                            translate('SongsPlugin.PowerSongSongImport',
                                ('"%s" No verses found. Missing "PART" string.'
                                 % self.title))))
                        continue
                if not self.finish():
                    self.logError(file)
-    def stripControlChars(self, text):
+    def readLabelField(self):
        """
-        Get rid of ASCII control characters.
+        Return as a 2-tuple the next two variable-length strings from song file
        """
        label = unicode(self.song_file.read(
            self.readLength()), u'utf-8', u'ignore')
        if label:
            field = unicode(self.song_file.read(
                self.readLength()), u'utf-8', u'ignore')
        else:
            field = u''
        return label, field
-        Illegals chars are ASCII code points 0-31 and 127, except:
+    def readLength(self):
            * ``HT`` (0x09) - Tab
            * ``LF`` (0x0a) - Line feed
            * ``CR`` (0x0d) - Carriage return
        """
-        ILLEGAL_CHARS = u'([\x00-\x08\x0b-\x0c\x0e-\x1f\x7f])'
+        Return the byte-length of the next variable-length string in song file
-        return re.sub(ILLEGAL_CHARS, '', text)
+        """
        this_byte_char = self.song_file.read(1)
        if not this_byte_char:
            return 0
        this_byte = ord(this_byte_char)
        if this_byte < 128:
            return this_byte
        else:
            return (self.readLength() * 128) + (this_byte - 128)