Changed ZionWorxImport to parse CSV database dump (XML had invalid syntax)

This commit is contained in:
Samuel Findlay 2012-05-24 22:12:48 +10:00
parent b7cde8938c
commit fc87e58a3c
2 changed files with 75 additions and 36 deletions

View File

@ -73,7 +73,7 @@ class CSVBible(BibleDB):
def __init__(self, parent, **kwargs): def __init__(self, parent, **kwargs):
""" """
Loads a Bible from a set of CVS files. Loads a Bible from a set of CSV files.
This class assumes the files contain all the information and This class assumes the files contain all the information and
a clean bible is being loaded. a clean bible is being loaded.
""" """

View File

@ -28,10 +28,8 @@
The :mod:`zionworximport` module provides the functionality for importing The :mod:`zionworximport` module provides the functionality for importing
ZionWorx songs into the OpenLP database. ZionWorx songs into the OpenLP database.
""" """
import csv
import logging import logging
import re
from lxml import etree
from openlp.core.lib import translate from openlp.core.lib import translate
from openlp.plugins.songs.lib.songimport import SongImport from openlp.plugins.songs.lib.songimport import SongImport
@ -40,40 +38,81 @@ log = logging.getLogger(__name__)
class ZionWorxImport(SongImport): class ZionWorxImport(SongImport):
""" """
The :class:`ZionWorxImport` class provides the ability to import... The :class:`ZionWorxImport` class provides the ability to import songs
from ZionWorx, via a dump of the ZionWorx database to a CSV file.
ZionWorx song database fields:
* ``SongNum`` Song ID. Discarded by importer.
* ``Title1`` Main Title.
* ``Title2`` Alternate Title.
* ``Lyrics`` Song verses, separated by blank lines.
* ``Writer`` Song author(s).
* ``Copyright`` Copyright information.
* ``Keywords`` Discarded by importer.
* ``DefaultStyle`` Discarded by importer.
ZionWorx has no native export function; it uses the proprietary TurboDB
database engine. The TurboDB vendor, dataWeb, provides tools which can
export TurboDB tables to other formats, such as the freeware console tool
TurboDB Data Exchange, which is available for Windows and Linux. This command
exports the ZionWorx songs table to a CSV file:
``tdbdatax MainTable.dat songstable.csv -fsdf -s, -qd``
* ``-f`` Table format: ``sdf`` denotes text file.
* ``-s`` Separator character between fields.
* ``-q`` Quote character surrounding fields. ``d`` denotes double-quote.
CSV format expected by importer:
* Fields separated by comma ``,``
* Fields surrounded by double-quotes ``"``. This enables fields (such as
Lyrics) to include new-lines and commas. Double-quotes within a field
are denoted by two double-quotes ``""``
* Note: This is the default format of the Python ``csv`` module.
""" """
def doImport(self): def doImport(self):
""" """
Receive ... to import. Receive a CSV file (from a ZionWorx database dump) to import.
""" """
#open xml file if not os.path.isfile(self.importSource):
with open(self.importSource, 'rb') as f: self.logError(unicode(translate('SongsPlugin.ZionWorxImport',
songs_xml = unicode(f.read(), u'utf-8') 'No songs to import.')),
# check single xml file unicode(translate('SongsPlugin.ZionWorxImport',
if not re.match(ur' *<\?xml[^<>]*\?>', songs_xml): 'No %s CSV file found.' % WizardStrings.ZW)))
# Error: invalid file (no XML declaration) return
print u'Error: invalid file (no XML declaration)' with open(self.importSource, 'rb') as songs_file:
else: songs_reader = csv.reader(songs_file)
# clean invalid XML try:
# remove DefaultStyle attribute if non-empty num_records = sum(1 for _ in songs_reader)
songs_xml = re.sub(ur'DefaultStyle=".+" />', u'/>', songs_xml) except csv.Error, e:
# replace & with &amp; (skip existing entities) self.logError(unicode(translate('SongsPlugin.ZionWorxImport',
songs_xml = re.sub(ur'&(?![a-zA-Z#][a-zA-Z0-9]*;)', u'&amp;', 'Error reading CSV file.')),
songs_xml) unicode(translate('SongsPlugin.ZionWorxImport',
# replace < with &lt; (skip known <tags>) 'Line %d: %s' % songs_reader.line_num, e)))
songs_xml = re.sub(ur'<(?![?DMFR/])', u'&lt;', songs_xml) log.debug(u'%s records found in CSV file' % num_records)
# replace " within Lyrics attribute with &quot; self.importWizard.progressBar.setMaximum(num_records)
songs_xml = re.sub(ur'(?<=Lyrics=")([^<]*)(?=" Writer=)', fieldnames = [u'SongNum', u'Title1', u'Title2', u'Lyrics',
self._escapeQuotes, songs_xml) u'Writer', u'Copyright', u'Keywords', u'DefaultStyle']
print songs_xml songs_reader_dict= csv.DictReader(songs_file, fieldnames)
try:
# parse XML for record in songs_reader_dict:
tree = etree.fromstring(songs_xml.encode(u'utf-8')) if self.stopImportFlag:
for song in tree[1].iterchildren(): return
for attrib, value in song.attrib.items(): self.setDefaults()
print attrib + ':', value self.title = unicode(record[u'Title1'])
print '' if record[u'Title2']:
self.alternateTitle = unicode(record[u'Title2'])
def _escapeQuotes(self, m): self.parseAuthor(unicode(record[u'Writer']))
return m.group(0).replace('"', '&quot;') self.addCopyright(unicode(record[u'Copyright']))
self.processSongText(unicode(record[u'Lyrics']))
if not self.finish():
self.logError(self.title)
except csv.Error, e:
self.logError(unicode(translate('SongsPlugin.ZionWorxImport',
'Error reading CSV file.')),
unicode(translate('SongsPlugin.ZionWorxImport',
'Line %d: %s' % songs_reader_dict.line_num, e)))