Changed ZionWorxImport to parse CSV database dump (XML had invalid syntax)

This commit is contained in:
Samuel Findlay 2012-05-24 22:12:48 +10:00
parent b7cde8938c
commit fc87e58a3c
2 changed files with 75 additions and 36 deletions

View File

@ -73,7 +73,7 @@ class CSVBible(BibleDB):
def __init__(self, parent, **kwargs):
"""
Loads a Bible from a set of CVS files.
Loads a Bible from a set of CSV files.
This class assumes the files contain all the information and
a clean bible is being loaded.
"""

View File

@ -28,10 +28,8 @@
The :mod:`zionworximport` module provides the functionality for importing
ZionWorx songs into the OpenLP database.
"""
import csv
import logging
import re
from lxml import etree
from openlp.core.lib import translate
from openlp.plugins.songs.lib.songimport import SongImport
@ -40,40 +38,81 @@ log = logging.getLogger(__name__)
class ZionWorxImport(SongImport):
    """
    The :class:`ZionWorxImport` class provides the ability to import songs
    from ZionWorx, via a dump of the ZionWorx database to a CSV file.

    ZionWorx song database fields:

        * ``SongNum`` Song ID. Discarded by importer.
        * ``Title1`` Main Title.
        * ``Title2`` Alternate Title.
        * ``Lyrics`` Song verses, separated by blank lines.
        * ``Writer`` Song author(s).
        * ``Copyright`` Copyright information.
        * ``Keywords`` Discarded by importer.
        * ``DefaultStyle`` Discarded by importer.

    ZionWorx has no native export function; it uses the proprietary TurboDB
    database engine. The TurboDB vendor, dataWeb, provides tools which can
    export TurboDB tables to other formats, such as the freeware console tool
    TurboDB Data Exchange, which is available for Windows and Linux. This
    command exports the ZionWorx songs table to a CSV file:

    ``tdbdatax MainTable.dat songstable.csv -fsdf -s, -qd``

        * ``-f`` Table format: ``sdf`` denotes text file.
        * ``-s`` Separator character between fields.
        * ``-q`` Quote character surrounding fields. ``d`` denotes
          double-quote.

    CSV format expected by importer:

        * Fields separated by comma ``,``
        * Fields surrounded by double-quotes ``"``. This enables fields (such
          as Lyrics) to include new-lines and commas. Double-quotes within a
          field are denoted by two double-quotes ``""``
        * Note: This is the default format of the Python ``csv`` module.
    """
    def doImport(self):
        """
        Receive a CSV file (from a ZionWorx database dump) to import.

        Every record of ``self.importSource`` is read up front, so that the
        progress bar maximum can be set to the record count, and is then
        turned into a song. A missing file or a malformed CSV file is
        reported through ``self.logError`` and aborts the import. The loop
        stops early once ``self.stopImportFlag`` is set.
        """
        # Imported here because neither name is imported at module level.
        import os
        # NOTE(review): module path assumed from the OpenLP source layout;
        # confirm that WizardStrings (and its ZW member) lives here.
        from openlp.core.ui.wizard import WizardStrings

        if not os.path.isfile(self.importSource):
            # Translate the template first and substitute afterwards,
            # otherwise the translation lookup never matches its source.
            self.logError(
                unicode(translate('SongsPlugin.ZionWorxImport',
                    'No songs to import.')),
                unicode(translate('SongsPlugin.ZionWorxImport',
                    'No %s CSV file found.')) % WizardStrings.ZW)
            return
        fieldnames = [u'SongNum', u'Title1', u'Title2', u'Lyrics',
            u'Writer', u'Copyright', u'Keywords', u'DefaultStyle']
        with open(self.importSource, 'rb') as songs_file:
            songs_reader = csv.DictReader(songs_file, fieldnames)
            try:
                # A single pass over the file: counting the rows with one
                # reader and parsing them with a second reader on the same
                # handle would leave the second reader at end-of-file.
                records = list(songs_reader)
            except csv.Error as e:
                self.logError(
                    unicode(translate('SongsPlugin.ZionWorxImport',
                        'Error reading CSV file.')),
                    unicode(translate('SongsPlugin.ZionWorxImport',
                        'Line %d: %s')) % (songs_reader.line_num, e))
                return
        num_records = len(records)
        log.debug(u'%s records found in CSV file' % num_records)
        self.importWizard.progressBar.setMaximum(num_records)
        for record in records:
            if self.stopImportFlag:
                return
            self.setDefaults()
            self.title = unicode(record[u'Title1'])
            if record[u'Title2']:
                self.alternateTitle = unicode(record[u'Title2'])
            self.parseAuthor(unicode(record[u'Writer']))
            self.addCopyright(unicode(record[u'Copyright']))
            self.processSongText(unicode(record[u'Lyrics']))
            if not self.finish():
                self.logError(self.title)