Bug fixes for the Bible imports.

bzr-revno: 751
This commit is contained in:
Raoul Snyman 2010-03-21 17:11:37 +02:00
commit 82a6bc791f
3 changed files with 41 additions and 10 deletions

View File

@ -341,17 +341,17 @@ class ImportWizardForm(QtGui.QWizard, Ui_BibleImportWizard):
download_location = self.field(u'web_location').toInt()[0]
if download_location == DownloadLocation.Crosswalk:
bible = self.web_bible_list[DownloadLocation.Crosswalk][
unicode(self.BibleComboBox.currentText())]
unicode(self.BibleComboBox.currentText(), u'utf8')]
elif download_location == DownloadLocation.BibleGateway:
bible = self.web_bible_list[DownloadLocation.BibleGateway][
unicode(self.BibleComboBox.currentText())]
self.BibleComboBox.currentText()]
importer = self.manager.import_bible(BibleFormat.WebDownload,
name=unicode(self.field(u'license_version').toString()),
name=unicode(self.field(u'license_version').toString(), u'utf8'),
download_source=unicode(DownloadLocation.get_name(download_location)),
download_name=unicode(bible),
proxy_server=unicode(self.field(u'proxy_server').toString()),
proxy_username=unicode(self.field(u'proxy_username').toString()),
proxy_password=unicode(self.field(u'proxy_password').toString())
download_name=unicode(bible, u'utf8'),
proxy_server=unicode(self.field(u'proxy_server').toString(), u'utf8'),
proxy_username=unicode(self.field(u'proxy_username').toString(), u'utf8'),
proxy_password=unicode(self.field(u'proxy_password').toString(), u'utf8')
)
success = importer.do_import()
if success:

View File

@ -27,6 +27,7 @@ import urllib2
import logging
import re
import chardet
import htmlentitydefs
only_verses = re.compile(r'([\w .]+)[ ]+([0-9]+)[ ]*[:|v|V][ ]*([0-9]+)'
r'(?:[ ]*-[ ]*([0-9]+|end))?(?:[ ]*,[ ]*([0-9]+)(?:[ ]*-[ ]*([0-9]+|end))?)?',
@ -115,7 +116,6 @@ def parse_reference(reference):
log.debug(reference_list)
return reference_list
class SearchResults(object):
"""
Encapsulate a set of search results. This is Bible-type independent.
@ -247,3 +247,33 @@ class BibleCommon(object):
start_tag = text.find(u'<')
text = text.replace(u'>', u'')
return text.rstrip().lstrip()
def unescape(text):
    """
    Replace HTML or XML character references and entities in a text string
    with the characters they stand for.

    Courtesy of Fredrik Lundh, http://effbot.org/zone/re-sub.htm#unescape-html

    Numeric references (``&#65;``, ``&#x41;``, ``&#X41;``) and named entities
    (``&amp;``) are decoded. Anything that cannot be decoded -- an unknown
    entity name, malformed digits, or a number ``unichr`` refuses -- is left
    in the text exactly as it was found.

    @param text The HTML (or XML) source text.
    @return The text with every decodable reference and entity replaced.
    """
    def fixup(m):
        """
        Return the replacement for a single ``&...;`` match.
        """
        entity = m.group(0)
        if entity[:2] == u'&#':
            # Numeric character reference; the hexadecimal marker may be
            # written as either "x" or "X".
            try:
                if entity[:3].lower() == u'&#x':
                    return unichr(int(entity[3:-1], 16))
                return unichr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: malformed digits or code point out of range.
                # OverflowError: number too large for unichr to accept.
                pass
        else:
            # Named entity.
            try:
                return unichr(htmlentitydefs.name2codepoint[entity[1:-1]])
            except KeyError:
                pass
        # Leave anything that could not be decoded as is.
        return entity
    # The backslash is doubled so the pattern does not depend on "\w" being
    # passed through as an unrecognised string escape.
    return re.sub(u'&#?\\w+;', fixup, text)

View File

@ -32,7 +32,7 @@ from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from openlp.core.lib import Receiver
from openlp.core.utils import AppLocation
from common import BibleCommon, SearchResults
from common import BibleCommon, SearchResults, unescape
from db import BibleDB
from openlp.plugins.bibles.lib.models import Book
@ -196,7 +196,8 @@ class BGExtract(BibleCommon):
verse_list[verse_number] = u''
continue
if isinstance(verse, NavigableString):
verse_list[verse_number] = verse_list[verse_number] + verse.replace(u'&nbsp;', u' ')
verse_list[verse_number] = verse_list[verse_number] + \
unescape(unicode(verse, u'utf-8').replace(u'&nbsp;', u' '))
# Delete the "0" element, since we don't need it, it's just there for
# some stupid initial whitespace, courtesy of Bible Gateway.
del verse_list[0]