Bug fixes for the Bible imports.

bzr-revno: 751
2010-03-21 17:11:37 +02:00 · 2010-03-21 17:11:37 +02:00 · 82a6bc791f
commit 82a6bc791f
parent 236fd6c739 253c396d7c
3 changed files with 41 additions and 10 deletions
--- a/openlp/plugins/bibles/forms/importwizardform.py
+++ b/openlp/plugins/bibles/forms/importwizardform.py
@ -341,17 +341,17 @@ class ImportWizardForm(QtGui.QWizard, Ui_BibleImportWizard):
            download_location = self.field(u'web_location').toInt()[0]
            if download_location == DownloadLocation.Crosswalk:
                bible = self.web_bible_list[DownloadLocation.Crosswalk][
-                    unicode(self.BibleComboBox.currentText())]
+                    unicode(self.BibleComboBox.currentText(), u'utf8')]
            elif download_location == DownloadLocation.BibleGateway:
                bible = self.web_bible_list[DownloadLocation.BibleGateway][
-                    unicode(self.BibleComboBox.currentText())]
+                    self.BibleComboBox.currentText()]
            importer = self.manager.import_bible(BibleFormat.WebDownload,
-                name=unicode(self.field(u'license_version').toString()),
+                name=unicode(self.field(u'license_version').toString(), u'utf8'),
                download_source=unicode(DownloadLocation.get_name(download_location)),
-                download_name=unicode(bible),
-                proxy_server=unicode(self.field(u'proxy_server').toString()),
-                proxy_username=unicode(self.field(u'proxy_username').toString()),
-                proxy_password=unicode(self.field(u'proxy_password').toString())
+                download_name=unicode(bible, u'utf8'),
+                proxy_server=unicode(self.field(u'proxy_server').toString(), u'utf8'),
+                proxy_username=unicode(self.field(u'proxy_username').toString(), u'utf8'),
+                proxy_password=unicode(self.field(u'proxy_password').toString(), u'utf8')
            )
        success = importer.do_import()
        if success:
--- a/openlp/plugins/bibles/lib/common.py
+++ b/openlp/plugins/bibles/lib/common.py
@ -27,6 +27,7 @@ import urllib2
 import logging
 import re
 import chardet
+import htmlentitydefs

 only_verses = re.compile(r'([\w .]+)[ ]+([0-9]+)[ ]*[:|v|V][ ]*([0-9]+)'
    r'(?:[ ]*-[ ]*([0-9]+|end))?(?:[ ]*,[ ]*([0-9]+)(?:[ ]*-[ ]*([0-9]+|end))?)?',
@ -115,7 +116,6 @@ def parse_reference(reference):
    log.debug(reference_list)
    return reference_list

-
 class SearchResults(object):
    """
    Encapsulate a set of search results. This is Bible-type independant.
@ -247,3 +247,33 @@ class BibleCommon(object):
            start_tag = text.find(u'<')
        text = text.replace(u'>', u'')
        return text.rstrip().lstrip()
+
+
+def unescape(text):
+    """
+    Replaces HTML or XML character references and entities in a text string.
+    Courtesy of Fredrik Lundh, http://effbot.org/zone/re-sub.htm#unescape-html
+
+    @param text The HTML (or XML) source text.
+    @return The text with recognised references decoded; others are left as is.
+    """
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == u'&#':
+            # numeric character reference: &#NNN; (decimal) or &#xHHH; (hex)
+            try:
+                if text[:3] == u'&#x':
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass
+        else:
+            # named entity: look the name up in htmlentitydefs.name2codepoint
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        return text # decoded entity, or the original match if not decodable
+    return re.sub(u'&#?\w+;', fixup, text)
+
--- a/openlp/plugins/bibles/lib/http.py
+++ b/openlp/plugins/bibles/lib/http.py
@ -32,7 +32,7 @@ from BeautifulSoup import BeautifulSoup, Tag, NavigableString

 from openlp.core.lib import Receiver
 from openlp.core.utils import AppLocation
-from common import BibleCommon, SearchResults
+from common import BibleCommon, SearchResults, unescape
 from db import BibleDB
 from openlp.plugins.bibles.lib.models import Book

@ -196,7 +196,8 @@ class BGExtract(BibleCommon):
                verse_list[verse_number] = u''
                continue
            if isinstance(verse, NavigableString):
-                verse_list[verse_number] = verse_list[verse_number] + verse.replace(u'&nbsp;', u' ')
+                verse_list[verse_number] = verse_list[verse_number] + \
+                    unescape(unicode(verse, u'utf-8').replace(u'&nbsp;', u' '))
        # Delete the "0" element, since we don't need it, it's just there for
        # some stupid initial whitespace, courtesy of Bible Gateway.
        del verse_list[0]