Fixed the BibleGateway importer.

bzr-revno: 747
2010-03-19 09:54:36 +02:00 · 2010-03-19 09:54:36 +02:00 · ee083c30d5
parent 7f5862e9c2 2a903d6a5b
commit ee083c30d5
2 changed files with 59 additions and 40 deletions
--- a/openlp/core/utils/init.py
+++ b/openlp/core/utils/init.py
@ -23,13 +23,14 @@
 # Temple Place, Suite 330, Boston, MA 02111-1307 USA                          #
 ###############################################################################
 import openlp
 import os
 import sys
 import logging
 import urllib2
 from datetime import datetime
 import openlp
 log = logging.getLogger(__name__)
 class AppLocation(object):
--- a/openlp/plugins/bibles/lib/http.py
+++ b/openlp/plugins/bibles/lib/http.py
@ -28,7 +28,7 @@ import urllib2
 import os
 import sqlite3
-from BeautifulSoup import BeautifulSoup
+from BeautifulSoup import BeautifulSoup, Tag, NavigableString
 from openlp.core.lib import Receiver
 from openlp.core.utils import AppLocation
@ -146,44 +146,62 @@ class BGExtract(BibleCommon):
        urlstring = u'http://www.biblegateway.com/passage/?search=%s+%s' \
            u'&version=%s' % (bookname, chapter, version)
        log.debug(u'BibleGateway url = %s' % urlstring)
-        xml_string = self._get_web_text(urlstring, self.proxyurl)
+        # Let's get the page, and then open it in BeautifulSoup, so as to
-        verseSearch = u'<sup class=\"versenum'
+        # attempt to make "easy" work of bad HTML.
-        verseFootnote = u'<sup class=\'footnote'
+        page = urllib2.urlopen(urlstring)
-        verse = 1
+        soup = BeautifulSoup(page)
-        i = xml_string.find(u'result-text-style-normal') + 26
+        verses = soup.find(u'div', u'result-text-style-normal')
-        xml_string = xml_string[i:len(xml_string)]
+        verse_number = 0
-        versePos = xml_string.find(verseSearch)
+        verse_list = {0: u''}
-        bible = {}
+        # http://www.codinghorror.com/blog/2009/11/parsing-html-the-cthulhu-way.html
-        while versePos > -1:
+        # This is a PERFECT example of opening the Cthulhu tag!
-            # clear out string
+        # O Bible Gateway, why doth ye such horrific HTML produce?
-            verseText = u''
+        for verse in verses:
-            versePos = xml_string.find(u'</sup>', versePos) + 6
+            if isinstance(verse, Tag) and verse.name == u'div' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] == u'footnotes':
-            i = xml_string.find(verseSearch, versePos + 1)
+                break
-            # Not sure if this is needed now
+            if isinstance(verse, Tag) and verse.name == u'sup' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] != u'versenum':
-            if i == -1:
+                continue
-                i = xml_string.find(u'</div', versePos + 1)
+            if isinstance(verse, Tag) and verse.name == u'p' and not verse.contents:
-                j = xml_string.find(u'<strong', versePos + 1)
+                continue
-                if j > 0 and j < i:
+            if isinstance(verse, Tag) and (verse.name == u'p' or verse.name == u'font') and verse.contents:
-                    i = j
+                for item in verse.contents:
-                verseText = xml_string[versePos + 7 : i ]
+                    if isinstance(item, Tag) and (item.name == u'h4' or item.name == u'h5'):
-                # store the verse
+                        continue
-                bible[verse] = self._clean_text(verseText)
+                    if isinstance(item, Tag) and item.name == u'sup' and filter(lambda a: a[0] == u'class', item.attrs)[0][1] != u'versenum':
-                versePos = -1
+                        continue
-            else:
+                    if isinstance(item, Tag) and item.name == u'p' and not item.contents:
-                verseText = xml_string[versePos: i]
+                        continue
-                start_tag = verseText.find(verseFootnote)
+                    if isinstance(item, Tag) and item.name == u'sup':
-                while start_tag > -1:
+                        verse_number = int(str(item.contents[0]))
-                    end_tag = verseText.find(u'</sup>')
+                        verse_list[verse_number] = u''
-                    verseText = verseText[:start_tag] + verseText[end_tag + 6:len(verseText)]
+                        continue
-                    start_tag = verseText.find(verseFootnote)
+                    if isinstance(item, Tag) and item.name == u'font':
-                # Chop off verse and start again
+                        for subitem in item.contents:
-                xml_string = xml_string[i:]
+                            if isinstance(subitem, Tag) and subitem.name == u'sup' and filter(lambda a: a[0] == u'class', subitem.attrs)[0][1] != u'versenum':
-                #look for the next verse
+                                continue
-                versePos = xml_string.find(verseSearch)
+                            if isinstance(subitem, Tag) and subitem.name == u'p' and not subitem.contents:
-                # store the verse
+                                continue
-                bible[verse] = self._clean_text(verseText)
+                            if isinstance(subitem, Tag) and subitem.name == u'sup':
-                verse += 1
+                                verse_number = int(str(subitem.contents[0]))
-        return SearchResults(bookname, chapter, bible)
+                                verse_list[verse_number] = u''
                                continue
                            if isinstance(subitem, NavigableString):
                                verse_list[verse_number] = verse_list[verse_number] + subitem.replace(u'&nbsp;', u' ')
                        continue
                    if isinstance(item, NavigableString):
                        verse_list[verse_number] = verse_list[verse_number] + item.replace(u'&nbsp;', u' ')
                continue
            if isinstance(verse, Tag) and verse.name == u'sup':
                verse_number = int(str(verse.contents[0]))
                verse_list[verse_number] = u''
                continue
            if isinstance(verse, NavigableString):
                verse_list[verse_number] = verse_list[verse_number] + verse.replace(u'&nbsp;', u' ')
        # Delete the "0" element, since we don't need it: it only collects the
        # initial whitespace before the first verse, courtesy of Bible Gateway.
        del verse_list[0]
        # Finally, return the list of verses in a "SearchResults" object.
        return SearchResults(bookname, chapter, verse_list)
 class CWExtract(BibleCommon):
    log.info(u'%s CWExtract loaded', __name__)