Fixed bug #991150: BibleGateway has new HTML, so we need to change our parsing. This now parses the new HTML correctly.

Fixes: https://launchpad.net/bugs/991150
2012-06-12 23:25:31 +02:00 · 2012-06-12 23:25:31 +02:00 · 4e0b9c30d3
commit 4e0b9c30d3
parent e0c5fabc5a
1 changed file with 109 additions and 54 deletions
--- a/openlp/plugins/bibles/lib/http.py
+++ b/openlp/plugins/bibles/lib/http.py
@@ -43,6 +43,15 @@ from openlp.plugins.bibles.lib import SearchResults
 from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, \
    Book

+UGLY_CHARS = {
+    u'\u2014': u' - ',
+    u'\u2018': u'\'',
+    u'\u2019': u'\'',
+    u'\u201c': u'"',
+    u'\u201d': u'"',
+    u'&nbsp;': u' '
+}
+
 log = logging.getLogger(__name__)

 class BGExtract(object):
@@ -54,6 +63,103 @@ class BGExtract(object):
        self.proxy_url = proxy_url
        socket.setdefaulttimeout(30)

+    def _remove_elements(self, parent, tag, class_=None):
+        """
+        Remove all matching elements from the BeautifulSoup tree.
+
+        ``parent``
+            The element from which items need to be removed.
+
+        ``tag``
+            A string of the tag type, e.g. "div"
+
+        ``class_``
+            An HTML class attribute for further qualification.
+        """
+        if class_:
+            all_tags = parent.findAll(tag, class_)
+        else:
+            all_tags = parent.findAll(tag)
+        for element in all_tags:
+            element.extract()
+
+    def _extract_verse(self, tag):
+        """
+        Extract a verse (or part of one) from a tag as a (number, text) pair.
+
+        ``tag``
+            The BeautifulSoup Tag or NavigableString to extract text from.
+        """
+        if isinstance(tag, NavigableString):
+            return None, unicode(tag)
+        elif tag.get('class') == 'versenum':
+            verse = unicode(tag.string)\
+                .replace('[', '').replace(']', '').strip()
+            return verse, None
+        elif tag.get('class') == 'chapternum':
+            verse = '1'
+            return verse, None
+        else:
+            verse, text = None, ''
+            for child in tag.contents:
+                c_verse, c_text = self._extract_verse(child)
+                if c_verse:
+                    verse = c_verse
+                if text and c_text:
+                    text += c_text
+                elif c_text is not None:
+                    text = c_text
+            return verse, text
+
+    def _clean_soup(self, tag):
+        """
+        Remove cross-references, footnotes and h3 headings from the HTML page.
+
+        ``tag``
+            The base tag within which we want to remove stuff.
+        """
+        self._remove_elements(tag, 'sup', 'crossreference')
+        self._remove_elements(tag, 'sup', 'footnote')
+        self._remove_elements(tag, 'div', 'footnotes')
+        self._remove_elements(tag, 'div', 'crossrefs')
+        self._remove_elements(tag, 'h3')
+
+    def _extract_verses(self, tags):
+        """
+        Extract all the verses from a pre-prepared list of HTML tags.
+
+        ``tags``
+            A list of BeautifulSoup Tag elements.
+        """
+        verses = []
+        tags = tags[::-1]
+        current_text = ''
+        for tag in tags:
+            verse, text = None, ''
+            for child in tag.contents:
+                c_verse, c_text = self._extract_verse(child)
+                if c_verse:
+                    verse = c_verse
+                if text and c_text:
+                    text += c_text
+                elif c_text is not None:
+                    text = c_text
+            if not verse:
+                current_text = text + ' ' + current_text
+            else:
+                text += ' ' + current_text
+                current_text = ''
+            if text:
+                for old, new in UGLY_CHARS.iteritems():
+                    text = text.replace(old, new)
+                text = u' '.join(text.split())
+            if verse and text:
+                verses.append((int(verse.strip()), text))
+        verse_list = {}
+        for verse, text in verses[::-1]:
+            verse_list[verse] = text
+        return verse_list
+
    def get_bible_chapter(self, version, book_name, chapter):
        """
        Access and decode Bibles via the BibleGateway website.
@@ -80,60 +186,9 @@ class BGExtract(object):
        if not soup:
            return None
        Receiver.send_message(u'openlp_process_events')
-        footnotes = soup.findAll(u'sup', u'footnote')
-        if footnotes:
-            for footnote in footnotes:
-                footnote.extract()
-        crossrefs = soup.findAll(u'sup', u'xref')
-        if crossrefs:
-            for crossref in crossrefs:
-                crossref.extract()
-        headings = soup.findAll(u'h5')
-        if headings:
-            for heading in headings:
-                heading.extract()
-        chapter_notes = soup.findAll('div', 'footnotes')
-        if chapter_notes:
-            log.debug('Found chapter notes')
-            for note in chapter_notes:
-                note.extract()
-        note_comments = soup.findAll(text=u'end of footnotes')
-        if note_comments:
-            for comment in note_comments:
-                comment.extract()
-        cleanup = [(re.compile('\s+'), lambda match: ' ')]
-        verses = BeautifulSoup(str(soup), markupMassage=cleanup)
-        verse_list = {}
-        # Cater for inconsistent mark up in the first verse of a chapter.
-        first_verse = verses.find(u'versenum')
-        if first_verse and first_verse.contents:
-            verse_list[1] = unicode(first_verse.contents[0])
-        for verse in verses(u'sup', u'versenum'):
-            raw_verse_num = verse.next
-            clean_verse_num = 0
-            # Not all verses exist in all translations and may or may not be
-            # represented by a verse number. If they are not fine, if they are
-            # it will probably be in a format that breaks int(). We will then
-            # have no idea what garbage may be sucked in to the verse text so
-            # if we do not get a clean int() then ignore the verse completely.
-            try:
-                clean_verse_num = int(str(raw_verse_num))
-            except ValueError:
-                log.warn(u'Illegal verse number in %s %s %s:%s',
-                    version, book_name, chapter, unicode(raw_verse_num))
-            if clean_verse_num:
-                verse_text = raw_verse_num.next
-                part = raw_verse_num.next.next
-                while not (isinstance(part, Tag) and part.attrMap and
-                    part.attrMap[u'class'] == u'versenum'):
-                    # While we are still in the same verse grab all the text.
-                    if isinstance(part, NavigableString):
-                        verse_text = verse_text + part
-                    if isinstance(part.next, Tag) and part.next.name == u'div':
-                        # Run out of verses so stop.
-                        break
-                    part = part.next
-                verse_list[clean_verse_num] = unicode(verse_text)
+        div = soup.find('div', 'result-text-style-normal')
+        self._clean_soup(div)
+        verse_list = self._extract_verses(div.findAll('span', 'text'))
        if not verse_list:
            log.debug(u'No content found in the BibleGateway response.')
            send_error_message(u'parse')