Fixed bug #1049977 where some Bibles from Bible Gateway still used the old HTML format.

bzr-revno: 2059 Fixes: https://launchpad.net/bugs/1049977
2012-09-15 00:06:04 +02:00 · 2012-09-15 00:06:04 +02:00 · 42d751fa9a
commit 42d751fa9a
parent 8297be4624 19845ebfb4
1 changed files with 49 additions and 1 deletions
--- a/openlp/plugins/bibles/lib/http.py
+++ b/openlp/plugins/bibles/lib/http.py
@ -124,6 +124,8 @@ class BGExtract(object):
        self._remove_elements(tag, 'div', 'footnotes')
        self._remove_elements(tag, 'div', 'crossrefs')
        self._remove_elements(tag, 'h3')
        self._remove_elements(tag, 'h4')
        self._remove_elements(tag, 'h5')
    def _extract_verses(self, tags):
        """
@ -161,6 +163,46 @@ class BGExtract(object):
            verse_list[verse] = text
        return verse_list
    def _extract_verses_old(self, div):
        """
        Use the old style of parsing for those Bibles on BG who mysteriously
        have not been migrated to the new (still broken) HTML.

        ``div``
            The parent div.

        Returns a dictionary mapping each integer verse number to the verse
        text as a unicode string. Verses whose number cannot be converted
        cleanly with ``int()`` are logged and left out of the result.
        """
        verse_list = {}
        # Cater for inconsistent mark up in the first verse of a chapter.
        first_verse = div.find(u'versenum')
        if first_verse and first_verse.contents:
            verse_list[1] = unicode(first_verse.contents[0])
        # Calling the tag is the BeautifulSoup shorthand for findAll(), so
        # this visits every <sup class="versenum"> element. An entry stored
        # for verse 1 above is replaced if the loop also finds a verse 1.
        for verse in div(u'sup', u'versenum'):
            # The element directly after the <sup> tag is expected to be the
            # verse number text.
            raw_verse_num = verse.next
            clean_verse_num = 0
            # Not all verses exist in all translations and may or may not be
            # represented by a verse number. If they are not fine, if they are
            # it will probably be in a format that breaks int(). We will then
            # have no idea what garbage may be sucked in to the verse text so
            # if we do not get a clean int() then ignore the verse completely.
            try:
                clean_verse_num = int(str(raw_verse_num))
            except ValueError:
                log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
            if not clean_verse_num:
                continue
            verse_text = raw_verse_num.next
            if verse_text is None:
                # The document ended straight after the verse number, so there
                # is no text to collect for this verse.
                continue
            part = verse_text.next
            # ``part`` becomes None when the walk runs off the end of the
            # parsed document; stop there instead of dereferencing None.
            while part is not None and not (isinstance(part, Tag) and
                    part.get(u'class') == u'versenum'):
                # While we are still in the same verse grab all the text.
                if isinstance(part, NavigableString):
                    verse_text += part
                if isinstance(part.next, Tag) and part.next.name == u'div':
                    # Run out of verses so stop.
                    break
                part = part.next
            verse_list[clean_verse_num] = unicode(verse_text)
        return verse_list
    def get_bible_chapter(self, version, book_name, chapter):
        """
        Access and decode Bibles via the BibleGateway website.
@ -189,7 +231,13 @@ class BGExtract(object):
        Receiver.send_message(u'openlp_process_events')
        div = soup.find('div', 'result-text-style-normal')
        self._clean_soup(div)
-        verse_list = self._extract_verses(div.findAll('span', 'text'))
+        span_list = div.findAll('span', 'text')
        log.debug('Span list: %s', span_list)
        if not span_list:
            # If we don't get any spans then we must have the old HTML format
            verse_list = self._extract_verses_old(div)
        else:
            verse_list = self._extract_verses(span_list)
        if not verse_list:
            log.debug(u'No content found in the BibleGateway response.')
            send_error_message(u'parse')