forked from openlp/openlp
Fixed bug #1049977 where some Bibles from Bible Gateway still use the old HTML format.
Fixes: https://launchpad.net/bugs/1049977
This commit is contained in:
parent
8297be4624
commit
19845ebfb4
@ -124,6 +124,8 @@ class BGExtract(object):
|
||||
self._remove_elements(tag, 'div', 'footnotes')
|
||||
self._remove_elements(tag, 'div', 'crossrefs')
|
||||
self._remove_elements(tag, 'h3')
|
||||
self._remove_elements(tag, 'h4')
|
||||
self._remove_elements(tag, 'h5')
|
||||
|
||||
def _extract_verses(self, tags):
|
||||
"""
|
||||
@ -161,6 +163,46 @@ class BGExtract(object):
|
||||
verse_list[verse] = text
|
||||
return verse_list
|
||||
|
||||
def _extract_verses_old(self, div):
    """
    Use the old style of parsing for those Bibles on BG who mysteriously
    have not been migrated to the new (still broken) HTML.

    ``div``
        The parent div.

    Returns a dict mapping each integer verse number to the unicode text
    collected for that verse. Verses whose number cannot be converted to
    an int are logged and left out of the result.
    """
    verse_list = {}
    # Cater for inconsistent mark up in the first verse of a chapter.
    first_verse = div.find(u'versenum')
    if first_verse and first_verse.contents:
        verse_list[1] = unicode(first_verse.contents[0])
    # Calling the tag searches it for <sup class="versenum"> elements
    # (BeautifulSoup shortcut for findAll).
    for verse in div(u'sup', u'versenum'):
        raw_verse_num = verse.next
        clean_verse_num = 0
        # Not all verses exist in all translations and may or may not be
        # represented by a verse number. If they are not fine, if they are
        # it will probably be in a format that breaks int(). We will then
        # have no idea what garbage may be sucked in to the verse text so
        # if we do not get a clean int() then ignore the verse completely.
        try:
            clean_verse_num = int(str(raw_verse_num))
        except ValueError:
            log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
        if not clean_verse_num:
            continue
        verse_text = raw_verse_num.next
        if verse_text is None:
            # The verse number is the very last node that was parsed, so
            # there is no text to attach to it; skip it rather than fail
            # with an AttributeError on None.
            continue
        part = verse_text.next
        # Walk forward node by node until the next verse marker. The
        # ``part is not None`` guard stops the walk cleanly when the parsed
        # document ends before another marker or a <div> is reached.
        while part is not None and not (isinstance(part, Tag) and
                part.get(u'class') == u'versenum'):
            # While we are still in the same verse grab all the text.
            if isinstance(part, NavigableString):
                verse_text += part
            if isinstance(part.next, Tag) and part.next.name == u'div':
                # Run out of verses so stop.
                break
            part = part.next
        verse_list[clean_verse_num] = unicode(verse_text)
    return verse_list
|
||||
|
||||
def get_bible_chapter(self, version, book_name, chapter):
|
||||
"""
|
||||
Access and decode Bibles via the BibleGateway website.
|
||||
@ -189,7 +231,13 @@ class BGExtract(object):
|
||||
Receiver.send_message(u'openlp_process_events')
|
||||
div = soup.find('div', 'result-text-style-normal')
|
||||
self._clean_soup(div)
|
||||
verse_list = self._extract_verses(div.findAll('span', 'text'))
|
||||
span_list = div.findAll('span', 'text')
|
||||
log.debug('Span list: %s', span_list)
|
||||
if not span_list:
|
||||
# If we don't get any spans then we must have the old HTML format
|
||||
verse_list = self._extract_verses_old(div)
|
||||
else:
|
||||
verse_list = self._extract_verses(span_list)
|
||||
if not verse_list:
|
||||
log.debug(u'No content found in the BibleGateway response.')
|
||||
send_error_message(u'parse')
|
||||
|
Loading…
Reference in New Issue
Block a user