forked from openlp/openlp
Fixed bug #1049977 where some Bibles from Bible Gateway still use the old HTML format.
Fixes: https://launchpad.net/bugs/1049977
This commit is contained in:
parent
8297be4624
commit
19845ebfb4
@ -124,6 +124,8 @@ class BGExtract(object):
|
|||||||
self._remove_elements(tag, 'div', 'footnotes')
|
self._remove_elements(tag, 'div', 'footnotes')
|
||||||
self._remove_elements(tag, 'div', 'crossrefs')
|
self._remove_elements(tag, 'div', 'crossrefs')
|
||||||
self._remove_elements(tag, 'h3')
|
self._remove_elements(tag, 'h3')
|
||||||
|
self._remove_elements(tag, 'h4')
|
||||||
|
self._remove_elements(tag, 'h5')
|
||||||
|
|
||||||
def _extract_verses(self, tags):
|
def _extract_verses(self, tags):
|
||||||
"""
|
"""
|
||||||
@ -161,6 +163,46 @@ class BGExtract(object):
|
|||||||
verse_list[verse] = text
|
verse_list[verse] = text
|
||||||
return verse_list
|
return verse_list
|
||||||
|
|
||||||
|
def _extract_verses_old(self, div):
    """
    Use the old style of parsing for those Bibles on BG who mysteriously
    have not been migrated to the new (still broken) HTML.

    Returns a dictionary mapping integer verse numbers to the unicode
    text collected for each verse.

    ``div``
        The parent div.
    """
    verse_list = {}
    # Cater for inconsistent mark up in the first verse of a chapter.
    first_verse = div.find(u'versenum')
    if first_verse and first_verse.contents:
        verse_list[1] = unicode(first_verse.contents[0])
    for verse in div(u'sup', u'versenum'):
        raw_verse_num = verse.next
        # Not all verses exist in all translations, and they may or may
        # not be represented by a verse number. If they are not, fine. If
        # they are, the number will probably be in a format that breaks
        # int(). We then have no idea what garbage may be sucked in to the
        # verse text, so if we do not get a clean int() we ignore the
        # verse completely.
        try:
            clean_verse_num = int(str(raw_verse_num))
        except ValueError:
            log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
            continue
        if not clean_verse_num:
            # A verse number of zero is treated as "no verse".
            continue
        verse_text = raw_verse_num.next
        if verse_text is None:
            # The parse tree ends right after the verse number, so there
            # is no text to collect for this verse.
            continue
        part = verse_text.next
        # Walk forward through the parse tree until the next verse marker.
        # The ``part is not None`` guard stops the walk cleanly when the
        # tree ends first, instead of failing on ``None.next``.
        while part is not None and not (
                isinstance(part, Tag) and
                part.get(u'class') == u'versenum'):
            # While we are still in the same verse grab all the text.
            if isinstance(part, NavigableString):
                verse_text += part
            if isinstance(part.next, Tag) and part.next.name == u'div':
                # Run out of verses so stop.
                break
            part = part.next
        verse_list[clean_verse_num] = unicode(verse_text)
    return verse_list
|
||||||
|
|
||||||
def get_bible_chapter(self, version, book_name, chapter):
|
def get_bible_chapter(self, version, book_name, chapter):
|
||||||
"""
|
"""
|
||||||
Access and decode Bibles via the BibleGateway website.
|
Access and decode Bibles via the BibleGateway website.
|
||||||
@ -189,7 +231,13 @@ class BGExtract(object):
|
|||||||
Receiver.send_message(u'openlp_process_events')
|
Receiver.send_message(u'openlp_process_events')
|
||||||
div = soup.find('div', 'result-text-style-normal')
|
div = soup.find('div', 'result-text-style-normal')
|
||||||
self._clean_soup(div)
|
self._clean_soup(div)
|
||||||
verse_list = self._extract_verses(div.findAll('span', 'text'))
|
span_list = div.findAll('span', 'text')
|
||||||
|
log.debug('Span list: %s', span_list)
|
||||||
|
if not span_list:
|
||||||
|
# If we don't get any spans then we must have the old HTML format
|
||||||
|
verse_list = self._extract_verses_old(div)
|
||||||
|
else:
|
||||||
|
verse_list = self._extract_verses(span_list)
|
||||||
if not verse_list:
|
if not verse_list:
|
||||||
log.debug(u'No content found in the BibleGateway response.')
|
log.debug(u'No content found in the BibleGateway response.')
|
||||||
send_error_message(u'parse')
|
send_error_message(u'parse')
|
||||||
|
Loading…
Reference in New Issue
Block a user