Fixed bug #1049977 where some Bibles from Bible Gateway still used the old HTML format.

bzr-revno: 2059
Fixes: https://launchpad.net/bugs/1049977
This commit is contained in:
Raoul Snyman 2012-09-15 00:06:04 +02:00
commit 42d751fa9a

View File

@ -124,6 +124,8 @@ class BGExtract(object):
self._remove_elements(tag, 'div', 'footnotes') self._remove_elements(tag, 'div', 'footnotes')
self._remove_elements(tag, 'div', 'crossrefs') self._remove_elements(tag, 'div', 'crossrefs')
self._remove_elements(tag, 'h3') self._remove_elements(tag, 'h3')
self._remove_elements(tag, 'h4')
self._remove_elements(tag, 'h5')
def _extract_verses(self, tags): def _extract_verses(self, tags):
""" """
@ -161,6 +163,46 @@ class BGExtract(object):
verse_list[verse] = text verse_list[verse] = text
return verse_list return verse_list
def _extract_verses_old(self, div):
    """
    Use the old style of parsing for those Bibles on BG who mysteriously
    have not been migrated to the new (still broken) HTML.

    ``div``
        The parent div.

    Returns a dict mapping integer verse numbers to the unicode text
    collected for that verse. Verse markers whose number cannot be
    converted with ``int()`` are logged and skipped.
    """
    verse_list = {}
    # Cater for inconsistent mark up in the first verse of a chapter.
    # NOTE(review): this looks up a tag *named* ``versenum``, while the loop
    # below walks ``sup`` tags filtered by ``versenum`` -- confirm that the
    # old markup really carries such a tag.
    first_verse = div.find(u'versenum')
    if first_verse and first_verse.contents:
        verse_list[1] = unicode(first_verse.contents[0])
    for verse in div(u'sup', u'versenum'):
        raw_verse_num = verse.next
        clean_verse_num = 0
        # Not all verses exist in all translations and may or may not be
        # represented by a verse number. If they are not fine, if they are
        # it will probably be in a format that breaks int(). We will then
        # have no idea what garbage may be sucked in to the verse text so
        # if we do not get a clean int() then ignore the verse completely.
        try:
            clean_verse_num = int(str(raw_verse_num))
        except ValueError:
            log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
        if not clean_verse_num:
            continue
        verse_text = raw_verse_num.next
        if verse_text is None:
            # The verse number is the very last node of the parse tree, so
            # there is nothing to collect for it.
            continue
        part = verse_text.next
        # ``part`` becomes None when the walk falls off the end of the parse
        # tree; stop there instead of dereferencing None.
        while part is not None and not (
                isinstance(part, Tag) and
                part.get(u'class') == u'versenum'):
            # While we are still in the same verse grab all the text.
            if isinstance(part, NavigableString):
                verse_text += part
            if isinstance(part.next, Tag) and part.next.name == u'div':
                # Run out of verses so stop.
                break
            part = part.next
        verse_list[clean_verse_num] = unicode(verse_text)
    return verse_list
def get_bible_chapter(self, version, book_name, chapter): def get_bible_chapter(self, version, book_name, chapter):
""" """
Access and decode Bibles via the BibleGateway website. Access and decode Bibles via the BibleGateway website.
@ -189,7 +231,13 @@ class BGExtract(object):
Receiver.send_message(u'openlp_process_events') Receiver.send_message(u'openlp_process_events')
div = soup.find('div', 'result-text-style-normal') div = soup.find('div', 'result-text-style-normal')
self._clean_soup(div) self._clean_soup(div)
verse_list = self._extract_verses(div.findAll('span', 'text')) span_list = div.findAll('span', 'text')
log.debug('Span list: %s', span_list)
if not span_list:
# If we don't get any spans then we must have the old HTML format
verse_list = self._extract_verses_old(div)
else:
verse_list = self._extract_verses(span_list)
if not verse_list: if not verse_list:
log.debug(u'No content found in the BibleGateway response.') log.debug(u'No content found in the BibleGateway response.')
send_error_message(u'parse') send_error_message(u'parse')