forked from openlp/openlp
Fixed bug #991150: BibleGateway has new HTML, so we need to change our parsing. This now parses the new HTML correctly.
Fixes: https://launchpad.net/bugs/991150
This commit is contained in:
parent
e0c5fabc5a
commit
4e0b9c30d3
@ -43,6 +43,15 @@ from openlp.plugins.bibles.lib import SearchResults
|
|||||||
from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, \
|
from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, \
|
||||||
Book
|
Book
|
||||||
|
|
||||||
|
# Typographic characters found in scraped verse text, mapped to the plain
# replacements that are substituted for them before a verse is stored.
UGLY_CHARS = {
    u'\u2014': u' - ',  # em dash
    u'\u2018': u'\'',   # left single quotation mark
    u'\u2019': u'\'',   # right single quotation mark
    u'\u201c': u'"',    # left double quotation mark
    u'\u201d': u'"',    # right double quotation mark
    # NOTE(review): this entry previously read as a plain space mapped to a
    # plain space, which replaces nothing. Presumably the key was meant to be
    # a non-breaking space (or its HTML entity) - confirm against the markup
    # actually returned by the site. Both spellings are covered here.
    u'\xa0': u' ',
    u'&nbsp;': u' ',
}
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
class BGExtract(object):
|
class BGExtract(object):
|
||||||
@ -54,6 +63,103 @@ class BGExtract(object):
|
|||||||
self.proxy_url = proxy_url
|
self.proxy_url = proxy_url
|
||||||
socket.setdefaulttimeout(30)
|
socket.setdefaulttimeout(30)
|
||||||
|
|
||||||
|
def _remove_elements(self, parent, tag, class_=None):
|
||||||
|
"""
|
||||||
|
Remove a particular element from the BeautifulSoup tree.
|
||||||
|
|
||||||
|
``parent``
|
||||||
|
The element from which items need to be removed.
|
||||||
|
|
||||||
|
``tag``
|
||||||
|
A string of the tab type, e.g. "div"
|
||||||
|
|
||||||
|
``class_``
|
||||||
|
An HTML class attribute for further qualification.
|
||||||
|
"""
|
||||||
|
if class_:
|
||||||
|
all_tags = parent.findAll(tag, class_)
|
||||||
|
else:
|
||||||
|
all_tags = parent.findAll(tag)
|
||||||
|
for element in all_tags:
|
||||||
|
element.extract()
|
||||||
|
|
||||||
|
def _extract_verse(self, tag):
    """
    Extract a verse number and/or verse text from a single node.

    ``tag``
        The BeautifulSoup Tag (or NavigableString) with the stuff we want.

    Returns a ``(verse, text)`` tuple:

    * a NavigableString yields ``(None, <its text>)``;
    * an element with class ``versenum`` yields ``(<number string>, None)``,
      with any square brackets and surrounding whitespace removed;
    * an element with class ``chapternum`` yields ``('1', None)``;
    * any other element is walked recursively, yielding the last verse
      number found among its children (or ``None``) and the concatenated
      text of all its children (``''`` when there is none).
    """
    if isinstance(tag, NavigableString):
        return None, unicode(tag)
    css_class = tag.get('class')
    if css_class == 'versenum':
        verse = unicode(tag.string)\
            .replace('[', '').replace(']', '').strip()
        return verse, None
    if css_class == 'chapternum':
        # A chapter number stands in for the marker of the first verse.
        return '1', None
    verse, text = None, ''
    for child in tag.contents:
        c_verse, c_text = self._extract_verse(child)
        if c_verse:
            verse = c_verse
        # Only ever append. A child that yields an empty string (e.g. an
        # element without contents) must not wipe out the text that has
        # already been collected from its earlier siblings.
        if c_text:
            text += c_text
    return verse, text
|
||||||
|
|
||||||
|
def _clean_soup(self, tag):
|
||||||
|
"""
|
||||||
|
Remove all the rubbish from the HTML page.
|
||||||
|
|
||||||
|
``tag``
|
||||||
|
The base tag within which we want to remove stuff.
|
||||||
|
"""
|
||||||
|
self._remove_elements(tag, 'sup', 'crossreference')
|
||||||
|
self._remove_elements(tag, 'sup', 'footnote')
|
||||||
|
self._remove_elements(tag, 'div', 'footnotes')
|
||||||
|
self._remove_elements(tag, 'div', 'crossrefs')
|
||||||
|
self._remove_elements(tag, 'h3')
|
||||||
|
|
||||||
|
def _extract_verses(self, tags):
|
||||||
|
"""
|
||||||
|
Extract all the verses from a pre-prepared list of HTML tags.
|
||||||
|
|
||||||
|
``tags``
|
||||||
|
A list of BeautifulSoup Tag elements.
|
||||||
|
"""
|
||||||
|
verses = []
|
||||||
|
tags = tags[::-1]
|
||||||
|
current_text = ''
|
||||||
|
for tag in tags:
|
||||||
|
verse, text = None, ''
|
||||||
|
for child in tag.contents:
|
||||||
|
c_verse, c_text = self._extract_verse(child)
|
||||||
|
if c_verse:
|
||||||
|
verse = c_verse
|
||||||
|
if text and c_text:
|
||||||
|
text += c_text
|
||||||
|
elif c_text is not None:
|
||||||
|
text = c_text
|
||||||
|
if not verse:
|
||||||
|
current_text = text + ' ' + current_text
|
||||||
|
else:
|
||||||
|
text += ' ' + current_text
|
||||||
|
current_text = ''
|
||||||
|
if text:
|
||||||
|
for old, new in UGLY_CHARS.iteritems():
|
||||||
|
text = text.replace(old, new)
|
||||||
|
text = u' '.join(text.split())
|
||||||
|
if verse and text:
|
||||||
|
verses.append((int(verse.strip()), text))
|
||||||
|
verse_list = {}
|
||||||
|
for verse, text in verses[::-1]:
|
||||||
|
verse_list[verse] = text
|
||||||
|
return verse_list
|
||||||
|
|
||||||
def get_bible_chapter(self, version, book_name, chapter):
|
def get_bible_chapter(self, version, book_name, chapter):
|
||||||
"""
|
"""
|
||||||
Access and decode Bibles via the BibleGateway website.
|
Access and decode Bibles via the BibleGateway website.
|
||||||
@ -80,60 +186,9 @@ class BGExtract(object):
|
|||||||
if not soup:
|
if not soup:
|
||||||
return None
|
return None
|
||||||
Receiver.send_message(u'openlp_process_events')
|
Receiver.send_message(u'openlp_process_events')
|
||||||
footnotes = soup.findAll(u'sup', u'footnote')
|
div = soup.find('div', 'result-text-style-normal')
|
||||||
if footnotes:
|
self._clean_soup(div)
|
||||||
for footnote in footnotes:
|
verse_list = self._extract_verses(div.findAll('span', 'text'))
|
||||||
footnote.extract()
|
|
||||||
crossrefs = soup.findAll(u'sup', u'xref')
|
|
||||||
if crossrefs:
|
|
||||||
for crossref in crossrefs:
|
|
||||||
crossref.extract()
|
|
||||||
headings = soup.findAll(u'h5')
|
|
||||||
if headings:
|
|
||||||
for heading in headings:
|
|
||||||
heading.extract()
|
|
||||||
chapter_notes = soup.findAll('div', 'footnotes')
|
|
||||||
if chapter_notes:
|
|
||||||
log.debug('Found chapter notes')
|
|
||||||
for note in chapter_notes:
|
|
||||||
note.extract()
|
|
||||||
note_comments = soup.findAll(text=u'end of footnotes')
|
|
||||||
if note_comments:
|
|
||||||
for comment in note_comments:
|
|
||||||
comment.extract()
|
|
||||||
cleanup = [(re.compile('\s+'), lambda match: ' ')]
|
|
||||||
verses = BeautifulSoup(str(soup), markupMassage=cleanup)
|
|
||||||
verse_list = {}
|
|
||||||
# Cater for inconsistent mark up in the first verse of a chapter.
|
|
||||||
first_verse = verses.find(u'versenum')
|
|
||||||
if first_verse and first_verse.contents:
|
|
||||||
verse_list[1] = unicode(first_verse.contents[0])
|
|
||||||
for verse in verses(u'sup', u'versenum'):
|
|
||||||
raw_verse_num = verse.next
|
|
||||||
clean_verse_num = 0
|
|
||||||
# Not all verses exist in all translations and may or may not be
|
|
||||||
# represented by a verse number. If they are not fine, if they are
|
|
||||||
# it will probably be in a format that breaks int(). We will then
|
|
||||||
# have no idea what garbage may be sucked in to the verse text so
|
|
||||||
# if we do not get a clean int() then ignore the verse completely.
|
|
||||||
try:
|
|
||||||
clean_verse_num = int(str(raw_verse_num))
|
|
||||||
except ValueError:
|
|
||||||
log.warn(u'Illegal verse number in %s %s %s:%s',
|
|
||||||
version, book_name, chapter, unicode(raw_verse_num))
|
|
||||||
if clean_verse_num:
|
|
||||||
verse_text = raw_verse_num.next
|
|
||||||
part = raw_verse_num.next.next
|
|
||||||
while not (isinstance(part, Tag) and part.attrMap and
|
|
||||||
part.attrMap[u'class'] == u'versenum'):
|
|
||||||
# While we are still in the same verse grab all the text.
|
|
||||||
if isinstance(part, NavigableString):
|
|
||||||
verse_text = verse_text + part
|
|
||||||
if isinstance(part.next, Tag) and part.next.name == u'div':
|
|
||||||
# Run out of verses so stop.
|
|
||||||
break
|
|
||||||
part = part.next
|
|
||||||
verse_list[clean_verse_num] = unicode(verse_text)
|
|
||||||
if not verse_list:
|
if not verse_list:
|
||||||
log.debug(u'No content found in the BibleGateway response.')
|
log.debug(u'No content found in the BibleGateway response.')
|
||||||
send_error_message(u'parse')
|
send_error_message(u'parse')
|
||||||
|
Loading…
Reference in New Issue
Block a user