Fix Jesus' words in BibleGateway parsing.

bzr-revno: 1341
This commit is contained in:
Jon Tibble 2011-02-26 12:11:41 +02:00 committed by Raoul Snyman
commit ccfb472aa6

View File

@ -35,7 +35,7 @@ import socket
import urllib import urllib
from HTMLParser import HTMLParseError from HTMLParser import HTMLParseError
from BeautifulSoup import BeautifulSoup, NavigableString from BeautifulSoup import BeautifulSoup, NavigableString, Tag
from openlp.core.lib import Receiver, translate from openlp.core.lib import Receiver, translate
from openlp.core.lib.ui import critical_error_message_box from openlp.core.lib.ui import critical_error_message_box
@ -221,21 +221,14 @@ class BGExtract(object):
crossrefs = soup.findAll(u'sup', u'xref') crossrefs = soup.findAll(u'sup', u'xref')
if crossrefs: if crossrefs:
[crossref.extract() for crossref in crossrefs] [crossref.extract() for crossref in crossrefs]
headings = soup.findAll(u'h5')
if headings:
[heading.extract() for heading in headings]
cleanup = [(re.compile('\s+'), lambda match: ' ')] cleanup = [(re.compile('\s+'), lambda match: ' ')]
verses = BeautifulSoup(str(soup), markupMassage=cleanup) verses = BeautifulSoup(str(soup), markupMassage=cleanup)
content = verses.find(u'div', u'result-text-style-normal')
if not content:
content = verses.find(u'div', u'result-text-style-rtl-serif')
if not content:
log.debug(u'No content found in the BibleGateway response.')
send_error_message(u'parse')
return None
verse_count = len(verses.findAll(u'sup', u'versenum'))
found_count = 0
verse_list = {} verse_list = {}
while found_count < verse_count: for verse in verses(u'sup', u'versenum'):
content = content.findNext(u'sup', u'versenum') raw_verse_num = verse.next
raw_verse_num = content.next
clean_verse_num = 0 clean_verse_num = 0
# Not all verses exist in all translations and may or may not be # Not all verses exist in all translations and may or may not be
# represented by a verse number. If they are not fine, if they are # represented by a verse number. If they are not fine, if they are
@ -248,9 +241,22 @@ class BGExtract(object):
log.exception(u'Illegal verse number in %s %s %s:%s', log.exception(u'Illegal verse number in %s %s %s:%s',
version, bookname, chapter, unicode(raw_verse_num)) version, bookname, chapter, unicode(raw_verse_num))
if clean_verse_num: if clean_verse_num:
raw_verse_text = raw_verse_num.next verse_text = raw_verse_num.next
verse_list[clean_verse_num] = unicode(raw_verse_text) part = raw_verse_num.next.next
found_count += 1 while not (isinstance(part, Tag) and part.attrMap and
part.attrMap[u'class'] == u'versenum'):
# While we are still in the same verse grab all the text.
if isinstance(part, NavigableString):
verse_text = verse_text + part
if isinstance(part.next, Tag) and part.next.name == u'div':
# Run out of verses so stop.
break
part = part.next
verse_list[clean_verse_num] = unicode(verse_text)
if not verse_list:
log.debug(u'No content found in the BibleGateway response.')
send_error_message(u'parse')
return None
return SearchResults(bookname, chapter, verse_list) return SearchResults(bookname, chapter, verse_list)