Fixed the BibleGateway importer.

bzr-revno: 747
This commit is contained in:
Raoul Snyman 2010-03-19 09:54:36 +02:00
commit ee083c30d5
2 changed files with 59 additions and 40 deletions

View File

@@ -23,13 +23,14 @@
# Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Temple Place, Suite 330, Boston, MA 02111-1307 USA #
############################################################################### ###############################################################################
import openlp
import os import os
import sys import sys
import logging import logging
import urllib2 import urllib2
from datetime import datetime from datetime import datetime
import openlp
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
class AppLocation(object): class AppLocation(object):

View File

@@ -28,7 +28,7 @@ import urllib2
import os import os
import sqlite3 import sqlite3
from BeautifulSoup import BeautifulSoup from BeautifulSoup import BeautifulSoup, Tag, NavigableString
from openlp.core.lib import Receiver from openlp.core.lib import Receiver
from openlp.core.utils import AppLocation from openlp.core.utils import AppLocation
@@ -146,44 +146,62 @@ class BGExtract(BibleCommon):
urlstring = u'http://www.biblegateway.com/passage/?search=%s+%s' \ urlstring = u'http://www.biblegateway.com/passage/?search=%s+%s' \
u'&version=%s' % (bookname, chapter, version) u'&version=%s' % (bookname, chapter, version)
log.debug(u'BibleGateway url = %s' % urlstring) log.debug(u'BibleGateway url = %s' % urlstring)
xml_string = self._get_web_text(urlstring, self.proxyurl) # Let's get the page, and then open it in BeautifulSoup, so as to
verseSearch = u'<sup class=\"versenum' # attempt to make "easy" work of bad HTML.
verseFootnote = u'<sup class=\'footnote' page = urllib2.urlopen(urlstring)
verse = 1 soup = BeautifulSoup(page)
i = xml_string.find(u'result-text-style-normal') + 26 verses = soup.find(u'div', u'result-text-style-normal')
xml_string = xml_string[i:len(xml_string)] verse_number = 0
versePos = xml_string.find(verseSearch) verse_list = {0: u''}
bible = {} # http://www.codinghorror.com/blog/2009/11/parsing-html-the-cthulhu-way.html
while versePos > -1: # This is a PERFECT example of opening the Cthulhu tag!
# clear out string # O Bible Gateway, why doth ye such horrific HTML produce?
verseText = u'' for verse in verses:
versePos = xml_string.find(u'</sup>', versePos) + 6 if isinstance(verse, Tag) and verse.name == u'div' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] == u'footnotes':
i = xml_string.find(verseSearch, versePos + 1) break
# Not sure if this is needed now if isinstance(verse, Tag) and verse.name == u'sup' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] != u'versenum':
if i == -1: continue
i = xml_string.find(u'</div', versePos + 1) if isinstance(verse, Tag) and verse.name == u'p' and not verse.contents:
j = xml_string.find(u'<strong', versePos + 1) continue
if j > 0 and j < i: if isinstance(verse, Tag) and (verse.name == u'p' or verse.name == u'font') and verse.contents:
i = j for item in verse.contents:
verseText = xml_string[versePos + 7 : i ] if isinstance(item, Tag) and (item.name == u'h4' or item.name == u'h5'):
# store the verse continue
bible[verse] = self._clean_text(verseText) if isinstance(item, Tag) and item.name == u'sup' and filter(lambda a: a[0] == u'class', item.attrs)[0][1] != u'versenum':
versePos = -1 continue
else: if isinstance(item, Tag) and item.name == u'p' and not item.contents:
verseText = xml_string[versePos: i] continue
start_tag = verseText.find(verseFootnote) if isinstance(item, Tag) and item.name == u'sup':
while start_tag > -1: verse_number = int(str(item.contents[0]))
end_tag = verseText.find(u'</sup>') verse_list[verse_number] = u''
verseText = verseText[:start_tag] + verseText[end_tag + 6:len(verseText)] continue
start_tag = verseText.find(verseFootnote) if isinstance(item, Tag) and item.name == u'font':
# Chop off verse and start again for subitem in item.contents:
xml_string = xml_string[i:] if isinstance(subitem, Tag) and subitem.name == u'sup' and filter(lambda a: a[0] == u'class', subitem.attrs)[0][1] != u'versenum':
#look for the next verse continue
versePos = xml_string.find(verseSearch) if isinstance(subitem, Tag) and subitem.name == u'p' and not subitem.contents:
# store the verse continue
bible[verse] = self._clean_text(verseText) if isinstance(subitem, Tag) and subitem.name == u'sup':
verse += 1 verse_number = int(str(subitem.contents[0]))
return SearchResults(bookname, chapter, bible) verse_list[verse_number] = u''
continue
if isinstance(subitem, NavigableString):
verse_list[verse_number] = verse_list[verse_number] + subitem.replace(u'&nbsp;', u' ')
continue
if isinstance(item, NavigableString):
verse_list[verse_number] = verse_list[verse_number] + item.replace(u'&nbsp;', u' ')
continue
if isinstance(verse, Tag) and verse.name == u'sup':
verse_number = int(str(verse.contents[0]))
verse_list[verse_number] = u''
continue
if isinstance(verse, NavigableString):
verse_list[verse_number] = verse_list[verse_number] + verse.replace(u'&nbsp;', u' ')
# Delete the "0" element, since we don't need it, it's just there for
# some stupid initial whitespace, courtesy of Bible Gateway.
del verse_list[0]
# Finally, return the list of verses in a "SearchResults" object.
return SearchResults(bookname, chapter, verse_list)
class CWExtract(BibleCommon): class CWExtract(BibleCommon):
log.info(u'%s CWExtract loaded', __name__) log.info(u'%s CWExtract loaded', __name__)