forked from openlp/openlp
Fixed the BibleGateway importer.
bzr-revno: 747
This commit is contained in:
commit
ee083c30d5
@ -23,13 +23,14 @@
|
|||||||
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
|
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
import openlp
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import logging
|
import logging
|
||||||
import urllib2
|
import urllib2
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
import openlp
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
class AppLocation(object):
|
class AppLocation(object):
|
||||||
|
@ -28,7 +28,7 @@ import urllib2
|
|||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup, Tag, NavigableString
|
||||||
|
|
||||||
from openlp.core.lib import Receiver
|
from openlp.core.lib import Receiver
|
||||||
from openlp.core.utils import AppLocation
|
from openlp.core.utils import AppLocation
|
||||||
@ -146,44 +146,62 @@ class BGExtract(BibleCommon):
|
|||||||
urlstring = u'http://www.biblegateway.com/passage/?search=%s+%s' \
|
urlstring = u'http://www.biblegateway.com/passage/?search=%s+%s' \
|
||||||
u'&version=%s' % (bookname, chapter, version)
|
u'&version=%s' % (bookname, chapter, version)
|
||||||
log.debug(u'BibleGateway url = %s' % urlstring)
|
log.debug(u'BibleGateway url = %s' % urlstring)
|
||||||
xml_string = self._get_web_text(urlstring, self.proxyurl)
|
# Let's get the page, and then open it in BeautifulSoup, so as to
|
||||||
verseSearch = u'<sup class=\"versenum'
|
# attempt to make "easy" work of bad HTML.
|
||||||
verseFootnote = u'<sup class=\'footnote'
|
page = urllib2.urlopen(urlstring)
|
||||||
verse = 1
|
soup = BeautifulSoup(page)
|
||||||
i = xml_string.find(u'result-text-style-normal') + 26
|
verses = soup.find(u'div', u'result-text-style-normal')
|
||||||
xml_string = xml_string[i:len(xml_string)]
|
verse_number = 0
|
||||||
versePos = xml_string.find(verseSearch)
|
verse_list = {0: u''}
|
||||||
bible = {}
|
# http://www.codinghorror.com/blog/2009/11/parsing-html-the-cthulhu-way.html
|
||||||
while versePos > -1:
|
# This is a PERFECT example of opening the Cthulhu tag!
|
||||||
# clear out string
|
# O Bible Gateway, why doth ye such horrific HTML produce?
|
||||||
verseText = u''
|
for verse in verses:
|
||||||
versePos = xml_string.find(u'</sup>', versePos) + 6
|
if isinstance(verse, Tag) and verse.name == u'div' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] == u'footnotes':
|
||||||
i = xml_string.find(verseSearch, versePos + 1)
|
break
|
||||||
# Not sure if this is needed now
|
if isinstance(verse, Tag) and verse.name == u'sup' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] != u'versenum':
|
||||||
if i == -1:
|
continue
|
||||||
i = xml_string.find(u'</div', versePos + 1)
|
if isinstance(verse, Tag) and verse.name == u'p' and not verse.contents:
|
||||||
j = xml_string.find(u'<strong', versePos + 1)
|
continue
|
||||||
if j > 0 and j < i:
|
if isinstance(verse, Tag) and (verse.name == u'p' or verse.name == u'font') and verse.contents:
|
||||||
i = j
|
for item in verse.contents:
|
||||||
verseText = xml_string[versePos + 7 : i ]
|
if isinstance(item, Tag) and (item.name == u'h4' or item.name == u'h5'):
|
||||||
# store the verse
|
continue
|
||||||
bible[verse] = self._clean_text(verseText)
|
if isinstance(item, Tag) and item.name == u'sup' and filter(lambda a: a[0] == u'class', item.attrs)[0][1] != u'versenum':
|
||||||
versePos = -1
|
continue
|
||||||
else:
|
if isinstance(item, Tag) and item.name == u'p' and not item.contents:
|
||||||
verseText = xml_string[versePos: i]
|
continue
|
||||||
start_tag = verseText.find(verseFootnote)
|
if isinstance(item, Tag) and item.name == u'sup':
|
||||||
while start_tag > -1:
|
verse_number = int(str(item.contents[0]))
|
||||||
end_tag = verseText.find(u'</sup>')
|
verse_list[verse_number] = u''
|
||||||
verseText = verseText[:start_tag] + verseText[end_tag + 6:len(verseText)]
|
continue
|
||||||
start_tag = verseText.find(verseFootnote)
|
if isinstance(item, Tag) and item.name == u'font':
|
||||||
# Chop off verse and start again
|
for subitem in item.contents:
|
||||||
xml_string = xml_string[i:]
|
if isinstance(subitem, Tag) and subitem.name == u'sup' and filter(lambda a: a[0] == u'class', subitem.attrs)[0][1] != u'versenum':
|
||||||
#look for the next verse
|
continue
|
||||||
versePos = xml_string.find(verseSearch)
|
if isinstance(subitem, Tag) and subitem.name == u'p' and not subitem.contents:
|
||||||
# store the verse
|
continue
|
||||||
bible[verse] = self._clean_text(verseText)
|
if isinstance(subitem, Tag) and subitem.name == u'sup':
|
||||||
verse += 1
|
verse_number = int(str(subitem.contents[0]))
|
||||||
return SearchResults(bookname, chapter, bible)
|
verse_list[verse_number] = u''
|
||||||
|
continue
|
||||||
|
if isinstance(subitem, NavigableString):
|
||||||
|
verse_list[verse_number] = verse_list[verse_number] + subitem.replace(u' ', u' ')
|
||||||
|
continue
|
||||||
|
if isinstance(item, NavigableString):
|
||||||
|
verse_list[verse_number] = verse_list[verse_number] + item.replace(u' ', u' ')
|
||||||
|
continue
|
||||||
|
if isinstance(verse, Tag) and verse.name == u'sup':
|
||||||
|
verse_number = int(str(verse.contents[0]))
|
||||||
|
verse_list[verse_number] = u''
|
||||||
|
continue
|
||||||
|
if isinstance(verse, NavigableString):
|
||||||
|
verse_list[verse_number] = verse_list[verse_number] + verse.replace(u' ', u' ')
|
||||||
|
# Delete the "0" element since we don't need it; it's just there for
|
||||||
|
# some stupid initial whitespace, courtesy of Bible Gateway.
|
||||||
|
del verse_list[0]
|
||||||
|
# Finally, return the verses (a dict keyed by verse number) in a "SearchResults" object.
|
||||||
|
return SearchResults(bookname, chapter, verse_list)
|
||||||
|
|
||||||
class CWExtract(BibleCommon):
|
class CWExtract(BibleCommon):
|
||||||
log.info(u'%s CWExtract loaded', __name__)
|
log.info(u'%s CWExtract loaded', __name__)
|
||||||
|
Loading…
Reference in New Issue
Block a user