Refactor BibleGateway retrieval

This commit is contained in:
Jon Tibble 2010-07-23 03:19:35 +01:00
parent b5ceb59e27
commit e9edb0c7a4
1 changed file with 10 additions and 59 deletions

View File

@@ -29,12 +29,11 @@ import os
import sqlite3 import sqlite3
import re import re
from BeautifulSoup import BeautifulSoup, Tag, NavigableString from BeautifulSoup import BeautifulSoup, NavigableString
from openlp.core.lib import Receiver from openlp.core.lib import Receiver
from openlp.core.utils import AppLocation from openlp.core.utils import AppLocation
from openlp.plugins.bibles.lib.common import BibleCommon, SearchResults, \ from openlp.plugins.bibles.lib.common import BibleCommon, SearchResults
unescape
from openlp.plugins.bibles.lib.db import BibleDB, Book from openlp.plugins.bibles.lib.db import BibleDB, Book
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@@ -205,63 +204,15 @@ class BGExtract(BibleCommon):
Receiver.send_message(u'openlp_process_events') Receiver.send_message(u'openlp_process_events')
soup = BeautifulSoup(page) soup = BeautifulSoup(page)
Receiver.send_message(u'openlp_process_events') Receiver.send_message(u'openlp_process_events')
verses = soup.find(u'div', u'result-text-style-normal') text = str(soup.find(u'div', u'result-text-style-normal'))
verse_number = 0 useful_soup = BeautifulSoup(text)
verse_list = {0: u''} verses = useful_soup.findAll(u'p')
# http://www.codinghorror.com/blog/2009/11/parsing-html-the-cthulhu-way.html verses.pop(0)
# This is a PERFECT example of opening the Cthulu tag! verses.pop()
# O Bible Gateway, why doth ye such horrific HTML produce? verse_list = {}
for verse in verses: for verse in verses:
Receiver.send_message(u'openlp_process_events') verse_list[int(str(verse.sup.contents[0]))] = \
if isinstance(verse, Tag) and verse.name == u'div' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] == u'footnotes': unicode(verse.contents[2])
break
if isinstance(verse, Tag) and verse.name == u'sup' and filter(lambda a: a[0] == u'class', verse.attrs)[0][1] != u'versenum':
continue
if isinstance(verse, Tag) and verse.name == u'p' and not verse.contents:
continue
if isinstance(verse, Tag) and (verse.name == u'p' or verse.name == u'font') and verse.contents:
for item in verse.contents:
Receiver.send_message(u'openlp_process_events')
if isinstance(item, Tag) and (item.name == u'h4' or item.name == u'h5'):
continue
if isinstance(item, Tag) and item.name == u'sup' and filter(lambda a: a[0] == u'class', item.attrs)[0][1] != u'versenum':
continue
if isinstance(item, Tag) and item.name == u'p' and not item.contents:
continue
if isinstance(item, Tag) and item.name == u'sup':
verse_number = int(str(item.contents[0]))
verse_list[verse_number] = u''
continue
if isinstance(item, Tag) and item.name == u'font':
for subitem in item.contents:
Receiver.send_message(u'openlp_process_events')
if isinstance(subitem, Tag) and subitem.name == u'sup' and filter(lambda a: a[0] == u'class', subitem.attrs)[0][1] != u'versenum':
continue
if isinstance(subitem, Tag) and subitem.name == u'p' and not subitem.contents:
continue
if isinstance(subitem, Tag) and subitem.name == u'sup':
verse_number = int(str(subitem.contents[0]))
verse_list[verse_number] = u''
continue
if isinstance(subitem, NavigableString):
verse_list[verse_number] = verse_list[verse_number] + subitem.replace(u' ', u' ')
continue
if isinstance(item, NavigableString):
verse_list[verse_number] = verse_list[verse_number] + item.replace(u' ', u' ')
continue
if isinstance(verse, Tag) and verse.name == u'sup':
verse_number = int(str(verse.contents[0]))
verse_list[verse_number] = u''
continue
if isinstance(verse, NavigableString):
if not isinstance(verse, unicode):
verse = unicode(verse, u'utf8')
verse_list[verse_number] = verse_list[verse_number] + \
unescape(verse.replace(u' ', u' '))
# Delete the "0" element, since we don't need it, it's just there for
# some stupid initial whitespace, courtesy of Bible Gateway.
del verse_list[0]
# Finally, return the list of verses in a "SearchResults" object.
return SearchResults(bookname, chapter, verse_list) return SearchResults(bookname, chapter, verse_list)
class CWExtract(BibleCommon): class CWExtract(BibleCommon):