Replaced BeautifulSoup3 with BeautifulSoup4

Andreas Preikschat 2013-04-05 21:58:13 +02:00
parent 131b46edef
commit 69009970c0
3 changed files with 52 additions and 58 deletions

View File

@@ -35,7 +35,7 @@ import os
import platform
import sqlalchemy
import BeautifulSoup
from bs4 import BeautifulSoup
from lxml import etree
from PyQt4 import Qt, QtCore, QtGui, QtWebKit
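
For reference, the core change this commit makes throughout is the package rename: BeautifulSoup 3 is imported as a top-level BeautifulSoup module, whereas BeautifulSoup 4 lives in the bs4 package and is imported directly as a class. A minimal sketch of the two styles (an illustration, not part of the commit; the sample markup is made up):

# BeautifulSoup 3 style (removed by this commit):
#   import BeautifulSoup
#   soup = BeautifulSoup.BeautifulSoup(html)

# BeautifulSoup 4 style (introduced by this commit):
from bs4 import BeautifulSoup

html = u'<div class="result-text-style-normal"><span class="text">In the beginning</span></div>'
soup = BeautifulSoup(html)  # without an explicit parser, bs4 picks the best one it finds installed
span = soup.find('span', 'text')  # a plain string as the second argument filters on CSS class
print span.get_text()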

View File

@@ -36,7 +36,7 @@ import socket
import urllib
from HTMLParser import HTMLParseError
from BeautifulSoup import BeautifulSoup, NavigableString, Tag
from bs4 import BeautifulSoup, NavigableString, Tag
from openlp.core.lib import Registry, translate
from openlp.core.lib.ui import critical_error_message_box
@@ -44,6 +44,9 @@ from openlp.core.utils import get_web_page
from openlp.plugins.bibles.lib import SearchResults
from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book
CLEANER_REGEX = re.compile('&nbsp;|<br />|\'\+\'')
FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
UGLY_CHARS = {
u'\u2014': u' - ',
u'\u2018': u'\'',
@@ -52,9 +55,12 @@ UGLY_CHARS = {
u'\u201d': u'"',
u'&nbsp;': u' '
}
VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
log = logging.getLogger(__name__)
class BGExtract(object):
"""
Extract verses from BibleGateway
@@ -78,9 +84,9 @@ class BGExtract(object):
An HTML class attribute for further qualification.
"""
if class_:
all_tags = parent.findAll(tag, class_)
all_tags = parent.find_all(tag, class_)
else:
all_tags = parent.findAll(tag)
all_tags = parent.find_all(tag)
for element in all_tags:
element.extract()
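
The findAll calls above become find_all because BeautifulSoup 4 renamed its search methods to PEP 8 style (the camelCase names survive only as deprecated aliases). A short sketch of the pattern _clean_soup relies on, removing unwanted tags from a parsed document; the markup here is invented for illustration:

from bs4 import BeautifulSoup

html = u'<div><sup class="footnote">a</sup>In the beginning <sup class="versenum">1</sup>God created</div>'
soup = BeautifulSoup(html)

# Drop every <sup> carrying the "footnote" class; extract() detaches the tag from the tree.
for element in soup.find_all(u'sup', u'footnote'):
    element.extract()

print soup.div.get_text()  # the footnote marker is gone, the verse number remains
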
@@ -173,8 +179,8 @@ class BGExtract(object):
def _extract_verses_old(self, div):
"""
Use the old style of parsing for those Bibles on BG who mysteriously
have not been migrated to the new (still broken) HTML.
Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
broken) HTML.
``div``
The parent div.
@@ -185,13 +191,12 @@ class BGExtract(object):
if first_verse and first_verse.contents:
verse_list[1] = unicode(first_verse.contents[0])
for verse in div(u'sup', u'versenum'):
raw_verse_num = verse.next
raw_verse_num = verse.next_element
clean_verse_num = 0
# Not all verses exist in all translations and may or may not be
# represented by a verse number. If they are not fine, if they are
# it will probably be in a format that breaks int(). We will then
# have no idea what garbage may be sucked in to the verse text so
# if we do not get a clean int() then ignore the verse completely.
# Not all verses exist in all translations and may or may not be represented by a verse number. If they are
# not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
# garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
# completely.
try:
clean_verse_num = int(str(raw_verse_num))
except ValueError:
@@ -201,16 +206,16 @@ class BGExtract(object):
except TypeError:
log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
if clean_verse_num:
verse_text = raw_verse_num.next
part = raw_verse_num.next.next
verse_text = raw_verse_num.next_element
part = raw_verse_num.next_element.next_element
while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'):
# While we are still in the same verse grab all the text.
if isinstance(part, NavigableString):
verse_text += part
if isinstance(part.next, Tag) and part.next.name == u'div':
if isinstance(part.next_element, Tag) and part.next_element.name == u'div':
# Run out of verses so stop.
break
part = part.next
part = part.next_element
verse_list[clean_verse_num] = unicode(verse_text)
return verse_list
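
The .next attribute becomes .next_element here: BeautifulSoup 4 renamed the parse-tree navigation attributes (.next and .previous became .next_element and .previous_element), and _extract_verses_old walks the tree element by element to gather the text between two verse-number tags. A hedged sketch of that walking pattern on invented markup (note that bs4 returns multi-valued attributes such as class as lists):

from bs4 import BeautifulSoup, NavigableString, Tag

html = u'<div><sup class="versenum">1</sup>In the beginning <b>God</b> created<sup class="versenum">2</sup>the heaven</div>'
soup = BeautifulSoup(html)

first_num = soup.find(u'sup', u'versenum')
verse_text = u''
part = first_num.next_element.next_element  # skip past the verse number string itself
while part is not None:
    if isinstance(part, Tag) and part.get(u'class') == [u'versenum']:
        break  # reached the next verse number, so this verse is complete
    if isinstance(part, NavigableString):
        verse_text += part
    part = part.next_element
print verse_text  # -> In the beginning God created
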
@@ -230,7 +235,7 @@ class BGExtract(object):
log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter)
url_book_name = urllib.quote(book_name.encode("utf-8"))
url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version)
cleaner = [(re.compile('&nbsp;|<br />|\'\+\''), lambda match: '')]
cleaner = [(CLEANER_REGEX, lambda match: '')]
soup = get_soup_for_bible_ref(
u'http://www.biblegateway.com/passage/?%s' % url_params,
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='', cleaner=cleaner)
@@ -238,7 +243,7 @@ class BGExtract(object):
return None
div = soup.find('div', 'result-text-style-normal')
self._clean_soup(div)
span_list = div.findAll('span', 'text')
span_list = div.find_all('span', 'text')
log.debug('Span list: %s', span_list)
if not span_list:
# If we don't get any spans then we must have the old HTML format
@@ -282,7 +287,7 @@ class BGExtract(object):
self.application.process_events()
content = soup.find(u'table', u'infotable')
if content:
content = content.findAll(u'tr')
content = content.find_all(u'tr')
if not content:
log.error(u'No books found in the Biblegateway response.')
send_error_message(u'parse')
@@ -341,19 +346,17 @@ class BSExtract(object):
log.error(u'No verses found in the Bibleserver response.')
send_error_message(u'parse')
return None
content = content.find(u'div').findAll(u'div')
verse_number = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
content = content.find(u'div').find_all(u'div')
verses = {}
for verse in content:
self.application.process_events()
versenumber = int(verse_number.sub(r'\3', verse[u'class']))
versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', verse[u'class']))
verses[versenumber] = verse.contents[1].rstrip(u'\n')
return SearchResults(book_name, chapter, verses)
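
The inline verse_number pattern used here previously is now the module-level constant VERSE_NUMBER_REGEX, compiled once at import time. Assuming class attributes of the shape the pattern describes (a 'v', then book, chapter and verse digits), the substitution keeps only the third group, the verse number. A small worked example on a made-up class value:

import re

VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')

class_value = u'v1001003 verse'  # hypothetical: book 1, chapter 001, verse 003
verse_number = int(VERSE_NUMBER_REGEX.sub(r'\3', class_value))
print verse_number  # -> 3
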
def get_books_from_http(self, version):
"""
Load a list of all books a Bible contains from Bibleserver mobile
website.
Load a list of all books a Bible contains from Bibleserver mobile website.
``version``
The version of the Bible like NIV for New International Version
@@ -369,7 +372,7 @@ class BSExtract(object):
log.error(u'No books found in the Bibleserver response.')
send_error_message(u'parse')
return None
content = content.findAll(u'li')
content = content.find_all(u'li')
return [book.contents[0].contents[0] for book in content]
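
The comprehension above digs two levels into each list item: contents[0] of the <li> is expected to be a link tag, and contents[0] of that link is its text node. A sketch of that assumption on invented markup:

from bs4 import BeautifulSoup

html = u'<ul><li><a href="#">Genesis</a></li><li><a href="#">Exodus</a></li></ul>'
soup = BeautifulSoup(html)

books = [item.contents[0].contents[0] for item in soup.find_all(u'li')]
print books  # -> [u'Genesis', u'Exodus']
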
@@ -404,14 +407,12 @@ class CWExtract(object):
if not soup:
return None
self.application.process_events()
html_verses = soup.findAll(u'span', u'versetext')
html_verses = soup.find_all(u'span', u'versetext')
if not html_verses:
log.error(u'No verses found in the CrossWalk response.')
send_error_message(u'parse')
return None
verses = {}
reduce_spaces = re.compile(r'[ ]{2,}')
fix_punctuation = re.compile(r'[ ]+([.,;])')
for verse in html_verses:
self.application.process_events()
verse_number = int(verse.contents[0].contents[0])
@@ -432,11 +433,10 @@ class CWExtract(object):
if isinstance(subsub, NavigableString):
verse_text += subsub
self.application.process_events()
# Fix up leading and trailing spaces, multiple spaces, and spaces
# between text and , and .
# Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
verse_text = verse_text.strip(u'\n\r\t ')
verse_text = reduce_spaces.sub(u' ', verse_text)
verse_text = fix_punctuation.sub(r'\1', verse_text)
verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text)
verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
verses[verse_number] = verse_text
return SearchResults(book_name, chapter, verses)
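
The two regexes that were previously rebuilt on every call (reduce_spaces and fix_punctuation) are now the module-level constants REDUCE_SPACES_REGEX and FIX_PUNKCTUATION_REGEX. A short worked example of the clean-up they perform on a deliberately messy verse string:

import re

REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')

verse_text = u'  In the beginning , God created   the heaven and the earth .'
verse_text = verse_text.strip(u'\n\r\t ')
verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text)
verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
print verse_text  # -> In the beginning, God created the heaven and the earth.
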
@@ -458,7 +458,7 @@ class CWExtract(object):
log.error(u'No books found in the Crosswalk response.')
send_error_message(u'parse')
return None
content = content.findAll(u'li')
content = content.find_all(u'li')
books = []
for book in content:
book = book.find(u'a')
@@ -481,9 +481,8 @@ class HTTPBible(BibleDB):
def __init__(self, parent, **kwargs):
"""
Finds all the bibles defined for the system
Creates an Interface Object for each bible containing connection
information
Finds all the bibles defined for the system. Creates an Interface Object for each bible containing connection
information.
Throws Exception if no Bibles are found.
@@ -492,8 +491,7 @@ class HTTPBible(BibleDB):
BibleDB.__init__(self, parent, **kwargs)
self.download_source = kwargs[u'download_source']
self.download_name = kwargs[u'download_name']
# TODO: Clean up proxy stuff. We probably want one global proxy per
# connection type (HTTP and HTTPS) at most.
# TODO: Clean up proxy stuff. We probably want one global proxy per connection type (HTTP and HTTPS) at most.
self.proxy_server = None
self.proxy_username = None
self.proxy_password = None
@@ -508,8 +506,8 @@ class HTTPBible(BibleDB):
def do_import(self, bible_name=None):
"""
Run the import. This method overrides the parent class method. Returns
``True`` on success, ``False`` on failure.
Run the import. This method overrides the parent class method. Returns ``True`` on success, ``False`` on
failure.
"""
self.wizard.progress_bar.setMaximum(68)
self.wizard.increment_progress_bar(translate('BiblesPlugin.HTTPBible', 'Registering Bible and loading books...'))
@@ -549,8 +547,7 @@ class HTTPBible(BibleDB):
if self.stop_import_flag:
break
self.wizard.increment_progress_bar(translate(
'BiblesPlugin.HTTPBible', 'Importing %s...',
'Importing <book name>...') % book)
'BiblesPlugin.HTTPBible', 'Importing %s...', 'Importing <book name>...') % book)
book_ref_id = self.get_book_ref_id_by_name(book, len(books), language_id)
if not book_ref_id:
log.exception(u'Importing books from %s - download name: "%s" '\
@@ -567,22 +564,19 @@ class HTTPBible(BibleDB):
def get_verses(self, reference_list, show_error=True):
"""
A reimplementation of the ``BibleDB.get_verses`` method, this one is
specifically for web Bibles. It first checks to see if the particular
chapter exists in the DB, and if not it pulls it from the web. If the
chapter DOES exist, it simply pulls the verses from the DB using the
ancestor method.
A reimplementation of the ``BibleDB.get_verses`` method, this one is specifically for web Bibles. It first
checks to see if the particular chapter exists in the DB, and if not it pulls it from the web. If the chapter
DOES exist, it simply pulls the verses from the DB using the ancestor method.
``reference_list``
This is the list of references the media manager item wants. It is
a list of tuples, with the following format::
This is the list of references the media manager item wants. It is a list of tuples, with the following
format::
(book_reference_id, chapter, start_verse, end_verse)
Therefore, when you are looking for multiple items, simply break
them up into references like this, bundle them into a list. This
function then runs through the list, and returns an amalgamated
list of ``Verse`` objects. For example::
Therefore, when you are looking for multiple items, simply break them up into references like this, bundle
them into a list. This function then runs through the list, and returns an amalgamated list of ``Verse``
objects. For example::
[(u'35', 1, 1, 1), (u'35', 2, 2, 3)]
"""
@@ -683,8 +677,8 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
An optional HTTP header to pass to the bible web server.
``pre_parse_regex``
A regular expression to run on the webpage. Allows manipulation of the
webpage before passing to BeautifulSoup for parsing.
A regular expression to run on the webpage. Allows manipulation of the webpage before passing to BeautifulSoup
for parsing.
``pre_parse_substitute``
The text to replace any matches to the regular expression with.
@@ -704,7 +698,7 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
soup = None
try:
if cleaner:
soup = BeautifulSoup(page_source, markupMassage=cleaner)
soup = BeautifulSoup(page_source, markup=cleaner)
else:
soup = BeautifulSoup(page_source)
except HTMLParseError:
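
One subtlety of the migration: BeautifulSoup 4 dropped bs3's markupMassage argument, and the bs4 constructor's own first parameter is named markup, so the (regex, replacement) cleaner pairs can no longer be handed to the parser directly. A minimal sketch, assuming the intent is simply to run the substitutions on the raw page source before parsing (an illustration, not the commit's code):

import re
from bs4 import BeautifulSoup

CLEANER_REGEX = re.compile('&nbsp;|<br />|\'\+\'')

page_source = u'In the beginning God created<br /> the heaven and the earth.&nbsp;'

# bs3 accepted markupMassage=[(regex, replacement), ...]; with bs4 the equivalent
# clean-up happens on the string before the soup is constructed.
cleaned_source = CLEANER_REGEX.sub(u'', page_source)
soup = BeautifulSoup(cleaned_source)
print soup.get_text()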

View File

@@ -79,7 +79,7 @@ MODULES = [
'lxml',
'chardet',
'enchant',
'BeautifulSoup',
'bs4',
'mako',
'migrate',
'uno',