diff --git a/openlp/core/ui/exceptionform.py b/openlp/core/ui/exceptionform.py index 25f8201b1..f7dc83d29 100644 --- a/openlp/core/ui/exceptionform.py +++ b/openlp/core/ui/exceptionform.py @@ -35,7 +35,7 @@ import os import platform import sqlalchemy -import BeautifulSoup +from bs4 import BeautifulSoup from lxml import etree from PyQt4 import Qt, QtCore, QtGui, QtWebKit diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py index b01377a05..390ac1cd9 100644 --- a/openlp/plugins/bibles/lib/http.py +++ b/openlp/plugins/bibles/lib/http.py @@ -36,7 +36,7 @@ import socket import urllib from HTMLParser import HTMLParseError -from BeautifulSoup import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, Tag from openlp.core.lib import Registry, translate from openlp.core.lib.ui import critical_error_message_box @@ -44,6 +44,9 @@ from openlp.core.utils import get_web_page from openlp.plugins.bibles.lib import SearchResults from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book +CLEANER_REGEX = re.compile(' |
|\'\+\'') +FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])') +REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}') UGLY_CHARS = { u'\u2014': u' - ', u'\u2018': u'\'', @@ -52,9 +55,12 @@ UGLY_CHARS = { u'\u201d': u'"', u' ': u' ' } +VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*') + log = logging.getLogger(__name__) + class BGExtract(object): """ Extract verses from BibleGateway @@ -78,9 +84,9 @@ class BGExtract(object): An HTML class attribute for further qualification. """ if class_: - all_tags = parent.findAll(tag, class_) + all_tags = parent.find_all(tag, class_) else: - all_tags = parent.findAll(tag) + all_tags = parent.find_all(tag) for element in all_tags: element.extract() @@ -173,8 +179,8 @@ class BGExtract(object): def _extract_verses_old(self, div): """ - Use the old style of parsing for those Bibles on BG who mysteriously - have not been migrated to the new (still broken) HTML. + Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still + broken) HTML. ``div`` The parent div. @@ -185,13 +191,12 @@ class BGExtract(object): if first_verse and first_verse.contents: verse_list[1] = unicode(first_verse.contents[0]) for verse in div(u'sup', u'versenum'): - raw_verse_num = verse.next + raw_verse_num = verse.next_element clean_verse_num = 0 - # Not all verses exist in all translations and may or may not be - # represented by a verse number. If they are not fine, if they are - # it will probably be in a format that breaks int(). We will then - # have no idea what garbage may be sucked in to the verse text so - # if we do not get a clean int() then ignore the verse completely. + # Not all verses exist in all translations and may or may not be represented by a verse number. If they are + # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what + # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse + # completely. try: clean_verse_num = int(str(raw_verse_num)) except ValueError: @@ -201,16 +206,16 @@ class BGExtract(object): except TypeError: log.warn(u'Illegal verse number: %s', unicode(raw_verse_num)) if clean_verse_num: - verse_text = raw_verse_num.next - part = raw_verse_num.next.next + verse_text = raw_verse_num.next_element + part = raw_verse_num.next_element.next_element while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'): # While we are still in the same verse grab all the text. if isinstance(part, NavigableString): verse_text += part - if isinstance(part.next, Tag) and part.next.name == u'div': + if isinstance(part.next_element, Tag) and part.next_element.name == u'div': # Run out of verses so stop. break - part = part.next + part = part.next_element verse_list[clean_verse_num] = unicode(verse_text) return verse_list @@ -230,7 +235,7 @@ class BGExtract(object): log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter) url_book_name = urllib.quote(book_name.encode("utf-8")) url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version) - cleaner = [(re.compile(' |
|\'\+\''), lambda match: '')] + cleaner = [(CLEANER_REGEX, lambda match: '')] soup = get_soup_for_bible_ref( u'http://www.biblegateway.com/passage/?%s' % url_params, pre_parse_regex=r'', pre_parse_substitute='', cleaner=cleaner) @@ -238,7 +243,7 @@ class BGExtract(object): return None div = soup.find('div', 'result-text-style-normal') self._clean_soup(div) - span_list = div.findAll('span', 'text') + span_list = div.find_all('span', 'text') log.debug('Span list: %s', span_list) if not span_list: # If we don't get any spans then we must have the old HTML format @@ -282,7 +287,7 @@ class BGExtract(object): self.application.process_events() content = soup.find(u'table', u'infotable') if content: - content = content.findAll(u'tr') + content = content.find_all(u'tr') if not content: log.error(u'No books found in the Biblegateway response.') send_error_message(u'parse') @@ -341,19 +346,17 @@ class BSExtract(object): log.error(u'No verses found in the Bibleserver response.') send_error_message(u'parse') return None - content = content.find(u'div').findAll(u'div') - verse_number = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*') + content = content.find(u'div').find_all(u'div') verses = {} for verse in content: self.application.process_events() - versenumber = int(verse_number.sub(r'\3', verse[u'class'])) + versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', verse[u'class'])) verses[versenumber] = verse.contents[1].rstrip(u'\n') return SearchResults(book_name, chapter, verses) def get_books_from_http(self, version): """ - Load a list of all books a Bible contains from Bibleserver mobile - website. + Load a list of all books a Bible contains from Bibleserver mobile website. ``version`` The version of the Bible like NIV for New International Version @@ -369,7 +372,7 @@ class BSExtract(object): log.error(u'No books found in the Bibleserver response.') send_error_message(u'parse') return None - content = content.findAll(u'li') + content = content.find_all(u'li') return [book.contents[0].contents[0] for book in content] @@ -404,14 +407,12 @@ class CWExtract(object): if not soup: return None self.application.process_events() - html_verses = soup.findAll(u'span', u'versetext') + html_verses = soup.find_all(u'span', u'versetext') if not html_verses: log.error(u'No verses found in the CrossWalk response.') send_error_message(u'parse') return None verses = {} - reduce_spaces = re.compile(r'[ ]{2,}') - fix_punctuation = re.compile(r'[ ]+([.,;])') for verse in html_verses: self.application.process_events() verse_number = int(verse.contents[0].contents[0]) @@ -432,11 +433,10 @@ class CWExtract(object): if isinstance(subsub, NavigableString): verse_text += subsub self.application.process_events() - # Fix up leading and trailing spaces, multiple spaces, and spaces - # between text and , and . + # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and . verse_text = verse_text.strip(u'\n\r\t ') - verse_text = reduce_spaces.sub(u' ', verse_text) - verse_text = fix_punctuation.sub(r'\1', verse_text) + verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text) + verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text) verses[verse_number] = verse_text return SearchResults(book_name, chapter, verses) @@ -458,7 +458,7 @@ class CWExtract(object): log.error(u'No books found in the Crosswalk response.') send_error_message(u'parse') return None - content = content.findAll(u'li') + content = content.find_all(u'li') books = [] for book in content: book = book.find(u'a') @@ -481,9 +481,8 @@ class HTTPBible(BibleDB): def __init__(self, parent, **kwargs): """ - Finds all the bibles defined for the system - Creates an Interface Object for each bible containing connection - information + Finds all the bibles defined for the system. Creates an Interface Object for each bible containing connection + information. Throws Exception if no Bibles are found. @@ -492,8 +491,7 @@ class HTTPBible(BibleDB): BibleDB.__init__(self, parent, **kwargs) self.download_source = kwargs[u'download_source'] self.download_name = kwargs[u'download_name'] - # TODO: Clean up proxy stuff. We probably want one global proxy per - # connection type (HTTP and HTTPS) at most. + # TODO: Clean up proxy stuff. We probably want one global proxy per connection type (HTTP and HTTPS) at most. self.proxy_server = None self.proxy_username = None self.proxy_password = None @@ -508,8 +506,8 @@ class HTTPBible(BibleDB): def do_import(self, bible_name=None): """ - Run the import. This method overrides the parent class method. Returns - ``True`` on success, ``False`` on failure. + Run the import. This method overrides the parent class method. Returns ``True`` on success, ``False`` on + failure. """ self.wizard.progress_bar.setMaximum(68) self.wizard.increment_progress_bar(translate('BiblesPlugin.HTTPBible', 'Registering Bible and loading books...')) @@ -549,8 +547,7 @@ class HTTPBible(BibleDB): if self.stop_import_flag: break self.wizard.increment_progress_bar(translate( - 'BiblesPlugin.HTTPBible', 'Importing %s...', - 'Importing ...') % book) + 'BiblesPlugin.HTTPBible', 'Importing %s...', 'Importing ...') % book) book_ref_id = self.get_book_ref_id_by_name(book, len(books), language_id) if not book_ref_id: log.exception(u'Importing books from %s - download name: "%s" '\ @@ -567,22 +564,19 @@ class HTTPBible(BibleDB): def get_verses(self, reference_list, show_error=True): """ - A reimplementation of the ``BibleDB.get_verses`` method, this one is - specifically for web Bibles. It first checks to see if the particular - chapter exists in the DB, and if not it pulls it from the web. If the - chapter DOES exist, it simply pulls the verses from the DB using the - ancestor method. + A reimplementation of the ``BibleDB.get_verses`` method, this one is specifically for web Bibles. It first + checks to see if the particular chapter exists in the DB, and if not it pulls it from the web. If the chapter + DOES exist, it simply pulls the verses from the DB using the ancestor method. ``reference_list`` - This is the list of references the media manager item wants. It is - a list of tuples, with the following format:: + This is the list of references the media manager item wants. It is a list of tuples, with the following + format:: (book_reference_id, chapter, start_verse, end_verse) - Therefore, when you are looking for multiple items, simply break - them up into references like this, bundle them into a list. This - function then runs through the list, and returns an amalgamated - list of ``Verse`` objects. For example:: + Therefore, when you are looking for multiple items, simply break them up into references like this, bundle + them into a list. This function then runs through the list, and returns an amalgamated list of ``Verse`` + objects. For example:: [(u'35', 1, 1, 1), (u'35', 2, 2, 3)] """ @@ -643,7 +637,7 @@ class HTTPBible(BibleDB): Return the number of chapters in a particular book. ``book`` - The book object to get the chapter count for. + The book object to get the chapter count for. """ log.debug(u'HTTPBible.get_chapter_count("%s")', book.name) return BiblesResourcesDB.get_chapter_count(book.book_reference_id) @@ -683,8 +677,8 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, An optional HTTP header to pass to the bible web server. ``pre_parse_regex`` - A regular expression to run on the webpage. Allows manipulation of the - webpage before passing to BeautifulSoup for parsing. + A regular expression to run on the webpage. Allows manipulation of the webpage before passing to BeautifulSoup + for parsing. ``pre_parse_substitute`` The text to replace any matches to the regular expression with. @@ -704,7 +698,7 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, soup = None try: if cleaner: - soup = BeautifulSoup(page_source, markupMassage=cleaner) + soup = BeautifulSoup(page_source, markup=cleaner) else: soup = BeautifulSoup(page_source) except HTMLParseError: diff --git a/scripts/check_dependencies.py b/scripts/check_dependencies.py index 4c0f69b91..86a029b71 100755 --- a/scripts/check_dependencies.py +++ b/scripts/check_dependencies.py @@ -79,7 +79,7 @@ MODULES = [ 'lxml', 'chardet', 'enchant', - 'BeautifulSoup', + 'bs4', 'mako', 'migrate', 'uno',