Replaced BeautifulSoup 3 with BeautifulSoup 4

2013-04-05 21:58:13 +02:00 · 2013-04-05 21:58:13 +02:00 · 69009970c0
commit 69009970c0
parent 131b46edef
3 changed files with 52 additions and 58 deletions
--- a/openlp/core/ui/exceptionform.py
+++ b/openlp/core/ui/exceptionform.py
@ -35,7 +35,7 @@ import os
 import platform
 import sqlalchemy
-import BeautifulSoup
+from bs4 import BeautifulSoup
 from lxml import etree
 from PyQt4 import Qt, QtCore, QtGui, QtWebKit
--- a/openlp/plugins/bibles/lib/http.py
+++ b/openlp/plugins/bibles/lib/http.py
@ -36,7 +36,7 @@ import socket
 import urllib
 from HTMLParser import HTMLParseError
-from BeautifulSoup import BeautifulSoup, NavigableString, Tag
+from bs4 import BeautifulSoup, NavigableString, Tag
 from openlp.core.lib import Registry, translate
 from openlp.core.lib.ui import critical_error_message_box
@ -44,6 +44,9 @@ from openlp.core.utils import get_web_page
 from openlp.plugins.bibles.lib import SearchResults
 from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book
 CLEANER_REGEX = re.compile('&nbsp;|<br />|\'\+\'')
 FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
 REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
 UGLY_CHARS = {
    u'\u2014': u' - ',
    u'\u2018': u'\'',
@ -52,9 +55,12 @@ UGLY_CHARS = {
    u'\u201d': u'"',
    u'&nbsp;': u' '
 }
 VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
 log = logging.getLogger(__name__)
 class BGExtract(object):
    """
    Extract verses from BibleGateway
@ -78,9 +84,9 @@ class BGExtract(object):
            An HTML class attribute for further qualification.
        """
        if class_:
-            all_tags = parent.findAll(tag, class_)
+            all_tags = parent.find_all(tag, class_)
        else:
-            all_tags = parent.findAll(tag)
+            all_tags = parent.find_all(tag)
        for element in all_tags:
            element.extract()
@ -173,8 +179,8 @@ class BGExtract(object):
    def _extract_verses_old(self, div):
        """
-        Use the old style of parsing for those Bibles on BG who mysteriously
+        Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
-        have not been migrated to the new (still broken) HTML.
+        broken) HTML.
        ``div``
            The parent div.
@ -185,13 +191,12 @@ class BGExtract(object):
        if first_verse and first_verse.contents:
            verse_list[1] = unicode(first_verse.contents[0])
        for verse in div(u'sup', u'versenum'):
-            raw_verse_num = verse.next
+            raw_verse_num = verse.next_element
            clean_verse_num = 0
-            # Not all verses exist in all translations and may or may not be
+            # Not all verses exist in all translations and may or may not be represented by a verse number. If they are
-            # represented by a verse number. If they are not fine, if they are
+            # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
-            # it will probably be in a format that breaks int(). We will then
+            # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
-            # have no idea what garbage may be sucked in to the verse text so
+            # completely.
            # if we do not get a clean int() then ignore the verse completely.
            try:
                clean_verse_num = int(str(raw_verse_num))
            except ValueError:
@ -201,16 +206,16 @@ class BGExtract(object):
            except TypeError:
                log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
            if clean_verse_num:
-                verse_text = raw_verse_num.next
+                verse_text = raw_verse_num.next_element
-                part = raw_verse_num.next.next
+                part = raw_verse_num.next_element.next_element
                while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'):
                    # While we are still in the same verse grab all the text.
                    if isinstance(part, NavigableString):
                        verse_text += part
-                    if isinstance(part.next, Tag) and part.next.name == u'div':
+                    if isinstance(part.next_element, Tag) and part.next_element.name == u'div':
                        # Run out of verses so stop.
                        break
-                    part = part.next
+                    part = part.next_element
                verse_list[clean_verse_num] = unicode(verse_text)
        return verse_list
@ -230,7 +235,7 @@ class BGExtract(object):
        log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter)
        url_book_name = urllib.quote(book_name.encode("utf-8"))
        url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version)
-        cleaner = [(re.compile('&nbsp;|<br />|\'\+\''), lambda match: '')]
+        cleaner = [(CLEANER_REGEX, lambda match: '')]
        soup = get_soup_for_bible_ref(
            u'http://www.biblegateway.com/passage/?%s' % url_params,
            pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='', cleaner=cleaner)
@ -238,7 +243,7 @@ class BGExtract(object):
            return None
        div = soup.find('div', 'result-text-style-normal')
        self._clean_soup(div)
-        span_list = div.findAll('span', 'text')
+        span_list = div.find_all('span', 'text')
        log.debug('Span list: %s', span_list)
        if not span_list:
            # If we don't get any spans then we must have the old HTML format
@ -282,7 +287,7 @@ class BGExtract(object):
        self.application.process_events()
        content = soup.find(u'table', u'infotable')
        if content:
-            content = content.findAll(u'tr')
+            content = content.find_all(u'tr')
        if not content:
            log.error(u'No books found in the Biblegateway response.')
            send_error_message(u'parse')
@ -341,19 +346,17 @@ class BSExtract(object):
            log.error(u'No verses found in the Bibleserver response.')
            send_error_message(u'parse')
            return None
-        content = content.find(u'div').findAll(u'div')
+        content = content.find(u'div').find_all(u'div')
        verse_number = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
        verses = {}
        for verse in content:
            self.application.process_events()
-            versenumber = int(verse_number.sub(r'\3', verse[u'class']))
+            versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', verse[u'class']))
            verses[versenumber] = verse.contents[1].rstrip(u'\n')
        return SearchResults(book_name, chapter, verses)
    def get_books_from_http(self, version):
        """
-        Load a list of all books a Bible contains from Bibleserver mobile
+        Load a list of all books a Bible contains from Bibleserver mobile website.
        website.
        ``version``
            The version of the Bible like NIV for New International Version
@ -369,7 +372,7 @@ class BSExtract(object):
            log.error(u'No books found in the Bibleserver response.')
            send_error_message(u'parse')
            return None
-        content = content.findAll(u'li')
+        content = content.find_all(u'li')
        return [book.contents[0].contents[0] for book in content]
@ -404,14 +407,12 @@ class CWExtract(object):
        if not soup:
            return None
        self.application.process_events()
-        html_verses = soup.findAll(u'span', u'versetext')
+        html_verses = soup.find_all(u'span', u'versetext')
        if not html_verses:
            log.error(u'No verses found in the CrossWalk response.')
            send_error_message(u'parse')
            return None
        verses = {}
        reduce_spaces = re.compile(r'[ ]{2,}')
        fix_punctuation = re.compile(r'[ ]+([.,;])')
        for verse in html_verses:
            self.application.process_events()
            verse_number = int(verse.contents[0].contents[0])
@ -432,11 +433,10 @@ class CWExtract(object):
                                if isinstance(subsub, NavigableString):
                                    verse_text += subsub
            self.application.process_events()
-            # Fix up leading and trailing spaces, multiple spaces, and spaces
+            # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
            # between text and , and .
            verse_text = verse_text.strip(u'\n\r\t ')
-            verse_text = reduce_spaces.sub(u' ', verse_text)
+            verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text)
-            verse_text = fix_punctuation.sub(r'\1', verse_text)
+            verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
            verses[verse_number] = verse_text
        return SearchResults(book_name, chapter, verses)
@ -458,7 +458,7 @@ class CWExtract(object):
            log.error(u'No books found in the Crosswalk response.')
            send_error_message(u'parse')
            return None
-        content = content.findAll(u'li')
+        content = content.find_all(u'li')
        books = []
        for book in content:
            book = book.find(u'a')
@ -481,9 +481,8 @@ class HTTPBible(BibleDB):
    def __init__(self, parent, **kwargs):
        """
-        Finds all the bibles defined for the system
+        Finds all the bibles defined for the system. Creates an Interface Object for each bible containing connection
-        Creates an Interface Object for each bible containing connection
+        information.
        information
        Throws Exception if no Bibles are found.
@ -492,8 +491,7 @@ class HTTPBible(BibleDB):
        BibleDB.__init__(self, parent, **kwargs)
        self.download_source = kwargs[u'download_source']
        self.download_name = kwargs[u'download_name']
-        # TODO: Clean up proxy stuff. We probably want one global proxy per
+        # TODO: Clean up proxy stuff. We probably want one global proxy per connection type (HTTP and HTTPS) at most.
        # connection type (HTTP and HTTPS) at most.
        self.proxy_server = None
        self.proxy_username = None
        self.proxy_password = None
@ -508,8 +506,8 @@ class HTTPBible(BibleDB):
    def do_import(self, bible_name=None):
        """
-        Run the import. This method overrides the parent class method. Returns
+        Run the import. This method overrides the parent class method. Returns ``True`` on success, ``False`` on
-        ``True`` on success, ``False`` on failure.
+        failure.
        """
        self.wizard.progress_bar.setMaximum(68)
        self.wizard.increment_progress_bar(translate('BiblesPlugin.HTTPBible', 'Registering Bible and loading books...'))
@ -549,8 +547,7 @@ class HTTPBible(BibleDB):
            if self.stop_import_flag:
                break
            self.wizard.increment_progress_bar(translate(
-                'BiblesPlugin.HTTPBible', 'Importing %s...',
+                'BiblesPlugin.HTTPBible', 'Importing %s...', 'Importing <book name>...') % book)
                'Importing <book name>...') % book)
            book_ref_id = self.get_book_ref_id_by_name(book, len(books), language_id)
            if not book_ref_id:
                log.exception(u'Importing books from %s - download name: "%s" '\
@ -567,22 +564,19 @@ class HTTPBible(BibleDB):
    def get_verses(self, reference_list, show_error=True):
        """
-        A reimplementation of the ``BibleDB.get_verses`` method, this one is
+        A reimplementation of the ``BibleDB.get_verses`` method, this one is specifically for web Bibles. It first
-        specifically for web Bibles. It first checks to see if the particular
+        checks to see if the particular chapter exists in the DB, and if not it pulls it from the web. If the chapter
-        chapter exists in the DB, and if not it pulls it from the web. If the
+        DOES exist, it simply pulls the verses from the DB using the ancestor method.
        chapter DOES exist, it simply pulls the verses from the DB using the
        ancestor method.
        ``reference_list``
-            This is the list of references the media manager item wants. It is
+            This is the list of references the media manager item wants. It is a list of tuples, with the following
-            a list of tuples, with the following format::
+            format::
                (book_reference_id, chapter, start_verse, end_verse)
-            Therefore, when you are looking for multiple items, simply break
+            Therefore, when you are looking for multiple items, simply break them up into references like this, bundle
-            them up into references like this, bundle them into a list. This
+            them into a list. This function then runs through the list, and returns an amalgamated list of ``Verse``
-            function then runs through the list, and returns an amalgamated
+            objects. For example::
            list of ``Verse`` objects. For example::
                [(u'35', 1, 1, 1), (u'35', 2, 2, 3)]
        """
@ -683,8 +677,8 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
        An optional HTTP header to pass to the bible web server.
    ``pre_parse_regex``
-        A regular expression to run on the webpage. Allows manipulation of the
+        A regular expression to run on the webpage. Allows manipulation of the webpage before passing to BeautifulSoup
-        webpage before passing to BeautifulSoup for parsing.
+        for parsing.
    ``pre_parse_substitute``
        The text to replace any matches to the regular expression with.
@ -704,7 +698,7 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
    soup = None
    try:
        if cleaner:
-            soup = BeautifulSoup(page_source, markupMassage=cleaner)
+            soup = BeautifulSoup(page_source, markup=cleaner)
        else:
            soup = BeautifulSoup(page_source)
    except HTMLParseError:
--- a/scripts/check_dependencies.py
+++ b/scripts/check_dependencies.py
@ -79,7 +79,7 @@ MODULES = [
    'lxml',
    'chardet',
    'enchant',
-    'BeautifulSoup',
+    'bs4',
    'mako',
    'migrate',
    'uno',