- ported code to beautifulsoup 4

- moved regular expressions to the top - fixed regression bzr-revno: 2238
2013-04-23 18:56:23 +02:00 · 2013-04-23 18:56:23 +02:00 · ba261e76e5
commit ba261e76e5
parent 6dcb708f17 3f60c34440
5 changed files with 113 additions and 74 deletions
--- a/openlp/core/ui/exceptionform.py
+++ b/openlp/core/ui/exceptionform.py
@ -35,7 +35,7 @@ import os
 import platform
 import sqlalchemy
-import BeautifulSoup
+from bs4 import BeautifulSoup
 from lxml import etree
 from PyQt4 import Qt, QtCore, QtGui, QtWebKit
--- a/openlp/plugins/bibles/lib/http.py
+++ b/openlp/plugins/bibles/lib/http.py
@ -36,7 +36,7 @@ import socket
 import urllib
 from HTMLParser import HTMLParseError
-from BeautifulSoup import BeautifulSoup, NavigableString, Tag
+from bs4 import BeautifulSoup, NavigableString, Tag
 from openlp.core.lib import Registry, translate
 from openlp.core.lib.ui import critical_error_message_box
@ -44,6 +44,9 @@ from openlp.core.utils import get_web_page
 from openlp.plugins.bibles.lib import SearchResults
 from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book
 CLEANER_REGEX = re.compile(r'&nbsp;|<br />|\'\+\'')
 FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
 REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
 UGLY_CHARS = {
    u'\u2014': u' - ',
    u'\u2018': u'\'',
@ -52,6 +55,8 @@ UGLY_CHARS = {
    u'\u201d': u'"',
    u'&nbsp;': u' '
 }
 VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
 log = logging.getLogger(__name__)
@ -79,9 +84,9 @@ class BGExtract(object):
            An HTML class attribute for further qualification.
        """
        if class_:
-            all_tags = parent.findAll(tag, class_)
+            all_tags = parent.find_all(tag, class_)
        else:
-            all_tags = parent.findAll(tag)
+            all_tags = parent.find_all(tag)
        for element in all_tags:
            element.extract()
@ -94,14 +99,15 @@ class BGExtract(object):
        """
        if isinstance(tag, NavigableString):
            return None, unicode(tag)
-        elif tag.get('class') == 'versenum' or tag.get('class') == 'versenum mid-line':
+        elif tag.get('class')[0] == "versenum" or tag.get('class')[0] == 'versenum mid-line':
            verse = unicode(tag.string).replace('[', '').replace(']', '').strip()
            return verse, None
-        elif tag.get('class') == 'chapternum':
+        elif tag.get('class')[0] == 'chapternum':
            verse = '1'
            return verse, None
        else:
-            verse, text = None, ''
+            verse = None
            text = ''
            for child in tag.contents:
                c_verse, c_text = self._extract_verse(child)
                if c_verse:
@ -138,7 +144,8 @@ class BGExtract(object):
        tags = tags[::-1]
        current_text = ''
        for tag in tags:
-            verse, text = None, ''
+            verse = None
            text = ''
            for child in tag.contents:
                c_verse, c_text = self._extract_verse(child)
                if c_verse:
@ -174,8 +181,8 @@ class BGExtract(object):
    def _extract_verses_old(self, div):
        """
-        Use the old style of parsing for those Bibles on BG who mysteriously
+        Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
-        have not been migrated to the new (still broken) HTML.
+        broken) HTML.
        ``div``
            The parent div.
@ -186,13 +193,12 @@ class BGExtract(object):
        if first_verse and first_verse.contents:
            verse_list[1] = unicode(first_verse.contents[0])
        for verse in div(u'sup', u'versenum'):
-            raw_verse_num = verse.next
+            raw_verse_num = verse.next_element
            clean_verse_num = 0
-            # Not all verses exist in all translations and may or may not be
+            # Not all verses exist in all translations and may or may not be represented by a verse number. If they are
-            # represented by a verse number. If they are not fine, if they are
+            # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
-            # it will probably be in a format that breaks int(). We will then
+            # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
-            # have no idea what garbage may be sucked in to the verse text so
+            # completely.
            # if we do not get a clean int() then ignore the verse completely.
            try:
                clean_verse_num = int(str(raw_verse_num))
            except ValueError:
@ -202,16 +208,16 @@ class BGExtract(object):
            except TypeError:
                log.warn(u'Illegal verse number: %s', unicode(raw_verse_num))
            if clean_verse_num:
-                verse_text = raw_verse_num.next
+                verse_text = raw_verse_num.next_element
-                part = raw_verse_num.next.next
+                part = raw_verse_num.next_element.next_element
-                while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'):
+                while not (isinstance(part, Tag) and part.get(u'class')[0] == u'versenum'):
                    # While we are still in the same verse grab all the text.
                    if isinstance(part, NavigableString):
                        verse_text += part
-                    if isinstance(part.next, Tag) and part.next.name == u'div':
+                    if isinstance(part.next_element, Tag) and part.next_element.name == u'div':
                        # Run out of verses so stop.
                        break
-                    part = part.next
+                    part = part.next_element
                verse_list[clean_verse_num] = unicode(verse_text)
        return verse_list
@ -231,15 +237,14 @@ class BGExtract(object):
        log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter)
        url_book_name = urllib.quote(book_name.encode("utf-8"))
        url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version)
        cleaner = [(re.compile('&nbsp;|<br />|\'\+\''), lambda match: '')]
        soup = get_soup_for_bible_ref(
            u'http://www.biblegateway.com/passage/?%s' % url_params,
-            pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='', cleaner=cleaner)
+            pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
        if not soup:
            return None
        div = soup.find('div', 'result-text-style-normal')
        self._clean_soup(div)
-        span_list = div.findAll('span', 'text')
+        span_list = div.find_all('span', 'text')
        log.debug('Span list: %s', span_list)
        if not span_list:
            # If we don't get any spans then we must have the old HTML format
@ -283,7 +288,7 @@ class BGExtract(object):
        self.application.process_events()
        content = soup.find(u'table', u'infotable')
        if content:
-            content = content.findAll(u'tr')
+            content = content.find_all(u'tr')
        if not content:
            log.error(u'No books found in the Biblegateway response.')
            send_error_message(u'parse')
@ -342,19 +347,17 @@ class BSExtract(object):
            log.error(u'No verses found in the Bibleserver response.')
            send_error_message(u'parse')
            return None
-        content = content.find(u'div').findAll(u'div')
+        content = content.find(u'div').find_all(u'div')
        verse_number = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
        verses = {}
        for verse in content:
            self.application.process_events()
-            versenumber = int(verse_number.sub(r'\3', verse[u'class']))
+            versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', u' '.join(verse[u'class'])))
            verses[versenumber] = verse.contents[1].rstrip(u'\n')
        return SearchResults(book_name, chapter, verses)
    def get_books_from_http(self, version):
        """
-        Load a list of all books a Bible contains from Bibleserver mobile
+        Load a list of all books a Bible contains from Bibleserver mobile website.
        website.
        ``version``
            The version of the Bible like NIV for New International Version
@ -370,9 +373,19 @@ class BSExtract(object):
            log.error(u'No books found in the Bibleserver response.')
            send_error_message(u'parse')
            return None
-        content = content.findAll(u'li')
+        content = content.find_all(u'li')
        return [book.contents[0].contents[0] for book in content]
    def _get_application(self):
        """
        Adds the openlp to the class dynamically
        """
        if not hasattr(self, u'_application'):
            self._application = Registry().get(u'application')
        return self._application
    application = property(_get_application)
 class CWExtract(object):
    """
@ -405,14 +418,12 @@ class CWExtract(object):
        if not soup:
            return None
        self.application.process_events()
-        html_verses = soup.findAll(u'span', u'versetext')
+        html_verses = soup.find_all(u'span', u'versetext')
        if not html_verses:
            log.error(u'No verses found in the CrossWalk response.')
            send_error_message(u'parse')
            return None
        verses = {}
        reduce_spaces = re.compile(r'[ ]{2,}')
        fix_punctuation = re.compile(r'[ ]+([.,;])')
        for verse in html_verses:
            self.application.process_events()
            verse_number = int(verse.contents[0].contents[0])
@ -433,11 +444,10 @@ class CWExtract(object):
                                if isinstance(subsub, NavigableString):
                                    verse_text += subsub
            self.application.process_events()
-            # Fix up leading and trailing spaces, multiple spaces, and spaces
+            # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
            # between text and , and .
            verse_text = verse_text.strip(u'\n\r\t ')
-            verse_text = reduce_spaces.sub(u' ', verse_text)
+            verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text)
-            verse_text = fix_punctuation.sub(r'\1', verse_text)
+            verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
            verses[verse_number] = verse_text
        return SearchResults(book_name, chapter, verses)
@ -459,7 +469,7 @@ class CWExtract(object):
            log.error(u'No books found in the Crosswalk response.')
            send_error_message(u'parse')
            return None
-        content = content.findAll(u'li')
+        content = content.find_all(u'li')
        books = []
        for book in content:
            book = book.find(u'a')
@ -482,9 +492,8 @@ class HTTPBible(BibleDB):
    def __init__(self, parent, **kwargs):
        """
-        Finds all the bibles defined for the system
+        Finds all the bibles defined for the system. Creates an Interface Object for each bible containing connection
-        Creates an Interface Object for each bible containing connection
+        information.
        information
        Throws Exception if no Bibles are found.
@ -493,8 +502,7 @@ class HTTPBible(BibleDB):
        BibleDB.__init__(self, parent, **kwargs)
        self.download_source = kwargs[u'download_source']
        self.download_name = kwargs[u'download_name']
-        # TODO: Clean up proxy stuff. We probably want one global proxy per
+        # TODO: Clean up proxy stuff. We probably want one global proxy per connection type (HTTP and HTTPS) at most.
        # connection type (HTTP and HTTPS) at most.
        self.proxy_server = None
        self.proxy_username = None
        self.proxy_password = None
@ -509,8 +517,8 @@ class HTTPBible(BibleDB):
    def do_import(self, bible_name=None):
        """
-        Run the import. This method overrides the parent class method. Returns
+        Run the import. This method overrides the parent class method. Returns ``True`` on success, ``False`` on
-        ``True`` on success, ``False`` on failure.
+        failure.
        """
        self.wizard.progress_bar.setMaximum(68)
        self.wizard.increment_progress_bar(translate('BiblesPlugin.HTTPBible', 'Registering Bible and loading books...'))
@ -550,8 +558,7 @@ class HTTPBible(BibleDB):
            if self.stop_import_flag:
                break
            self.wizard.increment_progress_bar(translate(
-                'BiblesPlugin.HTTPBible', 'Importing %s...',
+                'BiblesPlugin.HTTPBible', 'Importing %s...', 'Importing <book name>...') % book)
                'Importing <book name>...') % book)
            book_ref_id = self.get_book_ref_id_by_name(book, len(books), language_id)
            if not book_ref_id:
                log.exception(u'Importing books from %s - download name: "%s" '\
@ -568,22 +575,19 @@ class HTTPBible(BibleDB):
    def get_verses(self, reference_list, show_error=True):
        """
-        A reimplementation of the ``BibleDB.get_verses`` method, this one is
+        A reimplementation of the ``BibleDB.get_verses`` method, this one is specifically for web Bibles. It first
-        specifically for web Bibles. It first checks to see if the particular
+        checks to see if the particular chapter exists in the DB, and if not it pulls it from the web. If the chapter
-        chapter exists in the DB, and if not it pulls it from the web. If the
+        DOES exist, it simply pulls the verses from the DB using the ancestor method.
        chapter DOES exist, it simply pulls the verses from the DB using the
        ancestor method.
        ``reference_list``
-            This is the list of references the media manager item wants. It is
+            This is the list of references the media manager item wants. It is a list of tuples, with the following
-            a list of tuples, with the following format::
+            format::
                (book_reference_id, chapter, start_verse, end_verse)
-            Therefore, when you are looking for multiple items, simply break
+            Therefore, when you are looking for multiple items, simply break them up into references like this, bundle
-            them up into references like this, bundle them into a list. This
+            them into a list. This function then runs through the list, and returns an amalgamated list of ``Verse``
-            function then runs through the list, and returns an amalgamated
+            objects. For example::
            list of ``Verse`` objects. For example::
                [(u'35', 1, 1, 1), (u'35', 2, 2, 3)]
        """
@ -644,7 +648,7 @@ class HTTPBible(BibleDB):
        Return the number of chapters in a particular book.
        ``book``
-        The book object to get the chapter count for.
+            The book object to get the chapter count for.
        """
        log.debug(u'HTTPBible.get_chapter_count("%s")', book.name)
        return BiblesResourcesDB.get_chapter_count(book.book_reference_id)
@ -672,9 +676,7 @@ class HTTPBible(BibleDB):
    application = property(_get_application)
-
+def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, pre_parse_substitute=None):
 def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
    pre_parse_substitute=None, cleaner=None):
    """
    Gets a webpage and returns a parsed and optionally cleaned soup or None.
@ -685,14 +687,11 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
        An optional HTTP header to pass to the bible web server.
    ``pre_parse_regex``
-        A regular expression to run on the webpage. Allows manipulation of the
+        A regular expression to run on the webpage. Allows manipulation of the webpage before passing to BeautifulSoup
-        webpage before passing to BeautifulSoup for parsing.
+        for parsing.
    ``pre_parse_substitute``
        The text to replace any matches to the regular expression with.
    ``cleaner``
        An optional regex to use during webpage parsing.
    """
    if not reference_url:
        return None
@ -705,10 +704,8 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
        page_source = re.sub(pre_parse_regex, pre_parse_substitute, page_source)
    soup = None
    try:
-        if cleaner:
+        soup = BeautifulSoup(page_source)
-            soup = BeautifulSoup(page_source, markupMassage=cleaner)
+        CLEANER_REGEX.sub(u'', unicode(soup))
        else:
            soup = BeautifulSoup(page_source)
    except HTMLParseError:
        log.exception(u'BeautifulSoup could not parse the bible page.')
    if not soup:
--- a/openlp/plugins/custom/forms/editcustomform.py
+++ b/openlp/plugins/custom/forms/editcustomform.py
@ -257,6 +257,6 @@ class EditCustomForm(QtGui.QDialog, Ui_CustomEditDialog):
        # We must have at least one slide.
        if self.slide_list_view.count() == 0:
            critical_error_message_box(message=translate('CustomPlugin.EditCustomForm',
-                'You need to add at least one slide'))
+                'You need to add at least one slide.'))
            return False
        return True
--- a/scripts/check_dependencies.py
+++ b/scripts/check_dependencies.py
@ -79,7 +79,7 @@ MODULES = [
    'lxml',
    'chardet',
    'enchant',
-    'BeautifulSoup',
+    'bs4',
    'mako',
    'cherrypy',
    'migrate',
--- a/tests/interfaces/openlp_plugins/custom/forms/test_customform.py
+++ b/tests/interfaces/openlp_plugins/custom/forms/test_customform.py
@ -72,3 +72,45 @@ class TestEditCustomForm(TestCase):
            # THEN: One slide should be added.
            assert self.form.slide_list_view.count() == 1, u'There should be one slide added.'
    def validate_not_valid_part1_test(self):
        """
        Test the _validate() method.
        """
        # GIVEN: Mocked methods.
        with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \
                mocked_critical_error_message_box:
            mocked_displayText = MagicMock()
            mocked_displayText.return_value = u''
            self.form.title_edit.displayText = mocked_displayText
            mocked_setFocus = MagicMock()
            self.form.title_edit.setFocus = mocked_setFocus
            # WHEN: Call the method.
            result = self.form._validate()
            # THEN: The validate method should have returned False.
            assert not result, u'The _validate() method should have retured False'
            mocked_setFocus.assert_called_with()
            mocked_critical_error_message_box.assert_called_with(message=u'You need to type in a title.')
    def validate_not_valid_part2_test(self):
        """
        Test the _validate() method.
        """
        # GIVEN: Mocked methods.
        with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \
                mocked_critical_error_message_box:
            mocked_displayText = MagicMock()
            mocked_displayText.return_value = u'something'
            self.form.title_edit.displayText = mocked_displayText
            mocked_count = MagicMock()
            mocked_count.return_value = 0
            self.form.slide_list_view.count = mocked_count
            # WHEN: Call the method.
            result = self.form._validate()
            # THEN: The validate method should have returned False.
            assert not result, u'The _validate() method should have retured False'
            mocked_critical_error_message_box.assert_called_with(message=u'You need to add at least one slide.')