From 69009970c07641217ea2c1a8eee8810152b01812 Mon Sep 17 00:00:00 2001 From: Andreas Preikschat Date: Fri, 5 Apr 2013 21:58:13 +0200 Subject: [PATCH 1/6] replaced BeautifulSoup3 by BeautifulSoup4 --- openlp/core/ui/exceptionform.py | 2 +- openlp/plugins/bibles/lib/http.py | 106 ++++++++++++++---------------- scripts/check_dependencies.py | 2 +- 3 files changed, 52 insertions(+), 58 deletions(-) diff --git a/openlp/core/ui/exceptionform.py b/openlp/core/ui/exceptionform.py index 25f8201b1..f7dc83d29 100644 --- a/openlp/core/ui/exceptionform.py +++ b/openlp/core/ui/exceptionform.py @@ -35,7 +35,7 @@ import os import platform import sqlalchemy -import BeautifulSoup +from bs4 import BeautifulSoup from lxml import etree from PyQt4 import Qt, QtCore, QtGui, QtWebKit diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py index b01377a05..390ac1cd9 100644 --- a/openlp/plugins/bibles/lib/http.py +++ b/openlp/plugins/bibles/lib/http.py @@ -36,7 +36,7 @@ import socket import urllib from HTMLParser import HTMLParseError -from BeautifulSoup import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, Tag from openlp.core.lib import Registry, translate from openlp.core.lib.ui import critical_error_message_box @@ -44,6 +44,9 @@ from openlp.core.utils import get_web_page from openlp.plugins.bibles.lib import SearchResults from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book +CLEANER_REGEX = re.compile(' |
|\'\+\'') +FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])') +REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}') UGLY_CHARS = { u'\u2014': u' - ', u'\u2018': u'\'', @@ -52,9 +55,12 @@ UGLY_CHARS = { u'\u201d': u'"', u' ': u' ' } +VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*') + log = logging.getLogger(__name__) + class BGExtract(object): """ Extract verses from BibleGateway @@ -78,9 +84,9 @@ class BGExtract(object): An HTML class attribute for further qualification. """ if class_: - all_tags = parent.findAll(tag, class_) + all_tags = parent.find_all(tag, class_) else: - all_tags = parent.findAll(tag) + all_tags = parent.find_all(tag) for element in all_tags: element.extract() @@ -173,8 +179,8 @@ class BGExtract(object): def _extract_verses_old(self, div): """ - Use the old style of parsing for those Bibles on BG who mysteriously - have not been migrated to the new (still broken) HTML. + Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still + broken) HTML. ``div`` The parent div. @@ -185,13 +191,12 @@ class BGExtract(object): if first_verse and first_verse.contents: verse_list[1] = unicode(first_verse.contents[0]) for verse in div(u'sup', u'versenum'): - raw_verse_num = verse.next + raw_verse_num = verse.next_element clean_verse_num = 0 - # Not all verses exist in all translations and may or may not be - # represented by a verse number. If they are not fine, if they are - # it will probably be in a format that breaks int(). We will then - # have no idea what garbage may be sucked in to the verse text so - # if we do not get a clean int() then ignore the verse completely. + # Not all verses exist in all translations and may or may not be represented by a verse number. If they are + # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what + # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse + # completely. try: clean_verse_num = int(str(raw_verse_num)) except ValueError: @@ -201,16 +206,16 @@ class BGExtract(object): except TypeError: log.warn(u'Illegal verse number: %s', unicode(raw_verse_num)) if clean_verse_num: - verse_text = raw_verse_num.next - part = raw_verse_num.next.next + verse_text = raw_verse_num.next_element + part = raw_verse_num.next_element.next_element while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'): # While we are still in the same verse grab all the text. if isinstance(part, NavigableString): verse_text += part - if isinstance(part.next, Tag) and part.next.name == u'div': + if isinstance(part.next_element, Tag) and part.next_element.name == u'div': # Run out of verses so stop. break - part = part.next + part = part.next_element verse_list[clean_verse_num] = unicode(verse_text) return verse_list @@ -230,7 +235,7 @@ class BGExtract(object): log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter) url_book_name = urllib.quote(book_name.encode("utf-8")) url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version) - cleaner = [(re.compile(' |
|\'\+\''), lambda match: '')] + cleaner = [(CLEANER_REGEX, lambda match: '')] soup = get_soup_for_bible_ref( u'http://www.biblegateway.com/passage/?%s' % url_params, pre_parse_regex=r'', pre_parse_substitute='', cleaner=cleaner) @@ -238,7 +243,7 @@ class BGExtract(object): return None div = soup.find('div', 'result-text-style-normal') self._clean_soup(div) - span_list = div.findAll('span', 'text') + span_list = div.find_all('span', 'text') log.debug('Span list: %s', span_list) if not span_list: # If we don't get any spans then we must have the old HTML format @@ -282,7 +287,7 @@ class BGExtract(object): self.application.process_events() content = soup.find(u'table', u'infotable') if content: - content = content.findAll(u'tr') + content = content.find_all(u'tr') if not content: log.error(u'No books found in the Biblegateway response.') send_error_message(u'parse') @@ -341,19 +346,17 @@ class BSExtract(object): log.error(u'No verses found in the Bibleserver response.') send_error_message(u'parse') return None - content = content.find(u'div').findAll(u'div') - verse_number = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*') + content = content.find(u'div').find_all(u'div') verses = {} for verse in content: self.application.process_events() - versenumber = int(verse_number.sub(r'\3', verse[u'class'])) + versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', verse[u'class'])) verses[versenumber] = verse.contents[1].rstrip(u'\n') return SearchResults(book_name, chapter, verses) def get_books_from_http(self, version): """ - Load a list of all books a Bible contains from Bibleserver mobile - website. + Load a list of all books a Bible contains from Bibleserver mobile website. ``version`` The version of the Bible like NIV for New International Version @@ -369,7 +372,7 @@ class BSExtract(object): log.error(u'No books found in the Bibleserver response.') send_error_message(u'parse') return None - content = content.findAll(u'li') + content = content.find_all(u'li') return [book.contents[0].contents[0] for book in content] @@ -404,14 +407,12 @@ class CWExtract(object): if not soup: return None self.application.process_events() - html_verses = soup.findAll(u'span', u'versetext') + html_verses = soup.find_all(u'span', u'versetext') if not html_verses: log.error(u'No verses found in the CrossWalk response.') send_error_message(u'parse') return None verses = {} - reduce_spaces = re.compile(r'[ ]{2,}') - fix_punctuation = re.compile(r'[ ]+([.,;])') for verse in html_verses: self.application.process_events() verse_number = int(verse.contents[0].contents[0]) @@ -432,11 +433,10 @@ class CWExtract(object): if isinstance(subsub, NavigableString): verse_text += subsub self.application.process_events() - # Fix up leading and trailing spaces, multiple spaces, and spaces - # between text and , and . + # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and . verse_text = verse_text.strip(u'\n\r\t ') - verse_text = reduce_spaces.sub(u' ', verse_text) - verse_text = fix_punctuation.sub(r'\1', verse_text) + verse_text = REDUCE_SPACES_REGEX.sub(u' ', verse_text) + verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text) verses[verse_number] = verse_text return SearchResults(book_name, chapter, verses) @@ -458,7 +458,7 @@ class CWExtract(object): log.error(u'No books found in the Crosswalk response.') send_error_message(u'parse') return None - content = content.findAll(u'li') + content = content.find_all(u'li') books = [] for book in content: book = book.find(u'a') @@ -481,9 +481,8 @@ class HTTPBible(BibleDB): def __init__(self, parent, **kwargs): """ - Finds all the bibles defined for the system - Creates an Interface Object for each bible containing connection - information + Finds all the bibles defined for the system. Creates an Interface Object for each bible containing connection + information. Throws Exception if no Bibles are found. @@ -492,8 +491,7 @@ class HTTPBible(BibleDB): BibleDB.__init__(self, parent, **kwargs) self.download_source = kwargs[u'download_source'] self.download_name = kwargs[u'download_name'] - # TODO: Clean up proxy stuff. We probably want one global proxy per - # connection type (HTTP and HTTPS) at most. + # TODO: Clean up proxy stuff. We probably want one global proxy per connection type (HTTP and HTTPS) at most. self.proxy_server = None self.proxy_username = None self.proxy_password = None @@ -508,8 +506,8 @@ class HTTPBible(BibleDB): def do_import(self, bible_name=None): """ - Run the import. This method overrides the parent class method. Returns - ``True`` on success, ``False`` on failure. + Run the import. This method overrides the parent class method. Returns ``True`` on success, ``False`` on + failure. """ self.wizard.progress_bar.setMaximum(68) self.wizard.increment_progress_bar(translate('BiblesPlugin.HTTPBible', 'Registering Bible and loading books...')) @@ -549,8 +547,7 @@ class HTTPBible(BibleDB): if self.stop_import_flag: break self.wizard.increment_progress_bar(translate( - 'BiblesPlugin.HTTPBible', 'Importing %s...', - 'Importing ...') % book) + 'BiblesPlugin.HTTPBible', 'Importing %s...', 'Importing ...') % book) book_ref_id = self.get_book_ref_id_by_name(book, len(books), language_id) if not book_ref_id: log.exception(u'Importing books from %s - download name: "%s" '\ @@ -567,22 +564,19 @@ class HTTPBible(BibleDB): def get_verses(self, reference_list, show_error=True): """ - A reimplementation of the ``BibleDB.get_verses`` method, this one is - specifically for web Bibles. It first checks to see if the particular - chapter exists in the DB, and if not it pulls it from the web. If the - chapter DOES exist, it simply pulls the verses from the DB using the - ancestor method. + A reimplementation of the ``BibleDB.get_verses`` method, this one is specifically for web Bibles. It first + checks to see if the particular chapter exists in the DB, and if not it pulls it from the web. If the chapter + DOES exist, it simply pulls the verses from the DB using the ancestor method. ``reference_list`` - This is the list of references the media manager item wants. It is - a list of tuples, with the following format:: + This is the list of references the media manager item wants. It is a list of tuples, with the following + format:: (book_reference_id, chapter, start_verse, end_verse) - Therefore, when you are looking for multiple items, simply break - them up into references like this, bundle them into a list. This - function then runs through the list, and returns an amalgamated - list of ``Verse`` objects. For example:: + Therefore, when you are looking for multiple items, simply break them up into references like this, bundle + them into a list. This function then runs through the list, and returns an amalgamated list of ``Verse`` + objects. For example:: [(u'35', 1, 1, 1), (u'35', 2, 2, 3)] """ @@ -643,7 +637,7 @@ class HTTPBible(BibleDB): Return the number of chapters in a particular book. ``book`` - The book object to get the chapter count for. + The book object to get the chapter count for. """ log.debug(u'HTTPBible.get_chapter_count("%s")', book.name) return BiblesResourcesDB.get_chapter_count(book.book_reference_id) @@ -683,8 +677,8 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, An optional HTTP header to pass to the bible web server. ``pre_parse_regex`` - A regular expression to run on the webpage. Allows manipulation of the - webpage before passing to BeautifulSoup for parsing. + A regular expression to run on the webpage. Allows manipulation of the webpage before passing to BeautifulSoup + for parsing. ``pre_parse_substitute`` The text to replace any matches to the regular expression with. @@ -704,7 +698,7 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, soup = None try: if cleaner: - soup = BeautifulSoup(page_source, markupMassage=cleaner) + soup = BeautifulSoup(page_source, markup=cleaner) else: soup = BeautifulSoup(page_source) except HTMLParseError: diff --git a/scripts/check_dependencies.py b/scripts/check_dependencies.py index 4c0f69b91..86a029b71 100755 --- a/scripts/check_dependencies.py +++ b/scripts/check_dependencies.py @@ -79,7 +79,7 @@ MODULES = [ 'lxml', 'chardet', 'enchant', - 'BeautifulSoup', + 'bs4', 'mako', 'migrate', 'uno', From 19f9718aec7c83659eb7a8c993234207a9101816 Mon Sep 17 00:00:00 2001 From: Andreas Preikschat Date: Fri, 5 Apr 2013 22:06:00 +0200 Subject: [PATCH 2/6] added FIXME --- openlp/plugins/bibles/lib/http.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py index 390ac1cd9..78ed40525 100644 --- a/openlp/plugins/bibles/lib/http.py +++ b/openlp/plugins/bibles/lib/http.py @@ -698,7 +698,8 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, soup = None try: if cleaner: - soup = BeautifulSoup(page_source, markup=cleaner) + # FIXME: markupMassage not supported. + soup = BeautifulSoup(page_source, markupMassage=cleaner) else: soup = BeautifulSoup(page_source) except HTMLParseError: From ac32b6ca653c9723eaae6e3aef8bf971cdddc710 Mon Sep 17 00:00:00 2001 From: Andreas Preikschat Date: Fri, 5 Apr 2013 22:20:51 +0200 Subject: [PATCH 3/6] fixed missing r'' and removed argument --- openlp/plugins/bibles/lib/http.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py index 78ed40525..370216059 100644 --- a/openlp/plugins/bibles/lib/http.py +++ b/openlp/plugins/bibles/lib/http.py @@ -44,7 +44,7 @@ from openlp.core.utils import get_web_page from openlp.plugins.bibles.lib import SearchResults from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book -CLEANER_REGEX = re.compile(' |
|\'\+\'') +CLEANER_REGEX = re.compile(r' |
|\'\+\'') FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])') REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}') UGLY_CHARS = { @@ -699,7 +699,7 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, try: if cleaner: # FIXME: markupMassage not supported. - soup = BeautifulSoup(page_source, markupMassage=cleaner) + soup = BeautifulSoup(page_source)#, markupMassage=cleaner) else: soup = BeautifulSoup(page_source) except HTMLParseError: From 78ed2f655cdeaa2a5864e204970d4f946317b17d Mon Sep 17 00:00:00 2001 From: Andreas Preikschat Date: Sat, 6 Apr 2013 19:59:07 +0200 Subject: [PATCH 4/6] remove markupMassage --- openlp/plugins/bibles/lib/http.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py index 370216059..2eec3cbcd 100644 --- a/openlp/plugins/bibles/lib/http.py +++ b/openlp/plugins/bibles/lib/http.py @@ -235,10 +235,9 @@ class BGExtract(object): log.debug(u'BGExtract.get_bible_chapter("%s", "%s", "%s")', version, book_name, chapter) url_book_name = urllib.quote(book_name.encode("utf-8")) url_params = u'search=%s+%s&version=%s' % (url_book_name, chapter, version) - cleaner = [(CLEANER_REGEX, lambda match: '')] soup = get_soup_for_bible_ref( u'http://www.biblegateway.com/passage/?%s' % url_params, - pre_parse_regex=r'', pre_parse_substitute='', cleaner=cleaner) + pre_parse_regex=r'', pre_parse_substitute='') if not soup: return None div = soup.find('div', 'result-text-style-normal') @@ -665,8 +664,7 @@ class HTTPBible(BibleDB): application = property(_get_application) -def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, - pre_parse_substitute=None, cleaner=None): +def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, pre_parse_substitute=None): """ Gets a webpage and returns a parsed and optionally cleaned soup or None. @@ -682,9 +680,6 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, ``pre_parse_substitute`` The text to replace any matches to the regular expression with. - - ``cleaner`` - An optional regex to use during webpage parsing. """ if not reference_url: return None @@ -697,11 +692,8 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, page_source = re.sub(pre_parse_regex, pre_parse_substitute, page_source) soup = None try: - if cleaner: - # FIXME: markupMassage not supported. - soup = BeautifulSoup(page_source)#, markupMassage=cleaner) - else: - soup = BeautifulSoup(page_source) + soup = BeautifulSoup(page_source) + CLEANER_REGEX.sub(u'', soup) except HTMLParseError: log.exception(u'BeautifulSoup could not parse the bible page.') if not soup: From e2b8dc54f38fe3ad9f0cd448a1d03a4bddcfd507 Mon Sep 17 00:00:00 2001 From: Andreas Preikschat Date: Mon, 15 Apr 2013 21:54:27 +0200 Subject: [PATCH 5/6] fixed bs4 code; fixed regression --- openlp/plugins/bibles/lib/http.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py index 2eec3cbcd..44b19f857 100644 --- a/openlp/plugins/bibles/lib/http.py +++ b/openlp/plugins/bibles/lib/http.py @@ -99,14 +99,15 @@ class BGExtract(object): """ if isinstance(tag, NavigableString): return None, unicode(tag) - elif tag.get('class') == 'versenum' or tag.get('class') == 'versenum mid-line': + elif tag.get('class')[0] == "versenum" or tag.get('class')[0] == 'versenum mid-line': verse = unicode(tag.string).replace('[', '').replace(']', '').strip() return verse, None - elif tag.get('class') == 'chapternum': + elif tag.get('class')[0] == 'chapternum': verse = '1' return verse, None else: - verse, text = None, '' + verse = None + text = '' for child in tag.contents: c_verse, c_text = self._extract_verse(child) if c_verse: @@ -143,7 +144,8 @@ class BGExtract(object): tags = tags[::-1] current_text = '' for tag in tags: - verse, text = None, '' + verse = None + text = '' for child in tag.contents: c_verse, c_text = self._extract_verse(child) if c_verse: @@ -208,7 +210,7 @@ class BGExtract(object): if clean_verse_num: verse_text = raw_verse_num.next_element part = raw_verse_num.next_element.next_element - while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'): + while not (isinstance(part, Tag) and part.get(u'class')[0] == u'versenum'): # While we are still in the same verse grab all the text. if isinstance(part, NavigableString): verse_text += part @@ -349,7 +351,7 @@ class BSExtract(object): verses = {} for verse in content: self.application.process_events() - versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', verse[u'class'])) + versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', u' '.join(verse[u'class']))) verses[versenumber] = verse.contents[1].rstrip(u'\n') return SearchResults(book_name, chapter, verses) @@ -374,6 +376,16 @@ class BSExtract(object): content = content.find_all(u'li') return [book.contents[0].contents[0] for book in content] + def _get_application(self): + """ + Adds the openlp to the class dynamically + """ + if not hasattr(self, u'_application'): + self._application = Registry().get(u'application') + return self._application + + application = property(_get_application) + class CWExtract(object): """ @@ -693,7 +705,7 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, pre soup = None try: soup = BeautifulSoup(page_source) - CLEANER_REGEX.sub(u'', soup) + CLEANER_REGEX.sub(u'', unicode(soup)) except HTMLParseError: log.exception(u'BeautifulSoup could not parse the bible page.') if not soup: From 0fb24d2233f623031f195865db98d87771cf3d62 Mon Sep 17 00:00:00 2001 From: Andreas Preikschat Date: Wed, 17 Apr 2013 20:33:44 +0200 Subject: [PATCH 6/6] added tests --- openlp/plugins/custom/forms/editcustomform.py | 2 +- .../custom/forms/test_customform.py | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/openlp/plugins/custom/forms/editcustomform.py b/openlp/plugins/custom/forms/editcustomform.py index c78baa974..580fbde07 100644 --- a/openlp/plugins/custom/forms/editcustomform.py +++ b/openlp/plugins/custom/forms/editcustomform.py @@ -257,6 +257,6 @@ class EditCustomForm(QtGui.QDialog, Ui_CustomEditDialog): # We must have at least one slide. if self.slide_list_view.count() == 0: critical_error_message_box(message=translate('CustomPlugin.EditCustomForm', - 'You need to add at least one slide')) + 'You need to add at least one slide.')) return False return True diff --git a/tests/interfaces/openlp_plugins/custom/forms/test_customform.py b/tests/interfaces/openlp_plugins/custom/forms/test_customform.py index 6e6318ca6..dbed244e3 100644 --- a/tests/interfaces/openlp_plugins/custom/forms/test_customform.py +++ b/tests/interfaces/openlp_plugins/custom/forms/test_customform.py @@ -62,3 +62,45 @@ class TestEditCustomForm(TestCase): QtTest.QTest.mouseClick(self.form.add_button, QtCore.Qt.LeftButton) #THEN: One slide should be added. assert self.form.slide_list_view.count() == 1, u'There should be one slide added.' + + def validate_not_valid_part1_test(self): + """ + Test the _validate() method. + """ + # GIVEN: Mocked methods. + with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \ + mocked_critical_error_message_box: + mocked_displayText = MagicMock() + mocked_displayText.return_value = u'' + self.form.title_edit.displayText = mocked_displayText + mocked_setFocus = MagicMock() + self.form.title_edit.setFocus = mocked_setFocus + + # WHEN: Call the method. + result = self.form._validate() + + # THEN: The validate method should have returned False. + assert not result, u'The _validate() method should have retured False' + mocked_setFocus.assert_called_with() + mocked_critical_error_message_box.assert_called_with(message=u'You need to type in a title.') + + def validate_not_valid_part2_test(self): + """ + Test the _validate() method. + """ + # GIVEN: Mocked methods. + with patch(u'openlp.plugins.custom.forms.editcustomform.critical_error_message_box') as \ + mocked_critical_error_message_box: + mocked_displayText = MagicMock() + mocked_displayText.return_value = u'something' + self.form.title_edit.displayText = mocked_displayText + mocked_count = MagicMock() + mocked_count.return_value = 0 + self.form.slide_list_view.count = mocked_count + + # WHEN: Call the method. + result = self.form._validate() + + # THEN: The validate method should have returned False. + assert not result, u'The _validate() method should have retured False' + mocked_critical_error_message_box.assert_called_with(message=u'You need to add at least one slide.')