From 894b4fbf10c08f9f0489bab6086199c8728a3809 Mon Sep 17 00:00:00 2001 From: Philip Ridout Date: Sat, 20 Aug 2016 21:00:50 +0100 Subject: [PATCH] revert changes to http.py due to circular references --- .../plugins/bibles/forms/bibleimportform.py | 4 +- openlp/plugins/bibles/lib/http.py | 535 ++++++++++++++++++ .../bibles/lib/importers/biblegateway.py | 313 ---------- .../bibles/lib/importers/bibleserver.py | 162 ------ .../plugins/bibles/lib/importers/crosswalk.py | 171 ------ .../openlp_plugins/bibles/test_bibleserver.py | 43 +- .../openlp_plugins/bibles/test_lib_http.py | 4 +- 7 files changed, 572 insertions(+), 660 deletions(-) delete mode 100644 openlp/plugins/bibles/lib/importers/biblegateway.py delete mode 100644 openlp/plugins/bibles/lib/importers/bibleserver.py delete mode 100644 openlp/plugins/bibles/lib/importers/crosswalk.py diff --git a/openlp/plugins/bibles/forms/bibleimportform.py b/openlp/plugins/bibles/forms/bibleimportform.py index e9eee88d5..3d02228ca 100644 --- a/openlp/plugins/bibles/forms/bibleimportform.py +++ b/openlp/plugins/bibles/forms/bibleimportform.py @@ -40,9 +40,7 @@ from openlp.core.ui.lib.wizard import OpenLPWizard, WizardStrings from openlp.core.common.languagemanager import get_locale_key from openlp.plugins.bibles.lib.manager import BibleFormat from openlp.plugins.bibles.lib.db import clean_filename -from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract -from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract -from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract +from openlp.plugins.bibles.lib.importers.http import CWExtract, BGExtract, BSExtract log = logging.getLogger(__name__) diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py index 5afd107f6..6921c9005 100644 --- a/openlp/plugins/bibles/lib/http.py +++ b/openlp/plugins/bibles/lib/http.py @@ -38,10 +38,545 @@ from openlp.plugins.bibles.lib.bibleimport import BibleImport from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book CLEANER_REGEX = re.compile(r' |
|\'\+\'') +FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])') +REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}') +UGLY_CHARS = { + '\u2014': ' - ', + '\u2018': '\'', + '\u2019': '\'', + '\u201c': '"', + '\u201d': '"', + ' ': ' ' +} +VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*') + +BIBLESERVER_LANGUAGE_CODE = { + 'fl_1': 'de', + 'fl_2': 'en', + 'fl_3': 'fr', + 'fl_4': 'it', + 'fl_5': 'es', + 'fl_6': 'pt', + 'fl_7': 'ru', + 'fl_8': 'sv', + 'fl_9': 'no', + 'fl_10': 'nl', + 'fl_11': 'cs', + 'fl_12': 'sk', + 'fl_13': 'ro', + 'fl_14': 'hr', + 'fl_15': 'hu', + 'fl_16': 'bg', + 'fl_17': 'ar', + 'fl_18': 'tr', + 'fl_19': 'pl', + 'fl_20': 'da', + 'fl_21': 'zh' +} + +CROSSWALK_LANGUAGES = { + 'Portuguese': 'pt', + 'German': 'de', + 'Italian': 'it', + 'Español': 'es', + 'French': 'fr', + 'Dutch': 'nl' +} log = logging.getLogger(__name__) +class BGExtract(RegistryProperties): + """ + Extract verses from BibleGateway + """ + def __init__(self, proxy_url=None): + log.debug('BGExtract.init("{url}")'.format(url=proxy_url)) + self.proxy_url = proxy_url + socket.setdefaulttimeout(30) + + def _remove_elements(self, parent, tag, class_=None): + """ + Remove a particular element from the BeautifulSoup tree. + + :param parent: The element from which items need to be removed. + :param tag: A string of the tab type, e.g. "div" + :param class_: An HTML class attribute for further qualification. + """ + if class_: + all_tags = parent.find_all(tag, class_) + else: + all_tags = parent.find_all(tag) + for element in all_tags: + element.extract() + + def _extract_verse(self, tag): + """ + Extract a verse (or part of a verse) from a tag. + + :param tag: The BeautifulSoup Tag element with the stuff we want. + """ + if isinstance(tag, NavigableString): + return None, str(tag) + elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'): + verse = str(tag.string).replace('[', '').replace(']', '').strip() + return verse, None + elif tag.get('class') and tag.get('class')[0] == 'chapternum': + verse = '1' + return verse, None + else: + verse = None + text = '' + for child in tag.contents: + c_verse, c_text = self._extract_verse(child) + if c_verse: + verse = c_verse + if text and c_text: + text += c_text + elif c_text is not None: + text = c_text + return verse, text + + def _clean_soup(self, tag): + """ + Remove all the rubbish from the HTML page. + + :param tag: The base tag within which we want to remove stuff. + """ + self._remove_elements(tag, 'sup', 'crossreference') + self._remove_elements(tag, 'sup', 'footnote') + self._remove_elements(tag, 'div', 'footnotes') + self._remove_elements(tag, 'div', 'crossrefs') + self._remove_elements(tag, 'h3') + self._remove_elements(tag, 'h4') + self._remove_elements(tag, 'h5') + + def _extract_verses(self, tags): + """ + Extract all the verses from a pre-prepared list of HTML tags. + + :param tags: A list of BeautifulSoup Tag elements. + """ + verses = [] + tags = tags[::-1] + current_text = '' + for tag in tags: + verse = None + text = '' + for child in tag.contents: + c_verse, c_text = self._extract_verse(child) + if c_verse: + verse = c_verse + if text and c_text: + text += c_text + elif c_text is not None: + text = c_text + if not verse: + current_text = text + ' ' + current_text + else: + text += ' ' + current_text + current_text = '' + if text: + for old, new in UGLY_CHARS.items(): + text = text.replace(old, new) + text = ' '.join(text.split()) + if verse and text: + verse = verse.strip() + try: + verse = int(verse) + except ValueError: + verse_parts = verse.split('-') + if len(verse_parts) > 1: + verse = int(verse_parts[0]) + except TypeError: + log.warning('Illegal verse number: {verse:d}'.format(verse=verse)) + verses.append((verse, text)) + verse_list = {} + for verse, text in verses[::-1]: + verse_list[verse] = text + return verse_list + + def _extract_verses_old(self, div): + """ + Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still + broken) HTML. + + :param div: The parent div. + """ + verse_list = {} + # Cater for inconsistent mark up in the first verse of a chapter. + first_verse = div.find('versenum') + if first_verse and first_verse.contents: + verse_list[1] = str(first_verse.contents[0]) + for verse in div('sup', 'versenum'): + raw_verse_num = verse.next_element + clean_verse_num = 0 + # Not all verses exist in all translations and may or may not be represented by a verse number. If they are + # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what + # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse + # completely. + try: + clean_verse_num = int(str(raw_verse_num)) + except ValueError: + verse_parts = str(raw_verse_num).split('-') + if len(verse_parts) > 1: + clean_verse_num = int(verse_parts[0]) + except TypeError: + log.warning('Illegal verse number: {verse:d}'.format(verse=raw_verse_num)) + if clean_verse_num: + verse_text = raw_verse_num.next_element + part = raw_verse_num.next_element.next_element + while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'): + # While we are still in the same verse grab all the text. + if isinstance(part, NavigableString): + verse_text += part + if isinstance(part.next_element, Tag) and part.next_element.name == 'div': + # Run out of verses so stop. + break + part = part.next_element + verse_list[clean_verse_num] = str(verse_text) + return verse_list + + def get_bible_chapter(self, version, book_name, chapter): + """ + Access and decode Bibles via the BibleGateway website. + + :param version: The version of the Bible like 31 for New International version. + :param book_name: Name of the Book. + :param chapter: Chapter number. + """ + log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version, + name=book_name, + chapter=chapter)) + url_book_name = urllib.parse.quote(book_name.encode("utf-8")) + url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name, + chapter=chapter, + version=version) + soup = get_soup_for_bible_ref( + 'http://biblegateway.com/passage/?{url}'.format(url=url_params), + pre_parse_regex=r'', pre_parse_substitute='') + if not soup: + return None + div = soup.find('div', 'result-text-style-normal') + if not div: + return None + self._clean_soup(div) + span_list = div.find_all('span', 'text') + log.debug('Span list: {span}'.format(span=span_list)) + if not span_list: + # If we don't get any spans then we must have the old HTML format + verse_list = self._extract_verses_old(div) + else: + verse_list = self._extract_verses(span_list) + if not verse_list: + log.debug('No content found in the BibleGateway response.') + send_error_message('parse') + return None + return SearchResults(book_name, chapter, verse_list) + + def get_books_from_http(self, version): + """ + Load a list of all books a Bible contains from BibleGateway website. + + :param version: The version of the Bible like NIV for New International Version + """ + log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version)) + url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)}) + reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params) + page = get_web_page(reference_url) + if not page: + send_error_message('download') + return None + page_source = page.read() + try: + page_source = str(page_source, 'utf8') + except UnicodeDecodeError: + page_source = str(page_source, 'cp1251') + try: + soup = BeautifulSoup(page_source, 'lxml') + except Exception: + log.error('BeautifulSoup could not parse the Bible page.') + send_error_message('parse') + return None + if not soup: + send_error_message('parse') + return None + self.application.process_events() + content = soup.find('table', 'infotable') + if content: + content = content.find_all('tr') + if not content: + log.error('No books found in the Biblegateway response.') + send_error_message('parse') + return None + books = [] + for book in content: + book = book.find('td') + if book: + books.append(book.contents[1]) + return books + + def get_bibles_from_http(self): + """ + Load a list of bibles from BibleGateway website. + + returns a list in the form [(biblename, biblekey, language_code)] + """ + log.debug('BGExtract.get_bibles_from_http') + bible_url = 'https://biblegateway.com/versions/' + soup = get_soup_for_bible_ref(bible_url) + if not soup: + return None + bible_select = soup.find('select', {'class': 'search-translation-select'}) + if not bible_select: + log.debug('No select tags found - did site change?') + return None + option_tags = bible_select.find_all('option') + if not option_tags: + log.debug('No option tags found - did site change?') + return None + current_lang = '' + bibles = [] + for ot in option_tags: + tag_class = '' + try: + tag_class = ot['class'][0] + except KeyError: + tag_class = '' + tag_text = ot.get_text() + if tag_class == 'lang': + current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower() + elif tag_class == 'spacer': + continue + else: + bibles.append((tag_text, ot['value'], current_lang)) + return bibles + + +class BSExtract(RegistryProperties): + """ + Extract verses from Bibleserver.com + """ + def __init__(self, proxy_url=None): + log.debug('BSExtract.init("{url}")'.format(url=proxy_url)) + self.proxy_url = proxy_url + socket.setdefaulttimeout(30) + + def get_bible_chapter(self, version, book_name, chapter): + """ + Access and decode bibles via Bibleserver mobile website + + :param version: The version of the bible like NIV for New International Version + :param book_name: Text name of bible book e.g. Genesis, 1. John, 1John or Offenbarung + :param chapter: Chapter number + """ + log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version, + book=book_name, + chapter=chapter)) + url_version = urllib.parse.quote(version.encode("utf-8")) + url_book_name = urllib.parse.quote(book_name.encode("utf-8")) + chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter:d}'.format(version=url_version, + name=url_book_name, + chapter=chapter) + header = ('Accept-Language', 'en') + soup = get_soup_for_bible_ref(chapter_url, header) + if not soup: + return None + self.application.process_events() + content = soup.find('div', 'content') + if not content: + log.error('No verses found in the Bibleserver response.') + send_error_message('parse') + return None + content = content.find('div').find_all('div') + verses = {} + for verse in content: + self.application.process_events() + versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', ' '.join(verse['class']))) + verses[versenumber] = verse.contents[1].rstrip('\n') + return SearchResults(book_name, chapter, verses) + + def get_books_from_http(self, version): + """ + Load a list of all books a Bible contains from Bibleserver mobile website. + + :param version: The version of the Bible like NIV for New International Version + """ + log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version)) + url_version = urllib.parse.quote(version.encode("utf-8")) + chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version) + soup = get_soup_for_bible_ref(chapter_url) + if not soup: + return None + content = soup.find('ul') + if not content: + log.error('No books found in the Bibleserver response.') + send_error_message('parse') + return None + content = content.find_all('li') + return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)] + + def get_bibles_from_http(self): + """ + Load a list of bibles from Bibleserver website. + + returns a list in the form [(biblename, biblekey, language_code)] + """ + log.debug('BSExtract.get_bibles_from_http') + bible_url = 'http://www.bibleserver.com/index.php?language=2' + soup = get_soup_for_bible_ref(bible_url) + if not soup: + return None + bible_links = soup.find_all('a', {'class': 'trlCell'}) + if not bible_links: + log.debug('No a tags found - did site change?') + return None + bibles = [] + for link in bible_links: + bible_name = link.get_text() + # Skip any audio + if 'audio' in bible_name.lower(): + continue + try: + bible_link = link['href'] + bible_key = bible_link[bible_link.rfind('/') + 1:] + css_classes = link['class'] + except KeyError: + log.debug('No href/class attribute found - did site change?') + language_code = '' + for css_class in css_classes: + if css_class.startswith('fl_'): + try: + language_code = BIBLESERVER_LANGUAGE_CODE[css_class] + except KeyError: + language_code = '' + bibles.append((bible_name, bible_key, language_code)) + return bibles + + +class CWExtract(RegistryProperties): + """ + Extract verses from CrossWalk/BibleStudyTools + """ + def __init__(self, proxy_url=None): + log.debug('CWExtract.init("{url}")'.format(url=proxy_url)) + self.proxy_url = proxy_url + socket.setdefaulttimeout(30) + + def get_bible_chapter(self, version, book_name, chapter): + """ + Access and decode bibles via the Crosswalk website + + :param version: The version of the Bible like niv for New International Version + :param book_name: Text name of in english e.g. 'gen' for Genesis + :param chapter: Chapter number + """ + log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version, + book=book_name, + chapter=chapter)) + url_book_name = book_name.replace(' ', '-') + url_book_name = url_book_name.lower() + url_book_name = urllib.parse.quote(url_book_name.encode("utf-8")) + chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version, + book=url_book_name, + chapter=chapter) + soup = get_soup_for_bible_ref(chapter_url) + if not soup: + return None + self.application.process_events() + verses_div = soup.find_all('div', 'verse') + if not verses_div: + log.error('No verses found in the CrossWalk response.') + send_error_message('parse') + return None + verses = {} + for verse in verses_div: + self.application.process_events() + verse_number = int(verse.find('strong').contents[0]) + verse_span = verse.find('span') + tags_to_remove = verse_span.find_all(['a', 'sup']) + for tag in tags_to_remove: + tag.decompose() + verse_text = verse_span.get_text() + self.application.process_events() + # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and . + verse_text = verse_text.strip('\n\r\t ') + verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text) + verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text) + verses[verse_number] = verse_text + return SearchResults(book_name, chapter, verses) + + def get_books_from_http(self, version): + """ + Load a list of all books a Bible contain from the Crosswalk website. + + :param version: The version of the bible like NIV for New International Version + """ + log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version)) + chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version) + soup = get_soup_for_bible_ref(chapter_url) + if not soup: + return None + content = soup.find_all('h4', {'class': 'small-header'}) + if not content: + log.error('No books found in the Crosswalk response.') + send_error_message('parse') + return None + books = [] + for book in content: + books.append(book.contents[0]) + return books + + def get_bibles_from_http(self): + """ + Load a list of bibles from Crosswalk website. + returns a list in the form [(biblename, biblekey, language_code)] + """ + log.debug('CWExtract.get_bibles_from_http') + bible_url = 'http://www.biblestudytools.com/bible-versions/' + soup = get_soup_for_bible_ref(bible_url) + if not soup: + return None + h4_tags = soup.find_all('h4', {'class': 'small-header'}) + if not h4_tags: + log.debug('No h4 tags found - did site change?') + return None + bibles = [] + for h4t in h4_tags: + short_name = None + if h4t.span: + short_name = h4t.span.get_text().strip().lower() + else: + log.error('No span tag found - did site change?') + return None + if not short_name: + continue + h4t.span.extract() + tag_text = h4t.get_text().strip() + # The names of non-english bibles has their language in parentheses at the end + if tag_text.endswith(')'): + language = tag_text[tag_text.rfind('(') + 1:-1] + if language in CROSSWALK_LANGUAGES: + language_code = CROSSWALK_LANGUAGES[language] + else: + language_code = '' + # ... except for those that don't... + elif 'latin' in tag_text.lower(): + language_code = 'la' + elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower(): + language_code = 'es' + elif 'chinese' in tag_text.lower(): + language_code = 'zh' + elif 'greek' in tag_text.lower(): + language_code = 'el' + elif 'nova' in tag_text.lower(): + language_code = 'pt' + else: + language_code = 'en' + bibles.append((tag_text, short_name, language_code)) + return bibles + + class HTTPBible(BibleImport, RegistryProperties): log.info('{name} HTTPBible loaded'.format(name=__name__)) diff --git a/openlp/plugins/bibles/lib/importers/biblegateway.py b/openlp/plugins/bibles/lib/importers/biblegateway.py deleted file mode 100644 index f3caa2204..000000000 --- a/openlp/plugins/bibles/lib/importers/biblegateway.py +++ /dev/null @@ -1,313 +0,0 @@ -# -*- coding: utf-8 -*- -# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4 - -############################################################################### -# OpenLP - Open Source Lyrics Projection # -# --------------------------------------------------------------------------- # -# Copyright (c) 2008-2016 OpenLP Developers # -# --------------------------------------------------------------------------- # -# This program is free software; you can redistribute it and/or modify it # -# under the terms of the GNU General Public License as published by the Free # -# Software Foundation; version 2 of the License. # -# # -# This program is distributed in the hope that it will be useful, but WITHOUT # -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # -# more details. # -# # -# You should have received a copy of the GNU General Public License along # -# with this program; if not, write to the Free Software Foundation, Inc., 59 # -# Temple Place, Suite 330, Boston, MA 02111-1307 USA # -############################################################################### -""" -The :mod:`biblegateway` module enables OpenLP to retrieve scripture from http://biblegateway.com. -""" -import logging -import socket -import urllib.parse -import urllib.error - -from bs4 import BeautifulSoup, NavigableString, Tag - -from openlp.core.common import RegistryProperties -from openlp.core.lib.webpagereader import get_web_page -from openlp.plugins.bibles.lib import SearchResults -from openlp.plugins.bibles.lib.http import get_soup_for_bible_ref, send_error_message - -UGLY_CHARS = { - '\u2014': ' - ', - '\u2018': '\'', - '\u2019': '\'', - '\u201c': '"', - '\u201d': '"', - ' ': ' ' -} - -log = logging.getLogger(__name__) - - -class BGExtract(RegistryProperties): - """ - Extract verses from BibleGateway - """ - def __init__(self, proxy_url=None): - log.debug('BGExtract.init("{url}")'.format(url=proxy_url)) - self.proxy_url = proxy_url - socket.setdefaulttimeout(30) - - def _remove_elements(self, parent, tag, class_=None): - """ - Remove a particular element from the BeautifulSoup tree. - - :param parent: The element from which items need to be removed. - :param tag: A string of the tab type, e.g. "div" - :param class_: An HTML class attribute for further qualification. - """ - if class_: - all_tags = parent.find_all(tag, class_) - else: - all_tags = parent.find_all(tag) - for element in all_tags: - element.extract() - - def _extract_verse(self, tag): - """ - Extract a verse (or part of a verse) from a tag. - - :param tag: The BeautifulSoup Tag element with the stuff we want. - """ - if isinstance(tag, NavigableString): - return None, str(tag) - elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'): - verse = str(tag.string).replace('[', '').replace(']', '').strip() - return verse, None - elif tag.get('class') and tag.get('class')[0] == 'chapternum': - verse = '1' - return verse, None - else: - verse = None - text = '' - for child in tag.contents: - c_verse, c_text = self._extract_verse(child) - if c_verse: - verse = c_verse - if text and c_text: - text += c_text - elif c_text is not None: - text = c_text - return verse, text - - def _clean_soup(self, tag): - """ - Remove all the rubbish from the HTML page. - - :param tag: The base tag within which we want to remove stuff. - """ - self._remove_elements(tag, 'sup', 'crossreference') - self._remove_elements(tag, 'sup', 'footnote') - self._remove_elements(tag, 'div', 'footnotes') - self._remove_elements(tag, 'div', 'crossrefs') - self._remove_elements(tag, 'h3') - self._remove_elements(tag, 'h4') - self._remove_elements(tag, 'h5') - - def _extract_verses(self, tags): - """ - Extract all the verses from a pre-prepared list of HTML tags. - - :param tags: A list of BeautifulSoup Tag elements. - """ - verses = [] - tags = tags[::-1] - current_text = '' - for tag in tags: - verse = None - text = '' - for child in tag.contents: - c_verse, c_text = self._extract_verse(child) - if c_verse: - verse = c_verse - if text and c_text: - text += c_text - elif c_text is not None: - text = c_text - if not verse: - current_text = text + ' ' + current_text - else: - text += ' ' + current_text - current_text = '' - if text: - for old, new in UGLY_CHARS.items(): - text = text.replace(old, new) - text = ' '.join(text.split()) - if verse and text: - verse = verse.strip() - try: - verse = int(verse) - except ValueError: - verse_parts = verse.split('-') - if len(verse_parts) > 1: - verse = int(verse_parts[0]) - except TypeError: - log.warning('Illegal verse number: {verse:d}'.format(verse=verse)) - verses.append((verse, text)) - verse_list = {} - for verse, text in verses[::-1]: - verse_list[verse] = text - return verse_list - - def _extract_verses_old(self, div): - """ - Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still - broken) HTML. - - :param div: The parent div. - """ - verse_list = {} - # Cater for inconsistent mark up in the first verse of a chapter. - first_verse = div.find('versenum') - if first_verse and first_verse.contents: - verse_list[1] = str(first_verse.contents[0]) - for verse in div('sup', 'versenum'): - raw_verse_num = verse.next_element - clean_verse_num = 0 - # Not all verses exist in all translations and may or may not be represented by a verse number. If they are - # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what - # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse - # completely. - try: - clean_verse_num = int(str(raw_verse_num)) - except ValueError: - verse_parts = str(raw_verse_num).split('-') - if len(verse_parts) > 1: - clean_verse_num = int(verse_parts[0]) - except TypeError: - log.warning('Illegal verse number: {verse:d}'.format(verse=raw_verse_num)) - if clean_verse_num: - verse_text = raw_verse_num.next_element - part = raw_verse_num.next_element.next_element - while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'): - # While we are still in the same verse grab all the text. - if isinstance(part, NavigableString): - verse_text += part - if isinstance(part.next_element, Tag) and part.next_element.name == 'div': - # Run out of verses so stop. - break - part = part.next_element - verse_list[clean_verse_num] = str(verse_text) - return verse_list - - def get_bible_chapter(self, version, book_name, chapter): - """ - Access and decode Bibles via the BibleGateway website. - - :param version: The version of the Bible like 31 for New International version. - :param book_name: Name of the Book. - :param chapter: Chapter number. - """ - log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version, - name=book_name, - chapter=chapter)) - url_book_name = urllib.parse.quote(book_name.encode("utf-8")) - url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name, - chapter=chapter, - version=version) - soup = get_soup_for_bible_ref( - 'http://biblegateway.com/passage/?{url}'.format(url=url_params), - pre_parse_regex=r'', pre_parse_substitute='') - if not soup: - return None - div = soup.find('div', 'result-text-style-normal') - if not div: - return None - self._clean_soup(div) - span_list = div.find_all('span', 'text') - log.debug('Span list: {span}'.format(span=span_list)) - if not span_list: - # If we don't get any spans then we must have the old HTML format - verse_list = self._extract_verses_old(div) - else: - verse_list = self._extract_verses(span_list) - if not verse_list: - log.debug('No content found in the BibleGateway response.') - send_error_message('parse') - return None - return SearchResults(book_name, chapter, verse_list) - - def get_books_from_http(self, version): - """ - Load a list of all books a Bible contains from BibleGateway website. - - :param version: The version of the Bible like NIV for New International Version - """ - log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version)) - url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)}) - reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params) - page = get_web_page(reference_url) - if not page: - send_error_message('download') - return None - page_source = page.read() - try: - page_source = str(page_source, 'utf8') - except UnicodeDecodeError: - page_source = str(page_source, 'cp1251') - try: - soup = BeautifulSoup(page_source, 'lxml') - except Exception: - log.error('BeautifulSoup could not parse the Bible page.') - send_error_message('parse') - return None - if not soup: - send_error_message('parse') - return None - self.application.process_events() - content = soup.find('table', 'infotable') - if content: - content = content.find_all('tr') - if not content: - log.error('No books found in the Biblegateway response.') - send_error_message('parse') - return None - books = [] - for book in content: - book = book.find('td') - if book: - books.append(book.contents[1]) - return books - - def get_bibles_from_http(self): - """ - Load a list of bibles from BibleGateway website. - - returns a list in the form [(biblename, biblekey, language_code)] - """ - log.debug('BGExtract.get_bibles_from_http') - bible_url = 'https://biblegateway.com/versions/' - soup = get_soup_for_bible_ref(bible_url) - if not soup: - return None - bible_select = soup.find('select', {'class': 'search-translation-select'}) - if not bible_select: - log.debug('No select tags found - did site change?') - return None - option_tags = bible_select.find_all('option') - if not option_tags: - log.debug('No option tags found - did site change?') - return None - current_lang = '' - bibles = [] - for ot in option_tags: - tag_class = '' - try: - tag_class = ot['class'][0] - except KeyError: - tag_class = '' - tag_text = ot.get_text() - if tag_class == 'lang': - current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower() - elif tag_class == 'spacer': - continue - else: - bibles.append((tag_text, ot['value'], current_lang)) - return bibles diff --git a/openlp/plugins/bibles/lib/importers/bibleserver.py b/openlp/plugins/bibles/lib/importers/bibleserver.py deleted file mode 100644 index 16924d84a..000000000 --- a/openlp/plugins/bibles/lib/importers/bibleserver.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding: utf-8 -*- -# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4 - -############################################################################### -# OpenLP - Open Source Lyrics Projection # -# --------------------------------------------------------------------------- # -# Copyright (c) 2008-2016 OpenLP Developers # -# --------------------------------------------------------------------------- # -# This program is free software; you can redistribute it and/or modify it # -# under the terms of the GNU General Public License as published by the Free # -# Software Foundation; version 2 of the License. # -# # -# This program is distributed in the hope that it will be useful, but WITHOUT # -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # -# more details. # -# # -# You should have received a copy of the GNU General Public License along # -# with this program; if not, write to the Free Software Foundation, Inc., 59 # -# Temple Place, Suite 330, Boston, MA 02111-1307 USA # -############################################################################### -""" -The :mod:`bibleserver` module enables OpenLP to retrieve scripture from http://bibleserver.com. -""" -import logging -import re -import socket -import urllib.parse -import urllib.error - -from openlp.core.common import RegistryProperties -from openlp.plugins.bibles.lib import SearchResults -from openlp.plugins.bibles.lib.http import get_soup_for_bible_ref, send_error_message - -VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*') - -BIBLESERVER_LANGUAGE_CODE = { - 'fl_1': 'de', - 'fl_2': 'en', - 'fl_3': 'fr', - 'fl_4': 'it', - 'fl_5': 'es', - 'fl_6': 'pt', - 'fl_7': 'ru', - 'fl_8': 'sv', - 'fl_9': 'no', - 'fl_10': 'nl', - 'fl_11': 'cs', - 'fl_12': 'sk', - 'fl_13': 'ro', - 'fl_14': 'hr', - 'fl_15': 'hu', - 'fl_16': 'bg', - 'fl_17': 'ar', - 'fl_18': 'tr', - 'fl_19': 'pl', - 'fl_20': 'da', - 'fl_21': 'zh' -} - -log = logging.getLogger(__name__) - - -class BSExtract(RegistryProperties): - """ - Extract verses from Bibleserver.com - """ - def __init__(self, proxy_url=None): - log.debug('BSExtract.init("{url}")'.format(url=proxy_url)) - self.proxy_url = proxy_url - socket.setdefaulttimeout(30) - - def get_bible_chapter(self, version, book_name, chapter): - """ - Access and decode bibles via Bibleserver mobile website - - :param version: The version of the bible like NIV for New International Version - :param book_name: Text name of bible book e.g. Genesis, 1. John, 1John or Offenbarung - :param chapter: Chapter number - """ - log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version, - book=book_name, - chapter=chapter)) - url_version = urllib.parse.quote(version.encode("utf-8")) - url_book_name = urllib.parse.quote(book_name.encode("utf-8")) - chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter:d}'.format(version=url_version, - name=url_book_name, - chapter=chapter) - header = ('Accept-Language', 'en') - soup = get_soup_for_bible_ref(chapter_url, header) - if not soup: - return None - self.application.process_events() - content = soup.find('div', 'content') - if not content: - log.error('No verses found in the Bibleserver response.') - send_error_message('parse') - return None - content = content.find('div').find_all('div') - verses = {} - for verse in content: - self.application.process_events() - versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', ' '.join(verse['class']))) - verses[versenumber] = verse.contents[1].rstrip('\n') - return SearchResults(book_name, chapter, verses) - - def get_books_from_http(self, version): - """ - Load a list of all books a Bible contains from Bibleserver mobile website. - - :param version: The version of the Bible like NIV for New International Version - """ - log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version)) - url_version = urllib.parse.quote(version.encode("utf-8")) - chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version) - soup = get_soup_for_bible_ref(chapter_url) - if not soup: - return None - content = soup.find('ul') - if not content: - log.error('No books found in the Bibleserver response.') - send_error_message('parse') - return None - content = content.find_all('li') - return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)] - - def get_bibles_from_http(self): - """ - Load a list of bibles from Bibleserver website. - - returns a list in the form [(biblename, biblekey, language_code)] - """ - log.debug('BSExtract.get_bibles_from_http') - bible_url = 'http://www.bibleserver.com/index.php?language=2' - soup = get_soup_for_bible_ref(bible_url) - if not soup: - return None - bible_links = soup.find_all('a', {'class': 'trlCell'}) - if not bible_links: - log.debug('No a tags found - did site change?') - return None - bibles = [] - for link in bible_links: - bible_name = link.get_text() - # Skip any audio - if 'audio' in bible_name.lower(): - continue - try: - bible_link = link['href'] - bible_key = bible_link[bible_link.rfind('/') + 1:] - css_classes = link['class'] - except KeyError: - log.debug('No href/class attribute found - did site change?') - language_code = '' - for css_class in css_classes: - if css_class.startswith('fl_'): - try: - language_code = BIBLESERVER_LANGUAGE_CODE[css_class] - except KeyError: - language_code = '' - bibles.append((bible_name, bible_key, language_code)) - return bibles diff --git a/openlp/plugins/bibles/lib/importers/crosswalk.py b/openlp/plugins/bibles/lib/importers/crosswalk.py deleted file mode 100644 index fb354dd29..000000000 --- a/openlp/plugins/bibles/lib/importers/crosswalk.py +++ /dev/null @@ -1,171 +0,0 @@ -# -*- coding: utf-8 -*- -# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4 - -############################################################################### -# OpenLP - Open Source Lyrics Projection # -# --------------------------------------------------------------------------- # -# Copyright (c) 2008-2016 OpenLP Developers # -# --------------------------------------------------------------------------- # -# This program is free software; you can redistribute it and/or modify it # -# under the terms of the GNU General Public License as published by the Free # -# Software Foundation; version 2 of the License. # -# # -# This program is distributed in the hope that it will be useful, but WITHOUT # -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # -# more details. # -# # -# You should have received a copy of the GNU General Public License along # -# with this program; if not, write to the Free Software Foundation, Inc., 59 # -# Temple Place, Suite 330, Boston, MA 02111-1307 USA # -############################################################################### -""" -The :mod:`crosswalk` module enables OpenLP to retrieve scripture from www.biblestudytools.com. -""" -import logging -import re -import socket -import urllib.parse -import urllib.error - -from openlp.core.common import RegistryProperties -from openlp.plugins.bibles.lib import SearchResults -from openlp.plugins.bibles.lib.http import get_soup_for_bible_ref, send_error_message - -FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])') -REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}') - - -CROSSWALK_LANGUAGES = { - 'Portuguese': 'pt', - 'German': 'de', - 'Italian': 'it', - 'Español': 'es', - 'French': 'fr', - 'Dutch': 'nl' -} - -log = logging.getLogger(__name__) - - -class CWExtract(RegistryProperties): - """ - Extract verses from CrossWalk/BibleStudyTools - """ - def __init__(self, proxy_url=None): - log.debug('CWExtract.init("{url}")'.format(url=proxy_url)) - self.proxy_url = proxy_url - socket.setdefaulttimeout(30) - - def get_bible_chapter(self, version, book_name, chapter): - """ - Access and decode bibles via the Crosswalk website - - :param version: The version of the Bible like niv for New International Version - :param book_name: Text name of in english e.g. 'gen' for Genesis - :param chapter: Chapter number - """ - log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version, - book=book_name, - chapter=chapter)) - url_book_name = book_name.replace(' ', '-') - url_book_name = url_book_name.lower() - url_book_name = urllib.parse.quote(url_book_name.encode("utf-8")) - chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version, - book=url_book_name, - chapter=chapter) - soup = get_soup_for_bible_ref(chapter_url) - if not soup: - return None - self.application.process_events() - verses_div = soup.find_all('div', 'verse') - if not verses_div: - log.error('No verses found in the CrossWalk response.') - send_error_message('parse') - return None - verses = {} - for verse in verses_div: - self.application.process_events() - verse_number = int(verse.find('strong').contents[0]) - verse_span = verse.find('span') - tags_to_remove = verse_span.find_all(['a', 'sup']) - for tag in tags_to_remove: - tag.decompose() - verse_text = verse_span.get_text() - self.application.process_events() - # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and . - verse_text = verse_text.strip('\n\r\t ') - verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text) - verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text) - verses[verse_number] = verse_text - return SearchResults(book_name, chapter, verses) - - def get_books_from_http(self, version): - """ - Load a list of all books a Bible contain from the Crosswalk website. - - :param version: The version of the bible like NIV for New International Version - """ - log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version)) - chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version) - soup = get_soup_for_bible_ref(chapter_url) - if not soup: - return None - content = soup.find_all('h4', {'class': 'small-header'}) - if not content: - log.error('No books found in the Crosswalk response.') - send_error_message('parse') - return None - books = [] - for book in content: - books.append(book.contents[0]) - return books - - def get_bibles_from_http(self): - """ - Load a list of bibles from Crosswalk website. - returns a list in the form [(biblename, biblekey, language_code)] - """ - log.debug('CWExtract.get_bibles_from_http') - bible_url = 'http://www.biblestudytools.com/bible-versions/' - soup = get_soup_for_bible_ref(bible_url) - if not soup: - return None - h4_tags = soup.find_all('h4', {'class': 'small-header'}) - if not h4_tags: - log.debug('No h4 tags found - did site change?') - return None - bibles = [] - for h4t in h4_tags: - short_name = None - if h4t.span: - short_name = h4t.span.get_text().strip().lower() - else: - log.error('No span tag found - did site change?') - return None - if not short_name: - continue - h4t.span.extract() - tag_text = h4t.get_text().strip() - # The names of non-english bibles has their language in parentheses at the end - if tag_text.endswith(')'): - language = tag_text[tag_text.rfind('(') + 1:-1] - if language in CROSSWALK_LANGUAGES: - language_code = CROSSWALK_LANGUAGES[language] - else: - language_code = '' - # ... except for those that don't... - elif 'latin' in tag_text.lower(): - language_code = 'la' - elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower(): - language_code = 'es' - elif 'chinese' in tag_text.lower(): - language_code = 'zh' - elif 'greek' in tag_text.lower(): - language_code = 'el' - elif 'nova' in tag_text.lower(): - language_code = 'pt' - else: - language_code = 'en' - bibles.append((tag_text, short_name, language_code)) - return bibles diff --git a/tests/functional/openlp_plugins/bibles/test_bibleserver.py b/tests/functional/openlp_plugins/bibles/test_bibleserver.py index 0849a63e3..839c81008 100644 --- a/tests/functional/openlp_plugins/bibles/test_bibleserver.py +++ b/tests/functional/openlp_plugins/bibles/test_bibleserver.py @@ -20,13 +20,41 @@ # Temple Place, Suite 330, Boston, MA 02111-1307 USA # ############################################################################### """ -This module contains tests for the bibleserver module of the Bibles plugin. +This module contains tests for the http module of the Bibles plugin. """ from unittest import TestCase from bs4 import BeautifulSoup from tests.functional import patch, MagicMock -from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract +from openlp.plugins.bibles.lib.importers.http import BSExtract + +# TODO: Items left to test +# BGExtract +# __init__ +# _remove_elements +# _extract_verse +# _clean_soup +# _extract_verses +# _extract_verses_old +# get_bible_chapter +# get_books_from_http +# _get_application +# CWExtract +# __init__ +# get_bible_chapter +# get_books_from_http +# _get_application +# HTTPBible +# __init__ +# do_import +# get_verses +# get_chapter +# get_books +# get_chapter_count +# get_verse_count +# _get_application +# get_soup_for_bible_ref +# send_error_message class TestBSExtract(TestCase): @@ -40,12 +68,11 @@ class TestBSExtract(TestCase): # get_books_from_http # _get_application def setUp(self): - self.get_soup_for_bible_ref_patcher = patch( - 'openlp.plugins.bibles.lib.importers.bibleserver.get_soup_for_bible_ref') - self.log_patcher = patch('openlp.plugins.bibles.lib.importers.bibleserver.log') - self.send_error_message_patcher = patch('openlp.plugins.bibles.lib.importers.bibleserver.send_error_message') - self.socket_patcher = patch('openlp.plugins.bibles.lib.http.socket') - self.urllib_patcher = patch('openlp.plugins.bibles.lib.importers.bibleserver.urllib') + self.get_soup_for_bible_ref_patcher = patch('openlp.plugins.bibles.lib.importers.http.get_soup_for_bible_ref') + self.log_patcher = patch('openlp.plugins.bibles.lib.importers.http.log') + self.send_error_message_patcher = patch('openlp.plugins.bibles.lib.importers.http.send_error_message') + self.socket_patcher = patch('openlp.plugins.bibles.lib.importers.http.socket') + self.urllib_patcher = patch('openlp.plugins.bibles.lib.importers.http.urllib') self.mock_get_soup_for_bible_ref = self.get_soup_for_bible_ref_patcher.start() self.mock_log = self.log_patcher.start() diff --git a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py index fd557eece..084bfa476 100644 --- a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py +++ b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py @@ -25,9 +25,7 @@ from unittest import TestCase, skip from openlp.core.common import Registry -from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract -from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract -from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract +from openlp.plugins.bibles.lib.importers.http import BGExtract, CWExtract, BSExtract from tests.interfaces import MagicMock