diff --git a/openlp/plugins/bibles/forms/bibleimportform.py b/openlp/plugins/bibles/forms/bibleimportform.py
index e9eee88d5..3d02228ca 100644
--- a/openlp/plugins/bibles/forms/bibleimportform.py
+++ b/openlp/plugins/bibles/forms/bibleimportform.py
@@ -40,9 +40,7 @@ from openlp.core.ui.lib.wizard import OpenLPWizard, WizardStrings
from openlp.core.common.languagemanager import get_locale_key
from openlp.plugins.bibles.lib.manager import BibleFormat
from openlp.plugins.bibles.lib.db import clean_filename
-from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract
-from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract
-from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract
+from openlp.plugins.bibles.lib.importers.http import CWExtract, BGExtract, BSExtract
log = logging.getLogger(__name__)
diff --git a/openlp/plugins/bibles/lib/http.py b/openlp/plugins/bibles/lib/http.py
index 5afd107f6..6921c9005 100644
--- a/openlp/plugins/bibles/lib/http.py
+++ b/openlp/plugins/bibles/lib/http.py
@@ -38,10 +38,545 @@ from openlp.plugins.bibles.lib.bibleimport import BibleImport
from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book
CLEANER_REGEX = re.compile(r'&nbsp;|<br />|\'\+\'')
+FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
+REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
+UGLY_CHARS = {
+ '\u2014': ' - ',
+ '\u2018': '\'',
+ '\u2019': '\'',
+ '\u201c': '"',
+ '\u201d': '"',
+ '&nbsp;': ' '
+}
+VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
+
+BIBLESERVER_LANGUAGE_CODE = {
+ 'fl_1': 'de',
+ 'fl_2': 'en',
+ 'fl_3': 'fr',
+ 'fl_4': 'it',
+ 'fl_5': 'es',
+ 'fl_6': 'pt',
+ 'fl_7': 'ru',
+ 'fl_8': 'sv',
+ 'fl_9': 'no',
+ 'fl_10': 'nl',
+ 'fl_11': 'cs',
+ 'fl_12': 'sk',
+ 'fl_13': 'ro',
+ 'fl_14': 'hr',
+ 'fl_15': 'hu',
+ 'fl_16': 'bg',
+ 'fl_17': 'ar',
+ 'fl_18': 'tr',
+ 'fl_19': 'pl',
+ 'fl_20': 'da',
+ 'fl_21': 'zh'
+}
+
+CROSSWALK_LANGUAGES = {
+ 'Portuguese': 'pt',
+ 'German': 'de',
+ 'Italian': 'it',
+ 'Español': 'es',
+ 'French': 'fr',
+ 'Dutch': 'nl'
+}
log = logging.getLogger(__name__)
+class BGExtract(RegistryProperties):
+ """
+ Extract verses from BibleGateway
+ """
+ def __init__(self, proxy_url=None):
+ log.debug('BGExtract.init("{url}")'.format(url=proxy_url))
+ self.proxy_url = proxy_url
+ socket.setdefaulttimeout(30)
+
+ def _remove_elements(self, parent, tag, class_=None):
+ """
+ Remove a particular element from the BeautifulSoup tree.
+
+ :param parent: The element from which items need to be removed.
+ :param tag: A string of the tag type, e.g. "div"
+ :param class_: An HTML class attribute for further qualification.
+ """
+ if class_:
+ all_tags = parent.find_all(tag, class_)
+ else:
+ all_tags = parent.find_all(tag)
+ for element in all_tags:
+ element.extract()
+
+ def _extract_verse(self, tag):
+ """
+ Extract a verse (or part of a verse) from a tag.
+
+ :param tag: The BeautifulSoup Tag element with the stuff we want.
+ """
+ if isinstance(tag, NavigableString):
+ return None, str(tag)
+ elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'):
+ verse = str(tag.string).replace('[', '').replace(']', '').strip()
+ return verse, None
+ elif tag.get('class') and tag.get('class')[0] == 'chapternum':
+ verse = '1'
+ return verse, None
+ else:
+ verse = None
+ text = ''
+ for child in tag.contents:
+ c_verse, c_text = self._extract_verse(child)
+ if c_verse:
+ verse = c_verse
+ if text and c_text:
+ text += c_text
+ elif c_text is not None:
+ text = c_text
+ return verse, text
+
+ def _clean_soup(self, tag):
+ """
+ Remove all the rubbish from the HTML page.
+
+ :param tag: The base tag within which we want to remove stuff.
+ """
+ self._remove_elements(tag, 'sup', 'crossreference')
+ self._remove_elements(tag, 'sup', 'footnote')
+ self._remove_elements(tag, 'div', 'footnotes')
+ self._remove_elements(tag, 'div', 'crossrefs')
+ self._remove_elements(tag, 'h3')
+ self._remove_elements(tag, 'h4')
+ self._remove_elements(tag, 'h5')
+
+ def _extract_verses(self, tags):
+ """
+ Extract all the verses from a pre-prepared list of HTML tags.
+
+ :param tags: A list of BeautifulSoup Tag elements.
+ """
+ verses = []
+ tags = tags[::-1]
+ current_text = ''
+ for tag in tags:
+ verse = None
+ text = ''
+ for child in tag.contents:
+ c_verse, c_text = self._extract_verse(child)
+ if c_verse:
+ verse = c_verse
+ if text and c_text:
+ text += c_text
+ elif c_text is not None:
+ text = c_text
+ if not verse:
+ current_text = text + ' ' + current_text
+ else:
+ text += ' ' + current_text
+ current_text = ''
+ if text:
+ for old, new in UGLY_CHARS.items():
+ text = text.replace(old, new)
+ text = ' '.join(text.split())
+ if verse and text:
+ verse = verse.strip()
+ try:
+ verse = int(verse)
+ except ValueError:
+ verse_parts = verse.split('-')
+ if len(verse_parts) > 1:
+ verse = int(verse_parts[0])
+ except TypeError:
+ log.warning('Illegal verse number: {verse:d}'.format(verse=verse))
+ verses.append((verse, text))
+ verse_list = {}
+ for verse, text in verses[::-1]:
+ verse_list[verse] = text
+ return verse_list
+
+ def _extract_verses_old(self, div):
+ """
+ Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
+ broken) HTML.
+
+ :param div: The parent div.
+ """
+ verse_list = {}
+ # Cater for inconsistent mark up in the first verse of a chapter.
+ first_verse = div.find('versenum')
+ if first_verse and first_verse.contents:
+ verse_list[1] = str(first_verse.contents[0])
+ for verse in div('sup', 'versenum'):
+ raw_verse_num = verse.next_element
+ clean_verse_num = 0
+ # Not all verses exist in all translations, and a missing verse may or may not be represented by a verse
+ # number. If it is not, fine; if it is, the number will probably be in a format that breaks int(). We then
+ # have no idea what garbage may be sucked into the verse text, so unless we get a clean int() we ignore the
+ # verse completely.
+ try:
+ clean_verse_num = int(str(raw_verse_num))
+ except ValueError:
+ verse_parts = str(raw_verse_num).split('-')
+ if len(verse_parts) > 1:
+ clean_verse_num = int(verse_parts[0])
+ except TypeError:
+ log.warning('Illegal verse number: {verse:d}'.format(verse=raw_verse_num))
+ if clean_verse_num:
+ verse_text = raw_verse_num.next_element
+ part = raw_verse_num.next_element.next_element
+ while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'):
+ # While we are still in the same verse grab all the text.
+ if isinstance(part, NavigableString):
+ verse_text += part
+ if isinstance(part.next_element, Tag) and part.next_element.name == 'div':
+ # Run out of verses so stop.
+ break
+ part = part.next_element
+ verse_list[clean_verse_num] = str(verse_text)
+ return verse_list
+
+ def get_bible_chapter(self, version, book_name, chapter):
+ """
+ Access and decode Bibles via the BibleGateway website.
+
+ :param version: The version of the Bible like 31 for New International version.
+ :param book_name: Name of the Book.
+ :param chapter: Chapter number.
+ """
+ log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version,
+ name=book_name,
+ chapter=chapter))
+ url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
+ url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name,
+ chapter=chapter,
+ version=version)
+ soup = get_soup_for_bible_ref(
+ 'http://biblegateway.com/passage/?{url}'.format(url=url_params),
+ pre_parse_regex=r'', pre_parse_substitute='')
+ if not soup:
+ return None
+ div = soup.find('div', 'result-text-style-normal')
+ if not div:
+ return None
+ self._clean_soup(div)
+ span_list = div.find_all('span', 'text')
+ log.debug('Span list: {span}'.format(span=span_list))
+ if not span_list:
+ # If we don't get any spans then we must have the old HTML format
+ verse_list = self._extract_verses_old(div)
+ else:
+ verse_list = self._extract_verses(span_list)
+ if not verse_list:
+ log.debug('No content found in the BibleGateway response.')
+ send_error_message('parse')
+ return None
+ return SearchResults(book_name, chapter, verse_list)
+
+ def get_books_from_http(self, version):
+ """
+ Load a list of all books a Bible contains from BibleGateway website.
+
+ :param version: The version of the Bible like NIV for New International Version
+ """
+ log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version))
+ url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)})
+ reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params)
+ page = get_web_page(reference_url)
+ if not page:
+ send_error_message('download')
+ return None
+ page_source = page.read()
+ try:
+ page_source = str(page_source, 'utf8')
+ except UnicodeDecodeError:
+ page_source = str(page_source, 'cp1251')
+ try:
+ soup = BeautifulSoup(page_source, 'lxml')
+ except Exception:
+ log.error('BeautifulSoup could not parse the Bible page.')
+ send_error_message('parse')
+ return None
+ if not soup:
+ send_error_message('parse')
+ return None
+ self.application.process_events()
+ content = soup.find('table', 'infotable')
+ if content:
+ content = content.find_all('tr')
+ if not content:
+ log.error('No books found in the Biblegateway response.')
+ send_error_message('parse')
+ return None
+ books = []
+ for book in content:
+ book = book.find('td')
+ if book:
+ books.append(book.contents[1])
+ return books
+
+ def get_bibles_from_http(self):
+ """
+ Load a list of bibles from BibleGateway website.
+
+ returns a list in the form [(biblename, biblekey, language_code)]
+ """
+ log.debug('BGExtract.get_bibles_from_http')
+ bible_url = 'https://biblegateway.com/versions/'
+ soup = get_soup_for_bible_ref(bible_url)
+ if not soup:
+ return None
+ bible_select = soup.find('select', {'class': 'search-translation-select'})
+ if not bible_select:
+ log.debug('No select tags found - did site change?')
+ return None
+ option_tags = bible_select.find_all('option')
+ if not option_tags:
+ log.debug('No option tags found - did site change?')
+ return None
+ current_lang = ''
+ bibles = []
+ for ot in option_tags:
+ tag_class = ''
+ try:
+ tag_class = ot['class'][0]
+ except KeyError:
+ tag_class = ''
+ tag_text = ot.get_text()
+ if tag_class == 'lang':
+ current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower()
+ elif tag_class == 'spacer':
+ continue
+ else:
+ bibles.append((tag_text, ot['value'], current_lang))
+ return bibles
+
+
+class BSExtract(RegistryProperties):
+ """
+ Extract verses from Bibleserver.com
+ """
+ def __init__(self, proxy_url=None):
+ log.debug('BSExtract.init("{url}")'.format(url=proxy_url))
+ self.proxy_url = proxy_url
+ socket.setdefaulttimeout(30)
+
+ def get_bible_chapter(self, version, book_name, chapter):
+ """
+ Access and decode bibles via Bibleserver mobile website
+
+ :param version: The version of the bible like NIV for New International Version
+ :param book_name: Text name of bible book e.g. Genesis, 1. John, 1John or Offenbarung
+ :param chapter: Chapter number
+ """
+ log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
+ book=book_name,
+ chapter=chapter))
+ url_version = urllib.parse.quote(version.encode("utf-8"))
+ url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
+ chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter:d}'.format(version=url_version,
+ name=url_book_name,
+ chapter=chapter)
+ header = ('Accept-Language', 'en')
+ soup = get_soup_for_bible_ref(chapter_url, header)
+ if not soup:
+ return None
+ self.application.process_events()
+ content = soup.find('div', 'content')
+ if not content:
+ log.error('No verses found in the Bibleserver response.')
+ send_error_message('parse')
+ return None
+ content = content.find('div').find_all('div')
+ verses = {}
+ for verse in content:
+ self.application.process_events()
+ versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', ' '.join(verse['class'])))
+ verses[versenumber] = verse.contents[1].rstrip('\n')
+ return SearchResults(book_name, chapter, verses)
+
+ def get_books_from_http(self, version):
+ """
+ Load a list of all books a Bible contains from Bibleserver mobile website.
+
+ :param version: The version of the Bible like NIV for New International Version
+ """
+ log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version))
+ url_version = urllib.parse.quote(version.encode("utf-8"))
+ chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version)
+ soup = get_soup_for_bible_ref(chapter_url)
+ if not soup:
+ return None
+ content = soup.find('ul')
+ if not content:
+ log.error('No books found in the Bibleserver response.')
+ send_error_message('parse')
+ return None
+ content = content.find_all('li')
+ return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)]
+
+ def get_bibles_from_http(self):
+ """
+ Load a list of bibles from Bibleserver website.
+
+ returns a list in the form [(biblename, biblekey, language_code)]
+ """
+ log.debug('BSExtract.get_bibles_from_http')
+ bible_url = 'http://www.bibleserver.com/index.php?language=2'
+ soup = get_soup_for_bible_ref(bible_url)
+ if not soup:
+ return None
+ bible_links = soup.find_all('a', {'class': 'trlCell'})
+ if not bible_links:
+ log.debug('No a tags found - did site change?')
+ return None
+ bibles = []
+ for link in bible_links:
+ bible_name = link.get_text()
+ # Skip any audio
+ if 'audio' in bible_name.lower():
+ continue
+ try:
+ bible_link = link['href']
+ bible_key = bible_link[bible_link.rfind('/') + 1:]
+ css_classes = link['class']
+ except KeyError:
+ log.debug('No href/class attribute found - did site change?')
+ language_code = ''
+ for css_class in css_classes:
+ if css_class.startswith('fl_'):
+ try:
+ language_code = BIBLESERVER_LANGUAGE_CODE[css_class]
+ except KeyError:
+ language_code = ''
+ bibles.append((bible_name, bible_key, language_code))
+ return bibles
+
+
+class CWExtract(RegistryProperties):
+ """
+ Extract verses from CrossWalk/BibleStudyTools
+ """
+ def __init__(self, proxy_url=None):
+ log.debug('CWExtract.init("{url}")'.format(url=proxy_url))
+ self.proxy_url = proxy_url
+ socket.setdefaulttimeout(30)
+
+ def get_bible_chapter(self, version, book_name, chapter):
+ """
+ Access and decode bibles via the Crosswalk website
+
+ :param version: The version of the Bible like niv for New International Version
+ :param book_name: Text name of the book in English, e.g. 'gen' for Genesis
+ :param chapter: Chapter number
+ """
+ log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
+ book=book_name,
+ chapter=chapter))
+ url_book_name = book_name.replace(' ', '-')
+ url_book_name = url_book_name.lower()
+ url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
+ chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
+ book=url_book_name,
+ chapter=chapter)
+ soup = get_soup_for_bible_ref(chapter_url)
+ if not soup:
+ return None
+ self.application.process_events()
+ verses_div = soup.find_all('div', 'verse')
+ if not verses_div:
+ log.error('No verses found in the CrossWalk response.')
+ send_error_message('parse')
+ return None
+ verses = {}
+ for verse in verses_div:
+ self.application.process_events()
+ verse_number = int(verse.find('strong').contents[0])
+ verse_span = verse.find('span')
+ tags_to_remove = verse_span.find_all(['a', 'sup'])
+ for tag in tags_to_remove:
+ tag.decompose()
+ verse_text = verse_span.get_text()
+ self.application.process_events()
+ # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
+ verse_text = verse_text.strip('\n\r\t ')
+ verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text)
+ verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
+ verses[verse_number] = verse_text
+ return SearchResults(book_name, chapter, verses)
+
+ def get_books_from_http(self, version):
+ """
+ Load a list of all books a Bible contains from the Crosswalk website.
+
+ :param version: The version of the bible like NIV for New International Version
+ """
+ log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
+ chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
+ soup = get_soup_for_bible_ref(chapter_url)
+ if not soup:
+ return None
+ content = soup.find_all('h4', {'class': 'small-header'})
+ if not content:
+ log.error('No books found in the Crosswalk response.')
+ send_error_message('parse')
+ return None
+ books = []
+ for book in content:
+ books.append(book.contents[0])
+ return books
+
+ def get_bibles_from_http(self):
+ """
+ Load a list of bibles from Crosswalk website.
+ returns a list in the form [(biblename, biblekey, language_code)]
+ """
+ log.debug('CWExtract.get_bibles_from_http')
+ bible_url = 'http://www.biblestudytools.com/bible-versions/'
+ soup = get_soup_for_bible_ref(bible_url)
+ if not soup:
+ return None
+ h4_tags = soup.find_all('h4', {'class': 'small-header'})
+ if not h4_tags:
+ log.debug('No h4 tags found - did site change?')
+ return None
+ bibles = []
+ for h4t in h4_tags:
+ short_name = None
+ if h4t.span:
+ short_name = h4t.span.get_text().strip().lower()
+ else:
+ log.error('No span tag found - did site change?')
+ return None
+ if not short_name:
+ continue
+ h4t.span.extract()
+ tag_text = h4t.get_text().strip()
+ # The names of non-English bibles have their language in parentheses at the end
+ if tag_text.endswith(')'):
+ language = tag_text[tag_text.rfind('(') + 1:-1]
+ if language in CROSSWALK_LANGUAGES:
+ language_code = CROSSWALK_LANGUAGES[language]
+ else:
+ language_code = ''
+ # ... except for those that don't...
+ elif 'latin' in tag_text.lower():
+ language_code = 'la'
+ elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
+ language_code = 'es'
+ elif 'chinese' in tag_text.lower():
+ language_code = 'zh'
+ elif 'greek' in tag_text.lower():
+ language_code = 'el'
+ elif 'nova' in tag_text.lower():
+ language_code = 'pt'
+ else:
+ language_code = 'en'
+ bibles.append((tag_text, short_name, language_code))
+ return bibles
+
+
class HTTPBible(BibleImport, RegistryProperties):
log.info('{name} HTTPBible loaded'.format(name=__name__))
diff --git a/openlp/plugins/bibles/lib/importers/biblegateway.py b/openlp/plugins/bibles/lib/importers/biblegateway.py
deleted file mode 100644
index f3caa2204..000000000
--- a/openlp/plugins/bibles/lib/importers/biblegateway.py
+++ /dev/null
@@ -1,313 +0,0 @@
-# -*- coding: utf-8 -*-
-# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
-
-###############################################################################
-# OpenLP - Open Source Lyrics Projection #
-# --------------------------------------------------------------------------- #
-# Copyright (c) 2008-2016 OpenLP Developers #
-# --------------------------------------------------------------------------- #
-# This program is free software; you can redistribute it and/or modify it #
-# under the terms of the GNU General Public License as published by the Free #
-# Software Foundation; version 2 of the License. #
-# #
-# This program is distributed in the hope that it will be useful, but WITHOUT #
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
-# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
-# more details. #
-# #
-# You should have received a copy of the GNU General Public License along #
-# with this program; if not, write to the Free Software Foundation, Inc., 59 #
-# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
-###############################################################################
-"""
-The :mod:`biblegateway` module enables OpenLP to retrieve scripture from http://biblegateway.com.
-"""
-import logging
-import socket
-import urllib.parse
-import urllib.error
-
-from bs4 import BeautifulSoup, NavigableString, Tag
-
-from openlp.core.common import RegistryProperties
-from openlp.core.lib.webpagereader import get_web_page
-from openlp.plugins.bibles.lib import SearchResults
-from openlp.plugins.bibles.lib.http import get_soup_for_bible_ref, send_error_message
-
-UGLY_CHARS = {
- '\u2014': ' - ',
- '\u2018': '\'',
- '\u2019': '\'',
- '\u201c': '"',
- '\u201d': '"',
- '&nbsp;': ' '
-}
-
-log = logging.getLogger(__name__)
-
-
-class BGExtract(RegistryProperties):
- """
- Extract verses from BibleGateway
- """
- def __init__(self, proxy_url=None):
- log.debug('BGExtract.init("{url}")'.format(url=proxy_url))
- self.proxy_url = proxy_url
- socket.setdefaulttimeout(30)
-
- def _remove_elements(self, parent, tag, class_=None):
- """
- Remove a particular element from the BeautifulSoup tree.
-
- :param parent: The element from which items need to be removed.
- :param tag: A string of the tab type, e.g. "div"
- :param class_: An HTML class attribute for further qualification.
- """
- if class_:
- all_tags = parent.find_all(tag, class_)
- else:
- all_tags = parent.find_all(tag)
- for element in all_tags:
- element.extract()
-
- def _extract_verse(self, tag):
- """
- Extract a verse (or part of a verse) from a tag.
-
- :param tag: The BeautifulSoup Tag element with the stuff we want.
- """
- if isinstance(tag, NavigableString):
- return None, str(tag)
- elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'):
- verse = str(tag.string).replace('[', '').replace(']', '').strip()
- return verse, None
- elif tag.get('class') and tag.get('class')[0] == 'chapternum':
- verse = '1'
- return verse, None
- else:
- verse = None
- text = ''
- for child in tag.contents:
- c_verse, c_text = self._extract_verse(child)
- if c_verse:
- verse = c_verse
- if text and c_text:
- text += c_text
- elif c_text is not None:
- text = c_text
- return verse, text
-
- def _clean_soup(self, tag):
- """
- Remove all the rubbish from the HTML page.
-
- :param tag: The base tag within which we want to remove stuff.
- """
- self._remove_elements(tag, 'sup', 'crossreference')
- self._remove_elements(tag, 'sup', 'footnote')
- self._remove_elements(tag, 'div', 'footnotes')
- self._remove_elements(tag, 'div', 'crossrefs')
- self._remove_elements(tag, 'h3')
- self._remove_elements(tag, 'h4')
- self._remove_elements(tag, 'h5')
-
- def _extract_verses(self, tags):
- """
- Extract all the verses from a pre-prepared list of HTML tags.
-
- :param tags: A list of BeautifulSoup Tag elements.
- """
- verses = []
- tags = tags[::-1]
- current_text = ''
- for tag in tags:
- verse = None
- text = ''
- for child in tag.contents:
- c_verse, c_text = self._extract_verse(child)
- if c_verse:
- verse = c_verse
- if text and c_text:
- text += c_text
- elif c_text is not None:
- text = c_text
- if not verse:
- current_text = text + ' ' + current_text
- else:
- text += ' ' + current_text
- current_text = ''
- if text:
- for old, new in UGLY_CHARS.items():
- text = text.replace(old, new)
- text = ' '.join(text.split())
- if verse and text:
- verse = verse.strip()
- try:
- verse = int(verse)
- except ValueError:
- verse_parts = verse.split('-')
- if len(verse_parts) > 1:
- verse = int(verse_parts[0])
- except TypeError:
- log.warning('Illegal verse number: {verse:d}'.format(verse=verse))
- verses.append((verse, text))
- verse_list = {}
- for verse, text in verses[::-1]:
- verse_list[verse] = text
- return verse_list
-
- def _extract_verses_old(self, div):
- """
- Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
- broken) HTML.
-
- :param div: The parent div.
- """
- verse_list = {}
- # Cater for inconsistent mark up in the first verse of a chapter.
- first_verse = div.find('versenum')
- if first_verse and first_verse.contents:
- verse_list[1] = str(first_verse.contents[0])
- for verse in div('sup', 'versenum'):
- raw_verse_num = verse.next_element
- clean_verse_num = 0
- # Not all verses exist in all translations and may or may not be represented by a verse number. If they are
- # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
- # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
- # completely.
- try:
- clean_verse_num = int(str(raw_verse_num))
- except ValueError:
- verse_parts = str(raw_verse_num).split('-')
- if len(verse_parts) > 1:
- clean_verse_num = int(verse_parts[0])
- except TypeError:
- log.warning('Illegal verse number: {verse:d}'.format(verse=raw_verse_num))
- if clean_verse_num:
- verse_text = raw_verse_num.next_element
- part = raw_verse_num.next_element.next_element
- while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'):
- # While we are still in the same verse grab all the text.
- if isinstance(part, NavigableString):
- verse_text += part
- if isinstance(part.next_element, Tag) and part.next_element.name == 'div':
- # Run out of verses so stop.
- break
- part = part.next_element
- verse_list[clean_verse_num] = str(verse_text)
- return verse_list
-
- def get_bible_chapter(self, version, book_name, chapter):
- """
- Access and decode Bibles via the BibleGateway website.
-
- :param version: The version of the Bible like 31 for New International version.
- :param book_name: Name of the Book.
- :param chapter: Chapter number.
- """
- log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version,
- name=book_name,
- chapter=chapter))
- url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
- url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name,
- chapter=chapter,
- version=version)
- soup = get_soup_for_bible_ref(
- 'http://biblegateway.com/passage/?{url}'.format(url=url_params),
- pre_parse_regex=r'', pre_parse_substitute='')
- if not soup:
- return None
- div = soup.find('div', 'result-text-style-normal')
- if not div:
- return None
- self._clean_soup(div)
- span_list = div.find_all('span', 'text')
- log.debug('Span list: {span}'.format(span=span_list))
- if not span_list:
- # If we don't get any spans then we must have the old HTML format
- verse_list = self._extract_verses_old(div)
- else:
- verse_list = self._extract_verses(span_list)
- if not verse_list:
- log.debug('No content found in the BibleGateway response.')
- send_error_message('parse')
- return None
- return SearchResults(book_name, chapter, verse_list)
-
- def get_books_from_http(self, version):
- """
- Load a list of all books a Bible contains from BibleGateway website.
-
- :param version: The version of the Bible like NIV for New International Version
- """
- log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version))
- url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)})
- reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params)
- page = get_web_page(reference_url)
- if not page:
- send_error_message('download')
- return None
- page_source = page.read()
- try:
- page_source = str(page_source, 'utf8')
- except UnicodeDecodeError:
- page_source = str(page_source, 'cp1251')
- try:
- soup = BeautifulSoup(page_source, 'lxml')
- except Exception:
- log.error('BeautifulSoup could not parse the Bible page.')
- send_error_message('parse')
- return None
- if not soup:
- send_error_message('parse')
- return None
- self.application.process_events()
- content = soup.find('table', 'infotable')
- if content:
- content = content.find_all('tr')
- if not content:
- log.error('No books found in the Biblegateway response.')
- send_error_message('parse')
- return None
- books = []
- for book in content:
- book = book.find('td')
- if book:
- books.append(book.contents[1])
- return books
-
- def get_bibles_from_http(self):
- """
- Load a list of bibles from BibleGateway website.
-
- returns a list in the form [(biblename, biblekey, language_code)]
- """
- log.debug('BGExtract.get_bibles_from_http')
- bible_url = 'https://biblegateway.com/versions/'
- soup = get_soup_for_bible_ref(bible_url)
- if not soup:
- return None
- bible_select = soup.find('select', {'class': 'search-translation-select'})
- if not bible_select:
- log.debug('No select tags found - did site change?')
- return None
- option_tags = bible_select.find_all('option')
- if not option_tags:
- log.debug('No option tags found - did site change?')
- return None
- current_lang = ''
- bibles = []
- for ot in option_tags:
- tag_class = ''
- try:
- tag_class = ot['class'][0]
- except KeyError:
- tag_class = ''
- tag_text = ot.get_text()
- if tag_class == 'lang':
- current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower()
- elif tag_class == 'spacer':
- continue
- else:
- bibles.append((tag_text, ot['value'], current_lang))
- return bibles
diff --git a/openlp/plugins/bibles/lib/importers/bibleserver.py b/openlp/plugins/bibles/lib/importers/bibleserver.py
deleted file mode 100644
index 16924d84a..000000000
--- a/openlp/plugins/bibles/lib/importers/bibleserver.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# -*- coding: utf-8 -*-
-# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
-
-###############################################################################
-# OpenLP - Open Source Lyrics Projection #
-# --------------------------------------------------------------------------- #
-# Copyright (c) 2008-2016 OpenLP Developers #
-# --------------------------------------------------------------------------- #
-# This program is free software; you can redistribute it and/or modify it #
-# under the terms of the GNU General Public License as published by the Free #
-# Software Foundation; version 2 of the License. #
-# #
-# This program is distributed in the hope that it will be useful, but WITHOUT #
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
-# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
-# more details. #
-# #
-# You should have received a copy of the GNU General Public License along #
-# with this program; if not, write to the Free Software Foundation, Inc., 59 #
-# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
-###############################################################################
-"""
-The :mod:`bibleserver` module enables OpenLP to retrieve scripture from http://bibleserver.com.
-"""
-import logging
-import re
-import socket
-import urllib.parse
-import urllib.error
-
-from openlp.core.common import RegistryProperties
-from openlp.plugins.bibles.lib import SearchResults
-from openlp.plugins.bibles.lib.http import get_soup_for_bible_ref, send_error_message
-
-VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
-
-BIBLESERVER_LANGUAGE_CODE = {
- 'fl_1': 'de',
- 'fl_2': 'en',
- 'fl_3': 'fr',
- 'fl_4': 'it',
- 'fl_5': 'es',
- 'fl_6': 'pt',
- 'fl_7': 'ru',
- 'fl_8': 'sv',
- 'fl_9': 'no',
- 'fl_10': 'nl',
- 'fl_11': 'cs',
- 'fl_12': 'sk',
- 'fl_13': 'ro',
- 'fl_14': 'hr',
- 'fl_15': 'hu',
- 'fl_16': 'bg',
- 'fl_17': 'ar',
- 'fl_18': 'tr',
- 'fl_19': 'pl',
- 'fl_20': 'da',
- 'fl_21': 'zh'
-}
-
-log = logging.getLogger(__name__)
-
-
-class BSExtract(RegistryProperties):
- """
- Extract verses from Bibleserver.com
- """
- def __init__(self, proxy_url=None):
- log.debug('BSExtract.init("{url}")'.format(url=proxy_url))
- self.proxy_url = proxy_url
- socket.setdefaulttimeout(30)
-
- def get_bible_chapter(self, version, book_name, chapter):
- """
- Access and decode bibles via Bibleserver mobile website
-
- :param version: The version of the bible like NIV for New International Version
- :param book_name: Text name of bible book e.g. Genesis, 1. John, 1John or Offenbarung
- :param chapter: Chapter number
- """
- log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
- book=book_name,
- chapter=chapter))
- url_version = urllib.parse.quote(version.encode("utf-8"))
- url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
- chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter:d}'.format(version=url_version,
- name=url_book_name,
- chapter=chapter)
- header = ('Accept-Language', 'en')
- soup = get_soup_for_bible_ref(chapter_url, header)
- if not soup:
- return None
- self.application.process_events()
- content = soup.find('div', 'content')
- if not content:
- log.error('No verses found in the Bibleserver response.')
- send_error_message('parse')
- return None
- content = content.find('div').find_all('div')
- verses = {}
- for verse in content:
- self.application.process_events()
- versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', ' '.join(verse['class'])))
- verses[versenumber] = verse.contents[1].rstrip('\n')
- return SearchResults(book_name, chapter, verses)
-
- def get_books_from_http(self, version):
- """
- Load a list of all books a Bible contains from Bibleserver mobile website.
-
- :param version: The version of the Bible like NIV for New International Version
- """
- log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version))
- url_version = urllib.parse.quote(version.encode("utf-8"))
- chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version)
- soup = get_soup_for_bible_ref(chapter_url)
- if not soup:
- return None
- content = soup.find('ul')
- if not content:
- log.error('No books found in the Bibleserver response.')
- send_error_message('parse')
- return None
- content = content.find_all('li')
- return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)]
-
- def get_bibles_from_http(self):
- """
- Load a list of bibles from Bibleserver website.
-
- returns a list in the form [(biblename, biblekey, language_code)]
- """
- log.debug('BSExtract.get_bibles_from_http')
- bible_url = 'http://www.bibleserver.com/index.php?language=2'
- soup = get_soup_for_bible_ref(bible_url)
- if not soup:
- return None
- bible_links = soup.find_all('a', {'class': 'trlCell'})
- if not bible_links:
- log.debug('No a tags found - did site change?')
- return None
- bibles = []
- for link in bible_links:
- bible_name = link.get_text()
- # Skip any audio
- if 'audio' in bible_name.lower():
- continue
- try:
- bible_link = link['href']
- bible_key = bible_link[bible_link.rfind('/') + 1:]
- css_classes = link['class']
- except KeyError:
- log.debug('No href/class attribute found - did site change?')
- language_code = ''
- for css_class in css_classes:
- if css_class.startswith('fl_'):
- try:
- language_code = BIBLESERVER_LANGUAGE_CODE[css_class]
- except KeyError:
- language_code = ''
- bibles.append((bible_name, bible_key, language_code))
- return bibles
diff --git a/openlp/plugins/bibles/lib/importers/crosswalk.py b/openlp/plugins/bibles/lib/importers/crosswalk.py
deleted file mode 100644
index fb354dd29..000000000
--- a/openlp/plugins/bibles/lib/importers/crosswalk.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# -*- coding: utf-8 -*-
-# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
-
-###############################################################################
-# OpenLP - Open Source Lyrics Projection #
-# --------------------------------------------------------------------------- #
-# Copyright (c) 2008-2016 OpenLP Developers #
-# --------------------------------------------------------------------------- #
-# This program is free software; you can redistribute it and/or modify it #
-# under the terms of the GNU General Public License as published by the Free #
-# Software Foundation; version 2 of the License. #
-# #
-# This program is distributed in the hope that it will be useful, but WITHOUT #
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
-# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
-# more details. #
-# #
-# You should have received a copy of the GNU General Public License along #
-# with this program; if not, write to the Free Software Foundation, Inc., 59 #
-# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
-###############################################################################
-"""
-The :mod:`crosswalk` module enables OpenLP to retrieve scripture from www.biblestudytools.com.
-"""
-import logging
-import re
-import socket
-import urllib.parse
-import urllib.error
-
-from openlp.core.common import RegistryProperties
-from openlp.plugins.bibles.lib import SearchResults
-from openlp.plugins.bibles.lib.http import get_soup_for_bible_ref, send_error_message
-
-FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
-REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
-
-
-CROSSWALK_LANGUAGES = {
- 'Portuguese': 'pt',
- 'German': 'de',
- 'Italian': 'it',
- 'Español': 'es',
- 'French': 'fr',
- 'Dutch': 'nl'
-}
-
-log = logging.getLogger(__name__)
-
-
-class CWExtract(RegistryProperties):
- """
- Extract verses from CrossWalk/BibleStudyTools
- """
- def __init__(self, proxy_url=None):
- log.debug('CWExtract.init("{url}")'.format(url=proxy_url))
- self.proxy_url = proxy_url
- socket.setdefaulttimeout(30)
-
- def get_bible_chapter(self, version, book_name, chapter):
- """
- Access and decode bibles via the Crosswalk website
-
- :param version: The version of the Bible like niv for New International Version
- :param book_name: Text name of in english e.g. 'gen' for Genesis
- :param chapter: Chapter number
- """
- log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
- book=book_name,
- chapter=chapter))
- url_book_name = book_name.replace(' ', '-')
- url_book_name = url_book_name.lower()
- url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
- chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
- book=url_book_name,
- chapter=chapter)
- soup = get_soup_for_bible_ref(chapter_url)
- if not soup:
- return None
- self.application.process_events()
- verses_div = soup.find_all('div', 'verse')
- if not verses_div:
- log.error('No verses found in the CrossWalk response.')
- send_error_message('parse')
- return None
- verses = {}
- for verse in verses_div:
- self.application.process_events()
- verse_number = int(verse.find('strong').contents[0])
- verse_span = verse.find('span')
- tags_to_remove = verse_span.find_all(['a', 'sup'])
- for tag in tags_to_remove:
- tag.decompose()
- verse_text = verse_span.get_text()
- self.application.process_events()
- # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
- verse_text = verse_text.strip('\n\r\t ')
- verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text)
- verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
- verses[verse_number] = verse_text
- return SearchResults(book_name, chapter, verses)
-
- def get_books_from_http(self, version):
- """
- Load a list of all books a Bible contain from the Crosswalk website.
-
- :param version: The version of the bible like NIV for New International Version
- """
- log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
- chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
- soup = get_soup_for_bible_ref(chapter_url)
- if not soup:
- return None
- content = soup.find_all('h4', {'class': 'small-header'})
- if not content:
- log.error('No books found in the Crosswalk response.')
- send_error_message('parse')
- return None
- books = []
- for book in content:
- books.append(book.contents[0])
- return books
-
- def get_bibles_from_http(self):
- """
- Load a list of bibles from Crosswalk website.
- returns a list in the form [(biblename, biblekey, language_code)]
- """
- log.debug('CWExtract.get_bibles_from_http')
- bible_url = 'http://www.biblestudytools.com/bible-versions/'
- soup = get_soup_for_bible_ref(bible_url)
- if not soup:
- return None
- h4_tags = soup.find_all('h4', {'class': 'small-header'})
- if not h4_tags:
- log.debug('No h4 tags found - did site change?')
- return None
- bibles = []
- for h4t in h4_tags:
- short_name = None
- if h4t.span:
- short_name = h4t.span.get_text().strip().lower()
- else:
- log.error('No span tag found - did site change?')
- return None
- if not short_name:
- continue
- h4t.span.extract()
- tag_text = h4t.get_text().strip()
- # The names of non-english bibles has their language in parentheses at the end
- if tag_text.endswith(')'):
- language = tag_text[tag_text.rfind('(') + 1:-1]
- if language in CROSSWALK_LANGUAGES:
- language_code = CROSSWALK_LANGUAGES[language]
- else:
- language_code = ''
- # ... except for those that don't...
- elif 'latin' in tag_text.lower():
- language_code = 'la'
- elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
- language_code = 'es'
- elif 'chinese' in tag_text.lower():
- language_code = 'zh'
- elif 'greek' in tag_text.lower():
- language_code = 'el'
- elif 'nova' in tag_text.lower():
- language_code = 'pt'
- else:
- language_code = 'en'
- bibles.append((tag_text, short_name, language_code))
- return bibles
diff --git a/tests/functional/openlp_plugins/bibles/test_bibleserver.py b/tests/functional/openlp_plugins/bibles/test_bibleserver.py
index 0849a63e3..839c81008 100644
--- a/tests/functional/openlp_plugins/bibles/test_bibleserver.py
+++ b/tests/functional/openlp_plugins/bibles/test_bibleserver.py
@@ -20,13 +20,41 @@
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
"""
-This module contains tests for the bibleserver module of the Bibles plugin.
+This module contains tests for the http module of the Bibles plugin.
"""
from unittest import TestCase
from bs4 import BeautifulSoup
from tests.functional import patch, MagicMock
-from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract
+from openlp.plugins.bibles.lib.importers.http import BSExtract
+
+# TODO: Items left to test
+# BGExtract
+# __init__
+# _remove_elements
+# _extract_verse
+# _clean_soup
+# _extract_verses
+# _extract_verses_old
+# get_bible_chapter
+# get_books_from_http
+# _get_application
+# CWExtract
+# __init__
+# get_bible_chapter
+# get_books_from_http
+# _get_application
+# HTTPBible
+# __init__
+# do_import
+# get_verses
+# get_chapter
+# get_books
+# get_chapter_count
+# get_verse_count
+# _get_application
+# get_soup_for_bible_ref
+# send_error_message
class TestBSExtract(TestCase):
@@ -40,12 +68,11 @@ class TestBSExtract(TestCase):
# get_books_from_http
# _get_application
def setUp(self):
- self.get_soup_for_bible_ref_patcher = patch(
- 'openlp.plugins.bibles.lib.importers.bibleserver.get_soup_for_bible_ref')
- self.log_patcher = patch('openlp.plugins.bibles.lib.importers.bibleserver.log')
- self.send_error_message_patcher = patch('openlp.plugins.bibles.lib.importers.bibleserver.send_error_message')
- self.socket_patcher = patch('openlp.plugins.bibles.lib.http.socket')
- self.urllib_patcher = patch('openlp.plugins.bibles.lib.importers.bibleserver.urllib')
+ self.get_soup_for_bible_ref_patcher = patch('openlp.plugins.bibles.lib.importers.http.get_soup_for_bible_ref')
+ self.log_patcher = patch('openlp.plugins.bibles.lib.importers.http.log')
+ self.send_error_message_patcher = patch('openlp.plugins.bibles.lib.importers.http.send_error_message')
+ self.socket_patcher = patch('openlp.plugins.bibles.lib.importers.http.socket')
+ self.urllib_patcher = patch('openlp.plugins.bibles.lib.importers.http.urllib')
self.mock_get_soup_for_bible_ref = self.get_soup_for_bible_ref_patcher.start()
self.mock_log = self.log_patcher.start()
diff --git a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py
index fd557eece..084bfa476 100644
--- a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py
+++ b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py
@@ -25,9 +25,7 @@
from unittest import TestCase, skip
from openlp.core.common import Registry
-from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract
-from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract
-from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract
+from openlp.plugins.bibles.lib.importers.http import BGExtract, CWExtract, BSExtract
from tests.interfaces import MagicMock