From f5480640f687d8ed44700243eab02135b42df4fe Mon Sep 17 00:00:00 2001
From: Philip Ridout
Date: Thu, 11 Aug 2016 20:07:21 +0100
Subject: [PATCH] more files

---
 .../bibles/lib/importers/biblegateway.py      | 313 ++++++++++++++++++
 .../bibles/lib/importers/bibleserver.py       | 162 +++++++++
 .../plugins/bibles/lib/importers/crosswalk.py | 171 ++++++++++
 3 files changed, 646 insertions(+)
 create mode 100644 openlp/plugins/bibles/lib/importers/biblegateway.py
 create mode 100644 openlp/plugins/bibles/lib/importers/bibleserver.py
 create mode 100644 openlp/plugins/bibles/lib/importers/crosswalk.py

diff --git a/openlp/plugins/bibles/lib/importers/biblegateway.py b/openlp/plugins/bibles/lib/importers/biblegateway.py
new file mode 100644
index 000000000..c6a8074bf
--- /dev/null
+++ b/openlp/plugins/bibles/lib/importers/biblegateway.py
@@ -0,0 +1,313 @@
+# -*- coding: utf-8 -*-
+# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
+
+###############################################################################
+# OpenLP - Open Source Lyrics Projection                                      #
+# --------------------------------------------------------------------------- #
+# Copyright (c) 2008-2016 OpenLP Developers                                   #
+# --------------------------------------------------------------------------- #
+# This program is free software; you can redistribute it and/or modify it     #
+# under the terms of the GNU General Public License as published by the Free  #
+# Software Foundation; version 2 of the License.                              #
+#                                                                             #
+# This program is distributed in the hope that it will be useful, but WITHOUT #
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       #
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for    #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with this program; if not, write to the Free Software Foundation, Inc., 59  #
+# Temple Place, Suite 330, Boston, MA 02111-1307 USA                          #
+###############################################################################
+"""
+The :mod:`biblegateway` module enables OpenLP to retrieve scripture from the BibleGateway website.
+"""
+import logging
+import socket
+import urllib.parse
+import urllib.error
+
+from bs4 import BeautifulSoup, NavigableString, Tag
+
+from openlp.core.common import RegistryProperties
+from openlp.core.lib.webpagereader import get_web_page
+from openlp.plugins.bibles.lib import SearchResults
+from openlp.plugins.bibles.lib.importers.http import get_soup_for_bible_ref, send_error_message
+
+UGLY_CHARS = {
+    '\u2014': ' - ',
+    '\u2018': '\'',
+    '\u2019': '\'',
+    '\u201c': '"',
+    '\u201d': '"',
+    '\u00a0': ' '
+}
+
+log = logging.getLogger(__name__)
+
+
+class BGExtract(RegistryProperties):
+    """
+    Extract verses from BibleGateway
+    """
+    def __init__(self, proxy_url=None):
+        log.debug('BGExtract.init("{url}")'.format(url=proxy_url))
+        self.proxy_url = proxy_url
+        socket.setdefaulttimeout(30)
+
+    def _remove_elements(self, parent, tag, class_=None):
+        """
+        Remove a particular element from the BeautifulSoup tree.
+
+        :param parent: The element from which items need to be removed.
+        :param tag: A string of the tag type, e.g. "div"
+        :param class_: An HTML class attribute for further qualification.
+        """
+        if class_:
+            all_tags = parent.find_all(tag, class_)
+        else:
+            all_tags = parent.find_all(tag)
+        for element in all_tags:
+            element.extract()
+
+    def _extract_verse(self, tag):
+        """
+        Extract a verse (or part of a verse) from a tag.
+
+        :param tag: The BeautifulSoup Tag element with the stuff we want.
+        """
+        if isinstance(tag, NavigableString):
+            return None, str(tag)
+        elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'):
+            verse = str(tag.string).replace('[', '').replace(']', '').strip()
+            return verse, None
+        elif tag.get('class') and tag.get('class')[0] == 'chapternum':
+            verse = '1'
+            return verse, None
+        else:
+            verse = None
+            text = ''
+            for child in tag.contents:
+                c_verse, c_text = self._extract_verse(child)
+                if c_verse:
+                    verse = c_verse
+                if text and c_text:
+                    text += c_text
+                elif c_text is not None:
+                    text = c_text
+            return verse, text
+
+    def _clean_soup(self, tag):
+        """
+        Remove all the rubbish from the HTML page.
+
+        :param tag: The base tag within which we want to remove stuff.
+        """
+        self._remove_elements(tag, 'sup', 'crossreference')
+        self._remove_elements(tag, 'sup', 'footnote')
+        self._remove_elements(tag, 'div', 'footnotes')
+        self._remove_elements(tag, 'div', 'crossrefs')
+        self._remove_elements(tag, 'h3')
+        self._remove_elements(tag, 'h4')
+        self._remove_elements(tag, 'h5')
+
+    def _extract_verses(self, tags):
+        """
+        Extract all the verses from a pre-prepared list of HTML tags.
+
+        :param tags: A list of BeautifulSoup Tag elements.
+        :return: A dict mapping each verse number (int) to its cleaned-up text.
+        """
+        verses = []
+        # Walk the tags backwards so that text without a verse number can be attached to the verse before it.
+        tags = tags[::-1]
+        current_text = ''
+        for tag in tags:
+            verse = None
+            text = ''
+            for child in tag.contents:
+                c_verse, c_text = self._extract_verse(child)
+                if c_verse:
+                    verse = c_verse
+                if text and c_text:
+                    text += c_text
+                elif c_text is not None:
+                    text = c_text
+            if not verse:
+                current_text = text + ' ' + current_text
+            else:
+                text += ' ' + current_text
+                current_text = ''
+            if text:
+                for old, new in UGLY_CHARS.items():
+                    text = text.replace(old, new)
+                text = ' '.join(text.split())
+            if verse and text:
+                verse = verse.strip()
+                try:
+                    verse = int(verse)
+                except ValueError:
+                    try:
+                        # Verse ranges such as "5-6" are stored under their first verse number.
+                        verse = int(verse.split('-')[0])
+                    except ValueError:
+                        log.warning('Illegal verse number: {verse}'.format(verse=verse))
+                        continue
+                verses.append((verse, text))
+        return dict(reversed(verses))
+
+    def _extract_verses_old(self, div):
+        """
+        Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
+        broken) HTML.
+
+        :param div: The parent div.
+        """
+        verse_list = {}
+        # Cater for inconsistent mark up in the first verse of a chapter.
+        first_verse = div.find('versenum')
+        if first_verse and first_verse.contents:
+            verse_list[1] = str(first_verse.contents[0])
+        for verse in div('sup', 'versenum'):
+            raw_verse_num = verse.next_element
+            clean_verse_num = 0
+            # Not all verses exist in all translations and may or may not be represented by a verse number. If they are
+            # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
+            # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
+            # completely.
+            try:
+                clean_verse_num = int(str(raw_verse_num))
+            except ValueError:
+                try:
+                    clean_verse_num = int(str(raw_verse_num).split('-')[0])
+                except ValueError:
+                    log.warning('Illegal verse number: {verse}'.format(verse=raw_verse_num))
+            if clean_verse_num:
+                verse_text = raw_verse_num.next_element
+                part = raw_verse_num.next_element.next_element
+                # Stop at the next verse number; tags without a class and the end of the document are tolerated.
+                while part is not None and not (isinstance(part, Tag) and part.get('class', [''])[0] == 'versenum'):
+                    # While we are still in the same verse grab all the text.
+                    if isinstance(part, NavigableString):
+                        verse_text += part
+                    if isinstance(part.next_element, Tag) and part.next_element.name == 'div':
+                        # Run out of verses so stop.
+                        break
+                    part = part.next_element
+                verse_list[clean_verse_num] = str(verse_text)
+        return verse_list
+
+    def get_bible_chapter(self, version, book_name, chapter):
+        """
+        Access and decode Bibles via the BibleGateway website.
+
+        :param version: The version of the Bible like 31 for New International version.
+        :param book_name: Name of the Book.
+        :param chapter: Chapter number.
+        """
+        log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version,
+                                                                                           name=book_name,
+                                                                                           chapter=chapter))
+        url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
+        url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name,
+                                                                        chapter=chapter,
+                                                                        version=version)
+        soup = get_soup_for_bible_ref(
+            'https://biblegateway.com/passage/?{url}'.format(url=url_params),
+            pre_parse_regex=r'', pre_parse_substitute='')
+        if not soup:
+            return None
+        div = soup.find('div', 'result-text-style-normal')
+        if not div:
+            return None
+        self._clean_soup(div)
+        span_list = div.find_all('span', 'text')
+        log.debug('Span list: {span}'.format(span=span_list))
+        if not span_list:
+            # If we don't get any spans then we must have the old HTML format
+            verse_list = self._extract_verses_old(div)
+        else:
+            verse_list = self._extract_verses(span_list)
+        if not verse_list:
+            log.debug('No content found in the BibleGateway response.')
+            send_error_message('parse')
+            return None
+        return SearchResults(book_name, chapter, verse_list)
+
+    def get_books_from_http(self, version):
+        """
+        Load a list of all books a Bible contains from BibleGateway website.
+
+        :param version: The version of the Bible like NIV for New International Version
+        """
+        log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version))
+        url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)})
+        reference_url = 'https://biblegateway.com/versions/?{url}#books'.format(url=url_params)
+        page = get_web_page(reference_url)
+        if not page:
+            send_error_message('download')
+            return None
+        page_source = page.read()
+        try:
+            page_source = str(page_source, 'utf8')
+        except UnicodeDecodeError:
+            page_source = str(page_source, 'cp1251')
+        try:
+            soup = BeautifulSoup(page_source, 'lxml')
+        except Exception:
+            log.error('BeautifulSoup could not parse the Bible page.')
+            send_error_message('parse')
+            return None
+        if not soup:
+            send_error_message('parse')
+            return None
+        self.application.process_events()
+        content = soup.find('table', 'infotable')
+        if content:
+            content = content.find_all('tr')
+        if not content:
+            log.error('No books found in the Biblegateway response.')
+            send_error_message('parse')
+            return None
+        books = []
+        for book in content:
+            book = book.find('td')
+            if book and len(book.contents) > 1:
+                books.append(book.contents[1])
+        return books
+
+    def get_bibles_from_http(self):
+        """
+        Load a list of bibles from BibleGateway website.
+
+        returns a list in the form [(biblename, biblekey, language_code)]
+        """
+        log.debug('BGExtract.get_bibles_from_http')
+        bible_url = 'https://biblegateway.com/versions/'
+        soup = get_soup_for_bible_ref(bible_url)
+        if not soup:
+            return None
+        bible_select = soup.find('select', {'class': 'search-translation-select'})
+        if not bible_select:
+            log.debug('No select tags found - did site change?')
+            return None
+        option_tags = bible_select.find_all('option')
+        if not option_tags:
+            log.debug('No option tags found - did site change?')
+            return None
+        current_lang = ''
+        bibles = []
+        for ot in option_tags:
+            # Options without a ``class`` attribute are treated as ordinary Bible entries.
+            tag_class = ot.get('class', [''])[0]
+            tag_text = ot.get_text()
+            if tag_class == 'lang':
+                current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower()
+            elif tag_class == 'spacer':
+                continue
+            else:
+                # An option without a ``value`` has no key to request later, so it is left out.
+                bible_key = ot.get('value')
+                if bible_key:
+                    bibles.append((tag_text, bible_key, current_lang))
+        return bibles
diff --git a/openlp/plugins/bibles/lib/importers/bibleserver.py b/openlp/plugins/bibles/lib/importers/bibleserver.py
new file mode 100644
index 000000000..e651b84ab
--- /dev/null
+++ b/openlp/plugins/bibles/lib/importers/bibleserver.py
@@ -0,0 +1,162 @@
+# -*- coding: utf-8 -*-
+# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
+
+###############################################################################
+# OpenLP - Open Source Lyrics Projection                                      #
+# --------------------------------------------------------------------------- #
+# Copyright (c) 2008-2016 OpenLP Developers                                   #
+# --------------------------------------------------------------------------- #
+# This program is free software; you can redistribute it and/or modify it     #
+# under the terms of the GNU General Public License as published by the Free  #
+# Software Foundation; version 2 of the License.                              #
+#                                                                             #
+# This program is distributed in the hope that it will be useful, but WITHOUT #
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       #
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for    #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with this program; if not, write to the Free Software Foundation, Inc., 59  #
+# Temple Place, Suite 330, Boston, MA 02111-1307 USA                          #
+###############################################################################
+"""
+The :mod:`bibleserver` module enables OpenLP to retrieve scripture from the Bibleserver website.
+"""
+import logging
+import re
+import socket
+import urllib.parse
+import urllib.error
+
+from openlp.core.common import RegistryProperties
+from openlp.plugins.bibles.lib import SearchResults
+from openlp.plugins.bibles.lib.importers.http import get_soup_for_bible_ref, send_error_message
+
+VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
+
+BIBLESERVER_LANGUAGE_CODE = {
+    'fl_1': 'de',
+    'fl_2': 'en',
+    'fl_3': 'fr',
+    'fl_4': 'it',
+    'fl_5': 'es',
+    'fl_6': 'pt',
+    'fl_7': 'ru',
+    'fl_8': 'sv',
+    'fl_9': 'no',
+    'fl_10': 'nl',
+    'fl_11': 'cs',
+    'fl_12': 'sk',
+    'fl_13': 'ro',
+    'fl_14': 'hr',
+    'fl_15': 'hu',
+    'fl_16': 'bg',
+    'fl_17': 'ar',
+    'fl_18': 'tr',
+    'fl_19': 'pl',
+    'fl_20': 'da',
+    'fl_21': 'zh'
+}
+
+log = logging.getLogger(__name__)
+
+
+class BSExtract(RegistryProperties):
+    """
+    Extract verses from Bibleserver.com
+    """
+    def __init__(self, proxy_url=None):
+        log.debug('BSExtract.init("{url}")'.format(url=proxy_url))
+        self.proxy_url = proxy_url
+        socket.setdefaulttimeout(30)
+
+    def get_bible_chapter(self, version, book_name, chapter):
+        """
+        Access and decode bibles via Bibleserver mobile website
+
+        :param version: The version of the bible like NIV for New International Version
+        :param book_name: Text name of bible book e.g. Genesis, 1. John, 1John or Offenbarung
+        :param chapter: Chapter number
+        """
+        log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
+                                                                                           book=book_name,
+                                                                                           chapter=chapter))
+        url_version = urllib.parse.quote(version.encode("utf-8"))
+        url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
+        chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter}'.format(version=url_version,
+                                                                                       name=url_book_name,
+                                                                                       chapter=chapter)
+        header = ('Accept-Language', 'en')
+        soup = get_soup_for_bible_ref(chapter_url, header)
+        if not soup:
+            return None
+        self.application.process_events()
+        content = soup.find('div', 'content')
+        if not content:
+            log.error('No verses found in the Bibleserver response.')
+            send_error_message('parse')
+            return None
+        content = content.find('div').find_all('div')
+        verses = {}
+        for verse in content:
+            self.application.process_events()
+            match = VERSE_NUMBER_REGEX.search(' '.join(verse.get('class', [])))
+            if match and len(verse.contents) > 1:
+                verses[int(match.group(3))] = verse.contents[1].rstrip('\n')
+        return SearchResults(book_name, chapter, verses)
+
+    def get_books_from_http(self, version):
+        """
+        Load a list of all books a Bible contains from Bibleserver mobile website.
+
+        :param version: The version of the Bible like NIV for New International Version
+        """
+        log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version))
+        url_version = urllib.parse.quote(version.encode("utf-8"))
+        chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version)
+        soup = get_soup_for_bible_ref(chapter_url)
+        if not soup:
+            return None
+        content = soup.find('ul')
+        if not content:
+            log.error('No books found in the Bibleserver response.')
+            send_error_message('parse')
+            return None
+        content = content.find_all('li')
+        return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)]
+
+    def get_bibles_from_http(self):
+        """
+        Load a list of bibles from Bibleserver website.
+
+        returns a list in the form [(biblename, biblekey, language_code)]
+        """
+        log.debug('BSExtract.get_bibles_from_http')
+        bible_url = 'http://www.bibleserver.com/index.php?language=2'
+        soup = get_soup_for_bible_ref(bible_url)
+        if not soup:
+            return None
+        bible_links = soup.find_all('a', {'class': 'trlCell'})
+        if not bible_links:
+            log.debug('No a tags found - did site change?')
+            return None
+        bibles = []
+        for link in bible_links:
+            bible_name = link.get_text()
+            # Skip any audio
+            if 'audio' in bible_name.lower():
+                continue
+            try:
+                bible_link = link['href']
+                bible_key = bible_link[bible_link.rfind('/') + 1:]
+                css_classes = link['class']
+            except KeyError:
+                log.debug('No href/class attribute found - did site change?')
+                # Without these attributes there is no key to store, so move on to the next link.
+                continue
+            language_code = ''
+            for css_class in css_classes:
+                if css_class.startswith('fl_'):
+                    language_code = BIBLESERVER_LANGUAGE_CODE.get(css_class, '')
+            bibles.append((bible_name, bible_key, language_code))
+        return bibles
diff --git a/openlp/plugins/bibles/lib/importers/crosswalk.py b/openlp/plugins/bibles/lib/importers/crosswalk.py
new file mode 100644
index 000000000..6c75209d1
--- /dev/null
+++ b/openlp/plugins/bibles/lib/importers/crosswalk.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
+
+###############################################################################
+# OpenLP - Open Source Lyrics Projection                                      #
+# --------------------------------------------------------------------------- #
+# Copyright (c) 2008-2016 OpenLP Developers                                   #
+# --------------------------------------------------------------------------- #
+# This program is free software; you can redistribute it and/or modify it     #
+# under the terms of the GNU General Public License as published by the Free  #
+# Software Foundation; version 2 of the License.                              #
+#                                                                             #
+# This program is distributed in the hope that it will be useful, but WITHOUT #
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       #
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for    #
+# more details.                                                               #
+#                                                                             #
+# You should have received a copy of the GNU General Public License along     #
+# with this program; if not, write to the Free Software Foundation, Inc., 59  #
+# Temple Place, Suite 330, Boston, MA 02111-1307 USA                          #
+###############################################################################
+"""
+The :mod:`crosswalk` module enables OpenLP to retrieve scripture from the CrossWalk/BibleStudyTools website.
+"""
+import logging
+import re
+import socket
+import urllib.parse
+import urllib.error
+
+from openlp.core.common import RegistryProperties
+from openlp.plugins.bibles.lib import SearchResults
+from openlp.plugins.bibles.lib.importers.http import get_soup_for_bible_ref, send_error_message
+
+FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
+REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
+
+
+CROSSWALK_LANGUAGES = {
+    'Portuguese': 'pt',
+    'German': 'de',
+    'Italian': 'it',
+    'Español': 'es',
+    'French': 'fr',
+    'Dutch': 'nl'
+}
+
+log = logging.getLogger(__name__)
+
+
+class CWExtract(RegistryProperties):
+    """
+    Extract verses from CrossWalk/BibleStudyTools
+    """
+    def __init__(self, proxy_url=None):
+        log.debug('CWExtract.init("{url}")'.format(url=proxy_url))
+        self.proxy_url = proxy_url
+        socket.setdefaulttimeout(30)
+
+    def get_bible_chapter(self, version, book_name, chapter):
+        """
+        Access and decode bibles via the Crosswalk website
+
+        :param version: The version of the Bible like niv for New International Version
+        :param book_name: Text name of the book in English, e.g. 'gen' for Genesis
+        :param chapter: Chapter number
+        """
+        log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
+                                                                                           book=book_name,
+                                                                                           chapter=chapter))
+        url_book_name = book_name.replace(' ', '-')
+        url_book_name = url_book_name.lower()
+        url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
+        chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
+                                                                                              book=url_book_name,
+                                                                                              chapter=chapter)
+        soup = get_soup_for_bible_ref(chapter_url)
+        if not soup:
+            return None
+        self.application.process_events()
+        verses_div = soup.find_all('div', 'verse')
+        if not verses_div:
+            log.error('No verses found in the CrossWalk response.')
+            send_error_message('parse')
+            return None
+        verses = {}
+        for verse in verses_div:
+            self.application.process_events()
+            verse_strong = verse.find('strong')
+            verse_span = verse.find('span')
+            # Skip blocks without a verse number or text element rather than failing on ``None``.
+            if verse_strong is None or verse_span is None or not verse_strong.contents:
+                continue
+            verse_number = int(verse_strong.contents[0])
+            tags_to_remove = verse_span.find_all(['a', 'sup'])
+            for tag in tags_to_remove:
+                tag.decompose()
+            verse_text = verse_span.get_text()
+            self.application.process_events()
+            # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
+            verse_text = verse_text.strip('\n\r\t ')
+            verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text)
+            verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
+            verses[verse_number] = verse_text
+        return SearchResults(book_name, chapter, verses)
+
+    def get_books_from_http(self, version):
+        """
+        Load a list of all books a Bible contains from the Crosswalk website.
+
+        :param version: The version of the bible like NIV for New International Version
+        """
+        log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
+        chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
+        soup = get_soup_for_bible_ref(chapter_url)
+        if not soup:
+            return None
+        content = soup.find_all('h4', {'class': 'small-header'})
+        if not content:
+            log.error('No books found in the Crosswalk response.')
+            send_error_message('parse')
+            return None
+        # Empty headers carry no book name, so they are left out.
+        return [book.contents[0] for book in content if book.contents]
+
+    def get_bibles_from_http(self):
+        """
+        Load a list of bibles from Crosswalk website.
+
+        returns a list in the form [(biblename, biblekey, language_code)]
+        """
+        log.debug('CWExtract.get_bibles_from_http')
+        bible_url = 'http://www.biblestudytools.com/bible-versions/'
+        soup = get_soup_for_bible_ref(bible_url)
+        if not soup:
+            return None
+        h4_tags = soup.find_all('h4', {'class': 'small-header'})
+        if not h4_tags:
+            log.debug('No h4 tags found - did site change?')
+            return None
+        bibles = []
+        for h4t in h4_tags:
+            if h4t.span:
+                short_name = h4t.span.get_text().strip().lower()
+            else:
+                log.error('No span tag found - did site change?')
+                return None
+            if not short_name:
+                continue
+            h4t.span.extract()
+            tag_text = h4t.get_text().strip()
+            # The names of non-english bibles has their language in parentheses at the end
+            if tag_text.endswith(')'):
+                language = tag_text[tag_text.rfind('(') + 1:-1]
+                # Languages missing from CROSSWALK_LANGUAGES get an empty language code.
+                language_code = CROSSWALK_LANGUAGES.get(language, '')
+            # ... except for those that don't...
+            elif 'latin' in tag_text.lower():
+                language_code = 'la'
+            elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
+                language_code = 'es'
+            elif 'chinese' in tag_text.lower():
+                language_code = 'zh'
+            elif 'greek' in tag_text.lower():
+                language_code = 'el'
+            elif 'nova' in tag_text.lower():
+                language_code = 'pt'
+            else:
+                language_code = 'en'
+            bibles.append((tag_text, short_name, language_code))
+        return bibles