split web bibles in to their own modules

This commit is contained in:
Philip Ridout 2016-08-11 20:08:25 +01:00
commit 34a5689499
3 changed files with 646 additions and 0 deletions

View File

@ -0,0 +1,313 @@
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2016 OpenLP Developers #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
"""
The :mod:`http` module enables OpenLP to retrieve scripture from bible websites.
"""
import logging
import socket
import urllib.parse
import urllib.error
from bs4 import BeautifulSoup, NavigableString, Tag
from openlp.core.common import RegistryProperties
from openlp.core.lib.webpagereader import get_web_page
from openlp.plugins.bibles.lib import SearchResults
from openlp.plugins.bibles.lib.importers.http import get_soup_for_bible_ref, send_error_message
UGLY_CHARS = {
'\u2014': ' - ',
'\u2018': '\'',
'\u2019': '\'',
'\u201c': '"',
'\u201d': '"',
' ': ' '
}
log = logging.getLogger(__name__)
class BGExtract(RegistryProperties):
"""
Extract verses from BibleGateway
"""
def __init__(self, proxy_url=None):
log.debug('BGExtract.init("{url}")'.format(url=proxy_url))
self.proxy_url = proxy_url
socket.setdefaulttimeout(30)
def _remove_elements(self, parent, tag, class_=None):
"""
Remove a particular element from the BeautifulSoup tree.
:param parent: The element from which items need to be removed.
:param tag: A string of the tab type, e.g. "div"
:param class_: An HTML class attribute for further qualification.
"""
if class_:
all_tags = parent.find_all(tag, class_)
else:
all_tags = parent.find_all(tag)
for element in all_tags:
element.extract()
def _extract_verse(self, tag):
"""
Extract a verse (or part of a verse) from a tag.
:param tag: The BeautifulSoup Tag element with the stuff we want.
"""
if isinstance(tag, NavigableString):
return None, str(tag)
elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'):
verse = str(tag.string).replace('[', '').replace(']', '').strip()
return verse, None
elif tag.get('class') and tag.get('class')[0] == 'chapternum':
verse = '1'
return verse, None
else:
verse = None
text = ''
for child in tag.contents:
c_verse, c_text = self._extract_verse(child)
if c_verse:
verse = c_verse
if text and c_text:
text += c_text
elif c_text is not None:
text = c_text
return verse, text
def _clean_soup(self, tag):
"""
Remove all the rubbish from the HTML page.
:param tag: The base tag within which we want to remove stuff.
"""
self._remove_elements(tag, 'sup', 'crossreference')
self._remove_elements(tag, 'sup', 'footnote')
self._remove_elements(tag, 'div', 'footnotes')
self._remove_elements(tag, 'div', 'crossrefs')
self._remove_elements(tag, 'h3')
self._remove_elements(tag, 'h4')
self._remove_elements(tag, 'h5')
def _extract_verses(self, tags):
"""
Extract all the verses from a pre-prepared list of HTML tags.
:param tags: A list of BeautifulSoup Tag elements.
"""
verses = []
tags = tags[::-1]
current_text = ''
for tag in tags:
verse = None
text = ''
for child in tag.contents:
c_verse, c_text = self._extract_verse(child)
if c_verse:
verse = c_verse
if text and c_text:
text += c_text
elif c_text is not None:
text = c_text
if not verse:
current_text = text + ' ' + current_text
else:
text += ' ' + current_text
current_text = ''
if text:
for old, new in UGLY_CHARS.items():
text = text.replace(old, new)
text = ' '.join(text.split())
if verse and text:
verse = verse.strip()
try:
verse = int(verse)
except ValueError:
verse_parts = verse.split('-')
if len(verse_parts) > 1:
verse = int(verse_parts[0])
except TypeError:
log.warning('Illegal verse number: {verse:d}'.format(verse=verse))
verses.append((verse, text))
verse_list = {}
for verse, text in verses[::-1]:
verse_list[verse] = text
return verse_list
def _extract_verses_old(self, div):
"""
Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
broken) HTML.
:param div: The parent div.
"""
verse_list = {}
# Cater for inconsistent mark up in the first verse of a chapter.
first_verse = div.find('versenum')
if first_verse and first_verse.contents:
verse_list[1] = str(first_verse.contents[0])
for verse in div('sup', 'versenum'):
raw_verse_num = verse.next_element
clean_verse_num = 0
# Not all verses exist in all translations and may or may not be represented by a verse number. If they are
# not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
# garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
# completely.
try:
clean_verse_num = int(str(raw_verse_num))
except ValueError:
verse_parts = str(raw_verse_num).split('-')
if len(verse_parts) > 1:
clean_verse_num = int(verse_parts[0])
except TypeError:
log.warning('Illegal verse number: {verse:d}'.format(verse=raw_verse_num))
if clean_verse_num:
verse_text = raw_verse_num.next_element
part = raw_verse_num.next_element.next_element
while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'):
# While we are still in the same verse grab all the text.
if isinstance(part, NavigableString):
verse_text += part
if isinstance(part.next_element, Tag) and part.next_element.name == 'div':
# Run out of verses so stop.
break
part = part.next_element
verse_list[clean_verse_num] = str(verse_text)
return verse_list
def get_bible_chapter(self, version, book_name, chapter):
"""
Access and decode Bibles via the BibleGateway website.
:param version: The version of the Bible like 31 for New International version.
:param book_name: Name of the Book.
:param chapter: Chapter number.
"""
log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version,
name=book_name,
chapter=chapter))
url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name,
chapter=chapter,
version=version)
soup = get_soup_for_bible_ref(
'http://biblegateway.com/passage/?{url}'.format(url=url_params),
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
if not soup:
return None
div = soup.find('div', 'result-text-style-normal')
if not div:
return None
self._clean_soup(div)
span_list = div.find_all('span', 'text')
log.debug('Span list: {span}'.format(span=span_list))
if not span_list:
# If we don't get any spans then we must have the old HTML format
verse_list = self._extract_verses_old(div)
else:
verse_list = self._extract_verses(span_list)
if not verse_list:
log.debug('No content found in the BibleGateway response.')
send_error_message('parse')
return None
return SearchResults(book_name, chapter, verse_list)
def get_books_from_http(self, version):
"""
Load a list of all books a Bible contains from BibleGateway website.
:param version: The version of the Bible like NIV for New International Version
"""
log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version))
url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)})
reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params)
page = get_web_page(reference_url)
if not page:
send_error_message('download')
return None
page_source = page.read()
try:
page_source = str(page_source, 'utf8')
except UnicodeDecodeError:
page_source = str(page_source, 'cp1251')
try:
soup = BeautifulSoup(page_source, 'lxml')
except Exception:
log.error('BeautifulSoup could not parse the Bible page.')
send_error_message('parse')
return None
if not soup:
send_error_message('parse')
return None
self.application.process_events()
content = soup.find('table', 'infotable')
if content:
content = content.find_all('tr')
if not content:
log.error('No books found in the Biblegateway response.')
send_error_message('parse')
return None
books = []
for book in content:
book = book.find('td')
if book:
books.append(book.contents[1])
return books
def get_bibles_from_http(self):
"""
Load a list of bibles from BibleGateway website.
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('BGExtract.get_bibles_from_http')
bible_url = 'https://biblegateway.com/versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_select = soup.find('select', {'class': 'search-translation-select'})
if not bible_select:
log.debug('No select tags found - did site change?')
return None
option_tags = bible_select.find_all('option')
if not option_tags:
log.debug('No option tags found - did site change?')
return None
current_lang = ''
bibles = []
for ot in option_tags:
tag_class = ''
try:
tag_class = ot['class'][0]
except KeyError:
tag_class = ''
tag_text = ot.get_text()
if tag_class == 'lang':
current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower()
elif tag_class == 'spacer':
continue
else:
bibles.append((tag_text, ot['value'], current_lang))
return bibles

View File

@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2016 OpenLP Developers #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
"""
The :mod:`http` module enables OpenLP to retrieve scripture from bible websites.
"""
import logging
import re
import socket
import urllib.parse
import urllib.error
from openlp.core.common import RegistryProperties
from openlp.plugins.bibles.lib import SearchResults
from openlp.plugins.bibles.lib.http import get_soup_for_bible_ref, send_error_message
VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
BIBLESERVER_LANGUAGE_CODE = {
'fl_1': 'de',
'fl_2': 'en',
'fl_3': 'fr',
'fl_4': 'it',
'fl_5': 'es',
'fl_6': 'pt',
'fl_7': 'ru',
'fl_8': 'sv',
'fl_9': 'no',
'fl_10': 'nl',
'fl_11': 'cs',
'fl_12': 'sk',
'fl_13': 'ro',
'fl_14': 'hr',
'fl_15': 'hu',
'fl_16': 'bg',
'fl_17': 'ar',
'fl_18': 'tr',
'fl_19': 'pl',
'fl_20': 'da',
'fl_21': 'zh'
}
log = logging.getLogger(__name__)
class BSExtract(RegistryProperties):
"""
Extract verses from Bibleserver.com
"""
def __init__(self, proxy_url=None):
log.debug('BSExtract.init("{url}")'.format(url=proxy_url))
self.proxy_url = proxy_url
socket.setdefaulttimeout(30)
def get_bible_chapter(self, version, book_name, chapter):
"""
Access and decode bibles via Bibleserver mobile website
:param version: The version of the bible like NIV for New International Version
:param book_name: Text name of bible book e.g. Genesis, 1. John, 1John or Offenbarung
:param chapter: Chapter number
"""
log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
book=book_name,
chapter=chapter))
url_version = urllib.parse.quote(version.encode("utf-8"))
url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter:d}'.format(version=url_version,
name=url_book_name,
chapter=chapter)
header = ('Accept-Language', 'en')
soup = get_soup_for_bible_ref(chapter_url, header)
if not soup:
return None
self.application.process_events()
content = soup.find('div', 'content')
if not content:
log.error('No verses found in the Bibleserver response.')
send_error_message('parse')
return None
content = content.find('div').find_all('div')
verses = {}
for verse in content:
self.application.process_events()
versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', ' '.join(verse['class'])))
verses[versenumber] = verse.contents[1].rstrip('\n')
return SearchResults(book_name, chapter, verses)
def get_books_from_http(self, version):
"""
Load a list of all books a Bible contains from Bibleserver mobile website.
:param version: The version of the Bible like NIV for New International Version
"""
log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version))
url_version = urllib.parse.quote(version.encode("utf-8"))
chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
content = soup.find('ul')
if not content:
log.error('No books found in the Bibleserver response.')
send_error_message('parse')
return None
content = content.find_all('li')
return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)]
def get_bibles_from_http(self):
"""
Load a list of bibles from Bibleserver website.
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('BSExtract.get_bibles_from_http')
bible_url = 'http://www.bibleserver.com/index.php?language=2'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_links = soup.find_all('a', {'class': 'trlCell'})
if not bible_links:
log.debug('No a tags found - did site change?')
return None
bibles = []
for link in bible_links:
bible_name = link.get_text()
# Skip any audio
if 'audio' in bible_name.lower():
continue
try:
bible_link = link['href']
bible_key = bible_link[bible_link.rfind('/') + 1:]
css_classes = link['class']
except KeyError:
log.debug('No href/class attribute found - did site change?')
language_code = ''
for css_class in css_classes:
if css_class.startswith('fl_'):
try:
language_code = BIBLESERVER_LANGUAGE_CODE[css_class]
except KeyError:
language_code = ''
bibles.append((bible_name, bible_key, language_code))
return bibles

View File

@ -0,0 +1,171 @@
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4
###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2016 OpenLP Developers #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
"""
The :mod:`http` module enables OpenLP to retrieve scripture from bible websites.
"""
import logging
import re
import socket
import urllib.parse
import urllib.error
from openlp.core.common import RegistryProperties
from openlp.plugins.bibles.lib import SearchResults
from openlp.plugins.bibles.lib.importers.http import get_soup_for_bible_ref, send_error_message
FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
CROSSWALK_LANGUAGES = {
'Portuguese': 'pt',
'German': 'de',
'Italian': 'it',
'Español': 'es',
'French': 'fr',
'Dutch': 'nl'
}
log = logging.getLogger(__name__)
class CWExtract(RegistryProperties):
"""
Extract verses from CrossWalk/BibleStudyTools
"""
def __init__(self, proxy_url=None):
log.debug('CWExtract.init("{url}")'.format(url=proxy_url))
self.proxy_url = proxy_url
socket.setdefaulttimeout(30)
def get_bible_chapter(self, version, book_name, chapter):
"""
Access and decode bibles via the Crosswalk website
:param version: The version of the Bible like niv for New International Version
:param book_name: Text name of in english e.g. 'gen' for Genesis
:param chapter: Chapter number
"""
log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
book=book_name,
chapter=chapter))
url_book_name = book_name.replace(' ', '-')
url_book_name = url_book_name.lower()
url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
book=url_book_name,
chapter=chapter)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
self.application.process_events()
verses_div = soup.find_all('div', 'verse')
if not verses_div:
log.error('No verses found in the CrossWalk response.')
send_error_message('parse')
return None
verses = {}
for verse in verses_div:
self.application.process_events()
verse_number = int(verse.find('strong').contents[0])
verse_span = verse.find('span')
tags_to_remove = verse_span.find_all(['a', 'sup'])
for tag in tags_to_remove:
tag.decompose()
verse_text = verse_span.get_text()
self.application.process_events()
# Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
verse_text = verse_text.strip('\n\r\t ')
verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text)
verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
verses[verse_number] = verse_text
return SearchResults(book_name, chapter, verses)
def get_books_from_http(self, version):
"""
Load a list of all books a Bible contain from the Crosswalk website.
:param version: The version of the bible like NIV for New International Version
"""
log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
content = soup.find_all('h4', {'class': 'small-header'})
if not content:
log.error('No books found in the Crosswalk response.')
send_error_message('parse')
return None
books = []
for book in content:
books.append(book.contents[0])
return books
def get_bibles_from_http(self):
"""
Load a list of bibles from Crosswalk website.
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('CWExtract.get_bibles_from_http')
bible_url = 'http://www.biblestudytools.com/bible-versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
h4_tags = soup.find_all('h4', {'class': 'small-header'})
if not h4_tags:
log.debug('No h4 tags found - did site change?')
return None
bibles = []
for h4t in h4_tags:
short_name = None
if h4t.span:
short_name = h4t.span.get_text().strip().lower()
else:
log.error('No span tag found - did site change?')
return None
if not short_name:
continue
h4t.span.extract()
tag_text = h4t.get_text().strip()
# The names of non-english bibles has their language in parentheses at the end
if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1]
if language in CROSSWALK_LANGUAGES:
language_code = CROSSWALK_LANGUAGES[language]
else:
language_code = ''
# ... except for those that don't...
elif 'latin' in tag_text.lower():
language_code = 'la'
elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
language_code = 'es'
elif 'chinese' in tag_text.lower():
language_code = 'zh'
elif 'greek' in tag_text.lower():
language_code = 'el'
elif 'nova' in tag_text.lower():
language_code = 'pt'
else:
language_code = 'en'
bibles.append((tag_text, short_name, language_code))
return bibles