split out web bible importers

Philip Ridout 2016-08-11 20:03:58 +01:00
commit 0617f71134
4 changed files with 7 additions and 538 deletions
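The three web extractors (BGExtract for BibleGateway, BSExtract for Bibleserver and CWExtract for Crosswalk/BibleStudyTools) move out of openlp.plugins.bibles.lib.importers.http into modules of their own. A minimal sketch of the resulting import change, assuming the extractor API itself is untouched by the move (the version, book and chapter values below are illustrative, not taken from the diff):

# Before this commit:
# from openlp.plugins.bibles.lib.importers.http import CWExtract, BGExtract, BSExtract
# After this commit, each extractor lives in its own module:
from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract
from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract
from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract

handler = BGExtract()  # proxy_url defaults to None, per the removed constructor
result = handler.get_bible_chapter('NIV', 'John', 3)  # SearchResults on success, None on failure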

View File

@@ -40,7 +40,9 @@ from openlp.core.ui.lib.wizard import OpenLPWizard, WizardStrings
from openlp.core.common.languagemanager import get_locale_key
from openlp.plugins.bibles.lib.manager import BibleFormat
from openlp.plugins.bibles.lib.db import clean_filename
-from openlp.plugins.bibles.lib.importers.http import CWExtract, BGExtract, BSExtract
+from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract
+from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract
+from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract
log = logging.getLogger(__name__)

View File

@@ -38,545 +38,10 @@ from openlp.plugins.bibles.lib.bibleimport import BibleImport
from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book
CLEANER_REGEX = re.compile(r'&nbsp;|<br />|\'\+\'')
FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])')
REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}')
UGLY_CHARS = {
'\u2014': ' - ',
'\u2018': '\'',
'\u2019': '\'',
'\u201c': '"',
'\u201d': '"',
'&nbsp;': ' '
}
VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*')
BIBLESERVER_LANGUAGE_CODE = {
'fl_1': 'de',
'fl_2': 'en',
'fl_3': 'fr',
'fl_4': 'it',
'fl_5': 'es',
'fl_6': 'pt',
'fl_7': 'ru',
'fl_8': 'sv',
'fl_9': 'no',
'fl_10': 'nl',
'fl_11': 'cs',
'fl_12': 'sk',
'fl_13': 'ro',
'fl_14': 'hr',
'fl_15': 'hu',
'fl_16': 'bg',
'fl_17': 'ar',
'fl_18': 'tr',
'fl_19': 'pl',
'fl_20': 'da',
'fl_21': 'zh'
}
CROSSWALK_LANGUAGES = {
'Portuguese': 'pt',
'German': 'de',
'Italian': 'it',
'Español': 'es',
'French': 'fr',
'Dutch': 'nl'
}
log = logging.getLogger(__name__)
class BGExtract(RegistryProperties):
"""
Extract verses from BibleGateway
"""
def __init__(self, proxy_url=None):
log.debug('BGExtract.init("{url}")'.format(url=proxy_url))
self.proxy_url = proxy_url
socket.setdefaulttimeout(30)
def _remove_elements(self, parent, tag, class_=None):
"""
Remove a particular element from the BeautifulSoup tree.
:param parent: The element from which items need to be removed.
:param tag: A string of the tab type, e.g. "div"
:param class_: An HTML class attribute for further qualification.
"""
if class_:
all_tags = parent.find_all(tag, class_)
else:
all_tags = parent.find_all(tag)
for element in all_tags:
element.extract()
def _extract_verse(self, tag):
"""
Extract a verse (or part of a verse) from a tag.
:param tag: The BeautifulSoup Tag element with the stuff we want.
"""
if isinstance(tag, NavigableString):
return None, str(tag)
elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'):
verse = str(tag.string).replace('[', '').replace(']', '').strip()
return verse, None
elif tag.get('class') and tag.get('class')[0] == 'chapternum':
verse = '1'
return verse, None
else:
verse = None
text = ''
for child in tag.contents:
c_verse, c_text = self._extract_verse(child)
if c_verse:
verse = c_verse
if text and c_text:
text += c_text
elif c_text is not None:
text = c_text
return verse, text
def _clean_soup(self, tag):
"""
Remove all the rubbish from the HTML page.
:param tag: The base tag within which we want to remove stuff.
"""
self._remove_elements(tag, 'sup', 'crossreference')
self._remove_elements(tag, 'sup', 'footnote')
self._remove_elements(tag, 'div', 'footnotes')
self._remove_elements(tag, 'div', 'crossrefs')
self._remove_elements(tag, 'h3')
self._remove_elements(tag, 'h4')
self._remove_elements(tag, 'h5')
def _extract_verses(self, tags):
"""
Extract all the verses from a pre-prepared list of HTML tags.
:param tags: A list of BeautifulSoup Tag elements.
"""
verses = []
tags = tags[::-1]
current_text = ''
for tag in tags:
verse = None
text = ''
for child in tag.contents:
c_verse, c_text = self._extract_verse(child)
if c_verse:
verse = c_verse
if text and c_text:
text += c_text
elif c_text is not None:
text = c_text
if not verse:
current_text = text + ' ' + current_text
else:
text += ' ' + current_text
current_text = ''
if text:
for old, new in UGLY_CHARS.items():
text = text.replace(old, new)
text = ' '.join(text.split())
if verse and text:
verse = verse.strip()
try:
verse = int(verse)
except ValueError:
verse_parts = verse.split('-')
if len(verse_parts) > 1:
verse = int(verse_parts[0])
except TypeError:
log.warning('Illegal verse number: {verse:d}'.format(verse=verse))
verses.append((verse, text))
verse_list = {}
for verse, text in verses[::-1]:
verse_list[verse] = text
return verse_list
def _extract_verses_old(self, div):
"""
Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still
broken) HTML.
:param div: The parent div.
"""
verse_list = {}
# Cater for inconsistent mark up in the first verse of a chapter.
first_verse = div.find('versenum')
if first_verse and first_verse.contents:
verse_list[1] = str(first_verse.contents[0])
for verse in div('sup', 'versenum'):
raw_verse_num = verse.next_element
clean_verse_num = 0
# Not all verses exist in all translations and may or may not be represented by a verse number. If they are
# not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what
# garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse
# completely.
try:
clean_verse_num = int(str(raw_verse_num))
except ValueError:
verse_parts = str(raw_verse_num).split('-')
if len(verse_parts) > 1:
clean_verse_num = int(verse_parts[0])
except TypeError:
log.warning('Illegal verse number: {verse:d}'.format(verse=raw_verse_num))
if clean_verse_num:
verse_text = raw_verse_num.next_element
part = raw_verse_num.next_element.next_element
while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'):
# While we are still in the same verse grab all the text.
if isinstance(part, NavigableString):
verse_text += part
if isinstance(part.next_element, Tag) and part.next_element.name == 'div':
# Run out of verses so stop.
break
part = part.next_element
verse_list[clean_verse_num] = str(verse_text)
return verse_list
def get_bible_chapter(self, version, book_name, chapter):
"""
Access and decode Bibles via the BibleGateway website.
:param version: The version of the Bible like 31 for New International version.
:param book_name: Name of the Book.
:param chapter: Chapter number.
"""
log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version,
name=book_name,
chapter=chapter))
url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name,
chapter=chapter,
version=version)
soup = get_soup_for_bible_ref(
'http://biblegateway.com/passage/?{url}'.format(url=url_params),
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
if not soup:
return None
div = soup.find('div', 'result-text-style-normal')
if not div:
return None
self._clean_soup(div)
span_list = div.find_all('span', 'text')
log.debug('Span list: {span}'.format(span=span_list))
if not span_list:
# If we don't get any spans then we must have the old HTML format
verse_list = self._extract_verses_old(div)
else:
verse_list = self._extract_verses(span_list)
if not verse_list:
log.debug('No content found in the BibleGateway response.')
send_error_message('parse')
return None
return SearchResults(book_name, chapter, verse_list)
def get_books_from_http(self, version):
"""
Load a list of all books a Bible contains from BibleGateway website.
:param version: The version of the Bible like NIV for New International Version
"""
log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version))
url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)})
reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params)
page = get_web_page(reference_url)
if not page:
send_error_message('download')
return None
page_source = page.read()
try:
page_source = str(page_source, 'utf8')
except UnicodeDecodeError:
page_source = str(page_source, 'cp1251')
try:
soup = BeautifulSoup(page_source, 'lxml')
except Exception:
log.error('BeautifulSoup could not parse the Bible page.')
send_error_message('parse')
return None
if not soup:
send_error_message('parse')
return None
self.application.process_events()
content = soup.find('table', 'infotable')
if content:
content = content.find_all('tr')
if not content:
log.error('No books found in the Biblegateway response.')
send_error_message('parse')
return None
books = []
for book in content:
book = book.find('td')
if book:
books.append(book.contents[1])
return books
def get_bibles_from_http(self):
"""
Load a list of bibles from BibleGateway website.
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('BGExtract.get_bibles_from_http')
bible_url = 'https://biblegateway.com/versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_select = soup.find('select', {'class': 'search-translation-select'})
if not bible_select:
log.debug('No select tags found - did site change?')
return None
option_tags = bible_select.find_all('option')
if not option_tags:
log.debug('No option tags found - did site change?')
return None
current_lang = ''
bibles = []
for ot in option_tags:
tag_class = ''
try:
tag_class = ot['class'][0]
except KeyError:
tag_class = ''
tag_text = ot.get_text()
if tag_class == 'lang':
current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower()
elif tag_class == 'spacer':
continue
else:
bibles.append((tag_text, ot['value'], current_lang))
return bibles
class BSExtract(RegistryProperties):
"""
Extract verses from Bibleserver.com
"""
def __init__(self, proxy_url=None):
log.debug('BSExtract.init("{url}")'.format(url=proxy_url))
self.proxy_url = proxy_url
socket.setdefaulttimeout(30)
def get_bible_chapter(self, version, book_name, chapter):
"""
Access and decode bibles via Bibleserver mobile website
:param version: The version of the bible like NIV for New International Version
:param book_name: Text name of bible book e.g. Genesis, 1. John, 1John or Offenbarung
:param chapter: Chapter number
"""
log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
book=book_name,
chapter=chapter))
url_version = urllib.parse.quote(version.encode("utf-8"))
url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter:d}'.format(version=url_version,
name=url_book_name,
chapter=chapter)
header = ('Accept-Language', 'en')
soup = get_soup_for_bible_ref(chapter_url, header)
if not soup:
return None
self.application.process_events()
content = soup.find('div', 'content')
if not content:
log.error('No verses found in the Bibleserver response.')
send_error_message('parse')
return None
content = content.find('div').find_all('div')
verses = {}
for verse in content:
self.application.process_events()
versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', ' '.join(verse['class'])))
verses[versenumber] = verse.contents[1].rstrip('\n')
return SearchResults(book_name, chapter, verses)
def get_books_from_http(self, version):
"""
Load a list of all books a Bible contains from Bibleserver mobile website.
:param version: The version of the Bible like NIV for New International Version
"""
log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version))
url_version = urllib.parse.quote(version.encode("utf-8"))
chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
content = soup.find('ul')
if not content:
log.error('No books found in the Bibleserver response.')
send_error_message('parse')
return None
content = content.find_all('li')
return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)]
def get_bibles_from_http(self):
"""
Load a list of bibles from Bibleserver website.
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('BSExtract.get_bibles_from_http')
bible_url = 'http://www.bibleserver.com/index.php?language=2'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_links = soup.find_all('a', {'class': 'trlCell'})
if not bible_links:
log.debug('No a tags found - did site change?')
return None
bibles = []
for link in bible_links:
bible_name = link.get_text()
# Skip any audio
if 'audio' in bible_name.lower():
continue
try:
bible_link = link['href']
bible_key = bible_link[bible_link.rfind('/') + 1:]
css_classes = link['class']
except KeyError:
log.debug('No href/class attribute found - did site change?')
language_code = ''
for css_class in css_classes:
if css_class.startswith('fl_'):
try:
language_code = BIBLESERVER_LANGUAGE_CODE[css_class]
except KeyError:
language_code = ''
bibles.append((bible_name, bible_key, language_code))
return bibles
class CWExtract(RegistryProperties):
"""
Extract verses from CrossWalk/BibleStudyTools
"""
def __init__(self, proxy_url=None):
log.debug('CWExtract.init("{url}")'.format(url=proxy_url))
self.proxy_url = proxy_url
socket.setdefaulttimeout(30)
def get_bible_chapter(self, version, book_name, chapter):
"""
Access and decode bibles via the Crosswalk website
:param version: The version of the Bible like niv for New International Version
:param book_name: Text name of in english e.g. 'gen' for Genesis
:param chapter: Chapter number
"""
log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version,
book=book_name,
chapter=chapter))
url_book_name = book_name.replace(' ', '-')
url_book_name = url_book_name.lower()
url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
book=url_book_name,
chapter=chapter)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
self.application.process_events()
verses_div = soup.find_all('div', 'verse')
if not verses_div:
log.error('No verses found in the CrossWalk response.')
send_error_message('parse')
return None
verses = {}
for verse in verses_div:
self.application.process_events()
verse_number = int(verse.find('strong').contents[0])
verse_span = verse.find('span')
tags_to_remove = verse_span.find_all(['a', 'sup'])
for tag in tags_to_remove:
tag.decompose()
verse_text = verse_span.get_text()
self.application.process_events()
# Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
verse_text = verse_text.strip('\n\r\t ')
verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text)
verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text)
verses[verse_number] = verse_text
return SearchResults(book_name, chapter, verses)
def get_books_from_http(self, version):
"""
Load a list of all books a Bible contain from the Crosswalk website.
:param version: The version of the bible like NIV for New International Version
"""
log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
content = soup.find_all('h4', {'class': 'small-header'})
if not content:
log.error('No books found in the Crosswalk response.')
send_error_message('parse')
return None
books = []
for book in content:
books.append(book.contents[0])
return books
def get_bibles_from_http(self):
"""
Load a list of bibles from Crosswalk website.
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('CWExtract.get_bibles_from_http')
bible_url = 'http://www.biblestudytools.com/bible-versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
h4_tags = soup.find_all('h4', {'class': 'small-header'})
if not h4_tags:
log.debug('No h4 tags found - did site change?')
return None
bibles = []
for h4t in h4_tags:
short_name = None
if h4t.span:
short_name = h4t.span.get_text().strip().lower()
else:
log.error('No span tag found - did site change?')
return None
if not short_name:
continue
h4t.span.extract()
tag_text = h4t.get_text().strip()
# The names of non-english bibles has their language in parentheses at the end
if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1]
if language in CROSSWALK_LANGUAGES:
language_code = CROSSWALK_LANGUAGES[language]
else:
language_code = ''
# ... except for those that don't...
elif 'latin' in tag_text.lower():
language_code = 'la'
elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
language_code = 'es'
elif 'chinese' in tag_text.lower():
language_code = 'zh'
elif 'greek' in tag_text.lower():
language_code = 'el'
elif 'nova' in tag_text.lower():
language_code = 'pt'
else:
language_code = 'en'
bibles.append((tag_text, short_name, language_code))
return bibles
class HTTPBible(BibleImport, RegistryProperties):
log.info('{name} HTTPBible loaded'.format(name=__name__))

View File

@@ -108,7 +108,7 @@ class OSISBible(BibleImport):
if self.stop_import_flag:
break
# Remove div-tags in the book
-etree.strip_tags(book, ('{http://www.bibletechnologies.net/2003/OSIS/namespace}div'))
+etree.strip_tags(book, '{http://www.bibletechnologies.net/2003/OSIS/namespace}div')
book_ref_id = self.get_book_ref_id_by_name(book.get('osisID'), num_books, language_id)
if not book_ref_id:
log.error('Importing books from "{name}" failed'.format(name=self.filename))

View File

@@ -25,7 +25,9 @@
from unittest import TestCase, skip
from openlp.core.common import Registry
-from openlp.plugins.bibles.lib.importers.http import BGExtract, CWExtract, BSExtract
+from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract
+from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract
+from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract
from tests.interfaces import MagicMock