From 5ac5c6cd68f17cca78b3358c8f18b7a67b391659 Mon Sep 17 00:00:00 2001 From: Philip Ridout Date: Thu, 11 Aug 2016 20:02:29 +0100 Subject: [PATCH] split the web bible importers out into their own files --- .../plugins/bibles/forms/bibleimportform.py | 4 +- openlp/plugins/bibles/lib/importers/http.py | 535 ------------------ openlp/plugins/bibles/lib/importers/osis.py | 2 +- .../openlp_plugins/bibles/test_lib_http.py | 4 +- 4 files changed, 7 insertions(+), 538 deletions(-) diff --git a/openlp/plugins/bibles/forms/bibleimportform.py b/openlp/plugins/bibles/forms/bibleimportform.py index 3d02228ca..e9eee88d5 100644 --- a/openlp/plugins/bibles/forms/bibleimportform.py +++ b/openlp/plugins/bibles/forms/bibleimportform.py @@ -40,7 +40,9 @@ from openlp.core.ui.lib.wizard import OpenLPWizard, WizardStrings from openlp.core.common.languagemanager import get_locale_key from openlp.plugins.bibles.lib.manager import BibleFormat from openlp.plugins.bibles.lib.db import clean_filename -from openlp.plugins.bibles.lib.importers.http import CWExtract, BGExtract, BSExtract +from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract +from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract +from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract log = logging.getLogger(__name__) diff --git a/openlp/plugins/bibles/lib/importers/http.py b/openlp/plugins/bibles/lib/importers/http.py index 6921c9005..5afd107f6 100644 --- a/openlp/plugins/bibles/lib/importers/http.py +++ b/openlp/plugins/bibles/lib/importers/http.py @@ -38,545 +38,10 @@ from openlp.plugins.bibles.lib.bibleimport import BibleImport from openlp.plugins.bibles.lib.db import BibleDB, BiblesResourcesDB, Book CLEANER_REGEX = re.compile(r' |
|\'\+\'') -FIX_PUNKCTUATION_REGEX = re.compile(r'[ ]+([.,;])') -REDUCE_SPACES_REGEX = re.compile(r'[ ]{2,}') -UGLY_CHARS = { - '\u2014': ' - ', - '\u2018': '\'', - '\u2019': '\'', - '\u201c': '"', - '\u201d': '"', - ' ': ' ' -} -VERSE_NUMBER_REGEX = re.compile(r'v(\d{1,2})(\d{3})(\d{3}) verse.*') - -BIBLESERVER_LANGUAGE_CODE = { - 'fl_1': 'de', - 'fl_2': 'en', - 'fl_3': 'fr', - 'fl_4': 'it', - 'fl_5': 'es', - 'fl_6': 'pt', - 'fl_7': 'ru', - 'fl_8': 'sv', - 'fl_9': 'no', - 'fl_10': 'nl', - 'fl_11': 'cs', - 'fl_12': 'sk', - 'fl_13': 'ro', - 'fl_14': 'hr', - 'fl_15': 'hu', - 'fl_16': 'bg', - 'fl_17': 'ar', - 'fl_18': 'tr', - 'fl_19': 'pl', - 'fl_20': 'da', - 'fl_21': 'zh' -} - -CROSSWALK_LANGUAGES = { - 'Portuguese': 'pt', - 'German': 'de', - 'Italian': 'it', - 'EspaƱol': 'es', - 'French': 'fr', - 'Dutch': 'nl' -} log = logging.getLogger(__name__) -class BGExtract(RegistryProperties): - """ - Extract verses from BibleGateway - """ - def __init__(self, proxy_url=None): - log.debug('BGExtract.init("{url}")'.format(url=proxy_url)) - self.proxy_url = proxy_url - socket.setdefaulttimeout(30) - - def _remove_elements(self, parent, tag, class_=None): - """ - Remove a particular element from the BeautifulSoup tree. - - :param parent: The element from which items need to be removed. - :param tag: A string of the tab type, e.g. "div" - :param class_: An HTML class attribute for further qualification. - """ - if class_: - all_tags = parent.find_all(tag, class_) - else: - all_tags = parent.find_all(tag) - for element in all_tags: - element.extract() - - def _extract_verse(self, tag): - """ - Extract a verse (or part of a verse) from a tag. - - :param tag: The BeautifulSoup Tag element with the stuff we want. 
- """ - if isinstance(tag, NavigableString): - return None, str(tag) - elif tag.get('class') and (tag.get('class')[0] == 'versenum' or tag.get('class')[0] == 'versenum mid-line'): - verse = str(tag.string).replace('[', '').replace(']', '').strip() - return verse, None - elif tag.get('class') and tag.get('class')[0] == 'chapternum': - verse = '1' - return verse, None - else: - verse = None - text = '' - for child in tag.contents: - c_verse, c_text = self._extract_verse(child) - if c_verse: - verse = c_verse - if text and c_text: - text += c_text - elif c_text is not None: - text = c_text - return verse, text - - def _clean_soup(self, tag): - """ - Remove all the rubbish from the HTML page. - - :param tag: The base tag within which we want to remove stuff. - """ - self._remove_elements(tag, 'sup', 'crossreference') - self._remove_elements(tag, 'sup', 'footnote') - self._remove_elements(tag, 'div', 'footnotes') - self._remove_elements(tag, 'div', 'crossrefs') - self._remove_elements(tag, 'h3') - self._remove_elements(tag, 'h4') - self._remove_elements(tag, 'h5') - - def _extract_verses(self, tags): - """ - Extract all the verses from a pre-prepared list of HTML tags. - - :param tags: A list of BeautifulSoup Tag elements. 
- """ - verses = [] - tags = tags[::-1] - current_text = '' - for tag in tags: - verse = None - text = '' - for child in tag.contents: - c_verse, c_text = self._extract_verse(child) - if c_verse: - verse = c_verse - if text and c_text: - text += c_text - elif c_text is not None: - text = c_text - if not verse: - current_text = text + ' ' + current_text - else: - text += ' ' + current_text - current_text = '' - if text: - for old, new in UGLY_CHARS.items(): - text = text.replace(old, new) - text = ' '.join(text.split()) - if verse and text: - verse = verse.strip() - try: - verse = int(verse) - except ValueError: - verse_parts = verse.split('-') - if len(verse_parts) > 1: - verse = int(verse_parts[0]) - except TypeError: - log.warning('Illegal verse number: {verse:d}'.format(verse=verse)) - verses.append((verse, text)) - verse_list = {} - for verse, text in verses[::-1]: - verse_list[verse] = text - return verse_list - - def _extract_verses_old(self, div): - """ - Use the old style of parsing for those Bibles on BG who mysteriously have not been migrated to the new (still - broken) HTML. - - :param div: The parent div. - """ - verse_list = {} - # Cater for inconsistent mark up in the first verse of a chapter. - first_verse = div.find('versenum') - if first_verse and first_verse.contents: - verse_list[1] = str(first_verse.contents[0]) - for verse in div('sup', 'versenum'): - raw_verse_num = verse.next_element - clean_verse_num = 0 - # Not all verses exist in all translations and may or may not be represented by a verse number. If they are - # not fine, if they are it will probably be in a format that breaks int(). We will then have no idea what - # garbage may be sucked in to the verse text so if we do not get a clean int() then ignore the verse - # completely. 
- try: - clean_verse_num = int(str(raw_verse_num)) - except ValueError: - verse_parts = str(raw_verse_num).split('-') - if len(verse_parts) > 1: - clean_verse_num = int(verse_parts[0]) - except TypeError: - log.warning('Illegal verse number: {verse:d}'.format(verse=raw_verse_num)) - if clean_verse_num: - verse_text = raw_verse_num.next_element - part = raw_verse_num.next_element.next_element - while not (isinstance(part, Tag) and part.get('class')[0] == 'versenum'): - # While we are still in the same verse grab all the text. - if isinstance(part, NavigableString): - verse_text += part - if isinstance(part.next_element, Tag) and part.next_element.name == 'div': - # Run out of verses so stop. - break - part = part.next_element - verse_list[clean_verse_num] = str(verse_text) - return verse_list - - def get_bible_chapter(self, version, book_name, chapter): - """ - Access and decode Bibles via the BibleGateway website. - - :param version: The version of the Bible like 31 for New International version. - :param book_name: Name of the Book. - :param chapter: Chapter number. 
- """ - log.debug('BGExtract.get_bible_chapter("{version}", "{name}", "{chapter}")'.format(version=version, - name=book_name, - chapter=chapter)) - url_book_name = urllib.parse.quote(book_name.encode("utf-8")) - url_params = 'search={name}+{chapter}&version={version}'.format(name=url_book_name, - chapter=chapter, - version=version) - soup = get_soup_for_bible_ref( - 'http://biblegateway.com/passage/?{url}'.format(url=url_params), - pre_parse_regex=r'', pre_parse_substitute='') - if not soup: - return None - div = soup.find('div', 'result-text-style-normal') - if not div: - return None - self._clean_soup(div) - span_list = div.find_all('span', 'text') - log.debug('Span list: {span}'.format(span=span_list)) - if not span_list: - # If we don't get any spans then we must have the old HTML format - verse_list = self._extract_verses_old(div) - else: - verse_list = self._extract_verses(span_list) - if not verse_list: - log.debug('No content found in the BibleGateway response.') - send_error_message('parse') - return None - return SearchResults(book_name, chapter, verse_list) - - def get_books_from_http(self, version): - """ - Load a list of all books a Bible contains from BibleGateway website. 
- - :param version: The version of the Bible like NIV for New International Version - """ - log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version)) - url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)}) - reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params) - page = get_web_page(reference_url) - if not page: - send_error_message('download') - return None - page_source = page.read() - try: - page_source = str(page_source, 'utf8') - except UnicodeDecodeError: - page_source = str(page_source, 'cp1251') - try: - soup = BeautifulSoup(page_source, 'lxml') - except Exception: - log.error('BeautifulSoup could not parse the Bible page.') - send_error_message('parse') - return None - if not soup: - send_error_message('parse') - return None - self.application.process_events() - content = soup.find('table', 'infotable') - if content: - content = content.find_all('tr') - if not content: - log.error('No books found in the Biblegateway response.') - send_error_message('parse') - return None - books = [] - for book in content: - book = book.find('td') - if book: - books.append(book.contents[1]) - return books - - def get_bibles_from_http(self): - """ - Load a list of bibles from BibleGateway website. 
- - returns a list in the form [(biblename, biblekey, language_code)] - """ - log.debug('BGExtract.get_bibles_from_http') - bible_url = 'https://biblegateway.com/versions/' - soup = get_soup_for_bible_ref(bible_url) - if not soup: - return None - bible_select = soup.find('select', {'class': 'search-translation-select'}) - if not bible_select: - log.debug('No select tags found - did site change?') - return None - option_tags = bible_select.find_all('option') - if not option_tags: - log.debug('No option tags found - did site change?') - return None - current_lang = '' - bibles = [] - for ot in option_tags: - tag_class = '' - try: - tag_class = ot['class'][0] - except KeyError: - tag_class = '' - tag_text = ot.get_text() - if tag_class == 'lang': - current_lang = tag_text[tag_text.find('(') + 1:tag_text.find(')')].lower() - elif tag_class == 'spacer': - continue - else: - bibles.append((tag_text, ot['value'], current_lang)) - return bibles - - -class BSExtract(RegistryProperties): - """ - Extract verses from Bibleserver.com - """ - def __init__(self, proxy_url=None): - log.debug('BSExtract.init("{url}")'.format(url=proxy_url)) - self.proxy_url = proxy_url - socket.setdefaulttimeout(30) - - def get_bible_chapter(self, version, book_name, chapter): - """ - Access and decode bibles via Bibleserver mobile website - - :param version: The version of the bible like NIV for New International Version - :param book_name: Text name of bible book e.g. Genesis, 1. 
John, 1John or Offenbarung - :param chapter: Chapter number - """ - log.debug('BSExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version, - book=book_name, - chapter=chapter)) - url_version = urllib.parse.quote(version.encode("utf-8")) - url_book_name = urllib.parse.quote(book_name.encode("utf-8")) - chapter_url = 'http://m.bibleserver.com/text/{version}/{name}{chapter:d}'.format(version=url_version, - name=url_book_name, - chapter=chapter) - header = ('Accept-Language', 'en') - soup = get_soup_for_bible_ref(chapter_url, header) - if not soup: - return None - self.application.process_events() - content = soup.find('div', 'content') - if not content: - log.error('No verses found in the Bibleserver response.') - send_error_message('parse') - return None - content = content.find('div').find_all('div') - verses = {} - for verse in content: - self.application.process_events() - versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', ' '.join(verse['class']))) - verses[versenumber] = verse.contents[1].rstrip('\n') - return SearchResults(book_name, chapter, verses) - - def get_books_from_http(self, version): - """ - Load a list of all books a Bible contains from Bibleserver mobile website. 
- - :param version: The version of the Bible like NIV for New International Version - """ - log.debug('BSExtract.get_books_from_http("{version}")'.format(version=version)) - url_version = urllib.parse.quote(version.encode("utf-8")) - chapter_url = 'http://m.bibleserver.com/overlay/selectBook?translation={version}'.format(version=url_version) - soup = get_soup_for_bible_ref(chapter_url) - if not soup: - return None - content = soup.find('ul') - if not content: - log.error('No books found in the Bibleserver response.') - send_error_message('parse') - return None - content = content.find_all('li') - return [book.contents[0].contents[0] for book in content if len(book.contents[0].contents)] - - def get_bibles_from_http(self): - """ - Load a list of bibles from Bibleserver website. - - returns a list in the form [(biblename, biblekey, language_code)] - """ - log.debug('BSExtract.get_bibles_from_http') - bible_url = 'http://www.bibleserver.com/index.php?language=2' - soup = get_soup_for_bible_ref(bible_url) - if not soup: - return None - bible_links = soup.find_all('a', {'class': 'trlCell'}) - if not bible_links: - log.debug('No a tags found - did site change?') - return None - bibles = [] - for link in bible_links: - bible_name = link.get_text() - # Skip any audio - if 'audio' in bible_name.lower(): - continue - try: - bible_link = link['href'] - bible_key = bible_link[bible_link.rfind('/') + 1:] - css_classes = link['class'] - except KeyError: - log.debug('No href/class attribute found - did site change?') - language_code = '' - for css_class in css_classes: - if css_class.startswith('fl_'): - try: - language_code = BIBLESERVER_LANGUAGE_CODE[css_class] - except KeyError: - language_code = '' - bibles.append((bible_name, bible_key, language_code)) - return bibles - - -class CWExtract(RegistryProperties): - """ - Extract verses from CrossWalk/BibleStudyTools - """ - def __init__(self, proxy_url=None): - log.debug('CWExtract.init("{url}")'.format(url=proxy_url)) - 
self.proxy_url = proxy_url - socket.setdefaulttimeout(30) - - def get_bible_chapter(self, version, book_name, chapter): - """ - Access and decode bibles via the Crosswalk website - - :param version: The version of the Bible like niv for New International Version - :param book_name: Text name of in english e.g. 'gen' for Genesis - :param chapter: Chapter number - """ - log.debug('CWExtract.get_bible_chapter("{version}", "{book}", "{chapter}")'.format(version=version, - book=book_name, - chapter=chapter)) - url_book_name = book_name.replace(' ', '-') - url_book_name = url_book_name.lower() - url_book_name = urllib.parse.quote(url_book_name.encode("utf-8")) - chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version, - book=url_book_name, - chapter=chapter) - soup = get_soup_for_bible_ref(chapter_url) - if not soup: - return None - self.application.process_events() - verses_div = soup.find_all('div', 'verse') - if not verses_div: - log.error('No verses found in the CrossWalk response.') - send_error_message('parse') - return None - verses = {} - for verse in verses_div: - self.application.process_events() - verse_number = int(verse.find('strong').contents[0]) - verse_span = verse.find('span') - tags_to_remove = verse_span.find_all(['a', 'sup']) - for tag in tags_to_remove: - tag.decompose() - verse_text = verse_span.get_text() - self.application.process_events() - # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and . - verse_text = verse_text.strip('\n\r\t ') - verse_text = REDUCE_SPACES_REGEX.sub(' ', verse_text) - verse_text = FIX_PUNKCTUATION_REGEX.sub(r'\1', verse_text) - verses[verse_number] = verse_text - return SearchResults(book_name, chapter, verses) - - def get_books_from_http(self, version): - """ - Load a list of all books a Bible contain from the Crosswalk website. 
- - :param version: The version of the bible like NIV for New International Version - """ - log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version)) - chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version) - soup = get_soup_for_bible_ref(chapter_url) - if not soup: - return None - content = soup.find_all('h4', {'class': 'small-header'}) - if not content: - log.error('No books found in the Crosswalk response.') - send_error_message('parse') - return None - books = [] - for book in content: - books.append(book.contents[0]) - return books - - def get_bibles_from_http(self): - """ - Load a list of bibles from Crosswalk website. - returns a list in the form [(biblename, biblekey, language_code)] - """ - log.debug('CWExtract.get_bibles_from_http') - bible_url = 'http://www.biblestudytools.com/bible-versions/' - soup = get_soup_for_bible_ref(bible_url) - if not soup: - return None - h4_tags = soup.find_all('h4', {'class': 'small-header'}) - if not h4_tags: - log.debug('No h4 tags found - did site change?') - return None - bibles = [] - for h4t in h4_tags: - short_name = None - if h4t.span: - short_name = h4t.span.get_text().strip().lower() - else: - log.error('No span tag found - did site change?') - return None - if not short_name: - continue - h4t.span.extract() - tag_text = h4t.get_text().strip() - # The names of non-english bibles has their language in parentheses at the end - if tag_text.endswith(')'): - language = tag_text[tag_text.rfind('(') + 1:-1] - if language in CROSSWALK_LANGUAGES: - language_code = CROSSWALK_LANGUAGES[language] - else: - language_code = '' - # ... except for those that don't... 
- elif 'latin' in tag_text.lower(): - language_code = 'la' - elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower(): - language_code = 'es' - elif 'chinese' in tag_text.lower(): - language_code = 'zh' - elif 'greek' in tag_text.lower(): - language_code = 'el' - elif 'nova' in tag_text.lower(): - language_code = 'pt' - else: - language_code = 'en' - bibles.append((tag_text, short_name, language_code)) - return bibles - - class HTTPBible(BibleImport, RegistryProperties): log.info('{name} HTTPBible loaded'.format(name=__name__)) diff --git a/openlp/plugins/bibles/lib/importers/osis.py b/openlp/plugins/bibles/lib/importers/osis.py index 99a138acd..c833277fe 100644 --- a/openlp/plugins/bibles/lib/importers/osis.py +++ b/openlp/plugins/bibles/lib/importers/osis.py @@ -108,7 +108,7 @@ class OSISBible(BibleImport): if self.stop_import_flag: break # Remove div-tags in the book - etree.strip_tags(book, ('{http://www.bibletechnologies.net/2003/OSIS/namespace}div')) + etree.strip_tags(book, '{http://www.bibletechnologies.net/2003/OSIS/namespace}div') book_ref_id = self.get_book_ref_id_by_name(book.get('osisID'), num_books, language_id) if not book_ref_id: log.error('Importing books from "{name}" failed'.format(name=self.filename)) diff --git a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py index 084bfa476..fd557eece 100644 --- a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py +++ b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py @@ -25,7 +25,9 @@ from unittest import TestCase, skip from openlp.core.common import Registry -from openlp.plugins.bibles.lib.importers.http import BGExtract, CWExtract, BSExtract +from openlp.plugins.bibles.lib.importers.biblegateway import BGExtract +from openlp.plugins.bibles.lib.importers.bibleserver import BSExtract +from openlp.plugins.bibles.lib.importers.crosswalk import CWExtract from tests.interfaces import MagicMock