mirror of https://gitlab.com/openlp/openlp.git
Merge branch 'crosswalk-fix' into 'master'
Change crosswalk webbible parsing to match new layout. See merge request openlp/openlp!460
This commit is contained in:
commit
5a07a0c438
|
@ -21,6 +21,7 @@
|
|||
"""
|
||||
The :mod:`http` module enables OpenLP to retrieve scripture from bible websites.
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import socket
|
||||
|
@ -516,14 +517,14 @@ class CWExtract(RegistryProperties):
|
|||
url_book_name = book_name.replace(' ', '-')
|
||||
url_book_name = url_book_name.lower()
|
||||
url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
|
||||
chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
|
||||
book=url_book_name,
|
||||
chapter=chapter)
|
||||
chapter_url = 'https://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
|
||||
book=url_book_name,
|
||||
chapter=chapter)
|
||||
soup = get_soup_for_bible_ref(chapter_url)
|
||||
if not soup:
|
||||
return None
|
||||
self.application.process_events()
|
||||
verses_div = soup.find_all('div', 'verse')
|
||||
verses_div = soup.find_all('div', {'data-verse-id': True})
|
||||
if not verses_div:
|
||||
log.error('No verses found in the CrossWalk response.')
|
||||
send_error_message('parse')
|
||||
|
@ -531,12 +532,11 @@ class CWExtract(RegistryProperties):
|
|||
verses = {}
|
||||
for verse in verses_div:
|
||||
self.application.process_events()
|
||||
verse_number = int(verse.find('span', 'verse-number').strong.contents[0])
|
||||
verse_span = verse.find('span', class_='verse-%d' % verse_number)
|
||||
tags_to_remove = verse_span.find_all(['a', 'sup'])
|
||||
verse_number = int(verse['data-verse-id'])
|
||||
tags_to_remove = verse.find_all(['a', 'sup', 'h3'])
|
||||
for tag in tags_to_remove:
|
||||
tag.decompose()
|
||||
verse_text = verse_span.get_text()
|
||||
verse_text = verse.get_text()
|
||||
self.application.process_events()
|
||||
# Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
|
||||
verse_text = verse_text.strip('\n\r\t ')
|
||||
|
@ -552,18 +552,25 @@ class CWExtract(RegistryProperties):
|
|||
:param version: The version of the bible like NIV for New International Version
|
||||
"""
|
||||
log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
|
||||
chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
|
||||
soup = get_soup_for_bible_ref(chapter_url)
|
||||
if not soup:
|
||||
return None
|
||||
content = soup.find_all('h4', {'class': 'small-header'})
|
||||
if not content:
|
||||
log.error('No books found in the Crosswalk response.')
|
||||
send_error_message('parse')
|
||||
return None
|
||||
books_url = 'https://www.biblestudytools.com/api/bible/books-selection/?translationCode={version}'
|
||||
books_url = books_url.format(version=version)
|
||||
books = []
|
||||
for book in content:
|
||||
books.append(book.contents[0])
|
||||
books_page = get_web_page(books_url)
|
||||
if not books_page:
|
||||
log.error('No books found in the CrossWalk response.')
|
||||
send_error_message('parse')
|
||||
return books
|
||||
books_json = json.loads(books_page)
|
||||
for book in books_json:
|
||||
# the link looks like this: https://www.biblestudytools.com/bla/2-corintios/
|
||||
link = book['link']
|
||||
# remove trailing forward slash
|
||||
link = link.strip('/')
|
||||
# remove everything before the book name/code
|
||||
book_name = link[link.rfind('/') + 1:]
|
||||
# replace dash with space
|
||||
book_name = book_name.replace('-', ' ')
|
||||
books.append(book_name)
|
||||
return books
|
||||
|
||||
def get_bibles_from_http(self):
|
||||
|
@ -572,29 +579,16 @@ class CWExtract(RegistryProperties):
|
|||
returns a list in the form [(biblename, biblekey, language_code)]
|
||||
"""
|
||||
log.debug('CWExtract.get_bibles_from_http')
|
||||
bible_url = 'http://www.biblestudytools.com/bible-versions/'
|
||||
bible_url = 'https://www.biblestudytools.com/'
|
||||
soup = get_soup_for_bible_ref(bible_url)
|
||||
if not soup:
|
||||
return None
|
||||
# Get all <div class="col-md-12"> on the page
|
||||
content_column = soup.find('div', id='content-column')
|
||||
if not content_column:
|
||||
log.error('No div[id=content-column] -- the site must have changed')
|
||||
return None
|
||||
col_md_12_divs = content_column.find_all('div', 'col-md-12')
|
||||
if not col_md_12_divs:
|
||||
log.error('No div[class=col-md-12] -- the site must have changed')
|
||||
return None
|
||||
# Get all <option class="log-translation" ...> on the page
|
||||
options = soup.find_all('option', {'class': 'log-translation'})
|
||||
bibles = []
|
||||
for col_md_12 in col_md_12_divs:
|
||||
# Check if <a><strong><span class="text-muted"> is a direct descendant
|
||||
if not col_md_12.a or not col_md_12.a.strong or not col_md_12.a.strong.span or \
|
||||
'text-muted' not in col_md_12.a.strong.span['class']:
|
||||
continue
|
||||
short_name = str(col_md_12.a.strong.span.string).strip().lower()
|
||||
if not short_name:
|
||||
continue
|
||||
tag_text = str(col_md_12.a.strong.contents[0]).strip()
|
||||
for option in options:
|
||||
short_name = option['value']
|
||||
tag_text = str(option.contents[0]).strip()
|
||||
# The names of non-English bibles have their language in parentheses at the end
|
||||
if tag_text.endswith(')'):
|
||||
language = tag_text[tag_text.rfind('(') + 1:-1]
|
||||
|
|
|
@ -51,18 +51,18 @@ class EasySlidesImport(SongImport):
|
|||
parser = etree.XMLParser(remove_blank_text=True, recover=True)
|
||||
try:
|
||||
with self.import_source.open('r') as xml_file:
|
||||
parsed_file = etree.parse(str(self.import_source), parser)
|
||||
parsed_file = etree.parse(xml_file, parser)
|
||||
except etree.XMLSyntaxError:
|
||||
log.exception('XML syntax error in file {name}'.format(name=xml_file))
|
||||
log.exception('XML syntax error in file {name}'.format(name=self.import_source))
|
||||
self.log_error(self.import_source, SongStrings.XMLSyntaxError)
|
||||
return
|
||||
except UnicodeDecodeError:
|
||||
log.exception('Unreadable characters in {name}'.format(name=xml_file))
|
||||
log.exception('Unreadable characters in {name}'.format(name=self.import_source))
|
||||
self.log_error(self.import_source, SongStrings.XMLSyntaxError)
|
||||
return
|
||||
file_str = etree.tostring(parsed_file)
|
||||
if not file_str:
|
||||
log.exception('Could not find XML in file {name}'.format(name=xml_file))
|
||||
log.exception('Could not find XML in file {name}'.format(name=self.import_source))
|
||||
self.log_error(self.import_source, SongStrings.XMLSyntaxError)
|
||||
return
|
||||
xml = file_str.decode()
|
||||
|
|
Loading…
Reference in New Issue