Change crosswalk webbible parsing to match new layout.

This commit is contained in:
Tomas Groth 2022-05-27 18:54:01 +00:00 committed by Raoul Snyman
parent ba31406b8f
commit cd8eefff9b
2 changed files with 36 additions and 42 deletions

View File

@ -21,6 +21,7 @@
"""
The :mod:`http` module enables OpenLP to retrieve scripture from bible websites.
"""
import json
import logging
import re
import socket
@ -516,14 +517,14 @@ class CWExtract(RegistryProperties):
url_book_name = book_name.replace(' ', '-')
url_book_name = url_book_name.lower()
url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
book=url_book_name,
chapter=chapter)
chapter_url = 'https://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
book=url_book_name,
chapter=chapter)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
self.application.process_events()
verses_div = soup.find_all('div', 'verse')
verses_div = soup.find_all('div', {'data-verse-id': True})
if not verses_div:
log.error('No verses found in the CrossWalk response.')
send_error_message('parse')
@ -531,12 +532,11 @@ class CWExtract(RegistryProperties):
verses = {}
for verse in verses_div:
self.application.process_events()
verse_number = int(verse.find('span', 'verse-number').strong.contents[0])
verse_span = verse.find('span', class_='verse-%d' % verse_number)
tags_to_remove = verse_span.find_all(['a', 'sup'])
verse_number = int(verse['data-verse-id'])
tags_to_remove = verse.find_all(['a', 'sup', 'h3'])
for tag in tags_to_remove:
tag.decompose()
verse_text = verse_span.get_text()
verse_text = verse.get_text()
self.application.process_events()
# Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
verse_text = verse_text.strip('\n\r\t ')
@ -552,18 +552,25 @@ class CWExtract(RegistryProperties):
:param version: The version of the bible like NIV for New International Version
"""
log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
soup = get_soup_for_bible_ref(chapter_url)
if not soup:
return None
content = soup.find_all('h4', {'class': 'small-header'})
if not content:
log.error('No books found in the Crosswalk response.')
send_error_message('parse')
return None
books_url = 'https://www.biblestudytools.com/api/bible/books-selection/?translationCode={version}'
books_url = books_url.format(version=version)
books = []
for book in content:
books.append(book.contents[0])
books_page = get_web_page(books_url)
if not books_page:
log.error('No books found in the CrossWalk response.')
send_error_message('parse')
return books
books_json = json.loads(books_page)
for book in books_json:
# the link looks like this: https://www.biblestudytools.com/bla/2-corintios/
link = book['link']
# remove trailing forward slash
link = link.strip('/')
# remove everything before the book name/code
book_name = link[link.rfind('/') + 1:]
# replace dash with space
book_name = book_name.replace('-', ' ')
books.append(book_name)
return books
def get_bibles_from_http(self):
@ -572,29 +579,16 @@ class CWExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('CWExtract.get_bibles_from_http')
bible_url = 'http://www.biblestudytools.com/bible-versions/'
bible_url = 'https://www.biblestudytools.com/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
# Get all <div class="col-md-12"> on the page
content_column = soup.find('div', id='content-column')
if not content_column:
log.error('No div[id=content-column] -- the site must have changed')
return None
col_md_12_divs = content_column.find_all('div', 'col-md-12')
if not col_md_12_divs:
log.error('No div[class=col-md-12] -- the site must have changed')
return None
# Get all <option class="log-translation" ...> on the page
options = soup.find_all('option', {'class': 'log-translation'})
bibles = []
for col_md_12 in col_md_12_divs:
# Check if <a><strong><span class="text-muted"> is a direct descendant
if not col_md_12.a or not col_md_12.a.strong or not col_md_12.a.strong.span or \
'text-muted' not in col_md_12.a.strong.span['class']:
continue
short_name = str(col_md_12.a.strong.span.string).strip().lower()
if not short_name:
continue
tag_text = str(col_md_12.a.strong.contents[0]).strip()
for option in options:
short_name = option['value']
tag_text = str(option.contents[0]).strip()
# The names of non-english bibles has their language in parentheses at the end
if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1]

View File

@ -51,18 +51,18 @@ class EasySlidesImport(SongImport):
parser = etree.XMLParser(remove_blank_text=True, recover=True)
try:
with self.import_source.open('r') as xml_file:
parsed_file = etree.parse(str(self.import_source), parser)
parsed_file = etree.parse(xml_file, parser)
except etree.XMLSyntaxError:
log.exception('XML syntax error in file {name}'.format(name=xml_file))
log.exception('XML syntax error in file {name}'.format(name=self.import_source))
self.log_error(self.import_source, SongStrings.XMLSyntaxError)
return
except UnicodeDecodeError:
log.exception('Unreadable characters in {name}'.format(name=xml_file))
log.exception('Unreadable characters in {name}'.format(name=self.import_source))
self.log_error(self.import_source, SongStrings.XMLSyntaxError)
return
file_str = etree.tostring(parsed_file)
if not file_str:
log.exception('Could not find XML in file {name}'.format(name=xml_file))
log.exception('Could not find XML in file {name}'.format(name=self.import_source))
self.log_error(self.import_source, SongStrings.XMLSyntaxError)
return
xml = file_str.decode()