Change crosswalk webbible parsing to match new layout.

2022-05-27 18:54:01 +00:00 · 2022-05-27 18:54:01 +00:00 · cd8eefff9b
parent ba31406b8f
commit cd8eefff9b
2 changed files with 36 additions and 42 deletions
--- a/openlp/plugins/bibles/lib/importers/http.py
+++ b/openlp/plugins/bibles/lib/importers/http.py
@ -21,6 +21,7 @@
 """
 The :mod:`http` module enables OpenLP to retrieve scripture from bible websites.
 """
+import json
 import logging
 import re
 import socket
@ -516,14 +517,14 @@ class CWExtract(RegistryProperties):
        url_book_name = book_name.replace(' ', '-')
        url_book_name = url_book_name.lower()
        url_book_name = urllib.parse.quote(url_book_name.encode("utf-8"))
-        chapter_url = 'http://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
-                                                                                              book=url_book_name,
-                                                                                              chapter=chapter)
+        chapter_url = 'https://www.biblestudytools.com/{version}/{book}/{chapter}.html'.format(version=version,
+                                                                                               book=url_book_name,
+                                                                                               chapter=chapter)
        soup = get_soup_for_bible_ref(chapter_url)
        if not soup:
            return None
        self.application.process_events()
-        verses_div = soup.find_all('div', 'verse')
+        verses_div = soup.find_all('div', {'data-verse-id': True})
        if not verses_div:
            log.error('No verses found in the CrossWalk response.')
            send_error_message('parse')
@ -531,12 +532,11 @@ class CWExtract(RegistryProperties):
        verses = {}
        for verse in verses_div:
            self.application.process_events()
-            verse_number = int(verse.find('span', 'verse-number').strong.contents[0])
-            verse_span = verse.find('span', class_='verse-%d' % verse_number)
-            tags_to_remove = verse_span.find_all(['a', 'sup'])
+            verse_number = int(verse['data-verse-id'])
+            tags_to_remove = verse.find_all(['a', 'sup', 'h3'])
            for tag in tags_to_remove:
                tag.decompose()
-            verse_text = verse_span.get_text()
+            verse_text = verse.get_text()
            self.application.process_events()
            # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
            verse_text = verse_text.strip('\n\r\t ')
@ -552,18 +552,25 @@ class CWExtract(RegistryProperties):
        :param version: The version of the bible like NIV for New International Version
        """
        log.debug('CWExtract.get_books_from_http("{version}")'.format(version=version))
-        chapter_url = 'http://www.biblestudytools.com/{version}/'.format(version=version)
-        soup = get_soup_for_bible_ref(chapter_url)
-        if not soup:
-            return None
-        content = soup.find_all('h4', {'class': 'small-header'})
-        if not content:
-            log.error('No books found in the Crosswalk response.')
-            send_error_message('parse')
-            return None
+        books_url = 'https://www.biblestudytools.com/api/bible/books-selection/?translationCode={version}'
+        books_url = books_url.format(version=version)
        books = []
-        for book in content:
-            books.append(book.contents[0])
+        books_page = get_web_page(books_url)
+        if not books_page:
+            log.error('No books found in the CrossWalk response.')
+            send_error_message('parse')
+            return books
+        books_json = json.loads(books_page)
+        for book in books_json:
+            # the link looks like this: https://www.biblestudytools.com/bla/2-corintios/
+            link = book['link']
+            # remove trailing forward slash
+            link = link.strip('/')
+            # remove everything before the book name/code
+            book_name = link[link.rfind('/') + 1:]
+            # replace dash with space
+            book_name = book_name.replace('-', ' ')
+            books.append(book_name)
        return books

    def get_bibles_from_http(self):
@ -572,29 +579,16 @@ class CWExtract(RegistryProperties):
        returns a list in the form [(biblename, biblekey, language_code)]
        """
        log.debug('CWExtract.get_bibles_from_http')
-        bible_url = 'http://www.biblestudytools.com/bible-versions/'
+        bible_url = 'https://www.biblestudytools.com/'
        soup = get_soup_for_bible_ref(bible_url)
        if not soup:
            return None
-        # Get all <div class="col-md-12"> on the page
-        content_column = soup.find('div', id='content-column')
-        if not content_column:
-            log.error('No div[id=content-column] -- the site must have changed')
-            return None
-        col_md_12_divs = content_column.find_all('div', 'col-md-12')
-        if not col_md_12_divs:
-            log.error('No div[class=col-md-12] -- the site must have changed')
-            return None
+        # Get all <option class="log-translation" ...> on the page
+        options = soup.find_all('option', {'class': 'log-translation'})
        bibles = []
-        for col_md_12 in col_md_12_divs:
-            # Check if <a><strong><span class="text-muted"> is a direct descendant
-            if not col_md_12.a or not col_md_12.a.strong or not col_md_12.a.strong.span or \
-                    'text-muted' not in col_md_12.a.strong.span['class']:
-                continue
-            short_name = str(col_md_12.a.strong.span.string).strip().lower()
-            if not short_name:
-                continue
-            tag_text = str(col_md_12.a.strong.contents[0]).strip()
+        for option in options:
+            short_name = option['value']
+            tag_text = str(option.contents[0]).strip()
            # The names of non-english bibles has their language in parentheses at the end
            if tag_text.endswith(')'):
                language = tag_text[tag_text.rfind('(') + 1:-1]
--- a/openlp/plugins/songs/lib/importers/easyslides.py
+++ b/openlp/plugins/songs/lib/importers/easyslides.py
@ -51,18 +51,18 @@ class EasySlidesImport(SongImport):
        parser = etree.XMLParser(remove_blank_text=True, recover=True)
        try:
            with self.import_source.open('r') as xml_file:
-                parsed_file = etree.parse(str(self.import_source), parser)
+                parsed_file = etree.parse(xml_file, parser)
        except etree.XMLSyntaxError:
-            log.exception('XML syntax error in file {name}'.format(name=xml_file))
+            log.exception('XML syntax error in file {name}'.format(name=self.import_source))
            self.log_error(self.import_source, SongStrings.XMLSyntaxError)
            return
        except UnicodeDecodeError:
-            log.exception('Unreadable characters in {name}'.format(name=xml_file))
+            log.exception('Unreadable characters in {name}'.format(name=self.import_source))
            self.log_error(self.import_source, SongStrings.XMLSyntaxError)
            return
        file_str = etree.tostring(parsed_file)
        if not file_str:
-            log.exception('Could not find XML in file {name}'.format(name=xml_file))
+            log.exception('Could not find XML in file {name}'.format(name=self.import_source))
            self.log_error(self.import_source, SongStrings.XMLSyntaxError)
            return
        xml = file_str.decode()