From 7734ce89a085ee9aac21016d798b63fb7bfdfc98 Mon Sep 17 00:00:00 2001 From: Raoul Snyman Date: Fri, 25 Sep 2020 23:13:10 -0700 Subject: [PATCH] Fix the CrossWalk Biblestudytools.com importer --- openlp/plugins/bibles/lib/importers/http.py | 29 +++++----- .../openlp_plugins/bibles/test_lib_http.py | 58 +++++++++---------- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/openlp/plugins/bibles/lib/importers/http.py b/openlp/plugins/bibles/lib/importers/http.py index 3bd132824..332e1cef0 100644 --- a/openlp/plugins/bibles/lib/importers/http.py +++ b/openlp/plugins/bibles/lib/importers/http.py @@ -531,7 +531,7 @@ class CWExtract(RegistryProperties): verses = {} for verse in verses_div: self.application.process_events() - verse_number = int(verse.find('strong').contents[0]) + verse_number = int(verse.find('span', 'verse-number').strong.contents[0]) verse_span = verse.find('span', class_='verse-%d' % verse_number) tags_to_remove = verse_span.find_all(['a', 'sup']) for tag in tags_to_remove: @@ -576,22 +576,25 @@ class CWExtract(RegistryProperties): soup = get_soup_for_bible_ref(bible_url) if not soup: return None - h4_tags = soup.find_all('h4', {'class': 'small-header'}) - if not h4_tags: - log.debug('No h4 tags found - did site change?') + # Get all
on the page + content_column = soup.find('div', id='content-column') + if not content_column: + log.error('No div[id=content-column] -- the site must have changed') + return None + col_md_12_divs = content_column.find_all('div', 'col-md-12') + if not col_md_12_divs: + log.error('No div[class=col-md-12] -- the site must have changed') return None bibles = [] - for h4t in h4_tags: - short_name = None - if h4t.span: - short_name = h4t.span.get_text().strip().lower() - else: - log.error('No span tag found - did site change?') - return None + for col_md_12 in col_md_12_divs: + # Check if is a direct descendant + if not col_md_12.a or not col_md_12.a.strong or not col_md_12.a.strong.span or \ + 'text-muted' not in col_md_12.a.strong.span['class']: + continue + short_name = str(col_md_12.a.strong.span.string).strip().lower() if not short_name: continue - h4t.span.extract() - tag_text = h4t.get_text().strip() + tag_text = str(col_md_12.a.strong.contents[0]).strip() # The names of non-english bibles has their language in parentheses at the end if tag_text.endswith(')'): language = tag_text[tag_text.rfind('(') + 1:-1] diff --git a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py index 703de1f03..50ac885ab 100644 --- a/tests/interfaces/openlp_plugins/bibles/test_lib_http.py +++ b/tests/interfaces/openlp_plugins/bibles/test_lib_http.py @@ -124,6 +124,35 @@ class TestBibleHTTP(TestCase): # THEN: We should get back a valid service item assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed' + def test_crosswalk_get_bibles(self): + """ + Test getting list of bibles from Crosswalk.com + """ + # GIVEN: A new Crosswalk extraction class + handler = CWExtract() + + # WHEN: downloading bible list from Crosswalk + bibles = handler.get_bibles_from_http() + + # THEN: The list should not be None, and some known bibles should be there + assert bibles is not None + assert ('Giovanni Diodati 1649 (Italian)', 'gdb', 'it') in bibles + + def test_crosswalk_get_verse_text(self): + """ + Test verse text from Crosswalk.com + """ + # GIVEN: A new Crosswalk extraction class + handler = CWExtract() + + # WHEN: downloading NIV Genesis from Crosswalk + niv_genesis_chapter_one = handler.get_bible_chapter('niv', 'Genesis', 1) + + # THEN: The verse list should contain the verses + assert niv_genesis_chapter_one.has_verse_list() is True + assert 'In the beginning God created the heavens and the earth.' == niv_genesis_chapter_one.verse_list[1], \ + 'The first chapter of genesis should have been fetched.' + def test_bibleserver_get_bibles(self): """ Test getting list of bibles from BibleServer.com @@ -167,32 +196,3 @@ class TestBibleHTTP(TestCase): # THEN: The list should not be None, and some known bibles should be there assert bibles is not None assert ('Holman Christian Standard Bible (HCSB)', 'HCSB', 'en') in bibles - - def test_crosswalk_get_bibles(self): - """ - Test getting list of bibles from Crosswalk.com - """ - # GIVEN: A new Crosswalk extraction class - handler = CWExtract() - - # WHEN: downloading bible list from Crosswalk - bibles = handler.get_bibles_from_http() - - # THEN: The list should not be None, and some known bibles should be there - assert bibles is not None - assert ('Giovanni Diodati 1649 (Italian)', 'gdb', 'it') in bibles - - def test_crosswalk_get_verse_text(self): - """ - Test verse text from Crosswalk.com - """ - # GIVEN: A new Crosswalk extraction class - handler = CWExtract() - - # WHEN: downloading NIV Genesis from Crosswalk - niv_genesis_chapter_one = handler.get_bible_chapter('niv', 'Genesis', 1) - - # THEN: The verse list should contain the verses - assert niv_genesis_chapter_one.has_verse_list() is True - assert 'In the beginning God created the heavens and the earth.' == niv_genesis_chapter_one.verse_list[1], \ - 'The first chapter of genesis should have been fetched.'