Fix the CrossWalk Biblestudytools.com importer

This commit is contained in:
Raoul Snyman 2020-09-25 23:13:10 -07:00
parent 844399b54f
commit 7734ce89a0
No known key found for this signature in database
GPG Key ID: 7347E1FA47B16091
2 changed files with 45 additions and 42 deletions

View File

@ -531,7 +531,7 @@ class CWExtract(RegistryProperties):
verses = {} verses = {}
for verse in verses_div: for verse in verses_div:
self.application.process_events() self.application.process_events()
verse_number = int(verse.find('strong').contents[0]) verse_number = int(verse.find('span', 'verse-number').strong.contents[0])
verse_span = verse.find('span', class_='verse-%d' % verse_number) verse_span = verse.find('span', class_='verse-%d' % verse_number)
tags_to_remove = verse_span.find_all(['a', 'sup']) tags_to_remove = verse_span.find_all(['a', 'sup'])
for tag in tags_to_remove: for tag in tags_to_remove:
@ -576,22 +576,25 @@ class CWExtract(RegistryProperties):
soup = get_soup_for_bible_ref(bible_url) soup = get_soup_for_bible_ref(bible_url)
if not soup: if not soup:
return None return None
h4_tags = soup.find_all('h4', {'class': 'small-header'}) # Get all <div class="col-md-12"> on the page
if not h4_tags: content_column = soup.find('div', id='content-column')
log.debug('No h4 tags found - did site change?') if not content_column:
log.error('No div[id=content-column] -- the site must have changed')
return None
col_md_12_divs = content_column.find_all('div', 'col-md-12')
if not col_md_12_divs:
log.error('No div[class=col-md-12] -- the site must have changed')
return None return None
bibles = [] bibles = []
for h4t in h4_tags: for col_md_12 in col_md_12_divs:
short_name = None # Check if <a><strong><span class="text-muted"> is a direct descendant
if h4t.span: if not col_md_12.a or not col_md_12.a.strong or not col_md_12.a.strong.span or \
short_name = h4t.span.get_text().strip().lower() 'text-muted' not in col_md_12.a.strong.span['class']:
else: continue
log.error('No span tag found - did site change?') short_name = str(col_md_12.a.strong.span.string).strip().lower()
return None
if not short_name: if not short_name:
continue continue
h4t.span.extract() tag_text = str(col_md_12.a.strong.contents[0]).strip()
tag_text = h4t.get_text().strip()
# The names of non-English bibles have their language in parentheses at the end # The names of non-English bibles have their language in parentheses at the end
if tag_text.endswith(')'): if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1] language = tag_text[tag_text.rfind('(') + 1:-1]

View File

@ -124,6 +124,35 @@ class TestBibleHTTP(TestCase):
# THEN: We should get back a valid service item # THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed' assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
def test_crosswalk_get_bibles(self):
"""
Test getting list of bibles from Crosswalk.com
"""
# GIVEN: A new Crosswalk extraction class
handler = CWExtract()
# WHEN: downloading bible list from Crosswalk
bibles = handler.get_bibles_from_http()
# THEN: The list should not be None, and some known bibles should be there
assert bibles is not None
assert ('Giovanni Diodati 1649 (Italian)', 'gdb', 'it') in bibles
def test_crosswalk_get_verse_text(self):
"""
Test verse text from Crosswalk.com
"""
# GIVEN: A new Crosswalk extraction class
handler = CWExtract()
# WHEN: downloading NIV Genesis from Crosswalk
niv_genesis_chapter_one = handler.get_bible_chapter('niv', 'Genesis', 1)
# THEN: The verse list should contain the verses
assert niv_genesis_chapter_one.has_verse_list() is True
assert 'In the beginning God created the heavens and the earth.' == niv_genesis_chapter_one.verse_list[1], \
'The first chapter of genesis should have been fetched.'
def test_bibleserver_get_bibles(self): def test_bibleserver_get_bibles(self):
""" """
Test getting list of bibles from BibleServer.com Test getting list of bibles from BibleServer.com
@ -167,32 +196,3 @@ class TestBibleHTTP(TestCase):
# THEN: The list should not be None, and some known bibles should be there # THEN: The list should not be None, and some known bibles should be there
assert bibles is not None assert bibles is not None
assert ('Holman Christian Standard Bible (HCSB)', 'HCSB', 'en') in bibles assert ('Holman Christian Standard Bible (HCSB)', 'HCSB', 'en') in bibles
def test_crosswalk_get_bibles(self):
"""
Test getting list of bibles from Crosswalk.com
"""
# GIVEN: A new Crosswalk extraction class
handler = CWExtract()
# WHEN: downloading bible list from Crosswalk
bibles = handler.get_bibles_from_http()
# THEN: The list should not be None, and some known bibles should be there
assert bibles is not None
assert ('Giovanni Diodati 1649 (Italian)', 'gdb', 'it') in bibles
def test_crosswalk_get_verse_text(self):
"""
Test verse text from Crosswalk.com
"""
# GIVEN: A new Crosswalk extraction class
handler = CWExtract()
# WHEN: downloading NIV Genesis from Crosswalk
niv_genesis_chapter_one = handler.get_bible_chapter('niv', 'Genesis', 1)
# THEN: The verse list should contain the verses
assert niv_genesis_chapter_one.has_verse_list() is True
assert 'In the beginning God created the heavens and the earth.' == niv_genesis_chapter_one.verse_list[1], \
'The first chapter of genesis should have been fetched.'