Use BibleGateway standard site instead of the legacy site, fixes bug 1562384. Update Crosswalk webpage parser to match new layout, fixes bug 1599999.

Fixes: https://launchpad.net/bugs/1562384, https://launchpad.net/bugs/1599999
This commit is contained in:
Tomas Groth 2016-07-27 21:27:14 +02:00
parent 37e92d00a2
commit a3a113021b
2 changed files with 39 additions and 29 deletions

View File

@ -248,7 +248,7 @@ class BGExtract(RegistryProperties):
url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
url_params = 'search=%s+%s&version=%s' % (url_book_name, chapter, version)
soup = get_soup_for_bible_ref(
'http://legacy.biblegateway.com/passage/?%s' % url_params,
'http://biblegateway.com/passage/?%s' % url_params,
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
if not soup:
return None
@ -277,7 +277,7 @@ class BGExtract(RegistryProperties):
"""
log.debug('BGExtract.get_books_from_http("%s")', version)
url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '%s' % version})
reference_url = 'http://legacy.biblegateway.com/versions/?%s#books' % url_params
reference_url = 'http://biblegateway.com/versions/?%s#books' % url_params
page = get_web_page(reference_url)
if not page:
send_error_message('download')
@ -308,7 +308,7 @@ class BGExtract(RegistryProperties):
for book in content:
book = book.find('td')
if book:
books.append(book.contents[0])
books.append(book.contents[1])
return books
def get_bibles_from_http(self):
@ -318,11 +318,11 @@ class BGExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('BGExtract.get_bibles_from_http')
bible_url = 'https://legacy.biblegateway.com/versions/'
bible_url = 'https://biblegateway.com/versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_select = soup.find('select', {'class': 'translation-dropdown'})
bible_select = soup.find('select', {'class': 'search-translation-select'})
if not bible_select:
log.debug('No select tags found - did site change?')
return None
@ -520,28 +520,26 @@ class CWExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('CWExtract.get_bibles_from_http')
bible_url = 'http://www.biblestudytools.com/'
bible_url = 'http://www.biblestudytools.com/bible-versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_select = soup.find('select')
if not bible_select:
log.debug('No select tags found - did site change?')
return None
option_tags = bible_select.find_all('option', {'class': 'log-translation'})
if not option_tags:
log.debug('No option tags found - did site change?')
h4_tags = soup.find_all('h4', {'class': 'small-header'})
if not h4_tags:
log.debug('No h4 tags found - did site change?')
return None
bibles = []
for ot in option_tags:
tag_text = ot.get_text().strip()
try:
tag_value = ot['value']
except KeyError:
log.exception('No value attribute found - did site change?')
for h4t in h4_tags:
short_name = None
if h4t.span:
short_name = h4t.span.get_text().strip().lower()
else:
log.error('No span tag found - did site change?')
return None
if not tag_value:
if not short_name:
continue
h4t.span.extract()
tag_text = h4t.get_text().strip()
# The names of non-english bibles has their language in parentheses at the end
if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1]
@ -549,12 +547,20 @@ class CWExtract(RegistryProperties):
language_code = CROSSWALK_LANGUAGES[language]
else:
language_code = ''
# ... except for the latin vulgate
# ... except for those that don't...
elif 'latin' in tag_text.lower():
language_code = 'la'
elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
language_code = 'es'
elif 'chinese' in tag_text.lower():
language_code = 'zh'
elif 'greek' in tag_text.lower():
language_code = 'el'
elif 'nova' in tag_text.lower():
language_code = 'pt'
else:
language_code = 'en'
bibles.append((tag_text, tag_value, language_code))
bibles.append((tag_text, short_name, language_code))
return bibles

View File

@ -50,7 +50,8 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('NIV')
# THEN: We should get back a valid service item
assert len(books) == 66, 'The bible should not have had any books added or removed'
self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
self.assertEqual(books[0], 'Genesis', 'The first bible book should be Genesis')
def bible_gateway_extract_books_support_redirect_test(self):
"""
@ -63,7 +64,7 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('DN1933')
# THEN: We should get back a valid service item
assert len(books) == 66, 'This bible should have 66 books'
self.assertEqual(len(books), 66, 'This bible should have 66 books')
def bible_gateway_extract_verse_test(self):
"""
@ -76,7 +77,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('NIV', 'John', 3)
# THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def bible_gateway_extract_verse_nkjv_test(self):
"""
@ -89,7 +91,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('NKJV', 'John', 3)
# THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def crosswalk_extract_books_test(self):
"""
@ -102,7 +105,7 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('niv')
# THEN: We should get back a valid service item
assert len(books) == 66, 'The bible should not have had any books added or removed'
self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
def crosswalk_extract_verse_test(self):
"""
@ -115,7 +118,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('niv', 'john', 3)
# THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def bibleserver_get_bibles_test(self):
"""
@ -144,7 +148,7 @@ class TestBibleHTTP(TestCase):
# THEN: The list should not be None, and some known bibles should be there
self.assertIsNotNone(bibles)
self.assertIn(('Holman Christian Standard Bible', 'HCSB', 'en'), bibles)
self.assertIn(('Holman Christian Standard Bible (HCSB)', 'HCSB', 'en'), bibles)
def crosswalk_get_bibles_test(self):
"""