Use BibleGateway standard site instead of the legacy site. Fixes bug 1562384.

Update Crosswalk webpage parser to match new layout. Fixes bug 1599999.

bzr-revno: 2644
Fixes: https://launchpad.net/bugs/1562384, https://launchpad.net/bugs/1599999
This commit is contained in:
second@tgc.dk 2016-07-27 22:42:37 +02:00 committed by Tomas Groth
commit 7d52332074
2 changed files with 39 additions and 29 deletions

View File

@ -248,7 +248,7 @@ class BGExtract(RegistryProperties):
url_book_name = urllib.parse.quote(book_name.encode("utf-8")) url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
url_params = 'search=%s+%s&version=%s' % (url_book_name, chapter, version) url_params = 'search=%s+%s&version=%s' % (url_book_name, chapter, version)
soup = get_soup_for_bible_ref( soup = get_soup_for_bible_ref(
'http://legacy.biblegateway.com/passage/?%s' % url_params, 'http://biblegateway.com/passage/?%s' % url_params,
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='') pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
if not soup: if not soup:
return None return None
@ -277,7 +277,7 @@ class BGExtract(RegistryProperties):
""" """
log.debug('BGExtract.get_books_from_http("%s")', version) log.debug('BGExtract.get_books_from_http("%s")', version)
url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '%s' % version}) url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '%s' % version})
reference_url = 'http://legacy.biblegateway.com/versions/?%s#books' % url_params reference_url = 'http://biblegateway.com/versions/?%s#books' % url_params
page = get_web_page(reference_url) page = get_web_page(reference_url)
if not page: if not page:
send_error_message('download') send_error_message('download')
@ -308,7 +308,7 @@ class BGExtract(RegistryProperties):
for book in content: for book in content:
book = book.find('td') book = book.find('td')
if book: if book:
books.append(book.contents[0]) books.append(book.contents[1])
return books return books
def get_bibles_from_http(self): def get_bibles_from_http(self):
@ -318,11 +318,11 @@ class BGExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)] returns a list in the form [(biblename, biblekey, language_code)]
""" """
log.debug('BGExtract.get_bibles_from_http') log.debug('BGExtract.get_bibles_from_http')
bible_url = 'https://legacy.biblegateway.com/versions/' bible_url = 'https://biblegateway.com/versions/'
soup = get_soup_for_bible_ref(bible_url) soup = get_soup_for_bible_ref(bible_url)
if not soup: if not soup:
return None return None
bible_select = soup.find('select', {'class': 'translation-dropdown'}) bible_select = soup.find('select', {'class': 'search-translation-select'})
if not bible_select: if not bible_select:
log.debug('No select tags found - did site change?') log.debug('No select tags found - did site change?')
return None return None
@ -520,28 +520,26 @@ class CWExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)] returns a list in the form [(biblename, biblekey, language_code)]
""" """
log.debug('CWExtract.get_bibles_from_http') log.debug('CWExtract.get_bibles_from_http')
bible_url = 'http://www.biblestudytools.com/' bible_url = 'http://www.biblestudytools.com/bible-versions/'
soup = get_soup_for_bible_ref(bible_url) soup = get_soup_for_bible_ref(bible_url)
if not soup: if not soup:
return None return None
bible_select = soup.find('select') h4_tags = soup.find_all('h4', {'class': 'small-header'})
if not bible_select: if not h4_tags:
log.debug('No select tags found - did site change?') log.debug('No h4 tags found - did site change?')
return None
option_tags = bible_select.find_all('option', {'class': 'log-translation'})
if not option_tags:
log.debug('No option tags found - did site change?')
return None return None
bibles = [] bibles = []
for ot in option_tags: for h4t in h4_tags:
tag_text = ot.get_text().strip() short_name = None
try: if h4t.span:
tag_value = ot['value'] short_name = h4t.span.get_text().strip().lower()
except KeyError: else:
log.exception('No value attribute found - did site change?') log.error('No span tag found - did site change?')
return None return None
if not tag_value: if not short_name:
continue continue
h4t.span.extract()
tag_text = h4t.get_text().strip()
# The names of non-english bibles has their language in parentheses at the end # The names of non-english bibles has their language in parentheses at the end
if tag_text.endswith(')'): if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1] language = tag_text[tag_text.rfind('(') + 1:-1]
@ -549,12 +547,20 @@ class CWExtract(RegistryProperties):
language_code = CROSSWALK_LANGUAGES[language] language_code = CROSSWALK_LANGUAGES[language]
else: else:
language_code = '' language_code = ''
# ... except for the latin vulgate # ... except for those that don't...
elif 'latin' in tag_text.lower(): elif 'latin' in tag_text.lower():
language_code = 'la' language_code = 'la'
elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
language_code = 'es'
elif 'chinese' in tag_text.lower():
language_code = 'zh'
elif 'greek' in tag_text.lower():
language_code = 'el'
elif 'nova' in tag_text.lower():
language_code = 'pt'
else: else:
language_code = 'en' language_code = 'en'
bibles.append((tag_text, tag_value, language_code)) bibles.append((tag_text, short_name, language_code))
return bibles return bibles

View File

@ -50,7 +50,8 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('NIV') books = handler.get_books_from_http('NIV')
# THEN: We should get back a valid service item # THEN: We should get back a valid service item
assert len(books) == 66, 'The bible should not have had any books added or removed' self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
self.assertEqual(books[0], 'Genesis', 'The first bible book should be Genesis')
def bible_gateway_extract_books_support_redirect_test(self): def bible_gateway_extract_books_support_redirect_test(self):
""" """
@ -63,7 +64,7 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('DN1933') books = handler.get_books_from_http('DN1933')
# THEN: We should get back a valid service item # THEN: We should get back a valid service item
assert len(books) == 66, 'This bible should have 66 books' self.assertEqual(len(books), 66, 'This bible should have 66 books')
def bible_gateway_extract_verse_test(self): def bible_gateway_extract_verse_test(self):
""" """
@ -76,7 +77,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('NIV', 'John', 3) results = handler.get_bible_chapter('NIV', 'John', 3)
# THEN: We should get back a valid service item # THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed' self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def bible_gateway_extract_verse_nkjv_test(self): def bible_gateway_extract_verse_nkjv_test(self):
""" """
@ -89,7 +91,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('NKJV', 'John', 3) results = handler.get_bible_chapter('NKJV', 'John', 3)
# THEN: We should get back a valid service item # THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed' self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def crosswalk_extract_books_test(self): def crosswalk_extract_books_test(self):
""" """
@ -102,7 +105,7 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('niv') books = handler.get_books_from_http('niv')
# THEN: We should get back a valid service item # THEN: We should get back a valid service item
assert len(books) == 66, 'The bible should not have had any books added or removed' self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
def crosswalk_extract_verse_test(self): def crosswalk_extract_verse_test(self):
""" """
@ -115,7 +118,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('niv', 'john', 3) results = handler.get_bible_chapter('niv', 'john', 3)
# THEN: We should get back a valid service item # THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed' self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def bibleserver_get_bibles_test(self): def bibleserver_get_bibles_test(self):
""" """
@ -144,7 +148,7 @@ class TestBibleHTTP(TestCase):
# THEN: The list should not be None, and some known bibles should be there # THEN: The list should not be None, and some known bibles should be there
self.assertIsNotNone(bibles) self.assertIsNotNone(bibles)
self.assertIn(('Holman Christian Standard Bible', 'HCSB', 'en'), bibles) self.assertIn(('Holman Christian Standard Bible (HCSB)', 'HCSB', 'en'), bibles)
def crosswalk_get_bibles_test(self): def crosswalk_get_bibles_test(self):
""" """