forked from openlp/openlp
Use BibleGateway standard site instead of the legacy site. Fixes bug 1562384.
Update Crosswalk webpage parser to match new layout. Fixes bug 1599999.
bzr-revno: 2644
Fixes: https://launchpad.net/bugs/1562384, https://launchpad.net/bugs/1599999
This commit is contained in:
commit
7d52332074
@ -248,7 +248,7 @@ class BGExtract(RegistryProperties):
|
|||||||
url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
|
url_book_name = urllib.parse.quote(book_name.encode("utf-8"))
|
||||||
url_params = 'search=%s+%s&version=%s' % (url_book_name, chapter, version)
|
url_params = 'search=%s+%s&version=%s' % (url_book_name, chapter, version)
|
||||||
soup = get_soup_for_bible_ref(
|
soup = get_soup_for_bible_ref(
|
||||||
'http://legacy.biblegateway.com/passage/?%s' % url_params,
|
'http://biblegateway.com/passage/?%s' % url_params,
|
||||||
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
|
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
|
||||||
if not soup:
|
if not soup:
|
||||||
return None
|
return None
|
||||||
@ -277,7 +277,7 @@ class BGExtract(RegistryProperties):
|
|||||||
"""
|
"""
|
||||||
log.debug('BGExtract.get_books_from_http("%s")', version)
|
log.debug('BGExtract.get_books_from_http("%s")', version)
|
||||||
url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '%s' % version})
|
url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '%s' % version})
|
||||||
reference_url = 'http://legacy.biblegateway.com/versions/?%s#books' % url_params
|
reference_url = 'http://biblegateway.com/versions/?%s#books' % url_params
|
||||||
page = get_web_page(reference_url)
|
page = get_web_page(reference_url)
|
||||||
if not page:
|
if not page:
|
||||||
send_error_message('download')
|
send_error_message('download')
|
||||||
@ -308,7 +308,7 @@ class BGExtract(RegistryProperties):
|
|||||||
for book in content:
|
for book in content:
|
||||||
book = book.find('td')
|
book = book.find('td')
|
||||||
if book:
|
if book:
|
||||||
books.append(book.contents[0])
|
books.append(book.contents[1])
|
||||||
return books
|
return books
|
||||||
|
|
||||||
def get_bibles_from_http(self):
|
def get_bibles_from_http(self):
|
||||||
@ -318,11 +318,11 @@ class BGExtract(RegistryProperties):
|
|||||||
returns a list in the form [(biblename, biblekey, language_code)]
|
returns a list in the form [(biblename, biblekey, language_code)]
|
||||||
"""
|
"""
|
||||||
log.debug('BGExtract.get_bibles_from_http')
|
log.debug('BGExtract.get_bibles_from_http')
|
||||||
bible_url = 'https://legacy.biblegateway.com/versions/'
|
bible_url = 'https://biblegateway.com/versions/'
|
||||||
soup = get_soup_for_bible_ref(bible_url)
|
soup = get_soup_for_bible_ref(bible_url)
|
||||||
if not soup:
|
if not soup:
|
||||||
return None
|
return None
|
||||||
bible_select = soup.find('select', {'class': 'translation-dropdown'})
|
bible_select = soup.find('select', {'class': 'search-translation-select'})
|
||||||
if not bible_select:
|
if not bible_select:
|
||||||
log.debug('No select tags found - did site change?')
|
log.debug('No select tags found - did site change?')
|
||||||
return None
|
return None
|
||||||
@ -520,28 +520,26 @@ class CWExtract(RegistryProperties):
|
|||||||
returns a list in the form [(biblename, biblekey, language_code)]
|
returns a list in the form [(biblename, biblekey, language_code)]
|
||||||
"""
|
"""
|
||||||
log.debug('CWExtract.get_bibles_from_http')
|
log.debug('CWExtract.get_bibles_from_http')
|
||||||
bible_url = 'http://www.biblestudytools.com/'
|
bible_url = 'http://www.biblestudytools.com/bible-versions/'
|
||||||
soup = get_soup_for_bible_ref(bible_url)
|
soup = get_soup_for_bible_ref(bible_url)
|
||||||
if not soup:
|
if not soup:
|
||||||
return None
|
return None
|
||||||
bible_select = soup.find('select')
|
h4_tags = soup.find_all('h4', {'class': 'small-header'})
|
||||||
if not bible_select:
|
if not h4_tags:
|
||||||
log.debug('No select tags found - did site change?')
|
log.debug('No h4 tags found - did site change?')
|
||||||
return None
|
|
||||||
option_tags = bible_select.find_all('option', {'class': 'log-translation'})
|
|
||||||
if not option_tags:
|
|
||||||
log.debug('No option tags found - did site change?')
|
|
||||||
return None
|
return None
|
||||||
bibles = []
|
bibles = []
|
||||||
for ot in option_tags:
|
for h4t in h4_tags:
|
||||||
tag_text = ot.get_text().strip()
|
short_name = None
|
||||||
try:
|
if h4t.span:
|
||||||
tag_value = ot['value']
|
short_name = h4t.span.get_text().strip().lower()
|
||||||
except KeyError:
|
else:
|
||||||
log.exception('No value attribute found - did site change?')
|
log.error('No span tag found - did site change?')
|
||||||
return None
|
return None
|
||||||
if not tag_value:
|
if not short_name:
|
||||||
continue
|
continue
|
||||||
|
h4t.span.extract()
|
||||||
|
tag_text = h4t.get_text().strip()
|
||||||
# The names of non-English bibles have their language in parentheses at the end
|
# The names of non-English bibles have their language in parentheses at the end
|
||||||
if tag_text.endswith(')'):
|
if tag_text.endswith(')'):
|
||||||
language = tag_text[tag_text.rfind('(') + 1:-1]
|
language = tag_text[tag_text.rfind('(') + 1:-1]
|
||||||
@ -549,12 +547,20 @@ class CWExtract(RegistryProperties):
|
|||||||
language_code = CROSSWALK_LANGUAGES[language]
|
language_code = CROSSWALK_LANGUAGES[language]
|
||||||
else:
|
else:
|
||||||
language_code = ''
|
language_code = ''
|
||||||
# ... except for the latin vulgate
|
# ... except for those that don't...
|
||||||
elif 'latin' in tag_text.lower():
|
elif 'latin' in tag_text.lower():
|
||||||
language_code = 'la'
|
language_code = 'la'
|
||||||
|
elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
|
||||||
|
language_code = 'es'
|
||||||
|
elif 'chinese' in tag_text.lower():
|
||||||
|
language_code = 'zh'
|
||||||
|
elif 'greek' in tag_text.lower():
|
||||||
|
language_code = 'el'
|
||||||
|
elif 'nova' in tag_text.lower():
|
||||||
|
language_code = 'pt'
|
||||||
else:
|
else:
|
||||||
language_code = 'en'
|
language_code = 'en'
|
||||||
bibles.append((tag_text, tag_value, language_code))
|
bibles.append((tag_text, short_name, language_code))
|
||||||
return bibles
|
return bibles
|
||||||
|
|
||||||
|
|
||||||
|
@ -50,7 +50,8 @@ class TestBibleHTTP(TestCase):
|
|||||||
books = handler.get_books_from_http('NIV')
|
books = handler.get_books_from_http('NIV')
|
||||||
|
|
||||||
# THEN: We should get back a valid service item
|
# THEN: We should get back a valid service item
|
||||||
assert len(books) == 66, 'The bible should not have had any books added or removed'
|
self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
|
||||||
|
self.assertEqual(books[0], 'Genesis', 'The first bible book should be Genesis')
|
||||||
|
|
||||||
def bible_gateway_extract_books_support_redirect_test(self):
|
def bible_gateway_extract_books_support_redirect_test(self):
|
||||||
"""
|
"""
|
||||||
@ -63,7 +64,7 @@ class TestBibleHTTP(TestCase):
|
|||||||
books = handler.get_books_from_http('DN1933')
|
books = handler.get_books_from_http('DN1933')
|
||||||
|
|
||||||
# THEN: We should get back a valid service item
|
# THEN: We should get back a valid service item
|
||||||
assert len(books) == 66, 'This bible should have 66 books'
|
self.assertEqual(len(books), 66, 'This bible should have 66 books')
|
||||||
|
|
||||||
def bible_gateway_extract_verse_test(self):
|
def bible_gateway_extract_verse_test(self):
|
||||||
"""
|
"""
|
||||||
@ -76,7 +77,8 @@ class TestBibleHTTP(TestCase):
|
|||||||
results = handler.get_bible_chapter('NIV', 'John', 3)
|
results = handler.get_bible_chapter('NIV', 'John', 3)
|
||||||
|
|
||||||
# THEN: We should get back a valid service item
|
# THEN: We should get back a valid service item
|
||||||
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
|
self.assertEqual(len(results.verse_list), 36,
|
||||||
|
'The book of John should not have had any verses added or removed')
|
||||||
|
|
||||||
def bible_gateway_extract_verse_nkjv_test(self):
|
def bible_gateway_extract_verse_nkjv_test(self):
|
||||||
"""
|
"""
|
||||||
@ -89,7 +91,8 @@ class TestBibleHTTP(TestCase):
|
|||||||
results = handler.get_bible_chapter('NKJV', 'John', 3)
|
results = handler.get_bible_chapter('NKJV', 'John', 3)
|
||||||
|
|
||||||
# THEN: We should get back a valid service item
|
# THEN: We should get back a valid service item
|
||||||
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
|
self.assertEqual(len(results.verse_list), 36,
|
||||||
|
'The book of John should not have had any verses added or removed')
|
||||||
|
|
||||||
def crosswalk_extract_books_test(self):
|
def crosswalk_extract_books_test(self):
|
||||||
"""
|
"""
|
||||||
@ -102,7 +105,7 @@ class TestBibleHTTP(TestCase):
|
|||||||
books = handler.get_books_from_http('niv')
|
books = handler.get_books_from_http('niv')
|
||||||
|
|
||||||
# THEN: We should get back a valid service item
|
# THEN: We should get back a valid service item
|
||||||
assert len(books) == 66, 'The bible should not have had any books added or removed'
|
self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
|
||||||
|
|
||||||
def crosswalk_extract_verse_test(self):
|
def crosswalk_extract_verse_test(self):
|
||||||
"""
|
"""
|
||||||
@ -115,7 +118,8 @@ class TestBibleHTTP(TestCase):
|
|||||||
results = handler.get_bible_chapter('niv', 'john', 3)
|
results = handler.get_bible_chapter('niv', 'john', 3)
|
||||||
|
|
||||||
# THEN: We should get back a valid service item
|
# THEN: We should get back a valid service item
|
||||||
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
|
self.assertEqual(len(results.verse_list), 36,
|
||||||
|
'The book of John should not have had any verses added or removed')
|
||||||
|
|
||||||
def bibleserver_get_bibles_test(self):
|
def bibleserver_get_bibles_test(self):
|
||||||
"""
|
"""
|
||||||
@ -144,7 +148,7 @@ class TestBibleHTTP(TestCase):
|
|||||||
|
|
||||||
# THEN: The list should not be None, and some known bibles should be there
|
# THEN: The list should not be None, and some known bibles should be there
|
||||||
self.assertIsNotNone(bibles)
|
self.assertIsNotNone(bibles)
|
||||||
self.assertIn(('Holman Christian Standard Bible', 'HCSB', 'en'), bibles)
|
self.assertIn(('Holman Christian Standard Bible (HCSB)', 'HCSB', 'en'), bibles)
|
||||||
|
|
||||||
def crosswalk_get_bibles_test(self):
|
def crosswalk_get_bibles_test(self):
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user