Use BibleGateway standard site instead of the legacy site. Fixes bug 1562384.

Update Crosswalk webpage parser to match new layout. Fixes bug 1599999.
Make easyslide importer try to recover when reading non-standard xml. Fixes bug 1588822.
Fix handeling of control chars and escaped chars in VideoPsalm import. Fixes bug 1594945.

bzr-revno: 2683
Fixes: https://launchpad.net/bugs/1562384, https://launchpad.net/bugs/1588822, https://launchpad.net/bugs/1594945, https://launchpad.net/bugs/1599999
This commit is contained in:
second@tgc.dk 2016-07-27 21:11:22 +02:00 committed by Tomas Groth
commit 291c9c8902
7 changed files with 132 additions and 31 deletions

View File

@ -252,7 +252,7 @@ class BGExtract(RegistryProperties):
chapter=chapter,
version=version)
soup = get_soup_for_bible_ref(
'http://legacy.biblegateway.com/passage/?{url}'.format(url=url_params),
'http://biblegateway.com/passage/?{url}'.format(url=url_params),
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='')
if not soup:
return None
@ -281,7 +281,7 @@ class BGExtract(RegistryProperties):
"""
log.debug('BGExtract.get_books_from_http("{version}")'.format(version=version))
url_params = urllib.parse.urlencode({'action': 'getVersionInfo', 'vid': '{version}'.format(version=version)})
reference_url = 'http://legacy.biblegateway.com/versions/?{url}#books'.format(url=url_params)
reference_url = 'http://biblegateway.com/versions/?{url}#books'.format(url=url_params)
page = get_web_page(reference_url)
if not page:
send_error_message('download')
@ -312,7 +312,7 @@ class BGExtract(RegistryProperties):
for book in content:
book = book.find('td')
if book:
books.append(book.contents[0])
books.append(book.contents[1])
return books
def get_bibles_from_http(self):
@ -322,11 +322,11 @@ class BGExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('BGExtract.get_bibles_from_http')
bible_url = 'https://legacy.biblegateway.com/versions/'
bible_url = 'https://biblegateway.com/versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_select = soup.find('select', {'class': 'translation-dropdown'})
bible_select = soup.find('select', {'class': 'search-translation-select'})
if not bible_select:
log.debug('No select tags found - did site change?')
return None
@ -532,28 +532,26 @@ class CWExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)]
"""
log.debug('CWExtract.get_bibles_from_http')
bible_url = 'http://www.biblestudytools.com/'
bible_url = 'http://www.biblestudytools.com/bible-versions/'
soup = get_soup_for_bible_ref(bible_url)
if not soup:
return None
bible_select = soup.find('select')
if not bible_select:
log.debug('No select tags found - did site change?')
return None
option_tags = bible_select.find_all('option', {'class': 'log-translation'})
if not option_tags:
log.debug('No option tags found - did site change?')
h4_tags = soup.find_all('h4', {'class': 'small-header'})
if not h4_tags:
log.debug('No h4 tags found - did site change?')
return None
bibles = []
for ot in option_tags:
tag_text = ot.get_text().strip()
try:
tag_value = ot['value']
except KeyError:
log.exception('No value attribute found - did site change?')
for h4t in h4_tags:
short_name = None
if h4t.span:
short_name = h4t.span.get_text().strip().lower()
else:
log.error('No span tag found - did site change?')
return None
if not tag_value:
if not short_name:
continue
h4t.span.extract()
tag_text = h4t.get_text().strip()
# The names of non-english bibles has their language in parentheses at the end
if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1]
@ -561,12 +559,20 @@ class CWExtract(RegistryProperties):
language_code = CROSSWALK_LANGUAGES[language]
else:
language_code = ''
# ... except for the latin vulgate
# ... except for those that don't...
elif 'latin' in tag_text.lower():
language_code = 'la'
elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
language_code = 'es'
elif 'chinese' in tag_text.lower():
language_code = 'zh'
elif 'greek' in tag_text.lower():
language_code = 'el'
elif 'nova' in tag_text.lower():
language_code = 'pt'
else:
language_code = 'en'
bibles.append((tag_text, tag_value, language_code))
bibles.append((tag_text, short_name, language_code))
return bibles

View File

@ -46,7 +46,7 @@ class EasySlidesImport(SongImport):
def do_import(self):
log.info('Importing EasySlides XML file {source}'.format(source=self.import_source))
parser = etree.XMLParser(remove_blank_text=True)
parser = etree.XMLParser(remove_blank_text=True, recover=True)
parsed_file = etree.parse(self.import_source, parser)
xml = etree.tostring(parsed_file).decode()
song_xml = objectify.fromstring(xml)

View File

@ -73,6 +73,14 @@ class VideoPsalmImport(SongImport):
processed_content += c
c = next(file_content_it)
processed_content += '"' + c
# Remove control characters
elif (c < chr(32)):
processed_content += ' '
# Handle escaped characters
elif c == '\\':
processed_content += c
c = next(file_content_it)
processed_content += c
else:
processed_content += c
songbook = json.loads(processed_content.strip())

View File

@ -43,3 +43,5 @@ class TestVideoPsalmFileImport(SongImportTestHelper):
"""
self.file_import(os.path.join(TEST_PATH, 'videopsalm-as-safe-a-stronghold.json'),
self.load_external_result_data(os.path.join(TEST_PATH, 'as-safe-a-stronghold.json')))
self.file_import(os.path.join(TEST_PATH, 'videopsalm-as-safe-a-stronghold2.json'),
self.load_external_result_data(os.path.join(TEST_PATH, 'as-safe-a-stronghold2.json')))

View File

@ -50,7 +50,8 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('NIV')
# THEN: We should get back a valid service item
assert len(books) == 66, 'The bible should not have had any books added or removed'
self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
self.assertEqual(books[0], 'Genesis', 'The first bible book should be Genesis')
def test_bible_gateway_extract_books_support_redirect(self):
"""
@ -63,7 +64,7 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('DN1933')
# THEN: We should get back a valid service item
assert len(books) == 66, 'This bible should have 66 books'
self.assertEqual(len(books), 66, 'This bible should have 66 books')
def test_bible_gateway_extract_verse(self):
"""
@ -76,7 +77,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('NIV', 'John', 3)
# THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def test_bible_gateway_extract_verse_nkjv(self):
"""
@ -89,7 +91,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('NKJV', 'John', 3)
# THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def test_crosswalk_extract_books(self):
"""
@ -102,7 +105,7 @@ class TestBibleHTTP(TestCase):
books = handler.get_books_from_http('niv')
# THEN: We should get back a valid service item
assert len(books) == 66, 'The bible should not have had any books added or removed'
self.assertEqual(len(books), 66, 'The bible should not have had any books added or removed')
def test_crosswalk_extract_verse(self):
"""
@ -115,7 +118,8 @@ class TestBibleHTTP(TestCase):
results = handler.get_bible_chapter('niv', 'john', 3)
# THEN: We should get back a valid service item
assert len(results.verse_list) == 36, 'The book of John should not have had any verses added or removed'
self.assertEqual(len(results.verse_list), 36,
'The book of John should not have had any verses added or removed')
def test_bibleserver_get_bibles(self):
"""
@ -144,9 +148,8 @@ class TestBibleHTTP(TestCase):
# THEN: The list should not be None, and some known bibles should be there
self.assertIsNotNone(bibles)
self.assertIn(('Holman Christian Standard Bible', 'HCSB', 'en'), bibles)
self.assertIn(('Holman Christian Standard Bible (HCSB)', 'HCSB', 'en'), bibles)
@skip("Waiting for Crosswalk to fix their server")
def test_crosswalk_get_bibles(self):
"""
Test getting list of bibles from Crosswalk.com

View File

@ -0,0 +1,35 @@
{
"authors": [
["Martin Luther", "words"],
["Unknown", "music"]
],
"ccli_number": "12345",
"comments": "This is\nthe first comment\nThis is\nthe second comment\nThis is\nthe third comment\n",
"copyright": "Public Domain",
"song_book_name": "SongBook1",
"song_number": 0,
"title": "A Safe Stronghold Our God is Still",
"topics": [
"tema1",
"tema2"
],
"verse_order_list": [],
"verses": [
[
"As safe a stronghold our God is still,\nA trusty shield and weapon;\nHell help us clear from all the ill\nThat hath us now oertaken.\nThe ancient prince of hell\nHath risen with purpose fell;\nStrong mail of craft and power\nHe weareth in this hour;\nOn earth is not His fellow.",
"v"
],
[
"With \"force\" of arms we nothing can,\nFull soon were we down-ridden;\nBut for us fights \\ the proper Man,\nWhom God Himself hath bidden.\nAsk ye: Who is this same?\nChrist Jesus is His name,\nThe Lord Sabaoths Son;\nHe, and no other one,\nShall conquer in the battle.",
"v"
],
[
"And were this world all devils oer,\nAnd watching to devour us,\nWe lay it not to heart so sore;\nNot they can overpower us.\nAnd let the prince of ill\nLook grim as eer he will,\nHe harms us not a whit;\nFor why? his doom is writ;\nA word shall quickly slay him.",
"v"
],
[
"Gods word, for all their craft and force,\nOne moment will not linger,\nBut, spite of hell, shall have its course;\nTis written by His finger.\nAnd though they take our life,\nGoods, honour, children, wife,\nYet is their profit small:\nThese things shall vanish all;\nThe city of God remaineth.",
"v"
]
]
}

View File

@ -0,0 +1,47 @@
{Abbreviation:"SB1",Copyright:"Public domain",Songs:[{ID:3,Composer:"Unknown",Author:"Martin Luther",Copyright:"Public
Domain",Theme:"tema1
tema2",CCLI:"12345",Alias:"A safe stronghold",Memo1:"This is
the first comment
",Memo2:"This is
the second comment
",Memo3:"This is
the third comment
",Reference:"reference",Guid:"jtCkrJdPIUOmECjaQylg/g",Verses:[{
Text:"As safe a stronghold our God is still,
A trusty shield and weapon;
Hell help us clear from all the ill
That hath us now oertaken.
The ancient prince of hell
Hath risen with purpose fell;
Strong mail of craft and power
He weareth in this hour;
On earth is not His fellow."},{ID:2,
Text:"With \"force\" of arms we nothing can,
Full soon were we down-ridden;
But for us fights \\ the proper Man,
Whom God Himself hath bidden.
Ask ye: Who is this same?
Christ Jesus is His name,
The Lord Sabaoths Son;
He, and no other one,
Shall conquer in the battle."},{ID:3,
Text:"And were this world all devils oer,
And watching to devour us,
We lay it not to heart so sore;
Not they can overpower us.
And let the prince of ill
Look grim as eer he will,
He harms us not a whit;
For why? his doom is writ;
A word shall quickly slay him."},{ID:4,
Text:"Gods word, for all their craft and force,
One moment will not linger,
But, spite of hell, shall have its course;
Tis written by His finger.
And though they take our life,
Goods, honour, children, wife,
Yet is their profit small:
These things shall vanish all;
The city of God remaineth."}],AudioFile:"282.mp3",IsAudioFileEnabled:1,
Text:"A Safe Stronghold Our God is Still"}],Guid:"khiHU2blX0Kb41dGdbDLhA",VersionDate:"20121012000000",
Text:"SongBook1"}