Update Crosswalk webpage parser to match new layout. Fixes bug 1599999.

Fixes: https://launchpad.net/bugs/1599999
This commit is contained in:
Tomas Groth 2016-07-07 22:56:50 +02:00
parent ba80fe653c
commit 93fc6e0145
2 changed files with 23 additions and 18 deletions

View File

@ -532,28 +532,26 @@ class CWExtract(RegistryProperties):
returns a list in the form [(biblename, biblekey, language_code)] returns a list in the form [(biblename, biblekey, language_code)]
""" """
log.debug('CWExtract.get_bibles_from_http') log.debug('CWExtract.get_bibles_from_http')
bible_url = 'http://www.biblestudytools.com/' bible_url = 'http://www.biblestudytools.com/bible-versions/'
soup = get_soup_for_bible_ref(bible_url) soup = get_soup_for_bible_ref(bible_url)
if not soup: if not soup:
return None return None
bible_select = soup.find('select') h4_tags = soup.find_all('h4', {'class': 'small-header'})
if not bible_select: if not h4_tags:
log.debug('No select tags found - did site change?') log.debug('No h4 tags found - did site change?')
return None
option_tags = bible_select.find_all('option', {'class': 'log-translation'})
if not option_tags:
log.debug('No option tags found - did site change?')
return None return None
bibles = [] bibles = []
for ot in option_tags: for h4t in h4_tags:
tag_text = ot.get_text().strip() short_name = None
try: if h4t.span:
tag_value = ot['value'] short_name = h4t.span.get_text().strip().lower()
except KeyError: else:
log.exception('No value attribute found - did site change?') log.error('No span tag found - did site change?')
return None return None
if not tag_value: if not short_name:
continue continue
h4t.span.extract()
tag_text = h4t.get_text().strip()
# The names of non-English bibles have their language in parentheses at the end # The names of non-English bibles have their language in parentheses at the end
if tag_text.endswith(')'): if tag_text.endswith(')'):
language = tag_text[tag_text.rfind('(') + 1:-1] language = tag_text[tag_text.rfind('(') + 1:-1]
@ -561,12 +559,20 @@ class CWExtract(RegistryProperties):
language_code = CROSSWALK_LANGUAGES[language] language_code = CROSSWALK_LANGUAGES[language]
else: else:
language_code = '' language_code = ''
# ... except for the latin vulgate # ... except for those that don't...
elif 'latin' in tag_text.lower(): elif 'latin' in tag_text.lower():
language_code = 'la' language_code = 'la'
elif 'la biblia' in tag_text.lower() or 'nueva' in tag_text.lower():
language_code = 'es'
elif 'chinese' in tag_text.lower():
language_code = 'zh'
elif 'greek' in tag_text.lower():
language_code = 'el'
elif 'nova' in tag_text.lower():
language_code = 'pt'
else: else:
language_code = 'en' language_code = 'en'
bibles.append((tag_text, tag_value, language_code)) bibles.append((tag_text, short_name, language_code))
return bibles return bibles

View File

@ -146,7 +146,6 @@ class TestBibleHTTP(TestCase):
self.assertIsNotNone(bibles) self.assertIsNotNone(bibles)
self.assertIn(('Holman Christian Standard Bible', 'HCSB', 'en'), bibles) self.assertIn(('Holman Christian Standard Bible', 'HCSB', 'en'), bibles)
@skip("Waiting for Crosswalk to fix their server")
def test_crosswalk_get_bibles(self): def test_crosswalk_get_bibles(self):
""" """
Test getting list of bibles from Crosswalk.com Test getting list of bibles from Crosswalk.com