Fix parsing of biblestudytools.com. Fixes bug 1418212.

Fixes: https://launchpad.net/bugs/1418212
This commit is contained in:
Tomas Groth 2015-02-06 20:29:38 +00:00
parent 7287759ba5
commit a5b92f1e87
1 changed file with 10 additions and 24 deletions

View File

@@ -365,31 +365,20 @@ class CWExtract(RegistryProperties):
if not soup: if not soup:
return None return None
self.application.process_events() self.application.process_events()
html_verses = soup.find_all('span', 'versetext') verses_div = soup.find_all('div', 'verse')
if not html_verses: if not verses_div:
log.error('No verses found in the CrossWalk response.') log.error('No verses found in the CrossWalk response.')
send_error_message('parse') send_error_message('parse')
return None return None
verses = {} verses = {}
for verse in html_verses: for verse in verses_div:
self.application.process_events() self.application.process_events()
verse_number = int(verse.contents[0].contents[0]) verse_number = int(verse.find('strong').contents[0])
verse_text = '' verse_span = verse.find('span')
for part in verse.contents: tags_to_remove = verse_span.find_all(['a', 'sup'])
self.application.process_events() for tag in tags_to_remove:
if isinstance(part, NavigableString): tag.decompose()
verse_text += part verse_text = verse_span.get_text()
elif part and part.attrMap and \
(part.attrMap['class'] == 'WordsOfChrist' or part.attrMap['class'] == 'strongs'):
for subpart in part.contents:
self.application.process_events()
if isinstance(subpart, NavigableString):
verse_text += subpart
elif subpart and subpart.attrMap and subpart.attrMap['class'] == 'strongs':
for subsub in subpart.contents:
self.application.process_events()
if isinstance(subsub, NavigableString):
verse_text += subsub
self.application.process_events() self.application.process_events()
# Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and . # Fix up leading and trailing spaces, multiple spaces, and spaces between text and , and .
verse_text = verse_text.strip('\n\r\t ') verse_text = verse_text.strip('\n\r\t ')
@@ -409,16 +398,13 @@ class CWExtract(RegistryProperties):
soup = get_soup_for_bible_ref(chapter_url) soup = get_soup_for_bible_ref(chapter_url)
if not soup: if not soup:
return None return None
content = soup.find('div', {'class': 'Body'}) content = soup.find_all(('h4', {'class': 'small-header'}))
content = content.find('ul', {'class': 'parent'})
if not content: if not content:
log.error('No books found in the Crosswalk response.') log.error('No books found in the Crosswalk response.')
send_error_message('parse') send_error_message('parse')
return None return None
content = content.find_all('li')
books = [] books = []
for book in content: for book in content:
book = book.find('a')
books.append(book.contents[0]) books.append(book.contents[0])
return books return books