fixed bs4 code; fixed regression

This commit is contained in:
Andreas Preikschat 2013-04-15 21:54:27 +02:00
parent 78ed2f655c
commit e2b8dc54f3

View File

@ -99,14 +99,15 @@ class BGExtract(object):
""" """
if isinstance(tag, NavigableString): if isinstance(tag, NavigableString):
return None, unicode(tag) return None, unicode(tag)
elif tag.get('class') == 'versenum' or tag.get('class') == 'versenum mid-line': elif tag.get('class')[0] == "versenum" or tag.get('class')[0] == 'versenum mid-line':
verse = unicode(tag.string).replace('[', '').replace(']', '').strip() verse = unicode(tag.string).replace('[', '').replace(']', '').strip()
return verse, None return verse, None
elif tag.get('class') == 'chapternum': elif tag.get('class')[0] == 'chapternum':
verse = '1' verse = '1'
return verse, None return verse, None
else: else:
verse, text = None, '' verse = None
text = ''
for child in tag.contents: for child in tag.contents:
c_verse, c_text = self._extract_verse(child) c_verse, c_text = self._extract_verse(child)
if c_verse: if c_verse:
@ -143,7 +144,8 @@ class BGExtract(object):
tags = tags[::-1] tags = tags[::-1]
current_text = '' current_text = ''
for tag in tags: for tag in tags:
verse, text = None, '' verse = None
text = ''
for child in tag.contents: for child in tag.contents:
c_verse, c_text = self._extract_verse(child) c_verse, c_text = self._extract_verse(child)
if c_verse: if c_verse:
@ -208,7 +210,7 @@ class BGExtract(object):
if clean_verse_num: if clean_verse_num:
verse_text = raw_verse_num.next_element verse_text = raw_verse_num.next_element
part = raw_verse_num.next_element.next_element part = raw_verse_num.next_element.next_element
while not (isinstance(part, Tag) and part.get(u'class') == u'versenum'): while not (isinstance(part, Tag) and part.get(u'class')[0] == u'versenum'):
# While we are still in the same verse grab all the text. # While we are still in the same verse grab all the text.
if isinstance(part, NavigableString): if isinstance(part, NavigableString):
verse_text += part verse_text += part
@ -349,7 +351,7 @@ class BSExtract(object):
verses = {} verses = {}
for verse in content: for verse in content:
self.application.process_events() self.application.process_events()
versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', verse[u'class'])) versenumber = int(VERSE_NUMBER_REGEX.sub(r'\3', u' '.join(verse[u'class'])))
verses[versenumber] = verse.contents[1].rstrip(u'\n') verses[versenumber] = verse.contents[1].rstrip(u'\n')
return SearchResults(book_name, chapter, verses) return SearchResults(book_name, chapter, verses)
@ -374,6 +376,16 @@ class BSExtract(object):
content = content.find_all(u'li') content = content.find_all(u'li')
return [book.contents[0].contents[0] for book in content] return [book.contents[0].contents[0] for book in content]
def _get_application(self):
    """
    Return the ``application`` object, fetching it from the Registry on first access and caching it on this
    instance as ``_application``.
    """
    # Already looked up once: reuse the cached reference.
    if hasattr(self, u'_application'):
        return self._application
    # First access: resolve through the Registry and remember the result.
    self._application = Registry().get(u'application')
    return self._application

# Getter-only property, so instances read it as ``self.application``.
application = property(_get_application)
class CWExtract(object): class CWExtract(object):
""" """
@ -693,7 +705,7 @@ def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None, pre
soup = None soup = None
try: try:
soup = BeautifulSoup(page_source) soup = BeautifulSoup(page_source)
CLEANER_REGEX.sub(u'', soup) CLEANER_REGEX.sub(u'', unicode(soup))
except HTMLParseError: except HTMLParseError:
log.exception(u'BeautifulSoup could not parse the bible page.') log.exception(u'BeautifulSoup could not parse the bible page.')
if not soup: if not soup: