This commit is contained in:
Tim Bentley 2011-02-27 06:59:09 +00:00
commit 60520315a7
2 changed files with 25 additions and 18 deletions

View File

@ -35,7 +35,7 @@ import socket
import urllib import urllib
from HTMLParser import HTMLParseError from HTMLParser import HTMLParseError
from BeautifulSoup import BeautifulSoup, NavigableString from BeautifulSoup import BeautifulSoup, NavigableString, Tag
from openlp.core.lib import Receiver, translate from openlp.core.lib import Receiver, translate
from openlp.core.lib.ui import critical_error_message_box from openlp.core.lib.ui import critical_error_message_box
@ -221,21 +221,14 @@ class BGExtract(object):
crossrefs = soup.findAll(u'sup', u'xref') crossrefs = soup.findAll(u'sup', u'xref')
if crossrefs: if crossrefs:
[crossref.extract() for crossref in crossrefs] [crossref.extract() for crossref in crossrefs]
headings = soup.findAll(u'h5')
if headings:
[heading.extract() for heading in headings]
cleanup = [(re.compile('\s+'), lambda match: ' ')] cleanup = [(re.compile('\s+'), lambda match: ' ')]
verses = BeautifulSoup(str(soup), markupMassage=cleanup) verses = BeautifulSoup(str(soup), markupMassage=cleanup)
content = verses.find(u'div', u'result-text-style-normal')
if not content:
content = verses.find(u'div', u'result-text-style-rtl-serif')
if not content:
log.debug(u'No content found in the BibleGateway response.')
send_error_message(u'parse')
return None
verse_count = len(verses.findAll(u'sup', u'versenum'))
found_count = 0
verse_list = {} verse_list = {}
while found_count < verse_count: for verse in verses(u'sup', u'versenum'):
content = content.findNext(u'sup', u'versenum') raw_verse_num = verse.next
raw_verse_num = content.next
clean_verse_num = 0 clean_verse_num = 0
# Not all verses exist in all translations and may or may not be # Not all verses exist in all translations and may or may not be
# represented by a verse number. If they are not fine, if they are # represented by a verse number. If they are not fine, if they are
@ -248,9 +241,22 @@ class BGExtract(object):
log.exception(u'Illegal verse number in %s %s %s:%s', log.exception(u'Illegal verse number in %s %s %s:%s',
version, bookname, chapter, unicode(raw_verse_num)) version, bookname, chapter, unicode(raw_verse_num))
if clean_verse_num: if clean_verse_num:
raw_verse_text = raw_verse_num.next verse_text = raw_verse_num.next
verse_list[clean_verse_num] = unicode(raw_verse_text) part = raw_verse_num.next.next
found_count += 1 while not (isinstance(part, Tag) and part.attrMap and
part.attrMap[u'class'] == u'versenum'):
# While we are still in the same verse grab all the text.
if isinstance(part, NavigableString):
verse_text = verse_text + part
if isinstance(part.next, Tag) and part.next.name == u'div':
# Run out of verses so stop.
break
part = part.next
verse_list[clean_verse_num] = unicode(verse_text)
if not verse_list:
log.debug(u'No content found in the BibleGateway response.')
send_error_message(u'parse')
return None
return SearchResults(bookname, chapter, verse_list) return SearchResults(bookname, chapter, verse_list)

View File

@ -314,14 +314,15 @@ class FoilPresenter(object):
i = 1 i = 1
else: else:
i = 1 i = 1
author_temp = []
for author in strings: for author in strings:
temp = re.split(u',(?=\D{2})|(?<=\D),|\/(?=\D{3,})|(?<=\D);', temp = re.split(u',(?=\D{2})|(?<=\D),|\/(?=\D{3,})|(?<=\D);',
author) author)
for tempx in temp: for tempx in temp:
author_temp.append(tempx) author_temp.append(tempx)
for author in author_temp: for author in author_temp:
regex = u'^[\/,;\-\s]+|[\/,;\-\s]+$|'\ regex = u'^[\/,;\-\s\.]+|[\/,;\-\s\.]+$|'\
'\s*[0-9]{4}\s*[\-\/]?\s*([0-9]{4})?[\/,;\-\s]*$' '\s*[0-9]{4}\s*[\-\/]?\s*([0-9]{4})?[\/,;\-\s\.]*$'
author = re.compile(regex).sub(u'', author) author = re.compile(regex).sub(u'', author)
author = re.compile( author = re.compile(
u'[0-9]{1,2}\.\s?J(ahr)?h\.|um\s*$|vor\s*$').sub(u'', u'[0-9]{1,2}\.\s?J(ahr)?h\.|um\s*$|vor\s*$').sub(u'',