Fix BibleGateway (BG) Chinese bible by stripping <meta name .../> tags from the page before parsing (Bug #706211)

This commit is contained in:
Jon Tibble 2011-01-31 01:55:25 +00:00
parent 780a8b8b39
commit b99961d669

View File

@ -210,6 +210,7 @@ class BGExtract(object):
cleaner = [(re.compile('&nbsp;|<br />|\'\+\''), lambda match: '')] cleaner = [(re.compile('&nbsp;|<br />|\'\+\''), lambda match: '')]
soup = get_soup_for_bible_ref( soup = get_soup_for_bible_ref(
u'http://www.biblegateway.com/passage/?%s' % url_params, u'http://www.biblegateway.com/passage/?%s' % url_params,
pre_parse_regex=r'<meta name.*?/>', pre_parse_substitute='',
cleaner=cleaner) cleaner=cleaner)
if not soup: if not soup:
return None return None
@ -499,7 +500,8 @@ class HTTPBible(BibleDB):
""" """
return HTTPBooks.get_verse_count(book, chapter) return HTTPBooks.get_verse_count(book, chapter)
def get_soup_for_bible_ref(reference_url, header=None, cleaner=None): def get_soup_for_bible_ref(reference_url, header=None, pre_parse_regex=None,
pre_parse_substitute=None, cleaner=None):
""" """
Gets a webpage and returns a parsed and optionally cleaned soup or None. Gets a webpage and returns a parsed and optionally cleaned soup or None.
@ -509,6 +511,13 @@ def get_soup_for_bible_ref(reference_url, header=None, cleaner=None):
``header`` ``header``
An optional HTTP header to pass to the bible web server. An optional HTTP header to pass to the bible web server.
``pre_parse_regex``
A regular expression to run on the webpage. Allows manipulation of the
webpage before passing to BeautifulSoup for parsing.
``pre_parse_substitute``
The text to replace any matches to the regular expression with.
``cleaner`` ``cleaner``
An optional regex to use during webpage parsing. An optional regex to use during webpage parsing.
""" """
@ -518,12 +527,15 @@ def get_soup_for_bible_ref(reference_url, header=None, cleaner=None):
if not page: if not page:
send_error_message(u'download') send_error_message(u'download')
return None return None
page_source = page.read()
if pre_parse_regex and pre_parse_substitute is not None:
page_source = re.sub(pre_parse_regex, pre_parse_substitute, page_source)
soup = None soup = None
try: try:
if cleaner: if cleaner:
soup = BeautifulSoup(page, markupMassage=cleaner) soup = BeautifulSoup(page_source, markupMassage=cleaner)
else: else:
soup = BeautifulSoup(page) soup = BeautifulSoup(page_source)
except HTMLParseError: except HTMLParseError:
log.exception(u'BeautifulSoup could not parse the bible page.') log.exception(u'BeautifulSoup could not parse the bible page.')
if not soup: if not soup: