Made some fixes to the common Bible import classes.

This commit is contained in:
Raoul Snyman 2009-07-07 22:18:36 +02:00
parent 1872f57840
commit 66898bad84

View File

@ -19,93 +19,140 @@ import os
import os.path
import sys
import urllib2
import chardet
import logging
class SearchResults(object):
    """
    Encapsulate a set of search results. This is Bible-type independent.
    """

    def __init__(self, book, chapter, verselist):
        """
        Create the search result object.

        ``book``
            The book of the Bible.

        ``chapter``
            The chapter of the book.

        ``verselist``
            The list of verses for this reading.
        """
        self.book = book
        self.chapter = chapter
        self.verselist = verselist

    def get_verselist(self):
        """
        Returns the list of verses.
        """
        return self.verselist

    def get_book(self):
        """
        Returns the book of the Bible.
        """
        return self.book

    def get_chapter(self):
        """
        Returns the chapter of the book.
        """
        return self.chapter

    def has_verselist(self):
        """
        Returns whether or not the verse list contains verses.
        """
        # Truthiness covers every empty container as well as None, where the
        # earlier ``== {}`` / ``len()`` checks mis-reported or raised.
        return bool(self.verselist)
class BibleCommon(object):
    """
    A common ancestor for bible download sites.
    """
    # The logger is deliberately published as a module-level name so that
    # the methods below can refer to it simply as ``log``.
    global log
    log = logging.getLogger(u'BibleCommon')

    def __init__(self):
        """
        An empty constructor; this class keeps no instance state.
        """
        pass

    def _get_web_text(self, urlstring, proxyurl):
        """
        Get the HTML from the web page.

        ``urlstring``
            The URL of the page to open.

        ``proxyurl``
            The URL of a proxy server used to access the Internet, or
            ``None`` to connect directly.

        Returns the page as a unicode string, or an empty string when the
        page could not be fetched.
        """
        log.debug(u'get_web_text %s %s', proxyurl, urlstring)
        xml_string = u''
        req = urllib2.Request(urlstring)
        req.add_header('User-Agent',
            'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        try:
            if proxyurl is not None:
                # Use the proxy passed in by the caller (this class has no
                # ``proxyurl`` attribute) and send the request through the
                # opener that was built for it.
                proxy_support = urllib2.ProxyHandler({'http': proxyurl})
                http_support = urllib2.HTTPHandler()
                opener = urllib2.build_opener(proxy_support, http_support)
                handle = opener.open(req)
            else:
                handle = urllib2.urlopen(req)
            try:
                html = handle.read()
            finally:
                handle.close()
            details = chardet.detect(html)
            # chardet reports None when it cannot guess the encoding.
            xml_string = html.decode(details['encoding'] or 'utf-8')
        except IOError as e:
            if hasattr(e, u'reason'):
                log.error(u'Reason : %s', e.reason)
            else:
                log.error(u'Unable to read %s : %s', urlstring, e)
        return xml_string

    def _remove_enclosed(self, text, start_marker, end_marker):
        """
        Remove every span that starts at ``start_marker`` and runs up to and
        including the first ``>`` found at or after the following
        ``end_marker``.

        ``text``
            The text to remove the spans from.

        ``start_marker``
            The string that opens a span, e.g. ``<sup>``.

        ``end_marker``
            The string that begins the closing tag, e.g. ``</sup>``.
        """
        start_tag = text.find(start_marker)
        while start_tag > -1:
            end_tag = text.find(end_marker, start_tag)
            close = -1
            if end_tag > -1:
                close = text.find(u'>', end_tag)
            if close == -1:
                # Unterminated span: leave it for the generic tag removal
                # rather than corrupting the text or looping forever.
                break
            text = text[:start_tag] + text[close + 1:]
            start_tag = text.find(start_marker)
        return text

    def _clean_text(self, text):
        """
        Clean up text and remove extra characters after being downloaded
        from the Internet.

        ``text``
            The text from the web page that needs to be cleaned up.
        """
        # Remove Headings from the Text
        text = self._remove_enclosed(text, u'<h', u'</h')
        # Remove Support References from the Text
        text = self._remove_enclosed(text, u'<sup>', u'</sup>')
        # Static Clean ups
        for junk in (u'\n', u'\r', u'&nbsp;', u'<P>', u'<I>', u'</I>',
            u'<P />', u'<p />', u'</P>', u'<BR>', u'<BR />'):
            text = text.replace(junk, u'')
        text = text.replace(u'&quot;', u'\"')
        text = text.replace(u'&apos;', u'\'')
        # Remove some other tags
        start_tag = text.find(u'<')
        while start_tag > -1:
            end_tag = text.find(u'>', start_tag)
            if end_tag == -1:
                # A lone '<' with no closing '>': drop just that character.
                text = text[:start_tag] + text[start_tag + 1:]
            else:
                text = text[:start_tag] + text[end_tag + 1:]
            start_tag = text.find(u'<')
        text = text.replace(u'>', u'')
        return text.rstrip()