Made some fixes to the common Bible import classes.

This commit is contained in:
Raoul Snyman 2009-07-07 22:18:36 +02:00
parent 1872f57840
commit 66898bad84
1 changed file with 103 additions and 56 deletions


@@ -19,93 +19,140 @@ import os
 import os.path
 import sys
 import urllib2
+import chardet
 import logging
 class SearchResults:
     """
     Encapsulate a set of search results. This is Bible-type independant.
     """
     def __init__(self, book, chapter, verselist):
         """
         Create the search result object.
         ``book``
             The book of the Bible.
         ``chapter``
             The chapter of the book.
         ``verselist``
             The list of verses for this reading
         """
         self.book = book
         self.chapter = chapter
         self.verselist = verselist
-    def get_verselist(self):
-        return self.verselist
-    def get_book(self):
-        return self.book
-    def get_chapter(self):
-        return self.chapter
-    def has_verselist(self):
-        if self.verselist == {}:
-            return False
-        else:
-            return True
-class BibleCommon:
+    def get_verselist(self):
+        """
+        Returns the list of verses.
+        """
+        return self.verselist
+    def get_book(self):
+        """
+        Returns the book of the Bible.
+        """
+        return self.book
+    def get_chapter(self):
+        """
+        Returns the chapter of the book.
+        """
+        return self.chapter
+    def has_verselist(self):
+        """
+        Returns whether or not the verse list contains verses.
+        """
+        return len(self.verselist) > 0
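The getters now carry docstrings, and has_verselist() checks the length rather than comparing against an empty dict literal, so it behaves sensibly whether verselist is a dict or a list. A minimal usage sketch of the new API (the book, chapter, and verse values here are invented for illustration):

    # Hypothetical usage of the refactored SearchResults class.
    results = SearchResults(u'John', 3, {16: u'For God so loved the world...'})
    if results.has_verselist():
        print results.get_book(), results.get_chapter()
        print results.get_verselist()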
+class BibleCommon(object):
+    """
+    A common ancestor for bible download sites.
+    """
     global log
     log = logging.getLogger(u'BibleCommon')
     log.info(u'BibleCommon')
     def __init__(self):
         """
         An empty constructor... not sure why I'm here.
         """
         pass
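The `global log` inside the class body works because the body runs once at import time, but the usual idiom puts the logger at module scope. For comparison, a sketch of the conventional pattern (not code from this commit):

    import logging

    log = logging.getLogger(u'BibleCommon')  # module-level logger

    class BibleCommon(object):
        def __init__(self):
            log.info(u'BibleCommon initialised')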
     def _get_web_text(self, urlstring, proxyurl):
         """
         Get the HTML from the web page.
         ``urlstring``
             The URL of the page to open.
         ``proxyurl``
             The URL of a proxy server used to access the Internet.
         """
         log.debug(u'get_web_text %s %s', proxyurl, urlstring)
-        if not proxyurl == None:
-            proxy_support = urllib2.ProxyHandler({'http': self.proxyurl})
+        if proxyurl is not None:
+            proxy_support = urllib2.ProxyHandler({'http': self.proxyurl})
             http_support = urllib2.HTTPHandler()
-            opener= urllib2.build_opener(proxy_support, http_support)
+            opener = urllib2.build_opener(proxy_support, http_support)
             urllib2.install_opener(opener)
         xml_string = u''
         req = urllib2.Request(urlstring)
-        req.add_header(u'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
+        req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
         try:
             handle = urllib2.urlopen(req)
-            xml_string = unicode(handle.read())
+            html = handle.read()
+            details = chardet.detect(html)
+            xml_string = unicode(html, details['encoding'])
         except IOError, e:
             if hasattr(e, u'reason'):
-                log.error(u'Reason : ')
-                log.error( e.reason)
+                log.error(u'Reason : %s', e.reason)
         return xml_string
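The substantive fix here is that the response body is no longer decoded with a bare unicode() call, which assumes ASCII; chardet now guesses the encoding first. (Both versions also read self.proxyurl inside the handler even though the value arrives as the proxyurl parameter, which looks like a holdover.) The new decoding pattern in isolation, as a standalone sketch with a made-up URL:

    import urllib2
    import chardet

    req = urllib2.Request(u'http://example.com/page.html')  # hypothetical URL
    handle = urllib2.urlopen(req)
    html = handle.read()            # raw bytes from the server
    details = chardet.detect(html)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99}
    text = unicode(html, details['encoding'])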
     def _clean_text(self, text):
         """
-        Clean up text and remove extra characters
-        after been downloaded from web
+        Clean up text and remove extra characters after been downloaded from
+        the Internet.
         ``text``
             The text from the web page that needs to be cleaned up.
         """
-        #return text.rstrip()
         # Remove Headings from the Text
-        i = text.find(u'<h')
-        while i > -1:
-            j=text.find(u'</h', i)
-            text = text[ : (i - 1)]+text[(j+4)]
-            i = text.find(u'<h')
+        start_tag = text.find(u'<h')
+        while start_tag > -1:
+            end_tag = text.find(u'</h', start_tag)
+            text = text[:(start_tag - 1)] + text[(end_tag + 4)]
+            start_tag = text.find(u'<h')
         # Remove Support References from the Text
-        x = text.find(u'<sup>')
-        while x > -1:
-            y = text.find(u'</sup>')
-            text= text[:x] + text[y + 6:len(text)]
-            x = text.find(u'<sup>')
+        start_tag = text.find(u'<sup>')
+        while start_tag > -1:
+            end_tag = text.find(u'</sup>')
+            text = text[:start_tag] + text[end_tag + 6:len(text)]
+            start_tag = text.find(u'<sup>')
         # Static Clean ups
-        text= text.replace(u'\n', u'')
-        text= text.replace(u'\r', u'')
-        text= text.replace(u'&nbsp;', u'')
-        text= text.replace(u'<P>', u'')
-        text= text.replace(u'<I>', u'')
-        text= text.replace(u'</I>', u'')
-        text= text.replace(u'<P />', u'')
-        text= text.replace(u'<p />', u'')
-        text= text.replace(u'</P>', u'')
-        text= text.replace(u'<BR>', u'')
-        text= text.replace(u'<BR />', u'')
-        #text= text.replace(chr(189), u'1/2');print "l"
-        text= text.replace(u'&quot;', "'")
-        text= text.replace(u'&apos;', "'")
-        i = text.find(u'<')
-        while i > -1 :
-            j = text.find(u'>', i)
-            text= text[:i] + text[j+1:]
-            i = text.find(u'<')
-        text= text.replace(u'>', u'')
+        text = text.replace(u'\n', u'')
+        text = text.replace(u'\r', u'')
+        text = text.replace(u'&nbsp;', u'')
+        text = text.replace(u'<P>', u'')
+        text = text.replace(u'<I>', u'')
+        text = text.replace(u'</I>', u'')
+        text = text.replace(u'<P />', u'')
+        text = text.replace(u'<p />', u'')
+        text = text.replace(u'</P>', u'')
+        text = text.replace(u'<BR>', u'')
+        text = text.replace(u'<BR />', u'')
+        #text = text.replace(chr(189), u'1/2');print "l"
+        text = text.replace(u'&quot;', u'\"')
+        text = text.replace(u'&apos;', u'\'')
+        # Remove some other tags
+        start_tag = text.find(u'<')
+        while start_tag > -1 :
+            end_tag = text.find(u'>', start_tag)
+            text = text[:start_tag] + text[end_tag + 1:]
+            start_tag = text.find(u'<')
+        text = text.replace(u'>', u'')
         return text.rstrip()
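Renaming i/j and x/y to start_tag/end_tag makes the three stripping loops read uniformly. One quirk survives the rewrite: in the heading loop, text[(end_tag + 4)] indexes a single character where a slice text[(end_tag + 4):] was presumably intended, so headings likely still do not strip cleanly. A small demonstration of the generic tag loop, using the slice form (a sketch, not the project's code):

    def clean_fragment(text):
        # Same technique as the final loop in _clean_text: repeatedly cut
        # everything from '<' to the matching '>'.
        start_tag = text.find(u'<')
        while start_tag > -1:
            end_tag = text.find(u'>', start_tag)
            text = text[:start_tag] + text[end_tag + 1:]
            start_tag = text.find(u'<')
        return text.rstrip()

    print clean_fragment(u'<P>In the beginning<sup>1</sup></P> ')
    # prints: In the beginning1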