forked from openlp/openlp
Made some fixes to the common Bible import classes.
This commit is contained in:
parent
1872f57840
commit
66898bad84
@ -19,72 +19,119 @@ import os
|
||||
import os.path
|
||||
import sys
|
||||
import urllib2
|
||||
|
||||
import chardet
|
||||
import logging
|
||||
|
||||
class SearchResults:
    """
    Encapsulate a set of search results. This is Bible-type independent.
    """
    def __init__(self, book, chapter, verselist):
        """
        Create the search result object.

        ``book``
            The book of the Bible.

        ``chapter``
            The chapter of the book.

        ``verselist``
            The list of verses for this reading.
        """
        self.book = book
        self.chapter = chapter
        self.verselist = verselist

    def get_verselist(self):
        """
        Returns the list of verses.
        """
        return self.verselist

    def get_book(self):
        """
        Returns the book of the Bible.
        """
        return self.book

    def get_chapter(self):
        """
        Returns the chapter of the book.
        """
        return self.chapter

    def has_verselist(self):
        """
        Returns whether or not the verse list contains verses.
        """
        return len(self.verselist) > 0
||||
class BibleCommon(object):
    """
    A common ancestor for bible download sites.
    """
    # Module-level logger shared by all methods of this class (kept as a
    # global to match the surrounding file's logging convention).
    global log
    log = logging.getLogger(u'BibleCommon')
    log.info(u'BibleCommon')

    def __init__(self):
        """
        An empty constructor... not sure why I'm here.
        """
        pass

    def _get_web_text(self, urlstring, proxyurl):
        """
        Get the HTML from the web page and return it as a unicode string.

        ``urlstring``
            The URL of the page to open.

        ``proxyurl``
            The URL of a proxy server used to access the Internet, or
            ``None`` for a direct connection.
        """
        log.debug(u'get_web_text %s %s', proxyurl, urlstring)
        if proxyurl is not None:
            # Bug fix: this previously read ``self.proxyurl``, an attribute
            # that is never assigned — use the ``proxyurl`` parameter.
            proxy_support = urllib2.ProxyHandler({'http': proxyurl})
            http_support = urllib2.HTTPHandler()
            opener = urllib2.build_opener(proxy_support, http_support)
            urllib2.install_opener(opener)
        xml_string = u''
        req = urllib2.Request(urlstring)
        # Some Bible sites refuse requests without a browser user agent.
        req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        try:
            handle = urllib2.urlopen(req)
            html = handle.read()
            # Let chardet guess the page encoding rather than assuming the
            # platform default.
            details = chardet.detect(html)
            xml_string = unicode(html, details['encoding'])
        except IOError as e:
            if hasattr(e, u'reason'):
                log.error(u'Reason : %s', e.reason)
        # On failure an empty unicode string is returned.
        return xml_string

    def _clean_text(self, text):
        """
        Clean up text and remove extra characters after it has been
        downloaded from the Internet.

        ``text``
            The text from the web page that needs to be cleaned up.
        """
        # Remove headings (<hN>...</hN>) from the text, including their
        # contents. Bug fix: the tail was previously ``text[(end_tag + 4)]``,
        # a single character instead of a slice, and ``start_tag - 1`` became
        # a negative index when the tag started the string.
        start_tag = text.find(u'<h')
        while start_tag > -1:
            end_tag = text.find(u'</h', start_tag)
            if end_tag == -1:
                # Unclosed heading tag — leave it for the generic tag
                # stripper below rather than mangling the string.
                break
            # end_tag + 4 lands on the closing '>' of </hN>; the stray '>'
            # is removed by the final replace at the bottom of this method.
            text = text[:max(start_tag - 1, 0)] + text[end_tag + 4:]
            start_tag = text.find(u'<h')
        # Remove support (footnote) references from the text.
        start_tag = text.find(u'<sup>')
        while start_tag > -1:
            end_tag = text.find(u'</sup>')
            if end_tag == -1:
                break
            text = text[:start_tag] + text[end_tag + 6:]
            start_tag = text.find(u'<sup>')
        # Static clean ups.
        text = text.replace(u'\n', u'')
        text = text.replace(u'\r', u'')
        # NOTE(review): the diff view elides a few additional replacement
        # lines between here and the <BR> handling — confirm against the
        # full file.
        text = text.replace(u'<BR>', u'')
        text = text.replace(u'<BR />', u'')
        text = text.replace(u'&quot;', u'\"')
        text = text.replace(u'&apos;', u'\'')
        # Remove any other tags.
        start_tag = text.find(u'<')
        while start_tag > -1:
            end_tag = text.find(u'>', start_tag)
            if end_tag == -1:
                break
            text = text[:start_tag] + text[end_tag + 1:]
            start_tag = text.find(u'<')
        text = text.replace(u'>', u'')
        return text.rstrip()
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user