From 66898bad84f86dbdb693d0ef55ec8a2485aeb898 Mon Sep 17 00:00:00 2001 From: Raoul Snyman Date: Tue, 7 Jul 2009 22:18:36 +0200 Subject: [PATCH] Made some fixes to the common Bible import classes. --- openlp/plugins/bibles/lib/common.py | 159 ++++++++++++++++++---------- 1 file changed, 103 insertions(+), 56 deletions(-) diff --git a/openlp/plugins/bibles/lib/common.py b/openlp/plugins/bibles/lib/common.py index c63fc5e71..f8d37649e 100644 --- a/openlp/plugins/bibles/lib/common.py +++ b/openlp/plugins/bibles/lib/common.py @@ -19,93 +19,140 @@ import os import os.path import sys import urllib2 - +import chardet import logging class SearchResults: + """ + Encapsulate a set of search results. This is Bible-type independant. + """ def __init__(self, book, chapter, verselist): + """ + Create the search result object. + + ``book`` + The book of the Bible. + + ``chapter`` + The chapter of the book. + + ``verselist`` + The list of verses for this reading + """ self.book = book self.chapter = chapter self.verselist = verselist - def get_verselist(self): - return self.verselist - def get_book(self): - return self.book - def get_chapter(self): - return self.chapter - def has_verselist(self): - if self.verselist == {}: - return False - else: - return True -class BibleCommon: + def get_verselist(self): + """ + Returns the list of verses. + """ + return self.verselist + + def get_book(self): + """ + Returns the book of the Bible. + """ + return self.book + + def get_chapter(self): + """ + Returns the chapter of the book. + """ + return self.chapter + + def has_verselist(self): + """ + Returns whether or not the verse list contains verses. + """ + return len(self.verselist) > 0 + + +class BibleCommon(object): + """ + A common ancestor for bible download sites. + """ global log log = logging.getLogger(u'BibleCommon') log.info(u'BibleCommon') + def __init__(self): """ + An empty constructor... not sure why I'm here. """ + pass + def _get_web_text(self, urlstring, proxyurl): + """ + Get the HTML from the web page. + + ``urlstring`` + The URL of the page to open. + + ``proxyurl`` + The URL of a proxy server used to access the Internet. + """ log.debug(u'get_web_text %s %s', proxyurl, urlstring) - if not proxyurl == None: - proxy_support = urllib2.ProxyHandler({'http': self.proxyurl}) + if proxyurl is not None: + proxy_support = urllib2.ProxyHandler({'http': self.proxyurl}) http_support = urllib2.HTTPHandler() - opener= urllib2.build_opener(proxy_support, http_support) + opener = urllib2.build_opener(proxy_support, http_support) urllib2.install_opener(opener) xml_string = u'' req = urllib2.Request(urlstring) - req.add_header(u'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)') + req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)') try: handle = urllib2.urlopen(req) - xml_string = unicode(handle.read()) + html = handle.read() + details = chardet.detect(html) + xml_string = unicode(html, details['encoding']) except IOError, e: if hasattr(e, u'reason'): - log.error(u'Reason : ') - log.error( e.reason) + log.error(u'Reason : %s', e.reason) return xml_string def _clean_text(self, text): """ - Clean up text and remove extra characters - after been downloaded from web + Clean up text and remove extra characters after been downloaded from + the Internet. + + ``text`` + The text from the web page that needs to be cleaned up. """ #return text.rstrip() # Remove Headings from the Text - i = text.find(u' -1: - j=text.find(u' -1: + end_tag = text.find(u'') - while x > -1: - y = text.find(u'') - text= text[:x] + text[y + 6:len(text)] - x = text.find(u'') - + start_tag = text.find(u'') + while start_tag > -1: + end_tag = text.find(u'') + text = text[:start_tag] + text[end_tag + 6:len(text)] + start_tag = text.find(u'') # Static Clean ups - text= text.replace(u'\n', u'') - text= text.replace(u'\r', u'') - text= text.replace(u' ', u'') - text= text.replace(u'

', u'') - text= text.replace(u'', u'') - text= text.replace(u'', u'') - text= text.replace(u'

', u'') - text= text.replace(u'

', u'') - text= text.replace(u'

', u'') - text= text.replace(u'
', u'') - text= text.replace(u'
', u'') - #text= text.replace(chr(189), u'1/2');print "l" - text= text.replace(u'"', "'") - text= text.replace(u''', "'") - - i = text.find(u'<') - while i > -1 : - j = text.find(u'>', i) - text= text[:i] + text[j+1:] - i = text.find(u'<') - - text= text.replace(u'>', u'') + text = text.replace(u'\n', u'') + text = text.replace(u'\r', u'') + text = text.replace(u' ', u'') + text = text.replace(u'

', u'') + text = text.replace(u'', u'') + text = text.replace(u'', u'') + text = text.replace(u'

', u'') + text = text.replace(u'

', u'') + text = text.replace(u'

', u'') + text = text.replace(u'
', u'') + text = text.replace(u'
', u'') + #text = text.replace(chr(189), u'1/2');print "l" + text = text.replace(u'"', u'\"') + text = text.replace(u''', u'\'') + # Remove some other tags + start_tag = text.find(u'<') + while start_tag > -1 : + end_tag = text.find(u'>', start_tag) + text = text[:start_tag] + text[end_tag + 1:] + start_tag = text.find(u'<') + text = text.replace(u'>', u'') return text.rstrip() +