From 66898bad84f86dbdb693d0ef55ec8a2485aeb898 Mon Sep 17 00:00:00 2001
From: Raoul Snyman <raoul.snyman@saturnlaboratories.co.za>
Date: Tue, 7 Jul 2009 22:18:36 +0200
Subject: [PATCH] Made some fixes to the common Bible import classes.

---
 openlp/plugins/bibles/lib/common.py | 159 ++++++++++++++++++----------
 1 file changed, 103 insertions(+), 56 deletions(-)

diff --git a/openlp/plugins/bibles/lib/common.py b/openlp/plugins/bibles/lib/common.py
index c63fc5e71..f8d37649e 100644
--- a/openlp/plugins/bibles/lib/common.py
+++ b/openlp/plugins/bibles/lib/common.py
@@ -19,93 +19,140 @@ import os
 import os.path
 import sys
 import urllib2
-
+import chardet
 import logging
 
 class SearchResults:
+    """
+    Encapsulate a set of search results. This is Bible-type independent.
+    """
     def __init__(self, book, chapter, verselist):
+        """
+        Create the search result object.
+
+        ``book``
+            The book of the Bible.
+
+        ``chapter``
+            The chapter of the book.
+
+        ``verselist``
+            The list of verses for this reading
+        """
         self.book = book
         self.chapter = chapter
         self.verselist = verselist
-    def get_verselist(self):
-        return self.verselist
-    def get_book(self):
-        return self.book
-    def get_chapter(self):
-        return self.chapter
-    def has_verselist(self):
-        if self.verselist == {}:
-            return False
-        else:
-            return True
 
-class BibleCommon:
+    def get_verselist(self):
+        """
+        Returns the list of verses.
+        """
+        return self.verselist
+
+    def get_book(self):
+        """
+        Returns the book of the Bible.
+        """
+        return self.book
+
+    def get_chapter(self):
+        """
+        Returns the chapter of the book.
+        """
+        return self.chapter
+
+    def has_verselist(self):
+        """
+        Returns whether or not the verse list contains verses.
+        """
+        return len(self.verselist) > 0
+
+
+class BibleCommon(object):
+    """
+    A common ancestor for bible download sites.
+    """
     global log
     log = logging.getLogger(u'BibleCommon')
     log.info(u'BibleCommon')
+
     def __init__(self):
         """
+        An empty constructor... not sure why I'm here.
         """
+        pass
+
     def _get_web_text(self, urlstring, proxyurl):
+        """
+        Get the HTML from the web page.
+
+        ``urlstring``
+            The URL of the page to open.
+
+        ``proxyurl``
+            The URL of a proxy server used to access the Internet.
+        """
         log.debug(u'get_web_text %s %s', proxyurl, urlstring)
-        if  not proxyurl == None:
-            proxy_support = urllib2.ProxyHandler({'http':  self.proxyurl})
+        if proxyurl is not None:
+            proxy_support = urllib2.ProxyHandler({'http': self.proxyurl})
             http_support = urllib2.HTTPHandler()
-            opener= urllib2.build_opener(proxy_support, http_support)
+            opener = urllib2.build_opener(proxy_support, http_support)
             urllib2.install_opener(opener)
         xml_string = u''
         req = urllib2.Request(urlstring)
-        req.add_header(u'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
+        req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
         try:
             handle = urllib2.urlopen(req)
-            xml_string = unicode(handle.read())
+            html = handle.read()
+            details = chardet.detect(html)
+            xml_string = unicode(html, details['encoding'])
         except IOError, e:
             if hasattr(e, u'reason'):
-                log.error(u'Reason : ')
-                log.error( e.reason)
+                log.error(u'Reason : %s', e.reason)
         return xml_string
 
     def _clean_text(self, text):
         """
-        Clean up text and remove extra characters
-        after been downloaded from web
+        Clean up text and remove extra characters after being downloaded from
+        the Internet.
+
+        ``text``
+            The text from the web page that needs to be cleaned up.
         """
         #return text.rstrip()
         # Remove Headings from the Text
-        i = text.find(u'<h')
-        while i > -1:
-            j=text.find(u'</h', i)
-            text = text[ : (i - 1)]+text[(j+4)]
-            i = text.find(u'<h')
-
+        start_tag = text.find(u'<h')
+        while start_tag > -1:
+            end_tag = text.find(u'</h', start_tag)
+            text = text[:(start_tag - 1)] + text[(end_tag + 4)]
+            start_tag = text.find(u'<h')
         # Remove Support References from the Text
-        x = text.find(u'<sup>')
-        while x > -1:
-            y = text.find(u'</sup>')
-            text= text[:x] + text[y + 6:len(text)]
-            x = text.find(u'<sup>')
-
+        start_tag = text.find(u'<sup>')
+        while start_tag > -1:
+            end_tag = text.find(u'</sup>')
+            text = text[:start_tag] + text[end_tag + 6:len(text)]
+            start_tag = text.find(u'<sup>')
         # Static Clean ups
-        text= text.replace(u'\n', u'')
-        text= text.replace(u'\r', u'')
-        text= text.replace(u'&nbsp;', u'')
-        text= text.replace(u'<P>', u'')
-        text= text.replace(u'<I>', u'')
-        text= text.replace(u'</I>', u'')
-        text= text.replace(u'<P />', u'')
-        text= text.replace(u'<p />', u'')
-        text= text.replace(u'</P>', u'')
-        text= text.replace(u'<BR>', u'')
-        text= text.replace(u'<BR />', u'')
-        #text= text.replace(chr(189), u'1/2');print "l"
-        text= text.replace(u'&quot;', "'")
-        text= text.replace(u'&apos;', "'")
-
-        i = text.find(u'<')
-        while i > -1 :
-            j = text.find(u'>', i)
-            text= text[:i] + text[j+1:]
-            i = text.find(u'<')
-
-        text= text.replace(u'>', u'')
+        text = text.replace(u'\n', u'')
+        text = text.replace(u'\r', u'')
+        text = text.replace(u'&nbsp;', u'')
+        text = text.replace(u'<P>', u'')
+        text = text.replace(u'<I>', u'')
+        text = text.replace(u'</I>', u'')
+        text = text.replace(u'<P />', u'')
+        text = text.replace(u'<p />', u'')
+        text = text.replace(u'</P>', u'')
+        text = text.replace(u'<BR>', u'')
+        text = text.replace(u'<BR />', u'')
+        #text = text.replace(chr(189), u'1/2');print "l"
+        text = text.replace(u'&quot;', u'\"')
+        text = text.replace(u'&apos;', u'\'')
+        # Remove any remaining tags
+        start_tag = text.find(u'<')
+        while start_tag > -1 :
+            end_tag = text.find(u'>', start_tag)
+            text = text[:start_tag] + text[end_tag + 1:]
+            start_tag = text.find(u'<')
+        text = text.replace(u'>', u'')
         return text.rstrip()
+