Made some fixes to the common Bible import classes.

This commit is contained in:
Raoul Snyman 2009-07-07 22:18:36 +02:00
parent 1872f57840
commit 66898bad84


@@ -19,72 +19,119 @@ import os
import os.path
import sys
import urllib2
import chardet
import logging
class SearchResults:
"""
Encapsulate a set of search results. This is Bible-type independent.
"""
def __init__(self, book, chapter, verselist):
"""
Create the search result object.
``book``
The book of the Bible.
``chapter``
The chapter of the book.
``verselist``
The list of verses for this reading.
"""
self.book = book
self.chapter = chapter
self.verselist = verselist
def get_verselist(self):
return self.verselist
def get_book(self):
return self.book
def get_chapter(self):
return self.chapter
def has_verselist(self):
if self.verselist == {}:
return False
else:
return True
class BibleCommon:
def get_verselist(self):
"""
Returns the list of verses.
"""
return self.verselist
def get_book(self):
"""
Returns the book of the Bible.
"""
return self.book
def get_chapter(self):
"""
Returns the chapter of the book.
"""
return self.chapter
def has_verselist(self):
"""
Returns whether or not the verse list contains verses.
"""
return len(self.verselist) > 0
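A minimal usage sketch, not part of the commit, of how a downloader might wrap one chapter in SearchResults; the book name, chapter number and verse text below are invented sample data:

    results = SearchResults(u'John', 3, {1: u'Verse one text', 2: u'Verse two text'})
    if results.has_verselist():
        for verse, text in results.get_verselist().items():
            print u'%s %s:%s %s' % (results.get_book(), results.get_chapter(), verse, text)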
class BibleCommon(object):
"""
A common ancestor for bible download sites.
"""
global log
log = logging.getLogger(u'BibleCommon')
log.info(u'BibleCommon')
def __init__(self):
"""
An empty constructor... not sure why I'm here.
"""
pass
def _get_web_text(self, urlstring, proxyurl):
"""
Get the HTML from the web page.
``urlstring``
The URL of the page to open.
``proxyurl``
The URL of a proxy server used to access the Internet.
"""
log.debug(u'get_web_text %s %s', proxyurl, urlstring)
if not proxyurl == None:
if proxyurl is not None:
proxy_support = urllib2.ProxyHandler({'http': proxyurl})
http_support = urllib2.HTTPHandler()
opener = urllib2.build_opener(proxy_support, http_support)
urllib2.install_opener(opener)
xml_string = u''
req = urllib2.Request(urlstring)
req.add_header(u'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
req.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
try:
handle = urllib2.urlopen(req)
xml_string = unicode(handle.read())
html = handle.read()
details = chardet.detect(html)
xml_string = unicode(html, details['encoding'])
except IOError, e:
if hasattr(e, u'reason'):
log.error(u'Reason : ')
log.error( e.reason)
log.error(u'Reason : %s', e.reason)
return xml_string
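An illustrative sketch, not part of the commit, of the detect-then-decode step the new code performs: chardet guesses the encoding of the downloaded bytes and that guess drives the unicode() conversion. The byte string and the fallback encoding below are made-up sample data:

    html = '<h1>Gen\xe8se 1</h1>'                # bytes, as returned by handle.read()
    details = chardet.detect(html)               # e.g. {'encoding': 'ISO-8859-1', ...}
    encoding = details['encoding'] or 'utf-8'    # fall back if detection is inconclusive
    page = unicode(html, encoding)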
def _clean_text(self, text):
"""
Clean up text and remove extra characters
after been downloaded from web
Clean up text and remove extra characters after it has been downloaded
from the Internet.
``text``
The text from the web page that needs to be cleaned up.
"""
#return text.rstrip()
# Remove Headings from the Text
i = text.find(u'<h')
while i > -1:
j=text.find(u'</h', i)
text = text[ : (i - 1)]+text[(j+4)]
i = text.find(u'<h')
start_tag = text.find(u'<h')
while start_tag > -1:
end_tag = text.find(u'</h', start_tag)
text = text[:(start_tag - 1)] + text[(end_tag + 4):]
start_tag = text.find(u'<h')
# Remove Support References from the Text
x = text.find(u'<sup>')
while x > -1:
y = text.find(u'</sup>')
text= text[:x] + text[y + 6:len(text)]
x = text.find(u'<sup>')
start_tag = text.find(u'<sup>')
while start_tag > -1:
end_tag = text.find(u'</sup>')
text = text[:start_tag] + text[end_tag + 6:len(text)]
start_tag = text.find(u'<sup>')
# Static Clean ups
text = text.replace(u'\n', u'')
text = text.replace(u'\r', u'')
@@ -98,14 +145,14 @@ class BibleCommon:
text = text.replace(u'<BR>', u'')
text = text.replace(u'<BR />', u'')
#text = text.replace(chr(189), u'1/2');print "l"
text= text.replace(u'&quot;', "'")
text= text.replace(u'&apos;', "'")
i = text.find(u'<')
while i > -1 :
j = text.find(u'>', i)
text= text[:i] + text[j+1:]
i = text.find(u'<')
text = text.replace(u'&quot;', u'\"')
text = text.replace(u'&apos;', u'\'')
# Remove some other tags
start_tag = text.find(u'<')
while start_tag > -1 :
end_tag = text.find(u'>', start_tag)
text = text[:start_tag] + text[end_tag + 1:]
start_tag = text.find(u'<')
text = text.replace(u'>', u'')
return text.rstrip()
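A usage sketch, not part of the commit, of the kind of fragment _clean_text is meant to handle; the HTML below is invented sample data:

    raw = u'<p>For God so loved the world<sup>a</sup></p>\r\n'
    cleaned = BibleCommon()._clean_text(raw)
    # The <sup> footnote marker, the tags and the line breaks are stripped,
    # leaving essentially u'For God so loved the world'.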