Fix html ripping bug for bibles

2009-07-12 16:11:57 +01:00 · 2009-07-12 16:11:57 +01:00 · 6c30e67723
commit 6c30e67723
parent f576e9d83d
2 changed files with 8 additions and 2 deletions
--- a/openlp/plugins/bibles/lib/bibleHTTPimpl.py
+++ b/openlp/plugins/bibles/lib/bibleHTTPimpl.py
@ -116,6 +116,7 @@ class CWExtract(BibleCommon):
        ## Strip Verse Data from Page and build an array
        ##
        #log.debug(u'bible data %s', xml_string)
+        #print xml_string
        i= xml_string.find(u'NavCurrentChapter')
        xml_string = xml_string[i:len(xml_string)]
        i= xml_string.find(u'<TABLE')
@ -136,11 +137,10 @@ class CWExtract(BibleCommon):
            i = xml_string.find(u'</I></B>', versePos)
            #log.debug( versePos, i)
            verse= xml_string[versePos:i] # Got the Chapter
-            #verse = int(temp)
            #log.debug( 'Chapter = %s', verse)
            # move the starting position to the beginning of the text
            versePos = i + 8
-            # fined the start of the next verse
+            # find the start of the next verse
            i = xml_string.find(u'<B><I>', versePos)
            if i == -1:
                i = xml_string.find(u'</BLOCKQUOTE>',versePos)
@ -150,6 +150,7 @@ class CWExtract(BibleCommon):
                #log.debug( i,  versePos)
                verseText = xml_string[versePos: i]
                versePos = i
+            #print verseText
            bible[verse] = self._clean_text(verseText)
            #bible[verse] = verseText

--- a/openlp/plugins/bibles/lib/common.py
+++ b/openlp/plugins/bibles/lib/common.py
@ -132,6 +132,11 @@ class BibleCommon(object):
            end_tag = text.find(u'</sup>')
            text = text[:start_tag] + text[end_tag + 6:len(text)]
            start_tag = text.find(u'<sup>')
+        start_tag = text.find(u'<SUP>')
+        while start_tag > -1:
+            end_tag = text.find(u'</SUP>')
+            text = text[:start_tag] + text[end_tag + 6:len(text)]
+            start_tag = text.find(u'<SUP>')
        # Static Clean ups
        text = text.replace(u'\n', u'')
        text = text.replace(u'\r', u'')