Fix HTML ripping bug for bibles

This commit is contained in:
Tim Bentley 2009-07-12 16:11:57 +01:00
parent f576e9d83d
commit 6c30e67723
2 changed files with 8 additions and 2 deletions

View File

@ -116,6 +116,7 @@ class CWExtract(BibleCommon):
## Strip Verse Data from Page and build an array ## Strip Verse Data from Page and build an array
## ##
#log.debug(u'bible data %s', xml_string) #log.debug(u'bible data %s', xml_string)
#print xml_string
i= xml_string.find(u'NavCurrentChapter') i= xml_string.find(u'NavCurrentChapter')
xml_string = xml_string[i:len(xml_string)] xml_string = xml_string[i:len(xml_string)]
i= xml_string.find(u'<TABLE') i= xml_string.find(u'<TABLE')
@ -136,11 +137,10 @@ class CWExtract(BibleCommon):
i = xml_string.find(u'</I></B>', versePos) i = xml_string.find(u'</I></B>', versePos)
#log.debug( versePos, i) #log.debug( versePos, i)
verse= xml_string[versePos:i] # Got the Chapter verse= xml_string[versePos:i] # Got the Chapter
#verse = int(temp)
#log.debug( 'Chapter = %s', verse) #log.debug( 'Chapter = %s', verse)
# move the starting position to beginning of the text # move the starting position to beginning of the text
versePos = i + 8 versePos = i + 8
# fined the start of the next verse # find the start of the next verse
i = xml_string.find(u'<B><I>', versePos) i = xml_string.find(u'<B><I>', versePos)
if i == -1: if i == -1:
i = xml_string.find(u'</BLOCKQUOTE>',versePos) i = xml_string.find(u'</BLOCKQUOTE>',versePos)
@ -150,6 +150,7 @@ class CWExtract(BibleCommon):
#log.debug( i, versePos) #log.debug( i, versePos)
verseText = xml_string[versePos: i] verseText = xml_string[versePos: i]
versePos = i versePos = i
#print verseText
bible[verse] = self._clean_text(verseText) bible[verse] = self._clean_text(verseText)
#bible[verse] = verseText #bible[verse] = verseText

View File

@ -132,6 +132,11 @@ class BibleCommon(object):
end_tag = text.find(u'</sup>') end_tag = text.find(u'</sup>')
text = text[:start_tag] + text[end_tag + 6:len(text)] text = text[:start_tag] + text[end_tag + 6:len(text)]
start_tag = text.find(u'<sup>') start_tag = text.find(u'<sup>')
start_tag = text.find(u'<SUP>')
while start_tag > -1:
end_tag = text.find(u'</SUP>')
text = text[:start_tag] + text[end_tag + 6:len(text)]
start_tag = text.find(u'<SUP>')
# Static Clean ups # Static Clean ups
text = text.replace(u'\n', u'') text = text.replace(u'\n', u'')
text = text.replace(u'\r', u'') text = text.replace(u'\r', u'')