Fix up formatting and HTML tag stripping of text downloaded from websites

bzr-revno: 88
This commit is contained in:
Tim Bentley 2008-11-13 20:51:37 +00:00
parent d81d037f2b
commit fa9409f874
2 changed files with 21 additions and 23 deletions

View File

@ -34,22 +34,23 @@ class BibleCommon:
def _cleanText(self, text):
"""
Clean up text and remove extra characters
after been downloaded from web
"""
# text = text.replace('\n', '')
# text = text.replace('\r', '')
# text = text.replace(' ', '')
# text = text.replace('<P>', '')
# text = text.replace('"', '')
# Remove Headings from the Text
i = text.find("<h")
while i > -1:
j=text.find("</h", i)
text = text[ : (i - 1)]+text[(j+4)]
i = text.find("<h")
# Remove Support References from the Text
x = text.find("<sup>")
while x > -1:
y = text.find("</sup>")
#print x, y
#print verseText[:x]
#print verseText[y + 6:len(verseText)]
text= text[:x] + text[y + 6:len(text)]
x = text.find("<sup>")
#print "text= " + text
# Static Clean ups
text= text.replace('\n', '')
text= text.replace('\r', '')
text= text.replace('&nbsp;', '')
@ -64,14 +65,11 @@ class BibleCommon:
text= text.replace(chr(189), '1/2')
text= text.replace("&quot;", '"')
text= text.replace("&apos;", "'")
x = text.find("<")
#print verseText
# while x > -1:
# y = text.find(">")
# #print x , y
# #print verseText[:x-1]
# #print verseText[y : y-1]
# text= text[:x] + text[y+1 : len(text)]
# x = text.find("<")
i = text.find("<")
while i > -1 :
j = text.find(">", i)
text= text[:i] + text[j+1:]
i = text.find("<")
text= text.replace('>', '')
return text.rstrip()

View File

@ -98,7 +98,7 @@ class BibleHTTPImpl(BibleCommon):
versePos = xml_string.find(VerseSearch)
#print versePos
bible = {}
while versePos > 0:
while versePos > -1:
verseText = "" # clear out string
versePos = xml_string.find("</span", versePos)
i = xml_string.find(VerseSearch, versePos+1)
@ -111,13 +111,13 @@ class BibleHTTPImpl(BibleCommon):
i = j
verseText = xml_string[versePos + 7 : i ]
#print xml_string
print "VerseText = " + str(verse) +" "+ verseText
#print "VerseText = " + str(verse) +" "+ verseText
bible[verse] = self._cleanText(verseText) # store the verse
versePos = 0
versePos = -1
else:
i = xml_string[:i].rfind("<span")+1
verseText = xml_string[versePos + 7 : i ] # Loose the </span>
xml_string = xml_string[i:len(xml_string)] # chop off verse 1
verseText = xml_string[versePos + 7 : i - 1 ] # Loose the </span>
xml_string = xml_string[i - 1 :len(xml_string)] # chop off verse 1
versePos = xml_string.find(VerseSearch) #look for the next verse
bible[verse] = self._cleanText(verseText) # store the verse
verse += 1