Fix up text formatting / stripping from websites

bzr-revno: 88
This commit is contained in:
Tim Bentley 2008-11-13 20:51:37 +00:00
parent d81d037f2b
commit fa9409f874
2 changed files with 21 additions and 23 deletions

View File

@ -34,22 +34,23 @@ class BibleCommon:
def _cleanText(self, text): def _cleanText(self, text):
""" """
Clean up text and remove extra characters Clean up text and remove extra characters
after being downloaded from the web
""" """
# text = text.replace('\n', '') # Remove Headings from the Text
# text = text.replace('\r', '') i = text.find("<h")
# text = text.replace('&nbsp;', '') while i > -1:
# text = text.replace('<P>', '') j=text.find("</h", i)
# text = text.replace('"', '') text = text[ : (i - 1)]+text[(j+4)]
i = text.find("<h")
# Remove superscript references (<sup>...</sup>) from the text
x = text.find("<sup>") x = text.find("<sup>")
while x > -1: while x > -1:
y = text.find("</sup>") y = text.find("</sup>")
#print x, y
#print verseText[:x]
#print verseText[y + 6:len(verseText)]
text= text[:x] + text[y + 6:len(text)] text= text[:x] + text[y + 6:len(text)]
x = text.find("<sup>") x = text.find("<sup>")
#print "text= " + text
# Static Clean ups
text= text.replace('\n', '') text= text.replace('\n', '')
text= text.replace('\r', '') text= text.replace('\r', '')
text= text.replace('&nbsp;', '') text= text.replace('&nbsp;', '')
@ -64,14 +65,11 @@ class BibleCommon:
text= text.replace(chr(189), '1/2') text= text.replace(chr(189), '1/2')
text= text.replace("&quot;", '"') text= text.replace("&quot;", '"')
text= text.replace("&apos;", "'") text= text.replace("&apos;", "'")
x = text.find("<") i = text.find("<")
#print verseText while i > -1 :
# while x > -1: j = text.find(">", i)
# y = text.find(">") text= text[:i] + text[j+1:]
# #print x , y i = text.find("<")
# #print verseText[:x-1]
# #print verseText[y : y-1]
# text= text[:x] + text[y+1 : len(text)]
# x = text.find("<")
text= text.replace('>', '') text= text.replace('>', '')
return text.rstrip() return text.rstrip()

View File

@ -98,7 +98,7 @@ class BibleHTTPImpl(BibleCommon):
versePos = xml_string.find(VerseSearch) versePos = xml_string.find(VerseSearch)
#print versePos #print versePos
bible = {} bible = {}
while versePos > 0: while versePos > -1:
verseText = "" # clear out string verseText = "" # clear out string
versePos = xml_string.find("</span", versePos) versePos = xml_string.find("</span", versePos)
i = xml_string.find(VerseSearch, versePos+1) i = xml_string.find(VerseSearch, versePos+1)
@ -111,13 +111,13 @@ class BibleHTTPImpl(BibleCommon):
i = j i = j
verseText = xml_string[versePos + 7 : i ] verseText = xml_string[versePos + 7 : i ]
#print xml_string #print xml_string
print "VerseText = " + str(verse) +" "+ verseText #print "VerseText = " + str(verse) +" "+ verseText
bible[verse] = self._cleanText(verseText) # store the verse bible[verse] = self._cleanText(verseText) # store the verse
versePos = 0 versePos = -1
else: else:
i = xml_string[:i].rfind("<span")+1 i = xml_string[:i].rfind("<span")+1
verseText = xml_string[versePos + 7 : i ] # Lose the </span> verseText = xml_string[versePos + 7 : i - 1 ] # Lose the </span>
xml_string = xml_string[i:len(xml_string)] # chop off verse 1 xml_string = xml_string[i - 1 :len(xml_string)] # chop off verse 1
versePos = xml_string.find(VerseSearch) #look for the next verse versePos = xml_string.find(VerseSearch) #look for the next verse
bible[verse] = self._cleanText(verseText) # store the verse bible[verse] = self._cleanText(verseText) # store the verse
verse += 1 verse += 1