Fix bug #595676 and improve the OSIS importer.

This commit is contained in:
Raoul Snyman 2010-06-24 20:35:01 +02:00
parent c7939ea03c
commit aa2d1586ea
1 changed files with 10 additions and 4 deletions

View File

@ -62,10 +62,13 @@ class OSISBible(BibleDB):
self.fi_regex = re.compile(r'<FI>(.*?)<Fi>')
self.rf_regex = re.compile(r'<RF>(.*?)<Rf>')
self.lb_regex = re.compile(r'<lb(.*?)>')
self.lg_regex = re.compile(r'<lg(.*?)>')
self.l_regex = re.compile(r'<l (.*?)>')
self.w_regex = re.compile(r'<w (.*?)>')
self.q_regex = re.compile(r'<q (.*?)>')
self.q1_regex = re.compile(r'<q(.*?)level="1"(.*?)>')
self.q2_regex = re.compile(r'<q(.*?)level="2"(.*?)>')
self.trans_regex = re.compile(r'<transChange(.*?)>(.*?)</transChange>')
self.divineName_regex = re.compile(r'<divineName(.*?)>(.*?)</divineName>')
self.spaces_regex = re.compile(r'([ ]{2,})')
self.books = {}
filepath = os.path.join(
@ -96,7 +99,7 @@ class OSISBible(BibleDB):
detect_file = None
try:
detect_file = open(self.filename, u'r')
details = chardet.detect(detect_file.read())
details = chardet.detect(detect_file.read(1048576))
except IOError:
log.exception(u'Failed to detect OSIS file encoding')
return
@ -150,11 +153,14 @@ class OSISBible(BibleDB):
verse_text = self.milestone_regex.sub(u'', verse_text)
verse_text = self.fi_regex.sub(u'', verse_text)
verse_text = self.rf_regex.sub(u'', verse_text)
verse_text = self.lb_regex.sub(u'', verse_text)
verse_text = self.lb_regex.sub(u' ', verse_text)
verse_text = self.lg_regex.sub(u'', verse_text)
verse_text = self.l_regex.sub(u'', verse_text)
verse_text = self.w_regex.sub(u'', verse_text)
verse_text = self.q_regex.sub(u'', verse_text)
verse_text = self.q1_regex.sub(u'"', verse_text)
verse_text = self.q2_regex.sub(u'\'', verse_text)
verse_text = self.trans_regex.sub(u'', verse_text)
verse_text = self.divineName_regex.sub(u'', verse_text)
verse_text = verse_text.replace(u'</lb>', u'')\
.replace(u'</l>', u'').replace(u'<lg>', u'')\
.replace(u'</lg>', u'').replace(u'</q>', u'')\