Trying to regexify and make the OSIS importer more robust.

This commit is contained in:
Raoul Snyman 2009-12-23 23:09:07 +02:00
parent dc0f7e7f5a
commit 1181152c5a

View File

@ -28,6 +28,7 @@ import os.path
import logging
import chardet
import codecs
import re
from PyQt4 import QtCore
@ -53,7 +54,8 @@ class BibleOSISImpl():
A reference to a Bible database object.
"""
log.info(u'BibleOSISImpl Initialising')
self.verse_regex = re.compile(r'<verse osisID="([a-zA-Z0-9 ]*).([0-9]*).([0-9]*)">(.*)</verse>')
self.verse_regex = re.compile(r'<verse osisID="([a-zA-Z0-9 ]*).([0-9]*).([0-9]*)">(.*?)</verse>')
self.note_regex = re.compile(r'<note([a-zA-Z0-9 "=]*)>(.*?)</note>')
self.bibledb = bibledb
# books of the bible linked to bibleid {osis , name}
self.booksOfBible = {}
@ -109,84 +111,102 @@ class BibleOSISImpl():
osis = None
try:
osis = codecs.open(osisfile_record, u'r', details['encoding'])
book_ptr = None
count = 0
verseText = u'<verse osisID='
testament = 1
last_chapter = u'0'
for file_record in osis:
# cancel pressed on UI
if not self.loadbible:
break
pos = file_record.find(verseText)
# we have a verse
if pos > -1:
epos = file_record.find(u'>', pos)
# Book Reference
ref = file_record[pos+15:epos-1]
#lets find the bible text only
# find start of text
pos = epos + 1
# end of text
epos = file_record.find(u'</verse>', pos)
text = file_record[pos : epos]
#remove tags of extra information
text = self.remove_block(u'<title', u'</title>', text)
text = self.remove_block(u'<note', u'</note>', text)
text = self.remove_block(
u'<divineName', u'</divineName>', text)
text = self.remove_tag(u'<lb', text)
text = self.remove_tag(u'<q', text)
text = self.remove_tag(u'<l', text)
text = self.remove_tag(u'<lg', text)
# Strange tags where the end is not the same as the start
# They must be in this order as at least one bible has them
# crossing and the removal does not work.
pos = text.find(u'<FI>')
while pos > -1:
epos = text.find(u'<Fi>', pos)
if epos == -1: # TODO
pos = -1
match = self.verse_regex.search(file_record)
if match:
print 'Found:', match.group(4)
#if last_chapter != match.group(2):
print match.group(2)
# dialogobject.incrementProgressBar(
# u'Importing %s %s...' % \
# (self.books[match.group(1)], match.group(2)))
# last_chapter = match.group(2)
verse_text = match.group(4)
#verse_text = self.remove_block(
#verse_text = self.note_regex.sub(lambda match: u'', verse_text)
print verse_text
else:
text = text[:pos] + text[epos + 4: ]
pos = text.find(u'<FI>')
pos = text.find(u'<RF>')
while pos > -1:
epos = text.find(u'<Rf>', pos)
text = text[:pos] + text[epos + 4: ]
pos = text.find(u'<RF>')
print ref
continue
# split up the reference
p = ref.split(u'.', 3)
if book_ptr != p[0]:
# first time through
if book_ptr is None:
# set the max book size depending
# on the first book read
if p[0] == u'Gen':
dialogobject.ImportProgressBar.setMaximum(1188)
else:
dialogobject.ImportProgressBar.setMaximum(260)
# First book of NT
if p[0] == u'Matt':
testament += 1
dialogobject.incrementProgressBar(u'Importing %s %s...' % \
(self.booksOfBible[p[0]], p[1]))
Receiver.send_message(u'process_events')
self.bibledb.save_verses()
book_ptr = p[0]
book = self.bibledb.create_book(
unicode(self.booksOfBible[p[0]]),
unicode(self.abbrevOfBible[p[0]]),
testament)
count = 0
self.bibledb.add_verse(book.id, p[1], p[2], text)
#count += 1
#Every 3 verses repaint the screen
#if count % 3 == 0:
# Receiver.send_message(u'process_events')
print 'Not found...', file_record[:10]
# (self.booksOfBible[p[0]], p[1]))
# book_ptr = None
# count = 0
# verseText = u'<verse osisID='
# testament = 1
# for file_record in osis:
# # cancel pressed on UI
# if not self.loadbible:
# break
# pos = file_record.find(verseText)
# # we have a verse
# if pos > -1:
# epos = file_record.find(u'>', pos)
# # Book Reference
# ref = file_record[pos+15:epos-1]
# #lets find the bible text only
# # find start of text
# pos = epos + 1
# # end of text
# epos = file_record.find(u'</verse>', pos)
# text = file_record[pos : epos]
# #remove tags of extra information
# text = self.remove_block(u'<title', u'</title>', text)
# text = self.remove_block(u'<note', u'</note>', text)
# text = self.remove_block(
# u'<divineName', u'</divineName>', text)
# text = self.remove_tag(u'<lb', text)
# text = self.remove_tag(u'<q', text)
# text = self.remove_tag(u'<l', text)
# text = self.remove_tag(u'<lg', text)
# # Strange tags where the end is not the same as the start
# # They must be in this order as at least one bible has them
# # crossing and the removal does not work.
# pos = text.find(u'<FI>')
# while pos > -1:
# epos = text.find(u'<Fi>', pos)
# if epos == -1: # TODO
# pos = -1
# else:
# text = text[:pos] + text[epos + 4: ]
# pos = text.find(u'<FI>')
# pos = text.find(u'<RF>')
# while pos > -1:
# epos = text.find(u'<Rf>', pos)
# text = text[:pos] + text[epos + 4: ]
# pos = text.find(u'<RF>')
# print ref
# continue
# # split up the reference
# p = ref.split(u'.', 3)
# if book_ptr != p[0]:
# # first time through
# if book_ptr is None:
# # set the max book size depending
# # on the first book read
# if p[0] == u'Gen':
# dialogobject.ImportProgressBar.setMaximum(1188)
# else:
# dialogobject.ImportProgressBar.setMaximum(260)
# # First book of NT
# if p[0] == u'Matt':
# testament += 1
# dialogobject.incrementProgressBar(u'Importing %s %s...' % \
# (self.booksOfBible[p[0]], p[1]))
# Receiver.send_message(u'process_events')
# self.bibledb.save_verses()
# book_ptr = p[0]
# book = self.bibledb.create_book(
# unicode(self.booksOfBible[p[0]]),
# unicode(self.abbrevOfBible[p[0]]),
# testament)
# count = 0
# self.bibledb.add_verse(book.id, p[1], p[2], text)
# #count += 1
# #Every 3 verses repaint the screen
# #if count % 3 == 0:
# # Receiver.send_message(u'process_events')
# # count = 0
# #self.bibledb.save_verses()
except:
log.exception(u'Loading bible from OSIS file failed')
finally: