Trying to regexify and make the OSIS importer more robust.

This commit is contained in:
Raoul Snyman 2009-12-23 23:09:07 +02:00
parent dc0f7e7f5a
commit 1181152c5a

View File

@ -28,6 +28,7 @@ import os.path
import logging import logging
import chardet import chardet
import codecs import codecs
import re
from PyQt4 import QtCore from PyQt4 import QtCore
@ -53,7 +54,8 @@ class BibleOSISImpl():
A reference to a Bible database object. A reference to a Bible database object.
""" """
log.info(u'BibleOSISImpl Initialising') log.info(u'BibleOSISImpl Initialising')
self.verse_regex = re.compile(r'<verse osisID="([a-zA-Z0-9 ]*).([0-9]*).([0-9]*)">(.*)</verse>') self.verse_regex = re.compile(r'<verse osisID="([a-zA-Z0-9 ]*).([0-9]*).([0-9]*)">(.*?)</verse>')
self.note_regex = re.compile(r'<note([a-zA-Z0-9 "=]*)>(.*?)</note>')
self.bibledb = bibledb self.bibledb = bibledb
# books of the bible linked to bibleid {osis , name} # books of the bible linked to bibleid {osis , name}
self.booksOfBible = {} self.booksOfBible = {}
@ -61,7 +63,7 @@ class BibleOSISImpl():
self.abbrevOfBible = {} self.abbrevOfBible = {}
filepath = os.path.split(os.path.abspath(__file__))[0] filepath = os.path.split(os.path.abspath(__file__))[0]
filepath = os.path.abspath(os.path.join( filepath = os.path.abspath(os.path.join(
filepath, u'..', u'resources',u'osisbooks.csv')) filepath, u'..', u'resources', u'osisbooks.csv'))
fbibles = None fbibles = None
self.loadbible = True self.loadbible = True
try: try:
@ -109,84 +111,102 @@ class BibleOSISImpl():
osis = None osis = None
try: try:
osis = codecs.open(osisfile_record, u'r', details['encoding']) osis = codecs.open(osisfile_record, u'r', details['encoding'])
book_ptr = None last_chapter = u'0'
count = 0
verseText = u'<verse osisID='
testament = 1
for file_record in osis: for file_record in osis:
# cancel pressed on UI match = self.verse_regex.search(file_record)
if not self.loadbible: if match:
break print 'Found:', match.group(4)
pos = file_record.find(verseText) #if last_chapter != match.group(2):
# we have a verse print match.group(2)
if pos > -1: # dialogobject.incrementProgressBar(
epos = file_record.find(u'>', pos) # u'Importing %s %s...' % \
# Book Reference # (self.books[match.group(1)], match.group(2)))
ref = file_record[pos+15:epos-1] # last_chapter = match.group(2)
#lets find the bible text only verse_text = match.group(4)
# find start of text #verse_text = self.remove_block(
pos = epos + 1 #verse_text = self.note_regex.sub(lambda match: u'', verse_text)
# end of text print verse_text
epos = file_record.find(u'</verse>', pos)
text = file_record[pos : epos]
#remove tags of extra information
text = self.remove_block(u'<title', u'</title>', text)
text = self.remove_block(u'<note', u'</note>', text)
text = self.remove_block(
u'<divineName', u'</divineName>', text)
text = self.remove_tag(u'<lb', text)
text = self.remove_tag(u'<q', text)
text = self.remove_tag(u'<l', text)
text = self.remove_tag(u'<lg', text)
# Strange tags where the end is not the same as the start
# The must be in this order as at least one bible has them
# crossing and the removal does not work.
pos = text.find(u'<FI>')
while pos > -1:
epos = text.find(u'<Fi>', pos)
if epos == -1: # TODO
pos = -1
else: else:
text = text[:pos] + text[epos + 4: ] print 'Not found...', file_record[:10]
pos = text.find(u'<FI>') # (self.booksOfBible[p[0]], p[1]))
pos = text.find(u'<RF>') # book_ptr = None
while pos > -1: # count = 0
epos = text.find(u'<Rf>', pos) # verseText = u'<verse osisID='
text = text[:pos] + text[epos + 4: ] # testament = 1
pos = text.find(u'<RF>') # for file_record in osis:
print ref # # cancel pressed on UI
continue # if not self.loadbible:
# split up the reference # break
p = ref.split(u'.', 3) # pos = file_record.find(verseText)
if book_ptr != p[0]: # # we have a verse
# first time through # if pos > -1:
if book_ptr is None: # epos = file_record.find(u'>', pos)
# set the max book size depending # # Book Reference
# on the first book read # ref = file_record[pos+15:epos-1]
if p[0] == u'Gen': # #lets find the bible text only
dialogobject.ImportProgressBar.setMaximum(1188) # # find start of text
else: # pos = epos + 1
dialogobject.ImportProgressBar.setMaximum(260) # # end of text
# First book of NT # epos = file_record.find(u'</verse>', pos)
if p[0] == u'Matt': # text = file_record[pos : epos]
testament += 1 # #remove tags of extra information
dialogobject.incrementProgressBar(u'Importing %s %s...' % \ # text = self.remove_block(u'<title', u'</title>', text)
(self.booksOfBible[p[0]], p[1])) # text = self.remove_block(u'<note', u'</note>', text)
Receiver.send_message(u'process_events') # text = self.remove_block(
self.bibledb.save_verses() # u'<divineName', u'</divineName>', text)
book_ptr = p[0] # text = self.remove_tag(u'<lb', text)
book = self.bibledb.create_book( # text = self.remove_tag(u'<q', text)
unicode(self.booksOfBible[p[0]]), # text = self.remove_tag(u'<l', text)
unicode(self.abbrevOfBible[p[0]]), # text = self.remove_tag(u'<lg', text)
testament) # # Strange tags where the end is not the same as the start
count = 0 # # The must be in this order as at least one bible has them
self.bibledb.add_verse(book.id, p[1], p[2], text) # # crossing and the removal does not work.
#count += 1 # pos = text.find(u'<FI>')
#Every 3 verses repaint the screen # while pos > -1:
#if count % 3 == 0: # epos = text.find(u'<Fi>', pos)
# Receiver.send_message(u'process_events') # if epos == -1: # TODO
# count = 0 # pos = -1
#self.bibledb.save_verses() # else:
# text = text[:pos] + text[epos + 4: ]
# pos = text.find(u'<FI>')
# pos = text.find(u'<RF>')
# while pos > -1:
# epos = text.find(u'<Rf>', pos)
# text = text[:pos] + text[epos + 4: ]
# pos = text.find(u'<RF>')
# print ref
# continue
# # split up the reference
# p = ref.split(u'.', 3)
# if book_ptr != p[0]:
# # first time through
# if book_ptr is None:
# # set the max book size depending
# # on the first book read
# if p[0] == u'Gen':
# dialogobject.ImportProgressBar.setMaximum(1188)
# else:
# dialogobject.ImportProgressBar.setMaximum(260)
# # First book of NT
# if p[0] == u'Matt':
# testament += 1
# dialogobject.incrementProgressBar(u'Importing %s %s...' % \
# (self.booksOfBible[p[0]], p[1]))
# Receiver.send_message(u'process_events')
# self.bibledb.save_verses()
# book_ptr = p[0]
# book = self.bibledb.create_book(
# unicode(self.booksOfBible[p[0]]),
# unicode(self.abbrevOfBible[p[0]]),
# testament)
# count = 0
# self.bibledb.add_verse(book.id, p[1], p[2], text)
# #count += 1
# #Every 3 verses repaint the screen
# #if count % 3 == 0:
# # Receiver.send_message(u'process_events')
# # count = 0
# #self.bibledb.save_verses()
except: except:
log.exception(u'Loading bible from OSIS file failed') log.exception(u'Loading bible from OSIS file failed')
finally: finally: