') - self.q1_regex = re.compile(r'') - self.q2_regex = re.compile(r'') - self.trans_regex = re.compile(r'(.*?) ') - self.divine_name_regex = re.compile(r'(.*?) ') - self.spaces_regex = re.compile(r'([ ]{2,})') - filepath = os.path.join( - AppLocation.get_directory(AppLocation.PluginsDir), 'bibles', 'resources', 'osisbooks.csv') def do_import(self, bible_name=None): """ Loads a Bible from file. """ log.debug('Starting OSIS import from "%s"' % self.filename) - detect_file = None - db_book = None - osis = None + if not isinstance(self.filename, str): + self.filename = str(self.filename, 'utf8') + import_file = None success = True - last_chapter = 0 - match_count = 0 - self.wizard.increment_progress_bar( - translate('BiblesPlugin.OsisImport', 'Detecting encoding (this may take a few minutes)...')) try: - detect_file = open(self.filename, 'r') - details = chardet.detect(detect_file.read(1048576)) - detect_file.seek(0) - lines_in_file = int(len(detect_file.readlines())) - except IOError: - log.exception('Failed to detect OSIS file encoding') - return - finally: - if detect_file: - detect_file.close() - try: - osis = codecs.open(self.filename, 'r', details['encoding']) - repl = replacement - language_id = False - # Decide if the bible probably contains only NT or AT and NT or - # AT, NT and Apocrypha - if lines_in_file < 11500: - book_count = 27 - chapter_count = 260 - elif lines_in_file < 34200: - book_count = 66 - chapter_count = 1188 - else: - book_count = 67 - chapter_count = 1336 - for file_record in osis: + # NOTE: We don't need to do any of the normal encoding detection here, because lxml does it's own encoding + # detection, and the two mechanisms together interfere with each other. 
+ import_file = open(self.filename, 'rb') + osis_bible_tree = etree.parse(import_file) + namespace = {'ns': 'http://www.bibletechnologies.net/2003/OSIS/namespace'} + # Find bible language + language_id = None + language = osis_bible_tree.xpath("//ns:osisText/@xml:lang", namespaces=namespace) + if language: + language_id = BiblesResourcesDB.get_language(language[0]) + # The language couldn't be detected, ask the user + if not language_id: + language_id = self.get_language(bible_name) + if not language_id: + log.error('Importing books from "%s" failed' % self.filename) + return False + num_books = int(osis_bible_tree.xpath("count(//ns:div[@type='book'])", namespaces=namespace)) + self.wizard.increment_progress_bar(translate('BiblesPlugin.OsisImport', + 'Removing unused tags (this may take a few minutes)...')) + # We strip unused tags from the XML, this should leave us with only chapter, verse and div tags. + # Strip tags we don't use - remove content + etree.strip_elements(osis_bible_tree, ('{http://www.bibletechnologies.net/2003/OSIS/namespace}note', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}milestone', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}title', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}abbr', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}catchWord', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}index', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}rdg', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}rdgGroup', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}figure'), + with_tail=False) + # Strip tags we don't use - keep content + etree.strip_tags(osis_bible_tree, ('{http://www.bibletechnologies.net/2003/OSIS/namespace}p', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}l', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}lg', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}q', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}a', 
+ '{http://www.bibletechnologies.net/2003/OSIS/namespace}w', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}divineName', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}foreign', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}hi', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}inscription', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}mentioned', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}name', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}reference', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}seg', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}transChange', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}salute', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}signed', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}closer', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}speech', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}speaker', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}list', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}item', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}table', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}head', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}row', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}cell', + '{http://www.bibletechnologies.net/2003/OSIS/namespace}caption')) + # Precompile a few xpath-querys + verse_in_chapter = etree.XPath('count(//ns:chapter[1]/ns:verse)', namespaces=namespace) + text_in_verse = etree.XPath('count(//ns:verse[1]/text())', namespaces=namespace) + # Find books in the bible + bible_books = osis_bible_tree.xpath("//ns:div[@type='book']", namespaces=namespace) + for book in bible_books: if self.stop_import_flag: break - # Try to find the bible language - if not language_id: - language_match = self.language_regex.search(file_record) - if language_match: - language = 
BiblesResourcesDB.get_language( - language_match.group(1)) - if language: - language_id = language['id'] - self.save_meta('language_id', language_id) - continue - match = self.verse_regex.search(file_record) - if match: - # Set meta language_id if not detected till now - if not language_id: - language_id = self.get_language(bible_name) - if not language_id: - log.error('Importing books from "%s" failed' % self.filename) - return False - match_count += 1 - book = str(match.group(1)) - chapter = int(match.group(2)) - verse = int(match.group(3)) - verse_text = match.group(4) - book_ref_id = self.get_book_ref_id_by_name(book, book_count, language_id) - if not book_ref_id: - log.error('Importing books from "%s" failed' % self.filename) - return False - book_details = BiblesResourcesDB.get_book_by_id(book_ref_id) - if not db_book or db_book.name != book_details['name']: - log.debug('New book: "%s"' % book_details['name']) - db_book = self.create_book( - book_details['name'], - book_ref_id, - book_details['testament_id']) - if last_chapter == 0: - self.wizard.progress_bar.setMaximum(chapter_count) - if last_chapter != chapter: - if last_chapter != 0: - self.session.commit() + # Remove div-tags in the book + etree.strip_tags(book, ('{http://www.bibletechnologies.net/2003/OSIS/namespace}div')) + book_ref_id = self.get_book_ref_id_by_name(book.get('osisID'), num_books) + if not book_ref_id: + book_ref_id = self.get_book_ref_id_by_localised_name(book.get('osisID')) + if not book_ref_id: + log.error('Importing books from "%s" failed' % self.filename) + return False + book_details = BiblesResourcesDB.get_book_by_id(book_ref_id) + db_book = self.create_book(book_details['name'], book_ref_id, book_details['testament_id']) + # Find out if chapter-tags contains the verses, or if it is used as milestone/anchor + if int(verse_in_chapter(book)) > 0: + # The chapter tags contains the verses + for chapter in book: + chapter_number = chapter.get("osisID").split('.')[1] + # Find out if 
verse-tags contains the text, or if it is used as milestone/anchor + if int(text_in_verse(chapter)) == 0: + # verse-tags are used as milestone + for verse in chapter: + # If this tag marks the start of a verse, the verse text is between this tag and + # the next tag, which the "tail" attribute gives us. + if verse.get('sID'): + verse_number = verse.get("osisID").split('.')[2] + verse_text = verse.tail + if verse_text: + self.create_verse(db_book.id, chapter_number, verse_number, verse_text.strip()) + else: + # Verse-tags contains the text + for verse in chapter: + verse_number = verse.get("osisID").split('.')[2] + self.create_verse(db_book.id, chapter_number, verse_number, verse.text.strip()) self.wizard.increment_progress_bar( - translate('BiblesPlugin.OsisImport', 'Importing %s %s...', - 'Importing...') % (book_details['name'], chapter)) - last_chapter = chapter - # All of this rigmarole below is because the mod2osis tool from the Sword library embeds XML in the - # OSIS but neglects to enclose the verse text (with XML) in <[CDATA[ ]]> tags. - verse_text = self.note_regex.sub('', verse_text) - verse_text = self.title_regex.sub('', verse_text) - verse_text = self.milestone_regex.sub('', verse_text) - verse_text = self.fi_regex.sub('', verse_text) - verse_text = self.rf_regex.sub('', verse_text) - verse_text = self.lb_regex.sub(' ', verse_text) - verse_text = self.lg_regex.sub('', verse_text) - verse_text = self.l_regex.sub(' ', verse_text) - verse_text = self.w_regex.sub('', verse_text) - verse_text = self.q1_regex.sub('"', verse_text) - verse_text = self.q2_regex.sub('\'', verse_text) - verse_text = self.q_regex.sub('', verse_text) - verse_text = self.divine_name_regex.sub(repl, verse_text) - verse_text = self.trans_regex.sub('', verse_text) - verse_text = verse_text.replace('
This draft version of the World English Bible is +substantially complete in the New Testament, Genesis, Exodus, Job, Psalms, Proverbs, Ecclesiastes, Song of Solomon, and the “minor” prophets. Editing continues on the other books of the Old Testament. All WEB companion Apocrypha books are still in +rough draft form.
+Converted web.gbf in GBF to web.osis.xml in +an XML format that is mostly compliant with OSIS 2.0 using gbf2osis.exe. +(Please see http://ebt.cx/translation/ for links to this software.)
+GBF and OSIS metadata fields do not exactly correspond to each other, so +the conversion is not perfect in the metadata. However, the Scripture portion +should be correct.
No attempt was made to convert quotation marks to structural markers using q or +speech elements, because this would require language and style-dependent +processing, and because the current OSIS specification is deficient in that +quotation mark processing is not guaranteed to produce the correct results +for all languages and translations. In English texts, the hard part of the +conversion to markup is figuring out what ’ means. +The other difficulty is that OSIS in no way guarantees that these punctuation +marks would be reconstituted properly by software that reads OSIS files +for anything other than modern English, and even then, it does not +accommodate all styles of punctuation and all cases. +We strongly recommend that anyone using OSIS NOT replace quotation mark +punctuation in any existing text with q or speech elements. It is better +for multiple language processing capabilities to leave the quotation +punctuation as part of the text. If you need the q or speech markup, then you +may supplement those punctuation marks with those markup elements, but specify +the n='' parameter in those elements to indicate that no generation of any +punctuation from those markup elements is required or desired. That way you +can have BOTH correct punctuation already in the text AND markup so that you +can automatically determine when you are in a quotation or not, independent +of language. This may be useful for a search by speaker, for example.
+The output of gbf2osis marks Jesus' words in a non-standard way using the q +element AND quotation marks if they were marked with FR/Fr markers in the GBF +file. The OSIS 2.0 specification requires that quotation marks be stripped out, +and reinserted by software that reads the OSIS files when q elements are used. +This is not acceptable for the reasons given above, and we choose not to do +that, but we used the q element with who='Jesus' to indicate Jesus' words. +Do not generate any additional punctuation due to these markers. The correct +punctuation is already in the text.
+OSIS does not currently support footnote start anchors. Therefore, these +start anchors have been represented with milestone elements, in case someone +might like to use them, for example, to start an href element in a conversion +to HTML. (OSIS sort of supports the same idea by allowing a catchword to be +defined within a footnote, but I did not implement the processing to convert +to this different way of doing things, and it isn't exactly the same, anyway.)
+Traditional psalm book titles are rendered as text rather than titles, because +the title element does not support containing transChange elements, as would be +required to encode the KJV text using OSIS title elements. This may actually be +a superior solution, anyway, in that the Masoretic text makes no such distinction +(even though many modern typeset Bibles do make a typographic distinction in this +case).
+The schema location headers were modified to use local copies rather than the +standard locations so that these files could be validated and used without an +Internet connection active at all times (very important for the developer's +remote island location), but you may wish to change them back.
+
+
+
+
+
+