openlp/openlp/plugins/bibles/lib/importers/wordproject.py

185 lines
8.1 KiB
Python

# -*- coding: utf-8 -*-
##########################################################################
# OpenLP - Open Source Lyrics Projection #
# ---------------------------------------------------------------------- #
# Copyright (c) 2008-2022 OpenLP Developers #
# ---------------------------------------------------------------------- #
# This program is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program. If not, see <https://www.gnu.org/licenses/>. #
##########################################################################
import logging
import re
from pathlib import Path
from tempfile import TemporaryDirectory
from zipfile import ZipFile, BadZipFile
from bs4 import BeautifulSoup, NavigableString, Tag
from openlp.core.common.i18n import translate
from openlp.core.lib.ui import critical_error_message_box
from openlp.plugins.bibles.lib.bibleimport import BibleImport
BOOK_NUMBER_PATTERN = re.compile(r'\[(\d+)\]')
REPLACE_SPACES = re.compile(r'\s{2,}')
log = logging.getLogger(__name__)
class WordProjectBible(BibleImport):
"""
`WordProject <http://www.wordproaudio.com/>`_ Bible format importer class.
"""
def _cleanup(self):
"""
Clean up after ourselves
"""
self.tmp.cleanup()
def _unzip_file(self):
"""
Unzip the file to a temporary directory
"""
self.tmp = TemporaryDirectory()
try:
with ZipFile(self.file_path) as zip_file:
zip_file.extractall(self.tmp.name)
except BadZipFile:
self.log_exception('Extracting {file} failed.'.format(file=self.file_path))
critical_error_message_box(message=translate('BiblesPlugin.WordProjectBible',
'Incorrect Bible file type, not a Zip file.'))
return False
self.base_path = Path(self.tmp.name, self.file_path.stem)
return True
def process_books(self):
"""
Extract and create the bible books from the parsed html
:param bible_data: parsed xml
:return: True if books was parsed, otherwise False
"""
idx_file = (self.base_path / 'index.htm')
if not idx_file.exists():
critical_error_message_box(message=translate('BiblesPlugin.WordProjectBible',
'Incorrect Bible file type, files are missing.'))
return False
page = idx_file.read_text(encoding='utf-8', errors='ignore')
soup = BeautifulSoup(page, 'lxml')
bible_books = soup.find('div', 'textOptions').find_all('li')
book_count = len(bible_books)
for li_book in bible_books:
log.debug(li_book)
if self.stop_import_flag:
break
if isinstance(li_book.contents[0], NavigableString) and str(li_book.contents[0]).strip():
# Sometimes the structure is "[1] <a>Genesis</a>", and sometimes it's "<a>[1] Genesis</a>"
book_string = str(li_book.contents[0])
book_name = str(li_book.a.contents[0])
elif li_book.a and ']' in li_book.a.contents[0]:
# Sometimes there is a [1] in the link text
book_string, book_name = str(li_book.a.contents[0]).split(' ', 1)
elif li_book.a and 'title' in li_book.a.attrs:
# And sometimes there's a title on the a with the number in it
book_string = li_book.a['title'].split(' ')[0]
book_name = li_book.a.contents[0].strip()
book_link = li_book.a['href']
book_id = int(BOOK_NUMBER_PATTERN.search(book_string).group(1))
book_name = book_name.strip()
db_book = self.find_and_create_book(book_name, book_count, self.language_id, book_id)
self.process_chapters(db_book, book_id, book_link)
self.session.commit()
return True
def process_chapters(self, db_book, book_id, book_link):
"""
Extract the chapters, and do some initial processing of the verses
:param book: An OpenLP bible database book object
:param chapters: parsed chapters
:return: None
"""
log.debug(book_link)
page = (self.base_path / book_link).read_text(encoding='utf-8', errors='ignore')
soup = BeautifulSoup(page, 'lxml')
header_div = soup.find('div', 'textHeader')
chapters_p = header_div.find('p')
if not chapters_p:
chapters_p = soup.p
log.debug(chapters_p)
for item in chapters_p.contents:
if self.stop_import_flag:
break
if isinstance(item, Tag) and item.name in ['a', 'span']:
chapter_number = int(item.string.strip())
self.set_current_chapter(db_book.name, chapter_number)
self.process_verses(db_book, book_id, chapter_number)
def process_verses(self, db_book, book_number, chapter_number):
"""
Get the verses for a particular book
"""
chapter_file_path = self.base_path / '{:02d}'.format(book_number) / '{}.htm'.format(chapter_number)
page = chapter_file_path.read_text(encoding='utf-8', errors='ignore')
soup = BeautifulSoup(page, 'lxml')
text_body = soup.find('div', 'textBody')
if text_body:
verses_p = text_body.find('p')
else:
verses_p = soup.find_all('p')[2]
verse_number = 0
verse_text = ''
for item in verses_p.contents:
if self.stop_import_flag:
break
if isinstance(item, Tag) and 'verse' in item.get('class', []):
if verse_number > 0:
self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
verse_number = int(item.string.strip())
verse_text = ''
elif isinstance(item, NavigableString):
verse_text += str(item)
elif isinstance(item, Tag) and item.name in ['span', 'a']:
verse_text += str(item.string)
else:
log.warning('Can\'t store %s', item)
self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
def process_verse(self, db_book, chapter_number, verse_number, verse_text):
"""
Process a verse element
:param book: A database Book object
:param chapter_number: The chapter number to add the verses to (int)
:param element: The verse element to process. (etree element type)
:param use_milestones: set to True to process a 'milestone' verse. Defaults to False
:return: None
"""
if verse_text:
log.debug('%s %s:%s %s', db_book.name, chapter_number, verse_number, verse_text.strip())
self.create_verse(db_book.id, chapter_number, verse_number, verse_text.strip())
def do_import(self, bible_name=None):
"""
Loads a Bible from file.
"""
self.log_debug('Starting WordProject import from "{name}"'.format(name=self.file_path))
if not self._unzip_file():
return False
self.language_id = self.get_language_id(None, bible_name=str(self.file_path))
result = False
if self.language_id:
result = self.process_books()
self._cleanup()
return result