# Forked from openlp/openlp (166 lines, 7.1 KiB, Python)
# -*- coding: utf-8 -*-
# vim: autoindent shiftwidth=4 expandtab textwidth=120 tabstop=4 softtabstop=4

###############################################################################
# OpenLP - Open Source Lyrics Projection #
# --------------------------------------------------------------------------- #
# Copyright (c) 2008-2019 OpenLP Developers #
# --------------------------------------------------------------------------- #
# This program is free software; you can redistribute it and/or modify it #
# under the terms of the GNU General Public License as published by the Free #
# Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, but WITHOUT #
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or #
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for #
# more details. #
# #
# You should have received a copy of the GNU General Public License along #
# with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA #
###############################################################################
|
|
import logging
|
|
import re
|
|
from tempfile import TemporaryDirectory
|
|
from zipfile import ZipFile
|
|
|
|
from bs4 import BeautifulSoup, NavigableString, Tag
|
|
|
|
from openlp.core.common.path import Path
|
|
from openlp.plugins.bibles.lib.bibleimport import BibleImport
|
|
|
|
|
|
# Matches the bracketed book number that precedes a book name in the index page, e.g. "[1]" in "[1] Genesis".
BOOK_NUMBER_PATTERN = re.compile(r'\[(\d+)\]')
# Matches any run of two or more whitespace characters (not referenced elsewhere in the visible source).
REPLACE_SPACES = re.compile(r'\s{2,}')

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
|
|
|
|
class WordProjectBible(BibleImport):
    """
    `WordProject <http://www.wordproaudio.com/>`_ Bible format importer class.

    The importer unpacks the zipped Bible into a temporary directory, reads the list of books from ``index.htm``,
    follows each book link to discover its chapters, and then parses one HTML file per chapter for the verses.
    """
    def _cleanup(self):
        """
        Clean up after ourselves by removing the temporary directory, if one was created.
        """
        # ``tmp`` is only set by ``_unzip_file``; guard so that cleaning up after a failed unzip cannot raise.
        tmp = getattr(self, 'tmp', None)
        if tmp is not None:
            tmp.cleanup()

    def _unzip_file(self):
        """
        Unzip the file to a temporary directory

        Sets ``self.tmp`` to the temporary directory object and ``self.base_path`` to the directory inside it that is
        named after the archive (the file name of ``self.file_path`` without its suffix).
        """
        self.tmp = TemporaryDirectory()
        with ZipFile(self.file_path) as zip_file:
            zip_file.extractall(self.tmp.name)
        self.base_path = Path(self.tmp.name, self.file_path.stem)

    def process_books(self):
        """
        Extract and create the bible books from the index page of the unzipped Bible

        Every ``<li>`` inside the ``textOptions`` div is treated as one book. Entries that have no usable link, book
        number or book name are logged and skipped.

        :return: None
        """
        page = (self.base_path / 'index.htm').read_text(encoding='utf-8', errors='ignore')
        soup = BeautifulSoup(page, 'lxml')
        bible_books = soup.find('div', 'textOptions').find_all('li')
        book_count = len(bible_books)
        for li_book in bible_books:
            log.debug(li_book)
            if self.stop_import_flag:
                break
            link = li_book.a
            # Both supported layouts need the anchor: it carries the book name and the link to the chapter list.
            if link is None or not link.contents or not link.get('href'):
                log.warning('Skipping book entry without a usable link: %s', li_book)
                continue
            # Sometimes the structure is "[1] <a>Genesis</a>", and sometimes it's "<a>[1] Genesis</a>"
            first_item = li_book.contents[0]
            if isinstance(first_item, NavigableString) and str(first_item).strip():
                book_string = str(first_item)
                book_name = str(link.contents[0])
            else:
                # partition() never raises, unlike unpacking split(' ', 1) when the text contains no space
                book_string, _, book_name = str(link.contents[0]).partition(' ')
            number_match = BOOK_NUMBER_PATTERN.search(book_string)
            book_name = book_name.strip()
            if number_match is None or not book_name:
                log.warning('Skipping book entry without a book number or name: %s', li_book)
                continue
            book_link = link['href']
            book_id = int(number_match.group(1))
            db_book = self.find_and_create_book(book_name, book_count, self.language_id, book_id)
            self.process_chapters(db_book, book_id, book_link)
            self.session.commit()

    def process_chapters(self, db_book, book_id, book_link):
        """
        Extract the chapters, and do some initial processing of the verses

        :param db_book: An OpenLP bible database book object
        :param book_id: The number of the book, as found in the index page (int)
        :param book_link: The link to the book's page, relative to the base path of the unzipped Bible
        :return: None
        """
        log.debug(book_link)
        page = (self.base_path / book_link).read_text(encoding='utf-8', errors='ignore')
        soup = BeautifulSoup(page, 'lxml')
        header_div = soup.find('div', 'textHeader')
        # The chapter list normally lives in the header div; fall back to the first paragraph of the page when
        # either the div or its paragraph is missing.
        chapters_p = header_div.find('p') if header_div is not None else None
        if chapters_p is None:
            chapters_p = soup.p
        log.debug(chapters_p)
        for item in chapters_p.contents:
            if self.stop_import_flag:
                break
            if isinstance(item, Tag) and item.name in ['a', 'span']:
                # get_text() also copes with nested markup, where ``item.string`` would be None
                chapter_number = int(item.get_text().strip())
                self.set_current_chapter(db_book.name, chapter_number)
                self.process_verses(db_book, book_id, chapter_number)

    def process_verses(self, db_book, book_number, chapter_number):
        """
        Get the verses for a particular chapter of a book

        The chapter is read from ``<book number, two digits>/<chapter number>.htm`` below the base path. Text is
        accumulated until the next element with the ``verse`` class, which carries the number of the following verse.

        :param db_book: An OpenLP bible database book object
        :param book_number: The number of the book, used to build the directory name (int)
        :param chapter_number: The number of the chapter, used to build the file name (int)
        :return: None
        """
        chapter_file_path = self.base_path / '{:02d}'.format(book_number) / '{}.htm'.format(chapter_number)
        page = chapter_file_path.read_text(encoding='utf-8', errors='ignore')
        soup = BeautifulSoup(page, 'lxml')
        text_body = soup.find('div', 'textBody')
        if text_body:
            verses_p = text_body.find('p')
        else:
            # Pages without a textBody div: the verses are taken from the third paragraph of the page
            verses_p = soup.find_all('p')[2]
        verse_number = 0
        verse_text = ''
        for item in verses_p.contents:
            if self.stop_import_flag:
                break
            if isinstance(item, Tag) and 'verse' in item.get('class', []):
                # A verse marker ends the previous verse; text seen before the first marker is not stored here
                if verse_number > 0:
                    self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
                verse_number = int(item.get_text().strip())
                verse_text = ''
            elif isinstance(item, NavigableString):
                verse_text += str(item)
            elif isinstance(item, Tag) and item.name in ['span', 'a']:
                # ``item.string`` is None for tags with nested children, which would add the text "None"
                verse_text += item.get_text()
            else:
                log.warning('Can\'t store %s', item)
        # Store whatever text has been collected since the last verse marker
        self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())

    def process_verse(self, db_book, chapter_number, verse_number, verse_text):
        """
        Store a single verse, unless its text is empty

        :param db_book: A database Book object
        :param chapter_number: The chapter number to add the verse to (int)
        :param verse_number: The number of the verse (int)
        :param verse_text: The text of the verse; surrounding whitespace is removed before it is stored
        :return: None
        """
        verse_text = verse_text.strip()
        if verse_text:
            log.debug('%s %s:%s %s', db_book.name, chapter_number, verse_number, verse_text)
            self.create_verse(db_book.id, chapter_number, verse_number, verse_text)

    def do_import(self, bible_name=None):
        """
        Loads a Bible from file.

        :param bible_name: Not used by this importer; the path of the file is used to look up the language
        :return: True if a language was determined and the books were processed, otherwise False
        """
        self.log_debug('Starting WordProject import from "{name}"'.format(name=self.file_path))
        result = False
        try:
            self._unzip_file()
            self.language_id = self.get_language_id(None, bible_name=str(self.file_path))
            if self.language_id:
                self.process_books()
                result = True
        finally:
            # Always remove the temporary directory, even when unzipping or parsing fails
            self._cleanup()
        return result
|