Made the WordProject import more robust

This commit is contained in:
Tomas Groth 2024-01-09 17:26:36 +00:00 committed by Raoul Snyman
parent 6fa5075b7b
commit d725627c16
3 changed files with 35 additions and 3 deletions

View File

@ -122,11 +122,16 @@ class WordProjectBible(BibleImport):
log.debug('Cannot find chapters, using all p instead')
chapters_p = soup.p
log.debug(chapters_p)
chapter_number = 0
for item in chapters_p.contents:
if self.stop_import_flag:
break
if isinstance(item, Tag) and item.name in ['a', 'span']:
chapter_number = int(item.string.strip())
old_chapter_number = chapter_number
chapter_number = self.get_number(item.string.strip())
if chapter_number is None:
# attempt to fix a broken bible input by assuming the next chapter is just an increment
chapter_number = old_chapter_number + 1
self.set_current_chapter(db_book.name, chapter_number)
self.process_verses(db_book, book_id, chapter_number)
@ -150,7 +155,11 @@ class WordProjectBible(BibleImport):
if isinstance(item, Tag) and 'verse' in item.get('class', []):
if verse_number > 0:
self.process_verse(db_book, chapter_number, verse_number, verse_text.strip())
verse_number = int(item.string.strip())
old_verse_number = verse_number
verse_number = self.get_number(item.string.strip())
if verse_number is None:
# attempt to fix a broken bible input by assuming the next verse is just an increment
verse_number = old_verse_number + 1
verse_text = ''
elif isinstance(item, NavigableString):
verse_text += str(item)
@ -186,3 +195,14 @@ class WordProjectBible(BibleImport):
result = self.process_books()
self._cleanup()
return result
def get_number(self, input_: str) -> None | int:
"""
Given a string extracts the integer value from the beginning of the string, stopping at first non-numeric char
:param str: input string
:return: integer if found, else None
"""
if result := re.findall(r'\d+', input_):
return int(result[0])
else:
return None

View File

@ -21,6 +21,7 @@
"""
This module contains tests for the WordProject Bible importer.
"""
import pytest
from pathlib import Path
from unittest.mock import MagicMock, call, patch
@ -232,3 +233,14 @@ def test_do_import_no_language(settings):
assert mocked_process_books.call_count == 0
mocked_cleanup.assert_called_once_with()
assert result is False
@pytest.mark.parametrize('input_, output', [
    ('1', 1),
    ('02.', 2),
    ('3-4', 3),
    ('exodus', None),
])
def test_get_number(input_: str, output: int | None, settings):
    """
    Test that get_number() returns the first number found in a string, or None when there are no digits
    """
    # GIVEN: A WordProjectBible importer
    importer = WordProjectBible(MagicMock(), path='.', name='.', file_path='dummy.zip')

    # WHEN: get_number() is called with the parametrized input
    result = importer.get_number(input_)

    # THEN: The expected number (or None for input without digits) should be returned
    assert result == output

View File

@ -172,7 +172,7 @@
<!--... the Word of God:--></a>
<p><span class="verse" id="1">1</span> In the beginning God created the heaven and the earth.
<br /><span class="verse" id="2">2</span> And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.
<br /><span class="verse" id="2">02.</span> And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.
<br /><span class="verse" id="3">3</span> And God said, Let there be light: and there was light.
<br /><span class="verse" id="4">4</span> And God saw the light, that it was good: and God divided the light from the darkness.
<br /><span class="verse" id="5">5</span> And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day.