diff --git a/openlp/core/lib/__init__.py b/openlp/core/lib/__init__.py index 7c00c0348..cf1f61aae 100644 --- a/openlp/core/lib/__init__.py +++ b/openlp/core/lib/__init__.py @@ -24,10 +24,10 @@ The :mod:`lib` module contains most of the components and libraries that make OpenLP work. """ -import chardet import logging import os from distutils.version import LooseVersion +from chardet.universaldetector import UniversalDetector from PyQt5 import QtCore, QtGui, Qt, QtWidgets @@ -340,18 +340,23 @@ def create_separated_list(string_list): def get_file_encoding(filename): """ - Utility function to get the file encoding. + Utility function to incrementally detect the file encoding. + + :param filename: Filename for the file to determine the encoding for. Str + :return: A dict with the keys 'encoding' and 'confidence', or None if the file could not be read """ - detect_file = None + detector = UniversalDetector() try: - detect_file = open(filename, 'rb') - details = chardet.detect(detect_file.read(1024)) - except IOError: + with open(filename, 'rb') as detect_file: + while not detector.done: + chunk = detect_file.read(1024) + if not chunk: + break + detector.feed(chunk) + detector.close() + return detector.result + except OSError: log.exception('Error detecting file encoding') - finally: - if detect_file: - detect_file.close() - return details from .exceptions import ValidationError diff --git a/tests/functional/openlp_core_lib/test_lib.py b/tests/functional/openlp_core_lib/test_lib.py index 145be21f4..fdf9f5acf 100644 --- a/tests/functional/openlp_core_lib/test_lib.py +++ b/tests/functional/openlp_core_lib/test_lib.py @@ -23,6 +23,7 @@ Package to test the openlp.core.lib package. 
""" import os +from io import BytesIO from unittest import TestCase from datetime import datetime, timedelta @@ -30,8 +31,8 @@ from datetime import datetime, timedelta from PyQt5 import QtCore, QtGui from openlp.core.lib import build_icon, check_item_selected, clean_tags, create_thumb, create_separated_list, \ - expand_tags, get_text_file_string, image_to_byte, resize_image, str_to_bool, validate_thumb -from tests.functional import MagicMock, patch + expand_tags, get_file_encoding, get_text_file_string, image_to_byte, resize_image, str_to_bool, validate_thumb +from tests.functional import MagicMock, PropertyMock, call, patch TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'resources')) @@ -736,3 +737,62 @@ class TestLib(TestCase): # THEN: We should have "Author 1, Author 2, and Author 3" assert string_result == 'Author 1, Author 2, and Author 3', 'The string should be u\'Author 1, ' \ 'Author 2, and Author 3\'.' + + def test_get_file_name_encoding_done_test(self): + """ + Test get_file_encoding when the detector sets done to True + """ + # GIVEN: A mocked UniversalDetector instance with done attribute set to True after first iteration + with patch('openlp.core.lib.UniversalDetector') as mocked_universal_detector, \ + patch('builtins.open', return_value=BytesIO(b"data" * 260)) as mocked_open: + encoding_result = {'encoding': 'UTF-8', 'confidence': 0.99} + mocked_universal_detector_inst = MagicMock(result=encoding_result) + type(mocked_universal_detector_inst).done = PropertyMock(side_effect=[False, True]) + mocked_universal_detector.return_value = mocked_universal_detector_inst + + # WHEN: Calling get_file_encoding + result = get_file_encoding('file name') + + # THEN: The feed method of UniversalDetector should only be called once before returning a result + mocked_open.assert_called_once_with('file name', 'rb') + self.assertEqual(mocked_universal_detector_inst.feed.mock_calls, [call(b"data" * 256)]) + 
mocked_universal_detector_inst.close.assert_called_once_with() + self.assertEqual(result, encoding_result) + + def test_get_file_name_encoding_eof_test(self): + """ + Test get_file_encoding when the end of the file is reached + """ + # GIVEN: A mocked UniversalDetector instance which isn't set to done and a mocked open, with 1040 bytes of test + # data (enough to run the iterator twice) + with patch('openlp.core.lib.UniversalDetector') as mocked_universal_detector, \ + patch('builtins.open', return_value=BytesIO(b"data" * 260)) as mocked_open: + encoding_result = {'encoding': 'UTF-8', 'confidence': 0.99} + mocked_universal_detector_inst = MagicMock(mock=mocked_universal_detector, + **{'done': False, 'result': encoding_result}) + mocked_universal_detector.return_value = mocked_universal_detector_inst + + # WHEN: Calling get_file_encoding + result = get_file_encoding('file name') + + # THEN: The feed method of UniversalDetector should have been called twice before returning a result + mocked_open.assert_called_once_with('file name', 'rb') + self.assertEqual(mocked_universal_detector_inst.feed.mock_calls, [call(b"data" * 256), call(b"data" * 4)]) + mocked_universal_detector_inst.close.assert_called_once_with() + self.assertEqual(result, encoding_result) + + def test_get_file_name_encoding_oserror_test(self): + """ + Test get_file_encoding when an OSError is raised while opening the file + """ + # GIVEN: A mocked UniversalDetector, a mocked log and a mocked open which raises an OSError, so that no data + # is ever fed to the detector + with patch('openlp.core.lib.UniversalDetector'), \ + patch('builtins.open', side_effect=OSError), \ + patch('openlp.core.lib.log') as mocked_log: + # WHEN: Calling get_file_encoding + result = get_file_encoding('file name') + + # THEN: log.exception should be called and get_file_encoding should return None + mocked_log.exception.assert_called_once_with('Error detecting file encoding') + self.assertIsNone(result)