forked from openlp/openlp
improved encoding detection code. Tests added
This commit is contained in:
parent
97dbc85918
commit
e9ec672756
@ -24,10 +24,10 @@ The :mod:`lib` module contains most of the components and libraries that make
|
|||||||
OpenLP work.
|
OpenLP work.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import chardet
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from distutils.version import LooseVersion
|
from distutils.version import LooseVersion
|
||||||
|
from chardet.universaldetector import UniversalDetector
|
||||||
|
|
||||||
from PyQt5 import QtCore, QtGui, Qt, QtWidgets
|
from PyQt5 import QtCore, QtGui, Qt, QtWidgets
|
||||||
|
|
||||||
@ -340,18 +340,23 @@ def create_separated_list(string_list):
|
|||||||
|
|
||||||
def get_file_encoding(filename):
|
def get_file_encoding(filename):
|
||||||
"""
|
"""
|
||||||
Utility function to get the file encoding.
|
Utility function to incrementally detect the file encoding.
|
||||||
|
|
||||||
|
:param filename: Filename for the file to determine the encoding for. Str
|
||||||
|
:return: A dict with the keys 'encoding' and 'confidence'
|
||||||
"""
|
"""
|
||||||
detect_file = None
|
detector = UniversalDetector()
|
||||||
try:
|
try:
|
||||||
detect_file = open(filename, 'rb')
|
with open(filename, 'rb') as detect_file:
|
||||||
details = chardet.detect(detect_file.read(1024))
|
while not detector.done:
|
||||||
except IOError:
|
chunk = detect_file.read(1024)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
detector.feed(chunk)
|
||||||
|
detector.close()
|
||||||
|
return detector.result
|
||||||
|
except OSError:
|
||||||
log.exception('Error detecting file encoding')
|
log.exception('Error detecting file encoding')
|
||||||
finally:
|
|
||||||
if detect_file:
|
|
||||||
detect_file.close()
|
|
||||||
return details
|
|
||||||
|
|
||||||
|
|
||||||
from .exceptions import ValidationError
|
from .exceptions import ValidationError
|
||||||
|
@ -23,6 +23,7 @@
|
|||||||
Package to test the openlp.core.lib package.
|
Package to test the openlp.core.lib package.
|
||||||
"""
|
"""
|
||||||
import os
|
import os
|
||||||
|
from io import BytesIO
|
||||||
|
|
||||||
from unittest import TestCase
|
from unittest import TestCase
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
@ -30,8 +31,8 @@ from datetime import datetime, timedelta
|
|||||||
from PyQt5 import QtCore, QtGui
|
from PyQt5 import QtCore, QtGui
|
||||||
|
|
||||||
from openlp.core.lib import build_icon, check_item_selected, clean_tags, create_thumb, create_separated_list, \
|
from openlp.core.lib import build_icon, check_item_selected, clean_tags, create_thumb, create_separated_list, \
|
||||||
expand_tags, get_text_file_string, image_to_byte, resize_image, str_to_bool, validate_thumb
|
expand_tags, get_file_encoding, get_text_file_string, image_to_byte, resize_image, str_to_bool, validate_thumb
|
||||||
from tests.functional import MagicMock, patch
|
from tests.functional import MagicMock, PropertyMock, call, patch
|
||||||
|
|
||||||
TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'resources'))
|
TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'resources'))
|
||||||
|
|
||||||
@ -736,3 +737,62 @@ class TestLib(TestCase):
|
|||||||
# THEN: We should have "Author 1, Author 2, and Author 3"
|
# THEN: We should have "Author 1, Author 2, and Author 3"
|
||||||
assert string_result == 'Author 1, Author 2, and Author 3', 'The string should be u\'Author 1, ' \
|
assert string_result == 'Author 1, Author 2, and Author 3', 'The string should be u\'Author 1, ' \
|
||||||
'Author 2, and Author 3\'.'
|
'Author 2, and Author 3\'.'
|
||||||
|
|
||||||
|
def test_get_file_name_encoding_done_test(self):
|
||||||
|
"""
|
||||||
|
Test get_file_encoding when the detector sets done to True
|
||||||
|
"""
|
||||||
|
# GIVEN: A mocked UniversalDetector instance with done attribute set to True after first iteration
|
||||||
|
with patch('openlp.core.lib.UniversalDetector') as mocked_universal_detector, \
|
||||||
|
patch('builtins.open', return_value=BytesIO(b"data" * 260)) as mocked_open:
|
||||||
|
encoding_result = {'encoding': 'UTF-8', 'confidence': 0.99}
|
||||||
|
mocked_universal_detector_inst = MagicMock(result=encoding_result)
|
||||||
|
type(mocked_universal_detector_inst).done = PropertyMock(side_effect=[False, True])
|
||||||
|
mocked_universal_detector.return_value = mocked_universal_detector_inst
|
||||||
|
|
||||||
|
# WHEN: Calling get_file_encoding
|
||||||
|
result = get_file_encoding('file name')
|
||||||
|
|
||||||
|
# THEN: The feed method of UniversalDetector should only be called once before returning a result
|
||||||
|
mocked_open.assert_called_once_with('file name', 'rb')
|
||||||
|
self.assertEqual(mocked_universal_detector_inst.feed.mock_calls, [call(b"data" * 256)])
|
||||||
|
mocked_universal_detector_inst.close.assert_called_once_with()
|
||||||
|
self.assertEqual(result, encoding_result)
|
||||||
|
|
||||||
|
def test_get_file_name_encoding_eof_test(self):
|
||||||
|
"""
|
||||||
|
Test get_file_encoding when the end of the file is reached
|
||||||
|
"""
|
||||||
|
# GIVEN: A mocked UniversalDetector instance which isn't set to done and a mocked open, with 1040 bytes of test
|
||||||
|
# data (enough to run the iterator twice)
|
||||||
|
with patch('openlp.core.lib.UniversalDetector') as mocked_universal_detector, \
|
||||||
|
patch('builtins.open', return_value=BytesIO(b"data" * 260)) as mocked_open:
|
||||||
|
encoding_result = {'encoding': 'UTF-8', 'confidence': 0.99}
|
||||||
|
mocked_universal_detector_inst = MagicMock(mock=mocked_universal_detector,
|
||||||
|
**{'done': False, 'result': encoding_result})
|
||||||
|
mocked_universal_detector.return_value = mocked_universal_detector_inst
|
||||||
|
|
||||||
|
# WHEN: Calling get_file_encoding
|
||||||
|
result = get_file_encoding('file name')
|
||||||
|
|
||||||
|
# THEN: The feed method of UniversalDetector should have been called twice before returning a result
|
||||||
|
mocked_open.assert_called_once_with('file name', 'rb')
|
||||||
|
self.assertEqual(mocked_universal_detector_inst.feed.mock_calls, [call(b"data" * 256), call(b"data" * 4)])
|
||||||
|
mocked_universal_detector_inst.close.assert_called_once_with()
|
||||||
|
self.assertEqual(result, encoding_result)
|
||||||
|
|
||||||
|
def test_get_file_name_encoding_oserror_test(self):
|
||||||
|
"""
|
||||||
|
Test get_file_encoding when an OSError is raised while opening the file
|
||||||
|
"""
|
||||||
|
# GIVEN: A mocked UniversalDetector, a mocked log and a mocked open which raises an OSError
|
||||||
|
# when it is called
|
||||||
|
with patch('openlp.core.lib.UniversalDetector'), \
|
||||||
|
patch('builtins.open', side_effect=OSError), \
|
||||||
|
patch('openlp.core.lib.log') as mocked_log:
|
||||||
|
# WHEN: Calling get_file_encoding
|
||||||
|
result = get_file_encoding('file name')
|
||||||
|
|
||||||
|
# THEN: log.exception should be called and get_file_encoding should return None
|
||||||
|
mocked_log.exception.assert_called_once_with('Error detecting file encoding')
|
||||||
|
self.assertIsNone(result)
|
||||||
|
Loading…
Reference in New Issue
Block a user