Improved encoding detection code. Tests added.

This commit is contained in:
Philip Ridout 2016-08-08 19:09:33 +01:00
parent 97dbc85918
commit e9ec672756
2 changed files with 77 additions and 12 deletions

View File

@ -24,10 +24,10 @@ The :mod:`lib` module contains most of the components and libraries that make
OpenLP work.
"""
import chardet
import logging
import os
from distutils.version import LooseVersion
from chardet.universaldetector import UniversalDetector
from PyQt5 import QtCore, QtGui, Qt, QtWidgets
@ -340,18 +340,23 @@ def create_separated_list(string_list):
def get_file_encoding(filename):
    """
    Utility function to incrementally detect the file encoding.

    The file is fed to a ``UniversalDetector`` in 1024 byte chunks until either the detector reports that it is
    done or the end of the file is reached.

    :param filename: Filename for the file to determine the encoding for. Str
    :return: The detector result (a dict with the keys 'encoding' and 'confidence'), or None if the file could not
        be opened or read.
    """
    detector = UniversalDetector()
    try:
        with open(filename, 'rb') as detect_file:
            # Check ``done`` before each read so no more of the file is read than the detector needs.
            while not detector.done:
                chunk = detect_file.read(1024)
                if not chunk:
                    # End of file reached before the detector reported it was done.
                    break
                detector.feed(chunk)
        # Finalise the detection before reading ``result``.
        detector.close()
        return detector.result
    except OSError:
        # Best effort: log the failure and let the caller deal with a missing result.
        log.exception('Error detecting file encoding')
        return None
from .exceptions import ValidationError

View File

@ -23,6 +23,7 @@
Package to test the openlp.core.lib package.
"""
import os
from io import BytesIO
from unittest import TestCase
from datetime import datetime, timedelta
@ -30,8 +31,8 @@ from datetime import datetime, timedelta
from PyQt5 import QtCore, QtGui
from openlp.core.lib import build_icon, check_item_selected, clean_tags, create_thumb, create_separated_list, \
expand_tags, get_text_file_string, image_to_byte, resize_image, str_to_bool, validate_thumb
from tests.functional import MagicMock, patch
expand_tags, get_file_encoding, get_text_file_string, image_to_byte, resize_image, str_to_bool, validate_thumb
from tests.functional import MagicMock, PropertyMock, call, patch
TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'resources'))
@ -736,3 +737,62 @@ class TestLib(TestCase):
# THEN: We should have "Author 1, Author 2, and Author 3"
assert string_result == 'Author 1, Author 2, and Author 3', 'The string should be u\'Author 1, ' \
'Author 2, and Author 3\'.'
def test_get_file_name_encoding_done_test(self):
    """
    Test get_file_encoding when the detector sets done to True
    """
    # GIVEN: A mocked UniversalDetector instance whose done attribute reads False on the first access and True on
    #        the second, and a mocked open returning 1040 bytes of test data
    expected_result = {'encoding': 'UTF-8', 'confidence': 0.99}
    first_chunk = b"data" * 256
    with patch('openlp.core.lib.UniversalDetector') as detector_class, \
            patch('builtins.open', return_value=BytesIO(b"data" * 260)) as mocked_open:
        detector_instance = MagicMock()
        detector_instance.result = expected_result
        # A PropertyMock has to be attached to the type of the mock for attribute access to trigger it
        type(detector_instance).done = PropertyMock(side_effect=[False, True])
        detector_class.return_value = detector_instance

        # WHEN: Calling get_file_encoding
        result = get_file_encoding('file name')

    # THEN: The feed method of UniversalDetector should only be called once before returning a result
    mocked_open.assert_called_once_with('file name', 'rb')
    self.assertEqual(detector_instance.feed.mock_calls, [call(first_chunk)])
    detector_instance.close.assert_called_once_with()
    self.assertEqual(result, expected_result)
def test_get_file_name_encoding_eof_test(self):
    """
    Test get_file_encoding when the end of the file is reached
    """
    # GIVEN: A mocked UniversalDetector instance which isn't set to done and a mocked open, with 1040 bytes of test
    #        data (enough to run the iterator twice)
    with patch('openlp.core.lib.UniversalDetector') as mocked_universal_detector, \
            patch('builtins.open', return_value=BytesIO(b"data" * 260)) as mocked_open:
        encoding_result = {'encoding': 'UTF-8', 'confidence': 0.99}
        # done stays False for the whole test, so only running out of data can end the read loop
        mocked_universal_detector_inst = MagicMock(done=False, result=encoding_result)
        mocked_universal_detector.return_value = mocked_universal_detector_inst

        # WHEN: Calling get_file_encoding
        result = get_file_encoding('file name')

    # THEN: The feed method of UniversalDetector should have been called twice before returning a result
    mocked_open.assert_called_once_with('file name', 'rb')
    self.assertEqual(mocked_universal_detector_inst.feed.mock_calls, [call(b"data" * 256), call(b"data" * 4)])
    mocked_universal_detector_inst.close.assert_called_once_with()
    self.assertEqual(result, encoding_result)
def test_get_file_name_encoding_oserror_test(self):
    """
    Test get_file_encoding when opening the file raises an OSError
    """
    # GIVEN: A mocked UniversalDetector, a mocked open which raises an OSError and a mocked log
    with patch('openlp.core.lib.UniversalDetector'), \
            patch('builtins.open', side_effect=OSError), \
            patch('openlp.core.lib.log') as mocked_log:

        # WHEN: Calling get_file_encoding
        result = get_file_encoding('file name')

    # THEN: log.exception should be called and get_file_encoding should return None
    mocked_log.exception.assert_called_once_with('Error detecting file encoding')
    self.assertIsNone(result)