Make tidy text part of core. Remove the re.UNICODE flag, which is redundant since Python 3.

Fixes: https://launchpad.net/bugs/1727517
2017-10-28 11:04:09 +01:00 · 2017-10-28 11:04:09 +01:00 · 960ddedc6f
commit 960ddedc6f
parent fe5430e157
17 changed files with 39 additions and 47 deletions
--- a/openlp/core/common/init.py
+++ b/openlp/core/common/init.py
@ -43,9 +43,13 @@ log = logging.getLogger(__name__ + '.__init__')

 FIRST_CAMEL_REGEX = re.compile('(.)([A-Z][a-z]+)')
 SECOND_CAMEL_REGEX = re.compile('([a-z0-9])([A-Z])')
-CONTROL_CHARS = re.compile(r'[\x00-\x1F\x7F-\x9F]', re.UNICODE)
-INVALID_FILE_CHARS = re.compile(r'[\\/:\*\?"<>\|\+\[\]%]', re.UNICODE)
+CONTROL_CHARS = re.compile(r'[\x00-\x1F\x7F-\x9F]')
+INVALID_FILE_CHARS = re.compile(r'[\\/:\*\?"<>\|\+\[\]%]')
 IMAGES_FILTER = None
+REPLACMENT_CHARS_MAP = str.maketrans({'\u2018': '\'', '\u2019': '\'', '\u201c': '"', '\u201d': '"', '\u2026': '...',
+                                      '\u2013': '-', '\u2014': '-', '\v': '\n\n', '\f': '\n\n'})
+NEW_LINE_REGEX = re.compile(r' ?(\r\n?|\n) ?')
+WHITESPACE_REGEX = re.compile(r'[ \t]+')


 def trace_error_handler(logger):
@ -436,3 +440,13 @@ def get_file_encoding(file_path):
        return detector.result
    except OSError:
        log.exception('Error detecting file encoding')
+
+def normalize_str(irreg_str):
+    """
+    Swap curly quotes, dashes and ellipses for ASCII, turn vertical tabs/form feeds into two newlines, tidy spacing.
+    :param str irreg_str: The text to normalise.
+    :return: The text with runs of spaces/tabs collapsed to one space and spaces around newlines removed.
+    """
+    irreg_str = irreg_str.translate(REPLACMENT_CHARS_MAP)
+    irreg_str = WHITESPACE_REGEX.sub(' ', irreg_str)  # Collapse first so no blank is left beside a newline
+    return NEW_LINE_REGEX.sub('\n', irreg_str)
--- a/openlp/core/common/i18n.py
+++ b/openlp/core/common/i18n.py
@ -53,7 +53,7 @@ def translate(context, text, comment=None, qt_translate=QtCore.QCoreApplication.

 Language = namedtuple('Language', ['id', 'name', 'code'])
 ICU_COLLATOR = None
-DIGITS_OR_NONDIGITS = re.compile(r'\d+|\D+', re.UNICODE)
+DIGITS_OR_NONDIGITS = re.compile(r'\d+|\D+')
 LANGUAGES = sorted([
    Language(1, translate('common.languages', '(Afan) Oromo', 'Language code: om'), 'om'),
    Language(2, translate('common.languages', 'Abkhazian', 'Language code: ab'), 'ab'),
--- a/openlp/core/lib/init.py
+++ b/openlp/core/lib/init.py
@ -38,7 +38,6 @@ log = logging.getLogger(__name__ + '.__init__')

 SLIMCHARS = 'fiíIÍjlĺľrtť.,;/ ()|"\'!:\\'

-
 class ServiceItemContext(object):
    """
    The context in which a Service Item is being generated
--- a/openlp/core/lib/mediamanageritem.py
+++ b/openlp/core/lib/mediamanageritem.py
@ -92,7 +92,7 @@ class MediaManagerItem(QtWidgets.QWidget, RegistryProperties):
        Run some initial setup. This method is separate from __init__ in order to mock it out in tests.
        """
        self.hide()
-        self.whitespace = re.compile(r'[\W_]+', re.UNICODE)
+        self.whitespace = re.compile(r'[\W_]+')
        visible_title = self.plugin.get_string(StringContent.VisibleName)
        self.title = str(visible_title['title'])
        Registry().register(self.plugin.name, self)
--- a/openlp/core/ui/formattingtagcontroller.py
+++ b/openlp/core/ui/formattingtagcontroller.py
@ -43,7 +43,7 @@ class FormattingTagController(object):
            r'(?P<tag>[^\s/!\?>]+)(?:\s+[^\s=]+="[^"]*")*\s*(?P<empty>/)?'
            r'|(?P<cdata>!\[CDATA\[(?:(?!\]\]>).)*\]\])'
            r'|(?P<procinst>\?(?:(?!\?>).)*\?)'
-            r'|(?P<comment>!--(?:(?!-->).)*--))>', re.UNICODE)
+            r'|(?P<comment>!--(?:(?!-->).)*--))>')
        self.html_regex = re.compile(r'^(?:[^<>]*%s)*[^<>]*$' % self.html_tag_regex.pattern)

    def pre_save(self):
--- a/openlp/plugins/bibles/forms/booknameform.py
+++ b/openlp/plugins/bibles/forms/booknameform.py
@ -113,8 +113,7 @@ class BookNameForm(QDialog, Ui_BookNameDialog):
            cor_book = self.corresponding_combo_box.currentText()
            for character in '\\.^$*+?{}[]()':
                cor_book = cor_book.replace(character, '\\' + character)
-            books = [key for key in list(self.book_names.keys()) if re.match(cor_book, str(self.book_names[key]),
-                                                                             re.UNICODE)]
+            books = [key for key in list(self.book_names.keys()) if re.match(cor_book, str(self.book_names[key]))]
            books = [_f for _f in map(BiblesResourcesDB.get_book, books) if _f]
            if books:
                self.book_id = books[0]['id']
--- a/openlp/plugins/bibles/lib/init.py
+++ b/openlp/plugins/bibles/lib/init.py
@ -224,13 +224,13 @@ def update_reference_separators():
    range_regex = '(?:(?P<from_chapter>[0-9]+){sep_v})?' \
        '(?P<from_verse>[0-9]+)(?P<range_to>{sep_r}(?:(?:(?P<to_chapter>' \
        '[0-9]+){sep_v})?(?P<to_verse>[0-9]+)|{sep_e})?)?'.format_map(REFERENCE_SEPARATORS)
-    REFERENCE_MATCHES['range'] = re.compile(r'^\s*{range}\s*$'.format(range=range_regex), re.UNICODE)
-    REFERENCE_MATCHES['range_separator'] = re.compile(REFERENCE_SEPARATORS['sep_l'], re.UNICODE)
+    REFERENCE_MATCHES['range'] = re.compile(r'^\s*{range}\s*$'.format(range=range_regex))
+    REFERENCE_MATCHES['range_separator'] = re.compile(REFERENCE_SEPARATORS['sep_l'])
    # full reference match: <book>(<range>(,(?!$)|(?=$)))+
    REFERENCE_MATCHES['full'] = \
        re.compile(r'^\s*(?!\s)(?P<book>[\d]*[.]?[^\d\.]+)\.*(?<!\s)\s*'
                   r'(?P<ranges>(?:{range_regex}(?:{sep_l}(?!\s*$)|(?=\s*$)))+)\s*$'.format(
-                       range_regex=range_regex, sep_l=REFERENCE_SEPARATORS['sep_l']), re.UNICODE)
+                       range_regex=range_regex, sep_l=REFERENCE_SEPARATORS['sep_l']))


 def get_reference_separator(separator_type):
--- a/openlp/plugins/bibles/lib/db.py
+++ b/openlp/plugins/bibles/lib/db.py
@ -307,8 +307,7 @@ class BibleDB(Manager):
        book_escaped = book
        for character in RESERVED_CHARACTERS:
            book_escaped = book_escaped.replace(character, '\\' + character)
-        regex_book = re.compile('\\s*{book}\\s*'.format(book='\\s*'.join(book_escaped.split())),
-                                re.UNICODE | re.IGNORECASE)
+        regex_book = re.compile('\\s*{book}\\s*'.format(book='\\s*'.join(book_escaped.split())), re.IGNORECASE)
        if language_selection == LanguageSelection.Bible:
            db_book = self.get_book(book)
            if db_book:
--- a/openlp/plugins/songs/forms/editsongform.py
+++ b/openlp/plugins/songs/forms/editsongform.py
@ -105,9 +105,9 @@ class EditSongForm(QtWidgets.QDialog, Ui_EditSongDialog, RegistryProperties):
        self.topics_list_view.setSortingEnabled(False)
        self.topics_list_view.setAlternatingRowColors(True)
        self.audio_list_widget.setAlternatingRowColors(True)
-        self.find_verse_split = re.compile('---\[\]---\n', re.UNICODE)
-        self.whitespace = re.compile(r'\W+', re.UNICODE)
-        self.find_tags = re.compile(u'\{/?\w+\}', re.UNICODE)
+        self.find_verse_split = re.compile('---\[\]---\n')
+        self.whitespace = re.compile(r'\W+')
+        self.find_tags = re.compile(r'\{/?\w+\}')

    def _load_objects(self, cls, combo, cache):
        """
--- a/openlp/plugins/songs/lib/init.py
+++ b/openlp/plugins/songs/lib/init.py
@ -24,7 +24,6 @@ The :mod:`~openlp.plugins.songs.lib` module contains a number of library functio
 """

 import logging
-import os
 import re

 from PyQt5 import QtWidgets
@ -39,8 +38,8 @@ from openlp.plugins.songs.lib.ui import SongStrings

 log = logging.getLogger(__name__)

-WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
-APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
+WHITESPACE = re.compile(r'[\W_]+')
+APOSTROPHE = re.compile(r'[\'`’ʻ′]')
 # PATTERN will look for the next occurence of one of these symbols:
 #   \controlword - optionally preceded by \*, optionally followed by a number
 #   \'## - where ## is a pair of hex digits, representing a single character
--- a/openlp/plugins/songs/lib/importers/easyslides.py
+++ b/openlp/plugins/songs/lib/importers/easyslides.py
@ -25,6 +25,7 @@ import re

 from lxml import etree, objectify

+from openlp.core.common import normalize_str
 from openlp.plugins.songs.lib import VerseType
 from openlp.plugins.songs.lib.importers.songimport import SongImport

@ -225,7 +226,7 @@ class EasySlidesImport(SongImport):
                verses[reg].setdefault(vt, {})
                verses[reg][vt].setdefault(vn, {})
                verses[reg][vt][vn].setdefault(inst, [])
-                verses[reg][vt][vn][inst].append(self.tidy_text(line))
+                verses[reg][vt][vn][inst].append(normalize_str(line))
        # done parsing
        versetags = []
        # we use our_verse_order to ensure, we insert lyrics in the same order
--- a/openlp/plugins/songs/lib/importers/mediashout.py
+++ b/openlp/plugins/songs/lib/importers/mediashout.py
@ -101,7 +101,7 @@ class MediaShoutImport(SongImport):
            self.song_book_name = song.SongID
        for verse in verses:
            tag = VERSE_TAGS[verse.Type] + str(verse.Number) if verse.Type < len(VERSE_TAGS) else 'O'
-            self.add_verse(self.tidy_text(verse.Text), tag)
+            self.add_verse(verse.Text, tag)
        for order in verse_order:
            if order.Type < len(VERSE_TAGS):
                self.verse_order_list.append(VERSE_TAGS[order.Type] + str(order.Number))
--- a/openlp/plugins/songs/lib/importers/openoffice.py
+++ b/openlp/plugins/songs/lib/importers/openoffice.py
@ -24,7 +24,7 @@ import time

 from PyQt5 import QtCore

-from openlp.core.common import is_win, get_uno_command, get_uno_instance
+from openlp.core.common import get_uno_command, get_uno_instance, is_win, normalize_str
 from openlp.core.common.i18n import translate
 from .songimport import SongImport

@ -241,7 +241,7 @@ class OpenOfficeImport(SongImport):

        :param text: The text.
        """
-        song_texts = self.tidy_text(text).split('\f')
+        song_texts = normalize_str(text).split('\f')
        self.set_defaults()
        for song_text in song_texts:
            if song_text.strip():
--- a/openlp/plugins/songs/lib/importers/opensong.py
+++ b/openlp/plugins/songs/lib/importers/opensong.py
@ -25,6 +25,7 @@ import re
 from lxml import objectify
 from lxml.etree import Error, LxmlError

+from openlp.core.common import normalize_str
 from openlp.core.common.i18n import translate
 from openlp.core.common.settings import Settings
 from openlp.plugins.songs.lib import VerseType
@ -262,7 +263,7 @@ class OpenSongImport(SongImport):
                                                              post=this_line[offset + column:])
                    offset += len(chord) + 2
            # Tidy text and remove the ____s from extended words
-            this_line = self.tidy_text(this_line)
+            this_line = normalize_str(this_line)
            this_line = this_line.replace('_', '')
            this_line = this_line.replace('||', '\n[---]\n')
            this_line = this_line.strip()
--- a/openlp/plugins/songs/lib/importers/songimport.py
+++ b/openlp/plugins/songs/lib/importers/songimport.py
@ -25,6 +25,7 @@ import re

 from PyQt5 import QtCore

+from openlp.core.common import normalize_str
 from openlp.core.common.applocation import AppLocation
 from openlp.core.common.i18n import translate
 from openlp.core.common.path import copyfile, create_paths
@ -130,26 +131,6 @@ class SongImport(QtCore.QObject):
    def register(self, import_wizard):
        self.import_wizard = import_wizard

-    def tidy_text(self, text):
-        """
-        Get rid of some dodgy unicode and formatting characters we're not interested in. Some can be converted to ascii.
-        """
-        text = text.replace('\u2018', '\'')
-        text = text.replace('\u2019', '\'')
-        text = text.replace('\u201c', '"')
-        text = text.replace('\u201d', '"')
-        text = text.replace('\u2026', '...')
-        text = text.replace('\u2013', '-')
-        text = text.replace('\u2014', '-')
-        # Replace vertical tab with 2 linebreaks
-        text = text.replace('\v', '\n\n')
-        # Replace form feed (page break) with 2 linebreaks
-        text = text.replace('\f', '\n\n')
-        # Remove surplus blank lines, spaces, trailing/leading spaces
-        text = re.sub(r'[ \t]+', ' ', text)
-        text = re.sub(r' ?(\r\n?|\n) ?', '\n', text)
-        return text
-
    def process_song_text(self, text):
        """
        Process the song text from import
@ -368,7 +349,7 @@ class SongImport(QtCore.QObject):
                verse_tag = VerseType.tags[VerseType.Other]
                log.info('Versetype {old} changing to {new}'.format(old=verse_def, new=new_verse_def))
                verse_def = new_verse_def
-            sxml.add_verse_to_lyrics(verse_tag, verse_def[1:], verse_text, lang)
+            sxml.add_verse_to_lyrics(verse_tag, verse_def[1:], normalize_str(verse_text), lang)
        song.lyrics = str(sxml.extract_xml(), 'utf-8')
        if not self.verse_order_list and self.verse_order_list_generated_useful:
            self.verse_order_list = self.verse_order_list_generated
--- a/openlp/plugins/songs/lib/importers/songsoffellowship.py
+++ b/openlp/plugins/songs/lib/importers/songsoffellowship.py
@ -194,7 +194,6 @@ class SongsOfFellowshipImport(OpenOfficeImport):
        :param text_portion: A Piece of text
        """
        text = text_portion.getString()
-        text = self.tidy_text(text)
        if text.strip() == '':
            return text
        if text_portion.CharWeight == BOLD:
--- a/openlp/plugins/songs/lib/openlyricsxml.py
+++ b/openlp/plugins/songs/lib/openlyricsxml.py
@ -281,7 +281,7 @@ class OpenLyrics(object):
        # Process the formatting tags.
        # Have we any tags in song lyrics?
        tags_element = None
-        match = re.search('\{/?\w+\}', song.lyrics, re.UNICODE)
+        match = re.search(r'\{/?\w+\}', song.lyrics)
        if match:
            # Named 'format_' - 'format' is built-in function in Python.
            format_ = etree.SubElement(song_xml, 'format')