From 960ddedc6f08744f3585c28e4e66c55345c75b19 Mon Sep 17 00:00:00 2001 From: Phill Ridout Date: Sat, 28 Oct 2017 11:04:09 +0100 Subject: [PATCH] make tidy text part of core.\nRemove redundant (since py3) re.UNICODE flag Fixes: https://launchpad.net/bugs/1727517 --- openlp/core/common/__init__.py | 18 +++++++++++++-- openlp/core/common/i18n.py | 2 +- openlp/core/lib/__init__.py | 1 - openlp/core/lib/mediamanageritem.py | 2 +- openlp/core/ui/formattingtagcontroller.py | 2 +- openlp/plugins/bibles/forms/booknameform.py | 3 +-- openlp/plugins/bibles/lib/__init__.py | 6 ++--- openlp/plugins/bibles/lib/db.py | 3 +-- openlp/plugins/songs/forms/editsongform.py | 6 ++--- openlp/plugins/songs/lib/__init__.py | 5 ++-- .../plugins/songs/lib/importers/easyslides.py | 3 ++- .../plugins/songs/lib/importers/mediashout.py | 2 +- .../plugins/songs/lib/importers/openoffice.py | 4 ++-- .../plugins/songs/lib/importers/opensong.py | 3 ++- .../plugins/songs/lib/importers/songimport.py | 23 ++----------------- .../songs/lib/importers/songsoffellowship.py | 1 - openlp/plugins/songs/lib/openlyricsxml.py | 2 +- 17 files changed, 39 insertions(+), 47 deletions(-) diff --git a/openlp/core/common/__init__.py b/openlp/core/common/__init__.py index f8017fdbd..f4c22f490 100644 --- a/openlp/core/common/__init__.py +++ b/openlp/core/common/__init__.py @@ -43,9 +43,13 @@ log = logging.getLogger(__name__ + '.__init__') FIRST_CAMEL_REGEX = re.compile('(.)([A-Z][a-z]+)') SECOND_CAMEL_REGEX = re.compile('([a-z0-9])([A-Z])') -CONTROL_CHARS = re.compile(r'[\x00-\x1F\x7F-\x9F]', re.UNICODE) -INVALID_FILE_CHARS = re.compile(r'[\\/:\*\?"<>\|\+\[\]%]', re.UNICODE) +CONTROL_CHARS = re.compile(r'[\x00-\x1F\x7F-\x9F]') +INVALID_FILE_CHARS = re.compile(r'[\\/:\*\?"<>\|\+\[\]%]') IMAGES_FILTER = None +REPLACMENT_CHARS_MAP = str.maketrans({'\u2018': '\'', '\u2019': '\'', '\u201c': '"', '\u201d': '"', '\u2026': '...', + '\u2013': '-', '\u2014': '-', '\v': '\n\n', '\f': '\n\n'}) +NEW_LINE_REGEX = re.compile(r' 
?(\r\n?|\n) ?') +WHITESPACE_REGEX = re.compile(r'[ \t]+') def trace_error_handler(logger): @@ -436,3 +440,13 @@ def get_file_encoding(file_path): return detector.result except OSError: log.exception('Error detecting file encoding') + +def normalize_str(irreg_str): + """ + Convert typographic quotes, dashes, ellipses and page breaks to plain equivalents and tidy up white space. + :param str irreg_str: The string to normalize. + :return: The normalized string, with line breaks as line feeds and no spaces or tabs left next to them. + """ + # Collapse space/tab runs first: NEW_LINE_REGEX only strips a single space either side of a line break. + irreg_str = WHITESPACE_REGEX.sub(' ', irreg_str.translate(REPLACMENT_CHARS_MAP)) + return NEW_LINE_REGEX.sub('\n', irreg_str) diff --git a/openlp/core/common/i18n.py b/openlp/core/common/i18n.py index 1f4357808..9149f3fe6 100644 --- a/openlp/core/common/i18n.py +++ b/openlp/core/common/i18n.py @@ -53,7 +53,7 @@ def translate(context, text, comment=None, qt_translate=QtCore.QCoreApplication. Language = namedtuple('Language', ['id', 'name', 'code']) ICU_COLLATOR = None -DIGITS_OR_NONDIGITS = re.compile(r'\d+|\D+', re.UNICODE) +DIGITS_OR_NONDIGITS = re.compile(r'\d+|\D+') LANGUAGES = sorted([ Language(1, translate('common.languages', '(Afan) Oromo', 'Language code: om'), 'om'), Language(2, translate('common.languages', 'Abkhazian', 'Language code: ab'), 'ab'), diff --git a/openlp/core/lib/__init__.py b/openlp/core/lib/__init__.py index 0f4078420..f78065774 100644 --- a/openlp/core/lib/__init__.py +++ b/openlp/core/lib/__init__.py @@ -38,7 +38,6 @@ log = logging.getLogger(__name__ + '.__init__') SLIMCHARS = 'fiíIÍjlĺľrtť.,;/ ()|"\'!:\\' - class ServiceItemContext(object): """ The context in which a Service Item is being generated diff --git a/openlp/core/lib/mediamanageritem.py b/openlp/core/lib/mediamanageritem.py index c650ad80e..cc884279c 100644 --- a/openlp/core/lib/mediamanageritem.py +++ b/openlp/core/lib/mediamanageritem.py @@ -92,7 +92,7 @@ class MediaManagerItem(QtWidgets.QWidget, RegistryProperties): Run some initial setup. This method is separate from __init__ in order to mock it out in tests. 
""" self.hide() - self.whitespace = re.compile(r'[\W_]+', re.UNICODE) + self.whitespace = re.compile(r'[\W_]+') visible_title = self.plugin.get_string(StringContent.VisibleName) self.title = str(visible_title['title']) Registry().register(self.plugin.name, self) diff --git a/openlp/core/ui/formattingtagcontroller.py b/openlp/core/ui/formattingtagcontroller.py index e92173fed..4b9d75fee 100644 --- a/openlp/core/ui/formattingtagcontroller.py +++ b/openlp/core/ui/formattingtagcontroller.py @@ -43,7 +43,7 @@ class FormattingTagController(object): r'(?P[^\s/!\?>]+)(?:\s+[^\s=]+="[^"]*")*\s*(?P/)?' r'|(?P!\[CDATA\[(?:(?!\]\]>).)*\]\])' r'|(?P\?(?:(?!\?>).)*\?)' - r'|(?P!--(?:(?!-->).)*--))>', re.UNICODE) + r'|(?P!--(?:(?!-->).)*--))>') self.html_regex = re.compile(r'^(?:[^<>]*%s)*[^<>]*$' % self.html_tag_regex.pattern) def pre_save(self): diff --git a/openlp/plugins/bibles/forms/booknameform.py b/openlp/plugins/bibles/forms/booknameform.py index f78559ce5..7c8a2c3cd 100644 --- a/openlp/plugins/bibles/forms/booknameform.py +++ b/openlp/plugins/bibles/forms/booknameform.py @@ -113,8 +113,7 @@ class BookNameForm(QDialog, Ui_BookNameDialog): cor_book = self.corresponding_combo_box.currentText() for character in '\\.^$*+?{}[]()': cor_book = cor_book.replace(character, '\\' + character) - books = [key for key in list(self.book_names.keys()) if re.match(cor_book, str(self.book_names[key]), - re.UNICODE)] + books = [key for key in list(self.book_names.keys()) if re.match(cor_book, str(self.book_names[key]))] books = [_f for _f in map(BiblesResourcesDB.get_book, books) if _f] if books: self.book_id = books[0]['id'] diff --git a/openlp/plugins/bibles/lib/__init__.py b/openlp/plugins/bibles/lib/__init__.py index 9247485c1..f9d93a43e 100644 --- a/openlp/plugins/bibles/lib/__init__.py +++ b/openlp/plugins/bibles/lib/__init__.py @@ -224,13 +224,13 @@ def update_reference_separators(): range_regex = '(?:(?P[0-9]+){sep_v})?' 
\ '(?P[0-9]+)(?P{sep_r}(?:(?:(?P' \ '[0-9]+){sep_v})?(?P[0-9]+)|{sep_e})?)?'.format_map(REFERENCE_SEPARATORS) - REFERENCE_MATCHES['range'] = re.compile(r'^\s*{range}\s*$'.format(range=range_regex), re.UNICODE) - REFERENCE_MATCHES['range_separator'] = re.compile(REFERENCE_SEPARATORS['sep_l'], re.UNICODE) + REFERENCE_MATCHES['range'] = re.compile(r'^\s*{range}\s*$'.format(range=range_regex)) + REFERENCE_MATCHES['range_separator'] = re.compile(REFERENCE_SEPARATORS['sep_l']) # full reference match: ((,(?!$)|(?=$)))+ REFERENCE_MATCHES['full'] = \ re.compile(r'^\s*(?!\s)(?P[\d]*[.]?[^\d\.]+)\.*(?(?:{range_regex}(?:{sep_l}(?!\s*$)|(?=\s*$)))+)\s*$'.format( - range_regex=range_regex, sep_l=REFERENCE_SEPARATORS['sep_l']), re.UNICODE) + range_regex=range_regex, sep_l=REFERENCE_SEPARATORS['sep_l'])) def get_reference_separator(separator_type): diff --git a/openlp/plugins/bibles/lib/db.py b/openlp/plugins/bibles/lib/db.py index bc8ce4150..995a9691a 100644 --- a/openlp/plugins/bibles/lib/db.py +++ b/openlp/plugins/bibles/lib/db.py @@ -307,8 +307,7 @@ class BibleDB(Manager): book_escaped = book for character in RESERVED_CHARACTERS: book_escaped = book_escaped.replace(character, '\\' + character) - regex_book = re.compile('\\s*{book}\\s*'.format(book='\\s*'.join(book_escaped.split())), - re.UNICODE | re.IGNORECASE) + regex_book = re.compile('\\s*{book}\\s*'.format(book='\\s*'.join(book_escaped.split())), re.IGNORECASE) if language_selection == LanguageSelection.Bible: db_book = self.get_book(book) if db_book: diff --git a/openlp/plugins/songs/forms/editsongform.py b/openlp/plugins/songs/forms/editsongform.py index fa475a63f..6e0772418 100644 --- a/openlp/plugins/songs/forms/editsongform.py +++ b/openlp/plugins/songs/forms/editsongform.py @@ -105,9 +105,9 @@ class EditSongForm(QtWidgets.QDialog, Ui_EditSongDialog, RegistryProperties): self.topics_list_view.setSortingEnabled(False) self.topics_list_view.setAlternatingRowColors(True) 
self.audio_list_widget.setAlternatingRowColors(True) - self.find_verse_split = re.compile('---\[\]---\n', re.UNICODE) - self.whitespace = re.compile(r'\W+', re.UNICODE) - self.find_tags = re.compile(u'\{/?\w+\}', re.UNICODE) + self.find_verse_split = re.compile(r'---\[\]---\n') + self.whitespace = re.compile(r'\W+') + self.find_tags = re.compile(r'\{/?\w+\}') def _load_objects(self, cls, combo, cache): """ diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index f88aa8678..74334ef0d 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -24,7 +24,6 @@ The :mod:`~openlp.plugins.songs.lib` module contains a number of library functio """ import logging -import os import re from PyQt5 import QtWidgets @@ -39,8 +38,8 @@ from openlp.plugins.songs.lib.ui import SongStrings log = logging.getLogger(__name__) -WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) -APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE) +WHITESPACE = re.compile(r'[\W_]+') +APOSTROPHE = re.compile(r'[\'`’ʻ′]') # PATTERN will look for the next occurence of one of these symbols: # \controlword - optionally preceded by \*, optionally followed by a number # \'## - where ## is a pair of hex digits, representing a single character diff --git a/openlp/plugins/songs/lib/importers/easyslides.py b/openlp/plugins/songs/lib/importers/easyslides.py index a1ffb7b7c..6d717bdb4 100644 --- a/openlp/plugins/songs/lib/importers/easyslides.py +++ b/openlp/plugins/songs/lib/importers/easyslides.py @@ -25,6 +25,7 @@ import re from lxml import etree, objectify +from openlp.core.common import normalize_str from openlp.plugins.songs.lib import VerseType from openlp.plugins.songs.lib.importers.songimport import SongImport @@ -225,7 +226,7 @@ class EasySlidesImport(SongImport): verses[reg].setdefault(vt, {}) verses[reg][vt].setdefault(vn, {}) verses[reg][vt][vn].setdefault(inst, []) - verses[reg][vt][vn][inst].append(self.tidy_text(line)) + 
verses[reg][vt][vn][inst].append(normalize_str(line)) # done parsing versetags = [] # we use our_verse_order to ensure, we insert lyrics in the same order diff --git a/openlp/plugins/songs/lib/importers/mediashout.py b/openlp/plugins/songs/lib/importers/mediashout.py index 67cf0d0fb..9df9baa0f 100644 --- a/openlp/plugins/songs/lib/importers/mediashout.py +++ b/openlp/plugins/songs/lib/importers/mediashout.py @@ -101,7 +101,7 @@ class MediaShoutImport(SongImport): self.song_book_name = song.SongID for verse in verses: tag = VERSE_TAGS[verse.Type] + str(verse.Number) if verse.Type < len(VERSE_TAGS) else 'O' - self.add_verse(self.tidy_text(verse.Text), tag) + self.add_verse(verse.Text, tag) for order in verse_order: if order.Type < len(VERSE_TAGS): self.verse_order_list.append(VERSE_TAGS[order.Type] + str(order.Number)) diff --git a/openlp/plugins/songs/lib/importers/openoffice.py b/openlp/plugins/songs/lib/importers/openoffice.py index a097d8b85..f2a8b2147 100644 --- a/openlp/plugins/songs/lib/importers/openoffice.py +++ b/openlp/plugins/songs/lib/importers/openoffice.py @@ -24,7 +24,7 @@ import time from PyQt5 import QtCore -from openlp.core.common import is_win, get_uno_command, get_uno_instance +from openlp.core.common import get_uno_command, get_uno_instance, is_win, normalize_str from openlp.core.common.i18n import translate from .songimport import SongImport @@ -241,7 +241,7 @@ class OpenOfficeImport(SongImport): :param text: The text. 
""" - song_texts = self.tidy_text(text).split('\f') + song_texts = [normalize_str(song) for song in text.split('\f')]  # split before '\f' is normalized away self.set_defaults() for song_text in song_texts: if song_text.strip(): diff --git a/openlp/plugins/songs/lib/importers/opensong.py b/openlp/plugins/songs/lib/importers/opensong.py index e6924e7b2..6cd690562 100644 --- a/openlp/plugins/songs/lib/importers/opensong.py +++ b/openlp/plugins/songs/lib/importers/opensong.py @@ -25,6 +25,7 @@ import re from lxml import objectify from lxml.etree import Error, LxmlError +from openlp.core.common import normalize_str from openlp.core.common.i18n import translate from openlp.core.common.settings import Settings from openlp.plugins.songs.lib import VerseType @@ -262,7 +263,7 @@ class OpenSongImport(SongImport): post=this_line[offset + column:]) offset += len(chord) + 2 # Tidy text and remove the ____s from extended words - this_line = self.tidy_text(this_line) + this_line = normalize_str(this_line) this_line = this_line.replace('_', '') this_line = this_line.replace('||', '\n[---]\n') this_line = this_line.strip() diff --git a/openlp/plugins/songs/lib/importers/songimport.py b/openlp/plugins/songs/lib/importers/songimport.py index a67c17fe7..2bd8c0e56 100644 --- a/openlp/plugins/songs/lib/importers/songimport.py +++ b/openlp/plugins/songs/lib/importers/songimport.py @@ -25,6 +25,7 @@ import re from PyQt5 import QtCore +from openlp.core.common import normalize_str from openlp.core.common.applocation import AppLocation from openlp.core.common.i18n import translate from openlp.core.common.path import copyfile, create_paths @@ -130,26 +131,6 @@ class SongImport(QtCore.QObject): def register(self, import_wizard): self.import_wizard = import_wizard - def tidy_text(self, text): - """ - Get rid of some dodgy unicode and formatting characters we're not interested in. Some can be converted to ascii. 
- """ - text = text.replace('\u2018', '\'') - text = text.replace('\u2019', '\'') - text = text.replace('\u201c', '"') - text = text.replace('\u201d', '"') - text = text.replace('\u2026', '...') - text = text.replace('\u2013', '-') - text = text.replace('\u2014', '-') - # Replace vertical tab with 2 linebreaks - text = text.replace('\v', '\n\n') - # Replace form feed (page break) with 2 linebreaks - text = text.replace('\f', '\n\n') - # Remove surplus blank lines, spaces, trailing/leading spaces - text = re.sub(r'[ \t]+', ' ', text) - text = re.sub(r' ?(\r\n?|\n) ?', '\n', text) - return text - def process_song_text(self, text): """ Process the song text from import @@ -368,7 +349,7 @@ class SongImport(QtCore.QObject): verse_tag = VerseType.tags[VerseType.Other] log.info('Versetype {old} changing to {new}'.format(old=verse_def, new=new_verse_def)) verse_def = new_verse_def - sxml.add_verse_to_lyrics(verse_tag, verse_def[1:], verse_text, lang) + sxml.add_verse_to_lyrics(verse_tag, verse_def[1:], normalize_str(verse_text), lang) song.lyrics = str(sxml.extract_xml(), 'utf-8') if not self.verse_order_list and self.verse_order_list_generated_useful: self.verse_order_list = self.verse_order_list_generated diff --git a/openlp/plugins/songs/lib/importers/songsoffellowship.py b/openlp/plugins/songs/lib/importers/songsoffellowship.py index 13e073cc1..bbba654c9 100644 --- a/openlp/plugins/songs/lib/importers/songsoffellowship.py +++ b/openlp/plugins/songs/lib/importers/songsoffellowship.py @@ -194,7 +194,6 @@ class SongsOfFellowshipImport(OpenOfficeImport): :param text_portion: A Piece of text """ text = text_portion.getString() - text = self.tidy_text(text) if text.strip() == '': return text if text_portion.CharWeight == BOLD: diff --git a/openlp/plugins/songs/lib/openlyricsxml.py b/openlp/plugins/songs/lib/openlyricsxml.py index 74d91068c..ef47fa77b 100644 --- a/openlp/plugins/songs/lib/openlyricsxml.py +++ b/openlp/plugins/songs/lib/openlyricsxml.py @@ -281,7 +281,7 @@ 
class OpenLyrics(object): # Process the formatting tags. # Have we any tags in song lyrics? tags_element = None - match = re.search('\{/?\w+\}', song.lyrics, re.UNICODE) + match = re.search(r'\{/?\w+\}', song.lyrics) if match: # Named 'format_' - 'format' is built-in function in Python. format_ = etree.SubElement(song_xml, 'format')