Make tidy text part of core. Remove redundant (since py3) re.UNICODE flag

Fixes: https://launchpad.net/bugs/1727517
This commit is contained in:
Phill Ridout 2017-10-28 11:04:09 +01:00
parent fe5430e157
commit 960ddedc6f
17 changed files with 39 additions and 47 deletions

View File

@ -43,9 +43,13 @@ log = logging.getLogger(__name__ + '.__init__')
FIRST_CAMEL_REGEX = re.compile('(.)([A-Z][a-z]+)')
SECOND_CAMEL_REGEX = re.compile('([a-z0-9])([A-Z])')
CONTROL_CHARS = re.compile(r'[\x00-\x1F\x7F-\x9F]', re.UNICODE)
INVALID_FILE_CHARS = re.compile(r'[\\/:\*\?"<>\|\+\[\]%]', re.UNICODE)
CONTROL_CHARS = re.compile(r'[\x00-\x1F\x7F-\x9F]')
INVALID_FILE_CHARS = re.compile(r'[\\/:\*\?"<>\|\+\[\]%]')
IMAGES_FILTER = None
REPLACMENT_CHARS_MAP = str.maketrans({'\u2018': '\'', '\u2019': '\'', '\u201c': '"', '\u201d': '"', '\u2026': '...',
'\u2013': '-', '\u2014': '-', '\v': '\n\n', '\f': '\n\n'})
NEW_LINE_REGEX = re.compile(r' ?(\r\n?|\n) ?')
WHITESPACE_REGEX = re.compile(r'[ \t]+')
def trace_error_handler(logger):
@ -436,3 +440,13 @@ def get_file_encoding(file_path):
return detector.result
except OSError:
log.exception('Error detecting file encoding')
def normalize_str(irreg_str):
    """
    Tidy up a string: swap typographic characters for plain equivalents and clean up white space.

    The characters listed in ``REPLACMENT_CHARS_MAP`` are translated first. Runs of spaces and tabs are then
    collapsed to a single space *before* the line breaks are normalised. ``NEW_LINE_REGEX`` only removes one space on
    either side of a line break, so collapsing first is what guarantees that no blank padding is left around it.

    :param str irreg_str: The string to normalise.
    :return: The normalised string.
    :rtype: str
    """
    irreg_str = irreg_str.translate(REPLACMENT_CHARS_MAP)
    # Order matters: collapse the padding first, otherwise 'a  \n  b' ends up as 'a \n b' instead of 'a\nb'.
    irreg_str = WHITESPACE_REGEX.sub(' ', irreg_str)
    return NEW_LINE_REGEX.sub('\n', irreg_str)

View File

@ -53,7 +53,7 @@ def translate(context, text, comment=None, qt_translate=QtCore.QCoreApplication.
Language = namedtuple('Language', ['id', 'name', 'code'])
ICU_COLLATOR = None
DIGITS_OR_NONDIGITS = re.compile(r'\d+|\D+', re.UNICODE)
DIGITS_OR_NONDIGITS = re.compile(r'\d+|\D+')
LANGUAGES = sorted([
Language(1, translate('common.languages', '(Afan) Oromo', 'Language code: om'), 'om'),
Language(2, translate('common.languages', 'Abkhazian', 'Language code: ab'), 'ab'),

View File

@ -38,7 +38,6 @@ log = logging.getLogger(__name__ + '.__init__')
SLIMCHARS = 'fiíIÍjlĺľrtť.,;/ ()|"\'!:\\'
class ServiceItemContext(object):
"""
The context in which a Service Item is being generated

View File

@ -92,7 +92,7 @@ class MediaManagerItem(QtWidgets.QWidget, RegistryProperties):
Run some initial setup. This method is separate from __init__ in order to mock it out in tests.
"""
self.hide()
self.whitespace = re.compile(r'[\W_]+', re.UNICODE)
self.whitespace = re.compile(r'[\W_]+')
visible_title = self.plugin.get_string(StringContent.VisibleName)
self.title = str(visible_title['title'])
Registry().register(self.plugin.name, self)

View File

@ -43,7 +43,7 @@ class FormattingTagController(object):
r'(?P<tag>[^\s/!\?>]+)(?:\s+[^\s=]+="[^"]*")*\s*(?P<empty>/)?'
r'|(?P<cdata>!\[CDATA\[(?:(?!\]\]>).)*\]\])'
r'|(?P<procinst>\?(?:(?!\?>).)*\?)'
r'|(?P<comment>!--(?:(?!-->).)*--))>', re.UNICODE)
r'|(?P<comment>!--(?:(?!-->).)*--))>')
self.html_regex = re.compile(r'^(?:[^<>]*%s)*[^<>]*$' % self.html_tag_regex.pattern)
def pre_save(self):

View File

@ -113,8 +113,7 @@ class BookNameForm(QDialog, Ui_BookNameDialog):
cor_book = self.corresponding_combo_box.currentText()
for character in '\\.^$*+?{}[]()':
cor_book = cor_book.replace(character, '\\' + character)
books = [key for key in list(self.book_names.keys()) if re.match(cor_book, str(self.book_names[key]),
re.UNICODE)]
books = [key for key in list(self.book_names.keys()) if re.match(cor_book, str(self.book_names[key]))]
books = [_f for _f in map(BiblesResourcesDB.get_book, books) if _f]
if books:
self.book_id = books[0]['id']

View File

@ -224,13 +224,13 @@ def update_reference_separators():
range_regex = '(?:(?P<from_chapter>[0-9]+){sep_v})?' \
'(?P<from_verse>[0-9]+)(?P<range_to>{sep_r}(?:(?:(?P<to_chapter>' \
'[0-9]+){sep_v})?(?P<to_verse>[0-9]+)|{sep_e})?)?'.format_map(REFERENCE_SEPARATORS)
REFERENCE_MATCHES['range'] = re.compile(r'^\s*{range}\s*$'.format(range=range_regex), re.UNICODE)
REFERENCE_MATCHES['range_separator'] = re.compile(REFERENCE_SEPARATORS['sep_l'], re.UNICODE)
REFERENCE_MATCHES['range'] = re.compile(r'^\s*{range}\s*$'.format(range=range_regex))
REFERENCE_MATCHES['range_separator'] = re.compile(REFERENCE_SEPARATORS['sep_l'])
# full reference match: <book>(<range>(,(?!$)|(?=$)))+
REFERENCE_MATCHES['full'] = \
re.compile(r'^\s*(?!\s)(?P<book>[\d]*[.]?[^\d\.]+)\.*(?<!\s)\s*'
r'(?P<ranges>(?:{range_regex}(?:{sep_l}(?!\s*$)|(?=\s*$)))+)\s*$'.format(
range_regex=range_regex, sep_l=REFERENCE_SEPARATORS['sep_l']), re.UNICODE)
range_regex=range_regex, sep_l=REFERENCE_SEPARATORS['sep_l']))
def get_reference_separator(separator_type):

View File

@ -307,8 +307,7 @@ class BibleDB(Manager):
book_escaped = book
for character in RESERVED_CHARACTERS:
book_escaped = book_escaped.replace(character, '\\' + character)
regex_book = re.compile('\\s*{book}\\s*'.format(book='\\s*'.join(book_escaped.split())),
re.UNICODE | re.IGNORECASE)
regex_book = re.compile('\\s*{book}\\s*'.format(book='\\s*'.join(book_escaped.split())), re.IGNORECASE)
if language_selection == LanguageSelection.Bible:
db_book = self.get_book(book)
if db_book:

View File

@ -105,9 +105,9 @@ class EditSongForm(QtWidgets.QDialog, Ui_EditSongDialog, RegistryProperties):
self.topics_list_view.setSortingEnabled(False)
self.topics_list_view.setAlternatingRowColors(True)
self.audio_list_widget.setAlternatingRowColors(True)
self.find_verse_split = re.compile('---\[\]---\n', re.UNICODE)
self.whitespace = re.compile(r'\W+', re.UNICODE)
self.find_tags = re.compile(u'\{/?\w+\}', re.UNICODE)
self.find_verse_split = re.compile('---\[\]---\n')
self.whitespace = re.compile(r'\W+')
self.find_tags = re.compile(r'\{/?\w+\}')
def _load_objects(self, cls, combo, cache):
"""

View File

@ -24,7 +24,6 @@ The :mod:`~openlp.plugins.songs.lib` module contains a number of library functio
"""
import logging
import os
import re
from PyQt5 import QtWidgets
@ -39,8 +38,8 @@ from openlp.plugins.songs.lib.ui import SongStrings
log = logging.getLogger(__name__)
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile('[\'`ʻ]', re.UNICODE)
WHITESPACE = re.compile(r'[\W_]+')
APOSTROPHE = re.compile(r'[\'`ʻ]')
# PATTERN will look for the next occurence of one of these symbols:
# \controlword - optionally preceded by \*, optionally followed by a number
# \'## - where ## is a pair of hex digits, representing a single character

View File

@ -25,6 +25,7 @@ import re
from lxml import etree, objectify
from openlp.core.common import normalize_str
from openlp.plugins.songs.lib import VerseType
from openlp.plugins.songs.lib.importers.songimport import SongImport
@ -225,7 +226,7 @@ class EasySlidesImport(SongImport):
verses[reg].setdefault(vt, {})
verses[reg][vt].setdefault(vn, {})
verses[reg][vt][vn].setdefault(inst, [])
verses[reg][vt][vn][inst].append(self.tidy_text(line))
verses[reg][vt][vn][inst].append(normalize_str(line))
# done parsing
versetags = []
# we use our_verse_order to ensure, we insert lyrics in the same order

View File

@ -101,7 +101,7 @@ class MediaShoutImport(SongImport):
self.song_book_name = song.SongID
for verse in verses:
tag = VERSE_TAGS[verse.Type] + str(verse.Number) if verse.Type < len(VERSE_TAGS) else 'O'
self.add_verse(self.tidy_text(verse.Text), tag)
self.add_verse(verse.Text, tag)
for order in verse_order:
if order.Type < len(VERSE_TAGS):
self.verse_order_list.append(VERSE_TAGS[order.Type] + str(order.Number))

View File

@ -24,7 +24,7 @@ import time
from PyQt5 import QtCore
from openlp.core.common import is_win, get_uno_command, get_uno_instance
from openlp.core.common import get_uno_command, get_uno_instance, is_win, normalize_str
from openlp.core.common.i18n import translate
from .songimport import SongImport
@ -241,7 +241,7 @@ class OpenOfficeImport(SongImport):
:param text: The text.
"""
song_texts = self.tidy_text(text).split('\f')
song_texts = normalize_str(text).split('\f')
self.set_defaults()
for song_text in song_texts:
if song_text.strip():

View File

@ -25,6 +25,7 @@ import re
from lxml import objectify
from lxml.etree import Error, LxmlError
from openlp.core.common import normalize_str
from openlp.core.common.i18n import translate
from openlp.core.common.settings import Settings
from openlp.plugins.songs.lib import VerseType
@ -262,7 +263,7 @@ class OpenSongImport(SongImport):
post=this_line[offset + column:])
offset += len(chord) + 2
# Tidy text and remove the ____s from extended words
this_line = self.tidy_text(this_line)
this_line = normalize_str(this_line)
this_line = this_line.replace('_', '')
this_line = this_line.replace('||', '\n[---]\n')
this_line = this_line.strip()

View File

@ -25,6 +25,7 @@ import re
from PyQt5 import QtCore
from openlp.core.common import normalize_str
from openlp.core.common.applocation import AppLocation
from openlp.core.common.i18n import translate
from openlp.core.common.path import copyfile, create_paths
@ -130,26 +131,6 @@ class SongImport(QtCore.QObject):
def register(self, import_wizard):
self.import_wizard = import_wizard
def tidy_text(self, text):
    """
    Get rid of some dodgy unicode and formatting characters we're not interested in. Some can be converted to ascii.

    Curly quotes, the ellipsis character and en/em dashes are replaced by their plain equivalents, vertical tabs and
    form feeds (page breaks) become two line breaks, runs of spaces/tabs are collapsed to one space, and every line
    break (``\\r\\n``, ``\\r`` or ``\\n``) is turned into ``\\n`` with the spaces around it removed.

    :param str text: The text to tidy.
    :return: The tidied text.
    :rtype: str
    """
    # A single translation pass instead of a chain of str.replace() calls. None of the replacement strings contains
    # a character that is itself a key, so the result is identical to replacing them one after another.
    replacements = str.maketrans({
        '\u2018': '\'',
        '\u2019': '\'',
        '\u201c': '"',
        '\u201d': '"',
        '\u2026': '...',
        '\u2013': '-',
        '\u2014': '-',
        # Vertical tab and form feed (page break) each become 2 linebreaks
        '\v': '\n\n',
        '\f': '\n\n',
    })
    text = text.translate(replacements)
    # Collapse surplus spaces/tabs first, so that at most one space is left on either side of a line break ...
    text = re.sub(r'[ \t]+', ' ', text)
    # ... which is then removed here together with the normalisation of the line break itself.
    return re.sub(r' ?(\r\n?|\n) ?', '\n', text)
def process_song_text(self, text):
"""
Process the song text from import
@ -368,7 +349,7 @@ class SongImport(QtCore.QObject):
verse_tag = VerseType.tags[VerseType.Other]
log.info('Versetype {old} changing to {new}'.format(old=verse_def, new=new_verse_def))
verse_def = new_verse_def
sxml.add_verse_to_lyrics(verse_tag, verse_def[1:], verse_text, lang)
sxml.add_verse_to_lyrics(verse_tag, verse_def[1:], normalize_str(verse_text), lang)
song.lyrics = str(sxml.extract_xml(), 'utf-8')
if not self.verse_order_list and self.verse_order_list_generated_useful:
self.verse_order_list = self.verse_order_list_generated

View File

@ -194,7 +194,6 @@ class SongsOfFellowshipImport(OpenOfficeImport):
:param text_portion: A Piece of text
"""
text = text_portion.getString()
text = self.tidy_text(text)
if text.strip() == '':
return text
if text_portion.CharWeight == BOLD:

View File

@ -281,7 +281,7 @@ class OpenLyrics(object):
# Process the formatting tags.
# Have we any tags in song lyrics?
tags_element = None
match = re.search('\{/?\w+\}', song.lyrics, re.UNICODE)
match = re.search(r'\{/?\w+\}', song.lyrics)
if match:
# Named 'format_' - 'format' is built-in function in Python.
format_ = etree.SubElement(song_xml, 'format')