From 8a599eab43a970e20a93321f295af7b5f6e017d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 20 Jun 2012 00:56:19 +0300 Subject: [PATCH 01/18] Sunday Plus importer, strip_rtf might be yet nonfunctional. --- openlp/core/ui/wizard.py | 1 + openlp/plugins/songs/forms/songimportform.py | 36 ++ openlp/plugins/songs/lib/importer.py | 39 +- openlp/plugins/songs/lib/sundayplusimport.py | 386 +++++++++++++++++++ 4 files changed, 445 insertions(+), 17 deletions(-) create mode 100644 openlp/plugins/songs/lib/sundayplusimport.py diff --git a/openlp/core/ui/wizard.py b/openlp/core/ui/wizard.py index 91aa42e43..e51c14274 100644 --- a/openlp/core/ui/wizard.py +++ b/openlp/core/ui/wizard.py @@ -55,6 +55,7 @@ class WizardStrings(object): SB = u'SongBeamer' SoF = u'Songs of Fellowship' SSP = u'SongShow Plus' + SP = u'Sunday Plus' WoW = u'Words of Worship' # These strings should need a good reason to be retranslated elsewhere. FinishedImport = translate('OpenLP.Ui', 'Finished import.') diff --git a/openlp/plugins/songs/forms/songimportform.py b/openlp/plugins/songs/forms/songimportform.py index db4d9bf56..214b1a978 100644 --- a/openlp/plugins/songs/forms/songimportform.py +++ b/openlp/plugins/songs/forms/songimportform.py @@ -159,6 +159,12 @@ class SongImportForm(OpenLPWizard): QtCore.QObject.connect(self.songShowPlusRemoveButton, QtCore.SIGNAL(u'clicked()'), self.onSongShowPlusRemoveButtonClicked) + QtCore.QObject.connect(self.sundayPlusAddButton, + QtCore.SIGNAL(u'clicked()'), + self.onSundayPlusAddButtonClicked) + QtCore.QObject.connect(self.sundayPlusRemoveButton, + QtCore.SIGNAL(u'clicked()'), + self.onSundayPlusRemoveButtonClicked) QtCore.QObject.connect(self.foilPresenterAddButton, QtCore.SIGNAL(u'clicked()'), self.onFoilPresenterAddButtonClicked) @@ -215,6 +221,8 @@ class SongImportForm(OpenLPWizard): self.addFileSelectItem(u'songShowPlus') # Songs of Fellowship self.addFileSelectItem(u'songsOfFellowship', None, True) + # Sunday Plus + self.addFileSelectItem(u'sundayPlus') # Words of Worship self.addFileSelectItem(u'wordsOfWorship') # Commented out for future use. @@ -258,6 +266,8 @@ class SongImportForm(OpenLPWizard): SongFormat.SongBeamer, WizardStrings.SB) self.formatComboBox.setItemText( SongFormat.SongShowPlus, WizardStrings.SSP) + self.formatComboBox.setItemText( + SongFormat.SundayPlus, WizardStrings.SP) self.formatComboBox.setItemText( SongFormat.SongsOfFellowship, WizardStrings.SoF) self.formatComboBox.setItemText( @@ -321,6 +331,10 @@ class SongImportForm(OpenLPWizard): translate('SongsPlugin.ImportWizardForm', 'Add Files...')) self.songShowPlusRemoveButton.setText( translate('SongsPlugin.ImportWizardForm', 'Remove File(s)')) + self.sundayPlusAddButton.setText( + translate('SongsPlugin.ImportWizardForm', 'Add Files...')) + self.sundayPlusRemoveButton.setText( + translate('SongsPlugin.ImportWizardForm', 'Remove File(s)')) self.foilPresenterAddButton.setText( translate('SongsPlugin.ImportWizardForm', 'Add Files...')) self.foilPresenterRemoveButton.setText( @@ -636,6 +650,22 @@ class SongImportForm(OpenLPWizard): """ self.removeSelectedItems(self.songShowPlusFileListWidget) + def onSundayPlusAddButtonClicked(self): + """ + Get Sunday Plus song database files + """ + self.getFiles(WizardStrings.OpenTypeFile % WizardStrings.SP, + self.sundayPlusFileListWidget, u'%s (*.ptf)' + % translate('SongsPlugin.ImportWizardForm', + 'Sunday Plus Song Files') + ) + + def onSundayPlusRemoveButtonClicked(self): + """ + Remove selected Sunday Plus files from the import list + """ + self.removeSelectedItems(self.sundayPlusFileListWidget) + def onFoilPresenterAddButtonClicked(self): """ Get FoilPresenter song database files @@ -677,6 +707,7 @@ class SongImportForm(OpenLPWizard): self.ewFilenameEdit.setText(u'') self.songBeamerFileListWidget.clear() self.songShowPlusFileListWidget.clear() + self.sundayPlusFileListWidget.clear() self.foilPresenterFileListWidget.clear() #self.csvFilenameEdit.setText(u'') self.errorReportTextEdit.clear() @@ -763,6 +794,11 @@ class SongImportForm(OpenLPWizard): importer = self.plugin.importSongs(SongFormat.SongShowPlus, filenames=self.getListOfFiles(self.songShowPlusFileListWidget) ) + elif source_format == SongFormat.SundayPlus: + # Import Sunday Plus songs + importer = self.plugin.importSongs(SongFormat.SundayPlus, + filenames=self.getListOfFiles(self.sundayPlusFileListWidget) + ) elif source_format == SongFormat.FoilPresenter: # Import Foilpresenter songs importer = self.plugin.importSongs(SongFormat.FoilPresenter, diff --git a/openlp/plugins/songs/lib/importer.py b/openlp/plugins/songs/lib/importer.py index 867b28c91..14a6d771c 100644 --- a/openlp/plugins/songs/lib/importer.py +++ b/openlp/plugins/songs/lib/importer.py @@ -29,16 +29,17 @@ The :mod:`importer` modules provides the general song import functionality. """ import logging -from opensongimport import OpenSongImport +from cclifileimport import CCLIFileImport from easyslidesimport import EasySlidesImport +from foilpresenterimport import FoilPresenterImport +from ewimport import EasyWorshipSongImport from olpimport import OpenLPSongImport from openlyricsimport import OpenLyricsImport -from wowimport import WowImport -from cclifileimport import CCLIFileImport -from ewimport import EasyWorshipSongImport +from opensongimport import OpenSongImport from songbeamerimport import SongBeamerImport from songshowplusimport import SongShowPlusImport -from foilpresenterimport import FoilPresenterImport +from sundayplusimport import SundayPlusImport +from wowimport import WowImport # Imports that might fail log = logging.getLogger(__name__) try: @@ -80,7 +81,8 @@ class SongFormat(object): SongBeamer = 9 SongShowPlus = 10 SongsOfFellowship = 11 - WordsOfWorship = 12 + SundayPlus = 12 + WordsOfWorship = 13 #CSV = 13 @staticmethod @@ -91,18 +93,12 @@ class SongFormat(object): ``format`` The song format. """ - if format == SongFormat.OpenLP2: + if format == SongFormat.OpenLyrics: + return OpenLyricsImport + elif format == SongFormat.OpenLP2: return OpenLPSongImport elif format == SongFormat.OpenLP1: return OpenLP1SongImport - elif format == SongFormat.OpenLyrics: - return OpenLyricsImport - elif format == SongFormat.OpenSong: - return OpenSongImport - elif format == SongFormat.SongsOfFellowship: - return SofImport - elif format == SongFormat.WordsOfWorship: - return WowImport elif format == SongFormat.Generic: return OooImport elif format == SongFormat.CCLI: @@ -111,12 +107,20 @@ class SongFormat(object): return EasySlidesImport elif format == SongFormat.EasyWorship: return EasyWorshipSongImport + elif format == SongFormat.FoilPresenter: + return FoilPresenterImport + elif format == SongFormat.OpenSong: + return OpenSongImport elif format == SongFormat.SongBeamer: return SongBeamerImport elif format == SongFormat.SongShowPlus: return SongShowPlusImport - elif format == SongFormat.FoilPresenter: - return FoilPresenterImport + elif format == SongFormat.SongsOfFellowship: + return SofImport + elif format == SongFormat.SundayPlus: + return SundayPlusImport + elif format == SongFormat.WordsOfWorship: + return WowImport return None @staticmethod @@ -137,6 +141,7 @@ class SongFormat(object): SongFormat.SongBeamer, SongFormat.SongShowPlus, SongFormat.SongsOfFellowship, + SongFormat.SundayPlus, SongFormat.WordsOfWorship ] diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py new file mode 100644 index 000000000..521b7e646 --- /dev/null +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -0,0 +1,386 @@ +# -*- coding: utf-8 -*- +# vim: autoindent shiftwidth=4 expandtab textwidth=80 tabstop=4 softtabstop=4 + +############################################################################### +# OpenLP - Open Source Lyrics Projection # +# --------------------------------------------------------------------------- # +# Copyright (c) 2008-2012 Raoul Snyman # +# Portions copyright (c) 2008-2012 Tim Bentley, Gerald Britton, Jonathan # +# Corwin, Michael Gorven, Scott Guerrieri, Matthias Hub, Meinert Jordan, # +# Armin Köhler, Joshua Miller, Stevan Pettit, Andreas Preikschat, Mattias # +# Põldaru, Christian Richter, Philip Ridout, Simon Scudder, Jeffrey Smith, # +# Maikel Stuivenberg, Martin Thompson, Jon Tibble, Frode Woldsund # +# --------------------------------------------------------------------------- # +# This program is free software; you can redistribute it and/or modify it # +# under the terms of the GNU General Public License as published by the Free # +# Software Foundation; version 2 of the License. # +# # +# This program is distributed in the hope that it will be useful, but WITHOUT # +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or # +# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for # +# more details. # +# # +# You should have received a copy of the GNU General Public License along # +# with this program; if not, write to the Free Software Foundation, Inc., 59 # +# Temple Place, Suite 330, Boston, MA 02111-1307 USA # +############################################################################### + +import logging +import re + +from lxml import objectify +from lxml.etree import Error, LxmlError +from os.path import split + +from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding +from openlp.plugins.songs.lib.songimport import SongImport +from openlp.plugins.songs.lib.ui import SongStrings + +log = logging.getLogger(__name__) + +class SundayPlusImport(SongImport): + """ + Import Sunday Plus songs + + The format examples can be found attached to bug report at + + """ + hotkey_to_verse_type = { + u'1': u'v1', + u'2': u'v2', + u'3': u'v3', + u'4': u'v4', + u'5': u'v5', + u'6': u'v6', + u'7': u'v7', + u'8': u'v8', + u'9': u'v9', + u'C': u'c', + u'+': u'b', + u'Z': u'o'} + + def __init__(self, manager, **kwargs): + """ + Initialise the class. + """ + SongImport.__init__(self, manager, **kwargs) + + def doImport(self): + self.importWizard.progressBar.setMaximum(len(self.importSource)) + self.encoding = 'us-ascii' + for filename in self.importSource: + if self.stopImportFlag: + return + song_file = open(filename) + self.doImportFile(song_file) + song_file.close() + + def doImportFile(self, file): + """ + Process the Sunday Plus file object. + """ + self.setDefaults() + if not self.parse(file.read()): + self.logError(file.name) + return + if self.title == '': + self.title = self.title_from_filename(file.name) + if not self.finish(): + self.logError(file.name) + + def parse(self, data, cell = False): + if data[0] != '[' and data[-1] != ']': + self.logError(u'File is malformed') + return False + i = 1 + verse_type = VerseType.Tags[VerseType.Verse] + while i < len(data): + byte = data[i] + if byte == '#': + end = data.find(':', i+1) + name = data[i+1:end] + i = end + 1 + while data[i] == ' ': + i += 1 + if data[i] == '"': + end = data.find('"', i+1) + value = data[i+1:end] + elif data[i] == '[': + j = i + inside_quotes = False + while j < len(data): + char = data[j] + if char == '"': + inside_quotes = not inside_quotes + elif not inside_quotes and char == ']': + end = j + 1 + break + j += 1 + value = data[i:end] + else: + end = data.find(',', i+1) + if data.find('(', i, end) != -1: + end = data.find(')', i) + 1 + value = data[i:end] + if cell == False: + if name == 'title': + self.title = self.decode(self.unescape(value)) + elif name == 'Author': + author = self.decode(self.unescape(value)) + if len(author): + self.addAuthor(author) + elif name == 'Copyright': + self.copyright = self.decode(self.unescape(value)) + elif name[0:4] == 'CELL': + self.parse(value, cell = name[4:]) + else: + if name == 'MARKER_NAME': + value = value.strip() + if len(value): + verse_type = VerseType.Tags[ + VerseType.from_loose_input(value[0])] + if len(value) >= 2 and value[-1] in ['0', '1', '2', + '3', '4', '5', '6', '7', '8', '9']: + verse_type = "%s%s" % (verse_type, value[-1]) + elif name == 'Hotkey': + # Hotkey always appears after MARKER_NAME, so it + # effectivetly overrides MARKER_NAME, if present. + if len(value) and \ + value in self.hotkey_to_verse_type.keys(): + verse_type = self.hotkey_to_verse_type[value] + if name == 'rtf': + value = self.unescape(value) + verse = strip_rtf(value, self.encoding).strip() + lines = verse.split('\n') + for i in xrange(len(lines)): + lines[i] = lines[i].strip() + line = lines[i] + if line[:4] in u'CCLI': + m = re.search(r'[0-9]+', line) + if m: + self.ccliNumber = int(m.group(0)) + lines.pop(i) + elif line.lower() == u'public domain': + lines.pop(i) + self.addVerse('\n'.join(lines).strip(), verse_type) + if end == -1: + break + i = end + 1 + i += 1 + return True + + def title_from_filename(self, filename): + filename = split(filename)[1] + if len(filename) > 4 and filename[-4:].lower() == u'.ptf': + title = filename[:-4] + else: + title = filename + if title[-3:] == '1-7': + title = title[:-3] + return title.replace(u'_', u' ') + + def decode(self, blob): + while True: + try: + return unicode(blob, self.encoding) + except: + # This is asked again every time the previously chosen + # encoding does not work. + self.encoding = retrieve_windows_encoding() + + def unescape(self, text): + text = text.replace('^^', '"') + text = text.replace('^', '\'') + return text.strip() + + def strip_rtf(self, text, encoding): + # Thanks to Markus Jarderot (MizardX) for this code, used by permission + # + pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'" + r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) + # Control words which specify a "destination" and we can ignore it. + destinations = frozenset(( + 'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', + 'atndate', 'atnicn', 'atnid', 'atnparent', 'atnref', 'atntime', + 'atrfend', 'atrfstart', 'author', 'background', 'bkmkend', + 'bkmkstart', 'blipuid', 'buptim', 'category', 'colorschememapping', + 'colortbl', 'comment', 'company', 'creatim', 'datafield', + 'datastore', 'defchp', 'defpap', 'do', 'doccomm', 'docvar', + 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'falt', 'fchars', + 'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'ffformat', 'ffhelptext', + 'ffl', 'ffname', 'ffstattext', 'field', 'file', 'filetbl', + 'fldinst', 'fldrslt', 'fldtype', 'fname', 'fontemb', 'fontfile', + 'footer', 'footerf', 'footerl', 'footerr', 'footnote', + 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'generator', + 'gridtbl', 'header', 'headerf', 'headerl', 'headerr', 'hl', 'hlfr', + 'hlinkbase', 'hlloc', 'hlsrc', 'hsv', 'htmltag', 'info', 'keycode', + 'keywords', 'latentstyles', 'lchars', 'levelnumbers', 'leveltext', + 'lfolevel', 'linkval', 'list', 'listlevel', 'listname', + 'listoverride', 'listoverridetable', 'listpicture', 'liststylename', + 'listtable', 'listtext', 'lsdlockedexcept', 'macc', 'maccPr', + 'mailmerge', 'maln', 'malnScr', 'manager', 'margPr', 'mbar', + 'mbarPr', 'mbaseJc', 'mbegChr', 'mborderBox', 'mborderBoxPr', + 'mbox', 'mboxPr', 'mchr', 'mcount', 'mctrlPr', 'md', 'mdeg', + 'mdegHide', 'mden', 'mdiff', 'mdPr', 'me', 'mendChr', 'meqArr', + 'meqArrPr', 'mf', 'mfName', 'mfPr', 'mfunc', 'mfuncPr', 'mgroupChr', + 'mgroupChrPr', 'mgrow', 'mhideBot', 'mhideLeft', 'mhideRight', + 'mhideTop', 'mhtmltag', 'mlim', 'mlimloc', 'mlimlow', 'mlimlowPr', + 'mlimupp', 'mlimuppPr', 'mm', 'mmaddfieldname', 'mmath', + 'mmathPict', 'mmathPr', 'mmaxdist', 'mmc', 'mmcJc', 'mmconnectstr', + 'mmconnectstrdata', 'mmcPr', 'mmcs', 'mmdatasource', + 'mmheadersource', 'mmmailsubject', 'mmodso', 'mmodsofilter', + 'mmodsofldmpdata', 'mmodsomappedname', 'mmodsoname', + 'mmodsorecipdata', 'mmodsosort', 'mmodsosrc', 'mmodsotable', + 'mmodsoudl', 'mmodsoudldata', 'mmodsouniquetag', 'mmPr', 'mmquery', + 'mmr', 'mnary', 'mnaryPr', 'mnoBreak', 'mnum', 'mobjDist', 'moMath', + 'moMathPara', 'moMathParaPr', 'mopEmu', 'mphant', 'mphantPr', + 'mplcHide', 'mpos', 'mr', 'mrad', 'mradPr', 'mrPr', 'msepChr', + 'mshow', 'mshp', 'msPre', 'msPrePr', 'msSub', 'msSubPr', 'msSubSup', + 'msSubSupPr', 'msSup', 'msSupPr', 'mstrikeBLTR', 'mstrikeH', + 'mstrikeTLBR', 'mstrikeV', 'msub', 'msubHide', 'msup', 'msupHide', + 'mtransp', 'mtype', 'mvertJc', 'mvfmf', 'mvfml', 'mvtof', 'mvtol', + 'mzeroAsc', 'mzeroDesc', 'mzeroWid', 'nesttableprops', 'nextfile', + 'nonesttables', 'objalias', 'objclass', 'objdata', 'object', + 'objname', 'objsect', 'objtime', 'oldcprops', 'oldpprops', + 'oldsprops', 'oldtprops', 'oleclsid', 'operator', 'panose', + 'password', 'passwordhash', 'pgp', 'pgptbl', 'picprop', 'pict', + 'pn', 'pnseclvl', 'pntext', 'pntxta', 'pntxtb', 'printim', + 'private', 'propname', 'protend', 'protstart', 'protusertbl', 'pxe', + 'result', 'revtbl', 'revtim', 'rsidtbl', 'rxe', 'shp', 'shpgrp', + 'shpinst', 'shppict', 'shprslt', 'shptxt', 'sn', 'sp', 'staticval', + 'stylesheet', 'subject', 'sv', 'svb', 'tc', 'template', 'themedata', + 'title', 'txe', 'ud', 'upr', 'userprops', 'wgrffmtfilter', + 'windowcaption', 'writereservation', 'writereservhash', 'xe', + 'xform', 'xmlattrname', 'xmlattrvalue', 'xmlclose', 'xmlname', + 'xmlnstbl', 'xmlopen')) + # Translation of some special characters. + specialchars = { + u'par': u'\n', + u'sect': u'\n\n', + u'page': u'\n\n', + u'line': u'\n', + u'tab': u'\t', + u'emdash': u'\u2014', + u'endash': u'\u2013', + u'emspace': u'\u2003', + u'enspace': u'\u2002', + u'qmspace': u'\u2005', + u'bullet': u'\u2022', + u'lquote': u'\u2018', + u'rquote': u'\u2019', + u'ldblquote': u'\u201C', + u'rdblquote': u'\u201D'} + charset_mapping = { + # Thai encoding + 'fcharset222': u'cp874', + 'ansicpg874': u'cp874', + # Central+East European encoding + 'fcharset238': u'cp1250', + 'ansicpg1250': u'cp1250', + # Cyrillic encoding + 'fcharset204': u'cp1251', + 'ansicpg1251': u'cp1251', + # West European encoding + 'fcharset0': u'cp1252', + 'ansicpg1252': u'cp1252', + # Greek encoding + 'fcharset161': u'cp1253', + 'ansicpg1253': u'cp1253', + # Turkish encoding + 'fcharset162': u'cp1254', + 'ansicpg1254': u'cp1254', + # Hebrew encoding + 'fcharset177': u'cp1255', + 'ansicpg1255': u'cp1255', + # Arabic encoding + 'fcharset178': u'cp1256', + 'ansicpg1256': u'cp1256', + # Baltic encoding + 'fcharset186': u'cp1257', + 'ansicpg1257': u'cp1257', + # Vietnamese encoding + 'fcharset163': u'cp1258', + 'ansicpg1258': u'cp1258'} + charsets = charset_mapping.keys() + # Character encoding is defined together with fonts. + # font_table could contain eg 'f0': 'cp1252' + font_table = {'default': encoding} + stack = [] + # Whether this group (and all inside it) are "ignorable". + ignorable = False + # Inside font table + inside_font_table = False + current_font = '' + # Number of ASCII characters to skip after a unicode character. + ucskip = 1 + # Number of ASCII characters left to skip. + curskip = 0 + # Output buffer. + out = [] + for match in pattern.finditer(text): + word, arg, hex, char, brace, tchar = match.groups() + if brace: + curskip = 0 + if brace == u'{': + # Push state + stack.append((ucskip, ignorable)) + elif brace == u'}': + # Pop state + ucskip, ignorable = stack.pop() + # \x (not a letter) + elif char: + curskip = 0 + if char == '~': + if not ignorable: + out.append(u'\xA0') + elif char in u'{}\\': + if not ignorable: + out.append(char) + elif char == u'*': + ignorable = True + # \foo + elif word: + curskip = 0 + if word in destinations: + ignorable = True + elif word in specialchars: + out.append(specialchars[word]) + elif word == u'uc': + ucskip = int(arg) + elif word == u'u': + c = int(arg) + if c < 0: + c += 0x10000 + out.append(unichr(c)) + curskip = ucskip + elif word == 'fonttbl': + inside_font_table = True + elif word == 'f': + if inside_font_table: + font_table[current_font] = font_table['default'] + else: + encoding = font_table[arg] + elif word in charsets: + if inside_font_table: + font_table[current_font] = charset_mapping[word] + else: + font_table['default'] = charset_mapping[word] + elif inside_font_table: + pass + elif ignorable: + pass + # \'xx + elif hex: + if curskip > 0: + curskip -= 1 + elif not ignorable: + c = int(hex, 16) + out.append(chr(c).decode(encoding)) + elif tchar: + if curskip > 0: + curskip -= 1 + elif not ignorable: + out.append(tchar) + return ''.join(out) + From 92baf9887c26d1a6cea251653a62135ffe40ca2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 20 Jun 2012 10:03:25 +0300 Subject: [PATCH 02/18] Fix strip_rtf() --- openlp/plugins/songs/lib/sundayplusimport.py | 28 +++++++++----------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index 12297d520..706e65263 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -304,15 +304,15 @@ class SundayPlusImport(SongImport): 'ansicpg1258': u'cp1258'} charsets = charset_mapping.keys() # Character encoding is defined together with fonts. - # font_table could contain eg 'f0': 'cp1252' - font_table = {'default': encoding} + # font_table could contain eg '0': 'cp1252' + font_table = {} stack = [] # Whether this group (and all inside it) are "ignorable". ignorable = False - # Inside font table + # Whether we are inside the font table. inside_font_table = False current_font = '' - # Number of ASCII characters to skip after a unicode character. + # Number of ASCII characters to skip after an unicode character. ucskip = 1 # Number of ASCII characters left to skip. curskip = 0 @@ -324,10 +324,10 @@ class SundayPlusImport(SongImport): curskip = 0 if brace == u'{': # Push state - stack.append((ucskip, ignorable)) + stack.append((ucskip, ignorable, inside_font_table)) elif brace == u'}': # Pop state - ucskip, ignorable = stack.pop() + ucskip, ignorable, inside_font_table = stack.pop() # \x (not a letter) elif char: curskip = 0 @@ -356,20 +356,16 @@ class SundayPlusImport(SongImport): curskip = ucskip elif word == 'fonttbl': inside_font_table = True + ignorable = True elif word == 'f': - if inside_font_table: - font_table[current_font] = font_table['default'] - else: + current_font = arg + if not inside_font_table: encoding = font_table[arg] - elif word in charsets: + elif word in ('ansicpg', 'fcharset'): if inside_font_table: - font_table[current_font] = charset_mapping[word] + font_table[current_font] = charset_mapping[word + arg] else: - font_table['default'] = charset_mapping[word] - elif inside_font_table: - pass - elif ignorable: - pass + encoding = charset_mapping[word + arg] # \'xx elif hex: if curskip > 0: From 76694891faa17e0baad481f0ac534c1350424957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 20 Jun 2012 10:15:40 +0300 Subject: [PATCH 03/18] SundayPlus is written together on their own site. --- openlp/plugins/songs/lib/importer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openlp/plugins/songs/lib/importer.py b/openlp/plugins/songs/lib/importer.py index 79bbccdff..862a8464c 100644 --- a/openlp/plugins/songs/lib/importer.py +++ b/openlp/plugins/songs/lib/importer.py @@ -280,10 +280,10 @@ class SongFormat(object): }, SundayPlus: { u'class': SundayPlusImport, - u'name': u'Sunday Plus', + u'name': u'SundayPlus', u'prefix': u'sundayPlus', u'filter': u'%s (*.ptf)' % translate( - 'SongsPlugin.ImportWizardForm', 'Sunday Plus Song Files') + 'SongsPlugin.ImportWizardForm', 'SundayPlus Song Files') }, WordsOfWorship: { u'class': WowImport, From 53e490995274d8f59eee56ee7de1f90ea9aab548 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 20 Jun 2012 10:23:41 +0300 Subject: [PATCH 04/18] Remove unnecessary imports. --- openlp/plugins/songs/lib/sundayplusimport.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index 706e65263..eabf3d40f 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -28,13 +28,10 @@ import logging import re -from lxml import objectify -from lxml.etree import Error, LxmlError from os.path import split from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding from openlp.plugins.songs.lib.songimport import SongImport -from openlp.plugins.songs.lib.ui import SongStrings log = logging.getLogger(__name__) From 0a2ea0fc51cf2ca8631c76b7ca52ee2120adb632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 20 Jun 2012 10:43:35 +0300 Subject: [PATCH 05/18] Remove unnecessary reimports which slipped in by accident. --- openlp/plugins/songs/lib/importer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/openlp/plugins/songs/lib/importer.py b/openlp/plugins/songs/lib/importer.py index 862a8464c..c97d09b8c 100644 --- a/openlp/plugins/songs/lib/importer.py +++ b/openlp/plugins/songs/lib/importer.py @@ -34,8 +34,6 @@ from openlp.core.lib.ui import UiStrings from openlp.core.ui.wizard import WizardStrings from opensongimport import OpenSongImport from easyslidesimport import EasySlidesImport -from foilpresenterimport import FoilPresenterImport -from ewimport import EasyWorshipSongImport from olpimport import OpenLPSongImport from openlyricsimport import OpenLyricsImport from wowimport import WowImport From 36f7e03dc0a68373b750f29f6c7157adcef43106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Sun, 24 Jun 2012 21:08:20 +0300 Subject: [PATCH 06/18] Move strip_rtf and change it into a class StripRtf + other changes. It might not work that well yet. --- openlp/plugins/songs/lib/__init__.py | 219 +++++++++++++++- openlp/plugins/songs/lib/ewimport.py | 100 +------- openlp/plugins/songs/lib/sundayplusimport.py | 248 +++---------------- 3 files changed, 255 insertions(+), 312 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index 87540ce54..c7f7ea6b7 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -24,6 +24,7 @@ # with this program; if not, write to the Free Software Foundation, Inc., 59 # # Temple Place, Suite 330, Boston, MA 02111-1307 USA # ############################################################################### +import logging import re from PyQt4 import QtGui @@ -33,6 +34,8 @@ from openlp.core.utils import CONTROL_CHARS from db import Author from ui import SongStrings +log = logging.getLogger(__name__) + WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE) @@ -194,7 +197,7 @@ class VerseType(object): return verse_index -def retrieve_windows_encoding(recommendation=None): +def retrieve_windows_encoding(recommendation=None, example_text=None): """ Determines which encoding to use on an information source. The process uses both automated detection, which is passed to this method as a @@ -203,6 +206,9 @@ def retrieve_windows_encoding(recommendation=None): ``recommendation`` A recommended encoding discovered programmatically for the user to confirm. + + ``example_text`` + Still not decoded text to show to users to help them decide. """ # map chardet result to compatible windows standard code page codepage_mapping = {'IBM866': u'cp866', 'TIS-620': u'cp874', @@ -365,6 +371,217 @@ def clean_song(manager, song): if song.copyright: song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip() +class StripRtf(): + """ + This class strips RTF control structures and returns an unicode string. + + Thanks to Markus Jarderot (MizardX) for this code, used by permission. + http://stackoverflow.com/questions/188545 + """ + pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'" + r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) + # Control words which specify a "destination" to be ignored. + destinations = frozenset(( + u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor', + u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime', + u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend', + u'bkmkstart', u'blipuid', u'buptim', u'category', + u'colorschememapping', u'colortbl', u'comment', u'company', u'creatim', + u'datafield', u'datastore', u'defchp', u'defpap', u'do', u'doccomm', + u'docvar', u'dptxbxtext', u'ebcend', u'ebcstart', u'factoidname', + u'falt', u'fchars', u'ffdeftext', u'ffentrymcr', u'ffexitmcr', + u'ffformat', u'ffhelptext', u'ffl', u'ffname', u'ffstattext', u'field', + u'file', u'filetbl', u'fldinst', u'fldrslt', u'fldtype', u'fname', + u'fontemb', u'fontfile', u'footer', u'footerf', u'footerl', u'footerr', + u'footnote', u'formfield', u'ftncn', u'ftnsep', u'ftnsepc', u'g', + u'generator', u'gridtbl', u'header', u'headerf', u'headerl', + u'headerr', u'hl', u'hlfr', u'hlinkbase', u'hlloc', u'hlsrc', u'hsv', + u'htmltag', u'info', u'keycode', u'keywords', u'latentstyles', + u'lchars', u'levelnumbers', u'leveltext', u'lfolevel', u'linkval', + u'list', u'listlevel', u'listname', u'listoverride', + u'listoverridetable', u'listpicture', u'liststylename', u'listtable', + u'listtext', u'lsdlockedexcept', u'macc', u'maccPr', u'mailmerge', + u'maln', u'malnScr', u'manager', u'margPr', u'mbar', u'mbarPr', + u'mbaseJc', u'mbegChr', u'mborderBox', u'mborderBoxPr', u'mbox', + u'mboxPr', u'mchr', u'mcount', u'mctrlPr', u'md', u'mdeg', u'mdegHide', + u'mden', u'mdiff', u'mdPr', u'me', u'mendChr', u'meqArr', u'meqArrPr', + u'mf', u'mfName', u'mfPr', u'mfunc', u'mfuncPr', u'mgroupChr', + u'mgroupChrPr', u'mgrow', u'mhideBot', u'mhideLeft', u'mhideRight', + u'mhideTop', u'mhtmltag', u'mlim', u'mlimloc', u'mlimlow', + u'mlimlowPr', u'mlimupp', u'mlimuppPr', u'mm', u'mmaddfieldname', + u'mmath', u'mmathPict', u'mmathPr', u'mmaxdist', u'mmc', u'mmcJc', + u'mmconnectstr', u'mmconnectstrdata', u'mmcPr', u'mmcs', + u'mmdatasource', u'mmheadersource', u'mmmailsubject', u'mmodso', + u'mmodsofilter', u'mmodsofldmpdata', u'mmodsomappedname', + u'mmodsoname', u'mmodsorecipdata', u'mmodsosort', u'mmodsosrc', + u'mmodsotable', u'mmodsoudl', u'mmodsoudldata', u'mmodsouniquetag', + u'mmPr', u'mmquery', u'mmr', u'mnary', u'mnaryPr', u'mnoBreak', + u'mnum', u'mobjDist', u'moMath', u'moMathPara', u'moMathParaPr', + u'mopEmu', u'mphant', u'mphantPr', u'mplcHide', u'mpos', u'mr', + u'mrad', u'mradPr', u'mrPr', u'msepChr', u'mshow', u'mshp', u'msPre', + u'msPrePr', u'msSub', u'msSubPr', u'msSubSup', u'msSubSupPr', u'msSup', + u'msSupPr', u'mstrikeBLTR', u'mstrikeH', u'mstrikeTLBR', u'mstrikeV', + u'msub', u'msubHide', u'msup', u'msupHide', u'mtransp', u'mtype', + u'mvertJc', u'mvfmf', u'mvfml', u'mvtof', u'mvtol', u'mzeroAsc', + u'mzeroDesc', u'mzeroWid', u'nesttableprops', u'nextfile', + u'nonesttables', u'objalias', u'objclass', u'objdata', u'object', + u'objname', u'objsect', u'objtime', u'oldcprops', u'oldpprops', + u'oldsprops', u'oldtprops', u'oleclsid', u'operator', u'panose', + u'password', u'passwordhash', u'pgp', u'pgptbl', u'picprop', u'pict', + u'pn', u'pnseclvl', u'pntext', u'pntxta', u'pntxtb', u'printim', + u'private', u'propname', u'protend', u'protstart', u'protusertbl', + u'pxe', u'result', u'revtbl', u'revtim', u'rsidtbl', u'rxe', u'shp', + u'shpgrp', u'shpinst', u'shppict', u'shprslt', u'shptxt', u'sn', u'sp', + u'staticval', u'stylesheet', u'subject', u'sv', u'svb', u'tc', + u'template', u'themedata', u'title', u'txe', u'ud', u'upr', + u'userprops', u'wgrffmtfilter', u'windowcaption', u'writereservation', + u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue', + u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen')) + # Translation of some special characters. + specialchars = { + u'par': u'\n', + u'sect': u'\n\n', + u'page': u'\n\n', + u'line': u'\n', + u'tab': u'\t', + u'emdash': u'\u2014', + u'endash': u'\u2013', + u'emspace': u'\u2003', + u'enspace': u'\u2002', + u'qmspace': u'\u2005', + u'bullet': u'\u2022', + u'lquote': u'\u2018', + u'rquote': u'\u2019', + u'ldblquote': u'\u201C', + u'rdblquote': u'\u201D'} + charset_mapping = { + u'fcharset0': u'cp1252', + u'fcharset1': None, + u'fcharset2': None, + u'fcharset77': None, + u'fcharset128': None, + u'fcharset129': None, + u'fcharset130': None, + u'fcharset134': None, + u'fcharset136': None, + u'fcharset161': u'cp1253', + u'fcharset162': u'cp1254', + u'fcharset163': u'cp1258', + u'fcharset177': u'cp1255', + u'fcharset178': u'cp1256', + u'fcharset186': u'cp1257', + u'fcharset204': u'cp1251', + u'fcharset222': u'cp874', + u'fcharset238': u'cp1250'} + + def strip_rtf(self, text, default_encoding=None): + # Current font is the font tag we last met. + font = u'' + # Character encoding is defined inside fonttable. + # font_table could contain eg u'0': u'cp1252' + font_table = {u'': default_encoding} + # Whether we are inside the font table. + inside_font_table = False + # Stack of things to keep track of when entering/leaving groups. + stack = [] + # Whether this group (and all inside it) are "ignorable". + ignorable = False + # Number of ASCII characters to skip after an unicode character. + ucskip = 1 + # Number of ASCII characters left to skip. + curskip = 0 + # Output buffer. + out = [] + for match in self.pattern.finditer(text): + word, arg, hex, char, brace, tchar = match.groups() + if brace: + curskip = 0 + if brace == u'{': + # Push state + stack.append((ucskip, ignorable, font, inside_font_table)) + elif brace == u'}': + # Pop state + ucskip, ignorable, font, inside_font_table = stack.pop() + # \x (not a letter) + elif char: + curskip = 0 + if char == u'~': + if not ignorable: + out.append(u'\xA0') + elif char in u'{}\\': + if not ignorable: + out.append(char) + elif char == u'*': + ignorable = True + # \command + elif word: + curskip = 0 + if word in self.destinations: + ignorable = True + elif word in self.specialchars: + out.append(self.specialchars[word]) + elif word == u'uc': + ucskip = int(arg) + elif word == u' ': + c = int(arg) + if c < 0: + c += 0x10000 + out.append(unichr(c)) + curskip = ucskip + elif word == u'fonttbl': + inside_font_table = True + ignorable = True + elif word == u'f': + font = arg + if not inside_font_table: + if arg in font_table.keys(): + encoding = font_table[arg] + else: + encoding = default_encoding + elif word == u'ansicpg': + if font == u'': + print "JEEEPASOIDFIJAD" + if inside_font_table or font == u'': + font_table[font] = 'cp' + arg + elif word == u'fcharset': + charset_reference = word + arg + if charset_reference in self.charset_mapping: + charset = self.charset_mapping[charset_reference] + if not charset: + charset = default_encoding + else: + log.error(u"Charset '%s' not in charset_mapping " + u"dictionary in " + u"openlp/plugins/songs/lib/__init__.py" + % charset_reference) + charset = default_encoding + if font == u'': + print "JEEEPASOIDFIadsfJAD" + if inside_font_table or font == u'': + font_table[font] = charset + # \'xx + elif hex: + if curskip > 0: + curskip -= 1 + elif not ignorable: + charcode = int(hex, 16) + while True: + try: + out.append(chr(charcode).decode(encoding)) + except UnicodeDecodeError: + encoding = \ + retrieve_windows_encoding(default_encoding) + if font: + font_table[font] = encoding + else: + break + elif tchar: + if curskip > 0: + curskip -= 1 + elif not ignorable: + out.append(tchar) + return u''.join(out) + from xml import OpenLyrics, SongXML from songstab import SongsTab from mediaitem import SongMediaItem diff --git a/openlp/plugins/songs/lib/ewimport.py b/openlp/plugins/songs/lib/ewimport.py index d58734610..7ecabcdd6 100644 --- a/openlp/plugins/songs/lib/ewimport.py +++ b/openlp/plugins/songs/lib/ewimport.py @@ -35,7 +35,7 @@ import re from openlp.core.lib import translate from openlp.plugins.songs.lib import VerseType -from openlp.plugins.songs.lib import retrieve_windows_encoding +from openlp.plugins.songs.lib import retrieve_windows_encoding, StripRtf from songimport import SongImport RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}') @@ -44,101 +44,6 @@ SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*') NUMBER_REGEX = re.compile(r'[0-9]+') NOTE_REGEX = re.compile(r'\(.*?\)') -def strip_rtf(blob, encoding): - depth = 0 - control = False - clear_text = [] - control_word = [] - - # workaround for \tx bug: remove one pair of curly braces - # if \tx is encountered - match = RTF_STRIPPING_REGEX.search(blob) - if match: - # start and end indices of match are curly braces - filter them out - blob = ''.join([blob[i] for i in xrange(len(blob)) - if i != match.start() and i !=match.end()]) - - for c in blob: - if control: - # for delimiters, set control to False - if c == '{': - if control_word: - depth += 1 - control = False - elif c == '}': - if control_word: - depth -= 1 - control = False - elif c == '\\': - new_control = bool(control_word) - control = False - elif c.isspace(): - control = False - else: - control_word.append(c) - if len(control_word) == 3 and control_word[0] == '\'': - control = False - if not control: - if not control_word: - if c == '{' or c == '}' or c == '\\': - clear_text.append(c) - else: - control_str = ''.join(control_word) - if control_str == 'par' or control_str == 'line': - clear_text.append(u'\n') - elif control_str == 'tab': - clear_text.append(u'\t') - # Prefer the encoding specified by the RTF data to that - # specified by the Paradox table header - # West European encoding - elif control_str == 'fcharset0': - encoding = u'cp1252' - # Greek encoding - elif control_str == 'fcharset161': - encoding = u'cp1253' - # Turkish encoding - elif control_str == 'fcharset162': - encoding = u'cp1254' - # Vietnamese encoding - elif control_str == 'fcharset163': - encoding = u'cp1258' - # Hebrew encoding - elif control_str == 'fcharset177': - encoding = u'cp1255' - # Arabic encoding - elif control_str == 'fcharset178': - encoding = u'cp1256' - # Baltic encoding - elif control_str == 'fcharset186': - encoding = u'cp1257' - # Cyrillic encoding - elif control_str == 'fcharset204': - encoding = u'cp1251' - # Thai encoding - elif control_str == 'fcharset222': - encoding = u'cp874' - # Central+East European encoding - elif control_str == 'fcharset238': - encoding = u'cp1250' - elif control_str[0] == '\'': - s = chr(int(control_str[1:3], 16)) - clear_text.append(s.decode(encoding)) - del control_word[:] - if c == '\\' and new_control: - control = True - elif c == '{': - depth += 1 - elif c == '}': - depth -= 1 - elif depth > 2: - continue - elif c == '\n' or c == '\r': - continue - elif c == '\\': - control = True - else: - clear_text.append(c) - return u''.join(clear_text) class FieldDescEntry: def __init__(self, name, type, size): @@ -154,6 +59,7 @@ class EasyWorshipSongImport(SongImport): """ def __init__(self, manager, **kwargs): SongImport.__init__(self, manager, **kwargs) + self.rtf = StripRtf() def doImport(self): # Open the DB and MB files if they exist @@ -273,7 +179,7 @@ class EasyWorshipSongImport(SongImport): self.addAuthor(author_name.strip()) if words: # Format the lyrics - words = strip_rtf(words, self.encoding) + words = self.rtf.strip_rtf(words, self.encoding) verse_type = VerseType.Tags[VerseType.Verse] for verse in SLIDE_BREAK_REGEX.split(words): verse = verse.strip() diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index eabf3d40f..ab37bd0d2 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -26,11 +26,11 @@ ############################################################################### import logging +import os import re -from os.path import split - from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding +from openlp.plugins.songs.lib import StripRtf from openlp.plugins.songs.lib.songimport import SongImport log = logging.getLogger(__name__) @@ -42,7 +42,7 @@ class SundayPlusImport(SongImport): The format examples can be found attached to bug report at """ - hotkey_to_verse_type = { + hotkeyToVerseType = { u'1': u'v1', u'2': u'v2', u'3': u'v3', @@ -61,6 +61,7 @@ class SundayPlusImport(SongImport): Initialise the class. """ SongImport.__init__(self, manager, **kwargs) + self.rtf = StripRtf() def doImport(self): self.importWizard.progressBar.setMaximum(len(self.importSource)) @@ -81,32 +82,33 @@ class SundayPlusImport(SongImport): self.logError(file.name) return if self.title == '': - self.title = self.title_from_filename(file.name) + self.title = self.titleFromFilename(file.name) if not self.finish(): self.logError(file.name) def parse(self, data, cell = False): - if data[0] != '[' and data[-1] != ']': + if len(data) == 0 or data[0:1] != '[' or data[-1] != ']': self.logError(u'File is malformed') return False i = 1 verse_type = VerseType.Tags[VerseType.Verse] while i < len(data): - byte = data[i] - if byte == '#': - end = data.find(':', i+1) - name = data[i+1:end] - i = end + 1 - while data[i] == ' ': + # Data is held as #name: value pairs inside groups marked as []. + # Now we are looking for name. + if data[i:i+1] == '#': + name_end = data.find(':', i+1) + name = data[i+1:name_end] + i = name_end + 1 + while data[i:i+1] == ' ': i += 1 - if data[i] == '"': + if data[i:i+1] == '"': end = data.find('"', i+1) value = data[i+1:end] - elif data[i] == '[': + elif data[i:i+1] == '[': j = i inside_quotes = False while j < len(data): - char = data[j] + char = data[j:j+1] if char == '"': inside_quotes = not inside_quotes elif not inside_quotes and char == ']': @@ -119,6 +121,7 @@ class SundayPlusImport(SongImport): if data.find('(', i, end) != -1: end = data.find(')', i) + 1 value = data[i:end] + # If we are in the main group. if cell == False: if name == 'title': self.title = self.decode(self.unescape(value)) @@ -130,6 +133,7 @@ class SundayPlusImport(SongImport): self.copyright = self.decode(self.unescape(value)) elif name[0:4] == 'CELL': self.parse(value, cell = name[4:]) + # We are in a verse group. else: if name == 'MARKER_NAME': value = value.strip() @@ -141,23 +145,27 @@ class SundayPlusImport(SongImport): verse_type = "%s%s" % (verse_type, value[-1]) elif name == 'Hotkey': # Hotkey always appears after MARKER_NAME, so it - # effectivetly overrides MARKER_NAME, if present. + # effectively overrides MARKER_NAME, if present. if len(value) and \ - value in self.hotkey_to_verse_type.keys(): - verse_type = self.hotkey_to_verse_type[value] + value in self.hotkeyToVerseType.keys(): + verse_type = self.hotkeyToVerseType[value] if name == 'rtf': value = self.unescape(value) - verse = self.strip_rtf(value, self.encoding).strip() - lines = verse.split('\n') + verse = self.rtf.strip_rtf(value, self.encoding) + lines = verse.strip().split('\n') + # If any line inside any verse contains CCLI or + # only Public Domain, we treat this as special data: + # we remove that line and add data to specific field. for i in xrange(len(lines)): lines[i] = lines[i].strip() line = lines[i] - if line[:4] in u'CCLI': + if line[:4].lower() == u'ccli': m = re.search(r'[0-9]+', line) if m: self.ccliNumber = int(m.group(0)) lines.pop(i) elif line.lower() == u'public domain': + self.copyright = u'Public Domain' lines.pop(i) self.addVerse('\n'.join(lines).strip(), verse_type) if end == -1: @@ -166,13 +174,12 @@ class SundayPlusImport(SongImport): i += 1 return True - def title_from_filename(self, filename): - filename = split(filename)[1] - if len(filename) > 4 and filename[-4:].lower() == u'.ptf': - title = filename[:-4] - else: - title = filename - if title[-3:] == '1-7': + def titleFromFilename(self, filename): + title = os.path.split(filename)[1] + if title.endswith(u'.ptf'): + title = title[:-4] + # For some strange reason all example files names ended with 1-7. + if title.endswith('1-7'): title = title[:-3] return title.replace(u'_', u' ') @@ -190,190 +197,3 @@ class SundayPlusImport(SongImport): text = text.replace('^', '\'') return text.strip() - def strip_rtf(self, text, encoding): - # Thanks to Markus Jarderot (MizardX) for this code, used by permission - # - pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'" - r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) - # Control words which specify a "destination" and we can ignore it. - destinations = frozenset(( - 'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', - 'atndate', 'atnicn', 'atnid', 'atnparent', 'atnref', 'atntime', - 'atrfend', 'atrfstart', 'author', 'background', 'bkmkend', - 'bkmkstart', 'blipuid', 'buptim', 'category', 'colorschememapping', - 'colortbl', 'comment', 'company', 'creatim', 'datafield', - 'datastore', 'defchp', 'defpap', 'do', 'doccomm', 'docvar', - 'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'falt', 'fchars', - 'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'ffformat', 'ffhelptext', - 'ffl', 'ffname', 'ffstattext', 'field', 'file', 'filetbl', - 'fldinst', 'fldrslt', 'fldtype', 'fname', 'fontemb', 'fontfile', - 'footer', 'footerf', 'footerl', 'footerr', 'footnote', - 'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'generator', - 'gridtbl', 'header', 'headerf', 'headerl', 'headerr', 'hl', 'hlfr', - 'hlinkbase', 'hlloc', 'hlsrc', 'hsv', 'htmltag', 'info', 'keycode', - 'keywords', 'latentstyles', 'lchars', 'levelnumbers', 'leveltext', - 'lfolevel', 'linkval', 'list', 'listlevel', 'listname', - 'listoverride', 'listoverridetable', 'listpicture', 'liststylename', - 'listtable', 'listtext', 'lsdlockedexcept', 'macc', 'maccPr', - 'mailmerge', 'maln', 'malnScr', 'manager', 'margPr', 'mbar', - 'mbarPr', 'mbaseJc', 'mbegChr', 'mborderBox', 'mborderBoxPr', - 'mbox', 'mboxPr', 'mchr', 'mcount', 'mctrlPr', 'md', 'mdeg', - 'mdegHide', 'mden', 'mdiff', 'mdPr', 'me', 'mendChr', 'meqArr', - 'meqArrPr', 'mf', 'mfName', 'mfPr', 'mfunc', 'mfuncPr', 'mgroupChr', - 'mgroupChrPr', 'mgrow', 'mhideBot', 'mhideLeft', 'mhideRight', - 'mhideTop', 'mhtmltag', 'mlim', 'mlimloc', 'mlimlow', 'mlimlowPr', - 'mlimupp', 'mlimuppPr', 'mm', 'mmaddfieldname', 'mmath', - 'mmathPict', 'mmathPr', 'mmaxdist', 'mmc', 'mmcJc', 'mmconnectstr', - 'mmconnectstrdata', 'mmcPr', 'mmcs', 'mmdatasource', - 'mmheadersource', 'mmmailsubject', 'mmodso', 'mmodsofilter', - 'mmodsofldmpdata', 'mmodsomappedname', 'mmodsoname', - 'mmodsorecipdata', 'mmodsosort', 'mmodsosrc', 'mmodsotable', - 'mmodsoudl', 'mmodsoudldata', 'mmodsouniquetag', 'mmPr', 'mmquery', - 'mmr', 'mnary', 'mnaryPr', 'mnoBreak', 'mnum', 'mobjDist', 'moMath', - 'moMathPara', 'moMathParaPr', 'mopEmu', 'mphant', 'mphantPr', - 'mplcHide', 'mpos', 'mr', 'mrad', 'mradPr', 'mrPr', 'msepChr', - 'mshow', 'mshp', 'msPre', 'msPrePr', 'msSub', 'msSubPr', 'msSubSup', - 'msSubSupPr', 'msSup', 'msSupPr', 'mstrikeBLTR', 'mstrikeH', - 'mstrikeTLBR', 'mstrikeV', 'msub', 'msubHide', 'msup', 'msupHide', - 'mtransp', 'mtype', 'mvertJc', 'mvfmf', 'mvfml', 'mvtof', 'mvtol', - 'mzeroAsc', 'mzeroDesc', 'mzeroWid', 'nesttableprops', 'nextfile', - 'nonesttables', 'objalias', 'objclass', 'objdata', 'object', - 'objname', 'objsect', 'objtime', 'oldcprops', 'oldpprops', - 'oldsprops', 'oldtprops', 'oleclsid', 'operator', 'panose', - 'password', 'passwordhash', 'pgp', 'pgptbl', 'picprop', 'pict', - 'pn', 'pnseclvl', 'pntext', 'pntxta', 'pntxtb', 'printim', - 'private', 'propname', 'protend', 'protstart', 'protusertbl', 'pxe', - 'result', 'revtbl', 'revtim', 'rsidtbl', 'rxe', 'shp', 'shpgrp', - 'shpinst', 'shppict', 'shprslt', 'shptxt', 'sn', 'sp', 'staticval', - 'stylesheet', 'subject', 'sv', 'svb', 'tc', 'template', 'themedata', - 'title', 'txe', 'ud', 'upr', 'userprops', 'wgrffmtfilter', - 'windowcaption', 'writereservation', 'writereservhash', 'xe', - 'xform', 'xmlattrname', 'xmlattrvalue', 'xmlclose', 'xmlname', - 'xmlnstbl', 'xmlopen')) - # Translation of some special characters. - specialchars = { - u'par': u'\n', - u'sect': u'\n\n', - u'page': u'\n\n', - u'line': u'\n', - u'tab': u'\t', - u'emdash': u'\u2014', - u'endash': u'\u2013', - u'emspace': u'\u2003', - u'enspace': u'\u2002', - u'qmspace': u'\u2005', - u'bullet': u'\u2022', - u'lquote': u'\u2018', - u'rquote': u'\u2019', - u'ldblquote': u'\u201C', - u'rdblquote': u'\u201D'} - charset_mapping = { - # Thai encoding - 'fcharset222': u'cp874', - 'ansicpg874': u'cp874', - # Central+East European encoding - 'fcharset238': u'cp1250', - 'ansicpg1250': u'cp1250', - # Cyrillic encoding - 'fcharset204': u'cp1251', - 'ansicpg1251': u'cp1251', - # West European encoding - 'fcharset0': u'cp1252', - 'ansicpg1252': u'cp1252', - # Greek encoding - 'fcharset161': u'cp1253', - 'ansicpg1253': u'cp1253', - # Turkish encoding - 'fcharset162': u'cp1254', - 'ansicpg1254': u'cp1254', - # Hebrew encoding - 'fcharset177': u'cp1255', - 'ansicpg1255': u'cp1255', - # Arabic encoding - 'fcharset178': u'cp1256', - 'ansicpg1256': u'cp1256', - # Baltic encoding - 'fcharset186': u'cp1257', - 'ansicpg1257': u'cp1257', - # Vietnamese encoding - 'fcharset163': u'cp1258', - 'ansicpg1258': u'cp1258'} - charsets = charset_mapping.keys() - # Character encoding is defined together with fonts. - # font_table could contain eg '0': 'cp1252' - font_table = {} - stack = [] - # Whether this group (and all inside it) are "ignorable". - ignorable = False - # Whether we are inside the font table. - inside_font_table = False - current_font = '' - # Number of ASCII characters to skip after an unicode character. - ucskip = 1 - # Number of ASCII characters left to skip. - curskip = 0 - # Output buffer. - out = [] - for match in pattern.finditer(text): - word, arg, hex, char, brace, tchar = match.groups() - if brace: - curskip = 0 - if brace == u'{': - # Push state - stack.append((ucskip, ignorable, inside_font_table)) - elif brace == u'}': - # Pop state - ucskip, ignorable, inside_font_table = stack.pop() - # \x (not a letter) - elif char: - curskip = 0 - if char == '~': - if not ignorable: - out.append(u'\xA0') - elif char in u'{}\\': - if not ignorable: - out.append(char) - elif char == u'*': - ignorable = True - # \foo - elif word: - curskip = 0 - if word in destinations: - ignorable = True - elif word in specialchars: - out.append(specialchars[word]) - elif word == u'uc': - ucskip = int(arg) - elif word == u'u': - c = int(arg) - if c < 0: - c += 0x10000 - out.append(unichr(c)) - curskip = ucskip - elif word == 'fonttbl': - inside_font_table = True - ignorable = True - elif word == 'f': - current_font = arg - if not inside_font_table: - encoding = font_table[arg] - elif word in ('ansicpg', 'fcharset'): - if inside_font_table: - font_table[current_font] = charset_mapping[word + arg] - else: - encoding = charset_mapping[word + arg] - # \'xx - elif hex: - if curskip > 0: - curskip -= 1 - elif not ignorable: - c = int(hex, 16) - out.append(chr(c).decode(encoding)) - elif tchar: - if curskip > 0: - curskip -= 1 - elif not ignorable: - out.append(tchar) - return ''.join(out) - From 75ae9065d00f6c336bbb21acfba205e10d38d508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Mon, 25 Jun 2012 01:59:28 +0300 Subject: [PATCH 07/18] Uppercase names for static variables. --- openlp/plugins/songs/lib/__init__.py | 24 +++++++++----------- openlp/plugins/songs/lib/sundayplusimport.py | 6 ++--- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index c7f7ea6b7..400817aad 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -378,10 +378,10 @@ class StripRtf(): Thanks to Markus Jarderot (MizardX) for this code, used by permission. http://stackoverflow.com/questions/188545 """ - pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'" + PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'" r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) # Control words which specify a "destination" to be ignored. - destinations = frozenset(( + DESTINATIONS = frozenset(( u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor', u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime', u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend', @@ -438,7 +438,7 @@ class StripRtf(): u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue', u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen')) # Translation of some special characters. - specialchars = { + SPECIALCHARS = { u'par': u'\n', u'sect': u'\n\n', u'page': u'\n\n', @@ -454,7 +454,7 @@ class StripRtf(): u'rquote': u'\u2019', u'ldblquote': u'\u201C', u'rdblquote': u'\u201D'} - charset_mapping = { + CHARSET_MAPPING = { u'fcharset0': u'cp1252', u'fcharset1': None, u'fcharset2': None, @@ -492,7 +492,7 @@ class StripRtf(): curskip = 0 # Output buffer. out = [] - for match in self.pattern.finditer(text): + for match in self.PATTERN.finditer(text): word, arg, hex, char, brace, tchar = match.groups() if brace: curskip = 0 @@ -516,10 +516,10 @@ class StripRtf(): # \command elif word: curskip = 0 - if word in self.destinations: + if word in self.DESTINATIONS: ignorable = True - elif word in self.specialchars: - out.append(self.specialchars[word]) + elif word in self.SPECIALCHARS: + out.append(self.SPECIALCHARS[word]) elif word == u'uc': ucskip = int(arg) elif word == u' ': @@ -540,23 +540,21 @@ class StripRtf(): encoding = default_encoding elif word == u'ansicpg': if font == u'': - print "JEEEPASOIDFIJAD" if inside_font_table or font == u'': font_table[font] = 'cp' + arg elif word == u'fcharset': charset_reference = word + arg - if charset_reference in self.charset_mapping: - charset = self.charset_mapping[charset_reference] + if charset_reference in self.CHARSET_MAPPING: + charset = self.CHARSET_MAPPING[charset_reference] if not charset: charset = default_encoding else: - log.error(u"Charset '%s' not in charset_mapping " + log.error(u"Charset '%s' not in CHARSET_MAPPING " u"dictionary in " u"openlp/plugins/songs/lib/__init__.py" % charset_reference) charset = default_encoding if font == u'': - print "JEEEPASOIDFIadsfJAD" if inside_font_table or font == u'': font_table[font] = charset # \'xx diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index ab37bd0d2..cc36896e9 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -42,7 +42,7 @@ class SundayPlusImport(SongImport): The format examples can be found attached to bug report at """ - hotkeyToVerseType = { + HOTKEYTOVERSETYPE = { u'1': u'v1', u'2': u'v2', u'3': u'v3', @@ -147,8 +147,8 @@ class SundayPlusImport(SongImport): # Hotkey always appears after MARKER_NAME, so it # effectively overrides MARKER_NAME, if present. if len(value) and \ - value in self.hotkeyToVerseType.keys(): - verse_type = self.hotkeyToVerseType[value] + value in self.HOTKEYTOVERSETYPE.keys(): + verse_type = self.HOTKEYTOVERSETYPE[value] if name == 'rtf': value = self.unescape(value) verse = self.rtf.strip_rtf(value, self.encoding) From 25dc4fe36c2c3272f18cc0bd13ebbb88a7aad091 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Mon, 25 Jun 2012 09:44:11 +0300 Subject: [PATCH 08/18] Better handling of encodings. User is asked only once, if possible. --- openlp/plugins/songs/lib/__init__.py | 49 ++++++++++++++++------------ 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index 400817aad..152d2a79c 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -473,15 +473,16 @@ class StripRtf(): u'fcharset204': u'cp1251', u'fcharset222': u'cp874', u'fcharset238': u'cp1250'} + # If user is asked for an encoding, it is used since then. + user_encoding = [] def strip_rtf(self, text, default_encoding=None): + self.default_encoding = default_encoding # Current font is the font tag we last met. font = u'' # Character encoding is defined inside fonttable. # font_table could contain eg u'0': u'cp1252' font_table = {u'': default_encoding} - # Whether we are inside the font table. - inside_font_table = False # Stack of things to keep track of when entering/leaving groups. stack = [] # Whether this group (and all inside it) are "ignorable". @@ -498,10 +499,10 @@ class StripRtf(): curskip = 0 if brace == u'{': # Push state - stack.append((ucskip, ignorable, font, inside_font_table)) + stack.append((ucskip, ignorable, font)) elif brace == u'}': # Pop state - ucskip, ignorable, font, inside_font_table = stack.pop() + ucskip, ignorable, font = stack.pop() # \x (not a letter) elif char: curskip = 0 @@ -533,29 +534,19 @@ class StripRtf(): ignorable = True elif word == u'f': font = arg - if not inside_font_table: - if arg in font_table.keys(): - encoding = font_table[arg] - else: - encoding = default_encoding elif word == u'ansicpg': - if font == u'': - if inside_font_table or font == u'': - font_table[font] = 'cp' + arg + font_table[font] = 'cp' + arg elif word == u'fcharset': charset_reference = word + arg if charset_reference in self.CHARSET_MAPPING: charset = self.CHARSET_MAPPING[charset_reference] - if not charset: - charset = default_encoding else: + charset = None log.error(u"Charset '%s' not in CHARSET_MAPPING " u"dictionary in " u"openlp/plugins/songs/lib/__init__.py" % charset_reference) - charset = default_encoding - if font == u'': - if inside_font_table or font == u'': + if font not in font_table: font_table[font] = charset # \'xx elif hex: @@ -563,14 +554,13 @@ class StripRtf(): curskip -= 1 elif not ignorable: charcode = int(hex, 16) + encoding = self.get_encoding(font, font_table) while True: try: out.append(chr(charcode).decode(encoding)) except UnicodeDecodeError: - encoding = \ - retrieve_windows_encoding(default_encoding) - if font: - font_table[font] = encoding + encoding = self.get_encoding(font, font_table, + failed=True) else: break elif tchar: @@ -580,6 +570,23 @@ class StripRtf(): out.append(tchar) return u''.join(out) + def get_encoding(self, font, font_table, failed=False): + encoding = None + if font in font_table: + encoding = font_table[font] + if not encoding and len(self.user_encoding): + encoding = self.user_encoding[-1] + if not encoding and self.default_encoding: + encoding = self.default_encoding + if not encoding or (failed and self.user_encoding == encoding): + encoding = retrieve_windows_encoding(self.default_encoding) + if encoding not in self.user_encoding: + self.user_encoding.append(encoding) + elif failed: + encoding = self.user_encoding + font_table[font] = encoding + return encoding + from xml import OpenLyrics, SongXML from songstab import SongsTab from mediaitem import SongMediaItem From bfeef67048475dd44ee8e5a16ac72477f35a3eef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Mon, 25 Jun 2012 11:08:53 +0300 Subject: [PATCH 09/18] Handling for some extra characters. --- openlp/plugins/songs/lib/__init__.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index 152d2a79c..a2230cc0a 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -441,7 +441,13 @@ class StripRtf(): SPECIALCHARS = { u'par': u'\n', u'sect': u'\n\n', + # Required page and column break. + # Would be good if we could split verse into subverses here. u'page': u'\n\n', + u'column': u'\n\n', + # Soft breaks. + u'softpage': u'[---]', + u'softcol': u'[---]', u'line': u'\n', u'tab': u'\t', u'emdash': u'\u2014', @@ -453,7 +459,11 @@ class StripRtf(): u'lquote': u'\u2018', u'rquote': u'\u2019', u'ldblquote': u'\u201C', - u'rdblquote': u'\u201D'} + u'rdblquote': u'\u201D', + u'ltrmark': u'\u200E', + u'rtlmark': u'\u200F', + u'zwj': u'\u200D', + u'zwnj': u'\u200C'} CHARSET_MAPPING = { u'fcharset0': u'cp1252', u'fcharset1': None, @@ -506,12 +516,14 @@ class StripRtf(): # \x (not a letter) elif char: curskip = 0 - if char == u'~': - if not ignorable: - out.append(u'\xA0') - elif char in u'{}\\': - if not ignorable: - out.append(char) + if char == u'~' and not ignorable: + out.append(u'\xA0') + elif char in u'{}\\' and not ignorable: + out.append(char) + elif char == u'-' and not ignorable: + out.append(u'\u00AD') + elif char == u'_' and not ignorable: + out.append(u'\u2011') elif char == u'*': ignorable = True # \command @@ -546,6 +558,7 @@ class StripRtf(): u"dictionary in " u"openlp/plugins/songs/lib/__init__.py" % charset_reference) + # This makes ansicpg always override fcharset if present. if font not in font_table: font_table[font] = charset # \'xx From 3b5bd4852f805b5f98e644f2dbabfe41a4e80e55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Mon, 25 Jun 2012 13:31:31 +0300 Subject: [PATCH 10/18] Sorry for leftovers of a thought. Maybe some other time. --- openlp/plugins/songs/lib/__init__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index a2230cc0a..07af491c6 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -197,7 +197,7 @@ class VerseType(object): return verse_index -def retrieve_windows_encoding(recommendation=None, example_text=None): +def retrieve_windows_encoding(recommendation=None): """ Determines which encoding to use on an information source. The process uses both automated detection, which is passed to this method as a @@ -206,9 +206,6 @@ def retrieve_windows_encoding(recommendation=None, example_text=None): ``recommendation`` A recommended encoding discovered programmatically for the user to confirm. - - ``example_text`` - Still not decoded text to show to users to help them decide. """ # map chardet result to compatible windows standard code page codepage_mapping = {'IBM866': u'cp866', 'TIS-620': u'cp874', From 4917831206698babdef46402af690b55255f4c34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Mon, 25 Jun 2012 13:41:55 +0300 Subject: [PATCH 11/18] Simplify previous cruft. --- openlp/plugins/songs/lib/__init__.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index 07af491c6..a9e360ceb 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -463,14 +463,6 @@ class StripRtf(): u'zwnj': u'\u200C'} CHARSET_MAPPING = { u'fcharset0': u'cp1252', - u'fcharset1': None, - u'fcharset2': None, - u'fcharset77': None, - u'fcharset128': None, - u'fcharset129': None, - u'fcharset130': None, - u'fcharset134': None, - u'fcharset136': None, u'fcharset161': u'cp1253', u'fcharset162': u'cp1254', u'fcharset163': u'cp1258', @@ -545,19 +537,10 @@ class StripRtf(): font = arg elif word == u'ansicpg': font_table[font] = 'cp' + arg - elif word == u'fcharset': - charset_reference = word + arg - if charset_reference in self.CHARSET_MAPPING: - charset = self.CHARSET_MAPPING[charset_reference] - else: - charset = None - log.error(u"Charset '%s' not in CHARSET_MAPPING " - u"dictionary in " - u"openlp/plugins/songs/lib/__init__.py" - % charset_reference) - # This makes ansicpg always override fcharset if present. - if font not in font_table: - font_table[font] = charset + elif word == u'fcharset' and font not in font_table and \ + word + arg in self.CHARSET_MAPPING: + # \ansicpg overrides \fcharset, if present. + font_table[font] = self.CHARSET_MAPPING[word + arg] # \'xx elif hex: if curskip > 0: From 556f28db3363310ae7e4c769305c76fad1d50ab9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Mon, 25 Jun 2012 17:49:33 +0300 Subject: [PATCH 12/18] Constant names' readability. Thanks, Samuel. --- openlp/plugins/songs/lib/__init__.py | 6 +++--- openlp/plugins/songs/lib/sundayplusimport.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index a9e360ceb..eb616d30d 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -435,7 +435,7 @@ class StripRtf(): u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue', u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen')) # Translation of some special characters. - SPECIALCHARS = { + SPECIAL_CHARS = { u'par': u'\n', u'sect': u'\n\n', # Required page and column break. @@ -520,8 +520,8 @@ class StripRtf(): curskip = 0 if word in self.DESTINATIONS: ignorable = True - elif word in self.SPECIALCHARS: - out.append(self.SPECIALCHARS[word]) + elif word in self.SPECIAL_CHARS: + out.append(self.SPECIAL_CHARS[word]) elif word == u'uc': ucskip = int(arg) elif word == u' ': diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index cc36896e9..b9b985e8f 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -42,7 +42,7 @@ class SundayPlusImport(SongImport): The format examples can be found attached to bug report at """ - HOTKEYTOVERSETYPE = { + HOTKEY_TO_VERSE_TYPE = { u'1': u'v1', u'2': u'v2', u'3': u'v3', @@ -147,8 +147,8 @@ class SundayPlusImport(SongImport): # Hotkey always appears after MARKER_NAME, so it # effectively overrides MARKER_NAME, if present. if len(value) and \ - value in self.HOTKEYTOVERSETYPE.keys(): - verse_type = self.HOTKEYTOVERSETYPE[value] + value in self.HOTKEY_TO_VERSE_TYPE.keys(): + verse_type = self.HOTKEY_TO_VERSE_TYPE[value] if name == 'rtf': value = self.unescape(value) verse = self.rtf.strip_rtf(value, self.encoding) From 4aa62141a052b7af4bd8222bff42fda5165e8907 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Mon, 25 Jun 2012 23:30:24 +0300 Subject: [PATCH 13/18] Ensure it is asked only once for encoding. --- openlp/plugins/songs/lib/sundayplusimport.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index b9b985e8f..6260026b6 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -61,11 +61,11 @@ class SundayPlusImport(SongImport): Initialise the class. """ SongImport.__init__(self, manager, **kwargs) + self.encoding = u'us-ascii' self.rtf = StripRtf() def doImport(self): self.importWizard.progressBar.setMaximum(len(self.importSource)) - self.encoding = 'us-ascii' for filename in self.importSource: if self.stopImportFlag: return @@ -81,12 +81,12 @@ class SundayPlusImport(SongImport): if not self.parse(file.read()): self.logError(file.name) return - if self.title == '': + if not self.title: self.title = self.titleFromFilename(file.name) if not self.finish(): self.logError(file.name) - def parse(self, data, cell = False): + def parse(self, data, cell=False): if len(data) == 0 or data[0:1] != '[' or data[-1] != ']': self.logError(u'File is malformed') return False @@ -94,7 +94,7 @@ class SundayPlusImport(SongImport): verse_type = VerseType.Tags[VerseType.Verse] while i < len(data): # Data is held as #name: value pairs inside groups marked as []. - # Now we are looking for name. + # Now we are looking for the name. if data[i:i+1] == '#': name_end = data.find(':', i+1) name = data[i+1:name_end] @@ -179,7 +179,7 @@ class SundayPlusImport(SongImport): if title.endswith(u'.ptf'): title = title[:-4] # For some strange reason all example files names ended with 1-7. - if title.endswith('1-7'): + if title.endswith(u'1-7'): title = title[:-3] return title.replace(u'_', u' ') @@ -189,8 +189,13 @@ class SundayPlusImport(SongImport): return unicode(blob, self.encoding) except: # This is asked again every time the previously chosen - # encoding does not work. - self.encoding = retrieve_windows_encoding() + # encoding does not work. Integrated with StripRtf encoding. + if len(self.rtf.user_encoding) and \ + self.encoding != self.rtf.user_encoding[-1]: + self.encoding = self.rtf.user_encoding[-1] + else: + self.encoding = retrieve_windows_encoding() + self.rtf.user_encoding.append(self.encoding) def unescape(self, text): text = text.replace('^^', '"') From aef1c550e03842095564e9b47e0686ded289af38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 4 Jul 2012 01:26:54 +0300 Subject: [PATCH 14/18] Make strip_rtf a function again. --- openlp/plugins/songs/lib/__init__.py | 430 ++++++++++--------- openlp/plugins/songs/lib/ewimport.py | 5 +- openlp/plugins/songs/lib/sundayplusimport.py | 14 +- 3 files changed, 227 insertions(+), 222 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index eb616d30d..b6177c3d7 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -24,7 +24,6 @@ # with this program; if not, write to the Free Software Foundation, Inc., 59 # # Temple Place, Suite 330, Boston, MA 02111-1307 USA # ############################################################################### -import logging import re from PyQt4 import QtGui @@ -34,10 +33,106 @@ from openlp.core.utils import CONTROL_CHARS from db import Author from ui import SongStrings -log = logging.getLogger(__name__) - WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE) +PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'" + r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) +# RTF control words which specify a "destination" to be ignored. +DESTINATIONS = frozenset(( + u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor', + u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime', + u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend', + u'bkmkstart', u'blipuid', u'buptim', u'category', + u'colorschememapping', u'colortbl', u'comment', u'company', u'creatim', + u'datafield', u'datastore', u'defchp', u'defpap', u'do', u'doccomm', + u'docvar', u'dptxbxtext', u'ebcend', u'ebcstart', u'factoidname', + u'falt', u'fchars', u'ffdeftext', u'ffentrymcr', u'ffexitmcr', + u'ffformat', u'ffhelptext', u'ffl', u'ffname', u'ffstattext', u'field', + u'file', u'filetbl', u'fldinst', u'fldrslt', u'fldtype', u'fname', + u'fontemb', u'fontfile', u'footer', u'footerf', u'footerl', u'footerr', + u'footnote', u'formfield', u'ftncn', u'ftnsep', u'ftnsepc', u'g', + u'generator', u'gridtbl', u'header', u'headerf', u'headerl', + u'headerr', u'hl', u'hlfr', u'hlinkbase', u'hlloc', u'hlsrc', u'hsv', + u'htmltag', u'info', u'keycode', u'keywords', u'latentstyles', + u'lchars', u'levelnumbers', u'leveltext', u'lfolevel', u'linkval', + u'list', u'listlevel', u'listname', u'listoverride', + u'listoverridetable', u'listpicture', u'liststylename', u'listtable', + u'listtext', u'lsdlockedexcept', u'macc', u'maccPr', u'mailmerge', + u'maln', u'malnScr', u'manager', u'margPr', u'mbar', u'mbarPr', + u'mbaseJc', u'mbegChr', u'mborderBox', u'mborderBoxPr', u'mbox', + u'mboxPr', u'mchr', u'mcount', u'mctrlPr', u'md', u'mdeg', u'mdegHide', + u'mden', u'mdiff', u'mdPr', u'me', u'mendChr', u'meqArr', u'meqArrPr', + u'mf', u'mfName', u'mfPr', u'mfunc', u'mfuncPr', u'mgroupChr', + u'mgroupChrPr', u'mgrow', u'mhideBot', u'mhideLeft', u'mhideRight', + u'mhideTop', u'mhtmltag', u'mlim', u'mlimloc', u'mlimlow', + u'mlimlowPr', u'mlimupp', u'mlimuppPr', u'mm', u'mmaddfieldname', + u'mmath', u'mmathPict', u'mmathPr', u'mmaxdist', u'mmc', u'mmcJc', + u'mmconnectstr', u'mmconnectstrdata', u'mmcPr', u'mmcs', + u'mmdatasource', u'mmheadersource', u'mmmailsubject', u'mmodso', + u'mmodsofilter', u'mmodsofldmpdata', u'mmodsomappedname', + u'mmodsoname', u'mmodsorecipdata', u'mmodsosort', u'mmodsosrc', + u'mmodsotable', u'mmodsoudl', u'mmodsoudldata', u'mmodsouniquetag', + u'mmPr', u'mmquery', u'mmr', u'mnary', u'mnaryPr', u'mnoBreak', + u'mnum', u'mobjDist', u'moMath', u'moMathPara', u'moMathParaPr', + u'mopEmu', u'mphant', u'mphantPr', u'mplcHide', u'mpos', u'mr', + u'mrad', u'mradPr', u'mrPr', u'msepChr', u'mshow', u'mshp', u'msPre', + u'msPrePr', u'msSub', u'msSubPr', u'msSubSup', u'msSubSupPr', u'msSup', + u'msSupPr', u'mstrikeBLTR', u'mstrikeH', u'mstrikeTLBR', u'mstrikeV', + u'msub', u'msubHide', u'msup', u'msupHide', u'mtransp', u'mtype', + u'mvertJc', u'mvfmf', u'mvfml', u'mvtof', u'mvtol', u'mzeroAsc', + u'mzeroDesc', u'mzeroWid', u'nesttableprops', u'nextfile', + u'nonesttables', u'objalias', u'objclass', u'objdata', u'object', + u'objname', u'objsect', u'objtime', u'oldcprops', u'oldpprops', + u'oldsprops', u'oldtprops', u'oleclsid', u'operator', u'panose', + u'password', u'passwordhash', u'pgp', u'pgptbl', u'picprop', u'pict', + u'pn', u'pnseclvl', u'pntext', u'pntxta', u'pntxtb', u'printim', + u'private', u'propname', u'protend', u'protstart', u'protusertbl', + u'pxe', u'result', u'revtbl', u'revtim', u'rsidtbl', u'rxe', u'shp', + u'shpgrp', u'shpinst', u'shppict', u'shprslt', u'shptxt', u'sn', u'sp', + u'staticval', u'stylesheet', u'subject', u'sv', u'svb', u'tc', + u'template', u'themedata', u'title', u'txe', u'ud', u'upr', + u'userprops', u'wgrffmtfilter', u'windowcaption', u'writereservation', + u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue', + u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen')) +# Translation of some special characters. +SPECIAL_CHARS = { + u'par': u'\n', + u'sect': u'\n\n', + # Required page and column break. + # Would be good if we could split verse into subverses here. + u'page': u'\n\n', + u'column': u'\n\n', + # Soft breaks. + u'softpage': u'[---]', + u'softcol': u'[---]', + u'line': u'\n', + u'tab': u'\t', + u'emdash': u'\u2014', + u'endash': u'\u2013', + u'emspace': u'\u2003', + u'enspace': u'\u2002', + u'qmspace': u'\u2005', + u'bullet': u'\u2022', + u'lquote': u'\u2018', + u'rquote': u'\u2019', + u'ldblquote': u'\u201C', + u'rdblquote': u'\u201D', + u'ltrmark': u'\u200E', + u'rtlmark': u'\u200F', + u'zwj': u'\u200D', + u'zwnj': u'\u200C'} +CHARSET_MAPPING = { + u'fcharset0': u'cp1252', + u'fcharset161': u'cp1253', + u'fcharset162': u'cp1254', + u'fcharset163': u'cp1258', + u'fcharset177': u'cp1255', + u'fcharset178': u'cp1256', + u'fcharset186': u'cp1257', + u'fcharset204': u'cp1251', + u'fcharset222': u'cp874', + u'fcharset238': u'cp1250'} + class VerseType(object): """ @@ -368,217 +463,136 @@ def clean_song(manager, song): if song.copyright: song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip() -class StripRtf(): + +def get_encoding(font, font_table, default_encoding, failed=False): """ - This class strips RTF control structures and returns an unicode string. + Finds an encoding to use. Asks user, if necessary. + + ``font`` + The number of currently active font. + + ``font_table`` + Dictionary of fonts and respective encodings. + + ``default_encoding`` + The defaul encoding to use when font_table is empty or no font is used. + + ``failed`` + A boolean indicating whether the previous encoding didn't work. + """ + encoding = None + if font in font_table: + encoding = font_table[font] + if not encoding and default_encoding: + encoding = default_encoding + if not encoding or failed: + encoding = retrieve_windows_encoding() + default_encoding = encoding + font_table[font] = encoding + return encoding, default_encoding + + +def strip_rtf(text, default_encoding=None): + """ + This function strips RTF control structures and returns an unicode string. Thanks to Markus Jarderot (MizardX) for this code, used by permission. http://stackoverflow.com/questions/188545 + + ``text`` + RTF-encoded text, a string. + + ``default_encoding`` + Default encoding to use when no encoding is specified. """ - PATTERN = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'" - r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) - # Control words which specify a "destination" to be ignored. - DESTINATIONS = frozenset(( - u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor', - u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime', - u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend', - u'bkmkstart', u'blipuid', u'buptim', u'category', - u'colorschememapping', u'colortbl', u'comment', u'company', u'creatim', - u'datafield', u'datastore', u'defchp', u'defpap', u'do', u'doccomm', - u'docvar', u'dptxbxtext', u'ebcend', u'ebcstart', u'factoidname', - u'falt', u'fchars', u'ffdeftext', u'ffentrymcr', u'ffexitmcr', - u'ffformat', u'ffhelptext', u'ffl', u'ffname', u'ffstattext', u'field', - u'file', u'filetbl', u'fldinst', u'fldrslt', u'fldtype', u'fname', - u'fontemb', u'fontfile', u'footer', u'footerf', u'footerl', u'footerr', - u'footnote', u'formfield', u'ftncn', u'ftnsep', u'ftnsepc', u'g', - u'generator', u'gridtbl', u'header', u'headerf', u'headerl', - u'headerr', u'hl', u'hlfr', u'hlinkbase', u'hlloc', u'hlsrc', u'hsv', - u'htmltag', u'info', u'keycode', u'keywords', u'latentstyles', - u'lchars', u'levelnumbers', u'leveltext', u'lfolevel', u'linkval', - u'list', u'listlevel', u'listname', u'listoverride', - u'listoverridetable', u'listpicture', u'liststylename', u'listtable', - u'listtext', u'lsdlockedexcept', u'macc', u'maccPr', u'mailmerge', - u'maln', u'malnScr', u'manager', u'margPr', u'mbar', u'mbarPr', - u'mbaseJc', u'mbegChr', u'mborderBox', u'mborderBoxPr', u'mbox', - u'mboxPr', u'mchr', u'mcount', u'mctrlPr', u'md', u'mdeg', u'mdegHide', - u'mden', u'mdiff', u'mdPr', u'me', u'mendChr', u'meqArr', u'meqArrPr', - u'mf', u'mfName', u'mfPr', u'mfunc', u'mfuncPr', u'mgroupChr', - u'mgroupChrPr', u'mgrow', u'mhideBot', u'mhideLeft', u'mhideRight', - u'mhideTop', u'mhtmltag', u'mlim', u'mlimloc', u'mlimlow', - u'mlimlowPr', u'mlimupp', u'mlimuppPr', u'mm', u'mmaddfieldname', - u'mmath', u'mmathPict', u'mmathPr', u'mmaxdist', u'mmc', u'mmcJc', - u'mmconnectstr', u'mmconnectstrdata', u'mmcPr', u'mmcs', - u'mmdatasource', u'mmheadersource', u'mmmailsubject', u'mmodso', - u'mmodsofilter', u'mmodsofldmpdata', u'mmodsomappedname', - u'mmodsoname', u'mmodsorecipdata', u'mmodsosort', u'mmodsosrc', - u'mmodsotable', u'mmodsoudl', u'mmodsoudldata', u'mmodsouniquetag', - u'mmPr', u'mmquery', u'mmr', u'mnary', u'mnaryPr', u'mnoBreak', - u'mnum', u'mobjDist', u'moMath', u'moMathPara', u'moMathParaPr', - u'mopEmu', u'mphant', u'mphantPr', u'mplcHide', u'mpos', u'mr', - u'mrad', u'mradPr', u'mrPr', u'msepChr', u'mshow', u'mshp', u'msPre', - u'msPrePr', u'msSub', u'msSubPr', u'msSubSup', u'msSubSupPr', u'msSup', - u'msSupPr', u'mstrikeBLTR', u'mstrikeH', u'mstrikeTLBR', u'mstrikeV', - u'msub', u'msubHide', u'msup', u'msupHide', u'mtransp', u'mtype', - u'mvertJc', u'mvfmf', u'mvfml', u'mvtof', u'mvtol', u'mzeroAsc', - u'mzeroDesc', u'mzeroWid', u'nesttableprops', u'nextfile', - u'nonesttables', u'objalias', u'objclass', u'objdata', u'object', - u'objname', u'objsect', u'objtime', u'oldcprops', u'oldpprops', - u'oldsprops', u'oldtprops', u'oleclsid', u'operator', u'panose', - u'password', u'passwordhash', u'pgp', u'pgptbl', u'picprop', u'pict', - u'pn', u'pnseclvl', u'pntext', u'pntxta', u'pntxtb', u'printim', - u'private', u'propname', u'protend', u'protstart', u'protusertbl', - u'pxe', u'result', u'revtbl', u'revtim', u'rsidtbl', u'rxe', u'shp', - u'shpgrp', u'shpinst', u'shppict', u'shprslt', u'shptxt', u'sn', u'sp', - u'staticval', u'stylesheet', u'subject', u'sv', u'svb', u'tc', - u'template', u'themedata', u'title', u'txe', u'ud', u'upr', - u'userprops', u'wgrffmtfilter', u'windowcaption', u'writereservation', - u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue', - u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen')) - # Translation of some special characters. - SPECIAL_CHARS = { - u'par': u'\n', - u'sect': u'\n\n', - # Required page and column break. - # Would be good if we could split verse into subverses here. - u'page': u'\n\n', - u'column': u'\n\n', - # Soft breaks. - u'softpage': u'[---]', - u'softcol': u'[---]', - u'line': u'\n', - u'tab': u'\t', - u'emdash': u'\u2014', - u'endash': u'\u2013', - u'emspace': u'\u2003', - u'enspace': u'\u2002', - u'qmspace': u'\u2005', - u'bullet': u'\u2022', - u'lquote': u'\u2018', - u'rquote': u'\u2019', - u'ldblquote': u'\u201C', - u'rdblquote': u'\u201D', - u'ltrmark': u'\u200E', - u'rtlmark': u'\u200F', - u'zwj': u'\u200D', - u'zwnj': u'\u200C'} - CHARSET_MAPPING = { - u'fcharset0': u'cp1252', - u'fcharset161': u'cp1253', - u'fcharset162': u'cp1254', - u'fcharset163': u'cp1258', - u'fcharset177': u'cp1255', - u'fcharset178': u'cp1256', - u'fcharset186': u'cp1257', - u'fcharset204': u'cp1251', - u'fcharset222': u'cp874', - u'fcharset238': u'cp1250'} - # If user is asked for an encoding, it is used since then. - user_encoding = [] + # Current font is the font tag we last met. + font = u'' + # Character encoding is defined inside fonttable. + # font_table could contain eg u'0': u'cp1252' + font_table = {u'': u''} + # Stack of things to keep track of when entering/leaving groups. + stack = [] + # Whether this group (and all inside it) are "ignorable". + ignorable = False + # Number of ASCII characters to skip after an unicode character. + ucskip = 1 + # Number of ASCII characters left to skip. + curskip = 0 + # Output buffer. + out = [] + for match in PATTERN.finditer(text): + word, arg, hex, char, brace, tchar = match.groups() + if brace: + curskip = 0 + if brace == u'{': + # Push state + stack.append((ucskip, ignorable, font)) + elif brace == u'}': + # Pop state + ucskip, ignorable, font = stack.pop() + # \x (not a letter) + elif char: + curskip = 0 + if char == u'~' and not ignorable: + out.append(u'\xA0') + elif char in u'{}\\' and not ignorable: + out.append(char) + elif char == u'-' and not ignorable: + out.append(u'\u00AD') + elif char == u'_' and not ignorable: + out.append(u'\u2011') + elif char == u'*': + ignorable = True + # \command + elif word: + curskip = 0 + if word in DESTINATIONS: + ignorable = True + elif word in SPECIAL_CHARS: + out.append(SPECIAL_CHARS[word]) + elif word == u'uc': + ucskip = int(arg) + elif word == u' ': + c = int(arg) + if c < 0: + c += 0x10000 + out.append(unichr(c)) + curskip = ucskip + elif word == u'fonttbl': + inside_font_table = True + ignorable = True + elif word == u'f': + font = arg + elif word == u'ansicpg': + font_table[font] = 'cp' + arg + elif word == u'fcharset' and font not in font_table and \ + word + arg in CHARSET_MAPPING: + # \ansicpg overrides \fcharset, if present. + font_table[font] = CHARSET_MAPPING[word + arg] + # \'xx + elif hex: + if curskip > 0: + curskip -= 1 + elif not ignorable: + charcode = int(hex, 16) + encoding, default_encoding = get_encoding(font, font_table, + default_encoding) + while True: + try: + out.append(chr(charcode).decode(encoding)) + except UnicodeDecodeError: + encoding, default_encoding = get_encoding(font, + font_table, default_encoding, failed=True) + else: + break + elif tchar: + if curskip > 0: + curskip -= 1 + elif not ignorable: + out.append(tchar) + text = u''.join(out) + return text, default_encoding - def strip_rtf(self, text, default_encoding=None): - self.default_encoding = default_encoding - # Current font is the font tag we last met. - font = u'' - # Character encoding is defined inside fonttable. - # font_table could contain eg u'0': u'cp1252' - font_table = {u'': default_encoding} - # Stack of things to keep track of when entering/leaving groups. - stack = [] - # Whether this group (and all inside it) are "ignorable". - ignorable = False - # Number of ASCII characters to skip after an unicode character. - ucskip = 1 - # Number of ASCII characters left to skip. - curskip = 0 - # Output buffer. - out = [] - for match in self.PATTERN.finditer(text): - word, arg, hex, char, brace, tchar = match.groups() - if brace: - curskip = 0 - if brace == u'{': - # Push state - stack.append((ucskip, ignorable, font)) - elif brace == u'}': - # Pop state - ucskip, ignorable, font = stack.pop() - # \x (not a letter) - elif char: - curskip = 0 - if char == u'~' and not ignorable: - out.append(u'\xA0') - elif char in u'{}\\' and not ignorable: - out.append(char) - elif char == u'-' and not ignorable: - out.append(u'\u00AD') - elif char == u'_' and not ignorable: - out.append(u'\u2011') - elif char == u'*': - ignorable = True - # \command - elif word: - curskip = 0 - if word in self.DESTINATIONS: - ignorable = True - elif word in self.SPECIAL_CHARS: - out.append(self.SPECIAL_CHARS[word]) - elif word == u'uc': - ucskip = int(arg) - elif word == u' ': - c = int(arg) - if c < 0: - c += 0x10000 - out.append(unichr(c)) - curskip = ucskip - elif word == u'fonttbl': - inside_font_table = True - ignorable = True - elif word == u'f': - font = arg - elif word == u'ansicpg': - font_table[font] = 'cp' + arg - elif word == u'fcharset' and font not in font_table and \ - word + arg in self.CHARSET_MAPPING: - # \ansicpg overrides \fcharset, if present. - font_table[font] = self.CHARSET_MAPPING[word + arg] - # \'xx - elif hex: - if curskip > 0: - curskip -= 1 - elif not ignorable: - charcode = int(hex, 16) - encoding = self.get_encoding(font, font_table) - while True: - try: - out.append(chr(charcode).decode(encoding)) - except UnicodeDecodeError: - encoding = self.get_encoding(font, font_table, - failed=True) - else: - break - elif tchar: - if curskip > 0: - curskip -= 1 - elif not ignorable: - out.append(tchar) - return u''.join(out) - - def get_encoding(self, font, font_table, failed=False): - encoding = None - if font in font_table: - encoding = font_table[font] - if not encoding and len(self.user_encoding): - encoding = self.user_encoding[-1] - if not encoding and self.default_encoding: - encoding = self.default_encoding - if not encoding or (failed and self.user_encoding == encoding): - encoding = retrieve_windows_encoding(self.default_encoding) - if encoding not in self.user_encoding: - self.user_encoding.append(encoding) - elif failed: - encoding = self.user_encoding - font_table[font] = encoding - return encoding from xml import OpenLyrics, SongXML from songstab import SongsTab diff --git a/openlp/plugins/songs/lib/ewimport.py b/openlp/plugins/songs/lib/ewimport.py index 7ecabcdd6..adc4b4976 100644 --- a/openlp/plugins/songs/lib/ewimport.py +++ b/openlp/plugins/songs/lib/ewimport.py @@ -35,7 +35,7 @@ import re from openlp.core.lib import translate from openlp.plugins.songs.lib import VerseType -from openlp.plugins.songs.lib import retrieve_windows_encoding, StripRtf +from openlp.plugins.songs.lib import retrieve_windows_encoding, strip_rtf from songimport import SongImport RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}') @@ -59,7 +59,6 @@ class EasyWorshipSongImport(SongImport): """ def __init__(self, manager, **kwargs): SongImport.__init__(self, manager, **kwargs) - self.rtf = StripRtf() def doImport(self): # Open the DB and MB files if they exist @@ -179,7 +178,7 @@ class EasyWorshipSongImport(SongImport): self.addAuthor(author_name.strip()) if words: # Format the lyrics - words = self.rtf.strip_rtf(words, self.encoding) + words, self.encoding = strip_rtf(words, self.encoding) verse_type = VerseType.Tags[VerseType.Verse] for verse in SLIDE_BREAK_REGEX.split(words): verse = verse.strip() diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index 6260026b6..81579ca3a 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -30,7 +30,7 @@ import os import re from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding -from openlp.plugins.songs.lib import StripRtf +from openlp.plugins.songs.lib import strip_rtf from openlp.plugins.songs.lib.songimport import SongImport log = logging.getLogger(__name__) @@ -62,7 +62,6 @@ class SundayPlusImport(SongImport): """ SongImport.__init__(self, manager, **kwargs) self.encoding = u'us-ascii' - self.rtf = StripRtf() def doImport(self): self.importWizard.progressBar.setMaximum(len(self.importSource)) @@ -151,7 +150,7 @@ class SundayPlusImport(SongImport): verse_type = self.HOTKEY_TO_VERSE_TYPE[value] if name == 'rtf': value = self.unescape(value) - verse = self.rtf.strip_rtf(value, self.encoding) + verse, self.encoding = strip_rtf(value, self.encoding) lines = verse.strip().split('\n') # If any line inside any verse contains CCLI or # only Public Domain, we treat this as special data: @@ -188,14 +187,7 @@ class SundayPlusImport(SongImport): try: return unicode(blob, self.encoding) except: - # This is asked again every time the previously chosen - # encoding does not work. Integrated with StripRtf encoding. - if len(self.rtf.user_encoding) and \ - self.encoding != self.rtf.user_encoding[-1]: - self.encoding = self.rtf.user_encoding[-1] - else: - self.encoding = retrieve_windows_encoding() - self.rtf.user_encoding.append(self.encoding) + self.encoding = retrieve_windows_encoding() def unescape(self, text): text = text.replace('^^', '"') From 7871dfdf001d69773e80729dac04fe7029c09073 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 4 Jul 2012 01:55:53 +0300 Subject: [PATCH 15/18] Cleanup. --- openlp/plugins/songs/lib/__init__.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index b6177c3d7..3f3d3ad7d 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -559,7 +559,6 @@ def strip_rtf(text, default_encoding=None): out.append(unichr(c)) curskip = ucskip elif word == u'fonttbl': - inside_font_table = True ignorable = True elif word == u'f': font = arg @@ -575,14 +574,14 @@ def strip_rtf(text, default_encoding=None): curskip -= 1 elif not ignorable: charcode = int(hex, 16) - encoding, default_encoding = get_encoding(font, font_table, - default_encoding) + failed = False while True: try: + encoding, default_encoding = get_encoding(font, + font_table, default_encoding, failed=failed) out.append(chr(charcode).decode(encoding)) except UnicodeDecodeError: - encoding, default_encoding = get_encoding(font, - font_table, default_encoding, failed=True) + failed = True else: break elif tchar: From 16186c39607cd1542a2e81130186361ec0a5ce72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 4 Jul 2012 08:31:27 +0300 Subject: [PATCH 16/18] Remove unnecessary import and move one more constant to module level. Thanks, Tim. --- openlp/plugins/songs/lib/sundayplusimport.py | 32 +++++++++----------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index 81579ca3a..58fa29511 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -25,7 +25,6 @@ # Temple Place, Suite 330, Boston, MA 02111-1307 USA # ############################################################################### -import logging import os import re @@ -33,7 +32,19 @@ from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding from openlp.plugins.songs.lib import strip_rtf from openlp.plugins.songs.lib.songimport import SongImport -log = logging.getLogger(__name__) +HOTKEY_TO_VERSE_TYPE = { + u'1': u'v1', + u'2': u'v2', + u'3': u'v3', + u'4': u'v4', + u'5': u'v5', + u'6': u'v6', + u'7': u'v7', + u'8': u'v8', + u'9': u'v9', + u'C': u'c', + u'+': u'b', + u'Z': u'o'} class SundayPlusImport(SongImport): """ @@ -42,19 +53,6 @@ class SundayPlusImport(SongImport): The format examples can be found attached to bug report at """ - HOTKEY_TO_VERSE_TYPE = { - u'1': u'v1', - u'2': u'v2', - u'3': u'v3', - u'4': u'v4', - u'5': u'v5', - u'6': u'v6', - u'7': u'v7', - u'8': u'v8', - u'9': u'v9', - u'C': u'c', - u'+': u'b', - u'Z': u'o'} def __init__(self, manager, **kwargs): """ @@ -146,8 +144,8 @@ class SundayPlusImport(SongImport): # Hotkey always appears after MARKER_NAME, so it # effectively overrides MARKER_NAME, if present. if len(value) and \ - value in self.HOTKEY_TO_VERSE_TYPE.keys(): - verse_type = self.HOTKEY_TO_VERSE_TYPE[value] + value in HOTKEY_TO_VERSE_TYPE.keys(): + verse_type = HOTKEY_TO_VERSE_TYPE[value] if name == 'rtf': value = self.unescape(value) verse, self.encoding = strip_rtf(value, self.encoding) From f730280ce0dd4c6424475daf977a425c38f3c2b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 4 Jul 2012 12:48:29 +0300 Subject: [PATCH 17/18] Update copyright header. --- openlp/plugins/songs/lib/sundayplusimport.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index 58fa29511..81d452f55 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -6,10 +6,11 @@ # --------------------------------------------------------------------------- # # Copyright (c) 2008-2012 Raoul Snyman # # Portions copyright (c) 2008-2012 Tim Bentley, Gerald Britton, Jonathan # -# Corwin, Michael Gorven, Scott Guerrieri, Matthias Hub, Meinert Jordan, # -# Armin Köhler, Joshua Miller, Stevan Pettit, Andreas Preikschat, Mattias # -# Põldaru, Christian Richter, Philip Ridout, Simon Scudder, Jeffrey Smith, # -# Maikel Stuivenberg, Martin Thompson, Jon Tibble, Frode Woldsund # +# Corwin, Samuel Findlay, Michael Gorven, Scott Guerrieri, Matthias Hub, # +# Meinert Jordan, Armin Köhler, Edwin Lunando, Joshua Miller, Stevan Pettit, # +# Andreas Preikschat, Mattias Põldaru, Christian Richter, Philip Ridout, # +# Simon Scudder, Jeffrey Smith, Maikel Stuivenberg, Martin Thompson, Jon # +# Tibble, Dave Warnock, Frode Woldsund # # --------------------------------------------------------------------------- # # This program is free software; you can redistribute it and/or modify it # # under the terms of the GNU General Public License as published by the Free # From 5f04399d6c66c0ca826f6c509a59906fa75585c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20P=C3=B5ldaru?= Date: Wed, 4 Jul 2012 23:08:04 +0300 Subject: [PATCH 18/18] Spacing around operator. --- openlp/plugins/songs/lib/sundayplusimport.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/openlp/plugins/songs/lib/sundayplusimport.py b/openlp/plugins/songs/lib/sundayplusimport.py index 81d452f55..fcf324d41 100644 --- a/openlp/plugins/songs/lib/sundayplusimport.py +++ b/openlp/plugins/songs/lib/sundayplusimport.py @@ -93,20 +93,20 @@ class SundayPlusImport(SongImport): while i < len(data): # Data is held as #name: value pairs inside groups marked as []. # Now we are looking for the name. - if data[i:i+1] == '#': - name_end = data.find(':', i+1) - name = data[i+1:name_end] + if data[i:i + 1] == '#': + name_end = data.find(':', i + 1) + name = data[i + 1:name_end] i = name_end + 1 - while data[i:i+1] == ' ': + while data[i:i + 1] == ' ': i += 1 - if data[i:i+1] == '"': - end = data.find('"', i+1) - value = data[i+1:end] - elif data[i:i+1] == '[': + if data[i:i + 1] == '"': + end = data.find('"', i + 1) + value = data[i + 1:end] + elif data[i:i + 1] == '[': j = i inside_quotes = False while j < len(data): - char = data[j:j+1] + char = data[j:j + 1] if char == '"': inside_quotes = not inside_quotes elif not inside_quotes and char == ']': @@ -115,7 +115,7 @@ class SundayPlusImport(SongImport): j += 1 value = data[i:end] else: - end = data.find(',', i+1) + end = data.find(',', i + 1) if data.find('(', i, end) != -1: end = data.find(')', i) + 1 value = data[i:end]