From 30943752334e76fcf3b901ca27ed31978b41b147 Mon Sep 17 00:00:00 2001 From: Jonathan Corwin Date: Tue, 3 Jul 2012 22:14:12 +0100 Subject: [PATCH] Share ew strip_rtf routine --- openlp/plugins/songs/lib/__init__.py | 96 +++++++++++++++++ openlp/plugins/songs/lib/ewimport.py | 98 +---------------- openlp/plugins/songs/lib/songproimport.py | 124 +--------------------- 3 files changed, 100 insertions(+), 218 deletions(-) diff --git a/openlp/plugins/songs/lib/__init__.py b/openlp/plugins/songs/lib/__init__.py index ce41b6faa..7f0bfd2fa 100644 --- a/openlp/plugins/songs/lib/__init__.py +++ b/openlp/plugins/songs/lib/__init__.py @@ -36,6 +36,7 @@ from ui import SongStrings WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE) +RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}') class VerseType(object): """ @@ -366,6 +367,101 @@ def clean_song(manager, song): if song.copyright: song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip() +def strip_rtf(blob, encoding): + depth = 0 + control = False + clear_text = [] + control_word = [] + + # workaround for \tx bug: remove one pair of curly braces + # if \tx is encountered + match = RTF_STRIPPING_REGEX.search(blob) + if match: + # start and end indices of match are curly braces - filter them out + blob = ''.join([blob[i] for i in xrange(len(blob)) + if i != match.start() and i !=match.end()]) + for c in blob: + if control: + # for delimiters, set control to False + if c == '{': + if control_word: + depth += 1 + control = False + elif c == '}': + if control_word: + depth -= 1 + control = False + elif c == '\\': + new_control = bool(control_word) + control = False + elif c.isspace(): + control = False + else: + control_word.append(c) + if len(control_word) == 3 and control_word[0] == '\'': + control = False + if not control: + if not control_word: + if c == '{' or c == '}' or c == '\\': + clear_text.append(c) + else: + control_str = ''.join(control_word) + if control_str == 'par' or control_str == 'line': + clear_text.append(u'\n') + elif control_str == 'tab': + clear_text.append(u'\t') + # Prefer the encoding specified by the RTF data to that + # specified by the Paradox table header + # West European encoding + elif control_str == 'fcharset0': + encoding = u'cp1252' + # Greek encoding + elif control_str == 'fcharset161': + encoding = u'cp1253' + # Turkish encoding + elif control_str == 'fcharset162': + encoding = u'cp1254' + # Vietnamese encoding + elif control_str == 'fcharset163': + encoding = u'cp1258' + # Hebrew encoding + elif control_str == 'fcharset177': + encoding = u'cp1255' + # Arabic encoding + elif control_str == 'fcharset178': + encoding = u'cp1256' + # Baltic encoding + elif control_str == 'fcharset186': + encoding = u'cp1257' + # Cyrillic encoding + elif control_str == 'fcharset204': + encoding = u'cp1251' + # Thai encoding + elif control_str == 'fcharset222': + encoding = u'cp874' + # Central+East European encoding + elif control_str == 'fcharset238': + encoding = u'cp1250' + elif control_str[0] == '\'': + s = chr(int(control_str[1:3], 16)) + clear_text.append(s.decode(encoding)) + del control_word[:] + if c == '\\' and new_control: + control = True + elif c == '{': + depth += 1 + elif c == '}': + depth -= 1 + elif depth > 2: + continue + elif c == '\n' or c == '\r': + continue + elif c == '\\': + control = True + else: + clear_text.append(c) + return u''.join(clear_text) + from xml import OpenLyrics, SongXML from songstab import SongsTab from mediaitem import SongMediaItem diff --git a/openlp/plugins/songs/lib/ewimport.py b/openlp/plugins/songs/lib/ewimport.py index 227b8e4b6..020c489bb 100644 --- a/openlp/plugins/songs/lib/ewimport.py +++ b/openlp/plugins/songs/lib/ewimport.py @@ -36,110 +36,14 @@ import re from openlp.core.lib import translate from openlp.plugins.songs.lib import VerseType -from openlp.plugins.songs.lib import retrieve_windows_encoding +from openlp.plugins.songs.lib import retrieve_windows_encoding, strip_rtf from songimport import SongImport -RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}') # regex: at least two newlines, can have spaces between them SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*') NUMBER_REGEX = re.compile(r'[0-9]+') NOTE_REGEX = re.compile(r'\(.*?\)') -def strip_rtf(blob, encoding): - depth = 0 - control = False - clear_text = [] - control_word = [] - - # workaround for \tx bug: remove one pair of curly braces - # if \tx is encountered - match = RTF_STRIPPING_REGEX.search(blob) - if match: - # start and end indices of match are curly braces - filter them out - blob = ''.join([blob[i] for i in xrange(len(blob)) - if i != match.start() and i !=match.end()]) - - for c in blob: - if control: - # for delimiters, set control to False - if c == '{': - if control_word: - depth += 1 - control = False - elif c == '}': - if control_word: - depth -= 1 - control = False - elif c == '\\': - new_control = bool(control_word) - control = False - elif c.isspace(): - control = False - else: - control_word.append(c) - if len(control_word) == 3 and control_word[0] == '\'': - control = False - if not control: - if not control_word: - if c == '{' or c == '}' or c == '\\': - clear_text.append(c) - else: - control_str = ''.join(control_word) - if control_str == 'par' or control_str == 'line': - clear_text.append(u'\n') - elif control_str == 'tab': - clear_text.append(u'\t') - # Prefer the encoding specified by the RTF data to that - # specified by the Paradox table header - # West European encoding - elif control_str == 'fcharset0': - encoding = u'cp1252' - # Greek encoding - elif control_str == 'fcharset161': - encoding = u'cp1253' - # Turkish encoding - elif control_str == 'fcharset162': - encoding = u'cp1254' - # Vietnamese encoding - elif control_str == 'fcharset163': - encoding = u'cp1258' - # Hebrew encoding - elif control_str == 'fcharset177': - encoding = u'cp1255' - # Arabic encoding - elif control_str == 'fcharset178': - encoding = u'cp1256' - # Baltic encoding - elif control_str == 'fcharset186': - encoding = u'cp1257' - # Cyrillic encoding - elif control_str == 'fcharset204': - encoding = u'cp1251' - # Thai encoding - elif control_str == 'fcharset222': - encoding = u'cp874' - # Central+East European encoding - elif control_str == 'fcharset238': - encoding = u'cp1250' - elif control_str[0] == '\'': - s = chr(int(control_str[1:3], 16)) - clear_text.append(s.decode(encoding)) - del control_word[:] - if c == '\\' and new_control: - control = True - elif c == '{': - depth += 1 - elif c == '}': - depth -= 1 - elif depth > 2: - continue - elif c == '\n' or c == '\r': - continue - elif c == '\\': - control = True - else: - clear_text.append(c) - return u''.join(clear_text) class FieldDescEntry: def __init__(self, name, type, size): diff --git a/openlp/plugins/songs/lib/songproimport.py b/openlp/plugins/songs/lib/songproimport.py index a22f18d14..ddc5a79b5 100644 --- a/openlp/plugins/songs/lib/songproimport.py +++ b/openlp/plugins/songs/lib/songproimport.py @@ -33,6 +33,7 @@ import os import logging from openlp.core.lib import translate +from openlp.plugins.songs.lib import strip_rtf from openlp.plugins.songs.lib.songimport import SongImport log = logging.getLogger(__name__) @@ -110,7 +111,7 @@ class SongProImport(SongImport): self.finish() return if u'rtf1' in text: - text = striprtf(text).rstrip() + text = strip_rtf(text, u'cp1252').rstrip() if not text: return if tag == u'A': @@ -136,128 +137,9 @@ class SongProImport(SongImport): self.verseOrderList.append(u'B1') elif char == u'D': self.verseOrderList.append(u'E1') - elif u'1' <= char <= '7': + elif u'1' <= char <= u'7': self.verseOrderList.append(u'V' + char) elif tag == u'R': self.addCopyright(text) elif u'1' <= tag <= u'7': self.addVerse(text, u'V' + tag[1:]) - -# replace with mahfiaz's shared one when his import is merged -def striprtf(text): - pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I) - # control words which specify a "destionation". - destinations = frozenset(( - 'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid', - 'atnparent','atnref','atntime','atrfend','atrfstart','author','background', - 'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping', - 'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap', - 'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt', - 'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl', - 'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype', - 'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr', - 'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl', - 'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc', - 'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers', - 'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride', - 'listoverridetable','listpicture','liststylename','listtable','listtext', - 'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr', - 'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr', - 'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me', - 'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr', - 'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag', - 'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname', - 'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr', - 'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject', - 'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname', - 'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl', - 'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr', - 'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu', - 'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr', - 'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup', - 'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide', - 'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol', - 'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables', - 'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops', - 'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password', - 'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta', - 'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe', - 'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst', - 'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv', - 'svb','tc','template','themedata','title','txe','ud','upr','userprops', - 'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform', - 'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl', - 'xmlopen', - )) - # Translation of some special characters. - specialchars = { - 'par': '\n', - 'sect': '\n\n', - 'page': '\n\n', - 'line': '\n', - 'tab': '\t', - 'emdash': u'\u2014', - 'endash': u'\u2013', - 'emspace': u'\u2003', - 'enspace': u'\u2002', - 'qmspace': u'\u2005', - 'bullet': u'\u2022', - 'lquote': u'\u2018', - 'rquote': u'\u2019', - 'ldblquote': u'\201C', - 'rdblquote': u'\u201D', - } - stack = [] - ignorable = False # Whether this group (and all inside it) are "ignorable". - ucskip = 1 # Number of ASCII characters to skip after a unicode character. - curskip = 0 # Number of ASCII characters left to skip - out = [] # Output buffer. - for match in pattern.finditer(text): - word,arg,hex,char,brace,tchar = match.groups() - if brace: - curskip = 0 - if brace == '{': - # Push state - stack.append((ucskip,ignorable)) - elif brace == '}': - # Pop state - ucskip,ignorable = stack.pop() - elif char: # \x (not a letter) - curskip = 0 - if char == '~': - if not ignorable: - out.append(u'\xA0') - elif char in '{}\\': - if not ignorable: - out.append(char) - elif char == '*': - ignorable = True - elif word: # \foo - curskip = 0 - if word in destinations: - ignorable = True - elif ignorable: - pass - elif word in specialchars: - out.append(specialchars[word]) - elif word == 'uc': - ucskip = int(arg) - elif word == 'u': - c = int(arg) - if c < 0: c += 0x10000 - if c > 127: out.append(unichr(c)) - else: out.append(chr(c)) - curskip = ucskip - elif hex: # \'xx - if curskip > 0: - curskip -= 1 - elif not ignorable: - c = int(hex,16) - if c > 127: out.append(unichr(c)) - else: out.append(chr(c)) - elif tchar: - if curskip > 0: - curskip -= 1 - elif not ignorable: - out.append(tchar) - return ''.join(out)