forked from openlp/openlp
Share ew strip_rtf routine
This commit is contained in:
parent
f70b9d3547
commit
3094375233
@ -36,6 +36,7 @@ from ui import SongStrings
|
|||||||
|
|
||||||
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
|
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
|
||||||
APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE)
|
APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE)
|
||||||
|
RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')
|
||||||
|
|
||||||
class VerseType(object):
|
class VerseType(object):
|
||||||
"""
|
"""
|
||||||
@ -366,6 +367,101 @@ def clean_song(manager, song):
|
|||||||
if song.copyright:
|
if song.copyright:
|
||||||
song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
|
song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
|
||||||
|
|
||||||
|
def strip_rtf(blob, encoding):
|
||||||
|
depth = 0
|
||||||
|
control = False
|
||||||
|
clear_text = []
|
||||||
|
control_word = []
|
||||||
|
|
||||||
|
# workaround for \tx bug: remove one pair of curly braces
|
||||||
|
# if \tx is encountered
|
||||||
|
match = RTF_STRIPPING_REGEX.search(blob)
|
||||||
|
if match:
|
||||||
|
# start and end indices of match are curly braces - filter them out
|
||||||
|
blob = ''.join([blob[i] for i in xrange(len(blob))
|
||||||
|
if i != match.start() and i !=match.end()])
|
||||||
|
for c in blob:
|
||||||
|
if control:
|
||||||
|
# for delimiters, set control to False
|
||||||
|
if c == '{':
|
||||||
|
if control_word:
|
||||||
|
depth += 1
|
||||||
|
control = False
|
||||||
|
elif c == '}':
|
||||||
|
if control_word:
|
||||||
|
depth -= 1
|
||||||
|
control = False
|
||||||
|
elif c == '\\':
|
||||||
|
new_control = bool(control_word)
|
||||||
|
control = False
|
||||||
|
elif c.isspace():
|
||||||
|
control = False
|
||||||
|
else:
|
||||||
|
control_word.append(c)
|
||||||
|
if len(control_word) == 3 and control_word[0] == '\'':
|
||||||
|
control = False
|
||||||
|
if not control:
|
||||||
|
if not control_word:
|
||||||
|
if c == '{' or c == '}' or c == '\\':
|
||||||
|
clear_text.append(c)
|
||||||
|
else:
|
||||||
|
control_str = ''.join(control_word)
|
||||||
|
if control_str == 'par' or control_str == 'line':
|
||||||
|
clear_text.append(u'\n')
|
||||||
|
elif control_str == 'tab':
|
||||||
|
clear_text.append(u'\t')
|
||||||
|
# Prefer the encoding specified by the RTF data to that
|
||||||
|
# specified by the Paradox table header
|
||||||
|
# West European encoding
|
||||||
|
elif control_str == 'fcharset0':
|
||||||
|
encoding = u'cp1252'
|
||||||
|
# Greek encoding
|
||||||
|
elif control_str == 'fcharset161':
|
||||||
|
encoding = u'cp1253'
|
||||||
|
# Turkish encoding
|
||||||
|
elif control_str == 'fcharset162':
|
||||||
|
encoding = u'cp1254'
|
||||||
|
# Vietnamese encoding
|
||||||
|
elif control_str == 'fcharset163':
|
||||||
|
encoding = u'cp1258'
|
||||||
|
# Hebrew encoding
|
||||||
|
elif control_str == 'fcharset177':
|
||||||
|
encoding = u'cp1255'
|
||||||
|
# Arabic encoding
|
||||||
|
elif control_str == 'fcharset178':
|
||||||
|
encoding = u'cp1256'
|
||||||
|
# Baltic encoding
|
||||||
|
elif control_str == 'fcharset186':
|
||||||
|
encoding = u'cp1257'
|
||||||
|
# Cyrillic encoding
|
||||||
|
elif control_str == 'fcharset204':
|
||||||
|
encoding = u'cp1251'
|
||||||
|
# Thai encoding
|
||||||
|
elif control_str == 'fcharset222':
|
||||||
|
encoding = u'cp874'
|
||||||
|
# Central+East European encoding
|
||||||
|
elif control_str == 'fcharset238':
|
||||||
|
encoding = u'cp1250'
|
||||||
|
elif control_str[0] == '\'':
|
||||||
|
s = chr(int(control_str[1:3], 16))
|
||||||
|
clear_text.append(s.decode(encoding))
|
||||||
|
del control_word[:]
|
||||||
|
if c == '\\' and new_control:
|
||||||
|
control = True
|
||||||
|
elif c == '{':
|
||||||
|
depth += 1
|
||||||
|
elif c == '}':
|
||||||
|
depth -= 1
|
||||||
|
elif depth > 2:
|
||||||
|
continue
|
||||||
|
elif c == '\n' or c == '\r':
|
||||||
|
continue
|
||||||
|
elif c == '\\':
|
||||||
|
control = True
|
||||||
|
else:
|
||||||
|
clear_text.append(c)
|
||||||
|
return u''.join(clear_text)
|
||||||
|
|
||||||
from xml import OpenLyrics, SongXML
|
from xml import OpenLyrics, SongXML
|
||||||
from songstab import SongsTab
|
from songstab import SongsTab
|
||||||
from mediaitem import SongMediaItem
|
from mediaitem import SongMediaItem
|
||||||
|
@ -36,110 +36,14 @@ import re
|
|||||||
|
|
||||||
from openlp.core.lib import translate
|
from openlp.core.lib import translate
|
||||||
from openlp.plugins.songs.lib import VerseType
|
from openlp.plugins.songs.lib import VerseType
|
||||||
from openlp.plugins.songs.lib import retrieve_windows_encoding
|
from openlp.plugins.songs.lib import retrieve_windows_encoding, strip_rtf
|
||||||
from songimport import SongImport
|
from songimport import SongImport
|
||||||
|
|
||||||
RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')
|
|
||||||
# regex: at least two newlines, can have spaces between them
|
# regex: at least two newlines, can have spaces between them
|
||||||
SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*')
|
SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*')
|
||||||
NUMBER_REGEX = re.compile(r'[0-9]+')
|
NUMBER_REGEX = re.compile(r'[0-9]+')
|
||||||
NOTE_REGEX = re.compile(r'\(.*?\)')
|
NOTE_REGEX = re.compile(r'\(.*?\)')
|
||||||
|
|
||||||
def strip_rtf(blob, encoding):
|
|
||||||
depth = 0
|
|
||||||
control = False
|
|
||||||
clear_text = []
|
|
||||||
control_word = []
|
|
||||||
|
|
||||||
# workaround for \tx bug: remove one pair of curly braces
|
|
||||||
# if \tx is encountered
|
|
||||||
match = RTF_STRIPPING_REGEX.search(blob)
|
|
||||||
if match:
|
|
||||||
# start and end indices of match are curly braces - filter them out
|
|
||||||
blob = ''.join([blob[i] for i in xrange(len(blob))
|
|
||||||
if i != match.start() and i !=match.end()])
|
|
||||||
|
|
||||||
for c in blob:
|
|
||||||
if control:
|
|
||||||
# for delimiters, set control to False
|
|
||||||
if c == '{':
|
|
||||||
if control_word:
|
|
||||||
depth += 1
|
|
||||||
control = False
|
|
||||||
elif c == '}':
|
|
||||||
if control_word:
|
|
||||||
depth -= 1
|
|
||||||
control = False
|
|
||||||
elif c == '\\':
|
|
||||||
new_control = bool(control_word)
|
|
||||||
control = False
|
|
||||||
elif c.isspace():
|
|
||||||
control = False
|
|
||||||
else:
|
|
||||||
control_word.append(c)
|
|
||||||
if len(control_word) == 3 and control_word[0] == '\'':
|
|
||||||
control = False
|
|
||||||
if not control:
|
|
||||||
if not control_word:
|
|
||||||
if c == '{' or c == '}' or c == '\\':
|
|
||||||
clear_text.append(c)
|
|
||||||
else:
|
|
||||||
control_str = ''.join(control_word)
|
|
||||||
if control_str == 'par' or control_str == 'line':
|
|
||||||
clear_text.append(u'\n')
|
|
||||||
elif control_str == 'tab':
|
|
||||||
clear_text.append(u'\t')
|
|
||||||
# Prefer the encoding specified by the RTF data to that
|
|
||||||
# specified by the Paradox table header
|
|
||||||
# West European encoding
|
|
||||||
elif control_str == 'fcharset0':
|
|
||||||
encoding = u'cp1252'
|
|
||||||
# Greek encoding
|
|
||||||
elif control_str == 'fcharset161':
|
|
||||||
encoding = u'cp1253'
|
|
||||||
# Turkish encoding
|
|
||||||
elif control_str == 'fcharset162':
|
|
||||||
encoding = u'cp1254'
|
|
||||||
# Vietnamese encoding
|
|
||||||
elif control_str == 'fcharset163':
|
|
||||||
encoding = u'cp1258'
|
|
||||||
# Hebrew encoding
|
|
||||||
elif control_str == 'fcharset177':
|
|
||||||
encoding = u'cp1255'
|
|
||||||
# Arabic encoding
|
|
||||||
elif control_str == 'fcharset178':
|
|
||||||
encoding = u'cp1256'
|
|
||||||
# Baltic encoding
|
|
||||||
elif control_str == 'fcharset186':
|
|
||||||
encoding = u'cp1257'
|
|
||||||
# Cyrillic encoding
|
|
||||||
elif control_str == 'fcharset204':
|
|
||||||
encoding = u'cp1251'
|
|
||||||
# Thai encoding
|
|
||||||
elif control_str == 'fcharset222':
|
|
||||||
encoding = u'cp874'
|
|
||||||
# Central+East European encoding
|
|
||||||
elif control_str == 'fcharset238':
|
|
||||||
encoding = u'cp1250'
|
|
||||||
elif control_str[0] == '\'':
|
|
||||||
s = chr(int(control_str[1:3], 16))
|
|
||||||
clear_text.append(s.decode(encoding))
|
|
||||||
del control_word[:]
|
|
||||||
if c == '\\' and new_control:
|
|
||||||
control = True
|
|
||||||
elif c == '{':
|
|
||||||
depth += 1
|
|
||||||
elif c == '}':
|
|
||||||
depth -= 1
|
|
||||||
elif depth > 2:
|
|
||||||
continue
|
|
||||||
elif c == '\n' or c == '\r':
|
|
||||||
continue
|
|
||||||
elif c == '\\':
|
|
||||||
control = True
|
|
||||||
else:
|
|
||||||
clear_text.append(c)
|
|
||||||
return u''.join(clear_text)
|
|
||||||
|
|
||||||
class FieldDescEntry:
|
class FieldDescEntry:
|
||||||
def __init__(self, name, type, size):
|
def __init__(self, name, type, size):
|
||||||
|
@ -33,6 +33,7 @@ import os
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
from openlp.core.lib import translate
|
from openlp.core.lib import translate
|
||||||
|
from openlp.plugins.songs.lib import strip_rtf
|
||||||
from openlp.plugins.songs.lib.songimport import SongImport
|
from openlp.plugins.songs.lib.songimport import SongImport
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
@ -110,7 +111,7 @@ class SongProImport(SongImport):
|
|||||||
self.finish()
|
self.finish()
|
||||||
return
|
return
|
||||||
if u'rtf1' in text:
|
if u'rtf1' in text:
|
||||||
text = striprtf(text).rstrip()
|
text = strip_rtf(text, u'cp1252').rstrip()
|
||||||
if not text:
|
if not text:
|
||||||
return
|
return
|
||||||
if tag == u'A':
|
if tag == u'A':
|
||||||
@ -136,128 +137,9 @@ class SongProImport(SongImport):
|
|||||||
self.verseOrderList.append(u'B1')
|
self.verseOrderList.append(u'B1')
|
||||||
elif char == u'D':
|
elif char == u'D':
|
||||||
self.verseOrderList.append(u'E1')
|
self.verseOrderList.append(u'E1')
|
||||||
elif u'1' <= char <= '7':
|
elif u'1' <= char <= u'7':
|
||||||
self.verseOrderList.append(u'V' + char)
|
self.verseOrderList.append(u'V' + char)
|
||||||
elif tag == u'R':
|
elif tag == u'R':
|
||||||
self.addCopyright(text)
|
self.addCopyright(text)
|
||||||
elif u'1' <= tag <= u'7':
|
elif u'1' <= tag <= u'7':
|
||||||
self.addVerse(text, u'V' + tag[1:])
|
self.addVerse(text, u'V' + tag[1:])
|
||||||
|
|
||||||
# replace with mahfiaz's shared one when his import is merged
|
|
||||||
def striprtf(text):
|
|
||||||
pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
|
|
||||||
# control words which specify a "destination".
|
|
||||||
destinations = frozenset((
|
|
||||||
'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
|
|
||||||
'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
|
|
||||||
'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
|
|
||||||
'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
|
|
||||||
'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
|
|
||||||
'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
|
|
||||||
'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
|
|
||||||
'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
|
|
||||||
'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
|
|
||||||
'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
|
|
||||||
'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
|
|
||||||
'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
|
|
||||||
'listoverridetable','listpicture','liststylename','listtable','listtext',
|
|
||||||
'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
|
|
||||||
'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
|
|
||||||
'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
|
|
||||||
'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
|
|
||||||
'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
|
|
||||||
'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
|
|
||||||
'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
|
|
||||||
'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
|
|
||||||
'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
|
|
||||||
'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
|
|
||||||
'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
|
|
||||||
'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
|
|
||||||
'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
|
|
||||||
'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
|
|
||||||
'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
|
|
||||||
'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
|
|
||||||
'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
|
|
||||||
'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
|
|
||||||
'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
|
|
||||||
'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
|
|
||||||
'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
|
|
||||||
'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
|
|
||||||
'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
|
|
||||||
'svb','tc','template','themedata','title','txe','ud','upr','userprops',
|
|
||||||
'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
|
|
||||||
'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
|
|
||||||
'xmlopen',
|
|
||||||
))
|
|
||||||
# Translation of some special characters.
|
|
||||||
specialchars = {
|
|
||||||
'par': '\n',
|
|
||||||
'sect': '\n\n',
|
|
||||||
'page': '\n\n',
|
|
||||||
'line': '\n',
|
|
||||||
'tab': '\t',
|
|
||||||
'emdash': u'\u2014',
|
|
||||||
'endash': u'\u2013',
|
|
||||||
'emspace': u'\u2003',
|
|
||||||
'enspace': u'\u2002',
|
|
||||||
'qmspace': u'\u2005',
|
|
||||||
'bullet': u'\u2022',
|
|
||||||
'lquote': u'\u2018',
|
|
||||||
'rquote': u'\u2019',
|
|
||||||
'ldblquote': u'\201C',
|
|
||||||
'rdblquote': u'\u201D',
|
|
||||||
}
|
|
||||||
stack = []
|
|
||||||
ignorable = False # Whether this group (and all inside it) are "ignorable".
|
|
||||||
ucskip = 1 # Number of ASCII characters to skip after a unicode character.
|
|
||||||
curskip = 0 # Number of ASCII characters left to skip
|
|
||||||
out = [] # Output buffer.
|
|
||||||
for match in pattern.finditer(text):
|
|
||||||
word,arg,hex,char,brace,tchar = match.groups()
|
|
||||||
if brace:
|
|
||||||
curskip = 0
|
|
||||||
if brace == '{':
|
|
||||||
# Push state
|
|
||||||
stack.append((ucskip,ignorable))
|
|
||||||
elif brace == '}':
|
|
||||||
# Pop state
|
|
||||||
ucskip,ignorable = stack.pop()
|
|
||||||
elif char: # \x (not a letter)
|
|
||||||
curskip = 0
|
|
||||||
if char == '~':
|
|
||||||
if not ignorable:
|
|
||||||
out.append(u'\xA0')
|
|
||||||
elif char in '{}\\':
|
|
||||||
if not ignorable:
|
|
||||||
out.append(char)
|
|
||||||
elif char == '*':
|
|
||||||
ignorable = True
|
|
||||||
elif word: # \foo
|
|
||||||
curskip = 0
|
|
||||||
if word in destinations:
|
|
||||||
ignorable = True
|
|
||||||
elif ignorable:
|
|
||||||
pass
|
|
||||||
elif word in specialchars:
|
|
||||||
out.append(specialchars[word])
|
|
||||||
elif word == 'uc':
|
|
||||||
ucskip = int(arg)
|
|
||||||
elif word == 'u':
|
|
||||||
c = int(arg)
|
|
||||||
if c < 0: c += 0x10000
|
|
||||||
if c > 127: out.append(unichr(c))
|
|
||||||
else: out.append(chr(c))
|
|
||||||
curskip = ucskip
|
|
||||||
elif hex: # \'xx
|
|
||||||
if curskip > 0:
|
|
||||||
curskip -= 1
|
|
||||||
elif not ignorable:
|
|
||||||
c = int(hex,16)
|
|
||||||
if c > 127: out.append(unichr(c))
|
|
||||||
else: out.append(chr(c))
|
|
||||||
elif tchar:
|
|
||||||
if curskip > 0:
|
|
||||||
curskip -= 1
|
|
||||||
elif not ignorable:
|
|
||||||
out.append(tchar)
|
|
||||||
return ''.join(out)
|
|
||||||
|
Loading…
Reference in New Issue
Block a user