Share ew strip_rtf routine

2012-07-03 22:14:12 +01:00 · 2012-07-03 22:14:12 +01:00 · 3094375233
commit 3094375233
parent f70b9d3547
3 changed files with 100 additions and 218 deletions
--- a/openlp/plugins/songs/lib/__init__.py
+++ b/openlp/plugins/songs/lib/__init__.py
@@ -36,6 +36,7 @@ from ui import SongStrings

 WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
 APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE)
+RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')

 class VerseType(object):
    """
@@ -366,6 +367,101 @@ def clean_song(manager, song):
    if song.copyright:
        song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()

+def strip_rtf(blob, encoding):
+    depth = 0
+    control = False
+    clear_text = []
+    control_word = []
+
+    # workaround for \tx bug: remove one pair of curly braces
+    # if \tx is encountered
+    match = RTF_STRIPPING_REGEX.search(blob)
+    if match:
+        # start and end indices of match are curly braces - filter them out
+        blob = ''.join([blob[i] for i in xrange(len(blob))
+                        if i != match.start() and i !=match.end()])
+    for c in blob:
+        if control:
+            # for delimiters, set control to False
+            if c == '{':
+                if control_word:
+                    depth += 1
+                control = False
+            elif c == '}':
+                if control_word:
+                    depth -= 1
+                control = False
+            elif c == '\\':
+                new_control = bool(control_word)
+                control = False
+            elif c.isspace():
+                control = False
+            else:
+                control_word.append(c)
+                if len(control_word) == 3 and control_word[0] == '\'':
+                    control = False
+            if not control:
+                if not control_word:
+                    if c == '{' or c == '}' or c == '\\':
+                        clear_text.append(c)
+                else:
+                    control_str = ''.join(control_word)
+                    if control_str == 'par' or control_str == 'line':
+                        clear_text.append(u'\n')
+                    elif control_str == 'tab':
+                        clear_text.append(u'\t')
+                    # Prefer the encoding specified by the RTF data to that
+                    # specified by the Paradox table header
+                    # West European encoding
+                    elif control_str == 'fcharset0':
+                        encoding = u'cp1252'
+                    # Greek encoding
+                    elif control_str == 'fcharset161':
+                        encoding = u'cp1253'
+                    # Turkish encoding
+                    elif control_str == 'fcharset162':
+                        encoding = u'cp1254'
+                    # Vietnamese encoding
+                    elif control_str == 'fcharset163':
+                        encoding = u'cp1258'
+                    # Hebrew encoding
+                    elif control_str == 'fcharset177':
+                        encoding = u'cp1255'
+                    # Arabic encoding
+                    elif control_str == 'fcharset178':
+                        encoding = u'cp1256'
+                    # Baltic encoding
+                    elif control_str == 'fcharset186':
+                        encoding = u'cp1257'
+                    # Cyrillic encoding
+                    elif control_str == 'fcharset204':
+                        encoding = u'cp1251'
+                    # Thai encoding
+                    elif control_str == 'fcharset222':
+                        encoding = u'cp874'
+                    # Central+East European encoding
+                    elif control_str == 'fcharset238':
+                        encoding = u'cp1250'
+                    elif control_str[0] == '\'':
+                        s = chr(int(control_str[1:3], 16))
+                        clear_text.append(s.decode(encoding))
+                    del control_word[:]
+            if c == '\\' and new_control:
+                control = True
+        elif c == '{':
+            depth += 1
+        elif c == '}':
+            depth -= 1
+        elif depth > 2:
+            continue
+        elif c == '\n' or c == '\r':
+            continue
+        elif c == '\\':
+            control = True
+        else:
+            clear_text.append(c)
+    return u''.join(clear_text)
+
 from xml import OpenLyrics, SongXML
 from songstab import SongsTab
 from mediaitem import SongMediaItem
--- a/openlp/plugins/songs/lib/ewimport.py
+++ b/openlp/plugins/songs/lib/ewimport.py
@@ -36,110 +36,14 @@ import re

 from openlp.core.lib import translate
 from openlp.plugins.songs.lib import VerseType
-from openlp.plugins.songs.lib import retrieve_windows_encoding
+from openlp.plugins.songs.lib import retrieve_windows_encoding, strip_rtf
 from songimport import SongImport

-RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')
 # regex: at least two newlines, can have spaces between them
 SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*')
 NUMBER_REGEX = re.compile(r'[0-9]+')
 NOTE_REGEX = re.compile(r'\(.*?\)')

-def strip_rtf(blob, encoding):
-    depth = 0
-    control = False
-    clear_text = []
-    control_word = []
-
-    # workaround for \tx bug: remove one pair of curly braces
-    # if \tx is encountered
-    match = RTF_STRIPPING_REGEX.search(blob)
-    if match:
-        # start and end indices of match are curly braces - filter them out
-        blob = ''.join([blob[i] for i in xrange(len(blob))
-            if i != match.start() and i !=match.end()])
-
-    for c in blob:
-        if control:
-            # for delimiters, set control to False
-            if c == '{':
-                if control_word:
-                    depth += 1
-                control = False
-            elif c == '}':
-                if control_word:
-                    depth -= 1
-                control = False
-            elif c == '\\':
-                new_control = bool(control_word)
-                control = False
-            elif c.isspace():
-                control = False
-            else:
-                control_word.append(c)
-                if len(control_word) == 3 and control_word[0] == '\'':
-                    control = False
-            if not control:
-                if not control_word:
-                    if c == '{' or c == '}' or c == '\\':
-                        clear_text.append(c)
-                else:
-                    control_str = ''.join(control_word)
-                    if control_str == 'par' or control_str == 'line':
-                        clear_text.append(u'\n')
-                    elif control_str == 'tab':
-                        clear_text.append(u'\t')
-                    # Prefer the encoding specified by the RTF data to that
-                    # specified by the Paradox table header
-                    # West European encoding
-                    elif control_str == 'fcharset0':
-                        encoding = u'cp1252'
-                    # Greek encoding
-                    elif control_str == 'fcharset161':
-                        encoding = u'cp1253'
-                    # Turkish encoding
-                    elif control_str == 'fcharset162':
-                        encoding = u'cp1254'
-                    # Vietnamese encoding
-                    elif control_str == 'fcharset163':
-                        encoding = u'cp1258'
-                    # Hebrew encoding
-                    elif control_str == 'fcharset177':
-                        encoding = u'cp1255'
-                    # Arabic encoding
-                    elif control_str == 'fcharset178':
-                        encoding = u'cp1256'
-                    # Baltic encoding
-                    elif control_str == 'fcharset186':
-                        encoding = u'cp1257'
-                    # Cyrillic encoding
-                    elif control_str == 'fcharset204':
-                        encoding = u'cp1251'
-                    # Thai encoding
-                    elif control_str == 'fcharset222':
-                        encoding = u'cp874'
-                    # Central+East European encoding
-                    elif control_str == 'fcharset238':
-                        encoding = u'cp1250'
-                    elif control_str[0] == '\'':
-                        s = chr(int(control_str[1:3], 16))
-                        clear_text.append(s.decode(encoding))
-                    del control_word[:]
-            if c == '\\' and new_control:
-                control = True
-        elif c == '{':
-            depth += 1
-        elif c == '}':
-            depth -= 1
-        elif depth > 2:
-            continue
-        elif c == '\n' or c == '\r':
-            continue
-        elif c == '\\':
-            control = True
-        else:
-            clear_text.append(c)
-    return u''.join(clear_text)

 class FieldDescEntry:
    def __init__(self, name, type, size):
--- a/openlp/plugins/songs/lib/songproimport.py
+++ b/openlp/plugins/songs/lib/songproimport.py
@@ -33,6 +33,7 @@ import os
 import logging

 from openlp.core.lib import translate
+from openlp.plugins.songs.lib import strip_rtf
 from openlp.plugins.songs.lib.songimport import SongImport

 log = logging.getLogger(__name__)
@@ -110,7 +111,7 @@ class SongProImport(SongImport):
            self.finish()
            return
        if u'rtf1' in text:
-            text = striprtf(text).rstrip()
+            text = strip_rtf(text, u'cp1252').rstrip()
        if not text:
            return
        if tag == u'A':
@@ -136,128 +137,9 @@ class SongProImport(SongImport):
                    self.verseOrderList.append(u'B1')
                elif char == u'D':
                    self.verseOrderList.append(u'E1')
-                elif u'1' <= char <= '7':
+                elif u'1' <= char <= u'7':
                    self.verseOrderList.append(u'V' + char)
        elif tag == u'R':
            self.addCopyright(text)
        elif u'1' <= tag <= u'7':
            self.addVerse(text, u'V' + tag[1:])
-
-# replace with mahfiaz's shared one when his import is merged
-def striprtf(text):
-   pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
-   # control words which specify a "destionation".
-   destinations = frozenset((
-      'aftncn','aftnsep','aftnsepc','annotation','atnauthor','atndate','atnicn','atnid',
-      'atnparent','atnref','atntime','atrfend','atrfstart','author','background',
-      'bkmkend','bkmkstart','blipuid','buptim','category','colorschememapping',
-      'colortbl','comment','company','creatim','datafield','datastore','defchp','defpap',
-      'do','doccomm','docvar','dptxbxtext','ebcend','ebcstart','factoidname','falt',
-      'fchars','ffdeftext','ffentrymcr','ffexitmcr','ffformat','ffhelptext','ffl',
-      'ffname','ffstattext','field','file','filetbl','fldinst','fldrslt','fldtype',
-      'fname','fontemb','fontfile','fonttbl','footer','footerf','footerl','footerr',
-      'footnote','formfield','ftncn','ftnsep','ftnsepc','g','generator','gridtbl',
-      'header','headerf','headerl','headerr','hl','hlfr','hlinkbase','hlloc','hlsrc',
-      'hsv','htmltag','info','keycode','keywords','latentstyles','lchars','levelnumbers',
-      'leveltext','lfolevel','linkval','list','listlevel','listname','listoverride',
-      'listoverridetable','listpicture','liststylename','listtable','listtext',
-      'lsdlockedexcept','macc','maccPr','mailmerge','maln','malnScr','manager','margPr',
-      'mbar','mbarPr','mbaseJc','mbegChr','mborderBox','mborderBoxPr','mbox','mboxPr',
-      'mchr','mcount','mctrlPr','md','mdeg','mdegHide','mden','mdiff','mdPr','me',
-      'mendChr','meqArr','meqArrPr','mf','mfName','mfPr','mfunc','mfuncPr','mgroupChr',
-      'mgroupChrPr','mgrow','mhideBot','mhideLeft','mhideRight','mhideTop','mhtmltag',
-      'mlim','mlimloc','mlimlow','mlimlowPr','mlimupp','mlimuppPr','mm','mmaddfieldname',
-      'mmath','mmathPict','mmathPr','mmaxdist','mmc','mmcJc','mmconnectstr',
-      'mmconnectstrdata','mmcPr','mmcs','mmdatasource','mmheadersource','mmmailsubject',
-      'mmodso','mmodsofilter','mmodsofldmpdata','mmodsomappedname','mmodsoname',
-      'mmodsorecipdata','mmodsosort','mmodsosrc','mmodsotable','mmodsoudl',
-      'mmodsoudldata','mmodsouniquetag','mmPr','mmquery','mmr','mnary','mnaryPr',
-      'mnoBreak','mnum','mobjDist','moMath','moMathPara','moMathParaPr','mopEmu',
-      'mphant','mphantPr','mplcHide','mpos','mr','mrad','mradPr','mrPr','msepChr',
-      'mshow','mshp','msPre','msPrePr','msSub','msSubPr','msSubSup','msSubSupPr','msSup',
-      'msSupPr','mstrikeBLTR','mstrikeH','mstrikeTLBR','mstrikeV','msub','msubHide',
-      'msup','msupHide','mtransp','mtype','mvertJc','mvfmf','mvfml','mvtof','mvtol',
-      'mzeroAsc','mzeroDesc','mzeroWid','nesttableprops','nextfile','nonesttables',
-      'objalias','objclass','objdata','object','objname','objsect','objtime','oldcprops',
-      'oldpprops','oldsprops','oldtprops','oleclsid','operator','panose','password',
-      'passwordhash','pgp','pgptbl','picprop','pict','pn','pnseclvl','pntext','pntxta',
-      'pntxtb','printim','private','propname','protend','protstart','protusertbl','pxe',
-      'result','revtbl','revtim','rsidtbl','rxe','shp','shpgrp','shpinst',
-      'shppict','shprslt','shptxt','sn','sp','staticval','stylesheet','subject','sv',
-      'svb','tc','template','themedata','title','txe','ud','upr','userprops',
-      'wgrffmtfilter','windowcaption','writereservation','writereservhash','xe','xform',
-      'xmlattrname','xmlattrvalue','xmlclose','xmlname','xmlnstbl',
-      'xmlopen',
-   ))
-   # Translation of some special characters.
-   specialchars = {
-      'par': '\n',
-      'sect': '\n\n',
-      'page': '\n\n',
-      'line': '\n',
-      'tab': '\t',
-      'emdash': u'\u2014',
-      'endash': u'\u2013',
-      'emspace': u'\u2003',
-      'enspace': u'\u2002',
-      'qmspace': u'\u2005',
-      'bullet': u'\u2022',
-      'lquote': u'\u2018',
-      'rquote': u'\u2019',
-      'ldblquote': u'\201C',
-      'rdblquote': u'\u201D',
-   }
-   stack = []
-   ignorable = False       # Whether this group (and all inside it) are "ignorable".
-   ucskip = 1              # Number of ASCII characters to skip after a unicode character.
-   curskip = 0             # Number of ASCII characters left to skip
-   out = []                # Output buffer.
-   for match in pattern.finditer(text):
-      word,arg,hex,char,brace,tchar = match.groups()
-      if brace:
-         curskip = 0
-         if brace == '{':
-            # Push state
-            stack.append((ucskip,ignorable))
-         elif brace == '}':
-            # Pop state
-            ucskip,ignorable = stack.pop()
-      elif char: # \x (not a letter)
-         curskip = 0
-         if char == '~':
-            if not ignorable:
-                out.append(u'\xA0')
-         elif char in '{}\\':
-            if not ignorable:
-               out.append(char)
-         elif char == '*':
-            ignorable = True
-      elif word: # \foo
-         curskip = 0
-         if word in destinations:
-            ignorable = True
-         elif ignorable:
-            pass
-         elif word in specialchars:
-            out.append(specialchars[word])
-         elif word == 'uc':
-            ucskip = int(arg)
-         elif word == 'u':
-            c = int(arg)
-            if c < 0: c += 0x10000
-            if c > 127: out.append(unichr(c))
-            else: out.append(chr(c))
-            curskip = ucskip
-      elif hex: # \'xx
-         if curskip > 0:
-            curskip -= 1
-         elif not ignorable:
-            c = int(hex,16)
-            if c > 127: out.append(unichr(c))
-            else: out.append(chr(c))
-      elif tchar:
-         if curskip > 0:
-            curskip -= 1
-         elif not ignorable:
-            out.append(tchar)
-   return ''.join(out)