Move strip_rtf and change it into a class StripRtf + other changes. It might not work that well yet.

2012-06-24 21:08:20 +03:00 · 2012-06-24 21:08:20 +03:00 · 36f7e03dc0
parent 0a2ea0fc51
commit 36f7e03dc0
3 changed files with 255 additions and 312 deletions
--- a/openlp/plugins/songs/lib/init.py
+++ b/openlp/plugins/songs/lib/init.py
@ -24,6 +24,7 @@
 # with this program; if not, write to the Free Software Foundation, Inc., 59  #
 # Temple Place, Suite 330, Boston, MA 02111-1307 USA                          #
 ###############################################################################
 import logging
 import re
 from PyQt4 import QtGui
@ -33,6 +34,8 @@ from openlp.core.utils import CONTROL_CHARS
 from db import Author
 from ui import SongStrings
 log = logging.getLogger(__name__)
 WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
 APOSTROPHE = re.compile(u'[\'`’ʻ′]', re.UNICODE)
@ -194,7 +197,7 @@ class VerseType(object):
        return verse_index
-def retrieve_windows_encoding(recommendation=None):
+def retrieve_windows_encoding(recommendation=None, example_text=None):
    """
    Determines which encoding to use on an information source. The process uses
    both automated detection, which is passed to this method as a
@ -203,6 +206,9 @@ def retrieve_windows_encoding(recommendation=None):
    ``recommendation``
        A recommended encoding discovered programmatically for the user to
        confirm.
    ``example_text``
        Still not decoded text to show to users to help them decide.
    """
    # map chardet result to compatible windows standard code page
    codepage_mapping = {'IBM866': u'cp866', 'TIS-620': u'cp874',
@ -365,6 +371,217 @@ def clean_song(manager, song):
    if song.copyright:
        song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
 class StripRtf(object):
    """
    Strip RTF control structures from a text and return an unicode string.

    The class only carries lookup tables; all parsing state lives inside
    ``strip_rtf`` so one instance can be reused for any number of texts.

    Thanks to Markus Jarderot (MizardX) for this code, used by permission.
    http://stackoverflow.com/questions/188545
    """
    # One alternative per token kind, in capture-group order: control word
    # with optional numeric argument, \'xx hex escape, control symbol, brace,
    # raw line breaks (no group, dropped), any other single character.
    pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'"
        r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
    # Control words which specify a "destination" to be ignored.
    destinations = frozenset((
        u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor',
        u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime',
        u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend',
        u'bkmkstart', u'blipuid', u'buptim', u'category',
        u'colorschememapping', u'colortbl', u'comment', u'company', u'creatim',
        u'datafield', u'datastore', u'defchp', u'defpap', u'do', u'doccomm',
        u'docvar', u'dptxbxtext', u'ebcend', u'ebcstart', u'factoidname',
        u'falt', u'fchars', u'ffdeftext', u'ffentrymcr', u'ffexitmcr',
        u'ffformat', u'ffhelptext', u'ffl', u'ffname', u'ffstattext', u'field',
        u'file', u'filetbl', u'fldinst', u'fldrslt', u'fldtype', u'fname',
        u'fontemb', u'fontfile', u'footer', u'footerf', u'footerl', u'footerr',
        u'footnote', u'formfield', u'ftncn', u'ftnsep', u'ftnsepc', u'g',
        u'generator', u'gridtbl', u'header', u'headerf', u'headerl',
        u'headerr', u'hl', u'hlfr', u'hlinkbase', u'hlloc', u'hlsrc', u'hsv',
        u'htmltag', u'info', u'keycode', u'keywords', u'latentstyles',
        u'lchars', u'levelnumbers', u'leveltext', u'lfolevel', u'linkval',
        u'list', u'listlevel', u'listname', u'listoverride',
        u'listoverridetable', u'listpicture', u'liststylename', u'listtable',
        u'listtext', u'lsdlockedexcept', u'macc', u'maccPr', u'mailmerge',
        u'maln', u'malnScr', u'manager', u'margPr', u'mbar', u'mbarPr',
        u'mbaseJc', u'mbegChr', u'mborderBox', u'mborderBoxPr', u'mbox',
        u'mboxPr', u'mchr', u'mcount', u'mctrlPr', u'md', u'mdeg', u'mdegHide',
        u'mden', u'mdiff', u'mdPr', u'me', u'mendChr', u'meqArr', u'meqArrPr',
        u'mf', u'mfName', u'mfPr', u'mfunc', u'mfuncPr', u'mgroupChr',
        u'mgroupChrPr', u'mgrow', u'mhideBot', u'mhideLeft', u'mhideRight',
        u'mhideTop', u'mhtmltag', u'mlim', u'mlimloc', u'mlimlow',
        u'mlimlowPr', u'mlimupp', u'mlimuppPr', u'mm', u'mmaddfieldname',
        u'mmath', u'mmathPict', u'mmathPr', u'mmaxdist', u'mmc', u'mmcJc',
        u'mmconnectstr', u'mmconnectstrdata', u'mmcPr', u'mmcs',
        u'mmdatasource', u'mmheadersource', u'mmmailsubject', u'mmodso',
        u'mmodsofilter', u'mmodsofldmpdata', u'mmodsomappedname',
        u'mmodsoname', u'mmodsorecipdata', u'mmodsosort', u'mmodsosrc',
        u'mmodsotable', u'mmodsoudl', u'mmodsoudldata', u'mmodsouniquetag',
        u'mmPr', u'mmquery', u'mmr', u'mnary', u'mnaryPr', u'mnoBreak',
        u'mnum', u'mobjDist', u'moMath', u'moMathPara', u'moMathParaPr',
        u'mopEmu', u'mphant', u'mphantPr', u'mplcHide', u'mpos', u'mr',
        u'mrad', u'mradPr', u'mrPr', u'msepChr', u'mshow', u'mshp', u'msPre',
        u'msPrePr', u'msSub', u'msSubPr', u'msSubSup', u'msSubSupPr', u'msSup',
        u'msSupPr', u'mstrikeBLTR', u'mstrikeH', u'mstrikeTLBR', u'mstrikeV',
        u'msub', u'msubHide', u'msup', u'msupHide', u'mtransp', u'mtype',
        u'mvertJc', u'mvfmf', u'mvfml', u'mvtof', u'mvtol', u'mzeroAsc',
        u'mzeroDesc', u'mzeroWid', u'nesttableprops', u'nextfile',
        u'nonesttables', u'objalias', u'objclass', u'objdata', u'object',
        u'objname', u'objsect', u'objtime', u'oldcprops', u'oldpprops',
        u'oldsprops', u'oldtprops', u'oleclsid', u'operator', u'panose',
        u'password', u'passwordhash', u'pgp', u'pgptbl', u'picprop', u'pict',
        u'pn', u'pnseclvl', u'pntext', u'pntxta', u'pntxtb', u'printim',
        u'private', u'propname', u'protend', u'protstart', u'protusertbl',
        u'pxe', u'result', u'revtbl', u'revtim', u'rsidtbl', u'rxe', u'shp',
        u'shpgrp', u'shpinst', u'shppict', u'shprslt', u'shptxt', u'sn', u'sp',
        u'staticval', u'stylesheet', u'subject', u'sv', u'svb', u'tc',
        u'template', u'themedata', u'title', u'txe', u'ud', u'upr',
        u'userprops', u'wgrffmtfilter', u'windowcaption', u'writereservation',
        u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue',
        u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen'))
    # Translation of some special characters.
    specialchars = {
        u'par': u'\n',
        u'sect': u'\n\n',
        u'page': u'\n\n',
        u'line': u'\n',
        u'tab': u'\t',
        u'emdash': u'\u2014',
        u'endash': u'\u2013',
        u'emspace': u'\u2003',
        u'enspace': u'\u2002',
        u'qmspace': u'\u2005',
        u'bullet': u'\u2022',
        u'lquote': u'\u2018',
        u'rquote': u'\u2019',
        u'ldblquote': u'\u201C',
        u'rdblquote': u'\u201D'}
    # Font charset control words mapped to Python codec names. A value of
    # None means no codec is mapped here and the caller supplied default
    # encoding is used instead.
    charset_mapping = {
        # West European encoding
        u'fcharset0': u'cp1252',
        u'fcharset1': None,
        u'fcharset2': None,
        u'fcharset77': None,
        u'fcharset128': None,
        u'fcharset129': None,
        u'fcharset130': None,
        u'fcharset134': None,
        u'fcharset136': None,
        # Greek encoding
        u'fcharset161': u'cp1253',
        # Turkish encoding
        u'fcharset162': u'cp1254',
        # Vietnamese encoding
        u'fcharset163': u'cp1258',
        # Hebrew encoding
        u'fcharset177': u'cp1255',
        # Arabic encoding
        u'fcharset178': u'cp1256',
        # Baltic encoding
        u'fcharset186': u'cp1257',
        # Cyrillic encoding
        u'fcharset204': u'cp1251',
        # Thai encoding
        u'fcharset222': u'cp874',
        # Central+East European encoding
        u'fcharset238': u'cp1250'}

    def strip_rtf(self, text, default_encoding=None):
        """
        Return the plain text contained in an RTF document.

        Groups introduced by an ignorable destination are dropped, special
        control words such as paragraph or tab are translated, unicode
        escapes are converted to the matching character and hex escapes are
        decoded with the encoding of the font that is current at that point.

        ``text``
            The RTF source to strip.

        ``default_encoding``
            Codec name used for hex escapes when neither the font table nor
            the document header names one. When it is missing or a byte can
            not be decoded, ``retrieve_windows_encoding`` is asked once; a
            falsy or already failed answer makes the remaining undecodable
            bytes come out as U+FFFD instead of asking over and over.
        """
        # unichr() only exists on Python 2, chr() is its equivalent later.
        try:
            to_unichr = unichr
        except NameError:
            to_unichr = chr
        # Current font is the font tag we last met.
        font = u''
        # Character encoding is defined inside fonttable.
        # font_table could contain eg u'0': u'cp1252'; the u'' entry holds
        # the document wide default.
        font_table = {u'': default_encoding}
        # Whether we are inside the font table.
        inside_font_table = False
        # Stack of things to keep track of when entering/leaving groups.
        stack = []
        # Whether this group (and all inside it) are "ignorable".
        ignorable = False
        # Number of ASCII characters to skip after an unicode character.
        ucskip = 1
        # Number of ASCII characters left to skip.
        curskip = 0
        # Cleared once the encoding fallback gave no usable answer.
        can_ask_user = True
        # Output buffer.
        out = []
        for match in self.pattern.finditer(text):
            word, arg, hexcode, char, brace, tchar = match.groups()
            if brace:
                curskip = 0
                if brace == u'{':
                    # Push state
                    stack.append((ucskip, ignorable, font, inside_font_table))
                elif stack:
                    # Pop state; an unbalanced closing brace is ignored.
                    ucskip, ignorable, font, inside_font_table = stack.pop()
            # \x (not a letter)
            elif char:
                curskip = 0
                if char == u'~':
                    if not ignorable:
                        out.append(u'\xA0')
                elif char in u'{}\\':
                    if not ignorable:
                        out.append(char)
                elif char == u'*':
                    ignorable = True
            # \command
            elif word:
                curskip = 0
                if word in self.destinations:
                    ignorable = True
                elif word in self.specialchars:
                    if not ignorable:
                        out.append(self.specialchars[word])
                elif word == u'uc':
                    if arg is not None:
                        ucskip = int(arg)
                elif word == u'u':
                    if arg is not None:
                        code = int(arg)
                        # Values are written as signed 16 bit numbers.
                        if code < 0:
                            code += 0x10000
                        if not ignorable:
                            out.append(to_unichr(code))
                        curskip = ucskip
                elif word == u'fonttbl':
                    inside_font_table = True
                    ignorable = True
                elif word == u'f':
                    # The encoding is looked up from the font when a byte is
                    # decoded, so it follows the font across group ends.
                    font = arg
                elif word == u'ansicpg':
                    if arg is not None and \
                        (inside_font_table or font == u''):
                        font_table[font] = u'cp' + arg
                elif word == u'fcharset':
                    charset_reference = word + (arg or u'')
                    if charset_reference in self.charset_mapping:
                        charset = self.charset_mapping[charset_reference] \
                            or default_encoding
                    else:
                        log.error(u"Charset '%s' not in charset_mapping "
                            u"dictionary in "
                            u"openlp/plugins/songs/lib/__init__.py"
                            % charset_reference)
                        charset = default_encoding
                    if inside_font_table or font == u'':
                        font_table[font] = charset
            # \'xx
            elif hexcode:
                if curskip > 0:
                    curskip -= 1
                elif not ignorable:
                    raw = bytearray((int(hexcode, 16),))
                    encoding = font_table.get(font, default_encoding)
                    failed = set()
                    while True:
                        if encoding:
                            try:
                                out.append(raw.decode(encoding))
                                break
                            except (UnicodeDecodeError, LookupError):
                                failed.add(encoding)
                        if can_ask_user:
                            encoding = \
                                retrieve_windows_encoding(default_encoding)
                            if encoding and encoding not in failed:
                                # Remember the answer for this font.
                                font_table[font] = encoding
                                continue
                            can_ask_user = False
                        out.append(u'\ufffd')
                        break
            elif tchar:
                if curskip > 0:
                    curskip -= 1
                elif not ignorable:
                    out.append(tchar)
        return u''.join(out)
 from xml import OpenLyrics, SongXML
 from songstab import SongsTab
 from mediaitem import SongMediaItem
--- a/openlp/plugins/songs/lib/ewimport.py
+++ b/openlp/plugins/songs/lib/ewimport.py
@ -35,7 +35,7 @@ import re
 from openlp.core.lib import translate
 from openlp.plugins.songs.lib import VerseType
-from openlp.plugins.songs.lib import retrieve_windows_encoding
+from openlp.plugins.songs.lib import retrieve_windows_encoding, StripRtf
 from songimport import SongImport
 RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')
@ -44,101 +44,6 @@ SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*')
 NUMBER_REGEX = re.compile(r'[0-9]+')
 NOTE_REGEX = re.compile(r'\(.*?\)')
 def strip_rtf(blob, encoding):
    """
    Return the plain text of an RTF blob as an unicode string.

    The blob is walked one character at a time. A backslash starts a control
    word which is collected until a delimiter is met; paragraph, line and tab
    words are translated, font charset words switch the encoding and hex
    escapes are decoded with the encoding that is active at that point.

    ``blob``
        The RTF source to strip.

    ``encoding``
        Codec name used for hex escapes until a known font charset control
        word replaces it.
    """
    # Brace nesting level; plain text deeper than level 2 is dropped below.
    depth = 0
    # True while the characters of a control word are being collected.
    control = False
    clear_text = []
    control_word = []
    # workaround for \tx bug: remove one pair of curly braces
    # if \tx is encountered
    match = RTF_STRIPPING_REGEX.search(blob)
    if match:
        # start and end indices of match are curly braces - filter them out
        # NOTE(review): match.end() is the index just past the match, so the
        # character following the closing brace is the one that gets dropped,
        # not the closing brace itself - confirm this is intended.
        blob = ''.join([blob[i] for i in xrange(len(blob))
            if i != match.start() and i !=match.end()])
    for c in blob:
        if control:
            # for delimiters, set control to False
            if c == '{':
                if control_word:
                    depth += 1
                control = False
            elif c == '}':
                if control_word:
                    depth -= 1
                control = False
            elif c == '\\':
                # A backslash right after a control word starts the next one;
                # with nothing collected yet it is an escaped backslash.
                new_control = bool(control_word)
                control = False
            elif c.isspace():
                control = False
            else:
                control_word.append(c)
                # A hex escape is complete after the quote and two digits.
                if len(control_word) == 3 and control_word[0] == '\'':
                    control = False
            if not control:
                if not control_word:
                    # Escaped brace or backslash: emit it literally.
                    if c == '{' or c == '}' or c == '\\':
                        clear_text.append(c)
                else:
                    control_str = ''.join(control_word)
                    if control_str == 'par' or control_str == 'line':
                        clear_text.append(u'\n')
                    elif control_str == 'tab':
                        clear_text.append(u'\t')
                    # Prefer the encoding specified by the RTF data to that
                    # specified by the Paradox table header
                    # West European encoding
                    elif control_str == 'fcharset0':
                        encoding = u'cp1252'
                    # Greek encoding
                    elif control_str == 'fcharset161':
                        encoding = u'cp1253'
                    # Turkish encoding
                    elif control_str == 'fcharset162':
                        encoding = u'cp1254'
                    # Vietnamese encoding
                    elif control_str == 'fcharset163':
                        encoding = u'cp1258'
                    # Hebrew encoding
                    elif control_str == 'fcharset177':
                        encoding = u'cp1255'
                    # Arabic encoding
                    elif control_str == 'fcharset178':
                        encoding = u'cp1256'
                    # Baltic encoding
                    elif control_str == 'fcharset186':
                        encoding = u'cp1257'
                    # Cyrillic encoding
                    elif control_str == 'fcharset204':
                        encoding = u'cp1251'
                    # Thai encoding
                    elif control_str == 'fcharset222':
                        encoding = u'cp874'
                    # Central+East European encoding
                    elif control_str == 'fcharset238':
                        encoding = u'cp1250'
                    # Hex escape: decode the byte with the current encoding.
                    elif control_str[0] == '\'':
                        s = chr(int(control_str[1:3], 16))
                        clear_text.append(s.decode(encoding))
                    del control_word[:]
            # new_control is only read when c is a backslash, and the
            # backslash branch above has just assigned it in that case.
            if c == '\\' and new_control:
                control = True
        elif c == '{':
            depth += 1
        elif c == '}':
            depth -= 1
        elif depth > 2:
            continue
        # Line breaks in the RTF source carry no meaning and are dropped.
        elif c == '\n' or c == '\r':
            continue
        elif c == '\\':
            control = True
        else:
            clear_text.append(c)
    return u''.join(clear_text)
 class FieldDescEntry:
    def __init__(self, name, type, size):
@ -154,6 +59,7 @@ class EasyWorshipSongImport(SongImport):
    """
    def __init__(self, manager, **kwargs):
        """
        Initialise the importer and create the RTF stripper it uses.
        """
        SongImport.__init__(self, manager, **kwargs)
        # Helper used by doImport to turn the RTF lyrics into plain text.
        self.rtf = StripRtf()
    def doImport(self):
        # Open the DB and MB files if they exist
@ -273,7 +179,7 @@ class EasyWorshipSongImport(SongImport):
                        self.addAuthor(author_name.strip())
                if words:
                    # Format the lyrics
-                    words = strip_rtf(words, self.encoding)
+                    words = self.rtf.strip_rtf(words, self.encoding)
                    verse_type = VerseType.Tags[VerseType.Verse]
                    for verse in SLIDE_BREAK_REGEX.split(words):
                        verse = verse.strip()
--- a/openlp/plugins/songs/lib/sundayplusimport.py
+++ b/openlp/plugins/songs/lib/sundayplusimport.py
@ -26,11 +26,11 @@
 ###############################################################################
 import logging
 import os
 import re
 from os.path import split
 from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding
 from openlp.plugins.songs.lib import StripRtf
 from openlp.plugins.songs.lib.songimport import SongImport
 log = logging.getLogger(__name__)
@ -42,7 +42,7 @@ class SundayPlusImport(SongImport):
    The format examples can be found attached to bug report at
    <http://support.openlp.org/issues/395>
    """
-    hotkey_to_verse_type = {
+    hotkeyToVerseType = {
        u'1': u'v1',
        u'2': u'v2',
        u'3': u'v3',
@ -61,6 +61,7 @@ class SundayPlusImport(SongImport):
        Initialise the class.
        """
        SongImport.__init__(self, manager, **kwargs)
        self.rtf = StripRtf()
    def doImport(self):
        self.importWizard.progressBar.setMaximum(len(self.importSource))
@ -81,32 +82,33 @@ class SundayPlusImport(SongImport):
            self.logError(file.name)
            return
        if self.title == '':
-            self.title = self.title_from_filename(file.name)
+            self.title = self.titleFromFilename(file.name)
        if not self.finish():
            self.logError(file.name)
    def parse(self, data, cell = False):
-        if data[0] != '[' and data[-1] != ']':
+        if len(data) == 0 or data[0:1] != '[' or data[-1] != ']':
            self.logError(u'File is malformed')
            return False
        i = 1
        verse_type = VerseType.Tags[VerseType.Verse]
        while i < len(data):
-            byte = data[i]
+            # Data is held as #name: value pairs inside groups marked as [].
-            if byte == '#':
+            # Now we are looking for name.
-                end = data.find(':', i+1)
+            if data[i:i+1] == '#':
-                name = data[i+1:end]
+                name_end = data.find(':', i+1)
-                i = end + 1
+                name = data[i+1:name_end]
-                while data[i] == ' ':
+                i = name_end + 1
                while data[i:i+1] == ' ':
                    i += 1
-                if data[i] == '"':
+                if data[i:i+1] == '"':
                    end = data.find('"', i+1)
                    value = data[i+1:end]
-                elif data[i] == '[':
+                elif data[i:i+1] == '[':
                    j = i
                    inside_quotes = False
                    while j < len(data):
-                        char = data[j]
+                        char = data[j:j+1]
                        if char == '"':
                            inside_quotes = not inside_quotes
                        elif not inside_quotes and char == ']':
@ -119,6 +121,7 @@ class SundayPlusImport(SongImport):
                    if data.find('(', i, end) != -1:
                        end = data.find(')', i) + 1
                    value = data[i:end]
                # If we are in the main group.
                if cell == False:
                    if name == 'title':
                        self.title = self.decode(self.unescape(value))
@ -130,6 +133,7 @@ class SundayPlusImport(SongImport):
                        self.copyright = self.decode(self.unescape(value))
                    elif name[0:4] == 'CELL':
                        self.parse(value, cell = name[4:])
                # We are in a verse group.
                else:
                    if name == 'MARKER_NAME':
                        value = value.strip()
@ -141,23 +145,27 @@ class SundayPlusImport(SongImport):
                                verse_type = "%s%s" % (verse_type, value[-1])
                    elif name == 'Hotkey':
                        # Hotkey always appears after MARKER_NAME, so it
-                        # effectivetly overrides MARKER_NAME, if present.
+                        # effectively overrides MARKER_NAME, if present.
                        if len(value) and \
-                            value in self.hotkey_to_verse_type.keys():
+                            value in self.hotkeyToVerseType.keys():
-                            verse_type = self.hotkey_to_verse_type[value]
+                            verse_type = self.hotkeyToVerseType[value]
                    if name == 'rtf':
                        value = self.unescape(value)
-                        verse = self.strip_rtf(value, self.encoding).strip()
+                        verse = self.rtf.strip_rtf(value, self.encoding)
-                        lines = verse.split('\n')
+                        lines = verse.strip().split('\n')
                        # If any line inside any verse contains CCLI or
                        # only Public Domain, we treat this as special data:
                        # we remove that line and add data to specific field.
                        for i in xrange(len(lines)):
                            lines[i] = lines[i].strip()
                            line = lines[i]
-                            if line[:4] in u'CCLI':
+                            if line[:4].lower() == u'ccli':
                                m = re.search(r'[0-9]+', line)
                                if m:
                                    self.ccliNumber = int(m.group(0))
                                    lines.pop(i)
                            elif line.lower() == u'public domain':
                                self.copyright = u'Public Domain'
                                lines.pop(i)
                        self.addVerse('\n'.join(lines).strip(), verse_type)
                if end == -1:
@ -166,13 +174,12 @@ class SundayPlusImport(SongImport):
            i += 1
        return True
-    def title_from_filename(self, filename):
+    def titleFromFilename(self, filename):
-        filename = split(filename)[1]
+        title = os.path.split(filename)[1]
-        if len(filename) > 4 and filename[-4:].lower() == u'.ptf':
+        if title.endswith(u'.ptf'):
-            title = filename[:-4]
+            title = title[:-4]
-        else:
+        # For some strange reason all example files names ended with 1-7.
-            title = filename
+        if title.endswith('1-7'):
        if title[-3:] == '1-7':
            title = title[:-3]
        return title.replace(u'_', u' ')
@ -190,190 +197,3 @@ class SundayPlusImport(SongImport):
        text = text.replace('^', '\'')
        return text.strip()
    def strip_rtf(self, text, encoding):
        """
        Return the plain text contained in an RTF document.

        Ignorable destinations are dropped, special control words are
        translated, unicode escapes are converted to characters and hex
        escapes are decoded with the encoding that is active at that point.

        ``text``
            The RTF source to strip.

        ``encoding``
            Codec name used for hex escapes until a font or charset control
            word replaces it.
        """
        # Thanks to Markus Jarderot (MizardX) for this code, used by permission
        # <http://stackoverflow.com/questions/188545/regular-expression-for-
        # extracting-text-from-an-rtf-string>
        # Capture groups in order: control word, its numeric argument, hex
        # escape, control symbol, brace, any other single character.
        pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'"
            r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
        # Control words which specify a "destination" and we can ignore it.
        destinations = frozenset((
            'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor', 
            'atndate', 'atnicn', 'atnid', 'atnparent', 'atnref', 'atntime',
            'atrfend', 'atrfstart', 'author', 'background', 'bkmkend',
            'bkmkstart', 'blipuid', 'buptim', 'category', 'colorschememapping',
            'colortbl', 'comment', 'company', 'creatim', 'datafield',
            'datastore', 'defchp', 'defpap', 'do', 'doccomm', 'docvar',
            'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'falt', 'fchars',
            'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'ffformat', 'ffhelptext',
            'ffl', 'ffname', 'ffstattext', 'field', 'file', 'filetbl',
            'fldinst', 'fldrslt', 'fldtype', 'fname', 'fontemb', 'fontfile', 
            'footer', 'footerf', 'footerl', 'footerr', 'footnote',
            'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'generator',
            'gridtbl', 'header', 'headerf', 'headerl', 'headerr', 'hl', 'hlfr',
            'hlinkbase', 'hlloc', 'hlsrc', 'hsv', 'htmltag', 'info', 'keycode',
            'keywords', 'latentstyles', 'lchars', 'levelnumbers', 'leveltext',
            'lfolevel', 'linkval', 'list', 'listlevel', 'listname',
            'listoverride', 'listoverridetable', 'listpicture', 'liststylename',
            'listtable', 'listtext', 'lsdlockedexcept', 'macc', 'maccPr', 
            'mailmerge', 'maln', 'malnScr', 'manager', 'margPr', 'mbar',
            'mbarPr', 'mbaseJc', 'mbegChr', 'mborderBox', 'mborderBoxPr',
            'mbox', 'mboxPr', 'mchr', 'mcount', 'mctrlPr', 'md', 'mdeg', 
            'mdegHide', 'mden', 'mdiff', 'mdPr', 'me', 'mendChr', 'meqArr',
            'meqArrPr', 'mf', 'mfName', 'mfPr', 'mfunc', 'mfuncPr', 'mgroupChr',
            'mgroupChrPr', 'mgrow', 'mhideBot', 'mhideLeft', 'mhideRight',
            'mhideTop', 'mhtmltag', 'mlim', 'mlimloc', 'mlimlow', 'mlimlowPr',
            'mlimupp', 'mlimuppPr', 'mm', 'mmaddfieldname', 'mmath',
            'mmathPict', 'mmathPr', 'mmaxdist', 'mmc', 'mmcJc', 'mmconnectstr',
            'mmconnectstrdata', 'mmcPr', 'mmcs', 'mmdatasource',
            'mmheadersource', 'mmmailsubject', 'mmodso', 'mmodsofilter',
            'mmodsofldmpdata', 'mmodsomappedname', 'mmodsoname',
            'mmodsorecipdata', 'mmodsosort', 'mmodsosrc', 'mmodsotable',
            'mmodsoudl', 'mmodsoudldata', 'mmodsouniquetag', 'mmPr', 'mmquery',
            'mmr', 'mnary', 'mnaryPr', 'mnoBreak', 'mnum', 'mobjDist', 'moMath',
            'moMathPara', 'moMathParaPr', 'mopEmu', 'mphant', 'mphantPr',
            'mplcHide', 'mpos', 'mr', 'mrad', 'mradPr', 'mrPr', 'msepChr',
            'mshow', 'mshp', 'msPre', 'msPrePr', 'msSub', 'msSubPr', 'msSubSup',
            'msSubSupPr', 'msSup', 'msSupPr', 'mstrikeBLTR', 'mstrikeH',
            'mstrikeTLBR', 'mstrikeV', 'msub', 'msubHide', 'msup', 'msupHide',
            'mtransp', 'mtype', 'mvertJc', 'mvfmf', 'mvfml', 'mvtof', 'mvtol',
            'mzeroAsc', 'mzeroDesc', 'mzeroWid', 'nesttableprops', 'nextfile',
            'nonesttables', 'objalias', 'objclass', 'objdata', 'object',
            'objname', 'objsect', 'objtime', 'oldcprops', 'oldpprops',
            'oldsprops', 'oldtprops', 'oleclsid', 'operator', 'panose',
            'password', 'passwordhash', 'pgp', 'pgptbl', 'picprop', 'pict',
            'pn', 'pnseclvl', 'pntext', 'pntxta', 'pntxtb', 'printim',
            'private', 'propname', 'protend', 'protstart', 'protusertbl', 'pxe',
            'result', 'revtbl', 'revtim', 'rsidtbl', 'rxe', 'shp', 'shpgrp',
            'shpinst', 'shppict', 'shprslt', 'shptxt', 'sn', 'sp', 'staticval',
            'stylesheet', 'subject', 'sv', 'svb', 'tc', 'template', 'themedata',
            'title', 'txe', 'ud', 'upr', 'userprops', 'wgrffmtfilter',
            'windowcaption', 'writereservation', 'writereservhash', 'xe',
            'xform', 'xmlattrname', 'xmlattrvalue', 'xmlclose', 'xmlname',
            'xmlnstbl', 'xmlopen'))
        # Translation of some special characters.
        specialchars = {
            u'par': u'\n',
            u'sect': u'\n\n',
            u'page': u'\n\n',
            u'line': u'\n',
            u'tab': u'\t',
            u'emdash': u'\u2014',
            u'endash': u'\u2013',
            u'emspace': u'\u2003',
            u'enspace': u'\u2002',
            u'qmspace': u'\u2005',
            u'bullet': u'\u2022',
            u'lquote': u'\u2018',
            u'rquote': u'\u2019',
            u'ldblquote': u'\u201C',
            u'rdblquote': u'\u201D'}
        # Control word plus argument mapped to the Python codec name.
        charset_mapping = {
            # Thai encoding
            'fcharset222': u'cp874',
            'ansicpg874': u'cp874',
            # Central+East European encoding
            'fcharset238': u'cp1250',
            'ansicpg1250': u'cp1250',
            # Cyrillic encoding
            'fcharset204': u'cp1251',
            'ansicpg1251': u'cp1251',
            # West European encoding
            'fcharset0': u'cp1252',
            'ansicpg1252': u'cp1252',
            # Greek encoding
            'fcharset161': u'cp1253',
            'ansicpg1253': u'cp1253',
            # Turkish encoding
            'fcharset162': u'cp1254',
            'ansicpg1254': u'cp1254',
            # Hebrew encoding
            'fcharset177': u'cp1255',
            'ansicpg1255': u'cp1255',
            # Arabic encoding
            'fcharset178': u'cp1256',
            'ansicpg1256': u'cp1256',
            # Baltic encoding
            'fcharset186': u'cp1257',
            'ansicpg1257': u'cp1257',
            # Vietnamese encoding
            'fcharset163': u'cp1258',
            'ansicpg1258': u'cp1258'}
        # NOTE(review): charsets is never read again in this method.
        charsets = charset_mapping.keys()
        # Character encoding is defined together with fonts.
        # font_table could contain eg '0': 'cp1252'
        font_table = {}
        # Saved (ucskip, ignorable, inside_font_table) of the open groups.
        stack = []
        # Whether this group (and all inside it) are "ignorable".
        ignorable = False
        # Whether we are inside the font table.
        inside_font_table = False
        current_font = ''
        # Number of ASCII characters to skip after an unicode character.
        ucskip = 1
        # Number of ASCII characters left to skip.
        curskip = 0
        # Output buffer.
        out = []
        for match in pattern.finditer(text):
            word, arg, hex, char, brace, tchar = match.groups()
            if brace:
                curskip = 0
                if brace == u'{':
                    # Push state
                    stack.append((ucskip, ignorable, inside_font_table))
                elif brace == u'}':
                    # Pop state
                    # NOTE(review): raises IndexError when a closing brace
                    # has no matching opening brace.
                    ucskip, ignorable, inside_font_table = stack.pop()
            # \x (not a letter)
            elif char:
                curskip = 0
                if char == '~':
                    if not ignorable:
                        out.append(u'\xA0')
                elif char in u'{}\\':
                    if not ignorable:
                        out.append(char)
                elif char == u'*':
                    ignorable = True
            # \foo
            elif word:
                curskip = 0
                if word in destinations:
                    ignorable = True
                elif word in specialchars:
                    # NOTE(review): appended even inside an ignorable group.
                    out.append(specialchars[word])
                elif word == u'uc':
                    ucskip = int(arg)
                elif word == u'u':
                    c = int(arg)
                    # Negative arguments are shifted into the 0-0xFFFF range.
                    if c < 0:
                        c += 0x10000
                    out.append(unichr(c))
                    curskip = ucskip
                elif word == 'fonttbl':
                    inside_font_table = True
                    ignorable = True
                elif word == 'f':
                    current_font = arg
                    if not inside_font_table:
                        # NOTE(review): raises KeyError for a font that got
                        # no charset entry in the font table.
                        encoding = font_table[arg]
                elif word in ('ansicpg', 'fcharset'):
                    # NOTE(review): both lookups raise KeyError for a value
                    # missing from charset_mapping, and TypeError when the
                    # control word carries no numeric argument.
                    if inside_font_table:
                        font_table[current_font] = charset_mapping[word + arg]
                    else:
                        encoding = charset_mapping[word + arg]
            # \'xx
            elif hex:
                if curskip > 0:
                    curskip -= 1
                elif not ignorable:
                    # Decode the single byte with the current encoding.
                    c = int(hex, 16)
                    out.append(chr(c).decode(encoding))
            elif tchar:
                if curskip > 0:
                    curskip -= 1
                elif not ignorable:
                    out.append(tchar)
        return ''.join(out)