Move strip_rtf and change it into a class StripRtf + other changes. It might not work that well yet.

This commit is contained in:
Mattias Põldaru 2012-06-24 21:08:20 +03:00
parent 0a2ea0fc51
commit 36f7e03dc0
3 changed files with 255 additions and 312 deletions

View File

@ -24,6 +24,7 @@
# with this program; if not, write to the Free Software Foundation, Inc., 59 # # with this program; if not, write to the Free Software Foundation, Inc., 59 #
# Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Temple Place, Suite 330, Boston, MA 02111-1307 USA #
############################################################################### ###############################################################################
import logging
import re import re
from PyQt4 import QtGui from PyQt4 import QtGui
@ -33,6 +34,8 @@ from openlp.core.utils import CONTROL_CHARS
from db import Author from db import Author
from ui import SongStrings from ui import SongStrings
log = logging.getLogger(__name__)
WHITESPACE = re.compile(r'[\W_]+', re.UNICODE) WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
APOSTROPHE = re.compile(u'[\'`ʻ]', re.UNICODE) APOSTROPHE = re.compile(u'[\'`ʻ]', re.UNICODE)
@ -194,7 +197,7 @@ class VerseType(object):
return verse_index return verse_index
def retrieve_windows_encoding(recommendation=None): def retrieve_windows_encoding(recommendation=None, example_text=None):
""" """
Determines which encoding to use on an information source. The process uses Determines which encoding to use on an information source. The process uses
both automated detection, which is passed to this method as a both automated detection, which is passed to this method as a
@ -203,6 +206,9 @@ def retrieve_windows_encoding(recommendation=None):
``recommendation`` ``recommendation``
A recommended encoding discovered programmatically for the user to A recommended encoding discovered programmatically for the user to
confirm. confirm.
``example_text``
Still not decoded text to show to users to help them decide.
""" """
# map chardet result to compatible windows standard code page # map chardet result to compatible windows standard code page
codepage_mapping = {'IBM866': u'cp866', 'TIS-620': u'cp874', codepage_mapping = {'IBM866': u'cp866', 'TIS-620': u'cp874',
@ -365,6 +371,217 @@ def clean_song(manager, song):
if song.copyright: if song.copyright:
song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip() song.copyright = CONTROL_CHARS.sub(u'', song.copyright).strip()
class StripRtf(object):
    """
    Strip RTF control structures from a string and return the plain text as
    a unicode string.

    Thanks to Markus Jarderot (MizardX) for this code, used by permission.
    http://stackoverflow.com/questions/188545
    """
    # One token per match: a control word with optional numeric argument, a
    # \'xx hex escape, a control symbol, a group brace, a run of line breaks
    # (no capture group, so it is silently dropped) or one plain character.
    pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'"
        r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
    # Control words which specify a "destination" to be ignored.
    destinations = frozenset((
        u'aftncn', u'aftnsep', u'aftnsepc', u'annotation', u'atnauthor',
        u'atndate', u'atnicn', u'atnid', u'atnparent', u'atnref', u'atntime',
        u'atrfend', u'atrfstart', u'author', u'background', u'bkmkend',
        u'bkmkstart', u'blipuid', u'buptim', u'category',
        u'colorschememapping', u'colortbl', u'comment', u'company', u'creatim',
        u'datafield', u'datastore', u'defchp', u'defpap', u'do', u'doccomm',
        u'docvar', u'dptxbxtext', u'ebcend', u'ebcstart', u'factoidname',
        u'falt', u'fchars', u'ffdeftext', u'ffentrymcr', u'ffexitmcr',
        u'ffformat', u'ffhelptext', u'ffl', u'ffname', u'ffstattext', u'field',
        u'file', u'filetbl', u'fldinst', u'fldrslt', u'fldtype', u'fname',
        u'fontemb', u'fontfile', u'footer', u'footerf', u'footerl', u'footerr',
        u'footnote', u'formfield', u'ftncn', u'ftnsep', u'ftnsepc', u'g',
        u'generator', u'gridtbl', u'header', u'headerf', u'headerl',
        u'headerr', u'hl', u'hlfr', u'hlinkbase', u'hlloc', u'hlsrc', u'hsv',
        u'htmltag', u'info', u'keycode', u'keywords', u'latentstyles',
        u'lchars', u'levelnumbers', u'leveltext', u'lfolevel', u'linkval',
        u'list', u'listlevel', u'listname', u'listoverride',
        u'listoverridetable', u'listpicture', u'liststylename', u'listtable',
        u'listtext', u'lsdlockedexcept', u'macc', u'maccPr', u'mailmerge',
        u'maln', u'malnScr', u'manager', u'margPr', u'mbar', u'mbarPr',
        u'mbaseJc', u'mbegChr', u'mborderBox', u'mborderBoxPr', u'mbox',
        u'mboxPr', u'mchr', u'mcount', u'mctrlPr', u'md', u'mdeg', u'mdegHide',
        u'mden', u'mdiff', u'mdPr', u'me', u'mendChr', u'meqArr', u'meqArrPr',
        u'mf', u'mfName', u'mfPr', u'mfunc', u'mfuncPr', u'mgroupChr',
        u'mgroupChrPr', u'mgrow', u'mhideBot', u'mhideLeft', u'mhideRight',
        u'mhideTop', u'mhtmltag', u'mlim', u'mlimloc', u'mlimlow',
        u'mlimlowPr', u'mlimupp', u'mlimuppPr', u'mm', u'mmaddfieldname',
        u'mmath', u'mmathPict', u'mmathPr', u'mmaxdist', u'mmc', u'mmcJc',
        u'mmconnectstr', u'mmconnectstrdata', u'mmcPr', u'mmcs',
        u'mmdatasource', u'mmheadersource', u'mmmailsubject', u'mmodso',
        u'mmodsofilter', u'mmodsofldmpdata', u'mmodsomappedname',
        u'mmodsoname', u'mmodsorecipdata', u'mmodsosort', u'mmodsosrc',
        u'mmodsotable', u'mmodsoudl', u'mmodsoudldata', u'mmodsouniquetag',
        u'mmPr', u'mmquery', u'mmr', u'mnary', u'mnaryPr', u'mnoBreak',
        u'mnum', u'mobjDist', u'moMath', u'moMathPara', u'moMathParaPr',
        u'mopEmu', u'mphant', u'mphantPr', u'mplcHide', u'mpos', u'mr',
        u'mrad', u'mradPr', u'mrPr', u'msepChr', u'mshow', u'mshp', u'msPre',
        u'msPrePr', u'msSub', u'msSubPr', u'msSubSup', u'msSubSupPr', u'msSup',
        u'msSupPr', u'mstrikeBLTR', u'mstrikeH', u'mstrikeTLBR', u'mstrikeV',
        u'msub', u'msubHide', u'msup', u'msupHide', u'mtransp', u'mtype',
        u'mvertJc', u'mvfmf', u'mvfml', u'mvtof', u'mvtol', u'mzeroAsc',
        u'mzeroDesc', u'mzeroWid', u'nesttableprops', u'nextfile',
        u'nonesttables', u'objalias', u'objclass', u'objdata', u'object',
        u'objname', u'objsect', u'objtime', u'oldcprops', u'oldpprops',
        u'oldsprops', u'oldtprops', u'oleclsid', u'operator', u'panose',
        u'password', u'passwordhash', u'pgp', u'pgptbl', u'picprop', u'pict',
        u'pn', u'pnseclvl', u'pntext', u'pntxta', u'pntxtb', u'printim',
        u'private', u'propname', u'protend', u'protstart', u'protusertbl',
        u'pxe', u'result', u'revtbl', u'revtim', u'rsidtbl', u'rxe', u'shp',
        u'shpgrp', u'shpinst', u'shppict', u'shprslt', u'shptxt', u'sn', u'sp',
        u'staticval', u'stylesheet', u'subject', u'sv', u'svb', u'tc',
        u'template', u'themedata', u'title', u'txe', u'ud', u'upr',
        u'userprops', u'wgrffmtfilter', u'windowcaption', u'writereservation',
        u'writereservhash', u'xe', u'xform', u'xmlattrname', u'xmlattrvalue',
        u'xmlclose', u'xmlname', u'xmlnstbl', u'xmlopen'))
    # Translation of some special characters.
    specialchars = {
        u'par': u'\n',
        u'sect': u'\n\n',
        u'page': u'\n\n',
        u'line': u'\n',
        u'tab': u'\t',
        u'emdash': u'\u2014',
        u'endash': u'\u2013',
        u'emspace': u'\u2003',
        u'enspace': u'\u2002',
        u'qmspace': u'\u2005',
        u'bullet': u'\u2022',
        u'lquote': u'\u2018',
        u'rquote': u'\u2019',
        u'ldblquote': u'\u201C',
        u'rdblquote': u'\u201D'}
    # \fcharsetN -> Python codec name. A value of None means the charset is
    # recognised but has no codec assigned here, so the caller supplied
    # default encoding is used instead.
    charset_mapping = {
        u'fcharset0': u'cp1252',
        u'fcharset1': None,
        u'fcharset2': None,
        u'fcharset77': None,
        u'fcharset128': None,
        u'fcharset129': None,
        u'fcharset130': None,
        u'fcharset134': None,
        u'fcharset136': None,
        u'fcharset161': u'cp1253',
        u'fcharset162': u'cp1254',
        u'fcharset163': u'cp1258',
        u'fcharset177': u'cp1255',
        u'fcharset178': u'cp1256',
        u'fcharset186': u'cp1257',
        u'fcharset204': u'cp1251',
        u'fcharset222': u'cp874',
        u'fcharset238': u'cp1250'}

    def strip_rtf(self, text, default_encoding=None):
        """
        Convert RTF markup to plain text.

        ``text``
            The RTF source to strip.

        ``default_encoding``
            Codec name used to decode hex escapes until a font with a known
            character set has been selected, and whenever a font or charset
            is not known.

        Returns the text content as a unicode string.
        """
        # ``unichr`` exists on Python 2 only; Python 3's ``chr`` is the same.
        try:
            to_unichar = unichr
        except NameError:
            to_unichar = chr
        # Current font is the font tag we last met.
        font = u''
        # Character encoding is defined inside fonttable.
        # font_table could contain eg u'0': u'cp1252'
        # NOTE(review): the u'' entry is written by \ansicpg and \fcharset
        # when no font is selected, but no lookup in this method ever reads
        # it back - confirm whether it should become the active encoding.
        font_table = {u'': default_encoding}
        # Encoding used for hex escapes; start with the caller's default so
        # an escape that precedes any \f command can still be decoded.
        encoding = default_encoding
        # Whether we are inside the font table.
        inside_font_table = False
        # Stack of things to keep track of when entering/leaving groups.
        stack = []
        # Whether this group (and all inside it) are "ignorable".
        ignorable = False
        # Number of ASCII characters to skip after an unicode character.
        ucskip = 1
        # Number of ASCII characters left to skip.
        curskip = 0
        # Output buffer.
        out = []
        for match in self.pattern.finditer(text):
            word, arg, hexcode, char, brace, tchar = match.groups()
            if brace:
                curskip = 0
                if brace == u'{':
                    # Push state
                    stack.append((ucskip, ignorable, font, inside_font_table))
                elif stack:
                    # Pop state. A stray closing brace in malformed input is
                    # ignored rather than raising IndexError.
                    ucskip, ignorable, font, inside_font_table = stack.pop()
            # \x (not a letter)
            elif char:
                curskip = 0
                if char == u'~':
                    if not ignorable:
                        out.append(u'\xA0')
                elif char in u'{}\\':
                    if not ignorable:
                        out.append(char)
                elif char == u'*':
                    ignorable = True
            # \command
            elif word:
                curskip = 0
                if word in self.destinations:
                    ignorable = True
                elif word in self.specialchars:
                    # Nothing inside an ignored destination may reach the
                    # output, special characters included.
                    if not ignorable:
                        out.append(self.specialchars[word])
                elif word == u'uc':
                    if arg is not None:
                        ucskip = int(arg)
                elif word == u'u':
                    if arg is not None:
                        # \uN carries a signed 16 bit value.
                        codepoint = int(arg)
                        if codepoint < 0:
                            codepoint += 0x10000
                        if not ignorable:
                            out.append(to_unichar(codepoint))
                        # The fallback characters that follow must be skipped.
                        curskip = ucskip
                elif word == u'fonttbl':
                    inside_font_table = True
                    ignorable = True
                elif word == u'f':
                    font = arg
                    if not inside_font_table:
                        encoding = font_table.get(arg, default_encoding)
                elif word == u'ansicpg':
                    if arg is not None and \
                        (inside_font_table or font == u''):
                        font_table[font] = 'cp' + arg
                elif word == u'fcharset':
                    charset_reference = word + (arg or u'')
                    if charset_reference in self.charset_mapping:
                        charset = self.charset_mapping[charset_reference]
                        if not charset:
                            charset = default_encoding
                    else:
                        log.error(u"Charset '%s' not in charset_mapping "
                            u"dictionary in "
                            u"openlp/plugins/songs/lib/__init__.py",
                            charset_reference)
                        charset = default_encoding
                    if inside_font_table or font == u'':
                        font_table[font] = charset
            # \'xx
            elif hexcode:
                if curskip > 0:
                    curskip -= 1
                elif not ignorable:
                    charcode = int(hexcode, 16)
                    # Retry with an encoding obtained from
                    # retrieve_windows_encoding() until the byte decodes.
                    # NOTE(review): what that helper returns when the user
                    # declines is not visible here - confirm this cannot
                    # loop or pass None to decode().
                    while True:
                        try:
                            out.append(
                                bytearray((charcode,)).decode(encoding))
                        except UnicodeDecodeError:
                            encoding = \
                                retrieve_windows_encoding(default_encoding)
                            if font:
                                font_table[font] = encoding
                        else:
                            break
            elif tchar:
                if curskip > 0:
                    curskip -= 1
                elif not ignorable:
                    out.append(tchar)
        return u''.join(out)
from xml import OpenLyrics, SongXML from xml import OpenLyrics, SongXML
from songstab import SongsTab from songstab import SongsTab
from mediaitem import SongMediaItem from mediaitem import SongMediaItem

View File

@ -35,7 +35,7 @@ import re
from openlp.core.lib import translate from openlp.core.lib import translate
from openlp.plugins.songs.lib import VerseType from openlp.plugins.songs.lib import VerseType
from openlp.plugins.songs.lib import retrieve_windows_encoding from openlp.plugins.songs.lib import retrieve_windows_encoding, StripRtf
from songimport import SongImport from songimport import SongImport
RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}') RTF_STRIPPING_REGEX = re.compile(r'\{\\tx[^}]*\}')
@ -44,101 +44,6 @@ SLIDE_BREAK_REGEX = re.compile(r'\n *?\n[\n ]*')
NUMBER_REGEX = re.compile(r'[0-9]+') NUMBER_REGEX = re.compile(r'[0-9]+')
NOTE_REGEX = re.compile(r'\(.*?\)') NOTE_REGEX = re.compile(r'\(.*?\)')
def strip_rtf(blob, encoding):
# Strip RTF markup from ``blob`` with a character-by-character state machine
# and return the remaining text joined into one unicode string.
# ``encoding`` is the codec used to decode \'xx hex escapes; it is replaced
# whenever one of the \fcharsetN control words listed below is met.
# NOTE(review): indentation was lost in this view, so the nesting of the
# statements below has to be inferred - confirm against the real file.
# Brace nesting level of the current position.
depth = 0
# True while the characters of a control word (after a backslash) are read.
control = False
# Output buffer.
clear_text = []
# Characters of the control word collected so far.
control_word = []
# workaround for \tx bug: remove one pair of curly braces
# if \tx is encountered
match = RTF_STRIPPING_REGEX.search(blob)
if match:
# start and end indices of match are curly braces - filter them out
# NOTE(review): match.end() is the index just past the match, so the
# character dropped here is the one following the closing brace, not the
# brace itself - confirm this is intended.
blob = ''.join([blob[i] for i in xrange(len(blob))
if i != match.start() and i !=match.end()])
for c in blob:
if control:
# for delimiters, set control to False
if c == '{':
if control_word:
depth += 1
control = False
elif c == '}':
if control_word:
depth -= 1
control = False
elif c == '\\':
# A backslash that ends a non-empty control word also starts the next one.
new_control = bool(control_word)
control = False
elif c.isspace():
control = False
else:
control_word.append(c)
# A hex escape is complete after the quote and two hex digits.
if len(control_word) == 3 and control_word[0] == '\'':
control = False
if not control:
if not control_word:
# Backslash directly followed by {, } or \ is an escaped literal.
if c == '{' or c == '}' or c == '\\':
clear_text.append(c)
else:
control_str = ''.join(control_word)
if control_str == 'par' or control_str == 'line':
clear_text.append(u'\n')
elif control_str == 'tab':
clear_text.append(u'\t')
# Prefer the encoding specified by the RTF data to that
# specified by the Paradox table header
# West European encoding
elif control_str == 'fcharset0':
encoding = u'cp1252'
# Greek encoding
elif control_str == 'fcharset161':
encoding = u'cp1253'
# Turkish encoding
elif control_str == 'fcharset162':
encoding = u'cp1254'
# Vietnamese encoding
elif control_str == 'fcharset163':
encoding = u'cp1258'
# Hebrew encoding
elif control_str == 'fcharset177':
encoding = u'cp1255'
# Arabic encoding
elif control_str == 'fcharset178':
encoding = u'cp1256'
# Baltic encoding
elif control_str == 'fcharset186':
encoding = u'cp1257'
# Cyrillic encoding
elif control_str == 'fcharset204':
encoding = u'cp1251'
# Thai encoding
elif control_str == 'fcharset222':
encoding = u'cp874'
# Central+East European encoding
elif control_str == 'fcharset238':
encoding = u'cp1250'
# \'xx: decode the single byte with the current encoding.
elif control_str[0] == '\'':
s = chr(int(control_str[1:3], 16))
clear_text.append(s.decode(encoding))
# Reset the collector for the next control word.
del control_word[:]
if c == '\\' and new_control:
control = True
elif c == '{':
depth += 1
elif c == '}':
depth -= 1
# Plain characters nested deeper than two groups are dropped.
elif depth > 2:
continue
# Raw line breaks in the RTF source carry no meaning.
elif c == '\n' or c == '\r':
continue
elif c == '\\':
control = True
else:
clear_text.append(c)
return u''.join(clear_text)
class FieldDescEntry: class FieldDescEntry:
def __init__(self, name, type, size): def __init__(self, name, type, size):
@ -154,6 +59,7 @@ class EasyWorshipSongImport(SongImport):
""" """
def __init__(self, manager, **kwargs): def __init__(self, manager, **kwargs):
SongImport.__init__(self, manager, **kwargs) SongImport.__init__(self, manager, **kwargs)
self.rtf = StripRtf()
def doImport(self): def doImport(self):
# Open the DB and MB files if they exist # Open the DB and MB files if they exist
@ -273,7 +179,7 @@ class EasyWorshipSongImport(SongImport):
self.addAuthor(author_name.strip()) self.addAuthor(author_name.strip())
if words: if words:
# Format the lyrics # Format the lyrics
words = strip_rtf(words, self.encoding) words = self.rtf.strip_rtf(words, self.encoding)
verse_type = VerseType.Tags[VerseType.Verse] verse_type = VerseType.Tags[VerseType.Verse]
for verse in SLIDE_BREAK_REGEX.split(words): for verse in SLIDE_BREAK_REGEX.split(words):
verse = verse.strip() verse = verse.strip()

View File

@ -26,11 +26,11 @@
############################################################################### ###############################################################################
import logging import logging
import os
import re import re
from os.path import split
from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding from openlp.plugins.songs.lib import VerseType, retrieve_windows_encoding
from openlp.plugins.songs.lib import StripRtf
from openlp.plugins.songs.lib.songimport import SongImport from openlp.plugins.songs.lib.songimport import SongImport
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -42,7 +42,7 @@ class SundayPlusImport(SongImport):
The format examples can be found attached to bug report at The format examples can be found attached to bug report at
<http://support.openlp.org/issues/395> <http://support.openlp.org/issues/395>
""" """
hotkey_to_verse_type = { hotkeyToVerseType = {
u'1': u'v1', u'1': u'v1',
u'2': u'v2', u'2': u'v2',
u'3': u'v3', u'3': u'v3',
@ -61,6 +61,7 @@ class SundayPlusImport(SongImport):
Initialise the class. Initialise the class.
""" """
SongImport.__init__(self, manager, **kwargs) SongImport.__init__(self, manager, **kwargs)
self.rtf = StripRtf()
def doImport(self): def doImport(self):
self.importWizard.progressBar.setMaximum(len(self.importSource)) self.importWizard.progressBar.setMaximum(len(self.importSource))
@ -81,32 +82,33 @@ class SundayPlusImport(SongImport):
self.logError(file.name) self.logError(file.name)
return return
if self.title == '': if self.title == '':
self.title = self.title_from_filename(file.name) self.title = self.titleFromFilename(file.name)
if not self.finish(): if not self.finish():
self.logError(file.name) self.logError(file.name)
def parse(self, data, cell = False): def parse(self, data, cell = False):
if data[0] != '[' and data[-1] != ']': if len(data) == 0 or data[0:1] != '[' or data[-1] != ']':
self.logError(u'File is malformed') self.logError(u'File is malformed')
return False return False
i = 1 i = 1
verse_type = VerseType.Tags[VerseType.Verse] verse_type = VerseType.Tags[VerseType.Verse]
while i < len(data): while i < len(data):
byte = data[i] # Data is held as #name: value pairs inside groups marked as [].
if byte == '#': # Now we are looking for name.
end = data.find(':', i+1) if data[i:i+1] == '#':
name = data[i+1:end] name_end = data.find(':', i+1)
i = end + 1 name = data[i+1:name_end]
while data[i] == ' ': i = name_end + 1
while data[i:i+1] == ' ':
i += 1 i += 1
if data[i] == '"': if data[i:i+1] == '"':
end = data.find('"', i+1) end = data.find('"', i+1)
value = data[i+1:end] value = data[i+1:end]
elif data[i] == '[': elif data[i:i+1] == '[':
j = i j = i
inside_quotes = False inside_quotes = False
while j < len(data): while j < len(data):
char = data[j] char = data[j:j+1]
if char == '"': if char == '"':
inside_quotes = not inside_quotes inside_quotes = not inside_quotes
elif not inside_quotes and char == ']': elif not inside_quotes and char == ']':
@ -119,6 +121,7 @@ class SundayPlusImport(SongImport):
if data.find('(', i, end) != -1: if data.find('(', i, end) != -1:
end = data.find(')', i) + 1 end = data.find(')', i) + 1
value = data[i:end] value = data[i:end]
# If we are in the main group.
if cell == False: if cell == False:
if name == 'title': if name == 'title':
self.title = self.decode(self.unescape(value)) self.title = self.decode(self.unescape(value))
@ -130,6 +133,7 @@ class SundayPlusImport(SongImport):
self.copyright = self.decode(self.unescape(value)) self.copyright = self.decode(self.unescape(value))
elif name[0:4] == 'CELL': elif name[0:4] == 'CELL':
self.parse(value, cell = name[4:]) self.parse(value, cell = name[4:])
# We are in a verse group.
else: else:
if name == 'MARKER_NAME': if name == 'MARKER_NAME':
value = value.strip() value = value.strip()
@ -141,23 +145,27 @@ class SundayPlusImport(SongImport):
verse_type = "%s%s" % (verse_type, value[-1]) verse_type = "%s%s" % (verse_type, value[-1])
elif name == 'Hotkey': elif name == 'Hotkey':
# Hotkey always appears after MARKER_NAME, so it # Hotkey always appears after MARKER_NAME, so it
# effectivetly overrides MARKER_NAME, if present. # effectively overrides MARKER_NAME, if present.
if len(value) and \ if len(value) and \
value in self.hotkey_to_verse_type.keys(): value in self.hotkeyToVerseType.keys():
verse_type = self.hotkey_to_verse_type[value] verse_type = self.hotkeyToVerseType[value]
if name == 'rtf': if name == 'rtf':
value = self.unescape(value) value = self.unescape(value)
verse = self.strip_rtf(value, self.encoding).strip() verse = self.rtf.strip_rtf(value, self.encoding)
lines = verse.split('\n') lines = verse.strip().split('\n')
# If any line inside any verse contains CCLI or
# only Public Domain, we treat this as special data:
# we remove that line and add data to specific field.
for i in xrange(len(lines)): for i in xrange(len(lines)):
lines[i] = lines[i].strip() lines[i] = lines[i].strip()
line = lines[i] line = lines[i]
if line[:4] in u'CCLI': if line[:4].lower() == u'ccli':
m = re.search(r'[0-9]+', line) m = re.search(r'[0-9]+', line)
if m: if m:
self.ccliNumber = int(m.group(0)) self.ccliNumber = int(m.group(0))
lines.pop(i) lines.pop(i)
elif line.lower() == u'public domain': elif line.lower() == u'public domain':
self.copyright = u'Public Domain'
lines.pop(i) lines.pop(i)
self.addVerse('\n'.join(lines).strip(), verse_type) self.addVerse('\n'.join(lines).strip(), verse_type)
if end == -1: if end == -1:
@ -166,13 +174,12 @@ class SundayPlusImport(SongImport):
i += 1 i += 1
return True return True
def title_from_filename(self, filename): def titleFromFilename(self, filename):
filename = split(filename)[1] title = os.path.split(filename)[1]
if len(filename) > 4 and filename[-4:].lower() == u'.ptf': if title.endswith(u'.ptf'):
title = filename[:-4] title = title[:-4]
else: # For some strange reason all example files names ended with 1-7.
title = filename if title.endswith('1-7'):
if title[-3:] == '1-7':
title = title[:-3] title = title[:-3]
return title.replace(u'_', u' ') return title.replace(u'_', u' ')
@ -190,190 +197,3 @@ class SundayPlusImport(SongImport):
text = text.replace('^', '\'') text = text.replace('^', '\'')
return text.strip() return text.strip()
def strip_rtf(self, text, encoding):
    """
    Strip RTF control structures from ``text`` and return the plain text.

    ``text``
        The RTF source to strip.

    ``encoding``
        Codec name used to decode hex escapes until the RTF data itself
        selects another one; also the fallback for fonts and character
        sets that are not known.

    Returns the text content as a unicode string.
    """
    # Thanks to Markus Jarderot (MizardX) for this code, used by permission
    # <http://stackoverflow.com/questions/188545/regular-expression-for-
    # extracting-text-from-an-rtf-string>
    pattern = re.compile(r"\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'"
        r"([0-9a-f]{2})|\\([^a-z])|([{}])|[\r\n]+|(.)", re.I)
    # Control words which specify a "destination" and we can ignore it.
    destinations = frozenset((
        'aftncn', 'aftnsep', 'aftnsepc', 'annotation', 'atnauthor',
        'atndate', 'atnicn', 'atnid', 'atnparent', 'atnref', 'atntime',
        'atrfend', 'atrfstart', 'author', 'background', 'bkmkend',
        'bkmkstart', 'blipuid', 'buptim', 'category', 'colorschememapping',
        'colortbl', 'comment', 'company', 'creatim', 'datafield',
        'datastore', 'defchp', 'defpap', 'do', 'doccomm', 'docvar',
        'dptxbxtext', 'ebcend', 'ebcstart', 'factoidname', 'falt', 'fchars',
        'ffdeftext', 'ffentrymcr', 'ffexitmcr', 'ffformat', 'ffhelptext',
        'ffl', 'ffname', 'ffstattext', 'field', 'file', 'filetbl',
        'fldinst', 'fldrslt', 'fldtype', 'fname', 'fontemb', 'fontfile',
        'footer', 'footerf', 'footerl', 'footerr', 'footnote',
        'formfield', 'ftncn', 'ftnsep', 'ftnsepc', 'g', 'generator',
        'gridtbl', 'header', 'headerf', 'headerl', 'headerr', 'hl', 'hlfr',
        'hlinkbase', 'hlloc', 'hlsrc', 'hsv', 'htmltag', 'info', 'keycode',
        'keywords', 'latentstyles', 'lchars', 'levelnumbers', 'leveltext',
        'lfolevel', 'linkval', 'list', 'listlevel', 'listname',
        'listoverride', 'listoverridetable', 'listpicture', 'liststylename',
        'listtable', 'listtext', 'lsdlockedexcept', 'macc', 'maccPr',
        'mailmerge', 'maln', 'malnScr', 'manager', 'margPr', 'mbar',
        'mbarPr', 'mbaseJc', 'mbegChr', 'mborderBox', 'mborderBoxPr',
        'mbox', 'mboxPr', 'mchr', 'mcount', 'mctrlPr', 'md', 'mdeg',
        'mdegHide', 'mden', 'mdiff', 'mdPr', 'me', 'mendChr', 'meqArr',
        'meqArrPr', 'mf', 'mfName', 'mfPr', 'mfunc', 'mfuncPr', 'mgroupChr',
        'mgroupChrPr', 'mgrow', 'mhideBot', 'mhideLeft', 'mhideRight',
        'mhideTop', 'mhtmltag', 'mlim', 'mlimloc', 'mlimlow', 'mlimlowPr',
        'mlimupp', 'mlimuppPr', 'mm', 'mmaddfieldname', 'mmath',
        'mmathPict', 'mmathPr', 'mmaxdist', 'mmc', 'mmcJc', 'mmconnectstr',
        'mmconnectstrdata', 'mmcPr', 'mmcs', 'mmdatasource',
        'mmheadersource', 'mmmailsubject', 'mmodso', 'mmodsofilter',
        'mmodsofldmpdata', 'mmodsomappedname', 'mmodsoname',
        'mmodsorecipdata', 'mmodsosort', 'mmodsosrc', 'mmodsotable',
        'mmodsoudl', 'mmodsoudldata', 'mmodsouniquetag', 'mmPr', 'mmquery',
        'mmr', 'mnary', 'mnaryPr', 'mnoBreak', 'mnum', 'mobjDist', 'moMath',
        'moMathPara', 'moMathParaPr', 'mopEmu', 'mphant', 'mphantPr',
        'mplcHide', 'mpos', 'mr', 'mrad', 'mradPr', 'mrPr', 'msepChr',
        'mshow', 'mshp', 'msPre', 'msPrePr', 'msSub', 'msSubPr', 'msSubSup',
        'msSubSupPr', 'msSup', 'msSupPr', 'mstrikeBLTR', 'mstrikeH',
        'mstrikeTLBR', 'mstrikeV', 'msub', 'msubHide', 'msup', 'msupHide',
        'mtransp', 'mtype', 'mvertJc', 'mvfmf', 'mvfml', 'mvtof', 'mvtol',
        'mzeroAsc', 'mzeroDesc', 'mzeroWid', 'nesttableprops', 'nextfile',
        'nonesttables', 'objalias', 'objclass', 'objdata', 'object',
        'objname', 'objsect', 'objtime', 'oldcprops', 'oldpprops',
        'oldsprops', 'oldtprops', 'oleclsid', 'operator', 'panose',
        'password', 'passwordhash', 'pgp', 'pgptbl', 'picprop', 'pict',
        'pn', 'pnseclvl', 'pntext', 'pntxta', 'pntxtb', 'printim',
        'private', 'propname', 'protend', 'protstart', 'protusertbl', 'pxe',
        'result', 'revtbl', 'revtim', 'rsidtbl', 'rxe', 'shp', 'shpgrp',
        'shpinst', 'shppict', 'shprslt', 'shptxt', 'sn', 'sp', 'staticval',
        'stylesheet', 'subject', 'sv', 'svb', 'tc', 'template', 'themedata',
        'title', 'txe', 'ud', 'upr', 'userprops', 'wgrffmtfilter',
        'windowcaption', 'writereservation', 'writereservhash', 'xe',
        'xform', 'xmlattrname', 'xmlattrvalue', 'xmlclose', 'xmlname',
        'xmlnstbl', 'xmlopen'))
    # Translation of some special characters.
    specialchars = {
        u'par': u'\n',
        u'sect': u'\n\n',
        u'page': u'\n\n',
        u'line': u'\n',
        u'tab': u'\t',
        u'emdash': u'\u2014',
        u'endash': u'\u2013',
        u'emspace': u'\u2003',
        u'enspace': u'\u2002',
        u'qmspace': u'\u2005',
        u'bullet': u'\u2022',
        u'lquote': u'\u2018',
        u'rquote': u'\u2019',
        u'ldblquote': u'\u201C',
        u'rdblquote': u'\u201D'}
    charset_mapping = {
        # Thai encoding
        'fcharset222': u'cp874',
        'ansicpg874': u'cp874',
        # Central+East European encoding
        'fcharset238': u'cp1250',
        'ansicpg1250': u'cp1250',
        # Cyrillic encoding
        'fcharset204': u'cp1251',
        'ansicpg1251': u'cp1251',
        # West European encoding
        'fcharset0': u'cp1252',
        'ansicpg1252': u'cp1252',
        # Greek encoding
        'fcharset161': u'cp1253',
        'ansicpg1253': u'cp1253',
        # Turkish encoding
        'fcharset162': u'cp1254',
        'ansicpg1254': u'cp1254',
        # Hebrew encoding
        'fcharset177': u'cp1255',
        'ansicpg1255': u'cp1255',
        # Arabic encoding
        'fcharset178': u'cp1256',
        'ansicpg1256': u'cp1256',
        # Baltic encoding
        'fcharset186': u'cp1257',
        'ansicpg1257': u'cp1257',
        # Vietnamese encoding
        'fcharset163': u'cp1258',
        'ansicpg1258': u'cp1258'}
    # ``unichr`` exists on Python 2 only; Python 3's ``chr`` is the same.
    try:
        to_unichar = unichr
    except NameError:
        to_unichar = chr
    # Remember what the caller asked for; used whenever the RTF data names
    # a font or character set this method does not know.
    default_encoding = encoding
    # Character encoding is defined together with fonts.
    # font_table could contain eg '0': 'cp1252'
    font_table = {}
    stack = []
    # Whether this group (and all inside it) are "ignorable".
    ignorable = False
    # Whether we are inside the font table.
    inside_font_table = False
    current_font = ''
    # Number of ASCII characters to skip after an unicode character.
    ucskip = 1
    # Number of ASCII characters left to skip.
    curskip = 0
    # Output buffer.
    out = []
    for match in pattern.finditer(text):
        word, arg, hexcode, char, brace, tchar = match.groups()
        if brace:
            curskip = 0
            if brace == u'{':
                # Push state
                stack.append((ucskip, ignorable, inside_font_table))
            elif stack:
                # Pop state. A stray closing brace in malformed input is
                # ignored rather than raising IndexError.
                ucskip, ignorable, inside_font_table = stack.pop()
        # \x (not a letter)
        elif char:
            curskip = 0
            if char == '~':
                if not ignorable:
                    out.append(u'\xA0')
            elif char in u'{}\\':
                if not ignorable:
                    out.append(char)
            elif char == u'*':
                ignorable = True
        # \foo
        elif word:
            curskip = 0
            if word in destinations:
                ignorable = True
            elif word in specialchars:
                # Nothing inside an ignored destination may reach the
                # output, special characters included.
                if not ignorable:
                    out.append(specialchars[word])
            elif word == u'uc':
                if arg is not None:
                    ucskip = int(arg)
            elif word == u'u':
                if arg is not None:
                    # \uN carries a signed 16 bit value.
                    codepoint = int(arg)
                    if codepoint < 0:
                        codepoint += 0x10000
                    if not ignorable:
                        out.append(to_unichar(codepoint))
                    # The fallback characters that follow must be skipped.
                    curskip = ucskip
            elif word == 'fonttbl':
                inside_font_table = True
                ignorable = True
            elif word == 'f':
                current_font = arg
                if not inside_font_table:
                    # A font missing from the table must not abort the
                    # import; fall back to the caller's encoding.
                    encoding = font_table.get(arg, default_encoding)
            elif word in ('ansicpg', 'fcharset'):
                # Unmapped code pages / charsets yield None instead of
                # raising KeyError.
                charset = charset_mapping.get(word + (arg or ''))
                if inside_font_table:
                    font_table[current_font] = charset or default_encoding
                elif charset:
                    encoding = charset
        # \'xx
        elif hexcode:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                charcode = int(hexcode, 16)
                out.append(bytearray((charcode,)).decode(encoding))
        elif tchar:
            if curskip > 0:
                curskip -= 1
            elif not ignorable:
                out.append(tchar)
    return u''.join(out)