Explain a complex regex used by strip_rtf

2013-09-10 15:36:46 -05:00 · 2013-09-10 15:36:46 -05:00 · 30618ad60c
commit 30618ad60c
parent 53ac150337
1 changed files with 6 additions and 0 deletions
--- a/openlp/plugins/songs/lib/init.py
+++ b/openlp/plugins/songs/lib/init.py
@@ -46,6 +46,12 @@ log = logging.getLogger(__name__)

 WHITESPACE = re.compile(r'[\W_]+', re.UNICODE)
 APOSTROPHE = re.compile('[\'`’ʻ′]', re.UNICODE)
+# PATTERN will look for the next occurrence of one of these symbols:
+#   \controlword - optionally preceded by \*, optionally followed by a number
+#   \'## - where ## is a pair of hex digits, representing a single character
+#   \# - where # is a single non-alpha character, representing a special symbol
+#   { or } - marking the beginning/end of a group
+#   a run of characters without any \ { } or end-of-line
 PATTERN = re.compile(r"(\\\*)?\\([a-z]{1,32})(-?\d{1,10})?[ ]?|\\'([0-9a-f]{2})|\\([^a-z*])|([{}])|[\r\n]+|([^\\{}\r\n]+)", re.I)
 # RTF control words which specify a "destination" to be ignored.
 DESTINATIONS = frozenset((