root/packages/feaTools/trunk/Lib/feaTools/parser.py

Revision 110, 14.7 kB (checked in by tal, 3 years ago)
- Added support for enum in GPOS Lookup Type 2.
- The parser will raise an error when it encounters something it can't parse. Specifically, it will raise FeaToolsParserSyntaxError.
Line 
1 import re
2
3
class FeaToolsParserSyntaxError(Exception):
    """Error raised for feature text that the parser cannot interpret.

    The message (or offending text) handed to the constructor is kept on
    the ``value`` attribute.
    """

    def __init__(self, value):
        self.value = value

    def __str__(self):
        # Show the repr so quoting and escapes in the message stay visible.
        return "%r" % (self.value,)
11
12
# used for removing all comments: a "#" and everything after it on the line
commentRE = re.compile(r"#.*")

# used for finding every statement terminator (";") so that a space
# can be inserted after each one before the other expressions are applied
terminatorRE = re.compile(r";")
18
# used for finding all feature names.
# (raw strings keep the regex escapes out of Python's own escape handling)
feature_findAll_RE = re.compile(
        r"([\s;\{\}]|^)"       # whitespace, ;, {, } or start of the text
        r"feature\s+"          # feature
        r"([\w\d]{4})"         # name
        r"\s*\{"               # {
        )

# used for finding the content of features.
# this regular expression will be compiled
# for each feature name found: the name is
# inserted at index 2 and then at index 6.
# NOTE: the content group is greedy, so the match runs to the
# LAST "} name;" in the searched text.
featureContentRE = [
        r"([\s;\{\}]|^)",      # whitespace, ;, {, } or start of the text
        r"feature\s+",         # feature
        # feature name         # name
        r"\s*\{",              # {
        r"([\S\s]*)",          # content
        r"\}\s*",              # }
        # feature name         # name
        r"\s*;"                # ;
        ]
40
# used for finding all lookup names.
# (raw strings keep the regex escapes out of Python's own escape handling)
lookup_findAll_RE = re.compile(
        r"([\s;\{\}]|^)"       # whitespace, ;, {, } or start of the text
        r"lookup\s+"           # lookup
        r"([\w\d_.]+)"         # name
        r"\s*\{"               # {
        )

# used for finding the content of lookups.
# this regular expression will be compiled
# for each lookup name found: the name is
# inserted at index 2 and then at index 6.
# NOTE: the content group is greedy, so the match runs to the
# LAST "} name;" in the searched text.
lookupContentRE = [
        r"([\s;\{\}]|^)",      # whitespace, ;, {, } or start of the text
        r"lookup\s+",          # lookup
        # lookup name          # name
        r"\s*\{",              # {
        r"([\S\s]*)",          # content
        r"\}\s*",              # }
        # lookup name          # name
        r"\s*;"                # ;
        ]
62
# used for finding all class definitions.
# (raw strings keep the regex escapes out of Python's own escape handling)
classDefinitionRE = re.compile(
        r"([\s;\{\}]|^)"       # whitespace, ;, {, } or start of line
        r"@"                   # @
        r"([\w\d_.]+)"         # name
        r"\s*=\s*"             #  =
        r"\["                  # [
        r"([\w\d\s_.@]+)"      # content
        r"\]"                  # ]
        r"\s*;",               # ;
        re.M                   # let ^ match at the start of every line
        )

# used for getting the contents of a class definition
classContentRE = re.compile(
        r"([\w\d_.@]+)"        # one glyph or class name
        )

# used for finding inline classes within a sequence
sequenceInlineClassRE = re.compile(
        r"\["                  # [
        r"([\w\d\s_.@]+)"      # content
        r"\]"                  # ]
        )
87
# used for finding all substitution type 1 and type 4
# The keyword alternation is wrapped in a non-capturing group; written
# as a bare "substitute|sub\s+" the "|" split the WHOLE pattern in two,
# so "substitute ..." only ever matched the keyword with empty groups.
subType1And4RE = re.compile(
        r"([\s;\{\}]|^)"           # whitespace, ;, {, } or start of the text
        r"(?:substitute|sub)\s+"   # substitute or sub
        r"([\w\d\s_.@\[\]]+)"      # target
        r"\s+by\s+"                #  by
        r"([\w\d\s_.@\[\]]+)"      # replacement
        r"\s*;"                    # ;
        )

# used for finding all substitution type 3
subType3RE = re.compile(
        r"([\s;\{\}]|^)"           # whitespace, ;, {, } or start of the text
        r"(?:substitute|sub)\s+"   # substitute or sub
        r"([\w\d\s_.@\[\]]+)"      # target
        r"\s+from\s+"              #  from
        r"([\w\d\s_.@\[\]]+)"      # replacement
        r"\s*;"                    # ;
        )
107
# used for finding all substitution type 6
# XXX see failing unit test
# The keyword alternation is wrapped in a non-capturing group; written
# as a bare "substitute|sub\s+" the "|" split the WHOLE pattern in two,
# so "substitute ..." only ever matched the keyword with empty groups.
subType6RE = re.compile(
        r"([\s;\{\}]|^)"           # whitespace, ;, {, } or start of the text
        r"(?:substitute|sub)\s+"   # substitute or sub
        r"([\w\d\s_.@\[\]']+)"     # preceding context, target, trailing context
        r"\s+by\s+"                #  by
        r"([\w\d\s_.@\[\]]+)"      # replacement
        r"\s*;"                    # ;
        )

# used for finding each marked (') item of a type 6 target. the
# captured group is the item without its trailing '
subType6TargetRE = re.compile(
        r"(\["                 # [
        r"[\w\d\s_.@]+"        # content
        r"\]"                  # ]
        r"|"                   # <or>
        r"[\w\d_.@]+)'"        # content followed by '
        )

subType6TargetExtractRE = re.compile(
        r"([\w\d_.@]*)"        # glyph or class names
        )
130
# used for finding positioning type 1
# The keyword alternation is wrapped in a non-capturing group; written
# as a bare "position|pos\s+" the "|" split the WHOLE pattern in two,
# so "position ..." only ever matched the keyword with empty groups.
posType1RE = re.compile(
    r"([\s;\{\}]|^)"         # whitespace, ;, {, } or start of the text
    r"(?:position|pos)\s+"   # position or pos
    r"([\w\d\s_.@\[\]]+)"    # target
    r"\s+<"                  # <
    r"([-\d\s]+)"            # value
    r"\s*>\s*;"              # >;
    )

# used for finding positioning type 2
# NOTE: here the alternation is already confined by the capturing group.
# the "\s+" belongs to "pos" only; after "position" the whitespace is
# absorbed by the following group, which accepts whitespace.
posType2RE = re.compile(
    r"([\s;\{\}]|^)"        # whitespace, ;, {, } or start of the text
    r"(enum\s+|\s*)"        # enum
    r"(position|pos\s+)"    # pos
    r"([-\w\d\s_.@\[\]]+)"  # left, right, value
    r"\s*;"                 # ;
    )
149
# used for finding all languagesystem
# (raw strings keep the regex escapes out of Python's own escape handling)
languagesystemRE = re.compile(
        r"([\s;\{\}]|^)"       # whitespace, ;, {, } or start of the text
        r"languagesystem\s+"   # languagesystem
        r"([\w\d]+)"           # script tag
        r"\s+"                 #
        r"([\w\d]+)"           # language tag
        r"\s*;"                # ;
        )

# used for finding all script
scriptRE = re.compile(
        r"([\s;\{\}]|^)"       # whitespace, ;, {, } or start of the text
        r"script\s+"           # script
        r"([\w\d]+)"           # script tag
        r"\s*;"                # ;
        )

# used for finding all language
languageRE = re.compile(
        r"([\s;\{\}]|^)"       # whitespace, ;, {, } or start of the text
        r"language\s+"         # language
        r"([\w\d]+)"           # language tag
        r"\s*"                 #
        r"([\w\d]*)"           # include_dflt or exclude_dflt or nothing
        r"\s*;"                # ;
        )

# used for finding all includes
includeRE = re.compile(
        r"([\s;\{\}]|^)"       # whitespace, ;, {, } or start of the text
        r"include\s*"          # include
        r"\(\s*"               # (
        r"([^\)]+)"            # anything but )
        r"\s*\)"               # )
        r"\s*;?"               # ; which will occur zero or one times (ugh!)
        )
187
def _sliceNamedBlock(text, contentREParts, name, precedingMark, kind):
    """Cut the ``<kind> <name> { ... } <name>;`` block out of *text*.

    *contentREParts* is the list of pattern fragments (featureContentRE or
    lookupContentRE); the block name is inserted at index 2 and index 6
    before compiling. *kind* is only used in the error message.

    Returns a tuple of (text before the block, block content, text after
    the block). Raises FeaToolsParserSyntaxError if the block cannot be
    matched.
    """
    # a regular expression specific to this name must be created
    # so that nested blocks are safely handled. the name is escaped
    # because lookup names may contain ".", a regex metacharacter.
    escapedName = re.escape(name)
    parts = list(contentREParts)
    parts.insert(2, escapedName)
    parts.insert(6, escapedName)
    found = re.compile("".join(parts)).search(text)
    if found is None:
        # the opening was seen by the find-all expression but the full
        # block (through "} name;") is not present in the remaining text
        raise FeaToolsParserSyntaxError(
            "Invalid Syntax: unable to find the complete %s block named %s" % (kind, name))
    start, end = found.span()
    precedingText = text[:start]
    if precedingMark:
        # the match span starts at the mark, so hand it back to the
        # text that precedes the block
        precedingText += precedingMark
    return precedingText, found.group(2), text[end:]


def _parseUnknown(writer, text):
    """Parse *text* of unknown content and report each construct to *writer*.

    Each kind of construct is searched for in a fixed order (features,
    lookups, class definitions, substitutions, positionings, languagesystem,
    script, language, include). Every match is removed from the text and the
    text in front of it is parsed recursively.

    Raises FeaToolsParserSyntaxError if any non-whitespace text is left over
    after all of the known constructs have been removed, or if a feature or
    lookup block cannot be matched through its closing ``} name;``.
    """
    text = text.strip()
    ## extract all feature names
    featureNames = feature_findAll_RE.findall(text)
    for precedingMark, featureName in featureNames:
        precedingText, featureText, remainingText = _sliceNamedBlock(
            text, featureContentRE, featureName, precedingMark, "feature")
        _parseUnknown(writer, precedingText)
        _parseFeature(writer, featureName, featureText)
        text = remainingText
    ## extract all lookup names
    lookupNames = lookup_findAll_RE.findall(text)
    for precedingMark, lookupName in lookupNames:
        precedingText, lookupText, remainingText = _sliceNamedBlock(
            text, lookupContentRE, lookupName, precedingMark, "lookup")
        _parseUnknown(writer, precedingText)
        _parseLookup(writer, lookupName, lookupText)
        text = remainingText
    ## extract all class data
    classes = classDefinitionRE.findall(text)
    for precedingMark, className, classContent in classes:
        text = _executeSimpleSlice(precedingMark, text, classDefinitionRE, writer)
        className = "@" + className
        _parseClass(writer, className, classContent)
    ## extract substitutions
    # sub type 1 and 4
    subType1s = subType1And4RE.findall(text)
    for precedingMark, target, replacement in subType1s:
        text = _executeSimpleSlice(precedingMark, text, subType1And4RE, writer)
        _parseSubType1And4(writer, target, replacement)
    # sub type 3
    subType3s = subType3RE.findall(text)
    for precedingMark, target, replacement in subType3s:
        text = _executeSimpleSlice(precedingMark, text, subType3RE, writer)
        _parseSubType3(writer, target, replacement)
    # sub type 6
    subType6s = subType6RE.findall(text)
    for precedingMark, target, replacement in subType6s:
        text = _executeSimpleSlice(precedingMark, text, subType6RE, writer)
        _parseSubType6(writer, target, replacement)
    ## extract positions
    # pos type 1
    posType1s = posType1RE.findall(text)
    for precedingMark, target, value in posType1s:
        text = _executeSimpleSlice(precedingMark, text, posType1RE, writer)
        _parsePosType1(writer, target, value)
    # pos type 2
    # (the enum and pos keyword groups are matched but not passed on)
    posType2s = posType2RE.findall(text)
    for precedingMark, enumTag, posTag, targetAndValue in posType2s:
        text = _executeSimpleSlice(precedingMark, text, posType2RE, writer)
        _parsePosType2(writer, targetAndValue)
    ## extract other data
    # XXX look at FDK spec. sometimes a language tag of dflt will be passed
    # it should be handled differently than the other tags.
    # languagesystem
    languagesystems = languagesystemRE.findall(text)
    for precedingMark, scriptTag, languageTag in languagesystems:
        text = _executeSimpleSlice(precedingMark, text, languagesystemRE, writer)
        writer.languageSystem(scriptTag, languageTag)
    # script
    scripts = scriptRE.findall(text)
    for precedingMark, scriptTag in scripts:
        text = _executeSimpleSlice(precedingMark, text, scriptRE, writer)
        writer.script(scriptTag)
    # language
    languages = languageRE.findall(text)
    for precedingMark, languageTag, otherKeyword in languages:
        text = _executeSimpleSlice(precedingMark, text, languageRE, writer)
        if not otherKeyword or otherKeyword == "include_dflt":
            writer.language(languageTag)
        elif otherKeyword == "exclude_dflt":
            writer.language(languageTag, includeDefault=False)
        # any other keyword: the statement is consumed without
        # anything being reported to the writer
    # include
    inclusions = includeRE.findall(text)
    for precedingMark, path in inclusions:
        text = _executeSimpleSlice(precedingMark, text, includeRE, writer)
        writer.include(path)
    # anything still left is text that none of the expressions understood
    text = text.strip()
    if text:
        raise FeaToolsParserSyntaxError("Invalid Syntax: %s" % text)
288
def _executeSimpleSlice(precedingMark, text, regex, writer):
    """Consume the first match of *regex* in *text*.

    The text in front of the match (with *precedingMark* appended when it
    is not empty) is handed to _parseUnknown; the text that follows the
    match is returned.
    """
    matchStart, matchEnd = regex.search(text).span()
    leadingText = text[:matchStart]
    if precedingMark:
        # the mark is part of the match span, so give it back to the
        # text that comes before the statement
        leadingText = leadingText + precedingMark
    _parseUnknown(writer, leadingText)
    return text[matchEnd:]
298
def _parseFeature(writer, name, feature):
    """Parse the content of the feature *name*.

    A writer for the feature is requested with ``writer.feature(name)``
    and the feature content is parsed into it.
    """
    _parseUnknown(writer.feature(name), feature)
302
def _parseLookup(writer, name, lookup):
    """Parse the content of the lookup *name*.

    A writer for the lookup is requested with ``writer.lookup(name)``
    and the lookup content is parsed into it.
    """
    _parseUnknown(writer.lookup(name), lookup)
306
def _parseClass(writer, name, content):
    """Report the class *name* to *writer*.

    *content* is the text found between the brackets of the definition;
    it is broken into a list of the names that classContentRE finds.
    """
    members = classContentRE.findall(content)
    writer.classDefinition(name, members)
310
def _parseSequence(sequence):
    """Break a sequence string into a list.

    Each whitespace separated item becomes a string. Each inline class
    (``[a b c]``, as matched by sequenceInlineClassRE) becomes a nested
    list of the items found inside the brackets.
    """
    parsed = []
    position = 0
    for found in sequenceInlineClassRE.finditer(sequence):
        # items between the previous inline class and this one.
        # split() with no argument is used so that tabs and line breaks
        # separate items just as spaces do; the expressions that feed
        # this function accept any whitespace inside a sequence.
        parsed.extend(sequence[position:found.start()].split())
        # the inline class itself, kept together as one nested list
        parsed.append(found.group(1).split())
        position = found.end()
    # whatever follows the last inline class
    parsed.extend(sequence[position:].split())
    return parsed
323
def _parseSubType1And4(writer, target, replacement):
    """Report a "sub <target> by <replacement>;" statement to *writer*.

    A target of one item is reported with ``writer.gsubType1``; any other
    target is reported, as a list, with ``writer.gsubType4``.
    """
    targetItems = _parseSequence(target)
    # the replacement is always one item: either a single
    # glyph/class or a list representing an inline class.
    replacementItem = _parseSequence(replacement)[0]
    if len(targetItems) != 1:
        # the target is a list representing a sequence. the list
        # may hold strings (a single glyph/class) and lists
        # (an inline class).
        writer.gsubType4(targetItems, replacementItem)
        return
    writer.gsubType1(targetItems[0], replacementItem)
339
def _parseSubType3(writer, target, replacement):
    """Report a "sub <target> from <replacement>;" statement to *writer*.

    The first name found in *target* and the list of all names found in
    *replacement* are passed to ``writer.gsubType3``.
    """
    # only the first name in the target is used; it represents
    # a glyph/class name.
    targetName = classContentRE.findall(target)[0]
    alternates = classContentRE.findall(replacement)
    writer.gsubType3(targetName, alternates)
347
def _parseSubType6(writer, target, replacement):
    """Report a contextual "sub ... by ...;" statement to *writer*.

    *target* holds the preceding context, the marked (') targets and the
    trailing context as one string. The pieces are separated and passed,
    together with the replacement, to ``writer.gsubType6``.
    """
    # the replacement is reduced to a single name when only one
    # name is found; otherwise it stays a list of names.
    replacementNames = classContentRE.findall(replacement)
    if len(replacementNames) == 1:
        replacementNames = replacementNames[0]
    # both contexts stay empty strings when no marked target is found
    precedingContext = ""
    trailingContext = ""
    remainingText = target
    markedItems = subType6TargetRE.findall(remainingText)
    lastIndex = len(markedItems) - 1
    extractedTargets = []
    for index, markedItem in enumerate(markedItems):
        # locate this marked item in what is left of the text
        start, end = subType6TargetRE.search(remainingText).span()
        if index == 0:
            # everything in front of the first marked item
            precedingContext = _parseSequence(remainingText[:start])
        if index == lastIndex:
            # everything after the last marked item
            trailingContext = _parseSequence(remainingText[end:])
        # the item could be in a form like [o o.alt]
        # so it has to be broken down
        names = classContentRE.findall(markedItem)
        extractedTargets.append(names[0] if len(names) == 1 else names)
        remainingText = remainingText[end:]
    writer.gsubType6(precedingContext, extractedTargets, trailingContext, replacementNames)
381
382 def _parsePosType1(writer, target, value):
383     # target will only be one item representing
384     # a glyph/class name
385     value = tuple([float(i) for i in value.strip().split(" ")])
386     writer.gposType1(target, value)
387
def _parsePosType2(writer, targetAndValue):
    """Report a pair positioning statement to *writer*.

    *targetAndValue* is one string holding the target items followed by
    the value. The last whitespace separated item is converted to a float
    and the items before it are parsed as a sequence; both are passed to
    ``writer.gposType2``.

    Raises ValueError if there is no item to use as the value or if that
    item cannot be converted to a float.
    """
    # the target and value will be coming
    # in as single string.
    # split() with no argument is used because the string may start or
    # end with whitespace, or hold tabs/line breaks, any of which would
    # leave an empty or unclean last item when splitting on " " alone.
    items = targetAndValue.split()
    if not items:
        raise ValueError("could not find a value in: %r" % targetAndValue)
    value = float(items[-1])
    target = _parseSequence(" ".join(items[:-1]))
    writer.gposType2(target, value)
397
def _parsePosType2WithEnum(writer, targetAndValue):
    """Report an enumerated pair positioning statement to *writer*.

    The statement is handled in exactly the same way as one without the
    enum keyword, so the work is passed on to _parsePosType2 rather than
    being kept as a second copy of the same code.
    """
    _parsePosType2(writer, targetAndValue)
407
def parseFeatures(writer, text):
    """Parse the feature *text* and report what is found to *writer*.

    Raises FeaToolsParserSyntaxError (from _parseUnknown) when some of the
    text cannot be parsed.
    """
    # drop every comment first
    withoutComments = commentRE.sub("", text)
    # then guarantee a space after each ; because the regular
    # expressions digest the text more easily that way
    spacedText = terminatorRE.sub("; ", withoutComments)
    _parseUnknown(writer, spacedText)
Note: See TracBrowser for help on using the browser.