root/packages/compositor/trunk/Lib/compositor/textUtilities.py

Revision 15, 21.6 kB (checked in by tal, 3 years ago)
Added handler for final sigma upper to lower case conversion.
Line 
1 import unicodedata
2 from compositor.cmap import reverseCMAP
3 from compositor.caseConversionMaps import lowerToSingleUpper, upperToSingleLower, specialCasing, softDotted
4 from compositor.wordBreakProperties import wordBreakProperties
5
# Compatibility shims for old Python versions in which
# neither set nor reversed is available as a builtin.

try:
    set
except NameError:
    # fall back to the class from the sets module
    from sets import Set as set

try:
    reversed
except NameError:
    def reversed(iterable):
        """
        Minimal stand-in for the builtin: return the items
        of iterable as a new list in reverse order.
        """
        return list(iterable)[::-1]
18
19 # ---------------
20 # Case Conversion
21 # ---------------
22
def convertCase(case, glyphNames, cmap, reversedCMAP, language=None, fallbackGlyph=".notdef"):
    """
    Case Conversion Function

    Convert a list of glyph names to their upper or lowercase
    forms, applying the special casing rules first (language
    specific, then language neutral) and the single character
    case mappings after that.

    Arguments:
    - case
      The case to convert to. Valid values are "upper" and "lower".
      Any value other than "upper" selects the lowercase single
      character mapping.
    - glyphNames
      A list of glyph names.
    - cmap
      The CMAP for the font formatted as a dictionary
      (unicode value : glyph name).
    - reversedCMAP
      Reversed version of cmap (glyph name : list of unicode
      values). The first unicode value in each list is used.
    - language
      The language tag being processed. May be None.
    - fallbackGlyph
      The glyph name that should be used when the converted
      glyph does not exist in the font.

    Returns a new list of glyph names.
    """
    # step 1: glyph names -> unicode values. a glyph without
    # a unicode value stays in the list as its glyph name.
    glyphs = []
    for glyphName in glyphNames:
        uniValues = reversedCMAP.get(glyphName)
        if uniValues is None:
            glyphs.append(glyphName)
        else:
            glyphs.append(uniValues[0])
    # the single casing table only depends on the requested case
    if case == "upper":
        singleCasing = lowerToSingleUpper
    else:
        singleCasing = upperToSingleLower
    # special casing is tried for the specific language
    # first and for "no language" (None) after that.
    languages = [None]
    if language is not None:
        languages.insert(0, language)
    # step 2: convert the unicode values
    converted = []
    for index, uniValue in enumerate(glyphs):
        # a string here is a glyph name with no unicode value
        if isinstance(uniValue, basestring):
            converted.append(uniValue)
            continue
        # special casing. the handler appends to converted itself.
        handled = False
        for tag in languages:
            if _handleSpecialCasing(case, glyphs, index, uniValue, converted, cmap, reversedCMAP, tag):
                handled = True
                break
        if handled:
            continue
        # single casing, falling back to the unchanged value
        converted.append(singleCasing.get(uniValue, uniValue))
    # step 3: unicode values -> glyph names
    result = []
    for item in converted:
        if not isinstance(item, basestring):
            item = cmap.get(item, fallbackGlyph)
        result.append(item)
    return result
90
def convertCodeToInt(code):
    """
    Convert a string of hexadecimal code points to integers.

    - An empty (or whitespace only) value returns None.
    - A single code, "0041", returns an int.
    - Several whitespace separated codes, "0069 0307",
      return a tuple of ints.

    Raises ValueError if a field is not valid hexadecimal.
    """
    if not code:
        return None
    # split() without an argument collapses runs of whitespace
    # and ignores leading/trailing whitespace, so stray spaces
    # can not produce empty fields (which used to show up as
    # None entries in the returned tuple).
    parts = code.split()
    if not parts:
        return None
    if len(parts) == 1:
        return int(parts[0], 16)
    return tuple([int(part, 16) for part in parts])
97
def _precedingBase(glyphs, index):
    """
    Walk backwards from the glyph before index and return the
    value at which the walk stopped:

    - the first value with combining class 0,
    - or a glyph name (a glyph without a unicode value),
    - or None if a mark with combining class 230 was found
      first or if there is nothing before index.

    If only marks of other combining classes precede index,
    the last mark examined is returned.
    """
    previous = None
    for otherUniValue in reversed(glyphs[:index]):
        previous = otherUniValue
        if isinstance(otherUniValue, basestring):
            break
        combining = unicodedata.combining(unichr(otherUniValue))
        if combining == 230:
            previous = None
            break
        if combining == 0:
            break
    return previous

def _handleSpecialCasing(case, glyphs, index, uniValue, converted, cmap, reversedCMAP, language):
    """
    Handle a language specific lookup.
    Returns a boolean indicating if a change was made.

    Arguments:
    - case
      "upper" or "lower". Used as the key into the
      special casing entry.
    - glyphs
      The complete run: unicode values, or glyph names for
      glyphs that have no unicode value.
    - index
      The position of uniValue in glyphs.
    - uniValue
      The unicode value being converted.
    - converted
      The output list. The conversion is appended to this list.
      Nothing is appended if the conversion is None, which
      means that the character should be removed.
    - cmap, reversedCMAP
      Only used for the word boundary test of Final_Sigma.
    - language
      The key into specialCasing. None is used for the
      rules that do not require a specific language.

    Raises NotImplementedError for contexts that are
    not handled.
    """
    if language not in specialCasing:
        return False
    languageMap = specialCasing[language]
    if uniValue not in languageMap:
        return False
    entry = languageMap[uniValue]
    context = entry["context"]
    # an entry without a context always applies
    contextMatch = True
    if context:
        contextMatch = False
        ## After_I
        # The last preceding base character was
        # an uppercase I, and there is no inter-
        # vening combining character class 230.
        if context == "After_I":
            if _precedingBase(glyphs, index) == 0x0049:
                contextMatch = True
        elif context == "Not_After_I":
            # not referenced in SpecialCasing
            raise NotImplementedError
        ## After_Soft_Dotted
        # The last preceding character with a
        # combining class of zero before C was
        # Soft_Dotted, and there is no interven-
        # ing combining character class 230
        elif context == "After_Soft_Dotted":
            if _precedingBase(glyphs, index) in softDotted:
                contextMatch = True
        elif context == "Not_After_Soft_Dotted":
            # not referenced in SpecialCasing
            raise NotImplementedError
        ## More_Above
        # C is followed by a character of combining
        # class 230 (ABOVE) with no intervening
        # character of combining class 0 or 230.
        elif context == "More_Above":
            for otherUniValue in glyphs[index+1:]:
                if isinstance(otherUniValue, basestring):
                    break
                combining = unicodedata.combining(unichr(otherUniValue))
                if combining == 230:
                    contextMatch = True
                    break
                # only a base character ends the search. marks of
                # other classes (a cedilla or ogonek for example)
                # may sit between C and the mark above.
                if combining == 0:
                    break
        elif context == "Not_More_Above":
            # not referenced in SpecialCasing
            raise NotImplementedError
        ## Before_Dot
        # C is followed by U+0307 combining
        # dot above. Any sequence of charac-
        # ters with a combining class that is nei-
        # ther 0 nor 230 may intervene between
        # the current character and the com-
        # bining dot above.
        elif context == "Before_Dot":
            # not referenced in SpecialCasing
            raise NotImplementedError
        elif context == "Not_Before_Dot":
            contextMatch = True
            for otherUniValue in glyphs[index+1:]:
                if isinstance(otherUniValue, basestring):
                    break
                if otherUniValue == 0x0307:
                    contextMatch = False
                    break
                combining = unicodedata.combining(unichr(otherUniValue))
                if combining == 0 or combining == 230:
                    break
        ## Final_Sigma
        # Within the closest word boundaries
        # containing C, there is a cased letter
        # before C, and there is no cased letter
        # after C.
        elif context == "Final_Sigma":
            glyphNames = [cmap.get(i, i) for i in glyphs]
            # This is approximated with the word boundary tests:
            # the word must end after C and it must not start
            # at C. Without the second test a sigma standing
            # alone or starting a word would become a final sigma.
            if isWordBreakAfter(glyphNames, index, reversedCMAP):
                if not isWordBreakBefore(glyphNames, index, reversedCMAP):
                    contextMatch = True
        ## Unknown
        else:
            raise NotImplementedError(context)
    if not contextMatch:
        return False
    conversion = entry[case]
    # if the conversion is None, it means that the character should be removed.
    if conversion is None:
        return True
    # apply the conversion to the list of converted characters.
    if not isinstance(conversion, tuple):
        conversion = [conversion]
    converted.extend(conversion)
    return True
220
221 # -----------------------
222 # Word Boundary Detection
223 # -----------------------
224 # This implements the default word boundary algorithm explained here:
225 # http://www.unicode.org/reports/tr29/tr29-11.html#Default_Word_Boundaries
226
227 _notBreakBefore = set([
228     # Do not break within CRLF
229     (convertCodeToInt("240D"), convertCodeToInt("240A")),
230     # Do not break between most letters.
231     ("ALetter", "ALetter"),
232     # Do not break across certain punctuation.
233     ("ALetter", "MidLetter", "ALetter"),
234     # Do not break within sequences of digits, or digits adjacent to letters.
235     ("Numeric", "Numeric"),
236     ("Numeric", "ALetter"),
237     ("ALetter", "Numeric"),
238     # Do not break within sequences, such as "3.2" or "3,456.789".
239     ("Numeric", "MidNum", "Numeric"),
240     # Do not break between Katakana.
241     ("Katakana", "Katakana"),
242     # Do not break from extenders.
243     ("ALetter", "ExtendNumLet"),
244     ("Numeric", "ExtendNumLet"),
245     ("Katakana", "ExtendNumLet"),
246     ("ExtendNumLet", "ExtendNumLet"),
247 ])
248
def isWordBreakBefore(glyphNames, index, reversedCMAP):
    """
    Returns a boolean declaring if the position
    before index can be considered a word break.

    glyphNames is the list of glyph names, index is the
    position being tested and reversedCMAP maps glyph names
    to lists of unicode values. Glyphs that are not in
    reversedCMAP are treated as having no unicode value.
    """
    # Start of line
    if index == 0:
        return True
    noValue = [None]
    lastIndex = len(glyphNames) - 1
    # the glyph at index
    currentValue = reversedCMAP.get(glyphNames[index], noValue)[0]
    currentProperty = wordBreakProperties.get(currentValue)
    # one glyph back
    previousValue = reversedCMAP.get(glyphNames[index - 1], noValue)[0]
    previousProperty = wordBreakProperties.get(previousValue)
    # two glyphs back, if there is one
    beforePreviousProperty = False
    if index > 1:
        beforePreviousValue = reversedCMAP.get(glyphNames[index - 2], noValue)[0]
        beforePreviousProperty = wordBreakProperties.get(beforePreviousValue)
    # one glyph forward, if there is one
    nextProperty = None
    if index < lastIndex:
        nextValue = reversedCMAP.get(glyphNames[index + 1], noValue)[0]
        nextProperty = wordBreakProperties.get(nextValue)
    # any one of these sequences forbids the break
    candidates = [
        # the previous and current unicode values
        (previousValue, currentValue),
        # the previous and current word break properties
        (previousProperty, currentProperty),
        # the previous, current and next word break properties
        (previousProperty, currentProperty, nextProperty),
        # the previous two and current word break properties
        (beforePreviousProperty, previousProperty, currentProperty),
    ]
    for candidate in candidates:
        if candidate in _notBreakBefore:
            return False
    # Otherwise, break everywhere (including around ideographs).
    return True
289
290 _notBreakAfter = set([
291     # Do not break within CRLF
292     (convertCodeToInt("240D"), convertCodeToInt("240A")),
293     # Do not break between most letters.
294     ("ALetter", "ALetter"),
295     # Do not break across certain punctuation.
296     ("ALetter", "MidLetter", "ALetter"),
297     # Do not break within sequences of digits, or digits adjacent to letters.
298     ("Numeric", "Numeric"),
299     ("Numeric", "ALetter"),
300     ("ALetter", "Numeric"),
301     # Do not break within sequences, such as "3.2" or "3,456.789".
302     ("Numeric", "MidNum", "Numeric"),
303     # Do not break between Katakana.
304     ("Katakana", "Katakana"),
305     # Do not break from extenders.
306     ("ExtendNumLet", "ALetter"),
307     ("ExtendNumLet", "Numeric"),
308     ("ExtendNumLet", "Katakana"),
309 ])
310
def isWordBreakAfter(glyphNames, index, reversedCMAP):
    """
    Returns a boolean declaring if the position
    after index can be considered a word break.

    glyphNames is the list of glyph names, index is the
    position being tested and reversedCMAP maps glyph names
    to lists of unicode values. Glyphs that are not in
    reversedCMAP are treated as having no unicode value.
    """
    lastIndex = len(glyphNames) - 1
    # End of line
    if index == lastIndex:
        return True
    noValue = [None]
    # the glyph at index
    currentValue = reversedCMAP.get(glyphNames[index], noValue)[0]
    currentProperty = wordBreakProperties.get(currentValue)
    # one glyph forward
    nextValue = reversedCMAP.get(glyphNames[index + 1], noValue)[0]
    nextProperty = wordBreakProperties.get(nextValue)
    # one glyph back, if there is one
    previousProperty = None
    if index > 0:
        previousValue = reversedCMAP.get(glyphNames[index - 1], noValue)[0]
        previousProperty = wordBreakProperties.get(previousValue)
    # two glyphs forward, if there is one
    afterNextProperty = None
    if index < lastIndex - 1:
        afterNextValue = reversedCMAP.get(glyphNames[index + 2], noValue)[0]
        afterNextProperty = wordBreakProperties.get(afterNextValue)
    # any one of these sequences forbids the break
    candidates = [
        # the current and next unicode values
        (currentValue, nextValue),
        # the current and next word break properties
        (currentProperty, nextProperty),
        # the previous, current and next word break properties
        (previousProperty, currentProperty, nextProperty),
        # the current and next two word break properties
        (currentProperty, nextProperty, afterNextProperty),
    ]
    for candidate in candidates:
        if candidate in _notBreakAfter:
            return False
    # Otherwise, break everywhere (including around ideographs).
    return True
351
352 # -----
353 # Tests
354 # -----
355
356 # Case Conversion
357
def testCaseConversionSimple():
    """
    Doctest: uppercasing a glyph that has a unicode value gives
    the uppercase glyph. A glyph name without a unicode value
    ("a.alt") passes through unchanged.

    >>> cmap = {convertCodeToInt("0041") : "A",
    ...         convertCodeToInt("0061") : "a"
    ...         }
    >>> convertCase("upper", ["a", "a.alt"], cmap, reverseCMAP(cmap), None)
    ['A', 'a.alt']
    """
366
def testCaseConversionSimpleMissing():
    """
    Doctest: when the converted unicode value is not in the
    cmap, the fallback glyph (".notdef" by default) is returned.

    >>> cmap = {convertCodeToInt("0061") : "a"}
    >>> convertCase("upper", ["a"], cmap, reverseCMAP(cmap), None)
    ['.notdef']
    """
373
def testCaseConversionLowerAfterI():
    """
    Doctest: with the "TRK" language tag, lowercasing an I
    followed by a dot above gives a single i.

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0300") : "grave"
    ...         }
    >>> convertCase("lower", ["I", "dotabove"], cmap, reverseCMAP(cmap), "TRK")
    ['i']
    """
384
def testCaseConversionUpperAfterSoftDotted():
    """
    Doctest: with the "LTH" language tag, uppercasing an i
    followed directly by a dot above drops the dot above.
    The dot above is kept when a grave sits between the two.

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0300") : "grave"
    ...         }
    >>> convertCase("upper", ["i", "dotabove"], cmap, reverseCMAP(cmap), "LTH")
    ['I']
    >>> convertCase("upper", ["i", "grave", "dotabove"], cmap, reverseCMAP(cmap), "LTH")
    ['I', 'grave', 'dotabove']
    """
397
def testCaseConversionLowerMoreAbove():
    """
    Doctest: with the "LTH" language tag, lowercasing an I
    that is directly followed by a grave inserts a dot above
    after the i. An I that is not followed by a mark gives
    a plain i.

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0300") : "grave"
    ...         }
    >>> convertCase("lower", ["I", "grave"], cmap, reverseCMAP(cmap), "LTH")
    ['i', 'dotabove', 'grave']
    >>> convertCase("lower", ["I", "I", "grave"], cmap, reverseCMAP(cmap), "LTH")
    ['i', 'i', 'dotabove', 'grave']
    >>> convertCase("lower", ["I", "I"], cmap, reverseCMAP(cmap), "LTH")
    ['i', 'i']
    """
412
def testCaseConversionLowerNotBeforeDot():
    """
    Doctest: with the "TRK" language tag, lowercasing an I
    that is not followed by a dot above gives a dotless i.
    When a dot above follows, also after a cedilla, the
    result is an i and the dot above is removed.

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0131") : "dotlessi",
    ...         convertCodeToInt("0327") : "cedilla"
    ...         }
    >>> convertCase("lower", ["I"], cmap, reverseCMAP(cmap), "TRK")
    ['dotlessi']
    >>> convertCase("lower", ["I", "dotabove"], cmap, reverseCMAP(cmap), "TRK")
    ['i']
    >>> convertCase("lower", ["I", "cedilla", "dotabove"], cmap, reverseCMAP(cmap), "TRK")
    ['i', 'cedilla']
    """
428
def testCaseConversionFinalSigma():
    """
    Doctest: without a language tag, lowercasing two Sigmas
    gives a sigma followed by a final sigma, both at the end
    of the list and before a space.

    >>> cmap = {convertCodeToInt("03A3") : "Sigma",
    ...         convertCodeToInt("03C3") : "sigma",
    ...         convertCodeToInt("03C2") : "finalsigma",
    ...         convertCodeToInt("0020") : "space",
    ...         }
    >>> convertCase("lower", ["Sigma", "Sigma"], cmap, reverseCMAP(cmap))
    ['sigma', 'finalsigma']
    >>> convertCase("lower", ["Sigma", "Sigma", "space"], cmap, reverseCMAP(cmap))
    ['sigma', 'finalsigma', 'space']
    """
441
442 # Word Boundaries
443
def testBreakBefore():
    """
    Doctest: isWordBreakBefore for the start of the line and
    for each group of sequences listed in _notBreakBefore.
    Note that the name cmap is rebound to the reversed cmap
    before the tests are run.

    >>> cmap = {convertCodeToInt("0020") : "space",
    ...         convertCodeToInt("0041") : "A",
    ...         convertCodeToInt("002E") : "period",
    ...         convertCodeToInt("003A") : "colon",
    ...         convertCodeToInt("005F") : "underscore",
    ...         convertCodeToInt("0031") : "one",
    ...         convertCodeToInt("31F0") : "ku",
    ...         }
    >>> cmap = reverseCMAP(cmap)

    # Start of line
    >>> isWordBreakBefore(["A", "A"], 0, cmap)
    True

    # ALetter, ALetter
    >>> isWordBreakBefore(["space", "A", "A"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "A", "A"], 2, cmap)
    False

    # ALetter, MidLetter, ALetter
    >>> isWordBreakBefore(["A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A"], 2, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 2, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 3, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 4, cmap)
    False

    # Numeric, Numeric
    >>> isWordBreakBefore(["space", "one", "one"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "one", "one"], 2, cmap)
    False

    # ALetter, Numeric
    >>> isWordBreakBefore(["space", "A", "one"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "A", "one"], 2, cmap)
    False

    # Numeric, ALetter
    >>> isWordBreakBefore(["space", "one", "A"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "one", "A"], 2, cmap)
    False

    # Numeric, MidNum, Numeric
    >>> isWordBreakBefore(["one", "period", "one"], 1, cmap)
    False
    >>> isWordBreakBefore(["one", "period", "one"], 2, cmap)
    False

    # Katakana, Katakana
    >>> isWordBreakBefore(["space", "ku", "ku"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "ku", "ku"], 2, cmap)
    False

    # ALetter, ExtendNumLet
    >>> isWordBreakBefore(["A", "underscore"], 1, cmap)
    False

    # Numeric, ExtendNumLet
    >>> isWordBreakBefore(["one", "underscore"], 1, cmap)
    False

    # Katakana, ExtendNumLet
    >>> isWordBreakBefore(["ku", "underscore"], 1, cmap)
    False

    # ExtendNumLet, ExtendNumLet
    >>> isWordBreakBefore(["underscore", "underscore"], 1, cmap)
    False
    """
526
def testBreakAfter():
    """
    Doctest: isWordBreakAfter for the end of the line and
    for each group of sequences listed in _notBreakAfter.
    Note that the name cmap is rebound to the reversed cmap
    before the tests are run.

    >>> cmap = {convertCodeToInt("0020") : "space",
    ...         convertCodeToInt("0041") : "A",
    ...         convertCodeToInt("002E") : "period",
    ...         convertCodeToInt("003A") : "colon",
    ...         convertCodeToInt("005F") : "underscore",
    ...         convertCodeToInt("0031") : "one",
    ...         convertCodeToInt("31F0") : "ku",
    ...         }
    >>> cmap = reverseCMAP(cmap)

    # End of line
    >>> isWordBreakAfter(["A", "A"], 1, cmap)
    True

    # ALetter, ALetter
    >>> isWordBreakAfter(["A", "A", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "A", "space"], 1, cmap)
    True

    # ALetter, MidLetter, ALetter
    >>> isWordBreakAfter(["A", "colon", "A"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 2, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 3, cmap)
    False

    # Numeric, Numeric
    >>> isWordBreakAfter(["one", "one", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "one", "space"], 1, cmap)
    True

    # ALetter, Numeric
    >>> isWordBreakAfter(["A", "one", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "one", "space"], 1, cmap)
    True

    # Numeric, ALetter
    >>> isWordBreakAfter(["one", "A", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "A", "space"], 1, cmap)
    True

    # Numeric, MidNum, Numeric
    >>> isWordBreakAfter(["one", "period", "one"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one"], 1, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 1, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 2, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 3, cmap)
    False

    # Katakana, Katakana
    >>> isWordBreakAfter(["ku", "ku", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["ku", "ku", "space"], 1, cmap)
    True

    # ExtendNumLet, ALetter
    >>> isWordBreakAfter(["underscore", "A"], 0, cmap)
    False

    # ExtendNumLet, Numeric
    >>> isWordBreakAfter(["underscore", "one"], 0, cmap)
    False

    # ExtendNumLet, Katakana
    >>> isWordBreakAfter(["underscore", "ku"], 0, cmap)
    False
    """
613
# When executed as a script, run the doctests found in the
# docstrings of this module. The test* functions above only
# exist to hold those doctests.
if __name__ == "__main__":
    import doctest
    doctest.testmod()
Note: See TracBrowser for help on using the browser.