| 1 |
import unicodedata |
|---|
| 2 |
from compositor.cmap import reverseCMAP |
|---|
| 3 |
from compositor.caseConversionMaps import lowerToSingleUpper, upperToSingleLower, specialCasing, softDotted |
|---|
| 4 |
from compositor.wordBreakProperties import wordBreakProperties |
|---|
| 5 |
|
|---|
| 6 |
# Older Python versions have no builtin set type;
# fall back to the class provided by the sets module.
try:
    set
except NameError:
    from sets import Set as set

# Likewise supply a minimal stand-in for the builtin reversed
# when it is missing. Unlike the builtin, the stand-in returns
# a new list rather than an iterator.
try:
    reversed
except NameError:
    def reversed(iterable):
        return list(iterable)[::-1]
|---|
| 18 |
|
|---|
| 19 |
|
|---|
| 20 |
|
|---|
| 21 |
|
|---|
| 22 |
|
|---|
| 23 |
def convertCase(case, glyphNames, cmap, reversedCMAP, language=None, fallbackGlyph=".notdef"):
    """
    Convert a run of glyph names to upper or lower case.

    The glyph names are mapped to Unicode values, the special
    casing rules (language specific first, then the rules stored
    under None) and the single character case maps are applied,
    and the result is mapped back to glyph names. The returned
    list is new and may differ in length from the input because
    a special casing rule can remove a character or expand it
    into several.

    Arguments:
    - case
      The case to convert to. Valid values are "upper" and "lower".
    - glyphNames
      A list of glyph names.
    - cmap
      The CMAP for the font formatted as a dictionary
      (Unicode value to glyph name).
    - reversedCMAP
      Reversed version of cmap (glyph name to a sequence of
      Unicode values; only the first value is used).
    - language
      The language tag being processed. May be None.
    - fallbackGlyph
      The glyph name that should be used when the converted
      glyph does not exist in the font.
    """
    # Resolve every glyph name to its first Unicode value. Names
    # that are not in the reversed cmap are carried along as names.
    glyphs = []
    for glyphName in glyphNames:
        uniValues = reversedCMAP.get(glyphName)
        if uniValues is not None:
            glyphs.append(uniValues[0])
        else:
            glyphs.append(glyphName)

    # Special casing is tried for the requested language first
    # and then for the rules stored under None.
    if language is None:
        lookupOrder = (None,)
    else:
        lookupOrder = (language, None)

    converted = []
    for index, uniValue in enumerate(glyphs):
        # A string here is a glyph name without a Unicode value.
        if isinstance(uniValue, basestring):
            converted.append(uniValue)
            continue
        # Special casing. The handler appends to converted itself.
        handled = False
        for tag in lookupOrder:
            if _handleSpecialCasing(case, glyphs, index, uniValue, converted, cmap, reversedCMAP, tag):
                handled = True
                break
        if handled:
            continue
        # Single character casing, keeping the value when unmapped.
        if case == "upper":
            singleCaseMap = lowerToSingleUpper
        else:
            singleCaseMap = upperToSingleLower
        if uniValue in singleCaseMap:
            uniValue = singleCaseMap[uniValue]
        converted.append(uniValue)

    # Map the Unicode values back to glyph names.
    result = []
    for item in converted:
        if not isinstance(item, basestring):
            item = cmap.get(item, fallbackGlyph)
        result.append(item)
    return result
|---|
| 90 |
|
|---|
| 91 |
def convertCodeToInt(code):
    """
    Convert a hexadecimal code string to an integer.

    An empty (or otherwise false) value gives None. A string
    containing spaces is split on single spaces and gives a
    tuple with one converted entry per piece.
    """
    if not code:
        return None
    if " " not in code:
        return int(code, 16)
    return tuple(map(convertCodeToInt, code.split(" ")))
|---|
| 97 |
|
|---|
| 98 |
def _previousBaseValue(glyphs, index):
    """
    Scan backward from index for the item that decides the
    After_I and After_Soft_Dotted contexts.

    Returns the first preceding item that is either a glyph name
    (a string, meaning no Unicode value is available) or a
    character of combining class 0. Returns None when a character
    of combining class 230 is met first or when nothing precedes
    index. When only combining marks of other classes precede
    index, the earliest of them is returned.
    """
    previous = None
    for otherUniValue in reversed(glyphs[:index]):
        previous = otherUniValue
        if isinstance(otherUniValue, basestring):
            break
        combining = unicodedata.combining(unichr(otherUniValue))
        if combining == 230:
            previous = None
            break
        if combining == 0:
            break
    return previous


def _handleSpecialCasing(case, glyphs, index, uniValue, converted, cmap, reversedCMAP, language):
    """
    Handle a language specific lookup.
    Returns a boolean indicating if a change was made.

    Arguments:
    - case
      Key ("upper" or "lower") selecting the conversion
      stored in the matching rule.
    - glyphs
      The complete run being converted. Each item is either
      a Unicode value or, for unmapped glyphs, a glyph name.
    - index
      The position of uniValue in glyphs.
    - uniValue
      The Unicode value being converted.
    - converted
      The output list. It is modified in place: the values of
      a matching rule are appended to it.
    - cmap
      Unicode value to glyph name mapping.
    - reversedCMAP
      Glyph name to Unicode values mapping.
    - language
      The key used to find the rule set in specialCasing.
      convertCase also calls this with None after the
      language specific lookup.

    True is returned when a rule matched. In that case the
    conversion has been appended to converted, or nothing has
    been appended if the rule's conversion is None (the
    character is removed). NotImplementedError is raised for
    contexts that are not supported.
    """
    if language not in specialCasing:
        return False
    languageMap = specialCasing[language]
    if uniValue not in languageMap:
        return False
    # A rule without a context always applies.
    contextMatch = True
    context = languageMap[uniValue]["context"]
    if context:
        contextMatch = False
        ## After_I
        # The last preceding base character was an uppercase I
        # and no combining class 230 character intervenes.
        if context == "After_I":
            if _previousBaseValue(glyphs, index) == convertCodeToInt("0049"):
                contextMatch = True
        elif context == "Not_After_I":
            raise NotImplementedError
        ## After_Soft_Dotted
        # The last preceding base character is in softDotted
        # and no combining class 230 character intervenes.
        elif context == "After_Soft_Dotted":
            if _previousBaseValue(glyphs, index) in softDotted:
                contextMatch = True
        elif context == "Not_After_Soft_Dotted":
            raise NotImplementedError
        ## More_Above
        # The character is followed by a combining class 230
        # character with no intervening character of combining
        # class 0 or 230. Marks of any other class (a cedilla
        # for example) are skipped rather than ending the scan.
        elif context == "More_Above":
            for otherUniValue in glyphs[index+1:]:
                if isinstance(otherUniValue, basestring):
                    break
                combining = unicodedata.combining(unichr(otherUniValue))
                if combining == 230:
                    contextMatch = True
                    break
                if combining == 0:
                    break
        elif context == "Not_More_Above":
            raise NotImplementedError
        elif context == "Before_Dot":
            raise NotImplementedError
        ## Not_Before_Dot
        # The rule applies unless a combining dot above (U+0307)
        # follows. Characters whose combining class is neither
        # 0 nor 230 may sit in between.
        elif context == "Not_Before_Dot":
            contextMatch = True
            for otherUniValue in glyphs[index+1:]:
                if isinstance(otherUniValue, basestring):
                    break
                if otherUniValue == convertCodeToInt("0307"):
                    contextMatch = False
                    break
                combining = unicodedata.combining(unichr(otherUniValue))
                if combining == 0 or combining == 230:
                    break
        ## Final_Sigma
        # Decided by whether a word break follows the character.
        # isWordBreakAfter works on glyph names, so the values
        # are mapped back first.
        elif context == "Final_Sigma":
            glyphNames = [cmap.get(i, i) for i in glyphs]
            if isWordBreakAfter(glyphNames, index, reversedCMAP):
                contextMatch = True
        ## Unknown
        else:
            raise NotImplementedError(context)
    if not contextMatch:
        return False
    conversion = languageMap[uniValue][case]
    # A conversion of None means the character is removed.
    if conversion is None:
        return True
    # A conversion is either a single value or a tuple of values.
    if not isinstance(conversion, tuple):
        conversion = [conversion]
    for code in conversion:
        converted.append(code)
    return True
|---|
| 220 |
|
|---|
| 221 |
|
|---|
| 222 |
|
|---|
| 223 |
|
|---|
| 224 |
|
|---|
| 225 |
|
|---|
| 226 |
|
|---|
| 227 |
_notBreakBefore = set([ |
|---|
| 228 |
|
|---|
| 229 |
(convertCodeToInt("240D"), convertCodeToInt("240A")), |
|---|
| 230 |
|
|---|
| 231 |
("ALetter", "ALetter"), |
|---|
| 232 |
|
|---|
| 233 |
("ALetter", "MidLetter", "ALetter"), |
|---|
| 234 |
|
|---|
| 235 |
("Numeric", "Numeric"), |
|---|
| 236 |
("Numeric", "ALetter"), |
|---|
| 237 |
("ALetter", "Numeric"), |
|---|
| 238 |
|
|---|
| 239 |
("Numeric", "MidNum", "Numeric"), |
|---|
| 240 |
|
|---|
| 241 |
("Katakana", "Katakana"), |
|---|
| 242 |
|
|---|
| 243 |
("ALetter", "ExtendNumLet"), |
|---|
| 244 |
("Numeric", "ExtendNumLet"), |
|---|
| 245 |
("Katakana", "ExtendNumLet"), |
|---|
| 246 |
("ExtendNumLet", "ExtendNumLet"), |
|---|
| 247 |
]) |
|---|
| 248 |
|
|---|
| 249 |
def isWordBreakBefore(glyphNames, index, reversedCMAP):
    """
    Report whether the position before index can be
    considered a word break.

    glyphNames is the list of glyph names being examined and
    reversedCMAP maps glyph names to sequences of Unicode
    values. The glyph at index and its neighbors are compared,
    by Unicode value and by word break property, against the
    sequences in _notBreakBefore. True is returned unless one
    of those sequences matches.
    """
    # The start of the text is always a break.
    if index == 0:
        return True

    def lookup(position):
        # First Unicode value of the glyph at position (None if
        # the glyph is unmapped) and its word break property.
        value = reversedCMAP.get(glyphNames[position], [None])[0]
        return value, wordBreakProperties.get(value)

    currentValue, currentProperty = lookup(index)
    previousValue, previousProperty = lookup(index - 1)
    # Placeholders for neighbors that do not exist. They
    # never match an entry of _notBreakBefore.
    twoBackProperty = False
    if index > 1:
        twoBackProperty = lookup(index - 2)[1]
    nextProperty = None
    if index < len(glyphNames) - 1:
        nextProperty = lookup(index + 1)[1]

    candidates = (
        # explicit Unicode value pair
        (previousValue, currentValue),
        # property pair
        (previousProperty, currentProperty),
        # the glyph is the middle of a three item sequence
        (previousProperty, currentProperty, nextProperty),
        # the glyph is the end of a three item sequence
        (twoBackProperty, previousProperty, currentProperty),
    )
    for candidate in candidates:
        if candidate in _notBreakBefore:
            return False
    return True
|---|
| 289 |
|
|---|
| 290 |
_notBreakAfter = set([ |
|---|
| 291 |
|
|---|
| 292 |
(convertCodeToInt("240D"), convertCodeToInt("240A")), |
|---|
| 293 |
|
|---|
| 294 |
("ALetter", "ALetter"), |
|---|
| 295 |
|
|---|
| 296 |
("ALetter", "MidLetter", "ALetter"), |
|---|
| 297 |
|
|---|
| 298 |
("Numeric", "Numeric"), |
|---|
| 299 |
("Numeric", "ALetter"), |
|---|
| 300 |
("ALetter", "Numeric"), |
|---|
| 301 |
|
|---|
| 302 |
("Numeric", "MidNum", "Numeric"), |
|---|
| 303 |
|
|---|
| 304 |
("Katakana", "Katakana"), |
|---|
| 305 |
|
|---|
| 306 |
("ExtendNumLet", "ALetter"), |
|---|
| 307 |
("ExtendNumLet", "Numeric"), |
|---|
| 308 |
("ExtendNumLet", "Katakana"), |
|---|
| 309 |
]) |
|---|
| 310 |
|
|---|
| 311 |
def isWordBreakAfter(glyphNames, index, reversedCMAP):
    """
    Report whether the position after index can be
    considered a word break.

    glyphNames is the list of glyph names being examined and
    reversedCMAP maps glyph names to sequences of Unicode
    values. The glyph at index and its neighbors are compared,
    by Unicode value and by word break property, against the
    sequences in _notBreakAfter. True is returned unless one
    of those sequences matches.
    """
    # The end of the text is always a break.
    lastIndex = len(glyphNames) - 1
    if index == lastIndex:
        return True

    def lookup(position):
        # First Unicode value of the glyph at position (None if
        # the glyph is unmapped) and its word break property.
        value = reversedCMAP.get(glyphNames[position], [None])[0]
        return value, wordBreakProperties.get(value)

    currentValue, currentProperty = lookup(index)
    nextValue, nextProperty = lookup(index + 1)
    # None stands in for neighbors that do not exist.
    previousProperty = None
    if index > 0:
        previousProperty = lookup(index - 1)[1]
    twoAheadProperty = None
    if index < lastIndex - 1:
        twoAheadProperty = lookup(index + 2)[1]

    candidates = (
        # explicit Unicode value pair
        (currentValue, nextValue),
        # property pair
        (currentProperty, nextProperty),
        # the glyph is the middle of a three item sequence
        (previousProperty, currentProperty, nextProperty),
        # the glyph is the start of a three item sequence
        (currentProperty, nextProperty, twoAheadProperty),
    )
    for candidate in candidates:
        if candidate in _notBreakAfter:
            return False
    return True
|---|
| 351 |
|
|---|
| 352 |
|
|---|
| 353 |
|
|---|
| 354 |
|
|---|
| 355 |
|
|---|
| 356 |
|
|---|
| 357 |
|
|---|
| 358 |
def testCaseConversionSimple():
    """
    Doctest: a mapped glyph is converted to upper case and a
    glyph name that is not in the cmap passes through unchanged.

    >>> cmap = {convertCodeToInt("0041") : "A",
    ...         convertCodeToInt("0061") : "a"
    ...         }
    >>> convertCase("upper", ["a", "a.alt"], cmap, reverseCMAP(cmap), None)
    ['A', 'a.alt']
    """
|---|
| 366 |
|
|---|
| 367 |
def testCaseConversionSimpleMissing():
    """
    Doctest: the fallback glyph is returned when the converted
    Unicode value has no glyph in the cmap.

    >>> cmap = {convertCodeToInt("0061") : "a"}
    >>> convertCase("upper", ["a"], cmap, reverseCMAP(cmap), None)
    ['.notdef']
    """
|---|
| 373 |
|
|---|
| 374 |
def testCaseConversionLowerAfterI():
    """
    Doctest: lower casing "I" followed by "dotabove" with the
    "TRK" language tag gives a single "i".

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0300") : "grave"
    ...         }
    >>> convertCase("lower", ["I", "dotabove"], cmap, reverseCMAP(cmap), "TRK")
    ['i']
    """
|---|
| 384 |
|
|---|
| 385 |
def testCaseConversionUpperAfterSoftDotted():
    """
    Doctest: upper casing with the "LTH" language tag drops a
    "dotabove" that directly follows "i" and keeps it when a
    "grave" comes in between.

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0300") : "grave"
    ...         }
    >>> convertCase("upper", ["i", "dotabove"], cmap, reverseCMAP(cmap), "LTH")
    ['I']
    >>> convertCase("upper", ["i", "grave", "dotabove"], cmap, reverseCMAP(cmap), "LTH")
    ['I', 'grave', 'dotabove']
    """
|---|
| 397 |
|
|---|
| 398 |
def testCaseConversionLowerMoreAbove():
    """
    Doctest: lower casing with the "LTH" language tag inserts
    "dotabove" after the "i" only when the "I" is directly
    followed by "grave".

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0300") : "grave"
    ...         }
    >>> convertCase("lower", ["I", "grave"], cmap, reverseCMAP(cmap), "LTH")
    ['i', 'dotabove', 'grave']
    >>> convertCase("lower", ["I", "I", "grave"], cmap, reverseCMAP(cmap), "LTH")
    ['i', 'i', 'dotabove', 'grave']
    >>> convertCase("lower", ["I", "I"], cmap, reverseCMAP(cmap), "LTH")
    ['i', 'i']
    """
|---|
| 412 |
|
|---|
| 413 |
def testCaseConversionLowerNotBeforeDot():
    """
    Doctest: lower casing "I" with the "TRK" language tag gives
    "dotlessi" unless a "dotabove" follows, also when a
    "cedilla" sits between the two.

    >>> cmap = {convertCodeToInt("0049") : "I",
    ...         convertCodeToInt("0069") : "i",
    ...         convertCodeToInt("0307") : "dotabove",
    ...         convertCodeToInt("0131") : "dotlessi",
    ...         convertCodeToInt("0327") : "cedilla"
    ...         }
    >>> convertCase("lower", ["I"], cmap, reverseCMAP(cmap), "TRK")
    ['dotlessi']
    >>> convertCase("lower", ["I", "dotabove"], cmap, reverseCMAP(cmap), "TRK")
    ['i']
    >>> convertCase("lower", ["I", "cedilla", "dotabove"], cmap, reverseCMAP(cmap), "TRK")
    ['i', 'cedilla']
    """
|---|
| 428 |
|
|---|
| 429 |
def testCaseConversionFinalSigma():
    """
    Doctest: lower casing, without a language tag, turns a
    "Sigma" at the end of the run or before "space" into
    "finalsigma" and other "Sigma" glyphs into "sigma".

    >>> cmap = {convertCodeToInt("03A3") : "Sigma",
    ...         convertCodeToInt("03C3") : "sigma",
    ...         convertCodeToInt("03C2") : "finalsigma",
    ...         convertCodeToInt("0020") : "space",
    ...         }
    >>> convertCase("lower", ["Sigma", "Sigma"], cmap, reverseCMAP(cmap))
    ['sigma', 'finalsigma']
    >>> convertCase("lower", ["Sigma", "Sigma", "space"], cmap, reverseCMAP(cmap))
    ['sigma', 'finalsigma', 'space']
    """
|---|
| 441 |
|
|---|
| 442 |
|
|---|
| 443 |
|
|---|
| 444 |
def testBreakBefore():
    """
    Doctests for isWordBreakBefore. Each section is labeled
    with the word break property sequence it exercises.

    >>> cmap = {convertCodeToInt("0020") : "space",
    ...         convertCodeToInt("0041") : "A",
    ...         convertCodeToInt("002E") : "period",
    ...         convertCodeToInt("003A") : "colon",
    ...         convertCodeToInt("005F") : "underscore",
    ...         convertCodeToInt("0031") : "one",
    ...         convertCodeToInt("31F0") : "ku",
    ...         }
    >>> cmap = reverseCMAP(cmap)

    # Start of line
    >>> isWordBreakBefore(["A", "A"], 0, cmap)
    True

    # ALetter, ALetter
    >>> isWordBreakBefore(["space", "A", "A"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "A", "A"], 2, cmap)
    False

    # ALetter, MidLetter, ALetter
    >>> isWordBreakBefore(["A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A"], 2, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 2, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 3, cmap)
    False
    >>> isWordBreakBefore(["A", "colon", "A", "colon", "A"], 4, cmap)
    False

    # Numeric, Numeric
    >>> isWordBreakBefore(["space", "one", "one"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "one", "one"], 2, cmap)
    False

    # ALetter, Numeric
    >>> isWordBreakBefore(["space", "A", "one"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "A", "one"], 2, cmap)
    False

    # Numeric, ALetter
    >>> isWordBreakBefore(["space", "one", "A"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "one", "A"], 2, cmap)
    False

    # Numeric, MidNum, Numeric
    >>> isWordBreakBefore(["one", "period", "one"], 1, cmap)
    False
    >>> isWordBreakBefore(["one", "period", "one"], 2, cmap)
    False

    # Katakana, Katakana
    >>> isWordBreakBefore(["space", "ku", "ku"], 1, cmap)
    True
    >>> isWordBreakBefore(["space", "ku", "ku"], 2, cmap)
    False

    # ALetter, ExtendNumLet
    >>> isWordBreakBefore(["A", "underscore"], 1, cmap)
    False

    # Numeric, ExtendNumLet
    >>> isWordBreakBefore(["one", "underscore"], 1, cmap)
    False

    # Katakana, ExtendNumLet
    >>> isWordBreakBefore(["ku", "underscore"], 1, cmap)
    False

    # ExtendNumLet, ExtendNumLet
    >>> isWordBreakBefore(["underscore", "underscore"], 1, cmap)
    False
    """
|---|
| 526 |
|
|---|
| 527 |
def testBreakAfter():
    """
    Doctests for isWordBreakAfter. Each section is labeled
    with the word break property sequence it exercises.

    >>> cmap = {convertCodeToInt("0020") : "space",
    ...         convertCodeToInt("0041") : "A",
    ...         convertCodeToInt("002E") : "period",
    ...         convertCodeToInt("003A") : "colon",
    ...         convertCodeToInt("005F") : "underscore",
    ...         convertCodeToInt("0031") : "one",
    ...         convertCodeToInt("31F0") : "ku",
    ...         }
    >>> cmap = reverseCMAP(cmap)

    # End of line
    >>> isWordBreakAfter(["A", "A"], 1, cmap)
    True

    # ALetter, ALetter
    >>> isWordBreakAfter(["A", "A", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "A", "space"], 1, cmap)
    True

    # ALetter, MidLetter, ALetter
    >>> isWordBreakAfter(["A", "colon", "A"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 1, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 2, cmap)
    False
    >>> isWordBreakAfter(["A", "colon", "A", "colon", "A"], 3, cmap)
    False

    # Numeric, Numeric
    >>> isWordBreakAfter(["one", "one", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "one", "space"], 1, cmap)
    True

    # ALetter, Numeric
    >>> isWordBreakAfter(["A", "one", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["A", "one", "space"], 1, cmap)
    True

    # Numeric, ALetter
    >>> isWordBreakAfter(["one", "A", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "A", "space"], 1, cmap)
    True

    # Numeric, MidNum, Numeric
    >>> isWordBreakAfter(["one", "period", "one"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one"], 1, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 0, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 1, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 2, cmap)
    False
    >>> isWordBreakAfter(["one", "period", "one", "period", "one"], 3, cmap)
    False

    # Katakana, Katakana
    >>> isWordBreakAfter(["ku", "ku", "space"], 0, cmap)
    False
    >>> isWordBreakAfter(["ku", "ku", "space"], 1, cmap)
    True

    # ExtendNumLet, ALetter
    >>> isWordBreakAfter(["underscore", "A"], 0, cmap)
    False

    # ExtendNumLet, Numeric
    >>> isWordBreakAfter(["underscore", "one"], 0, cmap)
    False

    # ExtendNumLet, Katakana
    >>> isWordBreakAfter(["underscore", "ku"], 0, cmap)
    False
    """
|---|
| 613 |
|
|---|
| 614 |
if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings
    # when the file is executed as a script.
    from doctest import testmod
    testmod()
|---|