1 /* 2 * Copyright (c) 2014 SGI. 3 * All rights reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 */ 15 16 #include "utf8n.h" 17 18 struct utf8data { 19 unsigned int maxage; 20 unsigned int offset; 21 }; 22 23 #define __INCLUDED_FROM_UTF8NORM_C__ 24 #include "utf8data.h" 25 #undef __INCLUDED_FROM_UTF8NORM_C__ 26 27 int utf8version_is_supported(u8 maj, u8 min, u8 rev) 28 { 29 int i = ARRAY_SIZE(utf8agetab) - 1; 30 unsigned int sb_utf8version = UNICODE_AGE(maj, min, rev); 31 32 while (i >= 0 && utf8agetab[i] != 0) { 33 if (sb_utf8version == utf8agetab[i]) 34 return 1; 35 i--; 36 } 37 return 0; 38 } 39 EXPORT_SYMBOL(utf8version_is_supported); 40 41 int utf8version_latest(void) 42 { 43 return utf8vers; 44 } 45 EXPORT_SYMBOL(utf8version_latest); 46 47 /* 48 * UTF-8 valid ranges. 49 * 50 * The UTF-8 encoding spreads the bits of a 32bit word over several 51 * bytes. This table gives the ranges that can be held and how they'd 52 * be represented. 53 * 54 * 0x00000000 0x0000007F: 0xxxxxxx 55 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx 56 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 57 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 58 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 59 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 60 * 61 * There is an additional requirement on UTF-8, in that only the 62 * shortest representation of a 32bit value is to be used. A decoder 63 * must not decode sequences that do not satisfy this requirement. 64 * Thus the allowed ranges have a lower bound. 65 * 66 * 0x00000000 0x0000007F: 0xxxxxxx 67 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx 68 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 69 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 70 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 71 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 72 * 73 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, 74 * 17 planes of 65536 values. This limits the sequences actually seen 75 * even more, to just the following. 76 * 77 * 0 - 0x7F: 0 - 0x7F 78 * 0x80 - 0x7FF: 0xC2 0x80 - 0xDF 0xBF 79 * 0x800 - 0xFFFF: 0xE0 0xA0 0x80 - 0xEF 0xBF 0xBF 80 * 0x10000 - 0x10FFFF: 0xF0 0x90 0x80 0x80 - 0xF4 0x8F 0xBF 0xBF 81 * 82 * Within those ranges the surrogates 0xD800 - 0xDFFF are not allowed. 83 * 84 * Note that the longest sequence seen with valid usage is 4 bytes, 85 * the same a single UTF-32 character. This makes the UTF-8 86 * representation of Unicode strictly smaller than UTF-32. 87 * 88 * The shortest sequence requirement was introduced by: 89 * Corrigendum #1: UTF-8 Shortest Form 90 * It can be found here: 91 * http://www.unicode.org/versions/corrigendum1.html 92 * 93 */ 94 95 /* 96 * Return the number of bytes used by the current UTF-8 sequence. 97 * Assumes the input points to the first byte of a valid UTF-8 98 * sequence. 99 */ 100 static inline int utf8clen(const char *s) 101 { 102 unsigned char c = *s; 103 104 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 105 } 106 107 /* 108 * Decode a 3-byte UTF-8 sequence. 109 */ 110 static unsigned int 111 utf8decode3(const char *str) 112 { 113 unsigned int uc; 114 115 uc = *str++ & 0x0F; 116 uc <<= 6; 117 uc |= *str++ & 0x3F; 118 uc <<= 6; 119 uc |= *str++ & 0x3F; 120 121 return uc; 122 } 123 124 /* 125 * Encode a 3-byte UTF-8 sequence. 126 */ 127 static int 128 utf8encode3(char *str, unsigned int val) 129 { 130 str[2] = (val & 0x3F) | 0x80; 131 val >>= 6; 132 str[1] = (val & 0x3F) | 0x80; 133 val >>= 6; 134 str[0] = val | 0xE0; 135 136 return 3; 137 } 138 139 /* 140 * utf8trie_t 141 * 142 * A compact binary tree, used to decode UTF-8 characters. 143 * 144 * Internal nodes are one byte for the node itself, and up to three 145 * bytes for an offset into the tree. The first byte contains the 146 * following information: 147 * NEXTBYTE - flag - advance to next byte if set 148 * BITNUM - 3 bit field - the bit number to tested 149 * OFFLEN - 2 bit field - number of bytes in the offset 150 * if offlen == 0 (non-branching node) 151 * RIGHTPATH - 1 bit field - set if the following node is for the 152 * right-hand path (tested bit is set) 153 * TRIENODE - 1 bit field - set if the following node is an internal 154 * node, otherwise it is a leaf node 155 * if offlen != 0 (branching node) 156 * LEFTNODE - 1 bit field - set if the left-hand node is internal 157 * RIGHTNODE - 1 bit field - set if the right-hand node is internal 158 * 159 * Due to the way utf8 works, there cannot be branching nodes with 160 * NEXTBYTE set, and moreover those nodes always have a righthand 161 * descendant. 162 */ 163 typedef const unsigned char utf8trie_t; 164 #define BITNUM 0x07 165 #define NEXTBYTE 0x08 166 #define OFFLEN 0x30 167 #define OFFLEN_SHIFT 4 168 #define RIGHTPATH 0x40 169 #define TRIENODE 0x80 170 #define RIGHTNODE 0x40 171 #define LEFTNODE 0x80 172 173 /* 174 * utf8leaf_t 175 * 176 * The leaves of the trie are embedded in the trie, and so the same 177 * underlying datatype: unsigned char. 178 * 179 * leaf[0]: The unicode version, stored as a generation number that is 180 * an index into utf8agetab[]. With this we can filter code 181 * points based on the unicode version in which they were 182 * defined. The CCC of a non-defined code point is 0. 183 * leaf[1]: Canonical Combining Class. During normalization, we need 184 * to do a stable sort into ascending order of all characters 185 * with a non-zero CCC that occur between two characters with 186 * a CCC of 0, or at the begin or end of a string. 187 * The unicode standard guarantees that all CCC values are 188 * between 0 and 254 inclusive, which leaves 255 available as 189 * a special value. 190 * Code points with CCC 0 are known as stoppers. 191 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the 192 * start of a NUL-terminated string that is the decomposition 193 * of the character. 194 * The CCC of a decomposable character is the same as the CCC 195 * of the first character of its decomposition. 196 * Some characters decompose as the empty string: these are 197 * characters with the Default_Ignorable_Code_Point property. 198 * These do affect normalization, as they all have CCC 0. 199 * 200 * The decompositions in the trie have been fully expanded, with the 201 * exception of Hangul syllables, which are decomposed algorithmically. 202 * 203 * Casefolding, if applicable, is also done using decompositions. 204 * 205 * The trie is constructed in such a way that leaves exist for all 206 * UTF-8 sequences that match the criteria from the "UTF-8 valid 207 * ranges" comment above, and only for those sequences. Therefore a 208 * lookup in the trie can be used to validate the UTF-8 input. 209 */ 210 typedef const unsigned char utf8leaf_t; 211 212 #define LEAF_GEN(LEAF) ((LEAF)[0]) 213 #define LEAF_CCC(LEAF) ((LEAF)[1]) 214 #define LEAF_STR(LEAF) ((const char *)((LEAF) + 2)) 215 216 #define MINCCC (0) 217 #define MAXCCC (254) 218 #define STOPPER (0) 219 #define DECOMPOSE (255) 220 221 /* Marker for hangul syllable decomposition. */ 222 #define HANGUL ((char)(255)) 223 /* Size of the synthesized leaf used for Hangul syllable decomposition. */ 224 #define UTF8HANGULLEAF (12) 225 226 /* 227 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 228 * 229 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 230 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 231 * 232 * SBase = 0xAC00 233 * LBase = 0x1100 234 * VBase = 0x1161 235 * TBase = 0x11A7 236 * LCount = 19 237 * VCount = 21 238 * TCount = 28 239 * NCount = 588 (VCount * TCount) 240 * SCount = 11172 (LCount * NCount) 241 * 242 * Decomposition: 243 * SIndex = s - SBase 244 * 245 * LV (Canonical/Full) 246 * LIndex = SIndex / NCount 247 * VIndex = (Sindex % NCount) / TCount 248 * LPart = LBase + LIndex 249 * VPart = VBase + VIndex 250 * 251 * LVT (Canonical) 252 * LVIndex = (SIndex / TCount) * TCount 253 * TIndex = (Sindex % TCount) 254 * LVPart = SBase + LVIndex 255 * TPart = TBase + TIndex 256 * 257 * LVT (Full) 258 * LIndex = SIndex / NCount 259 * VIndex = (Sindex % NCount) / TCount 260 * TIndex = (Sindex % TCount) 261 * LPart = LBase + LIndex 262 * VPart = VBase + VIndex 263 * if (TIndex == 0) { 264 * d = <LPart, VPart> 265 * } else { 266 * TPart = TBase + TIndex 267 * d = <LPart, TPart, VPart> 268 * } 269 */ 270 271 /* Constants */ 272 #define SB (0xAC00) 273 #define LB (0x1100) 274 #define VB (0x1161) 275 #define TB (0x11A7) 276 #define LC (19) 277 #define VC (21) 278 #define TC (28) 279 #define NC (VC * TC) 280 #define SC (LC * NC) 281 282 /* Algorithmic decomposition of hangul syllable. */ 283 static utf8leaf_t * 284 utf8hangul(const char *str, unsigned char *hangul) 285 { 286 unsigned int si; 287 unsigned int li; 288 unsigned int vi; 289 unsigned int ti; 290 unsigned char *h; 291 292 /* Calculate the SI, LI, VI, and TI values. */ 293 si = utf8decode3(str) - SB; 294 li = si / NC; 295 vi = (si % NC) / TC; 296 ti = si % TC; 297 298 /* Fill in base of leaf. */ 299 h = hangul; 300 LEAF_GEN(h) = 2; 301 LEAF_CCC(h) = DECOMPOSE; 302 h += 2; 303 304 /* Add LPart, a 3-byte UTF-8 sequence. */ 305 h += utf8encode3((char *)h, li + LB); 306 307 /* Add VPart, a 3-byte UTF-8 sequence. */ 308 h += utf8encode3((char *)h, vi + VB); 309 310 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 311 if (ti) 312 h += utf8encode3((char *)h, ti + TB); 313 314 /* Terminate string. */ 315 h[0] = '\0'; 316 317 return hangul; 318 } 319 320 /* 321 * Use trie to scan s, touching at most len bytes. 322 * Returns the leaf if one exists, NULL otherwise. 323 * 324 * A non-NULL return guarantees that the UTF-8 sequence starting at s 325 * is well-formed and corresponds to a known unicode code point. The 326 * shorthand for this will be "is valid UTF-8 unicode". 327 */ 328 static utf8leaf_t *utf8nlookup(const struct utf8data *data, 329 unsigned char *hangul, const char *s, size_t len) 330 { 331 utf8trie_t *trie = NULL; 332 int offlen; 333 int offset; 334 int mask; 335 int node; 336 337 if (!data) 338 return NULL; 339 if (len == 0) 340 return NULL; 341 342 trie = utf8data + data->offset; 343 node = 1; 344 while (node) { 345 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 346 if (*trie & NEXTBYTE) { 347 if (--len == 0) 348 return NULL; 349 s++; 350 } 351 mask = 1 << (*trie & BITNUM); 352 if (*s & mask) { 353 /* Right leg */ 354 if (offlen) { 355 /* Right node at offset of trie */ 356 node = (*trie & RIGHTNODE); 357 offset = trie[offlen]; 358 while (--offlen) { 359 offset <<= 8; 360 offset |= trie[offlen]; 361 } 362 trie += offset; 363 } else if (*trie & RIGHTPATH) { 364 /* Right node after this node */ 365 node = (*trie & TRIENODE); 366 trie++; 367 } else { 368 /* No right node. */ 369 return NULL; 370 } 371 } else { 372 /* Left leg */ 373 if (offlen) { 374 /* Left node after this node. */ 375 node = (*trie & LEFTNODE); 376 trie += offlen + 1; 377 } else if (*trie & RIGHTPATH) { 378 /* No left node. */ 379 return NULL; 380 } else { 381 /* Left node after this node */ 382 node = (*trie & TRIENODE); 383 trie++; 384 } 385 } 386 } 387 /* 388 * Hangul decomposition is done algorithmically. These are the 389 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 390 * always 3 bytes long, so s has been advanced twice, and the 391 * start of the sequence is at s-2. 392 */ 393 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 394 trie = utf8hangul(s - 2, hangul); 395 return trie; 396 } 397 398 /* 399 * Use trie to scan s. 400 * Returns the leaf if one exists, NULL otherwise. 401 * 402 * Forwards to utf8nlookup(). 403 */ 404 static utf8leaf_t *utf8lookup(const struct utf8data *data, 405 unsigned char *hangul, const char *s) 406 { 407 return utf8nlookup(data, hangul, s, (size_t)-1); 408 } 409 410 /* 411 * Maximum age of any character in s. 412 * Return -1 if s is not valid UTF-8 unicode. 413 * Return 0 if only non-assigned code points are used. 414 */ 415 int utf8agemax(const struct utf8data *data, const char *s) 416 { 417 utf8leaf_t *leaf; 418 int age = 0; 419 int leaf_age; 420 unsigned char hangul[UTF8HANGULLEAF]; 421 422 if (!data) 423 return -1; 424 425 while (*s) { 426 leaf = utf8lookup(data, hangul, s); 427 if (!leaf) 428 return -1; 429 430 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 431 if (leaf_age <= data->maxage && leaf_age > age) 432 age = leaf_age; 433 s += utf8clen(s); 434 } 435 return age; 436 } 437 EXPORT_SYMBOL(utf8agemax); 438 439 /* 440 * Minimum age of any character in s. 441 * Return -1 if s is not valid UTF-8 unicode. 442 * Return 0 if non-assigned code points are used. 443 */ 444 int utf8agemin(const struct utf8data *data, const char *s) 445 { 446 utf8leaf_t *leaf; 447 int age; 448 int leaf_age; 449 unsigned char hangul[UTF8HANGULLEAF]; 450 451 if (!data) 452 return -1; 453 age = data->maxage; 454 while (*s) { 455 leaf = utf8lookup(data, hangul, s); 456 if (!leaf) 457 return -1; 458 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 459 if (leaf_age <= data->maxage && leaf_age < age) 460 age = leaf_age; 461 s += utf8clen(s); 462 } 463 return age; 464 } 465 EXPORT_SYMBOL(utf8agemin); 466 467 /* 468 * Maximum age of any character in s, touch at most len bytes. 469 * Return -1 if s is not valid UTF-8 unicode. 470 */ 471 int utf8nagemax(const struct utf8data *data, const char *s, size_t len) 472 { 473 utf8leaf_t *leaf; 474 int age = 0; 475 int leaf_age; 476 unsigned char hangul[UTF8HANGULLEAF]; 477 478 if (!data) 479 return -1; 480 481 while (len && *s) { 482 leaf = utf8nlookup(data, hangul, s, len); 483 if (!leaf) 484 return -1; 485 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 486 if (leaf_age <= data->maxage && leaf_age > age) 487 age = leaf_age; 488 len -= utf8clen(s); 489 s += utf8clen(s); 490 } 491 return age; 492 } 493 EXPORT_SYMBOL(utf8nagemax); 494 495 /* 496 * Maximum age of any character in s, touch at most len bytes. 497 * Return -1 if s is not valid UTF-8 unicode. 498 */ 499 int utf8nagemin(const struct utf8data *data, const char *s, size_t len) 500 { 501 utf8leaf_t *leaf; 502 int leaf_age; 503 int age; 504 unsigned char hangul[UTF8HANGULLEAF]; 505 506 if (!data) 507 return -1; 508 age = data->maxage; 509 while (len && *s) { 510 leaf = utf8nlookup(data, hangul, s, len); 511 if (!leaf) 512 return -1; 513 leaf_age = utf8agetab[LEAF_GEN(leaf)]; 514 if (leaf_age <= data->maxage && leaf_age < age) 515 age = leaf_age; 516 len -= utf8clen(s); 517 s += utf8clen(s); 518 } 519 return age; 520 } 521 EXPORT_SYMBOL(utf8nagemin); 522 523 /* 524 * Length of the normalization of s. 525 * Return -1 if s is not valid UTF-8 unicode. 526 * 527 * A string of Default_Ignorable_Code_Point has length 0. 528 */ 529 ssize_t utf8len(const struct utf8data *data, const char *s) 530 { 531 utf8leaf_t *leaf; 532 size_t ret = 0; 533 unsigned char hangul[UTF8HANGULLEAF]; 534 535 if (!data) 536 return -1; 537 while (*s) { 538 leaf = utf8lookup(data, hangul, s); 539 if (!leaf) 540 return -1; 541 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 542 ret += utf8clen(s); 543 else if (LEAF_CCC(leaf) == DECOMPOSE) 544 ret += strlen(LEAF_STR(leaf)); 545 else 546 ret += utf8clen(s); 547 s += utf8clen(s); 548 } 549 return ret; 550 } 551 EXPORT_SYMBOL(utf8len); 552 553 /* 554 * Length of the normalization of s, touch at most len bytes. 555 * Return -1 if s is not valid UTF-8 unicode. 556 */ 557 ssize_t utf8nlen(const struct utf8data *data, const char *s, size_t len) 558 { 559 utf8leaf_t *leaf; 560 size_t ret = 0; 561 unsigned char hangul[UTF8HANGULLEAF]; 562 563 if (!data) 564 return -1; 565 while (len && *s) { 566 leaf = utf8nlookup(data, hangul, s, len); 567 if (!leaf) 568 return -1; 569 if (utf8agetab[LEAF_GEN(leaf)] > data->maxage) 570 ret += utf8clen(s); 571 else if (LEAF_CCC(leaf) == DECOMPOSE) 572 ret += strlen(LEAF_STR(leaf)); 573 else 574 ret += utf8clen(s); 575 len -= utf8clen(s); 576 s += utf8clen(s); 577 } 578 return ret; 579 } 580 EXPORT_SYMBOL(utf8nlen); 581 582 /* 583 * Set up an utf8cursor for use by utf8byte(). 584 * 585 * u8c : pointer to cursor. 586 * data : const struct utf8data to use for normalization. 587 * s : string. 588 * len : length of s. 589 * 590 * Returns -1 on error, 0 on success. 591 */ 592 int utf8ncursor(struct utf8cursor *u8c, const struct utf8data *data, 593 const char *s, size_t len) 594 { 595 if (!data) 596 return -1; 597 if (!s) 598 return -1; 599 u8c->data = data; 600 u8c->s = s; 601 u8c->p = NULL; 602 u8c->ss = NULL; 603 u8c->sp = NULL; 604 u8c->len = len; 605 u8c->slen = 0; 606 u8c->ccc = STOPPER; 607 u8c->nccc = STOPPER; 608 /* Check we didn't clobber the maximum length. */ 609 if (u8c->len != len) 610 return -1; 611 /* The first byte of s may not be an utf8 continuation. */ 612 if (len > 0 && (*s & 0xC0) == 0x80) 613 return -1; 614 return 0; 615 } 616 EXPORT_SYMBOL(utf8ncursor); 617 618 /* 619 * Set up an utf8cursor for use by utf8byte(). 620 * 621 * u8c : pointer to cursor. 622 * data : const struct utf8data to use for normalization. 623 * s : NUL-terminated string. 624 * 625 * Returns -1 on error, 0 on success. 626 */ 627 int utf8cursor(struct utf8cursor *u8c, const struct utf8data *data, 628 const char *s) 629 { 630 return utf8ncursor(u8c, data, s, (unsigned int)-1); 631 } 632 EXPORT_SYMBOL(utf8cursor); 633 634 /* 635 * Get one byte from the normalized form of the string described by u8c. 636 * 637 * Returns the byte cast to an unsigned char on succes, and -1 on failure. 638 * 639 * The cursor keeps track of the location in the string in u8c->s. 640 * When a character is decomposed, the current location is stored in 641 * u8c->p, and u8c->s is set to the start of the decomposition. Note 642 * that bytes from a decomposition do not count against u8c->len. 643 * 644 * Characters are emitted if they match the current CCC in u8c->ccc. 645 * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 646 * and the function returns 0 in that case. 647 * 648 * Sorting by CCC is done by repeatedly scanning the string. The 649 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 650 * the start of the scan. The first pass finds the lowest CCC to be 651 * emitted and stores it in u8c->nccc, the second pass emits the 652 * characters with this CCC and finds the next lowest CCC. This limits 653 * the number of passes to 1 + the number of different CCCs in the 654 * sequence being scanned. 655 * 656 * Therefore: 657 * u8c->p != NULL -> a decomposition is being scanned. 658 * u8c->ss != NULL -> this is a repeating scan. 659 * u8c->ccc == -1 -> this is the first scan of a repeating scan. 660 */ 661 int utf8byte(struct utf8cursor *u8c) 662 { 663 utf8leaf_t *leaf; 664 int ccc; 665 666 for (;;) { 667 /* Check for the end of a decomposed character. */ 668 if (u8c->p && *u8c->s == '\0') { 669 u8c->s = u8c->p; 670 u8c->p = NULL; 671 } 672 673 /* Check for end-of-string. */ 674 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 675 /* There is no next byte. */ 676 if (u8c->ccc == STOPPER) 677 return 0; 678 /* End-of-string during a scan counts as a stopper. */ 679 ccc = STOPPER; 680 goto ccc_mismatch; 681 } else if ((*u8c->s & 0xC0) == 0x80) { 682 /* This is a continuation of the current character. */ 683 if (!u8c->p) 684 u8c->len--; 685 return (unsigned char)*u8c->s++; 686 } 687 688 /* Look up the data for the current character. */ 689 if (u8c->p) { 690 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 691 } else { 692 leaf = utf8nlookup(u8c->data, u8c->hangul, 693 u8c->s, u8c->len); 694 } 695 696 /* No leaf found implies that the input is a binary blob. */ 697 if (!leaf) 698 return -1; 699 700 ccc = LEAF_CCC(leaf); 701 /* Characters that are too new have CCC 0. */ 702 if (utf8agetab[LEAF_GEN(leaf)] > u8c->data->maxage) { 703 ccc = STOPPER; 704 } else if (ccc == DECOMPOSE) { 705 u8c->len -= utf8clen(u8c->s); 706 u8c->p = u8c->s + utf8clen(u8c->s); 707 u8c->s = LEAF_STR(leaf); 708 /* Empty decomposition implies CCC 0. */ 709 if (*u8c->s == '\0') { 710 if (u8c->ccc == STOPPER) 711 continue; 712 ccc = STOPPER; 713 goto ccc_mismatch; 714 } 715 716 leaf = utf8lookup(u8c->data, u8c->hangul, u8c->s); 717 if (!leaf) 718 return -1; 719 ccc = LEAF_CCC(leaf); 720 } 721 722 /* 723 * If this is not a stopper, then see if it updates 724 * the next canonical class to be emitted. 725 */ 726 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 727 u8c->nccc = ccc; 728 729 /* 730 * Return the current byte if this is the current 731 * combining class. 732 */ 733 if (ccc == u8c->ccc) { 734 if (!u8c->p) 735 u8c->len--; 736 return (unsigned char)*u8c->s++; 737 } 738 739 /* Current combining class mismatch. */ 740 ccc_mismatch: 741 if (u8c->nccc == STOPPER) { 742 /* 743 * Scan forward for the first canonical class 744 * to be emitted. Save the position from 745 * which to restart. 746 */ 747 u8c->ccc = MINCCC - 1; 748 u8c->nccc = ccc; 749 u8c->sp = u8c->p; 750 u8c->ss = u8c->s; 751 u8c->slen = u8c->len; 752 if (!u8c->p) 753 u8c->len -= utf8clen(u8c->s); 754 u8c->s += utf8clen(u8c->s); 755 } else if (ccc != STOPPER) { 756 /* Not a stopper, and not the ccc we're emitting. */ 757 if (!u8c->p) 758 u8c->len -= utf8clen(u8c->s); 759 u8c->s += utf8clen(u8c->s); 760 } else if (u8c->nccc != MAXCCC + 1) { 761 /* At a stopper, restart for next ccc. */ 762 u8c->ccc = u8c->nccc; 763 u8c->nccc = MAXCCC + 1; 764 u8c->s = u8c->ss; 765 u8c->p = u8c->sp; 766 u8c->len = u8c->slen; 767 } else { 768 /* All done, proceed from here. */ 769 u8c->ccc = STOPPER; 770 u8c->nccc = STOPPER; 771 u8c->sp = NULL; 772 u8c->ss = NULL; 773 u8c->slen = 0; 774 } 775 } 776 } 777 EXPORT_SYMBOL(utf8byte); 778 779 const struct utf8data *utf8nfdi(unsigned int maxage) 780 { 781 int i = ARRAY_SIZE(utf8nfdidata) - 1; 782 783 while (maxage < utf8nfdidata[i].maxage) 784 i--; 785 if (maxage > utf8nfdidata[i].maxage) 786 return NULL; 787 return &utf8nfdidata[i]; 788 } 789 EXPORT_SYMBOL(utf8nfdi); 790 791 const struct utf8data *utf8nfdicf(unsigned int maxage) 792 { 793 int i = ARRAY_SIZE(utf8nfdicfdata) - 1; 794 795 while (maxage < utf8nfdicfdata[i].maxage) 796 i--; 797 if (maxage > utf8nfdicfdata[i].maxage) 798 return NULL; 799 return &utf8nfdicfdata[i]; 800 } 801 EXPORT_SYMBOL(utf8nfdicf); 802