Lines Matching +full:nc +full:- +full:si

16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
30 /* Default names of the in- and output files. */
65 /* ------------------------------------------------------------------ */
80 #define UNICODE_MAJ_MAX ((unsigned short)-1)
81 #define UNICODE_MIN_MAX ((unsigned char)-1)
82 #define UNICODE_REV_MAX ((unsigned char)-1)
106 /* ------------------------------------------------------------------ */
111 * A compact binary tree, used to decode UTF-8 characters.
116 * NEXTBYTE - flag - advance to next byte if set
117 * BITNUM - 3 bit field - the bit number to be tested
118 * OFFLEN - 2 bit field - number of bytes in the offset
119 * if offlen == 0 (non-branching node)
120 * RIGHTPATH - 1 bit field - set if the following node is for the
121 * right-hand path (tested bit is set)
122 * TRIENODE - 1 bit field - set if the following node is an internal
125 * LEFTNODE - 1 bit field - set if the left-hand node is internal
126 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
151 * defined. The CCC of a non-defined code point is 0.
154 * with a non-zero CCC that occur between two characters with
161 * start of a NUL-terminated string that is the decomposition
200 /* ------------------------------------------------------------------ */
205 * The UTF-8 encoding spreads the bits of a 32bit word over several
216 * There is an additional requirement on UTF-8, in that only the
228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
232 * 0 - 0x7f: 0 0x7f
233 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
234 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
235 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
238 * 0xd800 - 0xdfff should never be seen.
241 * the same as a single UTF-32 character. This makes the UTF-8
242 * representation of Unicode strictly smaller than UTF-32.
245 * Corrigendum #1: UTF-8 Shortest Form
385 node = tree->root; in lookup()
387 if (node->nextbyte) in lookup()
389 if (*key & (1 << (node->bitnum & 7))) { in lookup()
391 if (node->rightnode == NODE) { in lookup()
392 node = node->right; in lookup()
393 } else if (node->rightnode == LEAF) { in lookup()
394 leaf = node->right; in lookup()
400 if (node->leftnode == NODE) { in lookup()
401 node = node->left; in lookup()
402 } else if (node->leftnode == LEAF) { in lookup()
403 leaf = node->left; in lookup()
414 * A simple non-recursive tree walker: keep track of visits to the
428 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); in tree_walk()
429 if (tree->childnode == LEAF) { in tree_walk()
430 assert(tree->root); in tree_walk()
431 tree->leaf_print(tree->root, indent); in tree_walk()
434 assert(tree->childnode == NODE); in tree_walk()
435 node = tree->root; in tree_walk()
441 node->bitnum, node->nextbyte, in tree_walk()
442 node->left, node->right, in tree_walk()
443 node->keymask, node->keybits); in tree_walk()
445 if (!(node->left && node->right)) in tree_walk()
449 bitmask = 1 << node->bitnum; in tree_walk()
452 if (node->leftnode == LEAF) { in tree_walk()
453 assert(node->left); in tree_walk()
454 tree->leaf_print(node->left, in tree_walk()
457 } else if (node->left) { in tree_walk()
458 assert(node->leftnode == NODE); in tree_walk()
460 node = node->left; in tree_walk()
466 if (node->rightnode == LEAF) { in tree_walk()
467 assert(node->right); in tree_walk()
468 tree->leaf_print(node->right, in tree_walk()
471 } else if (node->right) { in tree_walk()
472 assert(node->rightnode == NODE); in tree_walk()
474 node = node->right; in tree_walk()
480 node = node->parent; in tree_walk()
481 indent -= 1; in tree_walk()
498 node->left = node->right = NULL; in alloc_node()
499 node->parent = parent; in alloc_node()
500 node->leftnode = NODE; in alloc_node()
501 node->rightnode = NODE; in alloc_node()
502 node->keybits = 0; in alloc_node()
503 node->keymask = 0; in alloc_node()
504 node->mark = 0; in alloc_node()
505 node->index = 0; in alloc_node()
506 node->offset = -1; in alloc_node()
507 node->size = 4; in alloc_node()
509 if (node->parent) { in alloc_node()
510 bitnum = parent->bitnum; in alloc_node()
512 node->bitnum = bitnum + 7 + 8; in alloc_node()
513 node->nextbyte = 1; in alloc_node()
515 node->bitnum = bitnum - 1; in alloc_node()
516 node->nextbyte = 0; in alloc_node()
519 node->bitnum = 7; in alloc_node()
520 node->nextbyte = 0; in alloc_node()
543 cursor = &tree->root; in insert()
551 if (node->nextbyte) in insert()
553 if (*key & (1 << (node->bitnum & 7))) in insert()
554 cursor = &node->right; in insert()
556 cursor = &node->left; in insert()
557 keybits--; in insert()
563 if (*key & (1 << (node->bitnum & 7))) in insert()
564 node->rightnode = LEAF; in insert()
566 node->leftnode = LEAF; in insert()
567 if (node->nextbyte) in insert()
569 if (node->leftnode == NODE || node->rightnode == NODE) in insert()
571 assert(node->left); in insert()
572 assert(node->right); in insert()
574 if (! tree->leaf_equal(node->left, node->right)) in insert()
577 leaf = node->left; in insert()
579 parent = node->parent; in insert()
582 tree->root = leaf; in insert()
583 tree->childnode = LEAF; in insert()
584 } else if (parent->left == node) { in insert()
585 parent->left = leaf; in insert()
586 parent->leftnode = LEAF; in insert()
587 if (parent->right) { in insert()
588 parent->keymask = 0; in insert()
589 parent->keybits = 0; in insert()
591 parent->keymask |= (1 << node->bitnum); in insert()
593 } else if (parent->right == node) { in insert()
594 parent->right = leaf; in insert()
595 parent->rightnode = LEAF; in insert()
596 if (parent->left) { in insert()
597 parent->keymask = 0; in insert()
598 parent->keybits = 0; in insert()
600 parent->keymask |= (1 << node->bitnum); in insert()
601 parent->keybits |= (1 << node->bitnum); in insert()
613 parent = node->parent; in insert()
617 if (node->keymask == 0) { in insert()
618 parent->keymask = 0; in insert()
619 parent->keybits = 0; in insert()
620 } else if (parent->left && parent->right) { in insert()
621 parent->keymask = 0; in insert()
622 parent->keybits = 0; in insert()
624 assert((parent->keymask & node->keymask) == 0); in insert()
625 parent->keymask |= node->keymask; in insert()
626 parent->keymask |= (1 << parent->bitnum); in insert()
627 parent->keybits |= node->keybits; in insert()
628 if (parent->right) in insert()
629 parent->keybits |= (1 << parent->bitnum); in insert()
668 printf("Pruning %s_%x\n", tree->type, tree->maxage); in prune()
671 if (tree->childnode == LEAF) in prune()
673 if (!tree->root) in prune()
677 node = tree->root; in prune()
679 if (node->nextbyte) in prune()
681 if (node->leftnode == LEAF) in prune()
683 if (node->rightnode == LEAF) in prune()
685 if (!node->left) in prune()
687 if (!node->right) in prune()
689 left = node->left; in prune()
690 right = node->right; in prune()
691 if (left->keymask == 0) in prune()
693 if (right->keymask == 0) in prune()
695 if (left->keymask != right->keymask) in prune()
697 if (left->keybits != right->keybits) in prune()
701 assert(left->left || left->right); in prune()
702 if (left->leftnode == LEAF) in prune()
703 leftleaf = left->left; in prune()
704 else if (left->rightnode == LEAF) in prune()
705 leftleaf = left->right; in prune()
706 else if (left->left) in prune()
707 left = left->left; in prune()
708 else if (left->right) in prune()
709 left = left->right; in prune()
715 assert(right->left || right->right); in prune()
716 if (right->leftnode == LEAF) in prune()
717 rightleaf = right->left; in prune()
718 else if (right->rightnode == LEAF) in prune()
719 rightleaf = right->right; in prune()
720 else if (right->left) in prune()
721 right = right->left; in prune()
722 else if (right->right) in prune()
723 right = right->right; in prune()
727 if (! tree->leaf_equal(leftleaf, rightleaf)) in prune()
730 * This node has identical singleton-only subtrees. in prune()
733 parent = node->parent; in prune()
734 left = node->left; in prune()
735 right = node->right; in prune()
736 if (parent->left == node) in prune()
737 parent->left = left; in prune()
738 else if (parent->right == node) in prune()
739 parent->right = left; in prune()
742 left->parent = parent; in prune()
743 left->keymask |= (1 << node->bitnum); in prune()
744 node->left = NULL; in prune()
746 bitmask = 1 << node->bitnum; in prune()
749 if (node->leftnode == NODE && node->left) { in prune()
750 left = node->left; in prune()
754 } else if (node->rightnode == NODE && node->right) { in prune()
755 right = node->right; in prune()
765 /* Force re-check */ in prune()
766 bitmask = 1 << node->bitnum; in prune()
770 if (node->left && node->right) in prune()
772 if (node->left) { in prune()
773 left = node->left; in prune()
774 node->keymask |= left->keymask; in prune()
775 node->keybits |= left->keybits; in prune()
777 if (node->right) { in prune()
778 right = node->right; in prune()
779 node->keymask |= right->keymask; in prune()
780 node->keybits |= right->keybits; in prune()
782 node->keymask |= (1 << node->bitnum); in prune()
783 node = node->parent; in prune()
784 /* Force re-check */ in prune()
785 bitmask = 1 << node->bitnum; in prune()
790 bitmask = 1 << node->bitnum; in prune()
792 node->leftnode == NODE && in prune()
793 node->left) { in prune()
795 node = node->left; in prune()
797 node->rightnode == NODE && in prune()
798 node->right) { in prune()
800 node = node->right; in prune()
804 node = node->parent; in prune()
826 printf("Marking %s_%x\n", tree->type, tree->maxage); in mark_nodes()
827 if (tree->childnode == LEAF) in mark_nodes()
830 assert(tree->childnode == NODE); in mark_nodes()
831 node = tree->root; in mark_nodes()
834 bitmask = 1 << node->bitnum; in mark_nodes()
837 if (node->leftnode == LEAF) { in mark_nodes()
838 assert(node->left); in mark_nodes()
839 if (tree->leaf_mark(node->left)) { in mark_nodes()
841 while (n && !n->mark) { in mark_nodes()
843 n->mark = 1; in mark_nodes()
844 n = n->parent; in mark_nodes()
847 } else if (node->left) { in mark_nodes()
848 assert(node->leftnode == NODE); in mark_nodes()
849 node = node->left; in mark_nodes()
855 if (node->rightnode == LEAF) { in mark_nodes()
856 assert(node->right); in mark_nodes()
857 if (tree->leaf_mark(node->right)) { in mark_nodes()
859 while (n && !n->mark) { in mark_nodes()
861 n->mark = 1; in mark_nodes()
862 n = n->parent; in mark_nodes()
865 } else if (node->right) { in mark_nodes()
866 assert(node->rightnode == NODE); in mark_nodes()
867 node = node->right; in mark_nodes()
873 node = node->parent; in mark_nodes()
878 assert(tree->childnode == NODE); in mark_nodes()
879 node = tree->root; in mark_nodes()
882 bitmask = 1 << node->bitnum; in mark_nodes()
885 if (node->leftnode == LEAF) { in mark_nodes()
886 assert(node->left); in mark_nodes()
887 if (tree->leaf_mark(node->left)) { in mark_nodes()
889 while (n && !n->mark) { in mark_nodes()
891 n->mark = 1; in mark_nodes()
892 n = n->parent; in mark_nodes()
895 } else if (node->left) { in mark_nodes()
896 assert(node->leftnode == NODE); in mark_nodes()
897 node = node->left; in mark_nodes()
898 if (!node->mark && node->parent->mark) { in mark_nodes()
900 node->mark = 1; in mark_nodes()
907 if (node->rightnode == LEAF) { in mark_nodes()
908 assert(node->right); in mark_nodes()
909 if (tree->leaf_mark(node->right)) { in mark_nodes()
911 while (n && !n->mark) { in mark_nodes()
913 n->mark = 1; in mark_nodes()
914 n = n->parent; in mark_nodes()
917 } else if (node->right) { in mark_nodes()
918 assert(node->rightnode == NODE); in mark_nodes()
919 node = node->right; in mark_nodes()
920 if (!node->mark && node->parent->mark && in mark_nodes()
921 !node->parent->left) { in mark_nodes()
923 node->mark = 1; in mark_nodes()
930 node = node->parent; in mark_nodes()
939 * emitted trie. These values must be pre-computed because relative
954 tree->index = index; in index_nodes()
959 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); in index_nodes()
960 if (tree->childnode == LEAF) { in index_nodes()
961 index += tree->leaf_size(tree->root); in index_nodes()
965 assert(tree->childnode == NODE); in index_nodes()
966 node = tree->root; in index_nodes()
969 if (!node->mark) in index_nodes()
972 if (node->index != index) in index_nodes()
973 node->index = index; in index_nodes()
974 index += node->size; in index_nodes()
977 bitmask = 1 << node->bitnum; in index_nodes()
978 if (node->mark && (leftmask & bitmask) == 0) { in index_nodes()
980 if (node->leftnode == LEAF) { in index_nodes()
981 assert(node->left); in index_nodes()
982 *tree->leaf_index(tree, node->left) = in index_nodes()
984 index += tree->leaf_size(node->left); in index_nodes()
986 } else if (node->left) { in index_nodes()
987 assert(node->leftnode == NODE); in index_nodes()
989 node = node->left; in index_nodes()
993 if (node->mark && (rightmask & bitmask) == 0) { in index_nodes()
995 if (node->rightnode == LEAF) { in index_nodes()
996 assert(node->right); in index_nodes()
997 *tree->leaf_index(tree, node->right) = index; in index_nodes()
998 index += tree->leaf_size(node->right); in index_nodes()
1000 } else if (node->right) { in index_nodes()
1001 assert(node->rightnode == NODE); in index_nodes()
1003 node = node->right; in index_nodes()
1009 node = node->parent; in index_nodes()
1010 indent -= 1; in index_nodes()
1029 if (!node || node->mark) in mark_subtree()
1031 node->mark = 1; in mark_subtree()
1032 node->index = node->parent->index; in mark_subtree()
1034 if (node->leftnode == NODE) in mark_subtree()
1035 changed += mark_subtree(node->left); in mark_subtree()
1036 if (node->rightnode == NODE) in mark_subtree()
1037 changed += mark_subtree(node->right); in mark_subtree()
1043 * each node needs to store a three-byte offset. The indexes of the
1070 printf("Sizing %s_%x\n", tree->type, tree->maxage); in size_nodes()
1071 if (tree->childnode == LEAF) in size_nodes()
1074 assert(tree->childnode == NODE); in size_nodes()
1077 node = tree->root; in size_nodes()
1080 if (!node->mark) in size_nodes()
1083 if (!node->left || !node->right) { in size_nodes()
1086 if (node->rightnode == NODE) { in size_nodes()
1093 right = node->right; in size_nodes()
1094 next = tree->next; in size_nodes()
1095 while (!right->mark) { in size_nodes()
1097 n = next->root; in size_nodes()
1098 while (n->bitnum != node->bitnum) { in size_nodes()
1099 nbit = 1 << n->bitnum; in size_nodes()
1103 if (n->rightnode == LEAF) in size_nodes()
1105 n = n->right; in size_nodes()
1107 if (n->leftnode == LEAF) in size_nodes()
1109 n = n->left; in size_nodes()
1112 if (n->bitnum != node->bitnum) in size_nodes()
1114 n = n->right; in size_nodes()
1116 next = next->next; in size_nodes()
1119 if (!right->mark) in size_nodes()
1121 offset = right->index - node->index; in size_nodes()
1123 offset = *tree->leaf_index(tree, node->right); in size_nodes()
1124 offset -= node->index; in size_nodes()
1136 if (node->size != size || node->offset != offset) { in size_nodes()
1137 node->size = size; in size_nodes()
1138 node->offset = offset; in size_nodes()
1143 bitmask = 1 << node->bitnum; in size_nodes()
1145 if (node->mark && (leftmask & bitmask) == 0) { in size_nodes()
1147 if (node->leftnode == LEAF) { in size_nodes()
1148 assert(node->left); in size_nodes()
1149 } else if (node->left) { in size_nodes()
1150 assert(node->leftnode == NODE); in size_nodes()
1152 node = node->left; in size_nodes()
1156 if (node->mark && (rightmask & bitmask) == 0) { in size_nodes()
1159 if (node->rightnode == LEAF) { in size_nodes()
1160 assert(node->right); in size_nodes()
1161 } else if (node->right) { in size_nodes()
1162 assert(node->rightnode == NODE); in size_nodes()
1164 node = node->right; in size_nodes()
1172 node = node->parent; in size_nodes()
1173 indent -= 1; in size_nodes()
1204 index = tree->index; in emit()
1208 printf("Emitting %s_%x\n", tree->type, tree->maxage); in emit()
1209 if (tree->childnode == LEAF) { in emit()
1210 assert(tree->root); in emit()
1211 tree->leaf_emit(tree->root, data); in emit()
1212 size = tree->leaf_size(tree->root); in emit()
1218 assert(tree->childnode == NODE); in emit()
1219 node = tree->root; in emit()
1222 if (!node->mark) in emit()
1224 assert(node->offset != -1); in emit()
1225 assert(node->index == index); in emit()
1228 if (node->nextbyte) in emit()
1230 byte |= (node->bitnum & BITNUM); in emit()
1231 if (node->left && node->right) { in emit()
1232 if (node->leftnode == NODE) in emit()
1234 if (node->rightnode == NODE) in emit()
1236 if (node->offset <= 0xff) in emit()
1238 else if (node->offset <= 0xffff) in emit()
1243 offset = node->offset; in emit()
1247 while (offlen--) { in emit()
1252 } else if (node->left) { in emit()
1253 if (node->leftnode == NODE) in emit()
1258 } else if (node->right) { in emit()
1260 if (node->rightnode == NODE) in emit()
1270 bitmask = 1 << node->bitnum; in emit()
1271 if (node->mark && (leftmask & bitmask) == 0) { in emit()
1273 if (node->leftnode == LEAF) { in emit()
1274 assert(node->left); in emit()
1275 data = tree->leaf_emit(node->left, in emit()
1277 size = tree->leaf_size(node->left); in emit()
1281 } else if (node->left) { in emit()
1282 assert(node->leftnode == NODE); in emit()
1284 node = node->left; in emit()
1288 if (node->mark && (rightmask & bitmask) == 0) { in emit()
1290 if (node->rightnode == LEAF) { in emit()
1291 assert(node->right); in emit()
1292 data = tree->leaf_emit(node->right, in emit()
1294 size = tree->leaf_size(node->right); in emit()
1298 } else if (node->right) { in emit()
1299 assert(node->rightnode == NODE); in emit()
1301 node = node->right; in emit()
1307 node = node->parent; in emit()
1308 indent -= 1; in emit()
1318 printf(" %d total\n", index - tree->index); in emit()
1322 /* ------------------------------------------------------------------ */
1369 if (u->code == corrections[i].code) in corrections_lookup()
1379 if (left->gen != right->gen) in nfdi_equal()
1381 if (left->ccc != right->ccc) in nfdi_equal()
1383 if (left->utf8nfdi && right->utf8nfdi && in nfdi_equal()
1384 strcmp(left->utf8nfdi, right->utf8nfdi) == 0) in nfdi_equal()
1386 if (left->utf8nfdi || right->utf8nfdi) in nfdi_equal()
1396 if (left->gen != right->gen) in nfdicf_equal()
1398 if (left->ccc != right->ccc) in nfdicf_equal()
1400 if (left->utf8nfdicf && right->utf8nfdicf && in nfdicf_equal()
1401 strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) in nfdicf_equal()
1403 if (left->utf8nfdicf && right->utf8nfdicf) in nfdicf_equal()
1405 if (left->utf8nfdicf || right->utf8nfdicf) in nfdicf_equal()
1407 if (left->utf8nfdi && right->utf8nfdi && in nfdicf_equal()
1408 strcmp(left->utf8nfdi, right->utf8nfdi) == 0) in nfdicf_equal()
1410 if (left->utf8nfdi || right->utf8nfdi) in nfdicf_equal()
1420 leaf->code, leaf->ccc, leaf->gen); in nfdi_print()
1422 if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) in nfdi_print()
1424 else if (leaf->utf8nfdi) in nfdi_print()
1425 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); in nfdi_print()
1435 leaf->code, leaf->ccc, leaf->gen); in nfdicf_print()
1437 if (leaf->utf8nfdicf) in nfdicf_print()
1438 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); in nfdicf_print()
1439 else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) in nfdicf_print()
1441 else if (leaf->utf8nfdi) in nfdicf_print()
1442 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); in nfdicf_print()
1455 if (leaf->utf8nfdicf) in nfdicf_mark()
1464 return leaf->correction; in correction_mark()
1472 if (HANGUL_SYLLABLE(leaf->code)) in nfdi_size()
1474 else if (leaf->utf8nfdi) in nfdi_size()
1475 size += strlen(leaf->utf8nfdi) + 1; in nfdi_size()
1484 if (HANGUL_SYLLABLE(leaf->code)) in nfdicf_size()
1486 else if (leaf->utf8nfdicf) in nfdicf_size()
1487 size += strlen(leaf->utf8nfdicf) + 1; in nfdicf_size()
1488 else if (leaf->utf8nfdi) in nfdicf_size()
1489 size += strlen(leaf->utf8nfdi) + 1; in nfdicf_size()
1497 return &tree->leafindex[leaf->code]; in nfdi_index()
1504 return &tree->leafindex[leaf->code]; in nfdicf_index()
1512 *data++ = leaf->gen; in nfdi_emit()
1514 if (HANGUL_SYLLABLE(leaf->code)) { in nfdi_emit()
1517 } else if (leaf->utf8nfdi) { in nfdi_emit()
1519 s = (unsigned char*)leaf->utf8nfdi; in nfdi_emit()
1523 *data++ = leaf->ccc; in nfdi_emit()
1533 *data++ = leaf->gen; in nfdicf_emit()
1535 if (HANGUL_SYLLABLE(leaf->code)) { in nfdicf_emit()
1538 } else if (leaf->utf8nfdicf) { in nfdicf_emit()
1540 s = (unsigned char*)leaf->utf8nfdicf; in nfdicf_emit()
1543 } else if (leaf->utf8nfdi) { in nfdicf_emit()
1545 s = (unsigned char*)leaf->utf8nfdi; in nfdicf_emit()
1549 *data++ = leaf->ccc; in nfdicf_emit()
1561 if (data->utf8nfdi) { in utf8_create()
1562 assert(data->utf8nfdi[0] == HANGUL); in utf8_create()
1567 um = data->utf32nfdi; in utf8_create()
1572 data->utf8nfdi = strdup(utf); in utf8_create()
1575 um = data->utf32nfdicf; in utf8_create()
1580 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) in utf8_create()
1581 data->utf8nfdicf = strdup(utf); in utf8_create()
1608 nextage = (unsigned int)-1; in trees_init()
1614 if (nextage < data->correction && in trees_init()
1615 data->correction < maxage) in trees_init()
1616 nextage = data->correction; in trees_init()
1627 nextage = (unsigned int)-1; in trees_init()
1630 trees[--count].maxage = maxage; in trees_init()
1631 trees[--count].maxage = maxage; in trees_init()
1635 if (nextage < data->correction && in trees_init()
1636 data->correction < maxage) in trees_init()
1637 nextage = data->correction; in trees_init()
1646 trees[i].maxage = ages[j-1]; in trees_init()
1650 trees[trees_count-2].next = &trees[trees_count-1]; in trees_init()
1651 trees[trees_count-1].leaf_mark = nfdi_mark; in trees_init()
1652 trees[trees_count-2].leaf_mark = nfdicf_mark; in trees_init()
1653 for (i = 0; i != trees_count-2; i += 2) { in trees_init()
1654 trees[i].next = &trees[trees_count-2]; in trees_init()
1656 trees[i+1].next = &trees[trees_count-1]; in trees_init()
1700 if (data->correction <= trees[i].maxage) in trees_populate()
1738 nfdi = utf8data + trees[trees_count-1].index; in trees_reduce()
1739 nfdicf = utf8data + trees[trees_count-2].index; in trees_reduce()
1741 nfdi_tree = &trees[trees_count-1]; in trees_reduce()
1742 nfdicf_tree = &trees[trees_count-2]; in trees_reduce()
1756 printf("Verifying %s_%x\n", tree->type, tree->maxage); in verify()
1757 nocf = strcmp(tree->type, "nfdicf"); in verify()
1762 if (data->correction <= tree->maxage) in verify()
1768 if (data->gen != -1) in verify()
1775 if (data->gen == -1) in verify()
1777 if (data->gen != LEAF_GEN(leaf)) in verify()
1780 if (HANGUL_SYLLABLE(data->code)) { in verify()
1781 if (data->utf8nfdi[0] != HANGUL) in verify()
1784 if (!data->utf8nfdi) { in verify()
1786 } else if (strcmp(data->utf8nfdi, in verify()
1791 if (!data->utf8nfdicf && in verify()
1792 !data->utf8nfdi) { in verify()
1794 } else if (data->utf8nfdicf) { in verify()
1795 if (strcmp(data->utf8nfdicf, in verify()
1798 } else if (strcmp(data->utf8nfdi, in verify()
1803 } else if (data->ccc != LEAF_CCC(leaf)) { in verify()
1809 " nfdi -> \"%s\"", in verify()
1810 unichar, data->code, data->gen, in verify()
1811 data->ccc, in verify()
1812 data->utf8nfdi); in verify()
1815 " nfdi -> \"%s\"", in verify()
1834 /* ------------------------------------------------------------------ */
1841 printf("normalization of UTF-8 strings. The trie is derived from\n"); in help()
1848 printf("\t- Apply unicode normalization form NFD.\n"); in help()
1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); in help()
1852 printf("\t- Apply unicode normalization form NFD.\n"); in help()
1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n"); in help()
1854 printf("\t- Apply a full casefold (C + F).\n"); in help()
1866 printf("\t-a %s\n", AGE_NAME); in help()
1867 printf("\t-c %s\n", CCC_NAME); in help()
1868 printf("\t-p %s\n", PROP_NAME); in help()
1869 printf("\t-d %s\n", DATA_NAME); in help()
1870 printf("\t-f %s\n", FOLD_NAME); in help()
1871 printf("\t-n %s\n", NORM_NAME); in help()
1874 printf("\t-t %s\n", TEST_NAME); in help()
1877 printf("\t-o %s\n", UTF8_NAME); in help()
1905 /* ------------------------------------------------------------------ */
1917 printf(" %X ->", unichar); in print_utf32nfdi()
1924 printf(" %X ->", unichar); in print_utf32nfdicf()
1929 /* ------------------------------------------------------------------ */
1986 ages[ages_count] = (unsigned int)-1; in age_init()
2019 count += 1 + last - first; in age_init()
2044 unicode_data[unichar].gen = -1; in age_init()
2120 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdi_init()
2152 /* decode the decomposition into UTF-32 */ in nfdi_init()
2181 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdicf_init()
2311 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in corrections_init()
2367 printf(" %X -> %s -> %s V%d_%d_%d\n", in corrections_init()
2379 /* ------------------------------------------------------------------ */
2398 * SIndex = s - SBase
2436 unsigned int nc = (vc * tc); in hangul_decompose() local
2437 /* unsigned int sc = (lc * nc); */ in hangul_decompose()
2449 unsigned int si = unichar - sb; in hangul_decompose() local
2450 unsigned int li = si / nc; in hangul_decompose()
2451 unsigned int vi = (si % nc) / tc; in hangul_decompose()
2452 unsigned int ti = si % tc; in hangul_decompose()
2492 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdi_decompose()
2547 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ in nfdicf_decompose()
2592 /* ------------------------------------------------------------------ */
2622 * SIndex = s - SBase
2658 #define NC (VC * TC) macro
2659 #define SC (LC * NC)
2664 unsigned int si; in utf8hangul() local
2670 /* Calculate the SI, LI, VI, and TI values. */ in utf8hangul()
2671 si = utf8decode(str) - SB; in utf8hangul()
2672 li = si / NC; in utf8hangul()
2673 vi = (si % NC) / TC; in utf8hangul()
2674 ti = si % TC; in utf8hangul()
2682 /* Add LPart, a 3-byte UTF-8 sequence. */ in utf8hangul()
2685 /* Add VPart, a 3-byte UTF-8 sequence. */ in utf8hangul()
2688 /* Add TPart if required, also a 3-byte UTF-8 sequence. */ in utf8hangul()
2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703 * is well-formed and corresponds to a known unicode code point. The
2704 * shorthand for this will be "is valid UTF-8 unicode".
2720 trie = utf8data + tree->index; in utf8nlookup()
2724 if (--len == 0) in utf8nlookup()
2735 while (--offlen) { in utf8nlookup()
2766 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is in utf8nlookup()
2768 * start of the sequence is at s-2. in utf8nlookup()
2771 trie = utf8hangul(s - 2, hangul); in utf8nlookup()
2784 return utf8nlookup(tree, hangul, s, (size_t)-1); in utf8lookup()
2788 * Return the number of bytes used by the current UTF-8 sequence.
2789 * Assumes the input points to the first byte of a valid UTF-8
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if only non-assigned code points are used.
2811 return -1; in utf8agemax()
2816 return -1; in utf8agemax()
2818 if (leaf_age <= tree->maxage && leaf_age > age) in utf8agemax()
2827 * Return -1 if s is not valid UTF-8 unicode.
2828 * Return 0 if non-assigned code points are used.
2838 return -1; in utf8agemin()
2839 age = tree->maxage; in utf8agemin()
2843 return -1; in utf8agemin()
2845 if (leaf_age <= tree->maxage && leaf_age < age) in utf8agemin()
2854 * Return -1 if s is not valid UTF-8 unicode.
2864 return -1; in utf8nagemax()
2869 return -1; in utf8nagemax()
2871 if (leaf_age <= tree->maxage && leaf_age > age) in utf8nagemax()
2873 len -= utf8clen(s); in utf8nagemax()
2881 * Return -1 if s is not valid UTF-8 unicode.
2891 return -1; in utf8nagemin()
2892 age = tree->maxage; in utf8nagemin()
2896 return -1; in utf8nagemin()
2898 if (leaf_age <= tree->maxage && leaf_age < age) in utf8nagemin()
2900 len -= utf8clen(s); in utf8nagemin()
2908 * Return -1 if s is not valid UTF-8 unicode.
2919 return -1; in utf8len()
2923 return -1; in utf8len()
2924 if (ages[LEAF_GEN(leaf)] > tree->maxage) in utf8len()
2937 * Return -1 if s is not valid UTF-8 unicode.
2946 return -1; in utf8nlen()
2950 return -1; in utf8nlen()
2951 if (ages[LEAF_GEN(leaf)] > tree->maxage) in utf8nlen()
2957 len -= utf8clen(s); in utf8nlen()
2988 * Returns -1 on error, 0 on success.
2994 return -1; in utf8ncursor()
2996 return -1; in utf8ncursor()
2997 u8c->tree = tree; in utf8ncursor()
2998 u8c->s = s; in utf8ncursor()
2999 u8c->p = NULL; in utf8ncursor()
3000 u8c->ss = NULL; in utf8ncursor()
3001 u8c->sp = NULL; in utf8ncursor()
3002 u8c->len = len; in utf8ncursor()
3003 u8c->slen = 0; in utf8ncursor()
3004 u8c->ccc = STOPPER; in utf8ncursor()
3005 u8c->nccc = STOPPER; in utf8ncursor()
3006 u8c->unichar = 0; in utf8ncursor()
3008 if (u8c->len != len) in utf8ncursor()
3009 return -1; in utf8ncursor()
3012 return -1; in utf8ncursor()
3019 * s : NUL-terminated string.
3023 * Returns -1 on error, 0 on success.
3027 return utf8ncursor(u8c, tree, s, (unsigned int)-1); in utf8cursor()
3033 * Returns the byte cast to an unsigned char on success, and -1 on failure.
3035 * The cursor keeps track of the location in the string in u8c->s.
3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038 * that bytes from a decomposition do not count against u8c->len.
3040 * Characters are emitted if they match the current CCC in u8c->ccc.
3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3047 * emitted and stores it in u8c->nccc, the second pass emits the
3053 * u8c->p != NULL -> a decomposition is being scanned.
3054 * u8c->ss != NULL -> this is a repeating scan.
3055 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3064 if (u8c->p && *u8c->s == '\0') { in utf8byte()
3065 u8c->s = u8c->p; in utf8byte()
3066 u8c->p = NULL; in utf8byte()
3069 /* Check for end-of-string. */ in utf8byte()
3070 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { in utf8byte()
3072 if (u8c->ccc == STOPPER) in utf8byte()
3074 /* End-of-string during a scan counts as a stopper. */ in utf8byte()
3077 } else if ((*u8c->s & 0xC0) == 0x80) { in utf8byte()
3079 if (!u8c->p) in utf8byte()
3080 u8c->len--; in utf8byte()
3081 return (unsigned char)*u8c->s++; in utf8byte()
3085 if (u8c->p) { in utf8byte()
3086 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); in utf8byte()
3088 leaf = utf8nlookup(u8c->tree, u8c->hangul, in utf8byte()
3089 u8c->s, u8c->len); in utf8byte()
3094 return -1; in utf8byte()
3097 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { in utf8byte()
3100 u8c->len -= utf8clen(u8c->s); in utf8byte()
3101 u8c->p = u8c->s + utf8clen(u8c->s); in utf8byte()
3102 u8c->s = LEAF_STR(leaf); in utf8byte()
3104 if (*u8c->s == '\0') { in utf8byte()
3105 if (u8c->ccc == STOPPER) in utf8byte()
3110 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); in utf8byte()
3113 u8c->unichar = utf8decode(u8c->s); in utf8byte()
3119 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) in utf8byte()
3120 u8c->nccc = ccc; in utf8byte()
3126 if (ccc == u8c->ccc) { in utf8byte()
3127 if (!u8c->p) in utf8byte()
3128 u8c->len--; in utf8byte()
3129 return (unsigned char)*u8c->s++; in utf8byte()
3134 if (u8c->nccc == STOPPER) { in utf8byte()
3140 assert(u8c->ccc == STOPPER); in utf8byte()
3141 u8c->ccc = MINCCC - 1; in utf8byte()
3142 u8c->nccc = ccc; in utf8byte()
3143 u8c->sp = u8c->p; in utf8byte()
3144 u8c->ss = u8c->s; in utf8byte()
3145 u8c->slen = u8c->len; in utf8byte()
3146 if (!u8c->p) in utf8byte()
3147 u8c->len -= utf8clen(u8c->s); in utf8byte()
3148 u8c->s += utf8clen(u8c->s); in utf8byte()
3151 if (!u8c->p) in utf8byte()
3152 u8c->len -= utf8clen(u8c->s); in utf8byte()
3153 u8c->s += utf8clen(u8c->s); in utf8byte()
3154 } else if (u8c->nccc != MAXCCC + 1) { in utf8byte()
3156 u8c->ccc = u8c->nccc; in utf8byte()
3157 u8c->nccc = MAXCCC + 1; in utf8byte()
3158 u8c->s = u8c->ss; in utf8byte()
3159 u8c->p = u8c->sp; in utf8byte()
3160 u8c->len = u8c->slen; in utf8byte()
3163 u8c->ccc = STOPPER; in utf8byte()
3164 u8c->nccc = STOPPER; in utf8byte()
3165 u8c->sp = NULL; in utf8byte()
3166 u8c->ss = NULL; in utf8byte()
3167 u8c->slen = 0; in utf8byte()
3172 /* ------------------------------------------------------------------ */
3181 /* First test: null-terminated string. */ in normalize_line()
3185 return -1; in normalize_line()
3188 return -1; in normalize_line()
3190 return -1; in normalize_line()
3192 return -1; in normalize_line()
3194 /* Second test: length-limited string. */ in normalize_line()
3197 s[strlen(s) + 1] = -1; in normalize_line()
3200 return -1; in normalize_line()
3203 return -1; in normalize_line()
3205 return -1; in normalize_line()
3207 return -1; in normalize_line()
3250 if (data->utf8nfdi && !*data->utf8nfdi) in normalization_test()
3259 printf("Line %s -> %s", buf0, buf1); in normalization_test()
3273 /* ------------------------------------------------------------------ */
3330 if (t < trees_count-1) in write_file()
3336 (j < utf8data_size -1 ? "," : "")); in write_file()
3360 /* ------------------------------------------------------------------ */
3369 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { in main()