1 /*
2 * Copyright (c) 2014 SGI.
3 * All rights reserved.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 */
18
19 /* Generator for a compact trie for unicode normalization */
20
21 #include <sys/types.h>
22 #include <stddef.h>
23 #include <stdlib.h>
24 #include <stdio.h>
25 #include <assert.h>
26 #include <string.h>
27 #include <unistd.h>
28 #include <errno.h>
29
30 /* Default names of the in- and output files. */
31
32 #define AGE_NAME "DerivedAge.txt"
33 #define CCC_NAME "DerivedCombiningClass.txt"
34 #define PROP_NAME "DerivedCoreProperties.txt"
35 #define DATA_NAME "UnicodeData.txt"
36 #define FOLD_NAME "CaseFolding.txt"
37 #define NORM_NAME "NormalizationCorrections.txt"
38 #define TEST_NAME "NormalizationTest.txt"
39 #define UTF8_NAME "utf8data.c"
40
41 const char *age_name = AGE_NAME;
42 const char *ccc_name = CCC_NAME;
43 const char *prop_name = PROP_NAME;
44 const char *data_name = DATA_NAME;
45 const char *fold_name = FOLD_NAME;
46 const char *norm_name = NORM_NAME;
47 const char *test_name = TEST_NAME;
48 const char *utf8_name = UTF8_NAME;
49
50 int verbose = 0;
51
52 /* An arbitrary line size limit on input lines. */
53
54 #define LINESIZE 1024
55 char line[LINESIZE];
56 char buf0[LINESIZE];
57 char buf1[LINESIZE];
58 char buf2[LINESIZE];
59 char buf3[LINESIZE];
60
61 const char *argv0;
62
63 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
64
65 /* ------------------------------------------------------------------ */
66
67 /*
68 * Unicode version numbers consist of three parts: major, minor, and a
69 * revision. These numbers are packed into an unsigned int to obtain
70 * a single version number.
71 *
72 * To save space in the generated trie, the unicode version is not
73 * stored directly, instead we calculate a generation number from the
74 * unicode versions seen in the DerivedAge file, and use that as an
75 * index into a table of unicode versions.
76 */
77 #define UNICODE_MAJ_SHIFT (16)
78 #define UNICODE_MIN_SHIFT (8)
79
80 #define UNICODE_MAJ_MAX ((unsigned short)-1)
81 #define UNICODE_MIN_MAX ((unsigned char)-1)
82 #define UNICODE_REV_MAX ((unsigned char)-1)
83
84 #define UNICODE_AGE(MAJ,MIN,REV) \
85 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
86 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
87 ((unsigned int)(REV)))
88
89 unsigned int *ages;
90 int ages_count;
91
92 unsigned int unicode_maxage;
93
age_valid(unsigned int major,unsigned int minor,unsigned int revision)94 static int age_valid(unsigned int major, unsigned int minor,
95 unsigned int revision)
96 {
97 if (major > UNICODE_MAJ_MAX)
98 return 0;
99 if (minor > UNICODE_MIN_MAX)
100 return 0;
101 if (revision > UNICODE_REV_MAX)
102 return 0;
103 return 1;
104 }
105
106 /* ------------------------------------------------------------------ */
107
108 /*
109 * utf8trie_t
110 *
111 * A compact binary tree, used to decode UTF-8 characters.
112 *
113 * Internal nodes are one byte for the node itself, and up to three
114 * bytes for an offset into the tree. The first byte contains the
115 * following information:
116 * NEXTBYTE - flag - advance to next byte if set
117 * BITNUM - 3 bit field - the bit number to tested
118 * OFFLEN - 2 bit field - number of bytes in the offset
119 * if offlen == 0 (non-branching node)
120 * RIGHTPATH - 1 bit field - set if the following node is for the
121 * right-hand path (tested bit is set)
122 * TRIENODE - 1 bit field - set if the following node is an internal
123 * node, otherwise it is a leaf node
124 * if offlen != 0 (branching node)
125 * LEFTNODE - 1 bit field - set if the left-hand node is internal
126 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
127 *
128 * Due to the way utf8 works, there cannot be branching nodes with
129 * NEXTBYTE set, and moreover those nodes always have a righthand
130 * descendant.
131 */
132 typedef unsigned char utf8trie_t;
133 #define BITNUM 0x07
134 #define NEXTBYTE 0x08
135 #define OFFLEN 0x30
136 #define OFFLEN_SHIFT 4
137 #define RIGHTPATH 0x40
138 #define TRIENODE 0x80
139 #define RIGHTNODE 0x40
140 #define LEFTNODE 0x80
141
142 /*
143 * utf8leaf_t
144 *
145 * The leaves of the trie are embedded in the trie, and so the same
146 * underlying datatype, unsigned char.
147 *
148 * leaf[0]: The unicode version, stored as a generation number that is
149 * an index into utf8agetab[]. With this we can filter code
150 * points based on the unicode version in which they were
151 * defined. The CCC of a non-defined code point is 0.
152 * leaf[1]: Canonical Combining Class. During normalization, we need
153 * to do a stable sort into ascending order of all characters
154 * with a non-zero CCC that occur between two characters with
155 * a CCC of 0, or at the begin or end of a string.
156 * The unicode standard guarantees that all CCC values are
157 * between 0 and 254 inclusive, which leaves 255 available as
158 * a special value.
159 * Code points with CCC 0 are known as stoppers.
160 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
161 * start of a NUL-terminated string that is the decomposition
162 * of the character.
163 * The CCC of a decomposable character is the same as the CCC
164 * of the first character of its decomposition.
165 * Some characters decompose as the empty string: these are
166 * characters with the Default_Ignorable_Code_Point property.
167 * These do affect normalization, as they all have CCC 0.
168 *
169 * The decompositions in the trie have been fully expanded.
170 *
171 * Casefolding, if applicable, is also done using decompositions.
172 */
173 typedef unsigned char utf8leaf_t;
174
175 #define LEAF_GEN(LEAF) ((LEAF)[0])
176 #define LEAF_CCC(LEAF) ((LEAF)[1])
177 #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
178
179 #define MAXGEN (255)
180
181 #define MINCCC (0)
182 #define MAXCCC (254)
183 #define STOPPER (0)
184 #define DECOMPOSE (255)
185 #define HANGUL ((char)(255))
186
187 #define UTF8HANGULLEAF (12)
188
189 struct tree;
190 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
191 const char *, size_t);
192 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
193
194 unsigned char *utf8data;
195 size_t utf8data_size;
196
197 utf8trie_t *nfdi;
198 utf8trie_t *nfdicf;
199
200 /* ------------------------------------------------------------------ */
201
202 /*
203 * UTF8 valid ranges.
204 *
205 * The UTF-8 encoding spreads the bits of a 32bit word over several
206 * bytes. This table gives the ranges that can be held and how they'd
207 * be represented.
208 *
209 * 0x00000000 0x0000007F: 0xxxxxxx
210 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
211 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
212 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
213 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
215 *
216 * There is an additional requirement on UTF-8, in that only the
217 * shortest representation of a 32bit value is to be used. A decoder
218 * must not decode sequences that do not satisfy this requirement.
219 * Thus the allowed ranges have a lower bound.
220 *
221 * 0x00000000 0x0000007F: 0xxxxxxx
222 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
223 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
224 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
225 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
227 *
228 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
229 * 17 planes of 65536 values. This limits the sequences actually seen
230 * even more, to just the following.
231 *
232 * 0 - 0x7f: 0 0x7f
233 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
234 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
235 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
236 *
237 * Even within those ranges not all values are allowed: the surrogates
238 * 0xd800 - 0xdfff should never be seen.
239 *
240 * Note that the longest sequence seen with valid usage is 4 bytes,
241 * the same a single UTF-32 character. This makes the UTF-8
242 * representation of Unicode strictly smaller than UTF-32.
243 *
244 * The shortest sequence requirement was introduced by:
245 * Corrigendum #1: UTF-8 Shortest Form
246 * It can be found here:
247 * http://www.unicode.org/versions/corrigendum1.html
248 *
249 */
250
251 #define UTF8_2_BITS 0xC0
252 #define UTF8_3_BITS 0xE0
253 #define UTF8_4_BITS 0xF0
254 #define UTF8_N_BITS 0x80
255 #define UTF8_2_MASK 0xE0
256 #define UTF8_3_MASK 0xF0
257 #define UTF8_4_MASK 0xF8
258 #define UTF8_N_MASK 0xC0
259 #define UTF8_V_MASK 0x3F
260 #define UTF8_V_SHIFT 6
261
utf8encode(char * str,unsigned int val)262 static int utf8encode(char *str, unsigned int val)
263 {
264 int len;
265
266 if (val < 0x80) {
267 str[0] = val;
268 len = 1;
269 } else if (val < 0x800) {
270 str[1] = val & UTF8_V_MASK;
271 str[1] |= UTF8_N_BITS;
272 val >>= UTF8_V_SHIFT;
273 str[0] = val;
274 str[0] |= UTF8_2_BITS;
275 len = 2;
276 } else if (val < 0x10000) {
277 str[2] = val & UTF8_V_MASK;
278 str[2] |= UTF8_N_BITS;
279 val >>= UTF8_V_SHIFT;
280 str[1] = val & UTF8_V_MASK;
281 str[1] |= UTF8_N_BITS;
282 val >>= UTF8_V_SHIFT;
283 str[0] = val;
284 str[0] |= UTF8_3_BITS;
285 len = 3;
286 } else if (val < 0x110000) {
287 str[3] = val & UTF8_V_MASK;
288 str[3] |= UTF8_N_BITS;
289 val >>= UTF8_V_SHIFT;
290 str[2] = val & UTF8_V_MASK;
291 str[2] |= UTF8_N_BITS;
292 val >>= UTF8_V_SHIFT;
293 str[1] = val & UTF8_V_MASK;
294 str[1] |= UTF8_N_BITS;
295 val >>= UTF8_V_SHIFT;
296 str[0] = val;
297 str[0] |= UTF8_4_BITS;
298 len = 4;
299 } else {
300 printf("%#x: illegal val\n", val);
301 len = 0;
302 }
303 return len;
304 }
305
utf8decode(const char * str)306 static unsigned int utf8decode(const char *str)
307 {
308 const unsigned char *s = (const unsigned char*)str;
309 unsigned int unichar = 0;
310
311 if (*s < 0x80) {
312 unichar = *s;
313 } else if (*s < UTF8_3_BITS) {
314 unichar = *s++ & 0x1F;
315 unichar <<= UTF8_V_SHIFT;
316 unichar |= *s & 0x3F;
317 } else if (*s < UTF8_4_BITS) {
318 unichar = *s++ & 0x0F;
319 unichar <<= UTF8_V_SHIFT;
320 unichar |= *s++ & 0x3F;
321 unichar <<= UTF8_V_SHIFT;
322 unichar |= *s & 0x3F;
323 } else {
324 unichar = *s++ & 0x0F;
325 unichar <<= UTF8_V_SHIFT;
326 unichar |= *s++ & 0x3F;
327 unichar <<= UTF8_V_SHIFT;
328 unichar |= *s++ & 0x3F;
329 unichar <<= UTF8_V_SHIFT;
330 unichar |= *s & 0x3F;
331 }
332 return unichar;
333 }
334
utf32valid(unsigned int unichar)335 static int utf32valid(unsigned int unichar)
336 {
337 return unichar < 0x110000;
338 }
339
340 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
341
342 #define NODE 1
343 #define LEAF 0
344
345 struct tree {
346 void *root;
347 int childnode;
348 const char *type;
349 unsigned int maxage;
350 struct tree *next;
351 int (*leaf_equal)(void *, void *);
352 void (*leaf_print)(void *, int);
353 int (*leaf_mark)(void *);
354 int (*leaf_size)(void *);
355 int *(*leaf_index)(struct tree *, void *);
356 unsigned char *(*leaf_emit)(void *, unsigned char *);
357 int leafindex[0x110000];
358 int index;
359 };
360
361 struct node {
362 int index;
363 int offset;
364 int mark;
365 int size;
366 struct node *parent;
367 void *left;
368 void *right;
369 unsigned char bitnum;
370 unsigned char nextbyte;
371 unsigned char leftnode;
372 unsigned char rightnode;
373 unsigned int keybits;
374 unsigned int keymask;
375 };
376
377 /*
378 * Example lookup function for a tree.
379 */
lookup(struct tree * tree,const char * key)380 static void *lookup(struct tree *tree, const char *key)
381 {
382 struct node *node;
383 void *leaf = NULL;
384
385 node = tree->root;
386 while (!leaf && node) {
387 if (node->nextbyte)
388 key++;
389 if (*key & (1 << (node->bitnum & 7))) {
390 /* Right leg */
391 if (node->rightnode == NODE) {
392 node = node->right;
393 } else if (node->rightnode == LEAF) {
394 leaf = node->right;
395 } else {
396 node = NULL;
397 }
398 } else {
399 /* Left leg */
400 if (node->leftnode == NODE) {
401 node = node->left;
402 } else if (node->leftnode == LEAF) {
403 leaf = node->left;
404 } else {
405 node = NULL;
406 }
407 }
408 }
409
410 return leaf;
411 }
412
413 /*
414 * A simple non-recursive tree walker: keep track of visits to the
415 * left and right branches in the leftmask and rightmask.
416 */
tree_walk(struct tree * tree)417 static void tree_walk(struct tree *tree)
418 {
419 struct node *node;
420 unsigned int leftmask;
421 unsigned int rightmask;
422 unsigned int bitmask;
423 int indent = 1;
424 int nodes, singletons, leaves;
425
426 nodes = singletons = leaves = 0;
427
428 printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
429 if (tree->childnode == LEAF) {
430 assert(tree->root);
431 tree->leaf_print(tree->root, indent);
432 leaves = 1;
433 } else {
434 assert(tree->childnode == NODE);
435 node = tree->root;
436 leftmask = rightmask = 0;
437 while (node) {
438 printf("%*snode @ %p bitnum %d nextbyte %d"
439 " left %p right %p mask %x bits %x\n",
440 indent, "", node,
441 node->bitnum, node->nextbyte,
442 node->left, node->right,
443 node->keymask, node->keybits);
444 nodes += 1;
445 if (!(node->left && node->right))
446 singletons += 1;
447
448 while (node) {
449 bitmask = 1 << node->bitnum;
450 if ((leftmask & bitmask) == 0) {
451 leftmask |= bitmask;
452 if (node->leftnode == LEAF) {
453 assert(node->left);
454 tree->leaf_print(node->left,
455 indent+1);
456 leaves += 1;
457 } else if (node->left) {
458 assert(node->leftnode == NODE);
459 indent += 1;
460 node = node->left;
461 break;
462 }
463 }
464 if ((rightmask & bitmask) == 0) {
465 rightmask |= bitmask;
466 if (node->rightnode == LEAF) {
467 assert(node->right);
468 tree->leaf_print(node->right,
469 indent+1);
470 leaves += 1;
471 } else if (node->right) {
472 assert(node->rightnode == NODE);
473 indent += 1;
474 node = node->right;
475 break;
476 }
477 }
478 leftmask &= ~bitmask;
479 rightmask &= ~bitmask;
480 node = node->parent;
481 indent -= 1;
482 }
483 }
484 }
485 printf("nodes %d leaves %d singletons %d\n",
486 nodes, leaves, singletons);
487 }
488
489 /*
490 * Allocate an initialize a new internal node.
491 */
alloc_node(struct node * parent)492 static struct node *alloc_node(struct node *parent)
493 {
494 struct node *node;
495 int bitnum;
496
497 node = malloc(sizeof(*node));
498 node->left = node->right = NULL;
499 node->parent = parent;
500 node->leftnode = NODE;
501 node->rightnode = NODE;
502 node->keybits = 0;
503 node->keymask = 0;
504 node->mark = 0;
505 node->index = 0;
506 node->offset = -1;
507 node->size = 4;
508
509 if (node->parent) {
510 bitnum = parent->bitnum;
511 if ((bitnum & 7) == 0) {
512 node->bitnum = bitnum + 7 + 8;
513 node->nextbyte = 1;
514 } else {
515 node->bitnum = bitnum - 1;
516 node->nextbyte = 0;
517 }
518 } else {
519 node->bitnum = 7;
520 node->nextbyte = 0;
521 }
522
523 return node;
524 }
525
526 /*
527 * Insert a new leaf into the tree, and collapse any subtrees that are
528 * fully populated and end in identical leaves. A nextbyte tagged
529 * internal node will not be removed to preserve the tree's integrity.
530 * Note that due to the structure of utf8, no nextbyte tagged node
531 * will be a candidate for removal.
532 */
insert(struct tree * tree,char * key,int keylen,void * leaf)533 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
534 {
535 struct node *node;
536 struct node *parent;
537 void **cursor;
538 int keybits;
539
540 assert(keylen >= 1 && keylen <= 4);
541
542 node = NULL;
543 cursor = &tree->root;
544 keybits = 8 * keylen;
545
546 /* Insert, creating path along the way. */
547 while (keybits) {
548 if (!*cursor)
549 *cursor = alloc_node(node);
550 node = *cursor;
551 if (node->nextbyte)
552 key++;
553 if (*key & (1 << (node->bitnum & 7)))
554 cursor = &node->right;
555 else
556 cursor = &node->left;
557 keybits--;
558 }
559 *cursor = leaf;
560
561 /* Merge subtrees if possible. */
562 while (node) {
563 if (*key & (1 << (node->bitnum & 7)))
564 node->rightnode = LEAF;
565 else
566 node->leftnode = LEAF;
567 if (node->nextbyte)
568 break;
569 if (node->leftnode == NODE || node->rightnode == NODE)
570 break;
571 assert(node->left);
572 assert(node->right);
573 /* Compare */
574 if (! tree->leaf_equal(node->left, node->right))
575 break;
576 /* Keep left, drop right leaf. */
577 leaf = node->left;
578 /* Check in parent */
579 parent = node->parent;
580 if (!parent) {
581 /* root of tree! */
582 tree->root = leaf;
583 tree->childnode = LEAF;
584 } else if (parent->left == node) {
585 parent->left = leaf;
586 parent->leftnode = LEAF;
587 if (parent->right) {
588 parent->keymask = 0;
589 parent->keybits = 0;
590 } else {
591 parent->keymask |= (1 << node->bitnum);
592 }
593 } else if (parent->right == node) {
594 parent->right = leaf;
595 parent->rightnode = LEAF;
596 if (parent->left) {
597 parent->keymask = 0;
598 parent->keybits = 0;
599 } else {
600 parent->keymask |= (1 << node->bitnum);
601 parent->keybits |= (1 << node->bitnum);
602 }
603 } else {
604 /* internal tree error */
605 assert(0);
606 }
607 free(node);
608 node = parent;
609 }
610
611 /* Propagate keymasks up along singleton chains. */
612 while (node) {
613 parent = node->parent;
614 if (!parent)
615 break;
616 /* Nix the mask for parents with two children. */
617 if (node->keymask == 0) {
618 parent->keymask = 0;
619 parent->keybits = 0;
620 } else if (parent->left && parent->right) {
621 parent->keymask = 0;
622 parent->keybits = 0;
623 } else {
624 assert((parent->keymask & node->keymask) == 0);
625 parent->keymask |= node->keymask;
626 parent->keymask |= (1 << parent->bitnum);
627 parent->keybits |= node->keybits;
628 if (parent->right)
629 parent->keybits |= (1 << parent->bitnum);
630 }
631 node = parent;
632 }
633
634 return 0;
635 }
636
637 /*
638 * Prune internal nodes.
639 *
640 * Fully populated subtrees that end at the same leaf have already
641 * been collapsed. There are still internal nodes that have for both
642 * their left and right branches a sequence of singletons that make
643 * identical choices and end in identical leaves. The keymask and
644 * keybits collected in the nodes describe the choices made in these
645 * singleton chains. When they are identical for the left and right
646 * branch of a node, and the two leaves comare identical, the node in
647 * question can be removed.
648 *
649 * Note that nodes with the nextbyte tag set will not be removed by
650 * this to ensure tree integrity. Note as well that the structure of
651 * utf8 ensures that these nodes would not have been candidates for
652 * removal in any case.
653 */
prune(struct tree * tree)654 static void prune(struct tree *tree)
655 {
656 struct node *node;
657 struct node *left;
658 struct node *right;
659 struct node *parent;
660 void *leftleaf;
661 void *rightleaf;
662 unsigned int leftmask;
663 unsigned int rightmask;
664 unsigned int bitmask;
665 int count;
666
667 if (verbose > 0)
668 printf("Pruning %s_%x\n", tree->type, tree->maxage);
669
670 count = 0;
671 if (tree->childnode == LEAF)
672 return;
673 if (!tree->root)
674 return;
675
676 leftmask = rightmask = 0;
677 node = tree->root;
678 while (node) {
679 if (node->nextbyte)
680 goto advance;
681 if (node->leftnode == LEAF)
682 goto advance;
683 if (node->rightnode == LEAF)
684 goto advance;
685 if (!node->left)
686 goto advance;
687 if (!node->right)
688 goto advance;
689 left = node->left;
690 right = node->right;
691 if (left->keymask == 0)
692 goto advance;
693 if (right->keymask == 0)
694 goto advance;
695 if (left->keymask != right->keymask)
696 goto advance;
697 if (left->keybits != right->keybits)
698 goto advance;
699 leftleaf = NULL;
700 while (!leftleaf) {
701 assert(left->left || left->right);
702 if (left->leftnode == LEAF)
703 leftleaf = left->left;
704 else if (left->rightnode == LEAF)
705 leftleaf = left->right;
706 else if (left->left)
707 left = left->left;
708 else if (left->right)
709 left = left->right;
710 else
711 assert(0);
712 }
713 rightleaf = NULL;
714 while (!rightleaf) {
715 assert(right->left || right->right);
716 if (right->leftnode == LEAF)
717 rightleaf = right->left;
718 else if (right->rightnode == LEAF)
719 rightleaf = right->right;
720 else if (right->left)
721 right = right->left;
722 else if (right->right)
723 right = right->right;
724 else
725 assert(0);
726 }
727 if (! tree->leaf_equal(leftleaf, rightleaf))
728 goto advance;
729 /*
730 * This node has identical singleton-only subtrees.
731 * Remove it.
732 */
733 parent = node->parent;
734 left = node->left;
735 right = node->right;
736 if (parent->left == node)
737 parent->left = left;
738 else if (parent->right == node)
739 parent->right = left;
740 else
741 assert(0);
742 left->parent = parent;
743 left->keymask |= (1 << node->bitnum);
744 node->left = NULL;
745 while (node) {
746 bitmask = 1 << node->bitnum;
747 leftmask &= ~bitmask;
748 rightmask &= ~bitmask;
749 if (node->leftnode == NODE && node->left) {
750 left = node->left;
751 free(node);
752 count++;
753 node = left;
754 } else if (node->rightnode == NODE && node->right) {
755 right = node->right;
756 free(node);
757 count++;
758 node = right;
759 } else {
760 node = NULL;
761 }
762 }
763 /* Propagate keymasks up along singleton chains. */
764 node = parent;
765 /* Force re-check */
766 bitmask = 1 << node->bitnum;
767 leftmask &= ~bitmask;
768 rightmask &= ~bitmask;
769 for (;;) {
770 if (node->left && node->right)
771 break;
772 if (node->left) {
773 left = node->left;
774 node->keymask |= left->keymask;
775 node->keybits |= left->keybits;
776 }
777 if (node->right) {
778 right = node->right;
779 node->keymask |= right->keymask;
780 node->keybits |= right->keybits;
781 }
782 node->keymask |= (1 << node->bitnum);
783 node = node->parent;
784 /* Force re-check */
785 bitmask = 1 << node->bitnum;
786 leftmask &= ~bitmask;
787 rightmask &= ~bitmask;
788 }
789 advance:
790 bitmask = 1 << node->bitnum;
791 if ((leftmask & bitmask) == 0 &&
792 node->leftnode == NODE &&
793 node->left) {
794 leftmask |= bitmask;
795 node = node->left;
796 } else if ((rightmask & bitmask) == 0 &&
797 node->rightnode == NODE &&
798 node->right) {
799 rightmask |= bitmask;
800 node = node->right;
801 } else {
802 leftmask &= ~bitmask;
803 rightmask &= ~bitmask;
804 node = node->parent;
805 }
806 }
807 if (verbose > 0)
808 printf("Pruned %d nodes\n", count);
809 }
810
811 /*
812 * Mark the nodes in the tree that lead to leaves that must be
813 * emitted.
814 */
mark_nodes(struct tree * tree)815 static void mark_nodes(struct tree *tree)
816 {
817 struct node *node;
818 struct node *n;
819 unsigned int leftmask;
820 unsigned int rightmask;
821 unsigned int bitmask;
822 int marked;
823
824 marked = 0;
825 if (verbose > 0)
826 printf("Marking %s_%x\n", tree->type, tree->maxage);
827 if (tree->childnode == LEAF)
828 goto done;
829
830 assert(tree->childnode == NODE);
831 node = tree->root;
832 leftmask = rightmask = 0;
833 while (node) {
834 bitmask = 1 << node->bitnum;
835 if ((leftmask & bitmask) == 0) {
836 leftmask |= bitmask;
837 if (node->leftnode == LEAF) {
838 assert(node->left);
839 if (tree->leaf_mark(node->left)) {
840 n = node;
841 while (n && !n->mark) {
842 marked++;
843 n->mark = 1;
844 n = n->parent;
845 }
846 }
847 } else if (node->left) {
848 assert(node->leftnode == NODE);
849 node = node->left;
850 continue;
851 }
852 }
853 if ((rightmask & bitmask) == 0) {
854 rightmask |= bitmask;
855 if (node->rightnode == LEAF) {
856 assert(node->right);
857 if (tree->leaf_mark(node->right)) {
858 n = node;
859 while (n && !n->mark) {
860 marked++;
861 n->mark = 1;
862 n = n->parent;
863 }
864 }
865 } else if (node->right) {
866 assert(node->rightnode == NODE);
867 node = node->right;
868 continue;
869 }
870 }
871 leftmask &= ~bitmask;
872 rightmask &= ~bitmask;
873 node = node->parent;
874 }
875
876 /* second pass: left siblings and singletons */
877
878 assert(tree->childnode == NODE);
879 node = tree->root;
880 leftmask = rightmask = 0;
881 while (node) {
882 bitmask = 1 << node->bitnum;
883 if ((leftmask & bitmask) == 0) {
884 leftmask |= bitmask;
885 if (node->leftnode == LEAF) {
886 assert(node->left);
887 if (tree->leaf_mark(node->left)) {
888 n = node;
889 while (n && !n->mark) {
890 marked++;
891 n->mark = 1;
892 n = n->parent;
893 }
894 }
895 } else if (node->left) {
896 assert(node->leftnode == NODE);
897 node = node->left;
898 if (!node->mark && node->parent->mark) {
899 marked++;
900 node->mark = 1;
901 }
902 continue;
903 }
904 }
905 if ((rightmask & bitmask) == 0) {
906 rightmask |= bitmask;
907 if (node->rightnode == LEAF) {
908 assert(node->right);
909 if (tree->leaf_mark(node->right)) {
910 n = node;
911 while (n && !n->mark) {
912 marked++;
913 n->mark = 1;
914 n = n->parent;
915 }
916 }
917 } else if (node->right) {
918 assert(node->rightnode == NODE);
919 node = node->right;
920 if (!node->mark && node->parent->mark &&
921 !node->parent->left) {
922 marked++;
923 node->mark = 1;
924 }
925 continue;
926 }
927 }
928 leftmask &= ~bitmask;
929 rightmask &= ~bitmask;
930 node = node->parent;
931 }
932 done:
933 if (verbose > 0)
934 printf("Marked %d nodes\n", marked);
935 }
936
937 /*
938 * Compute the index of each node and leaf, which is the offset in the
939 * emitted trie. These values must be pre-computed because relative
940 * offsets between nodes are used to navigate the tree.
941 */
index_nodes(struct tree * tree,int index)942 static int index_nodes(struct tree *tree, int index)
943 {
944 struct node *node;
945 unsigned int leftmask;
946 unsigned int rightmask;
947 unsigned int bitmask;
948 int count;
949 int indent;
950
951 /* Align to a cache line (or half a cache line?). */
952 while (index % 64)
953 index++;
954 tree->index = index;
955 indent = 1;
956 count = 0;
957
958 if (verbose > 0)
959 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
960 if (tree->childnode == LEAF) {
961 index += tree->leaf_size(tree->root);
962 goto done;
963 }
964
965 assert(tree->childnode == NODE);
966 node = tree->root;
967 leftmask = rightmask = 0;
968 while (node) {
969 if (!node->mark)
970 goto skip;
971 count++;
972 if (node->index != index)
973 node->index = index;
974 index += node->size;
975 skip:
976 while (node) {
977 bitmask = 1 << node->bitnum;
978 if (node->mark && (leftmask & bitmask) == 0) {
979 leftmask |= bitmask;
980 if (node->leftnode == LEAF) {
981 assert(node->left);
982 *tree->leaf_index(tree, node->left) =
983 index;
984 index += tree->leaf_size(node->left);
985 count++;
986 } else if (node->left) {
987 assert(node->leftnode == NODE);
988 indent += 1;
989 node = node->left;
990 break;
991 }
992 }
993 if (node->mark && (rightmask & bitmask) == 0) {
994 rightmask |= bitmask;
995 if (node->rightnode == LEAF) {
996 assert(node->right);
997 *tree->leaf_index(tree, node->right) = index;
998 index += tree->leaf_size(node->right);
999 count++;
1000 } else if (node->right) {
1001 assert(node->rightnode == NODE);
1002 indent += 1;
1003 node = node->right;
1004 break;
1005 }
1006 }
1007 leftmask &= ~bitmask;
1008 rightmask &= ~bitmask;
1009 node = node->parent;
1010 indent -= 1;
1011 }
1012 }
1013 done:
1014 /* Round up to a multiple of 16 */
1015 while (index % 16)
1016 index++;
1017 if (verbose > 0)
1018 printf("Final index %d\n", index);
1019 return index;
1020 }
1021
1022 /*
1023 * Mark the nodes in a subtree, helper for size_nodes().
1024 */
mark_subtree(struct node * node)1025 static int mark_subtree(struct node *node)
1026 {
1027 int changed;
1028
1029 if (!node || node->mark)
1030 return 0;
1031 node->mark = 1;
1032 node->index = node->parent->index;
1033 changed = 1;
1034 if (node->leftnode == NODE)
1035 changed += mark_subtree(node->left);
1036 if (node->rightnode == NODE)
1037 changed += mark_subtree(node->right);
1038 return changed;
1039 }
1040
1041 /*
1042 * Compute the size of nodes and leaves. We start by assuming that
1043 * each node needs to store a three-byte offset. The indexes of the
1044 * nodes are calculated based on that, and then this function is
1045 * called to see if the sizes of some nodes can be reduced. This is
1046 * repeated until no more changes are seen.
1047 */
size_nodes(struct tree * tree)1048 static int size_nodes(struct tree *tree)
1049 {
1050 struct tree *next;
1051 struct node *node;
1052 struct node *right;
1053 struct node *n;
1054 unsigned int leftmask;
1055 unsigned int rightmask;
1056 unsigned int bitmask;
1057 unsigned int pathbits;
1058 unsigned int pathmask;
1059 unsigned int nbit;
1060 int changed;
1061 int offset;
1062 int size;
1063 int indent;
1064
1065 indent = 1;
1066 changed = 0;
1067 size = 0;
1068
1069 if (verbose > 0)
1070 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1071 if (tree->childnode == LEAF)
1072 goto done;
1073
1074 assert(tree->childnode == NODE);
1075 pathbits = 0;
1076 pathmask = 0;
1077 node = tree->root;
1078 leftmask = rightmask = 0;
1079 while (node) {
1080 if (!node->mark)
1081 goto skip;
1082 offset = 0;
1083 if (!node->left || !node->right) {
1084 size = 1;
1085 } else {
1086 if (node->rightnode == NODE) {
1087 /*
1088 * If the right node is not marked,
1089 * look for a corresponding node in
1090 * the next tree. Such a node need
1091 * not exist.
1092 */
1093 right = node->right;
1094 next = tree->next;
1095 while (!right->mark) {
1096 assert(next);
1097 n = next->root;
1098 while (n->bitnum != node->bitnum) {
1099 nbit = 1 << n->bitnum;
1100 if (!(pathmask & nbit))
1101 break;
1102 if (pathbits & nbit) {
1103 if (n->rightnode == LEAF)
1104 break;
1105 n = n->right;
1106 } else {
1107 if (n->leftnode == LEAF)
1108 break;
1109 n = n->left;
1110 }
1111 }
1112 if (n->bitnum != node->bitnum)
1113 break;
1114 n = n->right;
1115 right = n;
1116 next = next->next;
1117 }
1118 /* Make sure the right node is marked. */
1119 if (!right->mark)
1120 changed += mark_subtree(right);
1121 offset = right->index - node->index;
1122 } else {
1123 offset = *tree->leaf_index(tree, node->right);
1124 offset -= node->index;
1125 }
1126 assert(offset >= 0);
1127 assert(offset <= 0xffffff);
1128 if (offset <= 0xff) {
1129 size = 2;
1130 } else if (offset <= 0xffff) {
1131 size = 3;
1132 } else { /* offset <= 0xffffff */
1133 size = 4;
1134 }
1135 }
1136 if (node->size != size || node->offset != offset) {
1137 node->size = size;
1138 node->offset = offset;
1139 changed++;
1140 }
1141 skip:
1142 while (node) {
1143 bitmask = 1 << node->bitnum;
1144 pathmask |= bitmask;
1145 if (node->mark && (leftmask & bitmask) == 0) {
1146 leftmask |= bitmask;
1147 if (node->leftnode == LEAF) {
1148 assert(node->left);
1149 } else if (node->left) {
1150 assert(node->leftnode == NODE);
1151 indent += 1;
1152 node = node->left;
1153 break;
1154 }
1155 }
1156 if (node->mark && (rightmask & bitmask) == 0) {
1157 rightmask |= bitmask;
1158 pathbits |= bitmask;
1159 if (node->rightnode == LEAF) {
1160 assert(node->right);
1161 } else if (node->right) {
1162 assert(node->rightnode == NODE);
1163 indent += 1;
1164 node = node->right;
1165 break;
1166 }
1167 }
1168 leftmask &= ~bitmask;
1169 rightmask &= ~bitmask;
1170 pathmask &= ~bitmask;
1171 pathbits &= ~bitmask;
1172 node = node->parent;
1173 indent -= 1;
1174 }
1175 }
1176 done:
1177 if (verbose > 0)
1178 printf("Found %d changes\n", changed);
1179 return changed;
1180 }
1181
1182 /*
1183 * Emit a trie for the given tree into the data array.
1184 */
emit(struct tree * tree,unsigned char * data)1185 static void emit(struct tree *tree, unsigned char *data)
1186 {
1187 struct node *node;
1188 unsigned int leftmask;
1189 unsigned int rightmask;
1190 unsigned int bitmask;
1191 int offlen;
1192 int offset;
1193 int index;
1194 int indent;
1195 int size;
1196 int bytes;
1197 int leaves;
1198 int nodes[4];
1199 unsigned char byte;
1200
1201 nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1202 leaves = 0;
1203 bytes = 0;
1204 index = tree->index;
1205 data += index;
1206 indent = 1;
1207 if (verbose > 0)
1208 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1209 if (tree->childnode == LEAF) {
1210 assert(tree->root);
1211 tree->leaf_emit(tree->root, data);
1212 size = tree->leaf_size(tree->root);
1213 index += size;
1214 leaves++;
1215 goto done;
1216 }
1217
1218 assert(tree->childnode == NODE);
1219 node = tree->root;
1220 leftmask = rightmask = 0;
1221 while (node) {
1222 if (!node->mark)
1223 goto skip;
1224 assert(node->offset != -1);
1225 assert(node->index == index);
1226
1227 byte = 0;
1228 if (node->nextbyte)
1229 byte |= NEXTBYTE;
1230 byte |= (node->bitnum & BITNUM);
1231 if (node->left && node->right) {
1232 if (node->leftnode == NODE)
1233 byte |= LEFTNODE;
1234 if (node->rightnode == NODE)
1235 byte |= RIGHTNODE;
1236 if (node->offset <= 0xff)
1237 offlen = 1;
1238 else if (node->offset <= 0xffff)
1239 offlen = 2;
1240 else
1241 offlen = 3;
1242 nodes[offlen]++;
1243 offset = node->offset;
1244 byte |= offlen << OFFLEN_SHIFT;
1245 *data++ = byte;
1246 index++;
1247 while (offlen--) {
1248 *data++ = offset & 0xff;
1249 index++;
1250 offset >>= 8;
1251 }
1252 } else if (node->left) {
1253 if (node->leftnode == NODE)
1254 byte |= TRIENODE;
1255 nodes[0]++;
1256 *data++ = byte;
1257 index++;
1258 } else if (node->right) {
1259 byte |= RIGHTNODE;
1260 if (node->rightnode == NODE)
1261 byte |= TRIENODE;
1262 nodes[0]++;
1263 *data++ = byte;
1264 index++;
1265 } else {
1266 assert(0);
1267 }
1268 skip:
1269 while (node) {
1270 bitmask = 1 << node->bitnum;
1271 if (node->mark && (leftmask & bitmask) == 0) {
1272 leftmask |= bitmask;
1273 if (node->leftnode == LEAF) {
1274 assert(node->left);
1275 data = tree->leaf_emit(node->left,
1276 data);
1277 size = tree->leaf_size(node->left);
1278 index += size;
1279 bytes += size;
1280 leaves++;
1281 } else if (node->left) {
1282 assert(node->leftnode == NODE);
1283 indent += 1;
1284 node = node->left;
1285 break;
1286 }
1287 }
1288 if (node->mark && (rightmask & bitmask) == 0) {
1289 rightmask |= bitmask;
1290 if (node->rightnode == LEAF) {
1291 assert(node->right);
1292 data = tree->leaf_emit(node->right,
1293 data);
1294 size = tree->leaf_size(node->right);
1295 index += size;
1296 bytes += size;
1297 leaves++;
1298 } else if (node->right) {
1299 assert(node->rightnode == NODE);
1300 indent += 1;
1301 node = node->right;
1302 break;
1303 }
1304 }
1305 leftmask &= ~bitmask;
1306 rightmask &= ~bitmask;
1307 node = node->parent;
1308 indent -= 1;
1309 }
1310 }
1311 done:
1312 if (verbose > 0) {
1313 printf("Emitted %d (%d) leaves",
1314 leaves, bytes);
1315 printf(" %d (%d+%d+%d+%d) nodes",
1316 nodes[0] + nodes[1] + nodes[2] + nodes[3],
1317 nodes[0], nodes[1], nodes[2], nodes[3]);
1318 printf(" %d total\n", index - tree->index);
1319 }
1320 }
1321
1322 /* ------------------------------------------------------------------ */
1323
1324 /*
1325 * Unicode data.
1326 *
1327 * We need to keep track of the Canonical Combining Class, the Age,
1328 * and decompositions for a code point.
1329 *
1330 * For the Age, we store the index into the ages table. Effectively
1331 * this is a generation number that the table maps to a unicode
1332 * version.
1333 *
1334 * The correction field is used to indicate that this entry is in the
1335 * corrections array, which contains decompositions that were
1336 * corrected in later revisions. The value of the correction field is
1337 * the Unicode version in which the mapping was corrected.
1338 */
1339 struct unicode_data {
1340 unsigned int code;
1341 int ccc;
1342 int gen;
1343 int correction;
1344 unsigned int *utf32nfdi;
1345 unsigned int *utf32nfdicf;
1346 char *utf8nfdi;
1347 char *utf8nfdicf;
1348 };
1349
1350 struct unicode_data unicode_data[0x110000];
1351 struct unicode_data *corrections;
1352 int corrections_count;
1353
1354 struct tree *nfdi_tree;
1355 struct tree *nfdicf_tree;
1356
1357 struct tree *trees;
1358 int trees_count;
1359
1360 /*
1361 * Check the corrections array to see if this entry was corrected at
1362 * some point.
1363 */
corrections_lookup(struct unicode_data * u)1364 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1365 {
1366 int i;
1367
1368 for (i = 0; i != corrections_count; i++)
1369 if (u->code == corrections[i].code)
1370 return &corrections[i];
1371 return u;
1372 }
1373
nfdi_equal(void * l,void * r)1374 static int nfdi_equal(void *l, void *r)
1375 {
1376 struct unicode_data *left = l;
1377 struct unicode_data *right = r;
1378
1379 if (left->gen != right->gen)
1380 return 0;
1381 if (left->ccc != right->ccc)
1382 return 0;
1383 if (left->utf8nfdi && right->utf8nfdi &&
1384 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1385 return 1;
1386 if (left->utf8nfdi || right->utf8nfdi)
1387 return 0;
1388 return 1;
1389 }
1390
nfdicf_equal(void * l,void * r)1391 static int nfdicf_equal(void *l, void *r)
1392 {
1393 struct unicode_data *left = l;
1394 struct unicode_data *right = r;
1395
1396 if (left->gen != right->gen)
1397 return 0;
1398 if (left->ccc != right->ccc)
1399 return 0;
1400 if (left->utf8nfdicf && right->utf8nfdicf &&
1401 strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0)
1402 return 1;
1403 if (left->utf8nfdicf && right->utf8nfdicf)
1404 return 0;
1405 if (left->utf8nfdicf || right->utf8nfdicf)
1406 return 0;
1407 if (left->utf8nfdi && right->utf8nfdi &&
1408 strcmp(left->utf8nfdi, right->utf8nfdi) == 0)
1409 return 1;
1410 if (left->utf8nfdi || right->utf8nfdi)
1411 return 0;
1412 return 1;
1413 }
1414
nfdi_print(void * l,int indent)1415 static void nfdi_print(void *l, int indent)
1416 {
1417 struct unicode_data *leaf = l;
1418
1419 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1420 leaf->code, leaf->ccc, leaf->gen);
1421
1422 if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1423 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1424 else if (leaf->utf8nfdi)
1425 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1426
1427 printf("\n");
1428 }
1429
nfdicf_print(void * l,int indent)1430 static void nfdicf_print(void *l, int indent)
1431 {
1432 struct unicode_data *leaf = l;
1433
1434 printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1435 leaf->code, leaf->ccc, leaf->gen);
1436
1437 if (leaf->utf8nfdicf)
1438 printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf);
1439 else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL)
1440 printf(" nfdi \"%s\"", "HANGUL SYLLABLE");
1441 else if (leaf->utf8nfdi)
1442 printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi);
1443 printf("\n");
1444 }
1445
nfdi_mark(void * l)1446 static int nfdi_mark(void *l)
1447 {
1448 return 1;
1449 }
1450
nfdicf_mark(void * l)1451 static int nfdicf_mark(void *l)
1452 {
1453 struct unicode_data *leaf = l;
1454
1455 if (leaf->utf8nfdicf)
1456 return 1;
1457 return 0;
1458 }
1459
correction_mark(void * l)1460 static int correction_mark(void *l)
1461 {
1462 struct unicode_data *leaf = l;
1463
1464 return leaf->correction;
1465 }
1466
nfdi_size(void * l)1467 static int nfdi_size(void *l)
1468 {
1469 struct unicode_data *leaf = l;
1470 int size = 2;
1471
1472 if (HANGUL_SYLLABLE(leaf->code))
1473 size += 1;
1474 else if (leaf->utf8nfdi)
1475 size += strlen(leaf->utf8nfdi) + 1;
1476 return size;
1477 }
1478
nfdicf_size(void * l)1479 static int nfdicf_size(void *l)
1480 {
1481 struct unicode_data *leaf = l;
1482 int size = 2;
1483
1484 if (HANGUL_SYLLABLE(leaf->code))
1485 size += 1;
1486 else if (leaf->utf8nfdicf)
1487 size += strlen(leaf->utf8nfdicf) + 1;
1488 else if (leaf->utf8nfdi)
1489 size += strlen(leaf->utf8nfdi) + 1;
1490 return size;
1491 }
1492
nfdi_index(struct tree * tree,void * l)1493 static int *nfdi_index(struct tree *tree, void *l)
1494 {
1495 struct unicode_data *leaf = l;
1496
1497 return &tree->leafindex[leaf->code];
1498 }
1499
nfdicf_index(struct tree * tree,void * l)1500 static int *nfdicf_index(struct tree *tree, void *l)
1501 {
1502 struct unicode_data *leaf = l;
1503
1504 return &tree->leafindex[leaf->code];
1505 }
1506
nfdi_emit(void * l,unsigned char * data)1507 static unsigned char *nfdi_emit(void *l, unsigned char *data)
1508 {
1509 struct unicode_data *leaf = l;
1510 unsigned char *s;
1511
1512 *data++ = leaf->gen;
1513
1514 if (HANGUL_SYLLABLE(leaf->code)) {
1515 *data++ = DECOMPOSE;
1516 *data++ = HANGUL;
1517 } else if (leaf->utf8nfdi) {
1518 *data++ = DECOMPOSE;
1519 s = (unsigned char*)leaf->utf8nfdi;
1520 while ((*data++ = *s++) != 0)
1521 ;
1522 } else {
1523 *data++ = leaf->ccc;
1524 }
1525 return data;
1526 }
1527
nfdicf_emit(void * l,unsigned char * data)1528 static unsigned char *nfdicf_emit(void *l, unsigned char *data)
1529 {
1530 struct unicode_data *leaf = l;
1531 unsigned char *s;
1532
1533 *data++ = leaf->gen;
1534
1535 if (HANGUL_SYLLABLE(leaf->code)) {
1536 *data++ = DECOMPOSE;
1537 *data++ = HANGUL;
1538 } else if (leaf->utf8nfdicf) {
1539 *data++ = DECOMPOSE;
1540 s = (unsigned char*)leaf->utf8nfdicf;
1541 while ((*data++ = *s++) != 0)
1542 ;
1543 } else if (leaf->utf8nfdi) {
1544 *data++ = DECOMPOSE;
1545 s = (unsigned char*)leaf->utf8nfdi;
1546 while ((*data++ = *s++) != 0)
1547 ;
1548 } else {
1549 *data++ = leaf->ccc;
1550 }
1551 return data;
1552 }
1553
utf8_create(struct unicode_data * data)1554 static void utf8_create(struct unicode_data *data)
1555 {
1556 char utf[18*4+1];
1557 char *u;
1558 unsigned int *um;
1559 int i;
1560
1561 if (data->utf8nfdi) {
1562 assert(data->utf8nfdi[0] == HANGUL);
1563 return;
1564 }
1565
1566 u = utf;
1567 um = data->utf32nfdi;
1568 if (um) {
1569 for (i = 0; um[i]; i++)
1570 u += utf8encode(u, um[i]);
1571 *u = '\0';
1572 data->utf8nfdi = strdup(utf);
1573 }
1574 u = utf;
1575 um = data->utf32nfdicf;
1576 if (um) {
1577 for (i = 0; um[i]; i++)
1578 u += utf8encode(u, um[i]);
1579 *u = '\0';
1580 if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf))
1581 data->utf8nfdicf = strdup(utf);
1582 }
1583 }
1584
utf8_init(void)1585 static void utf8_init(void)
1586 {
1587 unsigned int unichar;
1588 int i;
1589
1590 for (unichar = 0; unichar != 0x110000; unichar++)
1591 utf8_create(&unicode_data[unichar]);
1592
1593 for (i = 0; i != corrections_count; i++)
1594 utf8_create(&corrections[i]);
1595 }
1596
trees_init(void)1597 static void trees_init(void)
1598 {
1599 struct unicode_data *data;
1600 unsigned int maxage;
1601 unsigned int nextage;
1602 int count;
1603 int i;
1604 int j;
1605
1606 /* Count the number of different ages. */
1607 count = 0;
1608 nextage = (unsigned int)-1;
1609 do {
1610 maxage = nextage;
1611 nextage = 0;
1612 for (i = 0; i <= corrections_count; i++) {
1613 data = &corrections[i];
1614 if (nextage < data->correction &&
1615 data->correction < maxage)
1616 nextage = data->correction;
1617 }
1618 count++;
1619 } while (nextage);
1620
1621 /* Two trees per age: nfdi and nfdicf */
1622 trees_count = count * 2;
1623 trees = calloc(trees_count, sizeof(struct tree));
1624
1625 /* Assign ages to the trees. */
1626 count = trees_count;
1627 nextage = (unsigned int)-1;
1628 do {
1629 maxage = nextage;
1630 trees[--count].maxage = maxage;
1631 trees[--count].maxage = maxage;
1632 nextage = 0;
1633 for (i = 0; i <= corrections_count; i++) {
1634 data = &corrections[i];
1635 if (nextage < data->correction &&
1636 data->correction < maxage)
1637 nextage = data->correction;
1638 }
1639 } while (nextage);
1640
1641 /* The ages assigned above are off by one. */
1642 for (i = 0; i != trees_count; i++) {
1643 j = 0;
1644 while (ages[j] < trees[i].maxage)
1645 j++;
1646 trees[i].maxage = ages[j-1];
1647 }
1648
1649 /* Set up the forwarding between trees. */
1650 trees[trees_count-2].next = &trees[trees_count-1];
1651 trees[trees_count-1].leaf_mark = nfdi_mark;
1652 trees[trees_count-2].leaf_mark = nfdicf_mark;
1653 for (i = 0; i != trees_count-2; i += 2) {
1654 trees[i].next = &trees[trees_count-2];
1655 trees[i].leaf_mark = correction_mark;
1656 trees[i+1].next = &trees[trees_count-1];
1657 trees[i+1].leaf_mark = correction_mark;
1658 }
1659
1660 /* Assign the callouts. */
1661 for (i = 0; i != trees_count; i += 2) {
1662 trees[i].type = "nfdicf";
1663 trees[i].leaf_equal = nfdicf_equal;
1664 trees[i].leaf_print = nfdicf_print;
1665 trees[i].leaf_size = nfdicf_size;
1666 trees[i].leaf_index = nfdicf_index;
1667 trees[i].leaf_emit = nfdicf_emit;
1668
1669 trees[i+1].type = "nfdi";
1670 trees[i+1].leaf_equal = nfdi_equal;
1671 trees[i+1].leaf_print = nfdi_print;
1672 trees[i+1].leaf_size = nfdi_size;
1673 trees[i+1].leaf_index = nfdi_index;
1674 trees[i+1].leaf_emit = nfdi_emit;
1675 }
1676
1677 /* Finish init. */
1678 for (i = 0; i != trees_count; i++)
1679 trees[i].childnode = NODE;
1680 }
1681
trees_populate(void)1682 static void trees_populate(void)
1683 {
1684 struct unicode_data *data;
1685 unsigned int unichar;
1686 char keyval[4];
1687 int keylen;
1688 int i;
1689
1690 for (i = 0; i != trees_count; i++) {
1691 if (verbose > 0) {
1692 printf("Populating %s_%x\n",
1693 trees[i].type, trees[i].maxage);
1694 }
1695 for (unichar = 0; unichar != 0x110000; unichar++) {
1696 if (unicode_data[unichar].gen < 0)
1697 continue;
1698 keylen = utf8encode(keyval, unichar);
1699 data = corrections_lookup(&unicode_data[unichar]);
1700 if (data->correction <= trees[i].maxage)
1701 data = &unicode_data[unichar];
1702 insert(&trees[i], keyval, keylen, data);
1703 }
1704 }
1705 }
1706
trees_reduce(void)1707 static void trees_reduce(void)
1708 {
1709 int i;
1710 int size;
1711 int changed;
1712
1713 for (i = 0; i != trees_count; i++)
1714 prune(&trees[i]);
1715 for (i = 0; i != trees_count; i++)
1716 mark_nodes(&trees[i]);
1717 do {
1718 size = 0;
1719 for (i = 0; i != trees_count; i++)
1720 size = index_nodes(&trees[i], size);
1721 changed = 0;
1722 for (i = 0; i != trees_count; i++)
1723 changed += size_nodes(&trees[i]);
1724 } while (changed);
1725
1726 utf8data = calloc(size, 1);
1727 utf8data_size = size;
1728 for (i = 0; i != trees_count; i++)
1729 emit(&trees[i], utf8data);
1730
1731 if (verbose > 0) {
1732 for (i = 0; i != trees_count; i++) {
1733 printf("%s_%x idx %d\n",
1734 trees[i].type, trees[i].maxage, trees[i].index);
1735 }
1736 }
1737
1738 nfdi = utf8data + trees[trees_count-1].index;
1739 nfdicf = utf8data + trees[trees_count-2].index;
1740
1741 nfdi_tree = &trees[trees_count-1];
1742 nfdicf_tree = &trees[trees_count-2];
1743 }
1744
verify(struct tree * tree)1745 static void verify(struct tree *tree)
1746 {
1747 struct unicode_data *data;
1748 utf8leaf_t *leaf;
1749 unsigned int unichar;
1750 char key[4];
1751 unsigned char hangul[UTF8HANGULLEAF];
1752 int report;
1753 int nocf;
1754
1755 if (verbose > 0)
1756 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1757 nocf = strcmp(tree->type, "nfdicf");
1758
1759 for (unichar = 0; unichar != 0x110000; unichar++) {
1760 report = 0;
1761 data = corrections_lookup(&unicode_data[unichar]);
1762 if (data->correction <= tree->maxage)
1763 data = &unicode_data[unichar];
1764 utf8encode(key,unichar);
1765 leaf = utf8lookup(tree, hangul, key);
1766
1767 if (!leaf) {
1768 if (data->gen != -1)
1769 report++;
1770 if (unichar < 0xd800 || unichar > 0xdfff)
1771 report++;
1772 } else {
1773 if (unichar >= 0xd800 && unichar <= 0xdfff)
1774 report++;
1775 if (data->gen == -1)
1776 report++;
1777 if (data->gen != LEAF_GEN(leaf))
1778 report++;
1779 if (LEAF_CCC(leaf) == DECOMPOSE) {
1780 if (HANGUL_SYLLABLE(data->code)) {
1781 if (data->utf8nfdi[0] != HANGUL)
1782 report++;
1783 } else if (nocf) {
1784 if (!data->utf8nfdi) {
1785 report++;
1786 } else if (strcmp(data->utf8nfdi,
1787 LEAF_STR(leaf))) {
1788 report++;
1789 }
1790 } else {
1791 if (!data->utf8nfdicf &&
1792 !data->utf8nfdi) {
1793 report++;
1794 } else if (data->utf8nfdicf) {
1795 if (strcmp(data->utf8nfdicf,
1796 LEAF_STR(leaf)))
1797 report++;
1798 } else if (strcmp(data->utf8nfdi,
1799 LEAF_STR(leaf))) {
1800 report++;
1801 }
1802 }
1803 } else if (data->ccc != LEAF_CCC(leaf)) {
1804 report++;
1805 }
1806 }
1807 if (report) {
1808 printf("%X code %X gen %d ccc %d"
1809 " nfdi -> \"%s\"",
1810 unichar, data->code, data->gen,
1811 data->ccc,
1812 data->utf8nfdi);
1813 if (leaf) {
1814 printf(" gen %d ccc %d"
1815 " nfdi -> \"%s\"",
1816 LEAF_GEN(leaf),
1817 LEAF_CCC(leaf),
1818 LEAF_CCC(leaf) == DECOMPOSE ?
1819 LEAF_STR(leaf) : "");
1820 }
1821 printf("\n");
1822 }
1823 }
1824 }
1825
trees_verify(void)1826 static void trees_verify(void)
1827 {
1828 int i;
1829
1830 for (i = 0; i != trees_count; i++)
1831 verify(&trees[i]);
1832 }
1833
1834 /* ------------------------------------------------------------------ */
1835
help(void)1836 static void help(void)
1837 {
1838 printf("Usage: %s [options]\n", argv0);
1839 printf("\n");
1840 printf("This program creates an a data trie used for parsing and\n");
1841 printf("normalization of UTF-8 strings. The trie is derived from\n");
1842 printf("a set of input files from the Unicode character database\n");
1843 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1844 printf("\n");
1845 printf("The generated tree supports two normalization forms:\n");
1846 printf("\n");
1847 printf("\tnfdi:\n");
1848 printf("\t- Apply unicode normalization form NFD.\n");
1849 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1850 printf("\n");
1851 printf("\tnfdicf:\n");
1852 printf("\t- Apply unicode normalization form NFD.\n");
1853 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1854 printf("\t- Apply a full casefold (C + F).\n");
1855 printf("\n");
1856 printf("These forms were chosen as being most useful when dealing\n");
1857 printf("with file names: NFD catches most cases where characters\n");
1858 printf("should be considered equivalent. The ignorables are mostly\n");
1859 printf("invisible, making names hard to type.\n");
1860 printf("\n");
1861 printf("The options to specify the files to be used are listed\n");
1862 printf("below with their default values, which are the names used\n");
1863 printf("by version 11.0.0 of the Unicode Character Database.\n");
1864 printf("\n");
1865 printf("The input files:\n");
1866 printf("\t-a %s\n", AGE_NAME);
1867 printf("\t-c %s\n", CCC_NAME);
1868 printf("\t-p %s\n", PROP_NAME);
1869 printf("\t-d %s\n", DATA_NAME);
1870 printf("\t-f %s\n", FOLD_NAME);
1871 printf("\t-n %s\n", NORM_NAME);
1872 printf("\n");
1873 printf("Additionally, the generated tables are tested using:\n");
1874 printf("\t-t %s\n", TEST_NAME);
1875 printf("\n");
1876 printf("Finally, the output file:\n");
1877 printf("\t-o %s\n", UTF8_NAME);
1878 printf("\n");
1879 }
1880
usage(void)1881 static void usage(void)
1882 {
1883 help();
1884 exit(1);
1885 }
1886
open_fail(const char * name,int error)1887 static void open_fail(const char *name, int error)
1888 {
1889 printf("Error %d opening %s: %s\n", error, name, strerror(error));
1890 exit(1);
1891 }
1892
file_fail(const char * filename)1893 static void file_fail(const char *filename)
1894 {
1895 printf("Error parsing %s\n", filename);
1896 exit(1);
1897 }
1898
line_fail(const char * filename,const char * line)1899 static void line_fail(const char *filename, const char *line)
1900 {
1901 printf("Error parsing %s:%s\n", filename, line);
1902 exit(1);
1903 }
1904
1905 /* ------------------------------------------------------------------ */
1906
print_utf32(unsigned int * utf32str)1907 static void print_utf32(unsigned int *utf32str)
1908 {
1909 int i;
1910
1911 for (i = 0; utf32str[i]; i++)
1912 printf(" %X", utf32str[i]);
1913 }
1914
print_utf32nfdi(unsigned int unichar)1915 static void print_utf32nfdi(unsigned int unichar)
1916 {
1917 printf(" %X ->", unichar);
1918 print_utf32(unicode_data[unichar].utf32nfdi);
1919 printf("\n");
1920 }
1921
print_utf32nfdicf(unsigned int unichar)1922 static void print_utf32nfdicf(unsigned int unichar)
1923 {
1924 printf(" %X ->", unichar);
1925 print_utf32(unicode_data[unichar].utf32nfdicf);
1926 printf("\n");
1927 }
1928
1929 /* ------------------------------------------------------------------ */
1930
age_init(void)1931 static void age_init(void)
1932 {
1933 FILE *file;
1934 unsigned int first;
1935 unsigned int last;
1936 unsigned int unichar;
1937 unsigned int major;
1938 unsigned int minor;
1939 unsigned int revision;
1940 int gen;
1941 int count;
1942 int ret;
1943
1944 if (verbose > 0)
1945 printf("Parsing %s\n", age_name);
1946
1947 file = fopen(age_name, "r");
1948 if (!file)
1949 open_fail(age_name, errno);
1950 count = 0;
1951
1952 gen = 0;
1953 while (fgets(line, LINESIZE, file)) {
1954 ret = sscanf(line, "# Age=V%d_%d_%d",
1955 &major, &minor, &revision);
1956 if (ret == 3) {
1957 ages_count++;
1958 if (verbose > 1)
1959 printf(" Age V%d_%d_%d\n",
1960 major, minor, revision);
1961 if (!age_valid(major, minor, revision))
1962 line_fail(age_name, line);
1963 continue;
1964 }
1965 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1966 if (ret == 2) {
1967 ages_count++;
1968 if (verbose > 1)
1969 printf(" Age V%d_%d\n", major, minor);
1970 if (!age_valid(major, minor, 0))
1971 line_fail(age_name, line);
1972 continue;
1973 }
1974 }
1975
1976 /* We must have found something above. */
1977 if (verbose > 1)
1978 printf("%d age entries\n", ages_count);
1979 if (ages_count == 0 || ages_count > MAXGEN)
1980 file_fail(age_name);
1981
1982 /* There is a 0 entry. */
1983 ages_count++;
1984 ages = calloc(ages_count + 1, sizeof(*ages));
1985 /* And a guard entry. */
1986 ages[ages_count] = (unsigned int)-1;
1987
1988 rewind(file);
1989 count = 0;
1990 gen = 0;
1991 while (fgets(line, LINESIZE, file)) {
1992 ret = sscanf(line, "# Age=V%d_%d_%d",
1993 &major, &minor, &revision);
1994 if (ret == 3) {
1995 ages[++gen] =
1996 UNICODE_AGE(major, minor, revision);
1997 if (verbose > 1)
1998 printf(" Age V%d_%d_%d = gen %d\n",
1999 major, minor, revision, gen);
2000 if (!age_valid(major, minor, revision))
2001 line_fail(age_name, line);
2002 continue;
2003 }
2004 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
2005 if (ret == 2) {
2006 ages[++gen] = UNICODE_AGE(major, minor, 0);
2007 if (verbose > 1)
2008 printf(" Age V%d_%d = %d\n",
2009 major, minor, gen);
2010 if (!age_valid(major, minor, 0))
2011 line_fail(age_name, line);
2012 continue;
2013 }
2014 ret = sscanf(line, "%X..%X ; %d.%d #",
2015 &first, &last, &major, &minor);
2016 if (ret == 4) {
2017 for (unichar = first; unichar <= last; unichar++)
2018 unicode_data[unichar].gen = gen;
2019 count += 1 + last - first;
2020 if (verbose > 1)
2021 printf(" %X..%X gen %d\n", first, last, gen);
2022 if (!utf32valid(first) || !utf32valid(last))
2023 line_fail(age_name, line);
2024 continue;
2025 }
2026 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2027 if (ret == 3) {
2028 unicode_data[unichar].gen = gen;
2029 count++;
2030 if (verbose > 1)
2031 printf(" %X gen %d\n", unichar, gen);
2032 if (!utf32valid(unichar))
2033 line_fail(age_name, line);
2034 continue;
2035 }
2036 }
2037 unicode_maxage = ages[gen];
2038 fclose(file);
2039
2040 /* Nix surrogate block */
2041 if (verbose > 1)
2042 printf(" Removing surrogate block D800..DFFF\n");
2043 for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2044 unicode_data[unichar].gen = -1;
2045
2046 if (verbose > 0)
2047 printf("Found %d entries\n", count);
2048 if (count == 0)
2049 file_fail(age_name);
2050 }
2051
ccc_init(void)2052 static void ccc_init(void)
2053 {
2054 FILE *file;
2055 unsigned int first;
2056 unsigned int last;
2057 unsigned int unichar;
2058 unsigned int value;
2059 int count;
2060 int ret;
2061
2062 if (verbose > 0)
2063 printf("Parsing %s\n", ccc_name);
2064
2065 file = fopen(ccc_name, "r");
2066 if (!file)
2067 open_fail(ccc_name, errno);
2068
2069 count = 0;
2070 while (fgets(line, LINESIZE, file)) {
2071 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2072 if (ret == 3) {
2073 for (unichar = first; unichar <= last; unichar++) {
2074 unicode_data[unichar].ccc = value;
2075 count++;
2076 }
2077 if (verbose > 1)
2078 printf(" %X..%X ccc %d\n", first, last, value);
2079 if (!utf32valid(first) || !utf32valid(last))
2080 line_fail(ccc_name, line);
2081 continue;
2082 }
2083 ret = sscanf(line, "%X ; %d #", &unichar, &value);
2084 if (ret == 2) {
2085 unicode_data[unichar].ccc = value;
2086 count++;
2087 if (verbose > 1)
2088 printf(" %X ccc %d\n", unichar, value);
2089 if (!utf32valid(unichar))
2090 line_fail(ccc_name, line);
2091 continue;
2092 }
2093 }
2094 fclose(file);
2095
2096 if (verbose > 0)
2097 printf("Found %d entries\n", count);
2098 if (count == 0)
2099 file_fail(ccc_name);
2100 }
2101
ignore_compatibility_form(char * type)2102 static int ignore_compatibility_form(char *type)
2103 {
2104 int i;
2105 char *ignored_types[] = {"font", "noBreak", "initial", "medial",
2106 "final", "isolated", "circle", "super",
2107 "sub", "vertical", "wide", "narrow",
2108 "small", "square", "fraction", "compat"};
2109
2110 for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++)
2111 if (strcmp(type, ignored_types[i]) == 0)
2112 return 1;
2113 return 0;
2114 }
2115
nfdi_init(void)2116 static void nfdi_init(void)
2117 {
2118 FILE *file;
2119 unsigned int unichar;
2120 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2121 char *s;
2122 char *type;
2123 unsigned int *um;
2124 int count;
2125 int i;
2126 int ret;
2127
2128 if (verbose > 0)
2129 printf("Parsing %s\n", data_name);
2130 file = fopen(data_name, "r");
2131 if (!file)
2132 open_fail(data_name, errno);
2133
2134 count = 0;
2135 while (fgets(line, LINESIZE, file)) {
2136 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2137 &unichar, buf0);
2138 if (ret != 2)
2139 continue;
2140 if (!utf32valid(unichar))
2141 line_fail(data_name, line);
2142
2143 s = buf0;
2144 /* skip over <tag> */
2145 if (*s == '<') {
2146 type = ++s;
2147 while (*++s != '>');
2148 *s++ = '\0';
2149 if(ignore_compatibility_form(type))
2150 continue;
2151 }
2152 /* decode the decomposition into UTF-32 */
2153 i = 0;
2154 while (*s) {
2155 mapping[i] = strtoul(s, &s, 16);
2156 if (!utf32valid(mapping[i]))
2157 line_fail(data_name, line);
2158 i++;
2159 }
2160 mapping[i++] = 0;
2161
2162 um = malloc(i * sizeof(unsigned int));
2163 memcpy(um, mapping, i * sizeof(unsigned int));
2164 unicode_data[unichar].utf32nfdi = um;
2165
2166 if (verbose > 1)
2167 print_utf32nfdi(unichar);
2168 count++;
2169 }
2170 fclose(file);
2171 if (verbose > 0)
2172 printf("Found %d entries\n", count);
2173 if (count == 0)
2174 file_fail(data_name);
2175 }
2176
nfdicf_init(void)2177 static void nfdicf_init(void)
2178 {
2179 FILE *file;
2180 unsigned int unichar;
2181 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2182 char status;
2183 char *s;
2184 unsigned int *um;
2185 int i;
2186 int count;
2187 int ret;
2188
2189 if (verbose > 0)
2190 printf("Parsing %s\n", fold_name);
2191 file = fopen(fold_name, "r");
2192 if (!file)
2193 open_fail(fold_name, errno);
2194
2195 count = 0;
2196 while (fgets(line, LINESIZE, file)) {
2197 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2198 if (ret != 3)
2199 continue;
2200 if (!utf32valid(unichar))
2201 line_fail(fold_name, line);
2202 /* Use the C+F casefold. */
2203 if (status != 'C' && status != 'F')
2204 continue;
2205 s = buf0;
2206 if (*s == '<')
2207 while (*s++ != ' ')
2208 ;
2209 i = 0;
2210 while (*s) {
2211 mapping[i] = strtoul(s, &s, 16);
2212 if (!utf32valid(mapping[i]))
2213 line_fail(fold_name, line);
2214 i++;
2215 }
2216 mapping[i++] = 0;
2217
2218 um = malloc(i * sizeof(unsigned int));
2219 memcpy(um, mapping, i * sizeof(unsigned int));
2220 unicode_data[unichar].utf32nfdicf = um;
2221
2222 if (verbose > 1)
2223 print_utf32nfdicf(unichar);
2224 count++;
2225 }
2226 fclose(file);
2227 if (verbose > 0)
2228 printf("Found %d entries\n", count);
2229 if (count == 0)
2230 file_fail(fold_name);
2231 }
2232
ignore_init(void)2233 static void ignore_init(void)
2234 {
2235 FILE *file;
2236 unsigned int unichar;
2237 unsigned int first;
2238 unsigned int last;
2239 unsigned int *um;
2240 int count;
2241 int ret;
2242
2243 if (verbose > 0)
2244 printf("Parsing %s\n", prop_name);
2245 file = fopen(prop_name, "r");
2246 if (!file)
2247 open_fail(prop_name, errno);
2248 assert(file);
2249 count = 0;
2250 while (fgets(line, LINESIZE, file)) {
2251 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2252 if (ret == 3) {
2253 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2254 continue;
2255 if (!utf32valid(first) || !utf32valid(last))
2256 line_fail(prop_name, line);
2257 for (unichar = first; unichar <= last; unichar++) {
2258 free(unicode_data[unichar].utf32nfdi);
2259 um = malloc(sizeof(unsigned int));
2260 *um = 0;
2261 unicode_data[unichar].utf32nfdi = um;
2262 free(unicode_data[unichar].utf32nfdicf);
2263 um = malloc(sizeof(unsigned int));
2264 *um = 0;
2265 unicode_data[unichar].utf32nfdicf = um;
2266 count++;
2267 }
2268 if (verbose > 1)
2269 printf(" %X..%X Default_Ignorable_Code_Point\n",
2270 first, last);
2271 continue;
2272 }
2273 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2274 if (ret == 2) {
2275 if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2276 continue;
2277 if (!utf32valid(unichar))
2278 line_fail(prop_name, line);
2279 free(unicode_data[unichar].utf32nfdi);
2280 um = malloc(sizeof(unsigned int));
2281 *um = 0;
2282 unicode_data[unichar].utf32nfdi = um;
2283 free(unicode_data[unichar].utf32nfdicf);
2284 um = malloc(sizeof(unsigned int));
2285 *um = 0;
2286 unicode_data[unichar].utf32nfdicf = um;
2287 if (verbose > 1)
2288 printf(" %X Default_Ignorable_Code_Point\n",
2289 unichar);
2290 count++;
2291 continue;
2292 }
2293 }
2294 fclose(file);
2295
2296 if (verbose > 0)
2297 printf("Found %d entries\n", count);
2298 if (count == 0)
2299 file_fail(prop_name);
2300 }
2301
corrections_init(void)2302 static void corrections_init(void)
2303 {
2304 FILE *file;
2305 unsigned int unichar;
2306 unsigned int major;
2307 unsigned int minor;
2308 unsigned int revision;
2309 unsigned int age;
2310 unsigned int *um;
2311 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2312 char *s;
2313 int i;
2314 int count;
2315 int ret;
2316
2317 if (verbose > 0)
2318 printf("Parsing %s\n", norm_name);
2319 file = fopen(norm_name, "r");
2320 if (!file)
2321 open_fail(norm_name, errno);
2322
2323 count = 0;
2324 while (fgets(line, LINESIZE, file)) {
2325 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2326 &unichar, buf0, buf1,
2327 &major, &minor, &revision);
2328 if (ret != 6)
2329 continue;
2330 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2331 line_fail(norm_name, line);
2332 count++;
2333 }
2334 corrections = calloc(count, sizeof(struct unicode_data));
2335 corrections_count = count;
2336 rewind(file);
2337
2338 count = 0;
2339 while (fgets(line, LINESIZE, file)) {
2340 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2341 &unichar, buf0, buf1,
2342 &major, &minor, &revision);
2343 if (ret != 6)
2344 continue;
2345 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2346 line_fail(norm_name, line);
2347 corrections[count] = unicode_data[unichar];
2348 assert(corrections[count].code == unichar);
2349 age = UNICODE_AGE(major, minor, revision);
2350 corrections[count].correction = age;
2351
2352 i = 0;
2353 s = buf0;
2354 while (*s) {
2355 mapping[i] = strtoul(s, &s, 16);
2356 if (!utf32valid(mapping[i]))
2357 line_fail(norm_name, line);
2358 i++;
2359 }
2360 mapping[i++] = 0;
2361
2362 um = malloc(i * sizeof(unsigned int));
2363 memcpy(um, mapping, i * sizeof(unsigned int));
2364 corrections[count].utf32nfdi = um;
2365
2366 if (verbose > 1)
2367 printf(" %X -> %s -> %s V%d_%d_%d\n",
2368 unichar, buf0, buf1, major, minor, revision);
2369 count++;
2370 }
2371 fclose(file);
2372
2373 if (verbose > 0)
2374 printf("Found %d entries\n", count);
2375 if (count == 0)
2376 file_fail(norm_name);
2377 }
2378
2379 /* ------------------------------------------------------------------ */
2380
2381 /*
2382 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2383 *
2384 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2385 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2386 *
2387 * SBase = 0xAC00
2388 * LBase = 0x1100
2389 * VBase = 0x1161
2390 * TBase = 0x11A7
2391 * LCount = 19
2392 * VCount = 21
2393 * TCount = 28
2394 * NCount = 588 (VCount * TCount)
2395 * SCount = 11172 (LCount * NCount)
2396 *
2397 * Decomposition:
2398 * SIndex = s - SBase
2399 *
2400 * LV (Canonical/Full)
2401 * LIndex = SIndex / NCount
2402 * VIndex = (Sindex % NCount) / TCount
2403 * LPart = LBase + LIndex
2404 * VPart = VBase + VIndex
2405 *
2406 * LVT (Canonical)
2407 * LVIndex = (SIndex / TCount) * TCount
2408 * TIndex = (Sindex % TCount)
2409 * LVPart = SBase + LVIndex
2410 * TPart = TBase + TIndex
2411 *
2412 * LVT (Full)
2413 * LIndex = SIndex / NCount
2414 * VIndex = (Sindex % NCount) / TCount
2415 * TIndex = (Sindex % TCount)
2416 * LPart = LBase + LIndex
2417 * VPart = VBase + VIndex
2418 * if (TIndex == 0) {
2419 * d = <LPart, VPart>
2420 * } else {
2421 * TPart = TBase + TIndex
2422 * d = <LPart, VPart, TPart>
2423 * }
2424 *
2425 */
2426
hangul_decompose(void)2427 static void hangul_decompose(void)
2428 {
2429 unsigned int sb = 0xAC00;
2430 unsigned int lb = 0x1100;
2431 unsigned int vb = 0x1161;
2432 unsigned int tb = 0x11a7;
2433 /* unsigned int lc = 19; */
2434 unsigned int vc = 21;
2435 unsigned int tc = 28;
2436 unsigned int nc = (vc * tc);
2437 /* unsigned int sc = (lc * nc); */
2438 unsigned int unichar;
2439 unsigned int mapping[4];
2440 unsigned int *um;
2441 int count;
2442 int i;
2443
2444 if (verbose > 0)
2445 printf("Decomposing hangul\n");
2446 /* Hangul */
2447 count = 0;
2448 for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2449 unsigned int si = unichar - sb;
2450 unsigned int li = si / nc;
2451 unsigned int vi = (si % nc) / tc;
2452 unsigned int ti = si % tc;
2453
2454 i = 0;
2455 mapping[i++] = lb + li;
2456 mapping[i++] = vb + vi;
2457 if (ti)
2458 mapping[i++] = tb + ti;
2459 mapping[i++] = 0;
2460
2461 assert(!unicode_data[unichar].utf32nfdi);
2462 um = malloc(i * sizeof(unsigned int));
2463 memcpy(um, mapping, i * sizeof(unsigned int));
2464 unicode_data[unichar].utf32nfdi = um;
2465
2466 assert(!unicode_data[unichar].utf32nfdicf);
2467 um = malloc(i * sizeof(unsigned int));
2468 memcpy(um, mapping, i * sizeof(unsigned int));
2469 unicode_data[unichar].utf32nfdicf = um;
2470
2471 /*
2472 * Add a cookie as a reminder that the hangul syllable
2473 * decompositions must not be stored in the generated
2474 * trie.
2475 */
2476 unicode_data[unichar].utf8nfdi = malloc(2);
2477 unicode_data[unichar].utf8nfdi[0] = HANGUL;
2478 unicode_data[unichar].utf8nfdi[1] = '\0';
2479
2480 if (verbose > 1)
2481 print_utf32nfdi(unichar);
2482
2483 count++;
2484 }
2485 if (verbose > 0)
2486 printf("Created %d entries\n", count);
2487 }
2488
nfdi_decompose(void)2489 static void nfdi_decompose(void)
2490 {
2491 unsigned int unichar;
2492 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2493 unsigned int *um;
2494 unsigned int *dc;
2495 int count;
2496 int i;
2497 int j;
2498 int ret;
2499
2500 if (verbose > 0)
2501 printf("Decomposing nfdi\n");
2502
2503 count = 0;
2504 for (unichar = 0; unichar != 0x110000; unichar++) {
2505 if (!unicode_data[unichar].utf32nfdi)
2506 continue;
2507 for (;;) {
2508 ret = 1;
2509 i = 0;
2510 um = unicode_data[unichar].utf32nfdi;
2511 while (*um) {
2512 dc = unicode_data[*um].utf32nfdi;
2513 if (dc) {
2514 for (j = 0; dc[j]; j++)
2515 mapping[i++] = dc[j];
2516 ret = 0;
2517 } else {
2518 mapping[i++] = *um;
2519 }
2520 um++;
2521 }
2522 mapping[i++] = 0;
2523 if (ret)
2524 break;
2525 free(unicode_data[unichar].utf32nfdi);
2526 um = malloc(i * sizeof(unsigned int));
2527 memcpy(um, mapping, i * sizeof(unsigned int));
2528 unicode_data[unichar].utf32nfdi = um;
2529 }
2530 /* Add this decomposition to nfdicf if there is no entry. */
2531 if (!unicode_data[unichar].utf32nfdicf) {
2532 um = malloc(i * sizeof(unsigned int));
2533 memcpy(um, mapping, i * sizeof(unsigned int));
2534 unicode_data[unichar].utf32nfdicf = um;
2535 }
2536 if (verbose > 1)
2537 print_utf32nfdi(unichar);
2538 count++;
2539 }
2540 if (verbose > 0)
2541 printf("Processed %d entries\n", count);
2542 }
2543
nfdicf_decompose(void)2544 static void nfdicf_decompose(void)
2545 {
2546 unsigned int unichar;
2547 unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2548 unsigned int *um;
2549 unsigned int *dc;
2550 int count;
2551 int i;
2552 int j;
2553 int ret;
2554
2555 if (verbose > 0)
2556 printf("Decomposing nfdicf\n");
2557 count = 0;
2558 for (unichar = 0; unichar != 0x110000; unichar++) {
2559 if (!unicode_data[unichar].utf32nfdicf)
2560 continue;
2561 for (;;) {
2562 ret = 1;
2563 i = 0;
2564 um = unicode_data[unichar].utf32nfdicf;
2565 while (*um) {
2566 dc = unicode_data[*um].utf32nfdicf;
2567 if (dc) {
2568 for (j = 0; dc[j]; j++)
2569 mapping[i++] = dc[j];
2570 ret = 0;
2571 } else {
2572 mapping[i++] = *um;
2573 }
2574 um++;
2575 }
2576 mapping[i++] = 0;
2577 if (ret)
2578 break;
2579 free(unicode_data[unichar].utf32nfdicf);
2580 um = malloc(i * sizeof(unsigned int));
2581 memcpy(um, mapping, i * sizeof(unsigned int));
2582 unicode_data[unichar].utf32nfdicf = um;
2583 }
2584 if (verbose > 1)
2585 print_utf32nfdicf(unichar);
2586 count++;
2587 }
2588 if (verbose > 0)
2589 printf("Processed %d entries\n", count);
2590 }
2591
2592 /* ------------------------------------------------------------------ */
2593
2594 int utf8agemax(struct tree *, const char *);
2595 int utf8nagemax(struct tree *, const char *, size_t);
2596 int utf8agemin(struct tree *, const char *);
2597 int utf8nagemin(struct tree *, const char *, size_t);
2598 ssize_t utf8len(struct tree *, const char *);
2599 ssize_t utf8nlen(struct tree *, const char *, size_t);
2600 struct utf8cursor;
2601 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2602 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2603 int utf8byte(struct utf8cursor *);
2604
2605 /*
2606 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2607 *
2608 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2609 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2610 *
2611 * SBase = 0xAC00
2612 * LBase = 0x1100
2613 * VBase = 0x1161
2614 * TBase = 0x11A7
2615 * LCount = 19
2616 * VCount = 21
2617 * TCount = 28
2618 * NCount = 588 (VCount * TCount)
2619 * SCount = 11172 (LCount * NCount)
2620 *
2621 * Decomposition:
2622 * SIndex = s - SBase
2623 *
2624 * LV (Canonical/Full)
2625 * LIndex = SIndex / NCount
2626 * VIndex = (Sindex % NCount) / TCount
2627 * LPart = LBase + LIndex
2628 * VPart = VBase + VIndex
2629 *
2630 * LVT (Canonical)
2631 * LVIndex = (SIndex / TCount) * TCount
2632 * TIndex = (Sindex % TCount)
2633 * LVPart = SBase + LVIndex
2634 * TPart = TBase + TIndex
2635 *
2636 * LVT (Full)
2637 * LIndex = SIndex / NCount
2638 * VIndex = (Sindex % NCount) / TCount
2639 * TIndex = (Sindex % TCount)
2640 * LPart = LBase + LIndex
2641 * VPart = VBase + VIndex
2642 * if (TIndex == 0) {
2643 * d = <LPart, VPart>
2644 * } else {
2645 * TPart = TBase + TIndex
2646 * d = <LPart, VPart, TPart>
2647 * }
2648 */
2649
2650 /* Constants */
2651 #define SB (0xAC00)
2652 #define LB (0x1100)
2653 #define VB (0x1161)
2654 #define TB (0x11A7)
2655 #define LC (19)
2656 #define VC (21)
2657 #define TC (28)
2658 #define NC (VC * TC)
2659 #define SC (LC * NC)
2660
2661 /* Algorithmic decomposition of hangul syllable. */
utf8hangul(const char * str,unsigned char * hangul)2662 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2663 {
2664 unsigned int si;
2665 unsigned int li;
2666 unsigned int vi;
2667 unsigned int ti;
2668 unsigned char *h;
2669
2670 /* Calculate the SI, LI, VI, and TI values. */
2671 si = utf8decode(str) - SB;
2672 li = si / NC;
2673 vi = (si % NC) / TC;
2674 ti = si % TC;
2675
2676 /* Fill in base of leaf. */
2677 h = hangul;
2678 LEAF_GEN(h) = 2;
2679 LEAF_CCC(h) = DECOMPOSE;
2680 h += 2;
2681
2682 /* Add LPart, a 3-byte UTF-8 sequence. */
2683 h += utf8encode((char *)h, li + LB);
2684
2685 /* Add VPart, a 3-byte UTF-8 sequence. */
2686 h += utf8encode((char *)h, vi + VB);
2687
2688 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2689 if (ti)
2690 h += utf8encode((char *)h, ti + TB);
2691
2692 /* Terminate string. */
2693 h[0] = '\0';
2694
2695 return hangul;
2696 }
2697
2698 /*
2699 * Use trie to scan s, touching at most len bytes.
2700 * Returns the leaf if one exists, NULL otherwise.
2701 *
2702 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2703 * is well-formed and corresponds to a known unicode code point. The
2704 * shorthand for this will be "is valid UTF-8 unicode".
2705 */
utf8nlookup(struct tree * tree,unsigned char * hangul,const char * s,size_t len)2706 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2707 const char *s, size_t len)
2708 {
2709 utf8trie_t *trie;
2710 int offlen;
2711 int offset;
2712 int mask;
2713 int node;
2714
2715 if (!tree)
2716 return NULL;
2717 if (len == 0)
2718 return NULL;
2719 node = 1;
2720 trie = utf8data + tree->index;
2721 while (node) {
2722 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2723 if (*trie & NEXTBYTE) {
2724 if (--len == 0)
2725 return NULL;
2726 s++;
2727 }
2728 mask = 1 << (*trie & BITNUM);
2729 if (*s & mask) {
2730 /* Right leg */
2731 if (offlen) {
2732 /* Right node at offset of trie */
2733 node = (*trie & RIGHTNODE);
2734 offset = trie[offlen];
2735 while (--offlen) {
2736 offset <<= 8;
2737 offset |= trie[offlen];
2738 }
2739 trie += offset;
2740 } else if (*trie & RIGHTPATH) {
2741 /* Right node after this node */
2742 node = (*trie & TRIENODE);
2743 trie++;
2744 } else {
2745 /* No right node. */
2746 return NULL;
2747 }
2748 } else {
2749 /* Left leg */
2750 if (offlen) {
2751 /* Left node after this node. */
2752 node = (*trie & LEFTNODE);
2753 trie += offlen + 1;
2754 } else if (*trie & RIGHTPATH) {
2755 /* No left node. */
2756 return NULL;
2757 } else {
2758 /* Left node after this node */
2759 node = (*trie & TRIENODE);
2760 trie++;
2761 }
2762 }
2763 }
2764 /*
2765 * Hangul decomposition is done algorithmically. These are the
2766 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2767 * always 3 bytes long, so s has been advanced twice, and the
2768 * start of the sequence is at s-2.
2769 */
2770 if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2771 trie = utf8hangul(s - 2, hangul);
2772 return trie;
2773 }
2774
2775 /*
2776 * Use trie to scan s.
2777 * Returns the leaf if one exists, NULL otherwise.
2778 *
2779 * Forwards to trie_nlookup().
2780 */
utf8lookup(struct tree * tree,unsigned char * hangul,const char * s)2781 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2782 const char *s)
2783 {
2784 return utf8nlookup(tree, hangul, s, (size_t)-1);
2785 }
2786
2787 /*
2788 * Return the number of bytes used by the current UTF-8 sequence.
2789 * Assumes the input points to the first byte of a valid UTF-8
2790 * sequence.
2791 */
utf8clen(const char * s)2792 static inline int utf8clen(const char *s)
2793 {
2794 unsigned char c = *s;
2795 return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2796 }
2797
2798 /*
2799 * Maximum age of any character in s.
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if only non-assigned code points are used.
2802 */
utf8agemax(struct tree * tree,const char * s)2803 int utf8agemax(struct tree *tree, const char *s)
2804 {
2805 utf8leaf_t *leaf;
2806 int age = 0;
2807 int leaf_age;
2808 unsigned char hangul[UTF8HANGULLEAF];
2809
2810 if (!tree)
2811 return -1;
2812
2813 while (*s) {
2814 leaf = utf8lookup(tree, hangul, s);
2815 if (!leaf)
2816 return -1;
2817 leaf_age = ages[LEAF_GEN(leaf)];
2818 if (leaf_age <= tree->maxage && leaf_age > age)
2819 age = leaf_age;
2820 s += utf8clen(s);
2821 }
2822 return age;
2823 }
2824
2825 /*
2826 * Minimum age of any character in s.
2827 * Return -1 if s is not valid UTF-8 unicode.
2828 * Return 0 if non-assigned code points are used.
2829 */
utf8agemin(struct tree * tree,const char * s)2830 int utf8agemin(struct tree *tree, const char *s)
2831 {
2832 utf8leaf_t *leaf;
2833 int age;
2834 int leaf_age;
2835 unsigned char hangul[UTF8HANGULLEAF];
2836
2837 if (!tree)
2838 return -1;
2839 age = tree->maxage;
2840 while (*s) {
2841 leaf = utf8lookup(tree, hangul, s);
2842 if (!leaf)
2843 return -1;
2844 leaf_age = ages[LEAF_GEN(leaf)];
2845 if (leaf_age <= tree->maxage && leaf_age < age)
2846 age = leaf_age;
2847 s += utf8clen(s);
2848 }
2849 return age;
2850 }
2851
2852 /*
2853 * Maximum age of any character in s, touch at most len bytes.
2854 * Return -1 if s is not valid UTF-8 unicode.
2855 */
utf8nagemax(struct tree * tree,const char * s,size_t len)2856 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2857 {
2858 utf8leaf_t *leaf;
2859 int age = 0;
2860 int leaf_age;
2861 unsigned char hangul[UTF8HANGULLEAF];
2862
2863 if (!tree)
2864 return -1;
2865
2866 while (len && *s) {
2867 leaf = utf8nlookup(tree, hangul, s, len);
2868 if (!leaf)
2869 return -1;
2870 leaf_age = ages[LEAF_GEN(leaf)];
2871 if (leaf_age <= tree->maxage && leaf_age > age)
2872 age = leaf_age;
2873 len -= utf8clen(s);
2874 s += utf8clen(s);
2875 }
2876 return age;
2877 }
2878
2879 /*
2880 * Maximum age of any character in s, touch at most len bytes.
2881 * Return -1 if s is not valid UTF-8 unicode.
2882 */
utf8nagemin(struct tree * tree,const char * s,size_t len)2883 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2884 {
2885 utf8leaf_t *leaf;
2886 int leaf_age;
2887 int age;
2888 unsigned char hangul[UTF8HANGULLEAF];
2889
2890 if (!tree)
2891 return -1;
2892 age = tree->maxage;
2893 while (len && *s) {
2894 leaf = utf8nlookup(tree, hangul, s, len);
2895 if (!leaf)
2896 return -1;
2897 leaf_age = ages[LEAF_GEN(leaf)];
2898 if (leaf_age <= tree->maxage && leaf_age < age)
2899 age = leaf_age;
2900 len -= utf8clen(s);
2901 s += utf8clen(s);
2902 }
2903 return age;
2904 }
2905
2906 /*
2907 * Length of the normalization of s.
2908 * Return -1 if s is not valid UTF-8 unicode.
2909 *
2910 * A string of Default_Ignorable_Code_Point has length 0.
2911 */
utf8len(struct tree * tree,const char * s)2912 ssize_t utf8len(struct tree *tree, const char *s)
2913 {
2914 utf8leaf_t *leaf;
2915 size_t ret = 0;
2916 unsigned char hangul[UTF8HANGULLEAF];
2917
2918 if (!tree)
2919 return -1;
2920 while (*s) {
2921 leaf = utf8lookup(tree, hangul, s);
2922 if (!leaf)
2923 return -1;
2924 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925 ret += utf8clen(s);
2926 else if (LEAF_CCC(leaf) == DECOMPOSE)
2927 ret += strlen(LEAF_STR(leaf));
2928 else
2929 ret += utf8clen(s);
2930 s += utf8clen(s);
2931 }
2932 return ret;
2933 }
2934
2935 /*
2936 * Length of the normalization of s, touch at most len bytes.
2937 * Return -1 if s is not valid UTF-8 unicode.
2938 */
utf8nlen(struct tree * tree,const char * s,size_t len)2939 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2940 {
2941 utf8leaf_t *leaf;
2942 size_t ret = 0;
2943 unsigned char hangul[UTF8HANGULLEAF];
2944
2945 if (!tree)
2946 return -1;
2947 while (len && *s) {
2948 leaf = utf8nlookup(tree, hangul, s, len);
2949 if (!leaf)
2950 return -1;
2951 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2952 ret += utf8clen(s);
2953 else if (LEAF_CCC(leaf) == DECOMPOSE)
2954 ret += strlen(LEAF_STR(leaf));
2955 else
2956 ret += utf8clen(s);
2957 len -= utf8clen(s);
2958 s += utf8clen(s);
2959 }
2960 return ret;
2961 }
2962
2963 /*
2964 * Cursor structure used by the normalizer.
2965 */
2966 struct utf8cursor {
2967 struct tree *tree;
2968 const char *s;
2969 const char *p;
2970 const char *ss;
2971 const char *sp;
2972 unsigned int len;
2973 unsigned int slen;
2974 short int ccc;
2975 short int nccc;
2976 unsigned int unichar;
2977 unsigned char hangul[UTF8HANGULLEAF];
2978 };
2979
2980 /*
2981 * Set up an utf8cursor for use by utf8byte().
2982 *
2983 * s : string.
2984 * len : length of s.
2985 * u8c : pointer to cursor.
2986 * trie : utf8trie_t to use for normalization.
2987 *
2988 * Returns -1 on error, 0 on success.
2989 */
utf8ncursor(struct utf8cursor * u8c,struct tree * tree,const char * s,size_t len)2990 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2991 size_t len)
2992 {
2993 if (!tree)
2994 return -1;
2995 if (!s)
2996 return -1;
2997 u8c->tree = tree;
2998 u8c->s = s;
2999 u8c->p = NULL;
3000 u8c->ss = NULL;
3001 u8c->sp = NULL;
3002 u8c->len = len;
3003 u8c->slen = 0;
3004 u8c->ccc = STOPPER;
3005 u8c->nccc = STOPPER;
3006 u8c->unichar = 0;
3007 /* Check we didn't clobber the maximum length. */
3008 if (u8c->len != len)
3009 return -1;
3010 /* The first byte of s may not be an utf8 continuation. */
3011 if (len > 0 && (*s & 0xC0) == 0x80)
3012 return -1;
3013 return 0;
3014 }
3015
3016 /*
3017 * Set up an utf8cursor for use by utf8byte().
3018 *
3019 * s : NUL-terminated string.
3020 * u8c : pointer to cursor.
3021 * trie : utf8trie_t to use for normalization.
3022 *
3023 * Returns -1 on error, 0 on success.
3024 */
utf8cursor(struct utf8cursor * u8c,struct tree * tree,const char * s)3025 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
3026 {
3027 return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3028 }
3029
3030 /*
3031 * Get one byte from the normalized form of the string described by u8c.
3032 *
3033 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3034 *
3035 * The cursor keeps track of the location in the string in u8c->s.
3036 * When a character is decomposed, the current location is stored in
3037 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3038 * that bytes from a decomposition do not count against u8c->len.
3039 *
3040 * Characters are emitted if they match the current CCC in u8c->ccc.
3041 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3042 * and the function returns 0 in that case.
3043 *
3044 * Sorting by CCC is done by repeatedly scanning the string. The
3045 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3046 * the start of the scan. The first pass finds the lowest CCC to be
3047 * emitted and stores it in u8c->nccc, the second pass emits the
3048 * characters with this CCC and finds the next lowest CCC. This limits
3049 * the number of passes to 1 + the number of different CCCs in the
3050 * sequence being scanned.
3051 *
3052 * Therefore:
3053 * u8c->p != NULL -> a decomposition is being scanned.
3054 * u8c->ss != NULL -> this is a repeating scan.
3055 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3056 */
utf8byte(struct utf8cursor * u8c)3057 int utf8byte(struct utf8cursor *u8c)
3058 {
3059 utf8leaf_t *leaf;
3060 int ccc;
3061
3062 for (;;) {
3063 /* Check for the end of a decomposed character. */
3064 if (u8c->p && *u8c->s == '\0') {
3065 u8c->s = u8c->p;
3066 u8c->p = NULL;
3067 }
3068
3069 /* Check for end-of-string. */
3070 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3071 /* There is no next byte. */
3072 if (u8c->ccc == STOPPER)
3073 return 0;
3074 /* End-of-string during a scan counts as a stopper. */
3075 ccc = STOPPER;
3076 goto ccc_mismatch;
3077 } else if ((*u8c->s & 0xC0) == 0x80) {
3078 /* This is a continuation of the current character. */
3079 if (!u8c->p)
3080 u8c->len--;
3081 return (unsigned char)*u8c->s++;
3082 }
3083
3084 /* Look up the data for the current character. */
3085 if (u8c->p) {
3086 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3087 } else {
3088 leaf = utf8nlookup(u8c->tree, u8c->hangul,
3089 u8c->s, u8c->len);
3090 }
3091
3092 /* No leaf found implies that the input is a binary blob. */
3093 if (!leaf)
3094 return -1;
3095
3096 /* Characters that are too new have CCC 0. */
3097 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3098 ccc = STOPPER;
3099 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3100 u8c->len -= utf8clen(u8c->s);
3101 u8c->p = u8c->s + utf8clen(u8c->s);
3102 u8c->s = LEAF_STR(leaf);
3103 /* Empty decomposition implies CCC 0. */
3104 if (*u8c->s == '\0') {
3105 if (u8c->ccc == STOPPER)
3106 continue;
3107 ccc = STOPPER;
3108 goto ccc_mismatch;
3109 }
3110 leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3111 ccc = LEAF_CCC(leaf);
3112 }
3113 u8c->unichar = utf8decode(u8c->s);
3114
3115 /*
3116 * If this is not a stopper, then see if it updates
3117 * the next canonical class to be emitted.
3118 */
3119 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3120 u8c->nccc = ccc;
3121
3122 /*
3123 * Return the current byte if this is the current
3124 * combining class.
3125 */
3126 if (ccc == u8c->ccc) {
3127 if (!u8c->p)
3128 u8c->len--;
3129 return (unsigned char)*u8c->s++;
3130 }
3131
3132 /* Current combining class mismatch. */
3133 ccc_mismatch:
3134 if (u8c->nccc == STOPPER) {
3135 /*
3136 * Scan forward for the first canonical class
3137 * to be emitted. Save the position from
3138 * which to restart.
3139 */
3140 assert(u8c->ccc == STOPPER);
3141 u8c->ccc = MINCCC - 1;
3142 u8c->nccc = ccc;
3143 u8c->sp = u8c->p;
3144 u8c->ss = u8c->s;
3145 u8c->slen = u8c->len;
3146 if (!u8c->p)
3147 u8c->len -= utf8clen(u8c->s);
3148 u8c->s += utf8clen(u8c->s);
3149 } else if (ccc != STOPPER) {
3150 /* Not a stopper, and not the ccc we're emitting. */
3151 if (!u8c->p)
3152 u8c->len -= utf8clen(u8c->s);
3153 u8c->s += utf8clen(u8c->s);
3154 } else if (u8c->nccc != MAXCCC + 1) {
3155 /* At a stopper, restart for next ccc. */
3156 u8c->ccc = u8c->nccc;
3157 u8c->nccc = MAXCCC + 1;
3158 u8c->s = u8c->ss;
3159 u8c->p = u8c->sp;
3160 u8c->len = u8c->slen;
3161 } else {
3162 /* All done, proceed from here. */
3163 u8c->ccc = STOPPER;
3164 u8c->nccc = STOPPER;
3165 u8c->sp = NULL;
3166 u8c->ss = NULL;
3167 u8c->slen = 0;
3168 }
3169 }
3170 }
3171
3172 /* ------------------------------------------------------------------ */
3173
normalize_line(struct tree * tree)3174 static int normalize_line(struct tree *tree)
3175 {
3176 char *s;
3177 char *t;
3178 int c;
3179 struct utf8cursor u8c;
3180
3181 /* First test: null-terminated string. */
3182 s = buf2;
3183 t = buf3;
3184 if (utf8cursor(&u8c, tree, s))
3185 return -1;
3186 while ((c = utf8byte(&u8c)) > 0)
3187 if (c != (unsigned char)*t++)
3188 return -1;
3189 if (c < 0)
3190 return -1;
3191 if (*t != 0)
3192 return -1;
3193
3194 /* Second test: length-limited string. */
3195 s = buf2;
3196 /* Replace NUL with a value that will cause an error if seen. */
3197 s[strlen(s) + 1] = -1;
3198 t = buf3;
3199 if (utf8cursor(&u8c, tree, s))
3200 return -1;
3201 while ((c = utf8byte(&u8c)) > 0)
3202 if (c != (unsigned char)*t++)
3203 return -1;
3204 if (c < 0)
3205 return -1;
3206 if (*t != 0)
3207 return -1;
3208
3209 return 0;
3210 }
3211
normalization_test(void)3212 static void normalization_test(void)
3213 {
3214 FILE *file;
3215 unsigned int unichar;
3216 struct unicode_data *data;
3217 char *s;
3218 char *t;
3219 int ret;
3220 int ignorables;
3221 int tests = 0;
3222 int failures = 0;
3223
3224 if (verbose > 0)
3225 printf("Parsing %s\n", test_name);
3226 /* Step one, read data from file. */
3227 file = fopen(test_name, "r");
3228 if (!file)
3229 open_fail(test_name, errno);
3230
3231 while (fgets(line, LINESIZE, file)) {
3232 ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];",
3233 buf0, buf1);
3234 if (ret != 2 || *line == '#')
3235 continue;
3236 s = buf0;
3237 t = buf2;
3238 while (*s) {
3239 unichar = strtoul(s, &s, 16);
3240 t += utf8encode(t, unichar);
3241 }
3242 *t = '\0';
3243
3244 ignorables = 0;
3245 s = buf1;
3246 t = buf3;
3247 while (*s) {
3248 unichar = strtoul(s, &s, 16);
3249 data = &unicode_data[unichar];
3250 if (data->utf8nfdi && !*data->utf8nfdi)
3251 ignorables = 1;
3252 else
3253 t += utf8encode(t, unichar);
3254 }
3255 *t = '\0';
3256
3257 tests++;
3258 if (normalize_line(nfdi_tree) < 0) {
3259 printf("Line %s -> %s", buf0, buf1);
3260 if (ignorables)
3261 printf(" (ignorables removed)");
3262 printf(" failure\n");
3263 failures++;
3264 }
3265 }
3266 fclose(file);
3267 if (verbose > 0)
3268 printf("Ran %d tests with %d failures\n", tests, failures);
3269 if (failures)
3270 file_fail(test_name);
3271 }
3272
3273 /* ------------------------------------------------------------------ */
3274
write_file(void)3275 static void write_file(void)
3276 {
3277 FILE *file;
3278 int i;
3279 int j;
3280 int t;
3281 int gen;
3282
3283 if (verbose > 0)
3284 printf("Writing %s\n", utf8_name);
3285 file = fopen(utf8_name, "w");
3286 if (!file)
3287 open_fail(utf8_name, errno);
3288
3289 fprintf(file, "/* This file is generated code, do not edit. */\n");
3290 fprintf(file, "\n");
3291 fprintf(file, "#include <linux/module.h>\n");
3292 fprintf(file, "#include <linux/kernel.h>\n");
3293 fprintf(file, "#include \"utf8n.h\"\n");
3294 fprintf(file, "\n");
3295 fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3296 for (i = 0; i != ages_count; i++)
3297 fprintf(file, "\t%#x%s\n", ages[i],
3298 ages[i] == unicode_maxage ? "" : ",");
3299 fprintf(file, "};\n");
3300 fprintf(file, "\n");
3301 fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n");
3302 t = 0;
3303 for (gen = 0; gen < ages_count; gen++) {
3304 fprintf(file, "\t{ %#x, %d }%s\n",
3305 ages[gen], trees[t].index,
3306 ages[gen] == unicode_maxage ? "" : ",");
3307 if (trees[t].maxage == ages[gen])
3308 t += 2;
3309 }
3310 fprintf(file, "};\n");
3311 fprintf(file, "\n");
3312 fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n");
3313 t = 1;
3314 for (gen = 0; gen < ages_count; gen++) {
3315 fprintf(file, "\t{ %#x, %d }%s\n",
3316 ages[gen], trees[t].index,
3317 ages[gen] == unicode_maxage ? "" : ",");
3318 if (trees[t].maxage == ages[gen])
3319 t += 2;
3320 }
3321 fprintf(file, "};\n");
3322 fprintf(file, "\n");
3323 fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3324 utf8data_size);
3325 t = 0;
3326 for (i = 0; i != utf8data_size; i += 16) {
3327 if (i == trees[t].index) {
3328 fprintf(file, "\t/* %s_%x */\n",
3329 trees[t].type, trees[t].maxage);
3330 if (t < trees_count-1)
3331 t++;
3332 }
3333 fprintf(file, "\t");
3334 for (j = i; j != i + 16; j++)
3335 fprintf(file, "0x%.2x%s", utf8data[j],
3336 (j < utf8data_size -1 ? "," : ""));
3337 fprintf(file, "\n");
3338 }
3339 fprintf(file, "};\n");
3340 fprintf(file, "\n");
3341 fprintf(file, "const struct utf8data_table utf8_data_table = {\n");
3342 fprintf(file, "\t.utf8agetab = utf8agetab,\n");
3343 fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n");
3344 fprintf(file, "\n");
3345 fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n");
3346 fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n");
3347 fprintf(file, "\n");
3348 fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n");
3349 fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n");
3350 fprintf(file, "\n");
3351 fprintf(file, "\t.utf8data = utf8data,\n");
3352 fprintf(file, "};\n");
3353 fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);");
3354 fprintf(file, "\n");
3355 fprintf(file, "MODULE_DESCRIPTION(\"UTF8 data table\");\n");
3356 fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n");
3357 fclose(file);
3358 }
3359
3360 /* ------------------------------------------------------------------ */
3361
main(int argc,char * argv[])3362 int main(int argc, char *argv[])
3363 {
3364 unsigned int unichar;
3365 int opt;
3366
3367 argv0 = argv[0];
3368
3369 while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3370 switch (opt) {
3371 case 'a':
3372 age_name = optarg;
3373 break;
3374 case 'c':
3375 ccc_name = optarg;
3376 break;
3377 case 'd':
3378 data_name = optarg;
3379 break;
3380 case 'f':
3381 fold_name = optarg;
3382 break;
3383 case 'n':
3384 norm_name = optarg;
3385 break;
3386 case 'o':
3387 utf8_name = optarg;
3388 break;
3389 case 'p':
3390 prop_name = optarg;
3391 break;
3392 case 't':
3393 test_name = optarg;
3394 break;
3395 case 'v':
3396 verbose++;
3397 break;
3398 case 'h':
3399 help();
3400 exit(0);
3401 default:
3402 usage();
3403 }
3404 }
3405
3406 if (verbose > 1)
3407 help();
3408 for (unichar = 0; unichar != 0x110000; unichar++)
3409 unicode_data[unichar].code = unichar;
3410 age_init();
3411 ccc_init();
3412 nfdi_init();
3413 nfdicf_init();
3414 ignore_init();
3415 corrections_init();
3416 hangul_decompose();
3417 nfdi_decompose();
3418 nfdicf_decompose();
3419 utf8_init();
3420 trees_init();
3421 trees_populate();
3422 trees_reduce();
3423 trees_verify();
3424 /* Prevent "unused function" warning. */
3425 (void)lookup(nfdi_tree, " ");
3426 if (verbose > 2)
3427 tree_walk(nfdi_tree);
3428 if (verbose > 2)
3429 tree_walk(nfdicf_tree);
3430 normalization_test();
3431 write_file();
3432
3433 return 0;
3434 }
3435