128ba53c0SMasahiro Yamada /* 228ba53c0SMasahiro Yamada * Copyright (c) 2014 SGI. 328ba53c0SMasahiro Yamada * All rights reserved. 428ba53c0SMasahiro Yamada * 528ba53c0SMasahiro Yamada * This program is free software; you can redistribute it and/or 628ba53c0SMasahiro Yamada * modify it under the terms of the GNU General Public License as 728ba53c0SMasahiro Yamada * published by the Free Software Foundation. 828ba53c0SMasahiro Yamada * 928ba53c0SMasahiro Yamada * This program is distributed in the hope that it would be useful, 1028ba53c0SMasahiro Yamada * but WITHOUT ANY WARRANTY; without even the implied warranty of 1128ba53c0SMasahiro Yamada * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 1228ba53c0SMasahiro Yamada * GNU General Public License for more details. 1328ba53c0SMasahiro Yamada * 1428ba53c0SMasahiro Yamada * You should have received a copy of the GNU General Public License 1528ba53c0SMasahiro Yamada * along with this program; if not, write the Free Software Foundation, 1628ba53c0SMasahiro Yamada * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 1728ba53c0SMasahiro Yamada */ 1828ba53c0SMasahiro Yamada 1928ba53c0SMasahiro Yamada /* Generator for a compact trie for unicode normalization */ 2028ba53c0SMasahiro Yamada 2128ba53c0SMasahiro Yamada #include <sys/types.h> 2228ba53c0SMasahiro Yamada #include <stddef.h> 2328ba53c0SMasahiro Yamada #include <stdlib.h> 2428ba53c0SMasahiro Yamada #include <stdio.h> 2528ba53c0SMasahiro Yamada #include <assert.h> 2628ba53c0SMasahiro Yamada #include <string.h> 2728ba53c0SMasahiro Yamada #include <unistd.h> 2828ba53c0SMasahiro Yamada #include <errno.h> 2928ba53c0SMasahiro Yamada 3028ba53c0SMasahiro Yamada /* Default names of the in- and output files. */ 3128ba53c0SMasahiro Yamada 3228ba53c0SMasahiro Yamada #define AGE_NAME "DerivedAge.txt" 3328ba53c0SMasahiro Yamada #define CCC_NAME "DerivedCombiningClass.txt" 3428ba53c0SMasahiro Yamada #define PROP_NAME "DerivedCoreProperties.txt" 3528ba53c0SMasahiro Yamada #define DATA_NAME "UnicodeData.txt" 3628ba53c0SMasahiro Yamada #define FOLD_NAME "CaseFolding.txt" 3728ba53c0SMasahiro Yamada #define NORM_NAME "NormalizationCorrections.txt" 3828ba53c0SMasahiro Yamada #define TEST_NAME "NormalizationTest.txt" 3928ba53c0SMasahiro Yamada #define UTF8_NAME "utf8data.h" 4028ba53c0SMasahiro Yamada 4128ba53c0SMasahiro Yamada const char *age_name = AGE_NAME; 4228ba53c0SMasahiro Yamada const char *ccc_name = CCC_NAME; 4328ba53c0SMasahiro Yamada const char *prop_name = PROP_NAME; 4428ba53c0SMasahiro Yamada const char *data_name = DATA_NAME; 4528ba53c0SMasahiro Yamada const char *fold_name = FOLD_NAME; 4628ba53c0SMasahiro Yamada const char *norm_name = NORM_NAME; 4728ba53c0SMasahiro Yamada const char *test_name = TEST_NAME; 4828ba53c0SMasahiro Yamada const char *utf8_name = UTF8_NAME; 4928ba53c0SMasahiro Yamada 5028ba53c0SMasahiro Yamada int verbose = 0; 5128ba53c0SMasahiro Yamada 5228ba53c0SMasahiro Yamada /* An arbitrary line size limit on input lines. */ 5328ba53c0SMasahiro Yamada 5428ba53c0SMasahiro Yamada #define LINESIZE 1024 5528ba53c0SMasahiro Yamada char line[LINESIZE]; 5628ba53c0SMasahiro Yamada char buf0[LINESIZE]; 5728ba53c0SMasahiro Yamada char buf1[LINESIZE]; 5828ba53c0SMasahiro Yamada char buf2[LINESIZE]; 5928ba53c0SMasahiro Yamada char buf3[LINESIZE]; 6028ba53c0SMasahiro Yamada 6128ba53c0SMasahiro Yamada const char *argv0; 6228ba53c0SMasahiro Yamada 6328ba53c0SMasahiro Yamada #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 6428ba53c0SMasahiro Yamada 6528ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 6628ba53c0SMasahiro Yamada 6728ba53c0SMasahiro Yamada /* 6828ba53c0SMasahiro Yamada * Unicode version numbers consist of three parts: major, minor, and a 6928ba53c0SMasahiro Yamada * revision. These numbers are packed into an unsigned int to obtain 7028ba53c0SMasahiro Yamada * a single version number. 7128ba53c0SMasahiro Yamada * 7228ba53c0SMasahiro Yamada * To save space in the generated trie, the unicode version is not 7328ba53c0SMasahiro Yamada * stored directly, instead we calculate a generation number from the 7428ba53c0SMasahiro Yamada * unicode versions seen in the DerivedAge file, and use that as an 7528ba53c0SMasahiro Yamada * index into a table of unicode versions. 7628ba53c0SMasahiro Yamada */ 7728ba53c0SMasahiro Yamada #define UNICODE_MAJ_SHIFT (16) 7828ba53c0SMasahiro Yamada #define UNICODE_MIN_SHIFT (8) 7928ba53c0SMasahiro Yamada 8028ba53c0SMasahiro Yamada #define UNICODE_MAJ_MAX ((unsigned short)-1) 8128ba53c0SMasahiro Yamada #define UNICODE_MIN_MAX ((unsigned char)-1) 8228ba53c0SMasahiro Yamada #define UNICODE_REV_MAX ((unsigned char)-1) 8328ba53c0SMasahiro Yamada 8428ba53c0SMasahiro Yamada #define UNICODE_AGE(MAJ,MIN,REV) \ 8528ba53c0SMasahiro Yamada (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \ 8628ba53c0SMasahiro Yamada ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \ 8728ba53c0SMasahiro Yamada ((unsigned int)(REV))) 8828ba53c0SMasahiro Yamada 8928ba53c0SMasahiro Yamada unsigned int *ages; 9028ba53c0SMasahiro Yamada int ages_count; 9128ba53c0SMasahiro Yamada 9228ba53c0SMasahiro Yamada unsigned int unicode_maxage; 9328ba53c0SMasahiro Yamada 9428ba53c0SMasahiro Yamada static int age_valid(unsigned int major, unsigned int minor, 9528ba53c0SMasahiro Yamada unsigned int revision) 9628ba53c0SMasahiro Yamada { 9728ba53c0SMasahiro Yamada if (major > UNICODE_MAJ_MAX) 9828ba53c0SMasahiro Yamada return 0; 9928ba53c0SMasahiro Yamada if (minor > UNICODE_MIN_MAX) 10028ba53c0SMasahiro Yamada return 0; 10128ba53c0SMasahiro Yamada if (revision > UNICODE_REV_MAX) 10228ba53c0SMasahiro Yamada return 0; 10328ba53c0SMasahiro Yamada return 1; 10428ba53c0SMasahiro Yamada } 10528ba53c0SMasahiro Yamada 10628ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 10728ba53c0SMasahiro Yamada 10828ba53c0SMasahiro Yamada /* 10928ba53c0SMasahiro Yamada * utf8trie_t 11028ba53c0SMasahiro Yamada * 11128ba53c0SMasahiro Yamada * A compact binary tree, used to decode UTF-8 characters. 11228ba53c0SMasahiro Yamada * 11328ba53c0SMasahiro Yamada * Internal nodes are one byte for the node itself, and up to three 11428ba53c0SMasahiro Yamada * bytes for an offset into the tree. The first byte contains the 11528ba53c0SMasahiro Yamada * following information: 11628ba53c0SMasahiro Yamada * NEXTBYTE - flag - advance to next byte if set 11728ba53c0SMasahiro Yamada * BITNUM - 3 bit field - the bit number to tested 11828ba53c0SMasahiro Yamada * OFFLEN - 2 bit field - number of bytes in the offset 11928ba53c0SMasahiro Yamada * if offlen == 0 (non-branching node) 12028ba53c0SMasahiro Yamada * RIGHTPATH - 1 bit field - set if the following node is for the 12128ba53c0SMasahiro Yamada * right-hand path (tested bit is set) 12228ba53c0SMasahiro Yamada * TRIENODE - 1 bit field - set if the following node is an internal 12328ba53c0SMasahiro Yamada * node, otherwise it is a leaf node 12428ba53c0SMasahiro Yamada * if offlen != 0 (branching node) 12528ba53c0SMasahiro Yamada * LEFTNODE - 1 bit field - set if the left-hand node is internal 12628ba53c0SMasahiro Yamada * RIGHTNODE - 1 bit field - set if the right-hand node is internal 12728ba53c0SMasahiro Yamada * 12828ba53c0SMasahiro Yamada * Due to the way utf8 works, there cannot be branching nodes with 12928ba53c0SMasahiro Yamada * NEXTBYTE set, and moreover those nodes always have a righthand 13028ba53c0SMasahiro Yamada * descendant. 13128ba53c0SMasahiro Yamada */ 13228ba53c0SMasahiro Yamada typedef unsigned char utf8trie_t; 13328ba53c0SMasahiro Yamada #define BITNUM 0x07 13428ba53c0SMasahiro Yamada #define NEXTBYTE 0x08 13528ba53c0SMasahiro Yamada #define OFFLEN 0x30 13628ba53c0SMasahiro Yamada #define OFFLEN_SHIFT 4 13728ba53c0SMasahiro Yamada #define RIGHTPATH 0x40 13828ba53c0SMasahiro Yamada #define TRIENODE 0x80 13928ba53c0SMasahiro Yamada #define RIGHTNODE 0x40 14028ba53c0SMasahiro Yamada #define LEFTNODE 0x80 14128ba53c0SMasahiro Yamada 14228ba53c0SMasahiro Yamada /* 14328ba53c0SMasahiro Yamada * utf8leaf_t 14428ba53c0SMasahiro Yamada * 14528ba53c0SMasahiro Yamada * The leaves of the trie are embedded in the trie, and so the same 14628ba53c0SMasahiro Yamada * underlying datatype, unsigned char. 14728ba53c0SMasahiro Yamada * 14828ba53c0SMasahiro Yamada * leaf[0]: The unicode version, stored as a generation number that is 14928ba53c0SMasahiro Yamada * an index into utf8agetab[]. With this we can filter code 15028ba53c0SMasahiro Yamada * points based on the unicode version in which they were 15128ba53c0SMasahiro Yamada * defined. The CCC of a non-defined code point is 0. 15228ba53c0SMasahiro Yamada * leaf[1]: Canonical Combining Class. During normalization, we need 15328ba53c0SMasahiro Yamada * to do a stable sort into ascending order of all characters 15428ba53c0SMasahiro Yamada * with a non-zero CCC that occur between two characters with 15528ba53c0SMasahiro Yamada * a CCC of 0, or at the begin or end of a string. 15628ba53c0SMasahiro Yamada * The unicode standard guarantees that all CCC values are 15728ba53c0SMasahiro Yamada * between 0 and 254 inclusive, which leaves 255 available as 15828ba53c0SMasahiro Yamada * a special value. 15928ba53c0SMasahiro Yamada * Code points with CCC 0 are known as stoppers. 16028ba53c0SMasahiro Yamada * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the 16128ba53c0SMasahiro Yamada * start of a NUL-terminated string that is the decomposition 16228ba53c0SMasahiro Yamada * of the character. 16328ba53c0SMasahiro Yamada * The CCC of a decomposable character is the same as the CCC 16428ba53c0SMasahiro Yamada * of the first character of its decomposition. 16528ba53c0SMasahiro Yamada * Some characters decompose as the empty string: these are 16628ba53c0SMasahiro Yamada * characters with the Default_Ignorable_Code_Point property. 16728ba53c0SMasahiro Yamada * These do affect normalization, as they all have CCC 0. 16828ba53c0SMasahiro Yamada * 16928ba53c0SMasahiro Yamada * The decompositions in the trie have been fully expanded. 17028ba53c0SMasahiro Yamada * 17128ba53c0SMasahiro Yamada * Casefolding, if applicable, is also done using decompositions. 17228ba53c0SMasahiro Yamada */ 17328ba53c0SMasahiro Yamada typedef unsigned char utf8leaf_t; 17428ba53c0SMasahiro Yamada 17528ba53c0SMasahiro Yamada #define LEAF_GEN(LEAF) ((LEAF)[0]) 17628ba53c0SMasahiro Yamada #define LEAF_CCC(LEAF) ((LEAF)[1]) 17728ba53c0SMasahiro Yamada #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2)) 17828ba53c0SMasahiro Yamada 17928ba53c0SMasahiro Yamada #define MAXGEN (255) 18028ba53c0SMasahiro Yamada 18128ba53c0SMasahiro Yamada #define MINCCC (0) 18228ba53c0SMasahiro Yamada #define MAXCCC (254) 18328ba53c0SMasahiro Yamada #define STOPPER (0) 18428ba53c0SMasahiro Yamada #define DECOMPOSE (255) 18528ba53c0SMasahiro Yamada #define HANGUL ((char)(255)) 18628ba53c0SMasahiro Yamada 18728ba53c0SMasahiro Yamada #define UTF8HANGULLEAF (12) 18828ba53c0SMasahiro Yamada 18928ba53c0SMasahiro Yamada struct tree; 19028ba53c0SMasahiro Yamada static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *, 19128ba53c0SMasahiro Yamada const char *, size_t); 19228ba53c0SMasahiro Yamada static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *); 19328ba53c0SMasahiro Yamada 19428ba53c0SMasahiro Yamada unsigned char *utf8data; 19528ba53c0SMasahiro Yamada size_t utf8data_size; 19628ba53c0SMasahiro Yamada 19728ba53c0SMasahiro Yamada utf8trie_t *nfdi; 19828ba53c0SMasahiro Yamada utf8trie_t *nfdicf; 19928ba53c0SMasahiro Yamada 20028ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 20128ba53c0SMasahiro Yamada 20228ba53c0SMasahiro Yamada /* 20328ba53c0SMasahiro Yamada * UTF8 valid ranges. 20428ba53c0SMasahiro Yamada * 20528ba53c0SMasahiro Yamada * The UTF-8 encoding spreads the bits of a 32bit word over several 20628ba53c0SMasahiro Yamada * bytes. This table gives the ranges that can be held and how they'd 20728ba53c0SMasahiro Yamada * be represented. 20828ba53c0SMasahiro Yamada * 20928ba53c0SMasahiro Yamada * 0x00000000 0x0000007F: 0xxxxxxx 21028ba53c0SMasahiro Yamada * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx 21128ba53c0SMasahiro Yamada * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 21228ba53c0SMasahiro Yamada * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 21328ba53c0SMasahiro Yamada * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 21428ba53c0SMasahiro Yamada * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 21528ba53c0SMasahiro Yamada * 21628ba53c0SMasahiro Yamada * There is an additional requirement on UTF-8, in that only the 21728ba53c0SMasahiro Yamada * shortest representation of a 32bit value is to be used. A decoder 21828ba53c0SMasahiro Yamada * must not decode sequences that do not satisfy this requirement. 21928ba53c0SMasahiro Yamada * Thus the allowed ranges have a lower bound. 22028ba53c0SMasahiro Yamada * 22128ba53c0SMasahiro Yamada * 0x00000000 0x0000007F: 0xxxxxxx 22228ba53c0SMasahiro Yamada * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx 22328ba53c0SMasahiro Yamada * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx 22428ba53c0SMasahiro Yamada * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 22528ba53c0SMasahiro Yamada * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 22628ba53c0SMasahiro Yamada * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 22728ba53c0SMasahiro Yamada * 22828ba53c0SMasahiro Yamada * Actual unicode characters are limited to the range 0x0 - 0x10FFFF, 22928ba53c0SMasahiro Yamada * 17 planes of 65536 values. This limits the sequences actually seen 23028ba53c0SMasahiro Yamada * even more, to just the following. 23128ba53c0SMasahiro Yamada * 23228ba53c0SMasahiro Yamada * 0 - 0x7f: 0 0x7f 23328ba53c0SMasahiro Yamada * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf 23428ba53c0SMasahiro Yamada * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf 23528ba53c0SMasahiro Yamada * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf 23628ba53c0SMasahiro Yamada * 23728ba53c0SMasahiro Yamada * Even within those ranges not all values are allowed: the surrogates 23828ba53c0SMasahiro Yamada * 0xd800 - 0xdfff should never be seen. 23928ba53c0SMasahiro Yamada * 24028ba53c0SMasahiro Yamada * Note that the longest sequence seen with valid usage is 4 bytes, 24128ba53c0SMasahiro Yamada * the same a single UTF-32 character. This makes the UTF-8 24228ba53c0SMasahiro Yamada * representation of Unicode strictly smaller than UTF-32. 24328ba53c0SMasahiro Yamada * 24428ba53c0SMasahiro Yamada * The shortest sequence requirement was introduced by: 24528ba53c0SMasahiro Yamada * Corrigendum #1: UTF-8 Shortest Form 24628ba53c0SMasahiro Yamada * It can be found here: 24728ba53c0SMasahiro Yamada * http://www.unicode.org/versions/corrigendum1.html 24828ba53c0SMasahiro Yamada * 24928ba53c0SMasahiro Yamada */ 25028ba53c0SMasahiro Yamada 25128ba53c0SMasahiro Yamada #define UTF8_2_BITS 0xC0 25228ba53c0SMasahiro Yamada #define UTF8_3_BITS 0xE0 25328ba53c0SMasahiro Yamada #define UTF8_4_BITS 0xF0 25428ba53c0SMasahiro Yamada #define UTF8_N_BITS 0x80 25528ba53c0SMasahiro Yamada #define UTF8_2_MASK 0xE0 25628ba53c0SMasahiro Yamada #define UTF8_3_MASK 0xF0 25728ba53c0SMasahiro Yamada #define UTF8_4_MASK 0xF8 25828ba53c0SMasahiro Yamada #define UTF8_N_MASK 0xC0 25928ba53c0SMasahiro Yamada #define UTF8_V_MASK 0x3F 26028ba53c0SMasahiro Yamada #define UTF8_V_SHIFT 6 26128ba53c0SMasahiro Yamada 26228ba53c0SMasahiro Yamada static int utf8encode(char *str, unsigned int val) 26328ba53c0SMasahiro Yamada { 26428ba53c0SMasahiro Yamada int len; 26528ba53c0SMasahiro Yamada 26628ba53c0SMasahiro Yamada if (val < 0x80) { 26728ba53c0SMasahiro Yamada str[0] = val; 26828ba53c0SMasahiro Yamada len = 1; 26928ba53c0SMasahiro Yamada } else if (val < 0x800) { 27028ba53c0SMasahiro Yamada str[1] = val & UTF8_V_MASK; 27128ba53c0SMasahiro Yamada str[1] |= UTF8_N_BITS; 27228ba53c0SMasahiro Yamada val >>= UTF8_V_SHIFT; 27328ba53c0SMasahiro Yamada str[0] = val; 27428ba53c0SMasahiro Yamada str[0] |= UTF8_2_BITS; 27528ba53c0SMasahiro Yamada len = 2; 27628ba53c0SMasahiro Yamada } else if (val < 0x10000) { 27728ba53c0SMasahiro Yamada str[2] = val & UTF8_V_MASK; 27828ba53c0SMasahiro Yamada str[2] |= UTF8_N_BITS; 27928ba53c0SMasahiro Yamada val >>= UTF8_V_SHIFT; 28028ba53c0SMasahiro Yamada str[1] = val & UTF8_V_MASK; 28128ba53c0SMasahiro Yamada str[1] |= UTF8_N_BITS; 28228ba53c0SMasahiro Yamada val >>= UTF8_V_SHIFT; 28328ba53c0SMasahiro Yamada str[0] = val; 28428ba53c0SMasahiro Yamada str[0] |= UTF8_3_BITS; 28528ba53c0SMasahiro Yamada len = 3; 28628ba53c0SMasahiro Yamada } else if (val < 0x110000) { 28728ba53c0SMasahiro Yamada str[3] = val & UTF8_V_MASK; 28828ba53c0SMasahiro Yamada str[3] |= UTF8_N_BITS; 28928ba53c0SMasahiro Yamada val >>= UTF8_V_SHIFT; 29028ba53c0SMasahiro Yamada str[2] = val & UTF8_V_MASK; 29128ba53c0SMasahiro Yamada str[2] |= UTF8_N_BITS; 29228ba53c0SMasahiro Yamada val >>= UTF8_V_SHIFT; 29328ba53c0SMasahiro Yamada str[1] = val & UTF8_V_MASK; 29428ba53c0SMasahiro Yamada str[1] |= UTF8_N_BITS; 29528ba53c0SMasahiro Yamada val >>= UTF8_V_SHIFT; 29628ba53c0SMasahiro Yamada str[0] = val; 29728ba53c0SMasahiro Yamada str[0] |= UTF8_4_BITS; 29828ba53c0SMasahiro Yamada len = 4; 29928ba53c0SMasahiro Yamada } else { 30028ba53c0SMasahiro Yamada printf("%#x: illegal val\n", val); 30128ba53c0SMasahiro Yamada len = 0; 30228ba53c0SMasahiro Yamada } 30328ba53c0SMasahiro Yamada return len; 30428ba53c0SMasahiro Yamada } 30528ba53c0SMasahiro Yamada 30628ba53c0SMasahiro Yamada static unsigned int utf8decode(const char *str) 30728ba53c0SMasahiro Yamada { 30828ba53c0SMasahiro Yamada const unsigned char *s = (const unsigned char*)str; 30928ba53c0SMasahiro Yamada unsigned int unichar = 0; 31028ba53c0SMasahiro Yamada 31128ba53c0SMasahiro Yamada if (*s < 0x80) { 31228ba53c0SMasahiro Yamada unichar = *s; 31328ba53c0SMasahiro Yamada } else if (*s < UTF8_3_BITS) { 31428ba53c0SMasahiro Yamada unichar = *s++ & 0x1F; 31528ba53c0SMasahiro Yamada unichar <<= UTF8_V_SHIFT; 31628ba53c0SMasahiro Yamada unichar |= *s & 0x3F; 31728ba53c0SMasahiro Yamada } else if (*s < UTF8_4_BITS) { 31828ba53c0SMasahiro Yamada unichar = *s++ & 0x0F; 31928ba53c0SMasahiro Yamada unichar <<= UTF8_V_SHIFT; 32028ba53c0SMasahiro Yamada unichar |= *s++ & 0x3F; 32128ba53c0SMasahiro Yamada unichar <<= UTF8_V_SHIFT; 32228ba53c0SMasahiro Yamada unichar |= *s & 0x3F; 32328ba53c0SMasahiro Yamada } else { 32428ba53c0SMasahiro Yamada unichar = *s++ & 0x0F; 32528ba53c0SMasahiro Yamada unichar <<= UTF8_V_SHIFT; 32628ba53c0SMasahiro Yamada unichar |= *s++ & 0x3F; 32728ba53c0SMasahiro Yamada unichar <<= UTF8_V_SHIFT; 32828ba53c0SMasahiro Yamada unichar |= *s++ & 0x3F; 32928ba53c0SMasahiro Yamada unichar <<= UTF8_V_SHIFT; 33028ba53c0SMasahiro Yamada unichar |= *s & 0x3F; 33128ba53c0SMasahiro Yamada } 33228ba53c0SMasahiro Yamada return unichar; 33328ba53c0SMasahiro Yamada } 33428ba53c0SMasahiro Yamada 33528ba53c0SMasahiro Yamada static int utf32valid(unsigned int unichar) 33628ba53c0SMasahiro Yamada { 33728ba53c0SMasahiro Yamada return unichar < 0x110000; 33828ba53c0SMasahiro Yamada } 33928ba53c0SMasahiro Yamada 34028ba53c0SMasahiro Yamada #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3) 34128ba53c0SMasahiro Yamada 34228ba53c0SMasahiro Yamada #define NODE 1 34328ba53c0SMasahiro Yamada #define LEAF 0 34428ba53c0SMasahiro Yamada 34528ba53c0SMasahiro Yamada struct tree { 34628ba53c0SMasahiro Yamada void *root; 34728ba53c0SMasahiro Yamada int childnode; 34828ba53c0SMasahiro Yamada const char *type; 34928ba53c0SMasahiro Yamada unsigned int maxage; 35028ba53c0SMasahiro Yamada struct tree *next; 35128ba53c0SMasahiro Yamada int (*leaf_equal)(void *, void *); 35228ba53c0SMasahiro Yamada void (*leaf_print)(void *, int); 35328ba53c0SMasahiro Yamada int (*leaf_mark)(void *); 35428ba53c0SMasahiro Yamada int (*leaf_size)(void *); 35528ba53c0SMasahiro Yamada int *(*leaf_index)(struct tree *, void *); 35628ba53c0SMasahiro Yamada unsigned char *(*leaf_emit)(void *, unsigned char *); 35728ba53c0SMasahiro Yamada int leafindex[0x110000]; 35828ba53c0SMasahiro Yamada int index; 35928ba53c0SMasahiro Yamada }; 36028ba53c0SMasahiro Yamada 36128ba53c0SMasahiro Yamada struct node { 36228ba53c0SMasahiro Yamada int index; 36328ba53c0SMasahiro Yamada int offset; 36428ba53c0SMasahiro Yamada int mark; 36528ba53c0SMasahiro Yamada int size; 36628ba53c0SMasahiro Yamada struct node *parent; 36728ba53c0SMasahiro Yamada void *left; 36828ba53c0SMasahiro Yamada void *right; 36928ba53c0SMasahiro Yamada unsigned char bitnum; 37028ba53c0SMasahiro Yamada unsigned char nextbyte; 37128ba53c0SMasahiro Yamada unsigned char leftnode; 37228ba53c0SMasahiro Yamada unsigned char rightnode; 37328ba53c0SMasahiro Yamada unsigned int keybits; 37428ba53c0SMasahiro Yamada unsigned int keymask; 37528ba53c0SMasahiro Yamada }; 37628ba53c0SMasahiro Yamada 37728ba53c0SMasahiro Yamada /* 37828ba53c0SMasahiro Yamada * Example lookup function for a tree. 37928ba53c0SMasahiro Yamada */ 38028ba53c0SMasahiro Yamada static void *lookup(struct tree *tree, const char *key) 38128ba53c0SMasahiro Yamada { 38228ba53c0SMasahiro Yamada struct node *node; 38328ba53c0SMasahiro Yamada void *leaf = NULL; 38428ba53c0SMasahiro Yamada 38528ba53c0SMasahiro Yamada node = tree->root; 38628ba53c0SMasahiro Yamada while (!leaf && node) { 38728ba53c0SMasahiro Yamada if (node->nextbyte) 38828ba53c0SMasahiro Yamada key++; 38928ba53c0SMasahiro Yamada if (*key & (1 << (node->bitnum & 7))) { 39028ba53c0SMasahiro Yamada /* Right leg */ 39128ba53c0SMasahiro Yamada if (node->rightnode == NODE) { 39228ba53c0SMasahiro Yamada node = node->right; 39328ba53c0SMasahiro Yamada } else if (node->rightnode == LEAF) { 39428ba53c0SMasahiro Yamada leaf = node->right; 39528ba53c0SMasahiro Yamada } else { 39628ba53c0SMasahiro Yamada node = NULL; 39728ba53c0SMasahiro Yamada } 39828ba53c0SMasahiro Yamada } else { 39928ba53c0SMasahiro Yamada /* Left leg */ 40028ba53c0SMasahiro Yamada if (node->leftnode == NODE) { 40128ba53c0SMasahiro Yamada node = node->left; 40228ba53c0SMasahiro Yamada } else if (node->leftnode == LEAF) { 40328ba53c0SMasahiro Yamada leaf = node->left; 40428ba53c0SMasahiro Yamada } else { 40528ba53c0SMasahiro Yamada node = NULL; 40628ba53c0SMasahiro Yamada } 40728ba53c0SMasahiro Yamada } 40828ba53c0SMasahiro Yamada } 40928ba53c0SMasahiro Yamada 41028ba53c0SMasahiro Yamada return leaf; 41128ba53c0SMasahiro Yamada } 41228ba53c0SMasahiro Yamada 41328ba53c0SMasahiro Yamada /* 41428ba53c0SMasahiro Yamada * A simple non-recursive tree walker: keep track of visits to the 41528ba53c0SMasahiro Yamada * left and right branches in the leftmask and rightmask. 41628ba53c0SMasahiro Yamada */ 41728ba53c0SMasahiro Yamada static void tree_walk(struct tree *tree) 41828ba53c0SMasahiro Yamada { 41928ba53c0SMasahiro Yamada struct node *node; 42028ba53c0SMasahiro Yamada unsigned int leftmask; 42128ba53c0SMasahiro Yamada unsigned int rightmask; 42228ba53c0SMasahiro Yamada unsigned int bitmask; 42328ba53c0SMasahiro Yamada int indent = 1; 42428ba53c0SMasahiro Yamada int nodes, singletons, leaves; 42528ba53c0SMasahiro Yamada 42628ba53c0SMasahiro Yamada nodes = singletons = leaves = 0; 42728ba53c0SMasahiro Yamada 42828ba53c0SMasahiro Yamada printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root); 42928ba53c0SMasahiro Yamada if (tree->childnode == LEAF) { 43028ba53c0SMasahiro Yamada assert(tree->root); 43128ba53c0SMasahiro Yamada tree->leaf_print(tree->root, indent); 43228ba53c0SMasahiro Yamada leaves = 1; 43328ba53c0SMasahiro Yamada } else { 43428ba53c0SMasahiro Yamada assert(tree->childnode == NODE); 43528ba53c0SMasahiro Yamada node = tree->root; 43628ba53c0SMasahiro Yamada leftmask = rightmask = 0; 43728ba53c0SMasahiro Yamada while (node) { 43828ba53c0SMasahiro Yamada printf("%*snode @ %p bitnum %d nextbyte %d" 43928ba53c0SMasahiro Yamada " left %p right %p mask %x bits %x\n", 44028ba53c0SMasahiro Yamada indent, "", node, 44128ba53c0SMasahiro Yamada node->bitnum, node->nextbyte, 44228ba53c0SMasahiro Yamada node->left, node->right, 44328ba53c0SMasahiro Yamada node->keymask, node->keybits); 44428ba53c0SMasahiro Yamada nodes += 1; 44528ba53c0SMasahiro Yamada if (!(node->left && node->right)) 44628ba53c0SMasahiro Yamada singletons += 1; 44728ba53c0SMasahiro Yamada 44828ba53c0SMasahiro Yamada while (node) { 44928ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 45028ba53c0SMasahiro Yamada if ((leftmask & bitmask) == 0) { 45128ba53c0SMasahiro Yamada leftmask |= bitmask; 45228ba53c0SMasahiro Yamada if (node->leftnode == LEAF) { 45328ba53c0SMasahiro Yamada assert(node->left); 45428ba53c0SMasahiro Yamada tree->leaf_print(node->left, 45528ba53c0SMasahiro Yamada indent+1); 45628ba53c0SMasahiro Yamada leaves += 1; 45728ba53c0SMasahiro Yamada } else if (node->left) { 45828ba53c0SMasahiro Yamada assert(node->leftnode == NODE); 45928ba53c0SMasahiro Yamada indent += 1; 46028ba53c0SMasahiro Yamada node = node->left; 46128ba53c0SMasahiro Yamada break; 46228ba53c0SMasahiro Yamada } 46328ba53c0SMasahiro Yamada } 46428ba53c0SMasahiro Yamada if ((rightmask & bitmask) == 0) { 46528ba53c0SMasahiro Yamada rightmask |= bitmask; 46628ba53c0SMasahiro Yamada if (node->rightnode == LEAF) { 46728ba53c0SMasahiro Yamada assert(node->right); 46828ba53c0SMasahiro Yamada tree->leaf_print(node->right, 46928ba53c0SMasahiro Yamada indent+1); 47028ba53c0SMasahiro Yamada leaves += 1; 47128ba53c0SMasahiro Yamada } else if (node->right) { 47228ba53c0SMasahiro Yamada assert(node->rightnode == NODE); 47328ba53c0SMasahiro Yamada indent += 1; 47428ba53c0SMasahiro Yamada node = node->right; 47528ba53c0SMasahiro Yamada break; 47628ba53c0SMasahiro Yamada } 47728ba53c0SMasahiro Yamada } 47828ba53c0SMasahiro Yamada leftmask &= ~bitmask; 47928ba53c0SMasahiro Yamada rightmask &= ~bitmask; 48028ba53c0SMasahiro Yamada node = node->parent; 48128ba53c0SMasahiro Yamada indent -= 1; 48228ba53c0SMasahiro Yamada } 48328ba53c0SMasahiro Yamada } 48428ba53c0SMasahiro Yamada } 48528ba53c0SMasahiro Yamada printf("nodes %d leaves %d singletons %d\n", 48628ba53c0SMasahiro Yamada nodes, leaves, singletons); 48728ba53c0SMasahiro Yamada } 48828ba53c0SMasahiro Yamada 48928ba53c0SMasahiro Yamada /* 49028ba53c0SMasahiro Yamada * Allocate an initialize a new internal node. 49128ba53c0SMasahiro Yamada */ 49228ba53c0SMasahiro Yamada static struct node *alloc_node(struct node *parent) 49328ba53c0SMasahiro Yamada { 49428ba53c0SMasahiro Yamada struct node *node; 49528ba53c0SMasahiro Yamada int bitnum; 49628ba53c0SMasahiro Yamada 49728ba53c0SMasahiro Yamada node = malloc(sizeof(*node)); 49828ba53c0SMasahiro Yamada node->left = node->right = NULL; 49928ba53c0SMasahiro Yamada node->parent = parent; 50028ba53c0SMasahiro Yamada node->leftnode = NODE; 50128ba53c0SMasahiro Yamada node->rightnode = NODE; 50228ba53c0SMasahiro Yamada node->keybits = 0; 50328ba53c0SMasahiro Yamada node->keymask = 0; 50428ba53c0SMasahiro Yamada node->mark = 0; 50528ba53c0SMasahiro Yamada node->index = 0; 50628ba53c0SMasahiro Yamada node->offset = -1; 50728ba53c0SMasahiro Yamada node->size = 4; 50828ba53c0SMasahiro Yamada 50928ba53c0SMasahiro Yamada if (node->parent) { 51028ba53c0SMasahiro Yamada bitnum = parent->bitnum; 51128ba53c0SMasahiro Yamada if ((bitnum & 7) == 0) { 51228ba53c0SMasahiro Yamada node->bitnum = bitnum + 7 + 8; 51328ba53c0SMasahiro Yamada node->nextbyte = 1; 51428ba53c0SMasahiro Yamada } else { 51528ba53c0SMasahiro Yamada node->bitnum = bitnum - 1; 51628ba53c0SMasahiro Yamada node->nextbyte = 0; 51728ba53c0SMasahiro Yamada } 51828ba53c0SMasahiro Yamada } else { 51928ba53c0SMasahiro Yamada node->bitnum = 7; 52028ba53c0SMasahiro Yamada node->nextbyte = 0; 52128ba53c0SMasahiro Yamada } 52228ba53c0SMasahiro Yamada 52328ba53c0SMasahiro Yamada return node; 52428ba53c0SMasahiro Yamada } 52528ba53c0SMasahiro Yamada 52628ba53c0SMasahiro Yamada /* 52728ba53c0SMasahiro Yamada * Insert a new leaf into the tree, and collapse any subtrees that are 52828ba53c0SMasahiro Yamada * fully populated and end in identical leaves. A nextbyte tagged 52928ba53c0SMasahiro Yamada * internal node will not be removed to preserve the tree's integrity. 53028ba53c0SMasahiro Yamada * Note that due to the structure of utf8, no nextbyte tagged node 53128ba53c0SMasahiro Yamada * will be a candidate for removal. 53228ba53c0SMasahiro Yamada */ 53328ba53c0SMasahiro Yamada static int insert(struct tree *tree, char *key, int keylen, void *leaf) 53428ba53c0SMasahiro Yamada { 53528ba53c0SMasahiro Yamada struct node *node; 53628ba53c0SMasahiro Yamada struct node *parent; 53728ba53c0SMasahiro Yamada void **cursor; 53828ba53c0SMasahiro Yamada int keybits; 53928ba53c0SMasahiro Yamada 54028ba53c0SMasahiro Yamada assert(keylen >= 1 && keylen <= 4); 54128ba53c0SMasahiro Yamada 54228ba53c0SMasahiro Yamada node = NULL; 54328ba53c0SMasahiro Yamada cursor = &tree->root; 54428ba53c0SMasahiro Yamada keybits = 8 * keylen; 54528ba53c0SMasahiro Yamada 54628ba53c0SMasahiro Yamada /* Insert, creating path along the way. */ 54728ba53c0SMasahiro Yamada while (keybits) { 54828ba53c0SMasahiro Yamada if (!*cursor) 54928ba53c0SMasahiro Yamada *cursor = alloc_node(node); 55028ba53c0SMasahiro Yamada node = *cursor; 55128ba53c0SMasahiro Yamada if (node->nextbyte) 55228ba53c0SMasahiro Yamada key++; 55328ba53c0SMasahiro Yamada if (*key & (1 << (node->bitnum & 7))) 55428ba53c0SMasahiro Yamada cursor = &node->right; 55528ba53c0SMasahiro Yamada else 55628ba53c0SMasahiro Yamada cursor = &node->left; 55728ba53c0SMasahiro Yamada keybits--; 55828ba53c0SMasahiro Yamada } 55928ba53c0SMasahiro Yamada *cursor = leaf; 56028ba53c0SMasahiro Yamada 56128ba53c0SMasahiro Yamada /* Merge subtrees if possible. */ 56228ba53c0SMasahiro Yamada while (node) { 56328ba53c0SMasahiro Yamada if (*key & (1 << (node->bitnum & 7))) 56428ba53c0SMasahiro Yamada node->rightnode = LEAF; 56528ba53c0SMasahiro Yamada else 56628ba53c0SMasahiro Yamada node->leftnode = LEAF; 56728ba53c0SMasahiro Yamada if (node->nextbyte) 56828ba53c0SMasahiro Yamada break; 56928ba53c0SMasahiro Yamada if (node->leftnode == NODE || node->rightnode == NODE) 57028ba53c0SMasahiro Yamada break; 57128ba53c0SMasahiro Yamada assert(node->left); 57228ba53c0SMasahiro Yamada assert(node->right); 57328ba53c0SMasahiro Yamada /* Compare */ 57428ba53c0SMasahiro Yamada if (! tree->leaf_equal(node->left, node->right)) 57528ba53c0SMasahiro Yamada break; 57628ba53c0SMasahiro Yamada /* Keep left, drop right leaf. */ 57728ba53c0SMasahiro Yamada leaf = node->left; 57828ba53c0SMasahiro Yamada /* Check in parent */ 57928ba53c0SMasahiro Yamada parent = node->parent; 58028ba53c0SMasahiro Yamada if (!parent) { 58128ba53c0SMasahiro Yamada /* root of tree! */ 58228ba53c0SMasahiro Yamada tree->root = leaf; 58328ba53c0SMasahiro Yamada tree->childnode = LEAF; 58428ba53c0SMasahiro Yamada } else if (parent->left == node) { 58528ba53c0SMasahiro Yamada parent->left = leaf; 58628ba53c0SMasahiro Yamada parent->leftnode = LEAF; 58728ba53c0SMasahiro Yamada if (parent->right) { 58828ba53c0SMasahiro Yamada parent->keymask = 0; 58928ba53c0SMasahiro Yamada parent->keybits = 0; 59028ba53c0SMasahiro Yamada } else { 59128ba53c0SMasahiro Yamada parent->keymask |= (1 << node->bitnum); 59228ba53c0SMasahiro Yamada } 59328ba53c0SMasahiro Yamada } else if (parent->right == node) { 59428ba53c0SMasahiro Yamada parent->right = leaf; 59528ba53c0SMasahiro Yamada parent->rightnode = LEAF; 59628ba53c0SMasahiro Yamada if (parent->left) { 59728ba53c0SMasahiro Yamada parent->keymask = 0; 59828ba53c0SMasahiro Yamada parent->keybits = 0; 59928ba53c0SMasahiro Yamada } else { 60028ba53c0SMasahiro Yamada parent->keymask |= (1 << node->bitnum); 60128ba53c0SMasahiro Yamada parent->keybits |= (1 << node->bitnum); 60228ba53c0SMasahiro Yamada } 60328ba53c0SMasahiro Yamada } else { 60428ba53c0SMasahiro Yamada /* internal tree error */ 60528ba53c0SMasahiro Yamada assert(0); 60628ba53c0SMasahiro Yamada } 60728ba53c0SMasahiro Yamada free(node); 60828ba53c0SMasahiro Yamada node = parent; 60928ba53c0SMasahiro Yamada } 61028ba53c0SMasahiro Yamada 61128ba53c0SMasahiro Yamada /* Propagate keymasks up along singleton chains. */ 61228ba53c0SMasahiro Yamada while (node) { 61328ba53c0SMasahiro Yamada parent = node->parent; 61428ba53c0SMasahiro Yamada if (!parent) 61528ba53c0SMasahiro Yamada break; 61628ba53c0SMasahiro Yamada /* Nix the mask for parents with two children. */ 61728ba53c0SMasahiro Yamada if (node->keymask == 0) { 61828ba53c0SMasahiro Yamada parent->keymask = 0; 61928ba53c0SMasahiro Yamada parent->keybits = 0; 62028ba53c0SMasahiro Yamada } else if (parent->left && parent->right) { 62128ba53c0SMasahiro Yamada parent->keymask = 0; 62228ba53c0SMasahiro Yamada parent->keybits = 0; 62328ba53c0SMasahiro Yamada } else { 62428ba53c0SMasahiro Yamada assert((parent->keymask & node->keymask) == 0); 62528ba53c0SMasahiro Yamada parent->keymask |= node->keymask; 62628ba53c0SMasahiro Yamada parent->keymask |= (1 << parent->bitnum); 62728ba53c0SMasahiro Yamada parent->keybits |= node->keybits; 62828ba53c0SMasahiro Yamada if (parent->right) 62928ba53c0SMasahiro Yamada parent->keybits |= (1 << parent->bitnum); 63028ba53c0SMasahiro Yamada } 63128ba53c0SMasahiro Yamada node = parent; 63228ba53c0SMasahiro Yamada } 63328ba53c0SMasahiro Yamada 63428ba53c0SMasahiro Yamada return 0; 63528ba53c0SMasahiro Yamada } 63628ba53c0SMasahiro Yamada 63728ba53c0SMasahiro Yamada /* 63828ba53c0SMasahiro Yamada * Prune internal nodes. 63928ba53c0SMasahiro Yamada * 64028ba53c0SMasahiro Yamada * Fully populated subtrees that end at the same leaf have already 64128ba53c0SMasahiro Yamada * been collapsed. There are still internal nodes that have for both 64228ba53c0SMasahiro Yamada * their left and right branches a sequence of singletons that make 64328ba53c0SMasahiro Yamada * identical choices and end in identical leaves. The keymask and 64428ba53c0SMasahiro Yamada * keybits collected in the nodes describe the choices made in these 64528ba53c0SMasahiro Yamada * singleton chains. When they are identical for the left and right 64628ba53c0SMasahiro Yamada * branch of a node, and the two leaves comare identical, the node in 64728ba53c0SMasahiro Yamada * question can be removed. 64828ba53c0SMasahiro Yamada * 64928ba53c0SMasahiro Yamada * Note that nodes with the nextbyte tag set will not be removed by 65028ba53c0SMasahiro Yamada * this to ensure tree integrity. Note as well that the structure of 65128ba53c0SMasahiro Yamada * utf8 ensures that these nodes would not have been candidates for 65228ba53c0SMasahiro Yamada * removal in any case. 65328ba53c0SMasahiro Yamada */ 65428ba53c0SMasahiro Yamada static void prune(struct tree *tree) 65528ba53c0SMasahiro Yamada { 65628ba53c0SMasahiro Yamada struct node *node; 65728ba53c0SMasahiro Yamada struct node *left; 65828ba53c0SMasahiro Yamada struct node *right; 65928ba53c0SMasahiro Yamada struct node *parent; 66028ba53c0SMasahiro Yamada void *leftleaf; 66128ba53c0SMasahiro Yamada void *rightleaf; 66228ba53c0SMasahiro Yamada unsigned int leftmask; 66328ba53c0SMasahiro Yamada unsigned int rightmask; 66428ba53c0SMasahiro Yamada unsigned int bitmask; 66528ba53c0SMasahiro Yamada int count; 66628ba53c0SMasahiro Yamada 66728ba53c0SMasahiro Yamada if (verbose > 0) 66828ba53c0SMasahiro Yamada printf("Pruning %s_%x\n", tree->type, tree->maxage); 66928ba53c0SMasahiro Yamada 67028ba53c0SMasahiro Yamada count = 0; 67128ba53c0SMasahiro Yamada if (tree->childnode == LEAF) 67228ba53c0SMasahiro Yamada return; 67328ba53c0SMasahiro Yamada if (!tree->root) 67428ba53c0SMasahiro Yamada return; 67528ba53c0SMasahiro Yamada 67628ba53c0SMasahiro Yamada leftmask = rightmask = 0; 67728ba53c0SMasahiro Yamada node = tree->root; 67828ba53c0SMasahiro Yamada while (node) { 67928ba53c0SMasahiro Yamada if (node->nextbyte) 68028ba53c0SMasahiro Yamada goto advance; 68128ba53c0SMasahiro Yamada if (node->leftnode == LEAF) 68228ba53c0SMasahiro Yamada goto advance; 68328ba53c0SMasahiro Yamada if (node->rightnode == LEAF) 68428ba53c0SMasahiro Yamada goto advance; 68528ba53c0SMasahiro Yamada if (!node->left) 68628ba53c0SMasahiro Yamada goto advance; 68728ba53c0SMasahiro Yamada if (!node->right) 68828ba53c0SMasahiro Yamada goto advance; 68928ba53c0SMasahiro Yamada left = node->left; 69028ba53c0SMasahiro Yamada right = node->right; 69128ba53c0SMasahiro Yamada if (left->keymask == 0) 69228ba53c0SMasahiro Yamada goto advance; 69328ba53c0SMasahiro Yamada if (right->keymask == 0) 69428ba53c0SMasahiro Yamada goto advance; 69528ba53c0SMasahiro Yamada if (left->keymask != right->keymask) 69628ba53c0SMasahiro Yamada goto advance; 69728ba53c0SMasahiro Yamada if (left->keybits != right->keybits) 69828ba53c0SMasahiro Yamada goto advance; 69928ba53c0SMasahiro Yamada leftleaf = NULL; 70028ba53c0SMasahiro Yamada while (!leftleaf) { 70128ba53c0SMasahiro Yamada assert(left->left || left->right); 70228ba53c0SMasahiro Yamada if (left->leftnode == LEAF) 70328ba53c0SMasahiro Yamada leftleaf = left->left; 70428ba53c0SMasahiro Yamada else if (left->rightnode == LEAF) 70528ba53c0SMasahiro Yamada leftleaf = left->right; 70628ba53c0SMasahiro Yamada else if (left->left) 70728ba53c0SMasahiro Yamada left = left->left; 70828ba53c0SMasahiro Yamada else if (left->right) 70928ba53c0SMasahiro Yamada left = left->right; 71028ba53c0SMasahiro Yamada else 71128ba53c0SMasahiro Yamada assert(0); 71228ba53c0SMasahiro Yamada } 71328ba53c0SMasahiro Yamada rightleaf = NULL; 71428ba53c0SMasahiro Yamada while (!rightleaf) { 71528ba53c0SMasahiro Yamada assert(right->left || right->right); 71628ba53c0SMasahiro Yamada if (right->leftnode == LEAF) 71728ba53c0SMasahiro Yamada rightleaf = right->left; 71828ba53c0SMasahiro Yamada else if (right->rightnode == LEAF) 71928ba53c0SMasahiro Yamada rightleaf = right->right; 72028ba53c0SMasahiro Yamada else if (right->left) 72128ba53c0SMasahiro Yamada right = right->left; 72228ba53c0SMasahiro Yamada else if (right->right) 72328ba53c0SMasahiro Yamada right = right->right; 72428ba53c0SMasahiro Yamada else 72528ba53c0SMasahiro Yamada assert(0); 72628ba53c0SMasahiro Yamada } 72728ba53c0SMasahiro Yamada if (! tree->leaf_equal(leftleaf, rightleaf)) 72828ba53c0SMasahiro Yamada goto advance; 72928ba53c0SMasahiro Yamada /* 73028ba53c0SMasahiro Yamada * This node has identical singleton-only subtrees. 73128ba53c0SMasahiro Yamada * Remove it. 73228ba53c0SMasahiro Yamada */ 73328ba53c0SMasahiro Yamada parent = node->parent; 73428ba53c0SMasahiro Yamada left = node->left; 73528ba53c0SMasahiro Yamada right = node->right; 73628ba53c0SMasahiro Yamada if (parent->left == node) 73728ba53c0SMasahiro Yamada parent->left = left; 73828ba53c0SMasahiro Yamada else if (parent->right == node) 73928ba53c0SMasahiro Yamada parent->right = left; 74028ba53c0SMasahiro Yamada else 74128ba53c0SMasahiro Yamada assert(0); 74228ba53c0SMasahiro Yamada left->parent = parent; 74328ba53c0SMasahiro Yamada left->keymask |= (1 << node->bitnum); 74428ba53c0SMasahiro Yamada node->left = NULL; 74528ba53c0SMasahiro Yamada while (node) { 74628ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 74728ba53c0SMasahiro Yamada leftmask &= ~bitmask; 74828ba53c0SMasahiro Yamada rightmask &= ~bitmask; 74928ba53c0SMasahiro Yamada if (node->leftnode == NODE && node->left) { 75028ba53c0SMasahiro Yamada left = node->left; 75128ba53c0SMasahiro Yamada free(node); 75228ba53c0SMasahiro Yamada count++; 75328ba53c0SMasahiro Yamada node = left; 75428ba53c0SMasahiro Yamada } else if (node->rightnode == NODE && node->right) { 75528ba53c0SMasahiro Yamada right = node->right; 75628ba53c0SMasahiro Yamada free(node); 75728ba53c0SMasahiro Yamada count++; 75828ba53c0SMasahiro Yamada node = right; 75928ba53c0SMasahiro Yamada } else { 76028ba53c0SMasahiro Yamada node = NULL; 76128ba53c0SMasahiro Yamada } 76228ba53c0SMasahiro Yamada } 76328ba53c0SMasahiro Yamada /* Propagate keymasks up along singleton chains. */ 76428ba53c0SMasahiro Yamada node = parent; 76528ba53c0SMasahiro Yamada /* Force re-check */ 76628ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 76728ba53c0SMasahiro Yamada leftmask &= ~bitmask; 76828ba53c0SMasahiro Yamada rightmask &= ~bitmask; 76928ba53c0SMasahiro Yamada for (;;) { 77028ba53c0SMasahiro Yamada if (node->left && node->right) 77128ba53c0SMasahiro Yamada break; 77228ba53c0SMasahiro Yamada if (node->left) { 77328ba53c0SMasahiro Yamada left = node->left; 77428ba53c0SMasahiro Yamada node->keymask |= left->keymask; 77528ba53c0SMasahiro Yamada node->keybits |= left->keybits; 77628ba53c0SMasahiro Yamada } 77728ba53c0SMasahiro Yamada if (node->right) { 77828ba53c0SMasahiro Yamada right = node->right; 77928ba53c0SMasahiro Yamada node->keymask |= right->keymask; 78028ba53c0SMasahiro Yamada node->keybits |= right->keybits; 78128ba53c0SMasahiro Yamada } 78228ba53c0SMasahiro Yamada node->keymask |= (1 << node->bitnum); 78328ba53c0SMasahiro Yamada node = node->parent; 78428ba53c0SMasahiro Yamada /* Force re-check */ 78528ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 78628ba53c0SMasahiro Yamada leftmask &= ~bitmask; 78728ba53c0SMasahiro Yamada rightmask &= ~bitmask; 78828ba53c0SMasahiro Yamada } 78928ba53c0SMasahiro Yamada advance: 79028ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 79128ba53c0SMasahiro Yamada if ((leftmask & bitmask) == 0 && 79228ba53c0SMasahiro Yamada node->leftnode == NODE && 79328ba53c0SMasahiro Yamada node->left) { 79428ba53c0SMasahiro Yamada leftmask |= bitmask; 79528ba53c0SMasahiro Yamada node = node->left; 79628ba53c0SMasahiro Yamada } else if ((rightmask & bitmask) == 0 && 79728ba53c0SMasahiro Yamada node->rightnode == NODE && 79828ba53c0SMasahiro Yamada node->right) { 79928ba53c0SMasahiro Yamada rightmask |= bitmask; 80028ba53c0SMasahiro Yamada node = node->right; 80128ba53c0SMasahiro Yamada } else { 80228ba53c0SMasahiro Yamada leftmask &= ~bitmask; 80328ba53c0SMasahiro Yamada rightmask &= ~bitmask; 80428ba53c0SMasahiro Yamada node = node->parent; 80528ba53c0SMasahiro Yamada } 80628ba53c0SMasahiro Yamada } 80728ba53c0SMasahiro Yamada if (verbose > 0) 80828ba53c0SMasahiro Yamada printf("Pruned %d nodes\n", count); 80928ba53c0SMasahiro Yamada } 81028ba53c0SMasahiro Yamada 81128ba53c0SMasahiro Yamada /* 81228ba53c0SMasahiro Yamada * Mark the nodes in the tree that lead to leaves that must be 81328ba53c0SMasahiro Yamada * emitted. 81428ba53c0SMasahiro Yamada */ 81528ba53c0SMasahiro Yamada static void mark_nodes(struct tree *tree) 81628ba53c0SMasahiro Yamada { 81728ba53c0SMasahiro Yamada struct node *node; 81828ba53c0SMasahiro Yamada struct node *n; 81928ba53c0SMasahiro Yamada unsigned int leftmask; 82028ba53c0SMasahiro Yamada unsigned int rightmask; 82128ba53c0SMasahiro Yamada unsigned int bitmask; 82228ba53c0SMasahiro Yamada int marked; 82328ba53c0SMasahiro Yamada 82428ba53c0SMasahiro Yamada marked = 0; 82528ba53c0SMasahiro Yamada if (verbose > 0) 82628ba53c0SMasahiro Yamada printf("Marking %s_%x\n", tree->type, tree->maxage); 82728ba53c0SMasahiro Yamada if (tree->childnode == LEAF) 82828ba53c0SMasahiro Yamada goto done; 82928ba53c0SMasahiro Yamada 83028ba53c0SMasahiro Yamada assert(tree->childnode == NODE); 83128ba53c0SMasahiro Yamada node = tree->root; 83228ba53c0SMasahiro Yamada leftmask = rightmask = 0; 83328ba53c0SMasahiro Yamada while (node) { 83428ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 83528ba53c0SMasahiro Yamada if ((leftmask & bitmask) == 0) { 83628ba53c0SMasahiro Yamada leftmask |= bitmask; 83728ba53c0SMasahiro Yamada if (node->leftnode == LEAF) { 83828ba53c0SMasahiro Yamada assert(node->left); 83928ba53c0SMasahiro Yamada if (tree->leaf_mark(node->left)) { 84028ba53c0SMasahiro Yamada n = node; 84128ba53c0SMasahiro Yamada while (n && !n->mark) { 84228ba53c0SMasahiro Yamada marked++; 84328ba53c0SMasahiro Yamada n->mark = 1; 84428ba53c0SMasahiro Yamada n = n->parent; 84528ba53c0SMasahiro Yamada } 84628ba53c0SMasahiro Yamada } 84728ba53c0SMasahiro Yamada } else if (node->left) { 84828ba53c0SMasahiro Yamada assert(node->leftnode == NODE); 84928ba53c0SMasahiro Yamada node = node->left; 85028ba53c0SMasahiro Yamada continue; 85128ba53c0SMasahiro Yamada } 85228ba53c0SMasahiro Yamada } 85328ba53c0SMasahiro Yamada if ((rightmask & bitmask) == 0) { 85428ba53c0SMasahiro Yamada rightmask |= bitmask; 85528ba53c0SMasahiro Yamada if (node->rightnode == LEAF) { 85628ba53c0SMasahiro Yamada assert(node->right); 85728ba53c0SMasahiro Yamada if (tree->leaf_mark(node->right)) { 85828ba53c0SMasahiro Yamada n = node; 85928ba53c0SMasahiro Yamada while (n && !n->mark) { 86028ba53c0SMasahiro Yamada marked++; 86128ba53c0SMasahiro Yamada n->mark = 1; 86228ba53c0SMasahiro Yamada n = n->parent; 86328ba53c0SMasahiro Yamada } 86428ba53c0SMasahiro Yamada } 86528ba53c0SMasahiro Yamada } else if (node->right) { 86628ba53c0SMasahiro Yamada assert(node->rightnode == NODE); 86728ba53c0SMasahiro Yamada node = node->right; 86828ba53c0SMasahiro Yamada continue; 86928ba53c0SMasahiro Yamada } 87028ba53c0SMasahiro Yamada } 87128ba53c0SMasahiro Yamada leftmask &= ~bitmask; 87228ba53c0SMasahiro Yamada rightmask &= ~bitmask; 87328ba53c0SMasahiro Yamada node = node->parent; 87428ba53c0SMasahiro Yamada } 87528ba53c0SMasahiro Yamada 87628ba53c0SMasahiro Yamada /* second pass: left siblings and singletons */ 87728ba53c0SMasahiro Yamada 87828ba53c0SMasahiro Yamada assert(tree->childnode == NODE); 87928ba53c0SMasahiro Yamada node = tree->root; 88028ba53c0SMasahiro Yamada leftmask = rightmask = 0; 88128ba53c0SMasahiro Yamada while (node) { 88228ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 88328ba53c0SMasahiro Yamada if ((leftmask & bitmask) == 0) { 88428ba53c0SMasahiro Yamada leftmask |= bitmask; 88528ba53c0SMasahiro Yamada if (node->leftnode == LEAF) { 88628ba53c0SMasahiro Yamada assert(node->left); 88728ba53c0SMasahiro Yamada if (tree->leaf_mark(node->left)) { 88828ba53c0SMasahiro Yamada n = node; 88928ba53c0SMasahiro Yamada while (n && !n->mark) { 89028ba53c0SMasahiro Yamada marked++; 89128ba53c0SMasahiro Yamada n->mark = 1; 89228ba53c0SMasahiro Yamada n = n->parent; 89328ba53c0SMasahiro Yamada } 89428ba53c0SMasahiro Yamada } 89528ba53c0SMasahiro Yamada } else if (node->left) { 89628ba53c0SMasahiro Yamada assert(node->leftnode == NODE); 89728ba53c0SMasahiro Yamada node = node->left; 89828ba53c0SMasahiro Yamada if (!node->mark && node->parent->mark) { 89928ba53c0SMasahiro Yamada marked++; 90028ba53c0SMasahiro Yamada node->mark = 1; 90128ba53c0SMasahiro Yamada } 90228ba53c0SMasahiro Yamada continue; 90328ba53c0SMasahiro Yamada } 90428ba53c0SMasahiro Yamada } 90528ba53c0SMasahiro Yamada if ((rightmask & bitmask) == 0) { 90628ba53c0SMasahiro Yamada rightmask |= bitmask; 90728ba53c0SMasahiro Yamada if (node->rightnode == LEAF) { 90828ba53c0SMasahiro Yamada assert(node->right); 90928ba53c0SMasahiro Yamada if (tree->leaf_mark(node->right)) { 91028ba53c0SMasahiro Yamada n = node; 91128ba53c0SMasahiro Yamada while (n && !n->mark) { 91228ba53c0SMasahiro Yamada marked++; 91328ba53c0SMasahiro Yamada n->mark = 1; 91428ba53c0SMasahiro Yamada n = n->parent; 91528ba53c0SMasahiro Yamada } 91628ba53c0SMasahiro Yamada } 91728ba53c0SMasahiro Yamada } else if (node->right) { 91828ba53c0SMasahiro Yamada assert(node->rightnode == NODE); 91928ba53c0SMasahiro Yamada node = node->right; 92028ba53c0SMasahiro Yamada if (!node->mark && node->parent->mark && 92128ba53c0SMasahiro Yamada !node->parent->left) { 92228ba53c0SMasahiro Yamada marked++; 92328ba53c0SMasahiro Yamada node->mark = 1; 92428ba53c0SMasahiro Yamada } 92528ba53c0SMasahiro Yamada continue; 92628ba53c0SMasahiro Yamada } 92728ba53c0SMasahiro Yamada } 92828ba53c0SMasahiro Yamada leftmask &= ~bitmask; 92928ba53c0SMasahiro Yamada rightmask &= ~bitmask; 93028ba53c0SMasahiro Yamada node = node->parent; 93128ba53c0SMasahiro Yamada } 93228ba53c0SMasahiro Yamada done: 93328ba53c0SMasahiro Yamada if (verbose > 0) 93428ba53c0SMasahiro Yamada printf("Marked %d nodes\n", marked); 93528ba53c0SMasahiro Yamada } 93628ba53c0SMasahiro Yamada 93728ba53c0SMasahiro Yamada /* 93828ba53c0SMasahiro Yamada * Compute the index of each node and leaf, which is the offset in the 93928ba53c0SMasahiro Yamada * emitted trie. These values must be pre-computed because relative 94028ba53c0SMasahiro Yamada * offsets between nodes are used to navigate the tree. 94128ba53c0SMasahiro Yamada */ 94228ba53c0SMasahiro Yamada static int index_nodes(struct tree *tree, int index) 94328ba53c0SMasahiro Yamada { 94428ba53c0SMasahiro Yamada struct node *node; 94528ba53c0SMasahiro Yamada unsigned int leftmask; 94628ba53c0SMasahiro Yamada unsigned int rightmask; 94728ba53c0SMasahiro Yamada unsigned int bitmask; 94828ba53c0SMasahiro Yamada int count; 94928ba53c0SMasahiro Yamada int indent; 95028ba53c0SMasahiro Yamada 95128ba53c0SMasahiro Yamada /* Align to a cache line (or half a cache line?). */ 95228ba53c0SMasahiro Yamada while (index % 64) 95328ba53c0SMasahiro Yamada index++; 95428ba53c0SMasahiro Yamada tree->index = index; 95528ba53c0SMasahiro Yamada indent = 1; 95628ba53c0SMasahiro Yamada count = 0; 95728ba53c0SMasahiro Yamada 95828ba53c0SMasahiro Yamada if (verbose > 0) 95928ba53c0SMasahiro Yamada printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index); 96028ba53c0SMasahiro Yamada if (tree->childnode == LEAF) { 96128ba53c0SMasahiro Yamada index += tree->leaf_size(tree->root); 96228ba53c0SMasahiro Yamada goto done; 96328ba53c0SMasahiro Yamada } 96428ba53c0SMasahiro Yamada 96528ba53c0SMasahiro Yamada assert(tree->childnode == NODE); 96628ba53c0SMasahiro Yamada node = tree->root; 96728ba53c0SMasahiro Yamada leftmask = rightmask = 0; 96828ba53c0SMasahiro Yamada while (node) { 96928ba53c0SMasahiro Yamada if (!node->mark) 97028ba53c0SMasahiro Yamada goto skip; 97128ba53c0SMasahiro Yamada count++; 97228ba53c0SMasahiro Yamada if (node->index != index) 97328ba53c0SMasahiro Yamada node->index = index; 97428ba53c0SMasahiro Yamada index += node->size; 97528ba53c0SMasahiro Yamada skip: 97628ba53c0SMasahiro Yamada while (node) { 97728ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 97828ba53c0SMasahiro Yamada if (node->mark && (leftmask & bitmask) == 0) { 97928ba53c0SMasahiro Yamada leftmask |= bitmask; 98028ba53c0SMasahiro Yamada if (node->leftnode == LEAF) { 98128ba53c0SMasahiro Yamada assert(node->left); 98228ba53c0SMasahiro Yamada *tree->leaf_index(tree, node->left) = 98328ba53c0SMasahiro Yamada index; 98428ba53c0SMasahiro Yamada index += tree->leaf_size(node->left); 98528ba53c0SMasahiro Yamada count++; 98628ba53c0SMasahiro Yamada } else if (node->left) { 98728ba53c0SMasahiro Yamada assert(node->leftnode == NODE); 98828ba53c0SMasahiro Yamada indent += 1; 98928ba53c0SMasahiro Yamada node = node->left; 99028ba53c0SMasahiro Yamada break; 99128ba53c0SMasahiro Yamada } 99228ba53c0SMasahiro Yamada } 99328ba53c0SMasahiro Yamada if (node->mark && (rightmask & bitmask) == 0) { 99428ba53c0SMasahiro Yamada rightmask |= bitmask; 99528ba53c0SMasahiro Yamada if (node->rightnode == LEAF) { 99628ba53c0SMasahiro Yamada assert(node->right); 99728ba53c0SMasahiro Yamada *tree->leaf_index(tree, node->right) = index; 99828ba53c0SMasahiro Yamada index += tree->leaf_size(node->right); 99928ba53c0SMasahiro Yamada count++; 100028ba53c0SMasahiro Yamada } else if (node->right) { 100128ba53c0SMasahiro Yamada assert(node->rightnode == NODE); 100228ba53c0SMasahiro Yamada indent += 1; 100328ba53c0SMasahiro Yamada node = node->right; 100428ba53c0SMasahiro Yamada break; 100528ba53c0SMasahiro Yamada } 100628ba53c0SMasahiro Yamada } 100728ba53c0SMasahiro Yamada leftmask &= ~bitmask; 100828ba53c0SMasahiro Yamada rightmask &= ~bitmask; 100928ba53c0SMasahiro Yamada node = node->parent; 101028ba53c0SMasahiro Yamada indent -= 1; 101128ba53c0SMasahiro Yamada } 101228ba53c0SMasahiro Yamada } 101328ba53c0SMasahiro Yamada done: 101428ba53c0SMasahiro Yamada /* Round up to a multiple of 16 */ 101528ba53c0SMasahiro Yamada while (index % 16) 101628ba53c0SMasahiro Yamada index++; 101728ba53c0SMasahiro Yamada if (verbose > 0) 101828ba53c0SMasahiro Yamada printf("Final index %d\n", index); 101928ba53c0SMasahiro Yamada return index; 102028ba53c0SMasahiro Yamada } 102128ba53c0SMasahiro Yamada 102228ba53c0SMasahiro Yamada /* 102328ba53c0SMasahiro Yamada * Mark the nodes in a subtree, helper for size_nodes(). 102428ba53c0SMasahiro Yamada */ 102528ba53c0SMasahiro Yamada static int mark_subtree(struct node *node) 102628ba53c0SMasahiro Yamada { 102728ba53c0SMasahiro Yamada int changed; 102828ba53c0SMasahiro Yamada 102928ba53c0SMasahiro Yamada if (!node || node->mark) 103028ba53c0SMasahiro Yamada return 0; 103128ba53c0SMasahiro Yamada node->mark = 1; 103228ba53c0SMasahiro Yamada node->index = node->parent->index; 103328ba53c0SMasahiro Yamada changed = 1; 103428ba53c0SMasahiro Yamada if (node->leftnode == NODE) 103528ba53c0SMasahiro Yamada changed += mark_subtree(node->left); 103628ba53c0SMasahiro Yamada if (node->rightnode == NODE) 103728ba53c0SMasahiro Yamada changed += mark_subtree(node->right); 103828ba53c0SMasahiro Yamada return changed; 103928ba53c0SMasahiro Yamada } 104028ba53c0SMasahiro Yamada 104128ba53c0SMasahiro Yamada /* 104228ba53c0SMasahiro Yamada * Compute the size of nodes and leaves. We start by assuming that 104328ba53c0SMasahiro Yamada * each node needs to store a three-byte offset. The indexes of the 104428ba53c0SMasahiro Yamada * nodes are calculated based on that, and then this function is 104528ba53c0SMasahiro Yamada * called to see if the sizes of some nodes can be reduced. This is 104628ba53c0SMasahiro Yamada * repeated until no more changes are seen. 104728ba53c0SMasahiro Yamada */ 104828ba53c0SMasahiro Yamada static int size_nodes(struct tree *tree) 104928ba53c0SMasahiro Yamada { 105028ba53c0SMasahiro Yamada struct tree *next; 105128ba53c0SMasahiro Yamada struct node *node; 105228ba53c0SMasahiro Yamada struct node *right; 105328ba53c0SMasahiro Yamada struct node *n; 105428ba53c0SMasahiro Yamada unsigned int leftmask; 105528ba53c0SMasahiro Yamada unsigned int rightmask; 105628ba53c0SMasahiro Yamada unsigned int bitmask; 105728ba53c0SMasahiro Yamada unsigned int pathbits; 105828ba53c0SMasahiro Yamada unsigned int pathmask; 105928ba53c0SMasahiro Yamada unsigned int nbit; 106028ba53c0SMasahiro Yamada int changed; 106128ba53c0SMasahiro Yamada int offset; 106228ba53c0SMasahiro Yamada int size; 106328ba53c0SMasahiro Yamada int indent; 106428ba53c0SMasahiro Yamada 106528ba53c0SMasahiro Yamada indent = 1; 106628ba53c0SMasahiro Yamada changed = 0; 106728ba53c0SMasahiro Yamada size = 0; 106828ba53c0SMasahiro Yamada 106928ba53c0SMasahiro Yamada if (verbose > 0) 107028ba53c0SMasahiro Yamada printf("Sizing %s_%x\n", tree->type, tree->maxage); 107128ba53c0SMasahiro Yamada if (tree->childnode == LEAF) 107228ba53c0SMasahiro Yamada goto done; 107328ba53c0SMasahiro Yamada 107428ba53c0SMasahiro Yamada assert(tree->childnode == NODE); 107528ba53c0SMasahiro Yamada pathbits = 0; 107628ba53c0SMasahiro Yamada pathmask = 0; 107728ba53c0SMasahiro Yamada node = tree->root; 107828ba53c0SMasahiro Yamada leftmask = rightmask = 0; 107928ba53c0SMasahiro Yamada while (node) { 108028ba53c0SMasahiro Yamada if (!node->mark) 108128ba53c0SMasahiro Yamada goto skip; 108228ba53c0SMasahiro Yamada offset = 0; 108328ba53c0SMasahiro Yamada if (!node->left || !node->right) { 108428ba53c0SMasahiro Yamada size = 1; 108528ba53c0SMasahiro Yamada } else { 108628ba53c0SMasahiro Yamada if (node->rightnode == NODE) { 108728ba53c0SMasahiro Yamada /* 108828ba53c0SMasahiro Yamada * If the right node is not marked, 108928ba53c0SMasahiro Yamada * look for a corresponding node in 109028ba53c0SMasahiro Yamada * the next tree. Such a node need 109128ba53c0SMasahiro Yamada * not exist. 109228ba53c0SMasahiro Yamada */ 109328ba53c0SMasahiro Yamada right = node->right; 109428ba53c0SMasahiro Yamada next = tree->next; 109528ba53c0SMasahiro Yamada while (!right->mark) { 109628ba53c0SMasahiro Yamada assert(next); 109728ba53c0SMasahiro Yamada n = next->root; 109828ba53c0SMasahiro Yamada while (n->bitnum != node->bitnum) { 109928ba53c0SMasahiro Yamada nbit = 1 << n->bitnum; 110028ba53c0SMasahiro Yamada if (!(pathmask & nbit)) 110128ba53c0SMasahiro Yamada break; 110228ba53c0SMasahiro Yamada if (pathbits & nbit) { 110328ba53c0SMasahiro Yamada if (n->rightnode == LEAF) 110428ba53c0SMasahiro Yamada break; 110528ba53c0SMasahiro Yamada n = n->right; 110628ba53c0SMasahiro Yamada } else { 110728ba53c0SMasahiro Yamada if (n->leftnode == LEAF) 110828ba53c0SMasahiro Yamada break; 110928ba53c0SMasahiro Yamada n = n->left; 111028ba53c0SMasahiro Yamada } 111128ba53c0SMasahiro Yamada } 111228ba53c0SMasahiro Yamada if (n->bitnum != node->bitnum) 111328ba53c0SMasahiro Yamada break; 111428ba53c0SMasahiro Yamada n = n->right; 111528ba53c0SMasahiro Yamada right = n; 111628ba53c0SMasahiro Yamada next = next->next; 111728ba53c0SMasahiro Yamada } 111828ba53c0SMasahiro Yamada /* Make sure the right node is marked. */ 111928ba53c0SMasahiro Yamada if (!right->mark) 112028ba53c0SMasahiro Yamada changed += mark_subtree(right); 112128ba53c0SMasahiro Yamada offset = right->index - node->index; 112228ba53c0SMasahiro Yamada } else { 112328ba53c0SMasahiro Yamada offset = *tree->leaf_index(tree, node->right); 112428ba53c0SMasahiro Yamada offset -= node->index; 112528ba53c0SMasahiro Yamada } 112628ba53c0SMasahiro Yamada assert(offset >= 0); 112728ba53c0SMasahiro Yamada assert(offset <= 0xffffff); 112828ba53c0SMasahiro Yamada if (offset <= 0xff) { 112928ba53c0SMasahiro Yamada size = 2; 113028ba53c0SMasahiro Yamada } else if (offset <= 0xffff) { 113128ba53c0SMasahiro Yamada size = 3; 113228ba53c0SMasahiro Yamada } else { /* offset <= 0xffffff */ 113328ba53c0SMasahiro Yamada size = 4; 113428ba53c0SMasahiro Yamada } 113528ba53c0SMasahiro Yamada } 113628ba53c0SMasahiro Yamada if (node->size != size || node->offset != offset) { 113728ba53c0SMasahiro Yamada node->size = size; 113828ba53c0SMasahiro Yamada node->offset = offset; 113928ba53c0SMasahiro Yamada changed++; 114028ba53c0SMasahiro Yamada } 114128ba53c0SMasahiro Yamada skip: 114228ba53c0SMasahiro Yamada while (node) { 114328ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 114428ba53c0SMasahiro Yamada pathmask |= bitmask; 114528ba53c0SMasahiro Yamada if (node->mark && (leftmask & bitmask) == 0) { 114628ba53c0SMasahiro Yamada leftmask |= bitmask; 114728ba53c0SMasahiro Yamada if (node->leftnode == LEAF) { 114828ba53c0SMasahiro Yamada assert(node->left); 114928ba53c0SMasahiro Yamada } else if (node->left) { 115028ba53c0SMasahiro Yamada assert(node->leftnode == NODE); 115128ba53c0SMasahiro Yamada indent += 1; 115228ba53c0SMasahiro Yamada node = node->left; 115328ba53c0SMasahiro Yamada break; 115428ba53c0SMasahiro Yamada } 115528ba53c0SMasahiro Yamada } 115628ba53c0SMasahiro Yamada if (node->mark && (rightmask & bitmask) == 0) { 115728ba53c0SMasahiro Yamada rightmask |= bitmask; 115828ba53c0SMasahiro Yamada pathbits |= bitmask; 115928ba53c0SMasahiro Yamada if (node->rightnode == LEAF) { 116028ba53c0SMasahiro Yamada assert(node->right); 116128ba53c0SMasahiro Yamada } else if (node->right) { 116228ba53c0SMasahiro Yamada assert(node->rightnode == NODE); 116328ba53c0SMasahiro Yamada indent += 1; 116428ba53c0SMasahiro Yamada node = node->right; 116528ba53c0SMasahiro Yamada break; 116628ba53c0SMasahiro Yamada } 116728ba53c0SMasahiro Yamada } 116828ba53c0SMasahiro Yamada leftmask &= ~bitmask; 116928ba53c0SMasahiro Yamada rightmask &= ~bitmask; 117028ba53c0SMasahiro Yamada pathmask &= ~bitmask; 117128ba53c0SMasahiro Yamada pathbits &= ~bitmask; 117228ba53c0SMasahiro Yamada node = node->parent; 117328ba53c0SMasahiro Yamada indent -= 1; 117428ba53c0SMasahiro Yamada } 117528ba53c0SMasahiro Yamada } 117628ba53c0SMasahiro Yamada done: 117728ba53c0SMasahiro Yamada if (verbose > 0) 117828ba53c0SMasahiro Yamada printf("Found %d changes\n", changed); 117928ba53c0SMasahiro Yamada return changed; 118028ba53c0SMasahiro Yamada } 118128ba53c0SMasahiro Yamada 118228ba53c0SMasahiro Yamada /* 118328ba53c0SMasahiro Yamada * Emit a trie for the given tree into the data array. 118428ba53c0SMasahiro Yamada */ 118528ba53c0SMasahiro Yamada static void emit(struct tree *tree, unsigned char *data) 118628ba53c0SMasahiro Yamada { 118728ba53c0SMasahiro Yamada struct node *node; 118828ba53c0SMasahiro Yamada unsigned int leftmask; 118928ba53c0SMasahiro Yamada unsigned int rightmask; 119028ba53c0SMasahiro Yamada unsigned int bitmask; 119128ba53c0SMasahiro Yamada int offlen; 119228ba53c0SMasahiro Yamada int offset; 119328ba53c0SMasahiro Yamada int index; 119428ba53c0SMasahiro Yamada int indent; 119528ba53c0SMasahiro Yamada int size; 119628ba53c0SMasahiro Yamada int bytes; 119728ba53c0SMasahiro Yamada int leaves; 119828ba53c0SMasahiro Yamada int nodes[4]; 119928ba53c0SMasahiro Yamada unsigned char byte; 120028ba53c0SMasahiro Yamada 120128ba53c0SMasahiro Yamada nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0; 120228ba53c0SMasahiro Yamada leaves = 0; 120328ba53c0SMasahiro Yamada bytes = 0; 120428ba53c0SMasahiro Yamada index = tree->index; 120528ba53c0SMasahiro Yamada data += index; 120628ba53c0SMasahiro Yamada indent = 1; 120728ba53c0SMasahiro Yamada if (verbose > 0) 120828ba53c0SMasahiro Yamada printf("Emitting %s_%x\n", tree->type, tree->maxage); 120928ba53c0SMasahiro Yamada if (tree->childnode == LEAF) { 121028ba53c0SMasahiro Yamada assert(tree->root); 121128ba53c0SMasahiro Yamada tree->leaf_emit(tree->root, data); 121228ba53c0SMasahiro Yamada size = tree->leaf_size(tree->root); 121328ba53c0SMasahiro Yamada index += size; 121428ba53c0SMasahiro Yamada leaves++; 121528ba53c0SMasahiro Yamada goto done; 121628ba53c0SMasahiro Yamada } 121728ba53c0SMasahiro Yamada 121828ba53c0SMasahiro Yamada assert(tree->childnode == NODE); 121928ba53c0SMasahiro Yamada node = tree->root; 122028ba53c0SMasahiro Yamada leftmask = rightmask = 0; 122128ba53c0SMasahiro Yamada while (node) { 122228ba53c0SMasahiro Yamada if (!node->mark) 122328ba53c0SMasahiro Yamada goto skip; 122428ba53c0SMasahiro Yamada assert(node->offset != -1); 122528ba53c0SMasahiro Yamada assert(node->index == index); 122628ba53c0SMasahiro Yamada 122728ba53c0SMasahiro Yamada byte = 0; 122828ba53c0SMasahiro Yamada if (node->nextbyte) 122928ba53c0SMasahiro Yamada byte |= NEXTBYTE; 123028ba53c0SMasahiro Yamada byte |= (node->bitnum & BITNUM); 123128ba53c0SMasahiro Yamada if (node->left && node->right) { 123228ba53c0SMasahiro Yamada if (node->leftnode == NODE) 123328ba53c0SMasahiro Yamada byte |= LEFTNODE; 123428ba53c0SMasahiro Yamada if (node->rightnode == NODE) 123528ba53c0SMasahiro Yamada byte |= RIGHTNODE; 123628ba53c0SMasahiro Yamada if (node->offset <= 0xff) 123728ba53c0SMasahiro Yamada offlen = 1; 123828ba53c0SMasahiro Yamada else if (node->offset <= 0xffff) 123928ba53c0SMasahiro Yamada offlen = 2; 124028ba53c0SMasahiro Yamada else 124128ba53c0SMasahiro Yamada offlen = 3; 124228ba53c0SMasahiro Yamada nodes[offlen]++; 124328ba53c0SMasahiro Yamada offset = node->offset; 124428ba53c0SMasahiro Yamada byte |= offlen << OFFLEN_SHIFT; 124528ba53c0SMasahiro Yamada *data++ = byte; 124628ba53c0SMasahiro Yamada index++; 124728ba53c0SMasahiro Yamada while (offlen--) { 124828ba53c0SMasahiro Yamada *data++ = offset & 0xff; 124928ba53c0SMasahiro Yamada index++; 125028ba53c0SMasahiro Yamada offset >>= 8; 125128ba53c0SMasahiro Yamada } 125228ba53c0SMasahiro Yamada } else if (node->left) { 125328ba53c0SMasahiro Yamada if (node->leftnode == NODE) 125428ba53c0SMasahiro Yamada byte |= TRIENODE; 125528ba53c0SMasahiro Yamada nodes[0]++; 125628ba53c0SMasahiro Yamada *data++ = byte; 125728ba53c0SMasahiro Yamada index++; 125828ba53c0SMasahiro Yamada } else if (node->right) { 125928ba53c0SMasahiro Yamada byte |= RIGHTNODE; 126028ba53c0SMasahiro Yamada if (node->rightnode == NODE) 126128ba53c0SMasahiro Yamada byte |= TRIENODE; 126228ba53c0SMasahiro Yamada nodes[0]++; 126328ba53c0SMasahiro Yamada *data++ = byte; 126428ba53c0SMasahiro Yamada index++; 126528ba53c0SMasahiro Yamada } else { 126628ba53c0SMasahiro Yamada assert(0); 126728ba53c0SMasahiro Yamada } 126828ba53c0SMasahiro Yamada skip: 126928ba53c0SMasahiro Yamada while (node) { 127028ba53c0SMasahiro Yamada bitmask = 1 << node->bitnum; 127128ba53c0SMasahiro Yamada if (node->mark && (leftmask & bitmask) == 0) { 127228ba53c0SMasahiro Yamada leftmask |= bitmask; 127328ba53c0SMasahiro Yamada if (node->leftnode == LEAF) { 127428ba53c0SMasahiro Yamada assert(node->left); 127528ba53c0SMasahiro Yamada data = tree->leaf_emit(node->left, 127628ba53c0SMasahiro Yamada data); 127728ba53c0SMasahiro Yamada size = tree->leaf_size(node->left); 127828ba53c0SMasahiro Yamada index += size; 127928ba53c0SMasahiro Yamada bytes += size; 128028ba53c0SMasahiro Yamada leaves++; 128128ba53c0SMasahiro Yamada } else if (node->left) { 128228ba53c0SMasahiro Yamada assert(node->leftnode == NODE); 128328ba53c0SMasahiro Yamada indent += 1; 128428ba53c0SMasahiro Yamada node = node->left; 128528ba53c0SMasahiro Yamada break; 128628ba53c0SMasahiro Yamada } 128728ba53c0SMasahiro Yamada } 128828ba53c0SMasahiro Yamada if (node->mark && (rightmask & bitmask) == 0) { 128928ba53c0SMasahiro Yamada rightmask |= bitmask; 129028ba53c0SMasahiro Yamada if (node->rightnode == LEAF) { 129128ba53c0SMasahiro Yamada assert(node->right); 129228ba53c0SMasahiro Yamada data = tree->leaf_emit(node->right, 129328ba53c0SMasahiro Yamada data); 129428ba53c0SMasahiro Yamada size = tree->leaf_size(node->right); 129528ba53c0SMasahiro Yamada index += size; 129628ba53c0SMasahiro Yamada bytes += size; 129728ba53c0SMasahiro Yamada leaves++; 129828ba53c0SMasahiro Yamada } else if (node->right) { 129928ba53c0SMasahiro Yamada assert(node->rightnode == NODE); 130028ba53c0SMasahiro Yamada indent += 1; 130128ba53c0SMasahiro Yamada node = node->right; 130228ba53c0SMasahiro Yamada break; 130328ba53c0SMasahiro Yamada } 130428ba53c0SMasahiro Yamada } 130528ba53c0SMasahiro Yamada leftmask &= ~bitmask; 130628ba53c0SMasahiro Yamada rightmask &= ~bitmask; 130728ba53c0SMasahiro Yamada node = node->parent; 130828ba53c0SMasahiro Yamada indent -= 1; 130928ba53c0SMasahiro Yamada } 131028ba53c0SMasahiro Yamada } 131128ba53c0SMasahiro Yamada done: 131228ba53c0SMasahiro Yamada if (verbose > 0) { 131328ba53c0SMasahiro Yamada printf("Emitted %d (%d) leaves", 131428ba53c0SMasahiro Yamada leaves, bytes); 131528ba53c0SMasahiro Yamada printf(" %d (%d+%d+%d+%d) nodes", 131628ba53c0SMasahiro Yamada nodes[0] + nodes[1] + nodes[2] + nodes[3], 131728ba53c0SMasahiro Yamada nodes[0], nodes[1], nodes[2], nodes[3]); 131828ba53c0SMasahiro Yamada printf(" %d total\n", index - tree->index); 131928ba53c0SMasahiro Yamada } 132028ba53c0SMasahiro Yamada } 132128ba53c0SMasahiro Yamada 132228ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 132328ba53c0SMasahiro Yamada 132428ba53c0SMasahiro Yamada /* 132528ba53c0SMasahiro Yamada * Unicode data. 132628ba53c0SMasahiro Yamada * 132728ba53c0SMasahiro Yamada * We need to keep track of the Canonical Combining Class, the Age, 132828ba53c0SMasahiro Yamada * and decompositions for a code point. 132928ba53c0SMasahiro Yamada * 133028ba53c0SMasahiro Yamada * For the Age, we store the index into the ages table. Effectively 133128ba53c0SMasahiro Yamada * this is a generation number that the table maps to a unicode 133228ba53c0SMasahiro Yamada * version. 133328ba53c0SMasahiro Yamada * 133428ba53c0SMasahiro Yamada * The correction field is used to indicate that this entry is in the 133528ba53c0SMasahiro Yamada * corrections array, which contains decompositions that were 133628ba53c0SMasahiro Yamada * corrected in later revisions. The value of the correction field is 133728ba53c0SMasahiro Yamada * the Unicode version in which the mapping was corrected. 133828ba53c0SMasahiro Yamada */ 133928ba53c0SMasahiro Yamada struct unicode_data { 134028ba53c0SMasahiro Yamada unsigned int code; 134128ba53c0SMasahiro Yamada int ccc; 134228ba53c0SMasahiro Yamada int gen; 134328ba53c0SMasahiro Yamada int correction; 134428ba53c0SMasahiro Yamada unsigned int *utf32nfdi; 134528ba53c0SMasahiro Yamada unsigned int *utf32nfdicf; 134628ba53c0SMasahiro Yamada char *utf8nfdi; 134728ba53c0SMasahiro Yamada char *utf8nfdicf; 134828ba53c0SMasahiro Yamada }; 134928ba53c0SMasahiro Yamada 135028ba53c0SMasahiro Yamada struct unicode_data unicode_data[0x110000]; 135128ba53c0SMasahiro Yamada struct unicode_data *corrections; 135228ba53c0SMasahiro Yamada int corrections_count; 135328ba53c0SMasahiro Yamada 135428ba53c0SMasahiro Yamada struct tree *nfdi_tree; 135528ba53c0SMasahiro Yamada struct tree *nfdicf_tree; 135628ba53c0SMasahiro Yamada 135728ba53c0SMasahiro Yamada struct tree *trees; 135828ba53c0SMasahiro Yamada int trees_count; 135928ba53c0SMasahiro Yamada 136028ba53c0SMasahiro Yamada /* 136128ba53c0SMasahiro Yamada * Check the corrections array to see if this entry was corrected at 136228ba53c0SMasahiro Yamada * some point. 136328ba53c0SMasahiro Yamada */ 136428ba53c0SMasahiro Yamada static struct unicode_data *corrections_lookup(struct unicode_data *u) 136528ba53c0SMasahiro Yamada { 136628ba53c0SMasahiro Yamada int i; 136728ba53c0SMasahiro Yamada 136828ba53c0SMasahiro Yamada for (i = 0; i != corrections_count; i++) 136928ba53c0SMasahiro Yamada if (u->code == corrections[i].code) 137028ba53c0SMasahiro Yamada return &corrections[i]; 137128ba53c0SMasahiro Yamada return u; 137228ba53c0SMasahiro Yamada } 137328ba53c0SMasahiro Yamada 137428ba53c0SMasahiro Yamada static int nfdi_equal(void *l, void *r) 137528ba53c0SMasahiro Yamada { 137628ba53c0SMasahiro Yamada struct unicode_data *left = l; 137728ba53c0SMasahiro Yamada struct unicode_data *right = r; 137828ba53c0SMasahiro Yamada 137928ba53c0SMasahiro Yamada if (left->gen != right->gen) 138028ba53c0SMasahiro Yamada return 0; 138128ba53c0SMasahiro Yamada if (left->ccc != right->ccc) 138228ba53c0SMasahiro Yamada return 0; 138328ba53c0SMasahiro Yamada if (left->utf8nfdi && right->utf8nfdi && 138428ba53c0SMasahiro Yamada strcmp(left->utf8nfdi, right->utf8nfdi) == 0) 138528ba53c0SMasahiro Yamada return 1; 138628ba53c0SMasahiro Yamada if (left->utf8nfdi || right->utf8nfdi) 138728ba53c0SMasahiro Yamada return 0; 138828ba53c0SMasahiro Yamada return 1; 138928ba53c0SMasahiro Yamada } 139028ba53c0SMasahiro Yamada 139128ba53c0SMasahiro Yamada static int nfdicf_equal(void *l, void *r) 139228ba53c0SMasahiro Yamada { 139328ba53c0SMasahiro Yamada struct unicode_data *left = l; 139428ba53c0SMasahiro Yamada struct unicode_data *right = r; 139528ba53c0SMasahiro Yamada 139628ba53c0SMasahiro Yamada if (left->gen != right->gen) 139728ba53c0SMasahiro Yamada return 0; 139828ba53c0SMasahiro Yamada if (left->ccc != right->ccc) 139928ba53c0SMasahiro Yamada return 0; 140028ba53c0SMasahiro Yamada if (left->utf8nfdicf && right->utf8nfdicf && 140128ba53c0SMasahiro Yamada strcmp(left->utf8nfdicf, right->utf8nfdicf) == 0) 140228ba53c0SMasahiro Yamada return 1; 140328ba53c0SMasahiro Yamada if (left->utf8nfdicf && right->utf8nfdicf) 140428ba53c0SMasahiro Yamada return 0; 140528ba53c0SMasahiro Yamada if (left->utf8nfdicf || right->utf8nfdicf) 140628ba53c0SMasahiro Yamada return 0; 140728ba53c0SMasahiro Yamada if (left->utf8nfdi && right->utf8nfdi && 140828ba53c0SMasahiro Yamada strcmp(left->utf8nfdi, right->utf8nfdi) == 0) 140928ba53c0SMasahiro Yamada return 1; 141028ba53c0SMasahiro Yamada if (left->utf8nfdi || right->utf8nfdi) 141128ba53c0SMasahiro Yamada return 0; 141228ba53c0SMasahiro Yamada return 1; 141328ba53c0SMasahiro Yamada } 141428ba53c0SMasahiro Yamada 141528ba53c0SMasahiro Yamada static void nfdi_print(void *l, int indent) 141628ba53c0SMasahiro Yamada { 141728ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 141828ba53c0SMasahiro Yamada 141928ba53c0SMasahiro Yamada printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, 142028ba53c0SMasahiro Yamada leaf->code, leaf->ccc, leaf->gen); 142128ba53c0SMasahiro Yamada 142228ba53c0SMasahiro Yamada if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) 142328ba53c0SMasahiro Yamada printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); 142428ba53c0SMasahiro Yamada else if (leaf->utf8nfdi) 142528ba53c0SMasahiro Yamada printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); 142628ba53c0SMasahiro Yamada 142728ba53c0SMasahiro Yamada printf("\n"); 142828ba53c0SMasahiro Yamada } 142928ba53c0SMasahiro Yamada 143028ba53c0SMasahiro Yamada static void nfdicf_print(void *l, int indent) 143128ba53c0SMasahiro Yamada { 143228ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 143328ba53c0SMasahiro Yamada 143428ba53c0SMasahiro Yamada printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf, 143528ba53c0SMasahiro Yamada leaf->code, leaf->ccc, leaf->gen); 143628ba53c0SMasahiro Yamada 143728ba53c0SMasahiro Yamada if (leaf->utf8nfdicf) 143828ba53c0SMasahiro Yamada printf(" nfdicf \"%s\"", (const char*)leaf->utf8nfdicf); 143928ba53c0SMasahiro Yamada else if (leaf->utf8nfdi && leaf->utf8nfdi[0] == HANGUL) 144028ba53c0SMasahiro Yamada printf(" nfdi \"%s\"", "HANGUL SYLLABLE"); 144128ba53c0SMasahiro Yamada else if (leaf->utf8nfdi) 144228ba53c0SMasahiro Yamada printf(" nfdi \"%s\"", (const char*)leaf->utf8nfdi); 144328ba53c0SMasahiro Yamada printf("\n"); 144428ba53c0SMasahiro Yamada } 144528ba53c0SMasahiro Yamada 144628ba53c0SMasahiro Yamada static int nfdi_mark(void *l) 144728ba53c0SMasahiro Yamada { 144828ba53c0SMasahiro Yamada return 1; 144928ba53c0SMasahiro Yamada } 145028ba53c0SMasahiro Yamada 145128ba53c0SMasahiro Yamada static int nfdicf_mark(void *l) 145228ba53c0SMasahiro Yamada { 145328ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 145428ba53c0SMasahiro Yamada 145528ba53c0SMasahiro Yamada if (leaf->utf8nfdicf) 145628ba53c0SMasahiro Yamada return 1; 145728ba53c0SMasahiro Yamada return 0; 145828ba53c0SMasahiro Yamada } 145928ba53c0SMasahiro Yamada 146028ba53c0SMasahiro Yamada static int correction_mark(void *l) 146128ba53c0SMasahiro Yamada { 146228ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 146328ba53c0SMasahiro Yamada 146428ba53c0SMasahiro Yamada return leaf->correction; 146528ba53c0SMasahiro Yamada } 146628ba53c0SMasahiro Yamada 146728ba53c0SMasahiro Yamada static int nfdi_size(void *l) 146828ba53c0SMasahiro Yamada { 146928ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 147028ba53c0SMasahiro Yamada int size = 2; 147128ba53c0SMasahiro Yamada 147228ba53c0SMasahiro Yamada if (HANGUL_SYLLABLE(leaf->code)) 147328ba53c0SMasahiro Yamada size += 1; 147428ba53c0SMasahiro Yamada else if (leaf->utf8nfdi) 147528ba53c0SMasahiro Yamada size += strlen(leaf->utf8nfdi) + 1; 147628ba53c0SMasahiro Yamada return size; 147728ba53c0SMasahiro Yamada } 147828ba53c0SMasahiro Yamada 147928ba53c0SMasahiro Yamada static int nfdicf_size(void *l) 148028ba53c0SMasahiro Yamada { 148128ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 148228ba53c0SMasahiro Yamada int size = 2; 148328ba53c0SMasahiro Yamada 148428ba53c0SMasahiro Yamada if (HANGUL_SYLLABLE(leaf->code)) 148528ba53c0SMasahiro Yamada size += 1; 148628ba53c0SMasahiro Yamada else if (leaf->utf8nfdicf) 148728ba53c0SMasahiro Yamada size += strlen(leaf->utf8nfdicf) + 1; 148828ba53c0SMasahiro Yamada else if (leaf->utf8nfdi) 148928ba53c0SMasahiro Yamada size += strlen(leaf->utf8nfdi) + 1; 149028ba53c0SMasahiro Yamada return size; 149128ba53c0SMasahiro Yamada } 149228ba53c0SMasahiro Yamada 149328ba53c0SMasahiro Yamada static int *nfdi_index(struct tree *tree, void *l) 149428ba53c0SMasahiro Yamada { 149528ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 149628ba53c0SMasahiro Yamada 149728ba53c0SMasahiro Yamada return &tree->leafindex[leaf->code]; 149828ba53c0SMasahiro Yamada } 149928ba53c0SMasahiro Yamada 150028ba53c0SMasahiro Yamada static int *nfdicf_index(struct tree *tree, void *l) 150128ba53c0SMasahiro Yamada { 150228ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 150328ba53c0SMasahiro Yamada 150428ba53c0SMasahiro Yamada return &tree->leafindex[leaf->code]; 150528ba53c0SMasahiro Yamada } 150628ba53c0SMasahiro Yamada 150728ba53c0SMasahiro Yamada static unsigned char *nfdi_emit(void *l, unsigned char *data) 150828ba53c0SMasahiro Yamada { 150928ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 151028ba53c0SMasahiro Yamada unsigned char *s; 151128ba53c0SMasahiro Yamada 151228ba53c0SMasahiro Yamada *data++ = leaf->gen; 151328ba53c0SMasahiro Yamada 151428ba53c0SMasahiro Yamada if (HANGUL_SYLLABLE(leaf->code)) { 151528ba53c0SMasahiro Yamada *data++ = DECOMPOSE; 151628ba53c0SMasahiro Yamada *data++ = HANGUL; 151728ba53c0SMasahiro Yamada } else if (leaf->utf8nfdi) { 151828ba53c0SMasahiro Yamada *data++ = DECOMPOSE; 151928ba53c0SMasahiro Yamada s = (unsigned char*)leaf->utf8nfdi; 152028ba53c0SMasahiro Yamada while ((*data++ = *s++) != 0) 152128ba53c0SMasahiro Yamada ; 152228ba53c0SMasahiro Yamada } else { 152328ba53c0SMasahiro Yamada *data++ = leaf->ccc; 152428ba53c0SMasahiro Yamada } 152528ba53c0SMasahiro Yamada return data; 152628ba53c0SMasahiro Yamada } 152728ba53c0SMasahiro Yamada 152828ba53c0SMasahiro Yamada static unsigned char *nfdicf_emit(void *l, unsigned char *data) 152928ba53c0SMasahiro Yamada { 153028ba53c0SMasahiro Yamada struct unicode_data *leaf = l; 153128ba53c0SMasahiro Yamada unsigned char *s; 153228ba53c0SMasahiro Yamada 153328ba53c0SMasahiro Yamada *data++ = leaf->gen; 153428ba53c0SMasahiro Yamada 153528ba53c0SMasahiro Yamada if (HANGUL_SYLLABLE(leaf->code)) { 153628ba53c0SMasahiro Yamada *data++ = DECOMPOSE; 153728ba53c0SMasahiro Yamada *data++ = HANGUL; 153828ba53c0SMasahiro Yamada } else if (leaf->utf8nfdicf) { 153928ba53c0SMasahiro Yamada *data++ = DECOMPOSE; 154028ba53c0SMasahiro Yamada s = (unsigned char*)leaf->utf8nfdicf; 154128ba53c0SMasahiro Yamada while ((*data++ = *s++) != 0) 154228ba53c0SMasahiro Yamada ; 154328ba53c0SMasahiro Yamada } else if (leaf->utf8nfdi) { 154428ba53c0SMasahiro Yamada *data++ = DECOMPOSE; 154528ba53c0SMasahiro Yamada s = (unsigned char*)leaf->utf8nfdi; 154628ba53c0SMasahiro Yamada while ((*data++ = *s++) != 0) 154728ba53c0SMasahiro Yamada ; 154828ba53c0SMasahiro Yamada } else { 154928ba53c0SMasahiro Yamada *data++ = leaf->ccc; 155028ba53c0SMasahiro Yamada } 155128ba53c0SMasahiro Yamada return data; 155228ba53c0SMasahiro Yamada } 155328ba53c0SMasahiro Yamada 155428ba53c0SMasahiro Yamada static void utf8_create(struct unicode_data *data) 155528ba53c0SMasahiro Yamada { 155628ba53c0SMasahiro Yamada char utf[18*4+1]; 155728ba53c0SMasahiro Yamada char *u; 155828ba53c0SMasahiro Yamada unsigned int *um; 155928ba53c0SMasahiro Yamada int i; 156028ba53c0SMasahiro Yamada 156128ba53c0SMasahiro Yamada if (data->utf8nfdi) { 156228ba53c0SMasahiro Yamada assert(data->utf8nfdi[0] == HANGUL); 156328ba53c0SMasahiro Yamada return; 156428ba53c0SMasahiro Yamada } 156528ba53c0SMasahiro Yamada 156628ba53c0SMasahiro Yamada u = utf; 156728ba53c0SMasahiro Yamada um = data->utf32nfdi; 156828ba53c0SMasahiro Yamada if (um) { 156928ba53c0SMasahiro Yamada for (i = 0; um[i]; i++) 157028ba53c0SMasahiro Yamada u += utf8encode(u, um[i]); 157128ba53c0SMasahiro Yamada *u = '\0'; 157228ba53c0SMasahiro Yamada data->utf8nfdi = strdup(utf); 157328ba53c0SMasahiro Yamada } 157428ba53c0SMasahiro Yamada u = utf; 157528ba53c0SMasahiro Yamada um = data->utf32nfdicf; 157628ba53c0SMasahiro Yamada if (um) { 157728ba53c0SMasahiro Yamada for (i = 0; um[i]; i++) 157828ba53c0SMasahiro Yamada u += utf8encode(u, um[i]); 157928ba53c0SMasahiro Yamada *u = '\0'; 158028ba53c0SMasahiro Yamada if (!data->utf8nfdi || strcmp(data->utf8nfdi, utf)) 158128ba53c0SMasahiro Yamada data->utf8nfdicf = strdup(utf); 158228ba53c0SMasahiro Yamada } 158328ba53c0SMasahiro Yamada } 158428ba53c0SMasahiro Yamada 158528ba53c0SMasahiro Yamada static void utf8_init(void) 158628ba53c0SMasahiro Yamada { 158728ba53c0SMasahiro Yamada unsigned int unichar; 158828ba53c0SMasahiro Yamada int i; 158928ba53c0SMasahiro Yamada 159028ba53c0SMasahiro Yamada for (unichar = 0; unichar != 0x110000; unichar++) 159128ba53c0SMasahiro Yamada utf8_create(&unicode_data[unichar]); 159228ba53c0SMasahiro Yamada 159328ba53c0SMasahiro Yamada for (i = 0; i != corrections_count; i++) 159428ba53c0SMasahiro Yamada utf8_create(&corrections[i]); 159528ba53c0SMasahiro Yamada } 159628ba53c0SMasahiro Yamada 159728ba53c0SMasahiro Yamada static void trees_init(void) 159828ba53c0SMasahiro Yamada { 159928ba53c0SMasahiro Yamada struct unicode_data *data; 160028ba53c0SMasahiro Yamada unsigned int maxage; 160128ba53c0SMasahiro Yamada unsigned int nextage; 160228ba53c0SMasahiro Yamada int count; 160328ba53c0SMasahiro Yamada int i; 160428ba53c0SMasahiro Yamada int j; 160528ba53c0SMasahiro Yamada 160628ba53c0SMasahiro Yamada /* Count the number of different ages. */ 160728ba53c0SMasahiro Yamada count = 0; 160828ba53c0SMasahiro Yamada nextage = (unsigned int)-1; 160928ba53c0SMasahiro Yamada do { 161028ba53c0SMasahiro Yamada maxage = nextage; 161128ba53c0SMasahiro Yamada nextage = 0; 161228ba53c0SMasahiro Yamada for (i = 0; i <= corrections_count; i++) { 161328ba53c0SMasahiro Yamada data = &corrections[i]; 161428ba53c0SMasahiro Yamada if (nextage < data->correction && 161528ba53c0SMasahiro Yamada data->correction < maxage) 161628ba53c0SMasahiro Yamada nextage = data->correction; 161728ba53c0SMasahiro Yamada } 161828ba53c0SMasahiro Yamada count++; 161928ba53c0SMasahiro Yamada } while (nextage); 162028ba53c0SMasahiro Yamada 162128ba53c0SMasahiro Yamada /* Two trees per age: nfdi and nfdicf */ 162228ba53c0SMasahiro Yamada trees_count = count * 2; 162328ba53c0SMasahiro Yamada trees = calloc(trees_count, sizeof(struct tree)); 162428ba53c0SMasahiro Yamada 162528ba53c0SMasahiro Yamada /* Assign ages to the trees. */ 162628ba53c0SMasahiro Yamada count = trees_count; 162728ba53c0SMasahiro Yamada nextage = (unsigned int)-1; 162828ba53c0SMasahiro Yamada do { 162928ba53c0SMasahiro Yamada maxage = nextage; 163028ba53c0SMasahiro Yamada trees[--count].maxage = maxage; 163128ba53c0SMasahiro Yamada trees[--count].maxage = maxage; 163228ba53c0SMasahiro Yamada nextage = 0; 163328ba53c0SMasahiro Yamada for (i = 0; i <= corrections_count; i++) { 163428ba53c0SMasahiro Yamada data = &corrections[i]; 163528ba53c0SMasahiro Yamada if (nextage < data->correction && 163628ba53c0SMasahiro Yamada data->correction < maxage) 163728ba53c0SMasahiro Yamada nextage = data->correction; 163828ba53c0SMasahiro Yamada } 163928ba53c0SMasahiro Yamada } while (nextage); 164028ba53c0SMasahiro Yamada 164128ba53c0SMasahiro Yamada /* The ages assigned above are off by one. */ 164228ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) { 164328ba53c0SMasahiro Yamada j = 0; 164428ba53c0SMasahiro Yamada while (ages[j] < trees[i].maxage) 164528ba53c0SMasahiro Yamada j++; 164628ba53c0SMasahiro Yamada trees[i].maxage = ages[j-1]; 164728ba53c0SMasahiro Yamada } 164828ba53c0SMasahiro Yamada 164928ba53c0SMasahiro Yamada /* Set up the forwarding between trees. */ 165028ba53c0SMasahiro Yamada trees[trees_count-2].next = &trees[trees_count-1]; 165128ba53c0SMasahiro Yamada trees[trees_count-1].leaf_mark = nfdi_mark; 165228ba53c0SMasahiro Yamada trees[trees_count-2].leaf_mark = nfdicf_mark; 165328ba53c0SMasahiro Yamada for (i = 0; i != trees_count-2; i += 2) { 165428ba53c0SMasahiro Yamada trees[i].next = &trees[trees_count-2]; 165528ba53c0SMasahiro Yamada trees[i].leaf_mark = correction_mark; 165628ba53c0SMasahiro Yamada trees[i+1].next = &trees[trees_count-1]; 165728ba53c0SMasahiro Yamada trees[i+1].leaf_mark = correction_mark; 165828ba53c0SMasahiro Yamada } 165928ba53c0SMasahiro Yamada 166028ba53c0SMasahiro Yamada /* Assign the callouts. */ 166128ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i += 2) { 166228ba53c0SMasahiro Yamada trees[i].type = "nfdicf"; 166328ba53c0SMasahiro Yamada trees[i].leaf_equal = nfdicf_equal; 166428ba53c0SMasahiro Yamada trees[i].leaf_print = nfdicf_print; 166528ba53c0SMasahiro Yamada trees[i].leaf_size = nfdicf_size; 166628ba53c0SMasahiro Yamada trees[i].leaf_index = nfdicf_index; 166728ba53c0SMasahiro Yamada trees[i].leaf_emit = nfdicf_emit; 166828ba53c0SMasahiro Yamada 166928ba53c0SMasahiro Yamada trees[i+1].type = "nfdi"; 167028ba53c0SMasahiro Yamada trees[i+1].leaf_equal = nfdi_equal; 167128ba53c0SMasahiro Yamada trees[i+1].leaf_print = nfdi_print; 167228ba53c0SMasahiro Yamada trees[i+1].leaf_size = nfdi_size; 167328ba53c0SMasahiro Yamada trees[i+1].leaf_index = nfdi_index; 167428ba53c0SMasahiro Yamada trees[i+1].leaf_emit = nfdi_emit; 167528ba53c0SMasahiro Yamada } 167628ba53c0SMasahiro Yamada 167728ba53c0SMasahiro Yamada /* Finish init. */ 167828ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) 167928ba53c0SMasahiro Yamada trees[i].childnode = NODE; 168028ba53c0SMasahiro Yamada } 168128ba53c0SMasahiro Yamada 168228ba53c0SMasahiro Yamada static void trees_populate(void) 168328ba53c0SMasahiro Yamada { 168428ba53c0SMasahiro Yamada struct unicode_data *data; 168528ba53c0SMasahiro Yamada unsigned int unichar; 168628ba53c0SMasahiro Yamada char keyval[4]; 168728ba53c0SMasahiro Yamada int keylen; 168828ba53c0SMasahiro Yamada int i; 168928ba53c0SMasahiro Yamada 169028ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) { 169128ba53c0SMasahiro Yamada if (verbose > 0) { 169228ba53c0SMasahiro Yamada printf("Populating %s_%x\n", 169328ba53c0SMasahiro Yamada trees[i].type, trees[i].maxage); 169428ba53c0SMasahiro Yamada } 169528ba53c0SMasahiro Yamada for (unichar = 0; unichar != 0x110000; unichar++) { 169628ba53c0SMasahiro Yamada if (unicode_data[unichar].gen < 0) 169728ba53c0SMasahiro Yamada continue; 169828ba53c0SMasahiro Yamada keylen = utf8encode(keyval, unichar); 169928ba53c0SMasahiro Yamada data = corrections_lookup(&unicode_data[unichar]); 170028ba53c0SMasahiro Yamada if (data->correction <= trees[i].maxage) 170128ba53c0SMasahiro Yamada data = &unicode_data[unichar]; 170228ba53c0SMasahiro Yamada insert(&trees[i], keyval, keylen, data); 170328ba53c0SMasahiro Yamada } 170428ba53c0SMasahiro Yamada } 170528ba53c0SMasahiro Yamada } 170628ba53c0SMasahiro Yamada 170728ba53c0SMasahiro Yamada static void trees_reduce(void) 170828ba53c0SMasahiro Yamada { 170928ba53c0SMasahiro Yamada int i; 171028ba53c0SMasahiro Yamada int size; 171128ba53c0SMasahiro Yamada int changed; 171228ba53c0SMasahiro Yamada 171328ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) 171428ba53c0SMasahiro Yamada prune(&trees[i]); 171528ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) 171628ba53c0SMasahiro Yamada mark_nodes(&trees[i]); 171728ba53c0SMasahiro Yamada do { 171828ba53c0SMasahiro Yamada size = 0; 171928ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) 172028ba53c0SMasahiro Yamada size = index_nodes(&trees[i], size); 172128ba53c0SMasahiro Yamada changed = 0; 172228ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) 172328ba53c0SMasahiro Yamada changed += size_nodes(&trees[i]); 172428ba53c0SMasahiro Yamada } while (changed); 172528ba53c0SMasahiro Yamada 172628ba53c0SMasahiro Yamada utf8data = calloc(size, 1); 172728ba53c0SMasahiro Yamada utf8data_size = size; 172828ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) 172928ba53c0SMasahiro Yamada emit(&trees[i], utf8data); 173028ba53c0SMasahiro Yamada 173128ba53c0SMasahiro Yamada if (verbose > 0) { 173228ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) { 173328ba53c0SMasahiro Yamada printf("%s_%x idx %d\n", 173428ba53c0SMasahiro Yamada trees[i].type, trees[i].maxage, trees[i].index); 173528ba53c0SMasahiro Yamada } 173628ba53c0SMasahiro Yamada } 173728ba53c0SMasahiro Yamada 173828ba53c0SMasahiro Yamada nfdi = utf8data + trees[trees_count-1].index; 173928ba53c0SMasahiro Yamada nfdicf = utf8data + trees[trees_count-2].index; 174028ba53c0SMasahiro Yamada 174128ba53c0SMasahiro Yamada nfdi_tree = &trees[trees_count-1]; 174228ba53c0SMasahiro Yamada nfdicf_tree = &trees[trees_count-2]; 174328ba53c0SMasahiro Yamada } 174428ba53c0SMasahiro Yamada 174528ba53c0SMasahiro Yamada static void verify(struct tree *tree) 174628ba53c0SMasahiro Yamada { 174728ba53c0SMasahiro Yamada struct unicode_data *data; 174828ba53c0SMasahiro Yamada utf8leaf_t *leaf; 174928ba53c0SMasahiro Yamada unsigned int unichar; 175028ba53c0SMasahiro Yamada char key[4]; 175128ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 175228ba53c0SMasahiro Yamada int report; 175328ba53c0SMasahiro Yamada int nocf; 175428ba53c0SMasahiro Yamada 175528ba53c0SMasahiro Yamada if (verbose > 0) 175628ba53c0SMasahiro Yamada printf("Verifying %s_%x\n", tree->type, tree->maxage); 175728ba53c0SMasahiro Yamada nocf = strcmp(tree->type, "nfdicf"); 175828ba53c0SMasahiro Yamada 175928ba53c0SMasahiro Yamada for (unichar = 0; unichar != 0x110000; unichar++) { 176028ba53c0SMasahiro Yamada report = 0; 176128ba53c0SMasahiro Yamada data = corrections_lookup(&unicode_data[unichar]); 176228ba53c0SMasahiro Yamada if (data->correction <= tree->maxage) 176328ba53c0SMasahiro Yamada data = &unicode_data[unichar]; 176428ba53c0SMasahiro Yamada utf8encode(key,unichar); 176528ba53c0SMasahiro Yamada leaf = utf8lookup(tree, hangul, key); 176628ba53c0SMasahiro Yamada 176728ba53c0SMasahiro Yamada if (!leaf) { 176828ba53c0SMasahiro Yamada if (data->gen != -1) 176928ba53c0SMasahiro Yamada report++; 177028ba53c0SMasahiro Yamada if (unichar < 0xd800 || unichar > 0xdfff) 177128ba53c0SMasahiro Yamada report++; 177228ba53c0SMasahiro Yamada } else { 177328ba53c0SMasahiro Yamada if (unichar >= 0xd800 && unichar <= 0xdfff) 177428ba53c0SMasahiro Yamada report++; 177528ba53c0SMasahiro Yamada if (data->gen == -1) 177628ba53c0SMasahiro Yamada report++; 177728ba53c0SMasahiro Yamada if (data->gen != LEAF_GEN(leaf)) 177828ba53c0SMasahiro Yamada report++; 177928ba53c0SMasahiro Yamada if (LEAF_CCC(leaf) == DECOMPOSE) { 178028ba53c0SMasahiro Yamada if (HANGUL_SYLLABLE(data->code)) { 178128ba53c0SMasahiro Yamada if (data->utf8nfdi[0] != HANGUL) 178228ba53c0SMasahiro Yamada report++; 178328ba53c0SMasahiro Yamada } else if (nocf) { 178428ba53c0SMasahiro Yamada if (!data->utf8nfdi) { 178528ba53c0SMasahiro Yamada report++; 178628ba53c0SMasahiro Yamada } else if (strcmp(data->utf8nfdi, 178728ba53c0SMasahiro Yamada LEAF_STR(leaf))) { 178828ba53c0SMasahiro Yamada report++; 178928ba53c0SMasahiro Yamada } 179028ba53c0SMasahiro Yamada } else { 179128ba53c0SMasahiro Yamada if (!data->utf8nfdicf && 179228ba53c0SMasahiro Yamada !data->utf8nfdi) { 179328ba53c0SMasahiro Yamada report++; 179428ba53c0SMasahiro Yamada } else if (data->utf8nfdicf) { 179528ba53c0SMasahiro Yamada if (strcmp(data->utf8nfdicf, 179628ba53c0SMasahiro Yamada LEAF_STR(leaf))) 179728ba53c0SMasahiro Yamada report++; 179828ba53c0SMasahiro Yamada } else if (strcmp(data->utf8nfdi, 179928ba53c0SMasahiro Yamada LEAF_STR(leaf))) { 180028ba53c0SMasahiro Yamada report++; 180128ba53c0SMasahiro Yamada } 180228ba53c0SMasahiro Yamada } 180328ba53c0SMasahiro Yamada } else if (data->ccc != LEAF_CCC(leaf)) { 180428ba53c0SMasahiro Yamada report++; 180528ba53c0SMasahiro Yamada } 180628ba53c0SMasahiro Yamada } 180728ba53c0SMasahiro Yamada if (report) { 180828ba53c0SMasahiro Yamada printf("%X code %X gen %d ccc %d" 180928ba53c0SMasahiro Yamada " nfdi -> \"%s\"", 181028ba53c0SMasahiro Yamada unichar, data->code, data->gen, 181128ba53c0SMasahiro Yamada data->ccc, 181228ba53c0SMasahiro Yamada data->utf8nfdi); 181328ba53c0SMasahiro Yamada if (leaf) { 181428ba53c0SMasahiro Yamada printf(" gen %d ccc %d" 181528ba53c0SMasahiro Yamada " nfdi -> \"%s\"", 181628ba53c0SMasahiro Yamada LEAF_GEN(leaf), 181728ba53c0SMasahiro Yamada LEAF_CCC(leaf), 181828ba53c0SMasahiro Yamada LEAF_CCC(leaf) == DECOMPOSE ? 181928ba53c0SMasahiro Yamada LEAF_STR(leaf) : ""); 182028ba53c0SMasahiro Yamada } 182128ba53c0SMasahiro Yamada printf("\n"); 182228ba53c0SMasahiro Yamada } 182328ba53c0SMasahiro Yamada } 182428ba53c0SMasahiro Yamada } 182528ba53c0SMasahiro Yamada 182628ba53c0SMasahiro Yamada static void trees_verify(void) 182728ba53c0SMasahiro Yamada { 182828ba53c0SMasahiro Yamada int i; 182928ba53c0SMasahiro Yamada 183028ba53c0SMasahiro Yamada for (i = 0; i != trees_count; i++) 183128ba53c0SMasahiro Yamada verify(&trees[i]); 183228ba53c0SMasahiro Yamada } 183328ba53c0SMasahiro Yamada 183428ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 183528ba53c0SMasahiro Yamada 183628ba53c0SMasahiro Yamada static void help(void) 183728ba53c0SMasahiro Yamada { 183828ba53c0SMasahiro Yamada printf("Usage: %s [options]\n", argv0); 183928ba53c0SMasahiro Yamada printf("\n"); 184028ba53c0SMasahiro Yamada printf("This program creates an a data trie used for parsing and\n"); 184128ba53c0SMasahiro Yamada printf("normalization of UTF-8 strings. The trie is derived from\n"); 184228ba53c0SMasahiro Yamada printf("a set of input files from the Unicode character database\n"); 184328ba53c0SMasahiro Yamada printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n"); 184428ba53c0SMasahiro Yamada printf("\n"); 184528ba53c0SMasahiro Yamada printf("The generated tree supports two normalization forms:\n"); 184628ba53c0SMasahiro Yamada printf("\n"); 184728ba53c0SMasahiro Yamada printf("\tnfdi:\n"); 184828ba53c0SMasahiro Yamada printf("\t- Apply unicode normalization form NFD.\n"); 184928ba53c0SMasahiro Yamada printf("\t- Remove any Default_Ignorable_Code_Point.\n"); 185028ba53c0SMasahiro Yamada printf("\n"); 185128ba53c0SMasahiro Yamada printf("\tnfdicf:\n"); 185228ba53c0SMasahiro Yamada printf("\t- Apply unicode normalization form NFD.\n"); 185328ba53c0SMasahiro Yamada printf("\t- Remove any Default_Ignorable_Code_Point.\n"); 185428ba53c0SMasahiro Yamada printf("\t- Apply a full casefold (C + F).\n"); 185528ba53c0SMasahiro Yamada printf("\n"); 185628ba53c0SMasahiro Yamada printf("These forms were chosen as being most useful when dealing\n"); 185728ba53c0SMasahiro Yamada printf("with file names: NFD catches most cases where characters\n"); 185828ba53c0SMasahiro Yamada printf("should be considered equivalent. The ignorables are mostly\n"); 185928ba53c0SMasahiro Yamada printf("invisible, making names hard to type.\n"); 186028ba53c0SMasahiro Yamada printf("\n"); 186128ba53c0SMasahiro Yamada printf("The options to specify the files to be used are listed\n"); 186228ba53c0SMasahiro Yamada printf("below with their default values, which are the names used\n"); 186328ba53c0SMasahiro Yamada printf("by version 11.0.0 of the Unicode Character Database.\n"); 186428ba53c0SMasahiro Yamada printf("\n"); 186528ba53c0SMasahiro Yamada printf("The input files:\n"); 186628ba53c0SMasahiro Yamada printf("\t-a %s\n", AGE_NAME); 186728ba53c0SMasahiro Yamada printf("\t-c %s\n", CCC_NAME); 186828ba53c0SMasahiro Yamada printf("\t-p %s\n", PROP_NAME); 186928ba53c0SMasahiro Yamada printf("\t-d %s\n", DATA_NAME); 187028ba53c0SMasahiro Yamada printf("\t-f %s\n", FOLD_NAME); 187128ba53c0SMasahiro Yamada printf("\t-n %s\n", NORM_NAME); 187228ba53c0SMasahiro Yamada printf("\n"); 187328ba53c0SMasahiro Yamada printf("Additionally, the generated tables are tested using:\n"); 187428ba53c0SMasahiro Yamada printf("\t-t %s\n", TEST_NAME); 187528ba53c0SMasahiro Yamada printf("\n"); 187628ba53c0SMasahiro Yamada printf("Finally, the output file:\n"); 187728ba53c0SMasahiro Yamada printf("\t-o %s\n", UTF8_NAME); 187828ba53c0SMasahiro Yamada printf("\n"); 187928ba53c0SMasahiro Yamada } 188028ba53c0SMasahiro Yamada 188128ba53c0SMasahiro Yamada static void usage(void) 188228ba53c0SMasahiro Yamada { 188328ba53c0SMasahiro Yamada help(); 188428ba53c0SMasahiro Yamada exit(1); 188528ba53c0SMasahiro Yamada } 188628ba53c0SMasahiro Yamada 188728ba53c0SMasahiro Yamada static void open_fail(const char *name, int error) 188828ba53c0SMasahiro Yamada { 188928ba53c0SMasahiro Yamada printf("Error %d opening %s: %s\n", error, name, strerror(error)); 189028ba53c0SMasahiro Yamada exit(1); 189128ba53c0SMasahiro Yamada } 189228ba53c0SMasahiro Yamada 189328ba53c0SMasahiro Yamada static void file_fail(const char *filename) 189428ba53c0SMasahiro Yamada { 189528ba53c0SMasahiro Yamada printf("Error parsing %s\n", filename); 189628ba53c0SMasahiro Yamada exit(1); 189728ba53c0SMasahiro Yamada } 189828ba53c0SMasahiro Yamada 189928ba53c0SMasahiro Yamada static void line_fail(const char *filename, const char *line) 190028ba53c0SMasahiro Yamada { 190128ba53c0SMasahiro Yamada printf("Error parsing %s:%s\n", filename, line); 190228ba53c0SMasahiro Yamada exit(1); 190328ba53c0SMasahiro Yamada } 190428ba53c0SMasahiro Yamada 190528ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 190628ba53c0SMasahiro Yamada 190728ba53c0SMasahiro Yamada static void print_utf32(unsigned int *utf32str) 190828ba53c0SMasahiro Yamada { 190928ba53c0SMasahiro Yamada int i; 191028ba53c0SMasahiro Yamada 191128ba53c0SMasahiro Yamada for (i = 0; utf32str[i]; i++) 191228ba53c0SMasahiro Yamada printf(" %X", utf32str[i]); 191328ba53c0SMasahiro Yamada } 191428ba53c0SMasahiro Yamada 191528ba53c0SMasahiro Yamada static void print_utf32nfdi(unsigned int unichar) 191628ba53c0SMasahiro Yamada { 191728ba53c0SMasahiro Yamada printf(" %X ->", unichar); 191828ba53c0SMasahiro Yamada print_utf32(unicode_data[unichar].utf32nfdi); 191928ba53c0SMasahiro Yamada printf("\n"); 192028ba53c0SMasahiro Yamada } 192128ba53c0SMasahiro Yamada 192228ba53c0SMasahiro Yamada static void print_utf32nfdicf(unsigned int unichar) 192328ba53c0SMasahiro Yamada { 192428ba53c0SMasahiro Yamada printf(" %X ->", unichar); 192528ba53c0SMasahiro Yamada print_utf32(unicode_data[unichar].utf32nfdicf); 192628ba53c0SMasahiro Yamada printf("\n"); 192728ba53c0SMasahiro Yamada } 192828ba53c0SMasahiro Yamada 192928ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 193028ba53c0SMasahiro Yamada 193128ba53c0SMasahiro Yamada static void age_init(void) 193228ba53c0SMasahiro Yamada { 193328ba53c0SMasahiro Yamada FILE *file; 193428ba53c0SMasahiro Yamada unsigned int first; 193528ba53c0SMasahiro Yamada unsigned int last; 193628ba53c0SMasahiro Yamada unsigned int unichar; 193728ba53c0SMasahiro Yamada unsigned int major; 193828ba53c0SMasahiro Yamada unsigned int minor; 193928ba53c0SMasahiro Yamada unsigned int revision; 194028ba53c0SMasahiro Yamada int gen; 194128ba53c0SMasahiro Yamada int count; 194228ba53c0SMasahiro Yamada int ret; 194328ba53c0SMasahiro Yamada 194428ba53c0SMasahiro Yamada if (verbose > 0) 194528ba53c0SMasahiro Yamada printf("Parsing %s\n", age_name); 194628ba53c0SMasahiro Yamada 194728ba53c0SMasahiro Yamada file = fopen(age_name, "r"); 194828ba53c0SMasahiro Yamada if (!file) 194928ba53c0SMasahiro Yamada open_fail(age_name, errno); 195028ba53c0SMasahiro Yamada count = 0; 195128ba53c0SMasahiro Yamada 195228ba53c0SMasahiro Yamada gen = 0; 195328ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 195428ba53c0SMasahiro Yamada ret = sscanf(line, "# Age=V%d_%d_%d", 195528ba53c0SMasahiro Yamada &major, &minor, &revision); 195628ba53c0SMasahiro Yamada if (ret == 3) { 195728ba53c0SMasahiro Yamada ages_count++; 195828ba53c0SMasahiro Yamada if (verbose > 1) 195928ba53c0SMasahiro Yamada printf(" Age V%d_%d_%d\n", 196028ba53c0SMasahiro Yamada major, minor, revision); 196128ba53c0SMasahiro Yamada if (!age_valid(major, minor, revision)) 196228ba53c0SMasahiro Yamada line_fail(age_name, line); 196328ba53c0SMasahiro Yamada continue; 196428ba53c0SMasahiro Yamada } 196528ba53c0SMasahiro Yamada ret = sscanf(line, "# Age=V%d_%d", &major, &minor); 196628ba53c0SMasahiro Yamada if (ret == 2) { 196728ba53c0SMasahiro Yamada ages_count++; 196828ba53c0SMasahiro Yamada if (verbose > 1) 196928ba53c0SMasahiro Yamada printf(" Age V%d_%d\n", major, minor); 197028ba53c0SMasahiro Yamada if (!age_valid(major, minor, 0)) 197128ba53c0SMasahiro Yamada line_fail(age_name, line); 197228ba53c0SMasahiro Yamada continue; 197328ba53c0SMasahiro Yamada } 197428ba53c0SMasahiro Yamada } 197528ba53c0SMasahiro Yamada 197628ba53c0SMasahiro Yamada /* We must have found something above. */ 197728ba53c0SMasahiro Yamada if (verbose > 1) 197828ba53c0SMasahiro Yamada printf("%d age entries\n", ages_count); 197928ba53c0SMasahiro Yamada if (ages_count == 0 || ages_count > MAXGEN) 198028ba53c0SMasahiro Yamada file_fail(age_name); 198128ba53c0SMasahiro Yamada 198228ba53c0SMasahiro Yamada /* There is a 0 entry. */ 198328ba53c0SMasahiro Yamada ages_count++; 198428ba53c0SMasahiro Yamada ages = calloc(ages_count + 1, sizeof(*ages)); 198528ba53c0SMasahiro Yamada /* And a guard entry. */ 198628ba53c0SMasahiro Yamada ages[ages_count] = (unsigned int)-1; 198728ba53c0SMasahiro Yamada 198828ba53c0SMasahiro Yamada rewind(file); 198928ba53c0SMasahiro Yamada count = 0; 199028ba53c0SMasahiro Yamada gen = 0; 199128ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 199228ba53c0SMasahiro Yamada ret = sscanf(line, "# Age=V%d_%d_%d", 199328ba53c0SMasahiro Yamada &major, &minor, &revision); 199428ba53c0SMasahiro Yamada if (ret == 3) { 199528ba53c0SMasahiro Yamada ages[++gen] = 199628ba53c0SMasahiro Yamada UNICODE_AGE(major, minor, revision); 199728ba53c0SMasahiro Yamada if (verbose > 1) 199828ba53c0SMasahiro Yamada printf(" Age V%d_%d_%d = gen %d\n", 199928ba53c0SMasahiro Yamada major, minor, revision, gen); 200028ba53c0SMasahiro Yamada if (!age_valid(major, minor, revision)) 200128ba53c0SMasahiro Yamada line_fail(age_name, line); 200228ba53c0SMasahiro Yamada continue; 200328ba53c0SMasahiro Yamada } 200428ba53c0SMasahiro Yamada ret = sscanf(line, "# Age=V%d_%d", &major, &minor); 200528ba53c0SMasahiro Yamada if (ret == 2) { 200628ba53c0SMasahiro Yamada ages[++gen] = UNICODE_AGE(major, minor, 0); 200728ba53c0SMasahiro Yamada if (verbose > 1) 200828ba53c0SMasahiro Yamada printf(" Age V%d_%d = %d\n", 200928ba53c0SMasahiro Yamada major, minor, gen); 201028ba53c0SMasahiro Yamada if (!age_valid(major, minor, 0)) 201128ba53c0SMasahiro Yamada line_fail(age_name, line); 201228ba53c0SMasahiro Yamada continue; 201328ba53c0SMasahiro Yamada } 201428ba53c0SMasahiro Yamada ret = sscanf(line, "%X..%X ; %d.%d #", 201528ba53c0SMasahiro Yamada &first, &last, &major, &minor); 201628ba53c0SMasahiro Yamada if (ret == 4) { 201728ba53c0SMasahiro Yamada for (unichar = first; unichar <= last; unichar++) 201828ba53c0SMasahiro Yamada unicode_data[unichar].gen = gen; 201928ba53c0SMasahiro Yamada count += 1 + last - first; 202028ba53c0SMasahiro Yamada if (verbose > 1) 202128ba53c0SMasahiro Yamada printf(" %X..%X gen %d\n", first, last, gen); 202228ba53c0SMasahiro Yamada if (!utf32valid(first) || !utf32valid(last)) 202328ba53c0SMasahiro Yamada line_fail(age_name, line); 202428ba53c0SMasahiro Yamada continue; 202528ba53c0SMasahiro Yamada } 202628ba53c0SMasahiro Yamada ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor); 202728ba53c0SMasahiro Yamada if (ret == 3) { 202828ba53c0SMasahiro Yamada unicode_data[unichar].gen = gen; 202928ba53c0SMasahiro Yamada count++; 203028ba53c0SMasahiro Yamada if (verbose > 1) 203128ba53c0SMasahiro Yamada printf(" %X gen %d\n", unichar, gen); 203228ba53c0SMasahiro Yamada if (!utf32valid(unichar)) 203328ba53c0SMasahiro Yamada line_fail(age_name, line); 203428ba53c0SMasahiro Yamada continue; 203528ba53c0SMasahiro Yamada } 203628ba53c0SMasahiro Yamada } 203728ba53c0SMasahiro Yamada unicode_maxage = ages[gen]; 203828ba53c0SMasahiro Yamada fclose(file); 203928ba53c0SMasahiro Yamada 204028ba53c0SMasahiro Yamada /* Nix surrogate block */ 204128ba53c0SMasahiro Yamada if (verbose > 1) 204228ba53c0SMasahiro Yamada printf(" Removing surrogate block D800..DFFF\n"); 204328ba53c0SMasahiro Yamada for (unichar = 0xd800; unichar <= 0xdfff; unichar++) 204428ba53c0SMasahiro Yamada unicode_data[unichar].gen = -1; 204528ba53c0SMasahiro Yamada 204628ba53c0SMasahiro Yamada if (verbose > 0) 204728ba53c0SMasahiro Yamada printf("Found %d entries\n", count); 204828ba53c0SMasahiro Yamada if (count == 0) 204928ba53c0SMasahiro Yamada file_fail(age_name); 205028ba53c0SMasahiro Yamada } 205128ba53c0SMasahiro Yamada 205228ba53c0SMasahiro Yamada static void ccc_init(void) 205328ba53c0SMasahiro Yamada { 205428ba53c0SMasahiro Yamada FILE *file; 205528ba53c0SMasahiro Yamada unsigned int first; 205628ba53c0SMasahiro Yamada unsigned int last; 205728ba53c0SMasahiro Yamada unsigned int unichar; 205828ba53c0SMasahiro Yamada unsigned int value; 205928ba53c0SMasahiro Yamada int count; 206028ba53c0SMasahiro Yamada int ret; 206128ba53c0SMasahiro Yamada 206228ba53c0SMasahiro Yamada if (verbose > 0) 206328ba53c0SMasahiro Yamada printf("Parsing %s\n", ccc_name); 206428ba53c0SMasahiro Yamada 206528ba53c0SMasahiro Yamada file = fopen(ccc_name, "r"); 206628ba53c0SMasahiro Yamada if (!file) 206728ba53c0SMasahiro Yamada open_fail(ccc_name, errno); 206828ba53c0SMasahiro Yamada 206928ba53c0SMasahiro Yamada count = 0; 207028ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 207128ba53c0SMasahiro Yamada ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value); 207228ba53c0SMasahiro Yamada if (ret == 3) { 207328ba53c0SMasahiro Yamada for (unichar = first; unichar <= last; unichar++) { 207428ba53c0SMasahiro Yamada unicode_data[unichar].ccc = value; 207528ba53c0SMasahiro Yamada count++; 207628ba53c0SMasahiro Yamada } 207728ba53c0SMasahiro Yamada if (verbose > 1) 207828ba53c0SMasahiro Yamada printf(" %X..%X ccc %d\n", first, last, value); 207928ba53c0SMasahiro Yamada if (!utf32valid(first) || !utf32valid(last)) 208028ba53c0SMasahiro Yamada line_fail(ccc_name, line); 208128ba53c0SMasahiro Yamada continue; 208228ba53c0SMasahiro Yamada } 208328ba53c0SMasahiro Yamada ret = sscanf(line, "%X ; %d #", &unichar, &value); 208428ba53c0SMasahiro Yamada if (ret == 2) { 208528ba53c0SMasahiro Yamada unicode_data[unichar].ccc = value; 208628ba53c0SMasahiro Yamada count++; 208728ba53c0SMasahiro Yamada if (verbose > 1) 208828ba53c0SMasahiro Yamada printf(" %X ccc %d\n", unichar, value); 208928ba53c0SMasahiro Yamada if (!utf32valid(unichar)) 209028ba53c0SMasahiro Yamada line_fail(ccc_name, line); 209128ba53c0SMasahiro Yamada continue; 209228ba53c0SMasahiro Yamada } 209328ba53c0SMasahiro Yamada } 209428ba53c0SMasahiro Yamada fclose(file); 209528ba53c0SMasahiro Yamada 209628ba53c0SMasahiro Yamada if (verbose > 0) 209728ba53c0SMasahiro Yamada printf("Found %d entries\n", count); 209828ba53c0SMasahiro Yamada if (count == 0) 209928ba53c0SMasahiro Yamada file_fail(ccc_name); 210028ba53c0SMasahiro Yamada } 210128ba53c0SMasahiro Yamada 210228ba53c0SMasahiro Yamada static int ignore_compatibility_form(char *type) 210328ba53c0SMasahiro Yamada { 210428ba53c0SMasahiro Yamada int i; 210528ba53c0SMasahiro Yamada char *ignored_types[] = {"font", "noBreak", "initial", "medial", 210628ba53c0SMasahiro Yamada "final", "isolated", "circle", "super", 210728ba53c0SMasahiro Yamada "sub", "vertical", "wide", "narrow", 210828ba53c0SMasahiro Yamada "small", "square", "fraction", "compat"}; 210928ba53c0SMasahiro Yamada 211028ba53c0SMasahiro Yamada for (i = 0 ; i < ARRAY_SIZE(ignored_types); i++) 211128ba53c0SMasahiro Yamada if (strcmp(type, ignored_types[i]) == 0) 211228ba53c0SMasahiro Yamada return 1; 211328ba53c0SMasahiro Yamada return 0; 211428ba53c0SMasahiro Yamada } 211528ba53c0SMasahiro Yamada 211628ba53c0SMasahiro Yamada static void nfdi_init(void) 211728ba53c0SMasahiro Yamada { 211828ba53c0SMasahiro Yamada FILE *file; 211928ba53c0SMasahiro Yamada unsigned int unichar; 212028ba53c0SMasahiro Yamada unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 212128ba53c0SMasahiro Yamada char *s; 212228ba53c0SMasahiro Yamada char *type; 212328ba53c0SMasahiro Yamada unsigned int *um; 212428ba53c0SMasahiro Yamada int count; 212528ba53c0SMasahiro Yamada int i; 212628ba53c0SMasahiro Yamada int ret; 212728ba53c0SMasahiro Yamada 212828ba53c0SMasahiro Yamada if (verbose > 0) 212928ba53c0SMasahiro Yamada printf("Parsing %s\n", data_name); 213028ba53c0SMasahiro Yamada file = fopen(data_name, "r"); 213128ba53c0SMasahiro Yamada if (!file) 213228ba53c0SMasahiro Yamada open_fail(data_name, errno); 213328ba53c0SMasahiro Yamada 213428ba53c0SMasahiro Yamada count = 0; 213528ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 213628ba53c0SMasahiro Yamada ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];", 213728ba53c0SMasahiro Yamada &unichar, buf0); 213828ba53c0SMasahiro Yamada if (ret != 2) 213928ba53c0SMasahiro Yamada continue; 214028ba53c0SMasahiro Yamada if (!utf32valid(unichar)) 214128ba53c0SMasahiro Yamada line_fail(data_name, line); 214228ba53c0SMasahiro Yamada 214328ba53c0SMasahiro Yamada s = buf0; 214428ba53c0SMasahiro Yamada /* skip over <tag> */ 214528ba53c0SMasahiro Yamada if (*s == '<') { 214628ba53c0SMasahiro Yamada type = ++s; 214728ba53c0SMasahiro Yamada while (*++s != '>'); 214828ba53c0SMasahiro Yamada *s++ = '\0'; 214928ba53c0SMasahiro Yamada if(ignore_compatibility_form(type)) 215028ba53c0SMasahiro Yamada continue; 215128ba53c0SMasahiro Yamada } 215228ba53c0SMasahiro Yamada /* decode the decomposition into UTF-32 */ 215328ba53c0SMasahiro Yamada i = 0; 215428ba53c0SMasahiro Yamada while (*s) { 215528ba53c0SMasahiro Yamada mapping[i] = strtoul(s, &s, 16); 215628ba53c0SMasahiro Yamada if (!utf32valid(mapping[i])) 215728ba53c0SMasahiro Yamada line_fail(data_name, line); 215828ba53c0SMasahiro Yamada i++; 215928ba53c0SMasahiro Yamada } 216028ba53c0SMasahiro Yamada mapping[i++] = 0; 216128ba53c0SMasahiro Yamada 216228ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 216328ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 216428ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdi = um; 216528ba53c0SMasahiro Yamada 216628ba53c0SMasahiro Yamada if (verbose > 1) 216728ba53c0SMasahiro Yamada print_utf32nfdi(unichar); 216828ba53c0SMasahiro Yamada count++; 216928ba53c0SMasahiro Yamada } 217028ba53c0SMasahiro Yamada fclose(file); 217128ba53c0SMasahiro Yamada if (verbose > 0) 217228ba53c0SMasahiro Yamada printf("Found %d entries\n", count); 217328ba53c0SMasahiro Yamada if (count == 0) 217428ba53c0SMasahiro Yamada file_fail(data_name); 217528ba53c0SMasahiro Yamada } 217628ba53c0SMasahiro Yamada 217728ba53c0SMasahiro Yamada static void nfdicf_init(void) 217828ba53c0SMasahiro Yamada { 217928ba53c0SMasahiro Yamada FILE *file; 218028ba53c0SMasahiro Yamada unsigned int unichar; 218128ba53c0SMasahiro Yamada unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 218228ba53c0SMasahiro Yamada char status; 218328ba53c0SMasahiro Yamada char *s; 218428ba53c0SMasahiro Yamada unsigned int *um; 218528ba53c0SMasahiro Yamada int i; 218628ba53c0SMasahiro Yamada int count; 218728ba53c0SMasahiro Yamada int ret; 218828ba53c0SMasahiro Yamada 218928ba53c0SMasahiro Yamada if (verbose > 0) 219028ba53c0SMasahiro Yamada printf("Parsing %s\n", fold_name); 219128ba53c0SMasahiro Yamada file = fopen(fold_name, "r"); 219228ba53c0SMasahiro Yamada if (!file) 219328ba53c0SMasahiro Yamada open_fail(fold_name, errno); 219428ba53c0SMasahiro Yamada 219528ba53c0SMasahiro Yamada count = 0; 219628ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 219728ba53c0SMasahiro Yamada ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0); 219828ba53c0SMasahiro Yamada if (ret != 3) 219928ba53c0SMasahiro Yamada continue; 220028ba53c0SMasahiro Yamada if (!utf32valid(unichar)) 220128ba53c0SMasahiro Yamada line_fail(fold_name, line); 220228ba53c0SMasahiro Yamada /* Use the C+F casefold. */ 220328ba53c0SMasahiro Yamada if (status != 'C' && status != 'F') 220428ba53c0SMasahiro Yamada continue; 220528ba53c0SMasahiro Yamada s = buf0; 220628ba53c0SMasahiro Yamada if (*s == '<') 220728ba53c0SMasahiro Yamada while (*s++ != ' ') 220828ba53c0SMasahiro Yamada ; 220928ba53c0SMasahiro Yamada i = 0; 221028ba53c0SMasahiro Yamada while (*s) { 221128ba53c0SMasahiro Yamada mapping[i] = strtoul(s, &s, 16); 221228ba53c0SMasahiro Yamada if (!utf32valid(mapping[i])) 221328ba53c0SMasahiro Yamada line_fail(fold_name, line); 221428ba53c0SMasahiro Yamada i++; 221528ba53c0SMasahiro Yamada } 221628ba53c0SMasahiro Yamada mapping[i++] = 0; 221728ba53c0SMasahiro Yamada 221828ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 221928ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 222028ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdicf = um; 222128ba53c0SMasahiro Yamada 222228ba53c0SMasahiro Yamada if (verbose > 1) 222328ba53c0SMasahiro Yamada print_utf32nfdicf(unichar); 222428ba53c0SMasahiro Yamada count++; 222528ba53c0SMasahiro Yamada } 222628ba53c0SMasahiro Yamada fclose(file); 222728ba53c0SMasahiro Yamada if (verbose > 0) 222828ba53c0SMasahiro Yamada printf("Found %d entries\n", count); 222928ba53c0SMasahiro Yamada if (count == 0) 223028ba53c0SMasahiro Yamada file_fail(fold_name); 223128ba53c0SMasahiro Yamada } 223228ba53c0SMasahiro Yamada 223328ba53c0SMasahiro Yamada static void ignore_init(void) 223428ba53c0SMasahiro Yamada { 223528ba53c0SMasahiro Yamada FILE *file; 223628ba53c0SMasahiro Yamada unsigned int unichar; 223728ba53c0SMasahiro Yamada unsigned int first; 223828ba53c0SMasahiro Yamada unsigned int last; 223928ba53c0SMasahiro Yamada unsigned int *um; 224028ba53c0SMasahiro Yamada int count; 224128ba53c0SMasahiro Yamada int ret; 224228ba53c0SMasahiro Yamada 224328ba53c0SMasahiro Yamada if (verbose > 0) 224428ba53c0SMasahiro Yamada printf("Parsing %s\n", prop_name); 224528ba53c0SMasahiro Yamada file = fopen(prop_name, "r"); 224628ba53c0SMasahiro Yamada if (!file) 224728ba53c0SMasahiro Yamada open_fail(prop_name, errno); 224828ba53c0SMasahiro Yamada assert(file); 224928ba53c0SMasahiro Yamada count = 0; 225028ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 225128ba53c0SMasahiro Yamada ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0); 225228ba53c0SMasahiro Yamada if (ret == 3) { 225328ba53c0SMasahiro Yamada if (strcmp(buf0, "Default_Ignorable_Code_Point")) 225428ba53c0SMasahiro Yamada continue; 225528ba53c0SMasahiro Yamada if (!utf32valid(first) || !utf32valid(last)) 225628ba53c0SMasahiro Yamada line_fail(prop_name, line); 225728ba53c0SMasahiro Yamada for (unichar = first; unichar <= last; unichar++) { 225828ba53c0SMasahiro Yamada free(unicode_data[unichar].utf32nfdi); 225928ba53c0SMasahiro Yamada um = malloc(sizeof(unsigned int)); 226028ba53c0SMasahiro Yamada *um = 0; 226128ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdi = um; 226228ba53c0SMasahiro Yamada free(unicode_data[unichar].utf32nfdicf); 226328ba53c0SMasahiro Yamada um = malloc(sizeof(unsigned int)); 226428ba53c0SMasahiro Yamada *um = 0; 226528ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdicf = um; 226628ba53c0SMasahiro Yamada count++; 226728ba53c0SMasahiro Yamada } 226828ba53c0SMasahiro Yamada if (verbose > 1) 226928ba53c0SMasahiro Yamada printf(" %X..%X Default_Ignorable_Code_Point\n", 227028ba53c0SMasahiro Yamada first, last); 227128ba53c0SMasahiro Yamada continue; 227228ba53c0SMasahiro Yamada } 227328ba53c0SMasahiro Yamada ret = sscanf(line, "%X ; %s # ", &unichar, buf0); 227428ba53c0SMasahiro Yamada if (ret == 2) { 227528ba53c0SMasahiro Yamada if (strcmp(buf0, "Default_Ignorable_Code_Point")) 227628ba53c0SMasahiro Yamada continue; 227728ba53c0SMasahiro Yamada if (!utf32valid(unichar)) 227828ba53c0SMasahiro Yamada line_fail(prop_name, line); 227928ba53c0SMasahiro Yamada free(unicode_data[unichar].utf32nfdi); 228028ba53c0SMasahiro Yamada um = malloc(sizeof(unsigned int)); 228128ba53c0SMasahiro Yamada *um = 0; 228228ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdi = um; 228328ba53c0SMasahiro Yamada free(unicode_data[unichar].utf32nfdicf); 228428ba53c0SMasahiro Yamada um = malloc(sizeof(unsigned int)); 228528ba53c0SMasahiro Yamada *um = 0; 228628ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdicf = um; 228728ba53c0SMasahiro Yamada if (verbose > 1) 228828ba53c0SMasahiro Yamada printf(" %X Default_Ignorable_Code_Point\n", 228928ba53c0SMasahiro Yamada unichar); 229028ba53c0SMasahiro Yamada count++; 229128ba53c0SMasahiro Yamada continue; 229228ba53c0SMasahiro Yamada } 229328ba53c0SMasahiro Yamada } 229428ba53c0SMasahiro Yamada fclose(file); 229528ba53c0SMasahiro Yamada 229628ba53c0SMasahiro Yamada if (verbose > 0) 229728ba53c0SMasahiro Yamada printf("Found %d entries\n", count); 229828ba53c0SMasahiro Yamada if (count == 0) 229928ba53c0SMasahiro Yamada file_fail(prop_name); 230028ba53c0SMasahiro Yamada } 230128ba53c0SMasahiro Yamada 230228ba53c0SMasahiro Yamada static void corrections_init(void) 230328ba53c0SMasahiro Yamada { 230428ba53c0SMasahiro Yamada FILE *file; 230528ba53c0SMasahiro Yamada unsigned int unichar; 230628ba53c0SMasahiro Yamada unsigned int major; 230728ba53c0SMasahiro Yamada unsigned int minor; 230828ba53c0SMasahiro Yamada unsigned int revision; 230928ba53c0SMasahiro Yamada unsigned int age; 231028ba53c0SMasahiro Yamada unsigned int *um; 231128ba53c0SMasahiro Yamada unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 231228ba53c0SMasahiro Yamada char *s; 231328ba53c0SMasahiro Yamada int i; 231428ba53c0SMasahiro Yamada int count; 231528ba53c0SMasahiro Yamada int ret; 231628ba53c0SMasahiro Yamada 231728ba53c0SMasahiro Yamada if (verbose > 0) 231828ba53c0SMasahiro Yamada printf("Parsing %s\n", norm_name); 231928ba53c0SMasahiro Yamada file = fopen(norm_name, "r"); 232028ba53c0SMasahiro Yamada if (!file) 232128ba53c0SMasahiro Yamada open_fail(norm_name, errno); 232228ba53c0SMasahiro Yamada 232328ba53c0SMasahiro Yamada count = 0; 232428ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 232528ba53c0SMasahiro Yamada ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", 232628ba53c0SMasahiro Yamada &unichar, buf0, buf1, 232728ba53c0SMasahiro Yamada &major, &minor, &revision); 232828ba53c0SMasahiro Yamada if (ret != 6) 232928ba53c0SMasahiro Yamada continue; 233028ba53c0SMasahiro Yamada if (!utf32valid(unichar) || !age_valid(major, minor, revision)) 233128ba53c0SMasahiro Yamada line_fail(norm_name, line); 233228ba53c0SMasahiro Yamada count++; 233328ba53c0SMasahiro Yamada } 233428ba53c0SMasahiro Yamada corrections = calloc(count, sizeof(struct unicode_data)); 233528ba53c0SMasahiro Yamada corrections_count = count; 233628ba53c0SMasahiro Yamada rewind(file); 233728ba53c0SMasahiro Yamada 233828ba53c0SMasahiro Yamada count = 0; 233928ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 234028ba53c0SMasahiro Yamada ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #", 234128ba53c0SMasahiro Yamada &unichar, buf0, buf1, 234228ba53c0SMasahiro Yamada &major, &minor, &revision); 234328ba53c0SMasahiro Yamada if (ret != 6) 234428ba53c0SMasahiro Yamada continue; 234528ba53c0SMasahiro Yamada if (!utf32valid(unichar) || !age_valid(major, minor, revision)) 234628ba53c0SMasahiro Yamada line_fail(norm_name, line); 234728ba53c0SMasahiro Yamada corrections[count] = unicode_data[unichar]; 234828ba53c0SMasahiro Yamada assert(corrections[count].code == unichar); 234928ba53c0SMasahiro Yamada age = UNICODE_AGE(major, minor, revision); 235028ba53c0SMasahiro Yamada corrections[count].correction = age; 235128ba53c0SMasahiro Yamada 235228ba53c0SMasahiro Yamada i = 0; 235328ba53c0SMasahiro Yamada s = buf0; 235428ba53c0SMasahiro Yamada while (*s) { 235528ba53c0SMasahiro Yamada mapping[i] = strtoul(s, &s, 16); 235628ba53c0SMasahiro Yamada if (!utf32valid(mapping[i])) 235728ba53c0SMasahiro Yamada line_fail(norm_name, line); 235828ba53c0SMasahiro Yamada i++; 235928ba53c0SMasahiro Yamada } 236028ba53c0SMasahiro Yamada mapping[i++] = 0; 236128ba53c0SMasahiro Yamada 236228ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 236328ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 236428ba53c0SMasahiro Yamada corrections[count].utf32nfdi = um; 236528ba53c0SMasahiro Yamada 236628ba53c0SMasahiro Yamada if (verbose > 1) 236728ba53c0SMasahiro Yamada printf(" %X -> %s -> %s V%d_%d_%d\n", 236828ba53c0SMasahiro Yamada unichar, buf0, buf1, major, minor, revision); 236928ba53c0SMasahiro Yamada count++; 237028ba53c0SMasahiro Yamada } 237128ba53c0SMasahiro Yamada fclose(file); 237228ba53c0SMasahiro Yamada 237328ba53c0SMasahiro Yamada if (verbose > 0) 237428ba53c0SMasahiro Yamada printf("Found %d entries\n", count); 237528ba53c0SMasahiro Yamada if (count == 0) 237628ba53c0SMasahiro Yamada file_fail(norm_name); 237728ba53c0SMasahiro Yamada } 237828ba53c0SMasahiro Yamada 237928ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 238028ba53c0SMasahiro Yamada 238128ba53c0SMasahiro Yamada /* 238228ba53c0SMasahiro Yamada * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 238328ba53c0SMasahiro Yamada * 238428ba53c0SMasahiro Yamada * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 238528ba53c0SMasahiro Yamada * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 238628ba53c0SMasahiro Yamada * 238728ba53c0SMasahiro Yamada * SBase = 0xAC00 238828ba53c0SMasahiro Yamada * LBase = 0x1100 238928ba53c0SMasahiro Yamada * VBase = 0x1161 239028ba53c0SMasahiro Yamada * TBase = 0x11A7 239128ba53c0SMasahiro Yamada * LCount = 19 239228ba53c0SMasahiro Yamada * VCount = 21 239328ba53c0SMasahiro Yamada * TCount = 28 239428ba53c0SMasahiro Yamada * NCount = 588 (VCount * TCount) 239528ba53c0SMasahiro Yamada * SCount = 11172 (LCount * NCount) 239628ba53c0SMasahiro Yamada * 239728ba53c0SMasahiro Yamada * Decomposition: 239828ba53c0SMasahiro Yamada * SIndex = s - SBase 239928ba53c0SMasahiro Yamada * 240028ba53c0SMasahiro Yamada * LV (Canonical/Full) 240128ba53c0SMasahiro Yamada * LIndex = SIndex / NCount 240228ba53c0SMasahiro Yamada * VIndex = (Sindex % NCount) / TCount 240328ba53c0SMasahiro Yamada * LPart = LBase + LIndex 240428ba53c0SMasahiro Yamada * VPart = VBase + VIndex 240528ba53c0SMasahiro Yamada * 240628ba53c0SMasahiro Yamada * LVT (Canonical) 240728ba53c0SMasahiro Yamada * LVIndex = (SIndex / TCount) * TCount 240828ba53c0SMasahiro Yamada * TIndex = (Sindex % TCount) 240928ba53c0SMasahiro Yamada * LVPart = SBase + LVIndex 241028ba53c0SMasahiro Yamada * TPart = TBase + TIndex 241128ba53c0SMasahiro Yamada * 241228ba53c0SMasahiro Yamada * LVT (Full) 241328ba53c0SMasahiro Yamada * LIndex = SIndex / NCount 241428ba53c0SMasahiro Yamada * VIndex = (Sindex % NCount) / TCount 241528ba53c0SMasahiro Yamada * TIndex = (Sindex % TCount) 241628ba53c0SMasahiro Yamada * LPart = LBase + LIndex 241728ba53c0SMasahiro Yamada * VPart = VBase + VIndex 241828ba53c0SMasahiro Yamada * if (TIndex == 0) { 241928ba53c0SMasahiro Yamada * d = <LPart, VPart> 242028ba53c0SMasahiro Yamada * } else { 242128ba53c0SMasahiro Yamada * TPart = TBase + TIndex 242228ba53c0SMasahiro Yamada * d = <LPart, VPart, TPart> 242328ba53c0SMasahiro Yamada * } 242428ba53c0SMasahiro Yamada * 242528ba53c0SMasahiro Yamada */ 242628ba53c0SMasahiro Yamada 242728ba53c0SMasahiro Yamada static void hangul_decompose(void) 242828ba53c0SMasahiro Yamada { 242928ba53c0SMasahiro Yamada unsigned int sb = 0xAC00; 243028ba53c0SMasahiro Yamada unsigned int lb = 0x1100; 243128ba53c0SMasahiro Yamada unsigned int vb = 0x1161; 243228ba53c0SMasahiro Yamada unsigned int tb = 0x11a7; 243328ba53c0SMasahiro Yamada /* unsigned int lc = 19; */ 243428ba53c0SMasahiro Yamada unsigned int vc = 21; 243528ba53c0SMasahiro Yamada unsigned int tc = 28; 243628ba53c0SMasahiro Yamada unsigned int nc = (vc * tc); 243728ba53c0SMasahiro Yamada /* unsigned int sc = (lc * nc); */ 243828ba53c0SMasahiro Yamada unsigned int unichar; 243928ba53c0SMasahiro Yamada unsigned int mapping[4]; 244028ba53c0SMasahiro Yamada unsigned int *um; 244128ba53c0SMasahiro Yamada int count; 244228ba53c0SMasahiro Yamada int i; 244328ba53c0SMasahiro Yamada 244428ba53c0SMasahiro Yamada if (verbose > 0) 244528ba53c0SMasahiro Yamada printf("Decomposing hangul\n"); 244628ba53c0SMasahiro Yamada /* Hangul */ 244728ba53c0SMasahiro Yamada count = 0; 244828ba53c0SMasahiro Yamada for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) { 244928ba53c0SMasahiro Yamada unsigned int si = unichar - sb; 245028ba53c0SMasahiro Yamada unsigned int li = si / nc; 245128ba53c0SMasahiro Yamada unsigned int vi = (si % nc) / tc; 245228ba53c0SMasahiro Yamada unsigned int ti = si % tc; 245328ba53c0SMasahiro Yamada 245428ba53c0SMasahiro Yamada i = 0; 245528ba53c0SMasahiro Yamada mapping[i++] = lb + li; 245628ba53c0SMasahiro Yamada mapping[i++] = vb + vi; 245728ba53c0SMasahiro Yamada if (ti) 245828ba53c0SMasahiro Yamada mapping[i++] = tb + ti; 245928ba53c0SMasahiro Yamada mapping[i++] = 0; 246028ba53c0SMasahiro Yamada 246128ba53c0SMasahiro Yamada assert(!unicode_data[unichar].utf32nfdi); 246228ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 246328ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 246428ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdi = um; 246528ba53c0SMasahiro Yamada 246628ba53c0SMasahiro Yamada assert(!unicode_data[unichar].utf32nfdicf); 246728ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 246828ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 246928ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdicf = um; 247028ba53c0SMasahiro Yamada 247128ba53c0SMasahiro Yamada /* 247228ba53c0SMasahiro Yamada * Add a cookie as a reminder that the hangul syllable 247328ba53c0SMasahiro Yamada * decompositions must not be stored in the generated 247428ba53c0SMasahiro Yamada * trie. 247528ba53c0SMasahiro Yamada */ 247628ba53c0SMasahiro Yamada unicode_data[unichar].utf8nfdi = malloc(2); 247728ba53c0SMasahiro Yamada unicode_data[unichar].utf8nfdi[0] = HANGUL; 247828ba53c0SMasahiro Yamada unicode_data[unichar].utf8nfdi[1] = '\0'; 247928ba53c0SMasahiro Yamada 248028ba53c0SMasahiro Yamada if (verbose > 1) 248128ba53c0SMasahiro Yamada print_utf32nfdi(unichar); 248228ba53c0SMasahiro Yamada 248328ba53c0SMasahiro Yamada count++; 248428ba53c0SMasahiro Yamada } 248528ba53c0SMasahiro Yamada if (verbose > 0) 248628ba53c0SMasahiro Yamada printf("Created %d entries\n", count); 248728ba53c0SMasahiro Yamada } 248828ba53c0SMasahiro Yamada 248928ba53c0SMasahiro Yamada static void nfdi_decompose(void) 249028ba53c0SMasahiro Yamada { 249128ba53c0SMasahiro Yamada unsigned int unichar; 249228ba53c0SMasahiro Yamada unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 249328ba53c0SMasahiro Yamada unsigned int *um; 249428ba53c0SMasahiro Yamada unsigned int *dc; 249528ba53c0SMasahiro Yamada int count; 249628ba53c0SMasahiro Yamada int i; 249728ba53c0SMasahiro Yamada int j; 249828ba53c0SMasahiro Yamada int ret; 249928ba53c0SMasahiro Yamada 250028ba53c0SMasahiro Yamada if (verbose > 0) 250128ba53c0SMasahiro Yamada printf("Decomposing nfdi\n"); 250228ba53c0SMasahiro Yamada 250328ba53c0SMasahiro Yamada count = 0; 250428ba53c0SMasahiro Yamada for (unichar = 0; unichar != 0x110000; unichar++) { 250528ba53c0SMasahiro Yamada if (!unicode_data[unichar].utf32nfdi) 250628ba53c0SMasahiro Yamada continue; 250728ba53c0SMasahiro Yamada for (;;) { 250828ba53c0SMasahiro Yamada ret = 1; 250928ba53c0SMasahiro Yamada i = 0; 251028ba53c0SMasahiro Yamada um = unicode_data[unichar].utf32nfdi; 251128ba53c0SMasahiro Yamada while (*um) { 251228ba53c0SMasahiro Yamada dc = unicode_data[*um].utf32nfdi; 251328ba53c0SMasahiro Yamada if (dc) { 251428ba53c0SMasahiro Yamada for (j = 0; dc[j]; j++) 251528ba53c0SMasahiro Yamada mapping[i++] = dc[j]; 251628ba53c0SMasahiro Yamada ret = 0; 251728ba53c0SMasahiro Yamada } else { 251828ba53c0SMasahiro Yamada mapping[i++] = *um; 251928ba53c0SMasahiro Yamada } 252028ba53c0SMasahiro Yamada um++; 252128ba53c0SMasahiro Yamada } 252228ba53c0SMasahiro Yamada mapping[i++] = 0; 252328ba53c0SMasahiro Yamada if (ret) 252428ba53c0SMasahiro Yamada break; 252528ba53c0SMasahiro Yamada free(unicode_data[unichar].utf32nfdi); 252628ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 252728ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 252828ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdi = um; 252928ba53c0SMasahiro Yamada } 253028ba53c0SMasahiro Yamada /* Add this decomposition to nfdicf if there is no entry. */ 253128ba53c0SMasahiro Yamada if (!unicode_data[unichar].utf32nfdicf) { 253228ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 253328ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 253428ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdicf = um; 253528ba53c0SMasahiro Yamada } 253628ba53c0SMasahiro Yamada if (verbose > 1) 253728ba53c0SMasahiro Yamada print_utf32nfdi(unichar); 253828ba53c0SMasahiro Yamada count++; 253928ba53c0SMasahiro Yamada } 254028ba53c0SMasahiro Yamada if (verbose > 0) 254128ba53c0SMasahiro Yamada printf("Processed %d entries\n", count); 254228ba53c0SMasahiro Yamada } 254328ba53c0SMasahiro Yamada 254428ba53c0SMasahiro Yamada static void nfdicf_decompose(void) 254528ba53c0SMasahiro Yamada { 254628ba53c0SMasahiro Yamada unsigned int unichar; 254728ba53c0SMasahiro Yamada unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */ 254828ba53c0SMasahiro Yamada unsigned int *um; 254928ba53c0SMasahiro Yamada unsigned int *dc; 255028ba53c0SMasahiro Yamada int count; 255128ba53c0SMasahiro Yamada int i; 255228ba53c0SMasahiro Yamada int j; 255328ba53c0SMasahiro Yamada int ret; 255428ba53c0SMasahiro Yamada 255528ba53c0SMasahiro Yamada if (verbose > 0) 255628ba53c0SMasahiro Yamada printf("Decomposing nfdicf\n"); 255728ba53c0SMasahiro Yamada count = 0; 255828ba53c0SMasahiro Yamada for (unichar = 0; unichar != 0x110000; unichar++) { 255928ba53c0SMasahiro Yamada if (!unicode_data[unichar].utf32nfdicf) 256028ba53c0SMasahiro Yamada continue; 256128ba53c0SMasahiro Yamada for (;;) { 256228ba53c0SMasahiro Yamada ret = 1; 256328ba53c0SMasahiro Yamada i = 0; 256428ba53c0SMasahiro Yamada um = unicode_data[unichar].utf32nfdicf; 256528ba53c0SMasahiro Yamada while (*um) { 256628ba53c0SMasahiro Yamada dc = unicode_data[*um].utf32nfdicf; 256728ba53c0SMasahiro Yamada if (dc) { 256828ba53c0SMasahiro Yamada for (j = 0; dc[j]; j++) 256928ba53c0SMasahiro Yamada mapping[i++] = dc[j]; 257028ba53c0SMasahiro Yamada ret = 0; 257128ba53c0SMasahiro Yamada } else { 257228ba53c0SMasahiro Yamada mapping[i++] = *um; 257328ba53c0SMasahiro Yamada } 257428ba53c0SMasahiro Yamada um++; 257528ba53c0SMasahiro Yamada } 257628ba53c0SMasahiro Yamada mapping[i++] = 0; 257728ba53c0SMasahiro Yamada if (ret) 257828ba53c0SMasahiro Yamada break; 257928ba53c0SMasahiro Yamada free(unicode_data[unichar].utf32nfdicf); 258028ba53c0SMasahiro Yamada um = malloc(i * sizeof(unsigned int)); 258128ba53c0SMasahiro Yamada memcpy(um, mapping, i * sizeof(unsigned int)); 258228ba53c0SMasahiro Yamada unicode_data[unichar].utf32nfdicf = um; 258328ba53c0SMasahiro Yamada } 258428ba53c0SMasahiro Yamada if (verbose > 1) 258528ba53c0SMasahiro Yamada print_utf32nfdicf(unichar); 258628ba53c0SMasahiro Yamada count++; 258728ba53c0SMasahiro Yamada } 258828ba53c0SMasahiro Yamada if (verbose > 0) 258928ba53c0SMasahiro Yamada printf("Processed %d entries\n", count); 259028ba53c0SMasahiro Yamada } 259128ba53c0SMasahiro Yamada 259228ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 259328ba53c0SMasahiro Yamada 259428ba53c0SMasahiro Yamada int utf8agemax(struct tree *, const char *); 259528ba53c0SMasahiro Yamada int utf8nagemax(struct tree *, const char *, size_t); 259628ba53c0SMasahiro Yamada int utf8agemin(struct tree *, const char *); 259728ba53c0SMasahiro Yamada int utf8nagemin(struct tree *, const char *, size_t); 259828ba53c0SMasahiro Yamada ssize_t utf8len(struct tree *, const char *); 259928ba53c0SMasahiro Yamada ssize_t utf8nlen(struct tree *, const char *, size_t); 260028ba53c0SMasahiro Yamada struct utf8cursor; 260128ba53c0SMasahiro Yamada int utf8cursor(struct utf8cursor *, struct tree *, const char *); 260228ba53c0SMasahiro Yamada int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t); 260328ba53c0SMasahiro Yamada int utf8byte(struct utf8cursor *); 260428ba53c0SMasahiro Yamada 260528ba53c0SMasahiro Yamada /* 260628ba53c0SMasahiro Yamada * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0) 260728ba53c0SMasahiro Yamada * 260828ba53c0SMasahiro Yamada * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 260928ba53c0SMasahiro Yamada * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 261028ba53c0SMasahiro Yamada * 261128ba53c0SMasahiro Yamada * SBase = 0xAC00 261228ba53c0SMasahiro Yamada * LBase = 0x1100 261328ba53c0SMasahiro Yamada * VBase = 0x1161 261428ba53c0SMasahiro Yamada * TBase = 0x11A7 261528ba53c0SMasahiro Yamada * LCount = 19 261628ba53c0SMasahiro Yamada * VCount = 21 261728ba53c0SMasahiro Yamada * TCount = 28 261828ba53c0SMasahiro Yamada * NCount = 588 (VCount * TCount) 261928ba53c0SMasahiro Yamada * SCount = 11172 (LCount * NCount) 262028ba53c0SMasahiro Yamada * 262128ba53c0SMasahiro Yamada * Decomposition: 262228ba53c0SMasahiro Yamada * SIndex = s - SBase 262328ba53c0SMasahiro Yamada * 262428ba53c0SMasahiro Yamada * LV (Canonical/Full) 262528ba53c0SMasahiro Yamada * LIndex = SIndex / NCount 262628ba53c0SMasahiro Yamada * VIndex = (Sindex % NCount) / TCount 262728ba53c0SMasahiro Yamada * LPart = LBase + LIndex 262828ba53c0SMasahiro Yamada * VPart = VBase + VIndex 262928ba53c0SMasahiro Yamada * 263028ba53c0SMasahiro Yamada * LVT (Canonical) 263128ba53c0SMasahiro Yamada * LVIndex = (SIndex / TCount) * TCount 263228ba53c0SMasahiro Yamada * TIndex = (Sindex % TCount) 263328ba53c0SMasahiro Yamada * LVPart = SBase + LVIndex 263428ba53c0SMasahiro Yamada * TPart = TBase + TIndex 263528ba53c0SMasahiro Yamada * 263628ba53c0SMasahiro Yamada * LVT (Full) 263728ba53c0SMasahiro Yamada * LIndex = SIndex / NCount 263828ba53c0SMasahiro Yamada * VIndex = (Sindex % NCount) / TCount 263928ba53c0SMasahiro Yamada * TIndex = (Sindex % TCount) 264028ba53c0SMasahiro Yamada * LPart = LBase + LIndex 264128ba53c0SMasahiro Yamada * VPart = VBase + VIndex 264228ba53c0SMasahiro Yamada * if (TIndex == 0) { 264328ba53c0SMasahiro Yamada * d = <LPart, VPart> 264428ba53c0SMasahiro Yamada * } else { 264528ba53c0SMasahiro Yamada * TPart = TBase + TIndex 264628ba53c0SMasahiro Yamada * d = <LPart, VPart, TPart> 264728ba53c0SMasahiro Yamada * } 264828ba53c0SMasahiro Yamada */ 264928ba53c0SMasahiro Yamada 265028ba53c0SMasahiro Yamada /* Constants */ 265128ba53c0SMasahiro Yamada #define SB (0xAC00) 265228ba53c0SMasahiro Yamada #define LB (0x1100) 265328ba53c0SMasahiro Yamada #define VB (0x1161) 265428ba53c0SMasahiro Yamada #define TB (0x11A7) 265528ba53c0SMasahiro Yamada #define LC (19) 265628ba53c0SMasahiro Yamada #define VC (21) 265728ba53c0SMasahiro Yamada #define TC (28) 265828ba53c0SMasahiro Yamada #define NC (VC * TC) 265928ba53c0SMasahiro Yamada #define SC (LC * NC) 266028ba53c0SMasahiro Yamada 266128ba53c0SMasahiro Yamada /* Algorithmic decomposition of hangul syllable. */ 266228ba53c0SMasahiro Yamada static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul) 266328ba53c0SMasahiro Yamada { 266428ba53c0SMasahiro Yamada unsigned int si; 266528ba53c0SMasahiro Yamada unsigned int li; 266628ba53c0SMasahiro Yamada unsigned int vi; 266728ba53c0SMasahiro Yamada unsigned int ti; 266828ba53c0SMasahiro Yamada unsigned char *h; 266928ba53c0SMasahiro Yamada 267028ba53c0SMasahiro Yamada /* Calculate the SI, LI, VI, and TI values. */ 267128ba53c0SMasahiro Yamada si = utf8decode(str) - SB; 267228ba53c0SMasahiro Yamada li = si / NC; 267328ba53c0SMasahiro Yamada vi = (si % NC) / TC; 267428ba53c0SMasahiro Yamada ti = si % TC; 267528ba53c0SMasahiro Yamada 267628ba53c0SMasahiro Yamada /* Fill in base of leaf. */ 267728ba53c0SMasahiro Yamada h = hangul; 267828ba53c0SMasahiro Yamada LEAF_GEN(h) = 2; 267928ba53c0SMasahiro Yamada LEAF_CCC(h) = DECOMPOSE; 268028ba53c0SMasahiro Yamada h += 2; 268128ba53c0SMasahiro Yamada 268228ba53c0SMasahiro Yamada /* Add LPart, a 3-byte UTF-8 sequence. */ 268328ba53c0SMasahiro Yamada h += utf8encode((char *)h, li + LB); 268428ba53c0SMasahiro Yamada 268528ba53c0SMasahiro Yamada /* Add VPart, a 3-byte UTF-8 sequence. */ 268628ba53c0SMasahiro Yamada h += utf8encode((char *)h, vi + VB); 268728ba53c0SMasahiro Yamada 268828ba53c0SMasahiro Yamada /* Add TPart if required, also a 3-byte UTF-8 sequence. */ 268928ba53c0SMasahiro Yamada if (ti) 269028ba53c0SMasahiro Yamada h += utf8encode((char *)h, ti + TB); 269128ba53c0SMasahiro Yamada 269228ba53c0SMasahiro Yamada /* Terminate string. */ 269328ba53c0SMasahiro Yamada h[0] = '\0'; 269428ba53c0SMasahiro Yamada 269528ba53c0SMasahiro Yamada return hangul; 269628ba53c0SMasahiro Yamada } 269728ba53c0SMasahiro Yamada 269828ba53c0SMasahiro Yamada /* 269928ba53c0SMasahiro Yamada * Use trie to scan s, touching at most len bytes. 270028ba53c0SMasahiro Yamada * Returns the leaf if one exists, NULL otherwise. 270128ba53c0SMasahiro Yamada * 270228ba53c0SMasahiro Yamada * A non-NULL return guarantees that the UTF-8 sequence starting at s 270328ba53c0SMasahiro Yamada * is well-formed and corresponds to a known unicode code point. The 270428ba53c0SMasahiro Yamada * shorthand for this will be "is valid UTF-8 unicode". 270528ba53c0SMasahiro Yamada */ 270628ba53c0SMasahiro Yamada static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul, 270728ba53c0SMasahiro Yamada const char *s, size_t len) 270828ba53c0SMasahiro Yamada { 270928ba53c0SMasahiro Yamada utf8trie_t *trie; 271028ba53c0SMasahiro Yamada int offlen; 271128ba53c0SMasahiro Yamada int offset; 271228ba53c0SMasahiro Yamada int mask; 271328ba53c0SMasahiro Yamada int node; 271428ba53c0SMasahiro Yamada 271528ba53c0SMasahiro Yamada if (!tree) 271628ba53c0SMasahiro Yamada return NULL; 271728ba53c0SMasahiro Yamada if (len == 0) 271828ba53c0SMasahiro Yamada return NULL; 271928ba53c0SMasahiro Yamada node = 1; 272028ba53c0SMasahiro Yamada trie = utf8data + tree->index; 272128ba53c0SMasahiro Yamada while (node) { 272228ba53c0SMasahiro Yamada offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT; 272328ba53c0SMasahiro Yamada if (*trie & NEXTBYTE) { 272428ba53c0SMasahiro Yamada if (--len == 0) 272528ba53c0SMasahiro Yamada return NULL; 272628ba53c0SMasahiro Yamada s++; 272728ba53c0SMasahiro Yamada } 272828ba53c0SMasahiro Yamada mask = 1 << (*trie & BITNUM); 272928ba53c0SMasahiro Yamada if (*s & mask) { 273028ba53c0SMasahiro Yamada /* Right leg */ 273128ba53c0SMasahiro Yamada if (offlen) { 273228ba53c0SMasahiro Yamada /* Right node at offset of trie */ 273328ba53c0SMasahiro Yamada node = (*trie & RIGHTNODE); 273428ba53c0SMasahiro Yamada offset = trie[offlen]; 273528ba53c0SMasahiro Yamada while (--offlen) { 273628ba53c0SMasahiro Yamada offset <<= 8; 273728ba53c0SMasahiro Yamada offset |= trie[offlen]; 273828ba53c0SMasahiro Yamada } 273928ba53c0SMasahiro Yamada trie += offset; 274028ba53c0SMasahiro Yamada } else if (*trie & RIGHTPATH) { 274128ba53c0SMasahiro Yamada /* Right node after this node */ 274228ba53c0SMasahiro Yamada node = (*trie & TRIENODE); 274328ba53c0SMasahiro Yamada trie++; 274428ba53c0SMasahiro Yamada } else { 274528ba53c0SMasahiro Yamada /* No right node. */ 274628ba53c0SMasahiro Yamada return NULL; 274728ba53c0SMasahiro Yamada } 274828ba53c0SMasahiro Yamada } else { 274928ba53c0SMasahiro Yamada /* Left leg */ 275028ba53c0SMasahiro Yamada if (offlen) { 275128ba53c0SMasahiro Yamada /* Left node after this node. */ 275228ba53c0SMasahiro Yamada node = (*trie & LEFTNODE); 275328ba53c0SMasahiro Yamada trie += offlen + 1; 275428ba53c0SMasahiro Yamada } else if (*trie & RIGHTPATH) { 275528ba53c0SMasahiro Yamada /* No left node. */ 275628ba53c0SMasahiro Yamada return NULL; 275728ba53c0SMasahiro Yamada } else { 275828ba53c0SMasahiro Yamada /* Left node after this node */ 275928ba53c0SMasahiro Yamada node = (*trie & TRIENODE); 276028ba53c0SMasahiro Yamada trie++; 276128ba53c0SMasahiro Yamada } 276228ba53c0SMasahiro Yamada } 276328ba53c0SMasahiro Yamada } 276428ba53c0SMasahiro Yamada /* 276528ba53c0SMasahiro Yamada * Hangul decomposition is done algorithmically. These are the 276628ba53c0SMasahiro Yamada * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is 276728ba53c0SMasahiro Yamada * always 3 bytes long, so s has been advanced twice, and the 276828ba53c0SMasahiro Yamada * start of the sequence is at s-2. 276928ba53c0SMasahiro Yamada */ 277028ba53c0SMasahiro Yamada if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL) 277128ba53c0SMasahiro Yamada trie = utf8hangul(s - 2, hangul); 277228ba53c0SMasahiro Yamada return trie; 277328ba53c0SMasahiro Yamada } 277428ba53c0SMasahiro Yamada 277528ba53c0SMasahiro Yamada /* 277628ba53c0SMasahiro Yamada * Use trie to scan s. 277728ba53c0SMasahiro Yamada * Returns the leaf if one exists, NULL otherwise. 277828ba53c0SMasahiro Yamada * 277928ba53c0SMasahiro Yamada * Forwards to trie_nlookup(). 278028ba53c0SMasahiro Yamada */ 278128ba53c0SMasahiro Yamada static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul, 278228ba53c0SMasahiro Yamada const char *s) 278328ba53c0SMasahiro Yamada { 278428ba53c0SMasahiro Yamada return utf8nlookup(tree, hangul, s, (size_t)-1); 278528ba53c0SMasahiro Yamada } 278628ba53c0SMasahiro Yamada 278728ba53c0SMasahiro Yamada /* 278828ba53c0SMasahiro Yamada * Return the number of bytes used by the current UTF-8 sequence. 278928ba53c0SMasahiro Yamada * Assumes the input points to the first byte of a valid UTF-8 279028ba53c0SMasahiro Yamada * sequence. 279128ba53c0SMasahiro Yamada */ 279228ba53c0SMasahiro Yamada static inline int utf8clen(const char *s) 279328ba53c0SMasahiro Yamada { 279428ba53c0SMasahiro Yamada unsigned char c = *s; 279528ba53c0SMasahiro Yamada return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0); 279628ba53c0SMasahiro Yamada } 279728ba53c0SMasahiro Yamada 279828ba53c0SMasahiro Yamada /* 279928ba53c0SMasahiro Yamada * Maximum age of any character in s. 280028ba53c0SMasahiro Yamada * Return -1 if s is not valid UTF-8 unicode. 280128ba53c0SMasahiro Yamada * Return 0 if only non-assigned code points are used. 280228ba53c0SMasahiro Yamada */ 280328ba53c0SMasahiro Yamada int utf8agemax(struct tree *tree, const char *s) 280428ba53c0SMasahiro Yamada { 280528ba53c0SMasahiro Yamada utf8leaf_t *leaf; 280628ba53c0SMasahiro Yamada int age = 0; 280728ba53c0SMasahiro Yamada int leaf_age; 280828ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 280928ba53c0SMasahiro Yamada 281028ba53c0SMasahiro Yamada if (!tree) 281128ba53c0SMasahiro Yamada return -1; 281228ba53c0SMasahiro Yamada 281328ba53c0SMasahiro Yamada while (*s) { 281428ba53c0SMasahiro Yamada leaf = utf8lookup(tree, hangul, s); 281528ba53c0SMasahiro Yamada if (!leaf) 281628ba53c0SMasahiro Yamada return -1; 281728ba53c0SMasahiro Yamada leaf_age = ages[LEAF_GEN(leaf)]; 281828ba53c0SMasahiro Yamada if (leaf_age <= tree->maxage && leaf_age > age) 281928ba53c0SMasahiro Yamada age = leaf_age; 282028ba53c0SMasahiro Yamada s += utf8clen(s); 282128ba53c0SMasahiro Yamada } 282228ba53c0SMasahiro Yamada return age; 282328ba53c0SMasahiro Yamada } 282428ba53c0SMasahiro Yamada 282528ba53c0SMasahiro Yamada /* 282628ba53c0SMasahiro Yamada * Minimum age of any character in s. 282728ba53c0SMasahiro Yamada * Return -1 if s is not valid UTF-8 unicode. 282828ba53c0SMasahiro Yamada * Return 0 if non-assigned code points are used. 282928ba53c0SMasahiro Yamada */ 283028ba53c0SMasahiro Yamada int utf8agemin(struct tree *tree, const char *s) 283128ba53c0SMasahiro Yamada { 283228ba53c0SMasahiro Yamada utf8leaf_t *leaf; 283328ba53c0SMasahiro Yamada int age; 283428ba53c0SMasahiro Yamada int leaf_age; 283528ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 283628ba53c0SMasahiro Yamada 283728ba53c0SMasahiro Yamada if (!tree) 283828ba53c0SMasahiro Yamada return -1; 283928ba53c0SMasahiro Yamada age = tree->maxage; 284028ba53c0SMasahiro Yamada while (*s) { 284128ba53c0SMasahiro Yamada leaf = utf8lookup(tree, hangul, s); 284228ba53c0SMasahiro Yamada if (!leaf) 284328ba53c0SMasahiro Yamada return -1; 284428ba53c0SMasahiro Yamada leaf_age = ages[LEAF_GEN(leaf)]; 284528ba53c0SMasahiro Yamada if (leaf_age <= tree->maxage && leaf_age < age) 284628ba53c0SMasahiro Yamada age = leaf_age; 284728ba53c0SMasahiro Yamada s += utf8clen(s); 284828ba53c0SMasahiro Yamada } 284928ba53c0SMasahiro Yamada return age; 285028ba53c0SMasahiro Yamada } 285128ba53c0SMasahiro Yamada 285228ba53c0SMasahiro Yamada /* 285328ba53c0SMasahiro Yamada * Maximum age of any character in s, touch at most len bytes. 285428ba53c0SMasahiro Yamada * Return -1 if s is not valid UTF-8 unicode. 285528ba53c0SMasahiro Yamada */ 285628ba53c0SMasahiro Yamada int utf8nagemax(struct tree *tree, const char *s, size_t len) 285728ba53c0SMasahiro Yamada { 285828ba53c0SMasahiro Yamada utf8leaf_t *leaf; 285928ba53c0SMasahiro Yamada int age = 0; 286028ba53c0SMasahiro Yamada int leaf_age; 286128ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 286228ba53c0SMasahiro Yamada 286328ba53c0SMasahiro Yamada if (!tree) 286428ba53c0SMasahiro Yamada return -1; 286528ba53c0SMasahiro Yamada 286628ba53c0SMasahiro Yamada while (len && *s) { 286728ba53c0SMasahiro Yamada leaf = utf8nlookup(tree, hangul, s, len); 286828ba53c0SMasahiro Yamada if (!leaf) 286928ba53c0SMasahiro Yamada return -1; 287028ba53c0SMasahiro Yamada leaf_age = ages[LEAF_GEN(leaf)]; 287128ba53c0SMasahiro Yamada if (leaf_age <= tree->maxage && leaf_age > age) 287228ba53c0SMasahiro Yamada age = leaf_age; 287328ba53c0SMasahiro Yamada len -= utf8clen(s); 287428ba53c0SMasahiro Yamada s += utf8clen(s); 287528ba53c0SMasahiro Yamada } 287628ba53c0SMasahiro Yamada return age; 287728ba53c0SMasahiro Yamada } 287828ba53c0SMasahiro Yamada 287928ba53c0SMasahiro Yamada /* 288028ba53c0SMasahiro Yamada * Maximum age of any character in s, touch at most len bytes. 288128ba53c0SMasahiro Yamada * Return -1 if s is not valid UTF-8 unicode. 288228ba53c0SMasahiro Yamada */ 288328ba53c0SMasahiro Yamada int utf8nagemin(struct tree *tree, const char *s, size_t len) 288428ba53c0SMasahiro Yamada { 288528ba53c0SMasahiro Yamada utf8leaf_t *leaf; 288628ba53c0SMasahiro Yamada int leaf_age; 288728ba53c0SMasahiro Yamada int age; 288828ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 288928ba53c0SMasahiro Yamada 289028ba53c0SMasahiro Yamada if (!tree) 289128ba53c0SMasahiro Yamada return -1; 289228ba53c0SMasahiro Yamada age = tree->maxage; 289328ba53c0SMasahiro Yamada while (len && *s) { 289428ba53c0SMasahiro Yamada leaf = utf8nlookup(tree, hangul, s, len); 289528ba53c0SMasahiro Yamada if (!leaf) 289628ba53c0SMasahiro Yamada return -1; 289728ba53c0SMasahiro Yamada leaf_age = ages[LEAF_GEN(leaf)]; 289828ba53c0SMasahiro Yamada if (leaf_age <= tree->maxage && leaf_age < age) 289928ba53c0SMasahiro Yamada age = leaf_age; 290028ba53c0SMasahiro Yamada len -= utf8clen(s); 290128ba53c0SMasahiro Yamada s += utf8clen(s); 290228ba53c0SMasahiro Yamada } 290328ba53c0SMasahiro Yamada return age; 290428ba53c0SMasahiro Yamada } 290528ba53c0SMasahiro Yamada 290628ba53c0SMasahiro Yamada /* 290728ba53c0SMasahiro Yamada * Length of the normalization of s. 290828ba53c0SMasahiro Yamada * Return -1 if s is not valid UTF-8 unicode. 290928ba53c0SMasahiro Yamada * 291028ba53c0SMasahiro Yamada * A string of Default_Ignorable_Code_Point has length 0. 291128ba53c0SMasahiro Yamada */ 291228ba53c0SMasahiro Yamada ssize_t utf8len(struct tree *tree, const char *s) 291328ba53c0SMasahiro Yamada { 291428ba53c0SMasahiro Yamada utf8leaf_t *leaf; 291528ba53c0SMasahiro Yamada size_t ret = 0; 291628ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 291728ba53c0SMasahiro Yamada 291828ba53c0SMasahiro Yamada if (!tree) 291928ba53c0SMasahiro Yamada return -1; 292028ba53c0SMasahiro Yamada while (*s) { 292128ba53c0SMasahiro Yamada leaf = utf8lookup(tree, hangul, s); 292228ba53c0SMasahiro Yamada if (!leaf) 292328ba53c0SMasahiro Yamada return -1; 292428ba53c0SMasahiro Yamada if (ages[LEAF_GEN(leaf)] > tree->maxage) 292528ba53c0SMasahiro Yamada ret += utf8clen(s); 292628ba53c0SMasahiro Yamada else if (LEAF_CCC(leaf) == DECOMPOSE) 292728ba53c0SMasahiro Yamada ret += strlen(LEAF_STR(leaf)); 292828ba53c0SMasahiro Yamada else 292928ba53c0SMasahiro Yamada ret += utf8clen(s); 293028ba53c0SMasahiro Yamada s += utf8clen(s); 293128ba53c0SMasahiro Yamada } 293228ba53c0SMasahiro Yamada return ret; 293328ba53c0SMasahiro Yamada } 293428ba53c0SMasahiro Yamada 293528ba53c0SMasahiro Yamada /* 293628ba53c0SMasahiro Yamada * Length of the normalization of s, touch at most len bytes. 293728ba53c0SMasahiro Yamada * Return -1 if s is not valid UTF-8 unicode. 293828ba53c0SMasahiro Yamada */ 293928ba53c0SMasahiro Yamada ssize_t utf8nlen(struct tree *tree, const char *s, size_t len) 294028ba53c0SMasahiro Yamada { 294128ba53c0SMasahiro Yamada utf8leaf_t *leaf; 294228ba53c0SMasahiro Yamada size_t ret = 0; 294328ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 294428ba53c0SMasahiro Yamada 294528ba53c0SMasahiro Yamada if (!tree) 294628ba53c0SMasahiro Yamada return -1; 294728ba53c0SMasahiro Yamada while (len && *s) { 294828ba53c0SMasahiro Yamada leaf = utf8nlookup(tree, hangul, s, len); 294928ba53c0SMasahiro Yamada if (!leaf) 295028ba53c0SMasahiro Yamada return -1; 295128ba53c0SMasahiro Yamada if (ages[LEAF_GEN(leaf)] > tree->maxage) 295228ba53c0SMasahiro Yamada ret += utf8clen(s); 295328ba53c0SMasahiro Yamada else if (LEAF_CCC(leaf) == DECOMPOSE) 295428ba53c0SMasahiro Yamada ret += strlen(LEAF_STR(leaf)); 295528ba53c0SMasahiro Yamada else 295628ba53c0SMasahiro Yamada ret += utf8clen(s); 295728ba53c0SMasahiro Yamada len -= utf8clen(s); 295828ba53c0SMasahiro Yamada s += utf8clen(s); 295928ba53c0SMasahiro Yamada } 296028ba53c0SMasahiro Yamada return ret; 296128ba53c0SMasahiro Yamada } 296228ba53c0SMasahiro Yamada 296328ba53c0SMasahiro Yamada /* 296428ba53c0SMasahiro Yamada * Cursor structure used by the normalizer. 296528ba53c0SMasahiro Yamada */ 296628ba53c0SMasahiro Yamada struct utf8cursor { 296728ba53c0SMasahiro Yamada struct tree *tree; 296828ba53c0SMasahiro Yamada const char *s; 296928ba53c0SMasahiro Yamada const char *p; 297028ba53c0SMasahiro Yamada const char *ss; 297128ba53c0SMasahiro Yamada const char *sp; 297228ba53c0SMasahiro Yamada unsigned int len; 297328ba53c0SMasahiro Yamada unsigned int slen; 297428ba53c0SMasahiro Yamada short int ccc; 297528ba53c0SMasahiro Yamada short int nccc; 297628ba53c0SMasahiro Yamada unsigned int unichar; 297728ba53c0SMasahiro Yamada unsigned char hangul[UTF8HANGULLEAF]; 297828ba53c0SMasahiro Yamada }; 297928ba53c0SMasahiro Yamada 298028ba53c0SMasahiro Yamada /* 298128ba53c0SMasahiro Yamada * Set up an utf8cursor for use by utf8byte(). 298228ba53c0SMasahiro Yamada * 298328ba53c0SMasahiro Yamada * s : string. 298428ba53c0SMasahiro Yamada * len : length of s. 298528ba53c0SMasahiro Yamada * u8c : pointer to cursor. 298628ba53c0SMasahiro Yamada * trie : utf8trie_t to use for normalization. 298728ba53c0SMasahiro Yamada * 298828ba53c0SMasahiro Yamada * Returns -1 on error, 0 on success. 298928ba53c0SMasahiro Yamada */ 299028ba53c0SMasahiro Yamada int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s, 299128ba53c0SMasahiro Yamada size_t len) 299228ba53c0SMasahiro Yamada { 299328ba53c0SMasahiro Yamada if (!tree) 299428ba53c0SMasahiro Yamada return -1; 299528ba53c0SMasahiro Yamada if (!s) 299628ba53c0SMasahiro Yamada return -1; 299728ba53c0SMasahiro Yamada u8c->tree = tree; 299828ba53c0SMasahiro Yamada u8c->s = s; 299928ba53c0SMasahiro Yamada u8c->p = NULL; 300028ba53c0SMasahiro Yamada u8c->ss = NULL; 300128ba53c0SMasahiro Yamada u8c->sp = NULL; 300228ba53c0SMasahiro Yamada u8c->len = len; 300328ba53c0SMasahiro Yamada u8c->slen = 0; 300428ba53c0SMasahiro Yamada u8c->ccc = STOPPER; 300528ba53c0SMasahiro Yamada u8c->nccc = STOPPER; 300628ba53c0SMasahiro Yamada u8c->unichar = 0; 300728ba53c0SMasahiro Yamada /* Check we didn't clobber the maximum length. */ 300828ba53c0SMasahiro Yamada if (u8c->len != len) 300928ba53c0SMasahiro Yamada return -1; 301028ba53c0SMasahiro Yamada /* The first byte of s may not be an utf8 continuation. */ 301128ba53c0SMasahiro Yamada if (len > 0 && (*s & 0xC0) == 0x80) 301228ba53c0SMasahiro Yamada return -1; 301328ba53c0SMasahiro Yamada return 0; 301428ba53c0SMasahiro Yamada } 301528ba53c0SMasahiro Yamada 301628ba53c0SMasahiro Yamada /* 301728ba53c0SMasahiro Yamada * Set up an utf8cursor for use by utf8byte(). 301828ba53c0SMasahiro Yamada * 301928ba53c0SMasahiro Yamada * s : NUL-terminated string. 302028ba53c0SMasahiro Yamada * u8c : pointer to cursor. 302128ba53c0SMasahiro Yamada * trie : utf8trie_t to use for normalization. 302228ba53c0SMasahiro Yamada * 302328ba53c0SMasahiro Yamada * Returns -1 on error, 0 on success. 302428ba53c0SMasahiro Yamada */ 302528ba53c0SMasahiro Yamada int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s) 302628ba53c0SMasahiro Yamada { 302728ba53c0SMasahiro Yamada return utf8ncursor(u8c, tree, s, (unsigned int)-1); 302828ba53c0SMasahiro Yamada } 302928ba53c0SMasahiro Yamada 303028ba53c0SMasahiro Yamada /* 303128ba53c0SMasahiro Yamada * Get one byte from the normalized form of the string described by u8c. 303228ba53c0SMasahiro Yamada * 303328ba53c0SMasahiro Yamada * Returns the byte cast to an unsigned char on succes, and -1 on failure. 303428ba53c0SMasahiro Yamada * 303528ba53c0SMasahiro Yamada * The cursor keeps track of the location in the string in u8c->s. 303628ba53c0SMasahiro Yamada * When a character is decomposed, the current location is stored in 303728ba53c0SMasahiro Yamada * u8c->p, and u8c->s is set to the start of the decomposition. Note 303828ba53c0SMasahiro Yamada * that bytes from a decomposition do not count against u8c->len. 303928ba53c0SMasahiro Yamada * 304028ba53c0SMasahiro Yamada * Characters are emitted if they match the current CCC in u8c->ccc. 304128ba53c0SMasahiro Yamada * Hitting end-of-string while u8c->ccc == STOPPER means we're done, 304228ba53c0SMasahiro Yamada * and the function returns 0 in that case. 304328ba53c0SMasahiro Yamada * 304428ba53c0SMasahiro Yamada * Sorting by CCC is done by repeatedly scanning the string. The 304528ba53c0SMasahiro Yamada * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at 304628ba53c0SMasahiro Yamada * the start of the scan. The first pass finds the lowest CCC to be 304728ba53c0SMasahiro Yamada * emitted and stores it in u8c->nccc, the second pass emits the 304828ba53c0SMasahiro Yamada * characters with this CCC and finds the next lowest CCC. This limits 304928ba53c0SMasahiro Yamada * the number of passes to 1 + the number of different CCCs in the 305028ba53c0SMasahiro Yamada * sequence being scanned. 305128ba53c0SMasahiro Yamada * 305228ba53c0SMasahiro Yamada * Therefore: 305328ba53c0SMasahiro Yamada * u8c->p != NULL -> a decomposition is being scanned. 305428ba53c0SMasahiro Yamada * u8c->ss != NULL -> this is a repeating scan. 305528ba53c0SMasahiro Yamada * u8c->ccc == -1 -> this is the first scan of a repeating scan. 305628ba53c0SMasahiro Yamada */ 305728ba53c0SMasahiro Yamada int utf8byte(struct utf8cursor *u8c) 305828ba53c0SMasahiro Yamada { 305928ba53c0SMasahiro Yamada utf8leaf_t *leaf; 306028ba53c0SMasahiro Yamada int ccc; 306128ba53c0SMasahiro Yamada 306228ba53c0SMasahiro Yamada for (;;) { 306328ba53c0SMasahiro Yamada /* Check for the end of a decomposed character. */ 306428ba53c0SMasahiro Yamada if (u8c->p && *u8c->s == '\0') { 306528ba53c0SMasahiro Yamada u8c->s = u8c->p; 306628ba53c0SMasahiro Yamada u8c->p = NULL; 306728ba53c0SMasahiro Yamada } 306828ba53c0SMasahiro Yamada 306928ba53c0SMasahiro Yamada /* Check for end-of-string. */ 307028ba53c0SMasahiro Yamada if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) { 307128ba53c0SMasahiro Yamada /* There is no next byte. */ 307228ba53c0SMasahiro Yamada if (u8c->ccc == STOPPER) 307328ba53c0SMasahiro Yamada return 0; 307428ba53c0SMasahiro Yamada /* End-of-string during a scan counts as a stopper. */ 307528ba53c0SMasahiro Yamada ccc = STOPPER; 307628ba53c0SMasahiro Yamada goto ccc_mismatch; 307728ba53c0SMasahiro Yamada } else if ((*u8c->s & 0xC0) == 0x80) { 307828ba53c0SMasahiro Yamada /* This is a continuation of the current character. */ 307928ba53c0SMasahiro Yamada if (!u8c->p) 308028ba53c0SMasahiro Yamada u8c->len--; 308128ba53c0SMasahiro Yamada return (unsigned char)*u8c->s++; 308228ba53c0SMasahiro Yamada } 308328ba53c0SMasahiro Yamada 308428ba53c0SMasahiro Yamada /* Look up the data for the current character. */ 308528ba53c0SMasahiro Yamada if (u8c->p) { 308628ba53c0SMasahiro Yamada leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); 308728ba53c0SMasahiro Yamada } else { 308828ba53c0SMasahiro Yamada leaf = utf8nlookup(u8c->tree, u8c->hangul, 308928ba53c0SMasahiro Yamada u8c->s, u8c->len); 309028ba53c0SMasahiro Yamada } 309128ba53c0SMasahiro Yamada 309228ba53c0SMasahiro Yamada /* No leaf found implies that the input is a binary blob. */ 309328ba53c0SMasahiro Yamada if (!leaf) 309428ba53c0SMasahiro Yamada return -1; 309528ba53c0SMasahiro Yamada 309628ba53c0SMasahiro Yamada /* Characters that are too new have CCC 0. */ 309728ba53c0SMasahiro Yamada if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) { 309828ba53c0SMasahiro Yamada ccc = STOPPER; 309928ba53c0SMasahiro Yamada } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) { 310028ba53c0SMasahiro Yamada u8c->len -= utf8clen(u8c->s); 310128ba53c0SMasahiro Yamada u8c->p = u8c->s + utf8clen(u8c->s); 310228ba53c0SMasahiro Yamada u8c->s = LEAF_STR(leaf); 310328ba53c0SMasahiro Yamada /* Empty decomposition implies CCC 0. */ 310428ba53c0SMasahiro Yamada if (*u8c->s == '\0') { 310528ba53c0SMasahiro Yamada if (u8c->ccc == STOPPER) 310628ba53c0SMasahiro Yamada continue; 310728ba53c0SMasahiro Yamada ccc = STOPPER; 310828ba53c0SMasahiro Yamada goto ccc_mismatch; 310928ba53c0SMasahiro Yamada } 311028ba53c0SMasahiro Yamada leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s); 311128ba53c0SMasahiro Yamada ccc = LEAF_CCC(leaf); 311228ba53c0SMasahiro Yamada } 311328ba53c0SMasahiro Yamada u8c->unichar = utf8decode(u8c->s); 311428ba53c0SMasahiro Yamada 311528ba53c0SMasahiro Yamada /* 311628ba53c0SMasahiro Yamada * If this is not a stopper, then see if it updates 311728ba53c0SMasahiro Yamada * the next canonical class to be emitted. 311828ba53c0SMasahiro Yamada */ 311928ba53c0SMasahiro Yamada if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc) 312028ba53c0SMasahiro Yamada u8c->nccc = ccc; 312128ba53c0SMasahiro Yamada 312228ba53c0SMasahiro Yamada /* 312328ba53c0SMasahiro Yamada * Return the current byte if this is the current 312428ba53c0SMasahiro Yamada * combining class. 312528ba53c0SMasahiro Yamada */ 312628ba53c0SMasahiro Yamada if (ccc == u8c->ccc) { 312728ba53c0SMasahiro Yamada if (!u8c->p) 312828ba53c0SMasahiro Yamada u8c->len--; 312928ba53c0SMasahiro Yamada return (unsigned char)*u8c->s++; 313028ba53c0SMasahiro Yamada } 313128ba53c0SMasahiro Yamada 313228ba53c0SMasahiro Yamada /* Current combining class mismatch. */ 313328ba53c0SMasahiro Yamada ccc_mismatch: 313428ba53c0SMasahiro Yamada if (u8c->nccc == STOPPER) { 313528ba53c0SMasahiro Yamada /* 313628ba53c0SMasahiro Yamada * Scan forward for the first canonical class 313728ba53c0SMasahiro Yamada * to be emitted. Save the position from 313828ba53c0SMasahiro Yamada * which to restart. 313928ba53c0SMasahiro Yamada */ 314028ba53c0SMasahiro Yamada assert(u8c->ccc == STOPPER); 314128ba53c0SMasahiro Yamada u8c->ccc = MINCCC - 1; 314228ba53c0SMasahiro Yamada u8c->nccc = ccc; 314328ba53c0SMasahiro Yamada u8c->sp = u8c->p; 314428ba53c0SMasahiro Yamada u8c->ss = u8c->s; 314528ba53c0SMasahiro Yamada u8c->slen = u8c->len; 314628ba53c0SMasahiro Yamada if (!u8c->p) 314728ba53c0SMasahiro Yamada u8c->len -= utf8clen(u8c->s); 314828ba53c0SMasahiro Yamada u8c->s += utf8clen(u8c->s); 314928ba53c0SMasahiro Yamada } else if (ccc != STOPPER) { 315028ba53c0SMasahiro Yamada /* Not a stopper, and not the ccc we're emitting. */ 315128ba53c0SMasahiro Yamada if (!u8c->p) 315228ba53c0SMasahiro Yamada u8c->len -= utf8clen(u8c->s); 315328ba53c0SMasahiro Yamada u8c->s += utf8clen(u8c->s); 315428ba53c0SMasahiro Yamada } else if (u8c->nccc != MAXCCC + 1) { 315528ba53c0SMasahiro Yamada /* At a stopper, restart for next ccc. */ 315628ba53c0SMasahiro Yamada u8c->ccc = u8c->nccc; 315728ba53c0SMasahiro Yamada u8c->nccc = MAXCCC + 1; 315828ba53c0SMasahiro Yamada u8c->s = u8c->ss; 315928ba53c0SMasahiro Yamada u8c->p = u8c->sp; 316028ba53c0SMasahiro Yamada u8c->len = u8c->slen; 316128ba53c0SMasahiro Yamada } else { 316228ba53c0SMasahiro Yamada /* All done, proceed from here. */ 316328ba53c0SMasahiro Yamada u8c->ccc = STOPPER; 316428ba53c0SMasahiro Yamada u8c->nccc = STOPPER; 316528ba53c0SMasahiro Yamada u8c->sp = NULL; 316628ba53c0SMasahiro Yamada u8c->ss = NULL; 316728ba53c0SMasahiro Yamada u8c->slen = 0; 316828ba53c0SMasahiro Yamada } 316928ba53c0SMasahiro Yamada } 317028ba53c0SMasahiro Yamada } 317128ba53c0SMasahiro Yamada 317228ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 317328ba53c0SMasahiro Yamada 317428ba53c0SMasahiro Yamada static int normalize_line(struct tree *tree) 317528ba53c0SMasahiro Yamada { 317628ba53c0SMasahiro Yamada char *s; 317728ba53c0SMasahiro Yamada char *t; 317828ba53c0SMasahiro Yamada int c; 317928ba53c0SMasahiro Yamada struct utf8cursor u8c; 318028ba53c0SMasahiro Yamada 318128ba53c0SMasahiro Yamada /* First test: null-terminated string. */ 318228ba53c0SMasahiro Yamada s = buf2; 318328ba53c0SMasahiro Yamada t = buf3; 318428ba53c0SMasahiro Yamada if (utf8cursor(&u8c, tree, s)) 318528ba53c0SMasahiro Yamada return -1; 318628ba53c0SMasahiro Yamada while ((c = utf8byte(&u8c)) > 0) 318728ba53c0SMasahiro Yamada if (c != (unsigned char)*t++) 318828ba53c0SMasahiro Yamada return -1; 318928ba53c0SMasahiro Yamada if (c < 0) 319028ba53c0SMasahiro Yamada return -1; 319128ba53c0SMasahiro Yamada if (*t != 0) 319228ba53c0SMasahiro Yamada return -1; 319328ba53c0SMasahiro Yamada 319428ba53c0SMasahiro Yamada /* Second test: length-limited string. */ 319528ba53c0SMasahiro Yamada s = buf2; 319628ba53c0SMasahiro Yamada /* Replace NUL with a value that will cause an error if seen. */ 319728ba53c0SMasahiro Yamada s[strlen(s) + 1] = -1; 319828ba53c0SMasahiro Yamada t = buf3; 319928ba53c0SMasahiro Yamada if (utf8cursor(&u8c, tree, s)) 320028ba53c0SMasahiro Yamada return -1; 320128ba53c0SMasahiro Yamada while ((c = utf8byte(&u8c)) > 0) 320228ba53c0SMasahiro Yamada if (c != (unsigned char)*t++) 320328ba53c0SMasahiro Yamada return -1; 320428ba53c0SMasahiro Yamada if (c < 0) 320528ba53c0SMasahiro Yamada return -1; 320628ba53c0SMasahiro Yamada if (*t != 0) 320728ba53c0SMasahiro Yamada return -1; 320828ba53c0SMasahiro Yamada 320928ba53c0SMasahiro Yamada return 0; 321028ba53c0SMasahiro Yamada } 321128ba53c0SMasahiro Yamada 321228ba53c0SMasahiro Yamada static void normalization_test(void) 321328ba53c0SMasahiro Yamada { 321428ba53c0SMasahiro Yamada FILE *file; 321528ba53c0SMasahiro Yamada unsigned int unichar; 321628ba53c0SMasahiro Yamada struct unicode_data *data; 321728ba53c0SMasahiro Yamada char *s; 321828ba53c0SMasahiro Yamada char *t; 321928ba53c0SMasahiro Yamada int ret; 322028ba53c0SMasahiro Yamada int ignorables; 322128ba53c0SMasahiro Yamada int tests = 0; 322228ba53c0SMasahiro Yamada int failures = 0; 322328ba53c0SMasahiro Yamada 322428ba53c0SMasahiro Yamada if (verbose > 0) 322528ba53c0SMasahiro Yamada printf("Parsing %s\n", test_name); 322628ba53c0SMasahiro Yamada /* Step one, read data from file. */ 322728ba53c0SMasahiro Yamada file = fopen(test_name, "r"); 322828ba53c0SMasahiro Yamada if (!file) 322928ba53c0SMasahiro Yamada open_fail(test_name, errno); 323028ba53c0SMasahiro Yamada 323128ba53c0SMasahiro Yamada while (fgets(line, LINESIZE, file)) { 323228ba53c0SMasahiro Yamada ret = sscanf(line, "%[^;];%*[^;];%[^;];%*[^;];%*[^;];", 323328ba53c0SMasahiro Yamada buf0, buf1); 323428ba53c0SMasahiro Yamada if (ret != 2 || *line == '#') 323528ba53c0SMasahiro Yamada continue; 323628ba53c0SMasahiro Yamada s = buf0; 323728ba53c0SMasahiro Yamada t = buf2; 323828ba53c0SMasahiro Yamada while (*s) { 323928ba53c0SMasahiro Yamada unichar = strtoul(s, &s, 16); 324028ba53c0SMasahiro Yamada t += utf8encode(t, unichar); 324128ba53c0SMasahiro Yamada } 324228ba53c0SMasahiro Yamada *t = '\0'; 324328ba53c0SMasahiro Yamada 324428ba53c0SMasahiro Yamada ignorables = 0; 324528ba53c0SMasahiro Yamada s = buf1; 324628ba53c0SMasahiro Yamada t = buf3; 324728ba53c0SMasahiro Yamada while (*s) { 324828ba53c0SMasahiro Yamada unichar = strtoul(s, &s, 16); 324928ba53c0SMasahiro Yamada data = &unicode_data[unichar]; 325028ba53c0SMasahiro Yamada if (data->utf8nfdi && !*data->utf8nfdi) 325128ba53c0SMasahiro Yamada ignorables = 1; 325228ba53c0SMasahiro Yamada else 325328ba53c0SMasahiro Yamada t += utf8encode(t, unichar); 325428ba53c0SMasahiro Yamada } 325528ba53c0SMasahiro Yamada *t = '\0'; 325628ba53c0SMasahiro Yamada 325728ba53c0SMasahiro Yamada tests++; 325828ba53c0SMasahiro Yamada if (normalize_line(nfdi_tree) < 0) { 325928ba53c0SMasahiro Yamada printf("Line %s -> %s", buf0, buf1); 326028ba53c0SMasahiro Yamada if (ignorables) 326128ba53c0SMasahiro Yamada printf(" (ignorables removed)"); 326228ba53c0SMasahiro Yamada printf(" failure\n"); 326328ba53c0SMasahiro Yamada failures++; 326428ba53c0SMasahiro Yamada } 326528ba53c0SMasahiro Yamada } 326628ba53c0SMasahiro Yamada fclose(file); 326728ba53c0SMasahiro Yamada if (verbose > 0) 326828ba53c0SMasahiro Yamada printf("Ran %d tests with %d failures\n", tests, failures); 326928ba53c0SMasahiro Yamada if (failures) 327028ba53c0SMasahiro Yamada file_fail(test_name); 327128ba53c0SMasahiro Yamada } 327228ba53c0SMasahiro Yamada 327328ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 327428ba53c0SMasahiro Yamada 327528ba53c0SMasahiro Yamada static void write_file(void) 327628ba53c0SMasahiro Yamada { 327728ba53c0SMasahiro Yamada FILE *file; 327828ba53c0SMasahiro Yamada int i; 327928ba53c0SMasahiro Yamada int j; 328028ba53c0SMasahiro Yamada int t; 328128ba53c0SMasahiro Yamada int gen; 328228ba53c0SMasahiro Yamada 328328ba53c0SMasahiro Yamada if (verbose > 0) 328428ba53c0SMasahiro Yamada printf("Writing %s\n", utf8_name); 328528ba53c0SMasahiro Yamada file = fopen(utf8_name, "w"); 328628ba53c0SMasahiro Yamada if (!file) 328728ba53c0SMasahiro Yamada open_fail(utf8_name, errno); 328828ba53c0SMasahiro Yamada 328928ba53c0SMasahiro Yamada fprintf(file, "/* This file is generated code, do not edit. */\n"); 329028ba53c0SMasahiro Yamada fprintf(file, "\n"); 32912b3d0478SChristoph Hellwig fprintf(file, "#include <linux/module.h>\n"); 32922b3d0478SChristoph Hellwig fprintf(file, "#include <linux/kernel.h>\n"); 32932b3d0478SChristoph Hellwig fprintf(file, "#include \"utf8n.h\"\n"); 329428ba53c0SMasahiro Yamada fprintf(file, "\n"); 329528ba53c0SMasahiro Yamada fprintf(file, "static const unsigned int utf8agetab[] = {\n"); 329628ba53c0SMasahiro Yamada for (i = 0; i != ages_count; i++) 329728ba53c0SMasahiro Yamada fprintf(file, "\t%#x%s\n", ages[i], 329828ba53c0SMasahiro Yamada ages[i] == unicode_maxage ? "" : ","); 329928ba53c0SMasahiro Yamada fprintf(file, "};\n"); 330028ba53c0SMasahiro Yamada fprintf(file, "\n"); 330128ba53c0SMasahiro Yamada fprintf(file, "static const struct utf8data utf8nfdicfdata[] = {\n"); 330228ba53c0SMasahiro Yamada t = 0; 330328ba53c0SMasahiro Yamada for (gen = 0; gen < ages_count; gen++) { 330428ba53c0SMasahiro Yamada fprintf(file, "\t{ %#x, %d }%s\n", 330528ba53c0SMasahiro Yamada ages[gen], trees[t].index, 330628ba53c0SMasahiro Yamada ages[gen] == unicode_maxage ? "" : ","); 330728ba53c0SMasahiro Yamada if (trees[t].maxage == ages[gen]) 330828ba53c0SMasahiro Yamada t += 2; 330928ba53c0SMasahiro Yamada } 331028ba53c0SMasahiro Yamada fprintf(file, "};\n"); 331128ba53c0SMasahiro Yamada fprintf(file, "\n"); 331228ba53c0SMasahiro Yamada fprintf(file, "static const struct utf8data utf8nfdidata[] = {\n"); 331328ba53c0SMasahiro Yamada t = 1; 331428ba53c0SMasahiro Yamada for (gen = 0; gen < ages_count; gen++) { 331528ba53c0SMasahiro Yamada fprintf(file, "\t{ %#x, %d }%s\n", 331628ba53c0SMasahiro Yamada ages[gen], trees[t].index, 331728ba53c0SMasahiro Yamada ages[gen] == unicode_maxage ? "" : ","); 331828ba53c0SMasahiro Yamada if (trees[t].maxage == ages[gen]) 331928ba53c0SMasahiro Yamada t += 2; 332028ba53c0SMasahiro Yamada } 332128ba53c0SMasahiro Yamada fprintf(file, "};\n"); 332228ba53c0SMasahiro Yamada fprintf(file, "\n"); 332328ba53c0SMasahiro Yamada fprintf(file, "static const unsigned char utf8data[%zd] = {\n", 332428ba53c0SMasahiro Yamada utf8data_size); 332528ba53c0SMasahiro Yamada t = 0; 332628ba53c0SMasahiro Yamada for (i = 0; i != utf8data_size; i += 16) { 332728ba53c0SMasahiro Yamada if (i == trees[t].index) { 332828ba53c0SMasahiro Yamada fprintf(file, "\t/* %s_%x */\n", 332928ba53c0SMasahiro Yamada trees[t].type, trees[t].maxage); 333028ba53c0SMasahiro Yamada if (t < trees_count-1) 333128ba53c0SMasahiro Yamada t++; 333228ba53c0SMasahiro Yamada } 333328ba53c0SMasahiro Yamada fprintf(file, "\t"); 333428ba53c0SMasahiro Yamada for (j = i; j != i + 16; j++) 333528ba53c0SMasahiro Yamada fprintf(file, "0x%.2x%s", utf8data[j], 333628ba53c0SMasahiro Yamada (j < utf8data_size -1 ? "," : "")); 333728ba53c0SMasahiro Yamada fprintf(file, "\n"); 333828ba53c0SMasahiro Yamada } 333928ba53c0SMasahiro Yamada fprintf(file, "};\n"); 33402b3d0478SChristoph Hellwig fprintf(file, "\n"); 33412b3d0478SChristoph Hellwig fprintf(file, "struct utf8data_table utf8_data_table = {\n"); 33422b3d0478SChristoph Hellwig fprintf(file, "\t.utf8agetab = utf8agetab,\n"); 33432b3d0478SChristoph Hellwig fprintf(file, "\t.utf8agetab_size = ARRAY_SIZE(utf8agetab),\n"); 33442b3d0478SChristoph Hellwig fprintf(file, "\n"); 33452b3d0478SChristoph Hellwig fprintf(file, "\t.utf8nfdicfdata = utf8nfdicfdata,\n"); 33462b3d0478SChristoph Hellwig fprintf(file, "\t.utf8nfdicfdata_size = ARRAY_SIZE(utf8nfdicfdata),\n"); 33472b3d0478SChristoph Hellwig fprintf(file, "\n"); 33482b3d0478SChristoph Hellwig fprintf(file, "\t.utf8nfdidata = utf8nfdidata,\n"); 33492b3d0478SChristoph Hellwig fprintf(file, "\t.utf8nfdidata_size = ARRAY_SIZE(utf8nfdidata),\n"); 33502b3d0478SChristoph Hellwig fprintf(file, "\n"); 33512b3d0478SChristoph Hellwig fprintf(file, "\t.utf8data = utf8data,\n"); 33522b3d0478SChristoph Hellwig fprintf(file, "};\n"); 33532b3d0478SChristoph Hellwig fprintf(file, "EXPORT_SYMBOL_GPL(utf8_data_table);"); 33542b3d0478SChristoph Hellwig fprintf(file, "\n"); 3355*68318904SJeff Johnson fprintf(file, "MODULE_DESCRIPTION(\"UTF8 data table\");\n"); 33562b3d0478SChristoph Hellwig fprintf(file, "MODULE_LICENSE(\"GPL v2\");\n"); 335728ba53c0SMasahiro Yamada fclose(file); 335828ba53c0SMasahiro Yamada } 335928ba53c0SMasahiro Yamada 336028ba53c0SMasahiro Yamada /* ------------------------------------------------------------------ */ 336128ba53c0SMasahiro Yamada 336228ba53c0SMasahiro Yamada int main(int argc, char *argv[]) 336328ba53c0SMasahiro Yamada { 336428ba53c0SMasahiro Yamada unsigned int unichar; 336528ba53c0SMasahiro Yamada int opt; 336628ba53c0SMasahiro Yamada 336728ba53c0SMasahiro Yamada argv0 = argv[0]; 336828ba53c0SMasahiro Yamada 336928ba53c0SMasahiro Yamada while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) { 337028ba53c0SMasahiro Yamada switch (opt) { 337128ba53c0SMasahiro Yamada case 'a': 337228ba53c0SMasahiro Yamada age_name = optarg; 337328ba53c0SMasahiro Yamada break; 337428ba53c0SMasahiro Yamada case 'c': 337528ba53c0SMasahiro Yamada ccc_name = optarg; 337628ba53c0SMasahiro Yamada break; 337728ba53c0SMasahiro Yamada case 'd': 337828ba53c0SMasahiro Yamada data_name = optarg; 337928ba53c0SMasahiro Yamada break; 338028ba53c0SMasahiro Yamada case 'f': 338128ba53c0SMasahiro Yamada fold_name = optarg; 338228ba53c0SMasahiro Yamada break; 338328ba53c0SMasahiro Yamada case 'n': 338428ba53c0SMasahiro Yamada norm_name = optarg; 338528ba53c0SMasahiro Yamada break; 338628ba53c0SMasahiro Yamada case 'o': 338728ba53c0SMasahiro Yamada utf8_name = optarg; 338828ba53c0SMasahiro Yamada break; 338928ba53c0SMasahiro Yamada case 'p': 339028ba53c0SMasahiro Yamada prop_name = optarg; 339128ba53c0SMasahiro Yamada break; 339228ba53c0SMasahiro Yamada case 't': 339328ba53c0SMasahiro Yamada test_name = optarg; 339428ba53c0SMasahiro Yamada break; 339528ba53c0SMasahiro Yamada case 'v': 339628ba53c0SMasahiro Yamada verbose++; 339728ba53c0SMasahiro Yamada break; 339828ba53c0SMasahiro Yamada case 'h': 339928ba53c0SMasahiro Yamada help(); 340028ba53c0SMasahiro Yamada exit(0); 340128ba53c0SMasahiro Yamada default: 340228ba53c0SMasahiro Yamada usage(); 340328ba53c0SMasahiro Yamada } 340428ba53c0SMasahiro Yamada } 340528ba53c0SMasahiro Yamada 340628ba53c0SMasahiro Yamada if (verbose > 1) 340728ba53c0SMasahiro Yamada help(); 340828ba53c0SMasahiro Yamada for (unichar = 0; unichar != 0x110000; unichar++) 340928ba53c0SMasahiro Yamada unicode_data[unichar].code = unichar; 341028ba53c0SMasahiro Yamada age_init(); 341128ba53c0SMasahiro Yamada ccc_init(); 341228ba53c0SMasahiro Yamada nfdi_init(); 341328ba53c0SMasahiro Yamada nfdicf_init(); 341428ba53c0SMasahiro Yamada ignore_init(); 341528ba53c0SMasahiro Yamada corrections_init(); 341628ba53c0SMasahiro Yamada hangul_decompose(); 341728ba53c0SMasahiro Yamada nfdi_decompose(); 341828ba53c0SMasahiro Yamada nfdicf_decompose(); 341928ba53c0SMasahiro Yamada utf8_init(); 342028ba53c0SMasahiro Yamada trees_init(); 342128ba53c0SMasahiro Yamada trees_populate(); 342228ba53c0SMasahiro Yamada trees_reduce(); 342328ba53c0SMasahiro Yamada trees_verify(); 342428ba53c0SMasahiro Yamada /* Prevent "unused function" warning. */ 342528ba53c0SMasahiro Yamada (void)lookup(nfdi_tree, " "); 342628ba53c0SMasahiro Yamada if (verbose > 2) 342728ba53c0SMasahiro Yamada tree_walk(nfdi_tree); 342828ba53c0SMasahiro Yamada if (verbose > 2) 342928ba53c0SMasahiro Yamada tree_walk(nfdicf_tree); 343028ba53c0SMasahiro Yamada normalization_test(); 343128ba53c0SMasahiro Yamada write_file(); 343228ba53c0SMasahiro Yamada 343328ba53c0SMasahiro Yamada return 0; 343428ba53c0SMasahiro Yamada } 3435