/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	COMMON_DEFS_H
#define	COMMON_DEFS_H


/* NOTE(review): the purpose of this value is not visible in this header. */
#define	MAGIC_NUMBER			201513


/*
 * ISO/IEC 10646-1/Unicode Byte Order Mark
 *
 * ICV_BOM_IN_LITTLE_ENDIAN is the byte-swapped BOM as it appears in a
 * 16-bit unit when one of the UCS-2/UTF-16 macros is defined, and as it
 * appears in a 32-bit unit otherwise.
 */
#define	ICV_BOM_IN_BIG_ENDIAN		0x00feff
#define	ICV_BOM_IN_LITTLE_ENDIAN_UCS4	0xfffe0000
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_BOM_IN_LITTLE_ENDIAN	0x00fffe
#else
#define	ICV_BOM_IN_LITTLE_ENDIAN	0xfffe0000
#endif


/*
 * Following type macros are for possible error cases that can be defined for
 * mapping tables. Valid characters will have the byte length which will be
 * always a positive integer.
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

/* Following are replacement characters for non-identical character cases. */
#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)


/*
 * Local boolean type.
 *
 * NOTE: the enumerators are named `false' and `true', which are macros in
 * <stdbool.h>; that header must not be included before this one.
 */
typedef enum { false = 0, true = 1 } boolean;


/*
 * Mapping table components.
 *
 * We only support characters in range of UTF-16.
 *
 * NOTE(review): `u8' presumably carries the UTF-8 byte sequence packed into
 * an integer, `size' its byte length or one of the negative ICV_TYPE_*
 * values, and `sb' the single byte counterpart -- confirm against the
 * mapping tables that use these types.
 */
typedef struct {
	unsigned int	u8;
	signed char	size;
} to_utf8_table_component_t;

typedef struct {
	unsigned int	u8;
	unsigned char	sb;
} to_sb_table_component_t;


/* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
} ucs_state_t;

/* A pair of UCS states, one for the input side and one for the output side. */
typedef struct {
	ucs_state_t	input;
	ucs_state_t	output;
} ucs_ucs_state_t;


/* UTF-7 requires additional state data fields. */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
	boolean		in_the_middle_of_utf7_sequence;
	unsigned int	remnant;
	signed char	remnant_count;	/* in bits */
	unsigned char	prevch;
} utf7_state_t;


/*
 * Following vector shows the number of bytes in a UTF-8 character.
 * Index will be the first byte of the character.
 *
 * Bytes that cannot start a character (0x80-0xBF, 0xC0, 0xC1 and 0xF5-0xFF)
 * are marked with ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly `signed char': the table stores the
 * negative ICV_TYPE_ILLEGAL_CHAR value, and the signedness of plain `char'
 * is implementation-defined. With an unsigned `char' the marker would be
 * stored as 254 and comparisons against ICV_TYPE_ILLEGAL_CHAR would never
 * match.
 */

#define	IL_				ICV_TYPE_ILLEGAL_CHAR

static const signed char number_of_bytes_in_utf8_char[0x100] = {
	/* 00 - 7F: single byte characters */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

	/* 80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* 90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
	IL_, IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

	/* F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
	4,  4,  4,  4,  4,  IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
};

#undef IL_

/*
 * Following is a vector of bit-masks to get used bits in the first byte of
 * a UTF-8 character. Index is the number of bytes in the UTF-8 character
 * and the index value comes from above table.
 */
static const char masks_tbl[7] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/*
 * The following two vectors are to provide valid minimum and
 * maximum values for the second byte of a multibyte UTF-8 character for
 * better illegal sequence checking. The index value must be the value of
 * the first byte of the UTF-8 character.
 *
 * Entries for bytes that do not start a multibyte character are 0. The
 * narrowed ranges for E0, ED, F0 and F4 reject overlong forms, UTF-16
 * surrogates and values above U+10FFFF.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* C0  C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* C8  C9    CA    CB    CC    CD    CE    CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D0  D1    D2    D3    D4    D5    D6    D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D8  D9    DA    DB    DC    DD    DE    DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E0  E1    E2    E3    E4    E5    E6    E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E8  E9    EA    EB    EC    ED    EE    EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* F0  F1    F2    F3    F4    F5    F6    F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* F8  F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};

static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* C0  C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* C8  C9    CA    CB    CC    CD    CE    CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D0  D1    D2    D3    D4    D5    D6    D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D8  D9    DA    DB    DC    DD    DE    DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E0  E1    E2    E3    E4    E5    E6    E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E8  E9    EA    EB    EC    ED    EE    EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* F0  F1    F2    F3    F4    F5    F6    F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* F8  F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};


/*
 * Following "6" and "0x3f" came from 10xx xxxx bit representation of UTF-8
 * characters' second to sixth bytes.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f
#define	ICV_FETCH_UTF8_BOM_SIZE		6

/*
 * Fetch sizes in bytes. ICV_FETCH_UCS_SIZE and ICV_FETCH_UCS_SIZE_TWO are
 * defined only when one of the UCS-2/UTF-16 or UCS-4/UTF-32 macros below is
 * defined by the including source file.
 */
#define	ICV_FETCH_UCS4_SIZE		4
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif

/*
 * UTF-8 representations of some useful Unicode values.
 *
 * The U+FFFE in UTF-8 is 0x00efbfbe and the U+FFFF is 0x00efbfbf, but the
 * _fffe and _ffff values below are those representations already masked
 * with ICV_UTF8_REPRESENTATION_ffff_mask; a value must be masked the same
 * way before it is compared against them.
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fdd0		(0x00efb790UL)
#define	ICV_UTF8_REPRESENTATION_fdef		(0x00efb7afUL)

#define	ICV_UTF8_REPRESENTATION_fffe		(0x000fbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x000fbfbfUL)
#define	ICV_UTF8_REPRESENTATION_ffff_mask	(0x000fffffUL)

#define	ICV_UTF8_REPRESENTATION_10fffd		(0xf48fbfbdUL)

/*
 * UTF-32 and UCS-4 representations of some useful Unicode values for
 * non-character and out of bound invalid character detection.
 */
#define	ICV_UTF32_NONCHAR_fffe			(0xfffeU)
#define	ICV_UTF32_NONCHAR_ffff			(0xffffU)
#define	ICV_UTF32_NONCHAR_mask			(0xffffU)

#define	ICV_UTF32_SURROGATE_START_d800		(0xd800U)
#define	ICV_UTF32_SURROGATE_END_dfff		(0xdfffU)

#define	ICV_UTF32_ARABIC_NONCHAR_START_fdd0	(0xfdd0U)
#define	ICV_UTF32_ARABIC_NONCHAR_END_fdef	(0xfdefU)

#define	ICV_UTF32_LAST_VALID_CHAR		(0x10fffdU)

#define	ICV_UCS4_LAST_VALID_CHAR		(0x7fffffff)


#endif	/* COMMON_DEFS_H */