/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#ifndef	COMMON_DEFS_H
#define	COMMON_DEFS_H

#include <sys/types.h>

/*
 * Classification codes for characters that cannot be converted as-is,
 * followed by the replacement characters used for the non-identical
 * character cases.
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)

/* Short alias used only to keep the table below readable; undefined after. */
#define	IL_				ICV_TYPE_ILLEGAL_CHAR

typedef enum { false = 0, true = 1 } boolean;

/*
 * Number of bytes in a UTF-8 character, indexed by the value of its first
 * byte.  Bytes that cannot start a character hold ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly "signed char": ICV_TYPE_ILLEGAL_CHAR is
 * negative, and the signedness of plain "char" is implementation-defined.
 * With an unsigned plain char the sentinel would be stored as 254 and would
 * no longer compare equal to ICV_TYPE_ILLEGAL_CHAR.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
	/* 00 - 7F: single byte characters */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

	/* 80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* 90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
	IL_, IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
	3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

	/* F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
	4,  4,  4,  4,  4,  IL_, IL_, IL_,
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
};

#undef IL_

/*
 * Following is a vector of bit-masks to get used bits in the first byte of
 * a UTF-8 character.  Index is the number of bytes in the UTF-8 character
 * and the index value comes from above table.
 */
static const char masks_tbl[7] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/*
 * The following two vectors are to provide valid minimum and
 * maximum values for the 2'nd byte of a multibyte UTF-8 character for
 * better illegal sequence checking.  The index value must be the value of
 * the first byte of the UTF-8 character.
 *
 * Entries for first bytes that the table above classifies as single byte
 * or illegal are 0 in both vectors.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* C0  C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* C8  C9    CA    CB    CC    CD    CE    CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D0  D1    D2    D3    D4    D5    D6    D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D8  D9    DA    DB    DC    DD    DE    DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E0  E1    E2    E3    E4    E5    E6    E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E8  E9    EA    EB    EC    ED    EE    EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* F0  F1    F2    F3    F4    F5    F6    F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* F8  F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};

static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	/* C0  C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* C8  C9    CA    CB    CC    CD    CE    CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D0  D1    D2    D3    D4    D5    D6    D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D8  D9    DA    DB    DC    DD    DE    DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E0  E1    E2    E3    E4    E5    E6    E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E8  E9    EA    EB    EC    ED    EE    EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* F0  F1    F2    F3    F4    F5    F6    F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* F8  F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};


/*
 * Following "6" and "0x3f" came from 10xx xxxx bit representation of UTF-8
 * characters' second to sixth bytes.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f
#define	ICV_FETCH_UTF8_BOM_SIZE		6

/*
 * Fetch sizes in bytes.  ICV_FETCH_UCS_SIZE and ICV_FETCH_UCS_SIZE_TWO are
 * only defined when the including file has selected one of the UCS/UTF
 * encodings below; ICV_FETCH_UCS_SIZE_TWO is twice ICV_FETCH_UCS_SIZE.
 */
#define	ICV_FETCH_UCS4_SIZE		4
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif


/*
 * UTF-8 representations of critical values
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fffe		(0x00efbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x00efbfbfUL)
#define	ICV_UTF8_REPRESENTATION_7fffffff	(0x00fdbfbfbfbfbfULL)

/*
 * common utility to convert utf8 string to unicode
 *
 * The byte pointer is spelled "unsigned char *" (the type uchar_t names on
 * illumos/Solaris) so that both prototypes agree and the header does not
 * depend on <sys/types.h> providing uchar_t.
 */
extern int convert_utf8_to_ucs4(unsigned char *, int, unsigned int *);

extern int is_valid_utf8_string(unsigned char *, int);

/* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
} ucs_state_t;

#endif	/* COMMON_DEFS_H */