/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Shared definitions for the UTF-8 / UCS conversion code: sentinel values,
 * replacement characters, UTF-8 lead-byte lookup tables, fetch sizes and
 * the conversion state structure.
 *
 * The lookup tables are `static const', so every translation unit that
 * includes this header carries its own private copy of them.
 */

#ifndef	COMMON_DEFS_H
#define	COMMON_DEFS_H

#include <sys/types.h>

/*
 * Classification sentinels.  Both are negative so that they can never be
 * mistaken for a byte count taken from number_of_bytes_in_utf8_char[].
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

/* Following are replacement characters for non-identical character cases. */
#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)

/* Short alias used only to keep the table below readable; undefined after. */
#define	IL_	ICV_TYPE_ILLEGAL_CHAR

typedef enum { false = 0, true = 1 } boolean;

/*
 * Length in bytes of a UTF-8 character, indexed by its first byte.
 * A first byte that cannot start a character maps to ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly `signed char': the table stores a negative
 * sentinel, and the signedness of plain `char' is implementation-defined.
 * With an unsigned plain `char' the sentinel would read back as 254 and
 * would never compare equal to ICV_TYPE_ILLEGAL_CHAR.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
	/* 00 - 7F: single-byte characters */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* 80   81   82   83   84   85   86   87 */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	/* 88   89   8A   8B   8C   8D   8E   8F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* 90   91   92   93   94   95   96   97 */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	/* 98   99   9A   9B   9C   9D   9E   9F */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* A0   A1   A2   A3   A4   A5   A6   A7 */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	/* A8   A9   AA   AB   AC   AD   AE   AF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* B0   B1   B2   B3   B4   B5   B6   B7 */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
	/* B8   B9   BA   BB   BC   BD   BE   BF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

	/* C0   C1   C2   C3   C4   C5   C6   C7 */
	IL_, IL_, 2,   2,   2,   2,   2,   2,
	/* C8   C9   CA   CB   CC   CD   CE   CF */
	2,   2,   2,   2,   2,   2,   2,   2,

	/* D0   D1   D2   D3   D4   D5   D6   D7 */
	2,   2,   2,   2,   2,   2,   2,   2,
	/* D8   D9   DA   DB   DC   DD   DE   DF */
	2,   2,   2,   2,   2,   2,   2,   2,

	/* E0   E1   E2   E3   E4   E5   E6   E7 */
	3,   3,   3,   3,   3,   3,   3,   3,
	/* E8   E9   EA   EB   EC   ED   EE   EF */
	3,   3,   3,   3,   3,   3,   3,   3,

	/* F0   F1   F2   F3   F4   F5   F6   F7 */
	4,   4,   4,   4,   4,   IL_, IL_, IL_,
	/* F8   F9   FA   FB   FC   FD   FE   FF */
	IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
};

#undef	IL_

/*
 * Following is a vector of bit-masks to get used bits in the first byte of
 * a UTF-8 character.  Index is the number of bytes in the UTF-8 character;
 * that count comes from the table above.  The table above never yields 5
 * or 6, so the last two entries are not reached by lookups driven by it.
 */
static const char masks_tbl[7] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/*
 * The following two vectors provide the valid minimum and maximum values
 * for the 2nd byte of a multibyte UTF-8 character, for better illegal
 * sequence checking.  The index must be the value of the first byte of the
 * UTF-8 character.  First bytes that do not start a multibyte character
 * have 0 in both vectors.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00 - BF: not the first byte of a multibyte character */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* C8    C9    CA    CB    CC    CD    CE    CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D0    D1    D2    D3    D4    D5    D6    D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D8    D9    DA    DB    DC    DD    DE    DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E0    E1    E2    E3    E4    E5    E6    E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E8    E9    EA    EB    EC    ED    EE    EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* F0    F1    F2    F3    F4    F5    F6    F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};

static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00 - BF: not the first byte of a multibyte character */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* C8    C9    CA    CB    CC    CD    CE    CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D0    D1    D2    D3    D4    D5    D6    D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D8    D9    DA    DB    DC    DD    DE    DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E0    E1    E2    E3    E4    E5    E6    E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E8    E9    EA    EB    EC    ED    EE    EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* F0    F1    F2    F3    F4    F5    F6    F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0,
};


/*
 * Following "6" and "0x3f" came from 10xx xxxx bit representation of UTF-8
 * characters' second to sixth bytes.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f
#define	ICV_FETCH_UTF8_BOM_SIZE		6

/*
 * Fetch sizes in bytes.  ICV_FETCH_UCS_SIZE and ICV_FETCH_UCS_SIZE_TWO are
 * only defined when the including file has selected one of the UCS-2/UTF-16
 * or UCS-4/UTF-32 encodings by defining the corresponding macro first.
 */
#define	ICV_FETCH_UCS4_SIZE		4
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif


/*
 * UTF-8 representations of critical values
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fffe		(0x00efbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x00efbfbfUL)
#define	ICV_UTF8_REPRESENTATION_7fffffff	(0x00fdbfbfbfbfbfULL)

/*
 * common utility to convert utf8 string to unicode
 *
 * The byte pointer is spelled `unsigned char *' (the same type as uchar_t
 * on illumos/Solaris) so that the prototype matches is_valid_utf8_string()
 * below and does not depend on a platform-specific typedef.
 * NOTE(review): argument meanings and return values are set by the
 * implementation, which is not visible from this header.
 */
extern int convert_utf8_to_ucs4(unsigned char *, int, unsigned int *);

extern int is_valid_utf8_string(unsigned char *, int);

/* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
typedef struct {
	boolean bom_written;
	boolean little_endian;
} ucs_state_t;

#endif	/* COMMON_DEFS_H */