xref: /titanic_52/usr/src/lib/iconv_modules/inc/common_defs.h (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
25*91e1e26aSAlexander Pyhalov 
26*91e1e26aSAlexander Pyhalov #ifndef	COMMON_DEFS_H
27*91e1e26aSAlexander Pyhalov #define	COMMON_DEFS_H
28*91e1e26aSAlexander Pyhalov 
29*91e1e26aSAlexander Pyhalov #include <sys/types.h>
30*91e1e26aSAlexander Pyhalov 
/*
 * Following are replacement characters for non-identical character cases.
 *
 * The two ICV_TYPE_* values are negative codes; ICV_TYPE_ILLEGAL_CHAR is
 * also the sentinel stored in number_of_bytes_in_utf8_char[] below.
 */

#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

/*
 * '?' for ASCII output, the bytes EF BF BD (UTF-8 form of U+FFFD) packed
 * into one integer, and U+FFFD itself for UCS-2 output.
 */
#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)

/* Short alias so the table rows below stay readable; removed after use. */
#define	IL_				ICV_TYPE_ILLEGAL_CHAR

/*
 * NOTE(review): the enumerators are named false/true, which would collide
 * with <stdbool.h> (and with the C23 keywords) if a consumer pulled those
 * in -- confirm no includer does.
 */
typedef enum { false = 0, true = 1 } boolean;

/*
 * Indexed by the value (0x00-0xFF) of the first byte of a UTF-8 character.
 * Each entry is the total number of bytes in that character (1 to 4), or
 * ICV_TYPE_ILLEGAL_CHAR for a byte that may not start a character:
 * 0x80-0xBF, 0xC0, 0xC1 and 0xF5-0xFF.
 *
 * The element type is explicitly "signed char".  Plain "char" has
 * implementation-defined signedness; where it is unsigned the negative
 * sentinel would be stored as 254 and would never compare equal to
 * ICV_TYPE_ILLEGAL_CHAR.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
    /*  00 - 7F: single byte characters  */
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

    /*  80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	IL_,IL_,2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

    /*  F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	 4,  4,  4,  4,  4,  IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
};

#undef IL_
80*91e1e26aSAlexander Pyhalov 
/*
 * Following is a vector of bit-masks to get used bits in the first byte of
 * a UTF-8 character.  Index is the number of bytes in the UTF-8 character
 * and the index value comes from number_of_bytes_in_utf8_char[].
 *
 * Index 0 is a placeholder.  Entries 5 and 6 match the payload bits of the
 * 111110xx and 1111110x lead-byte forms; number_of_bytes_in_utf8_char[]
 * never yields those lengths.
 */
static const char masks_tbl[7] = {
	0x00,	/* 0: unused */
	0x7f,	/* 1: 0xxxxxxx */
	0x1f,	/* 2: 110xxxxx */
	0x0f,	/* 3: 1110xxxx */
	0x07,	/* 4: 11110xxx */
	0x03,	/* 5: 111110xx */
	0x01	/* 6: 1111110x */
};
87*91e1e26aSAlexander Pyhalov 
/*
 * The following two vectors are to provide valid minimum and
 * maximum values for the 2'nd byte of a multibyte UTF-8 character for
 * better illegal sequence checking. The index value must be the value of
 * the first byte of the UTF-8 character.
 *
 * This vector holds the minimum.  It is 0x80 for most lead bytes, raised
 * to 0xa0 after E0 and to 0x90 after F0 (smaller second bytes there would
 * form overlong encodings).  Entries for 00-C1 and F5-FF are 0.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
     /*  00 - BF  */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
     /*  C0    C1    C2    C3    C4    C5    C6    C7  */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     /*  C8    C9    CA    CB    CC    CD    CE    CF  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     /*  D0    D1    D2    D3    D4    D5    D6    D7  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     /*  D8    D9    DA    DB    DC    DD    DE    DF  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     /*  E0    E1    E2    E3    E4    E5    E6    E7  */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     /*  E8    E9    EA    EB    EC    ED    EE    EF  */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
     /*  F0    F1    F2    F3    F4    F5    F6    F7  */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
     /*  F8    F9    FA    FB    FC    FD    FE    FF  */
	0,    0,    0,    0,    0,    0,    0,    0,
};
135*91e1e26aSAlexander Pyhalov 
/*
 * Largest valid value for the second byte of a multibyte UTF-8 character,
 * indexed by the value of the first byte.  It is 0xbf for most lead bytes,
 * lowered to 0x9f after ED (larger values would encode U+D800..U+DFFF) and
 * to 0x8f after F4 (larger values would exceed U+10FFFF).  Entries for
 * 00-C1 and F5-FF are 0.
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
     /*  00 - BF  */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
     /*  C0    C1    C2    C3    C4    C5    C6    C7  */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
     /*  C8    C9    CA    CB    CC    CD    CE    CF  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
     /*  D0    D1    D2    D3    D4    D5    D6    D7  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
     /*  D8    D9    DA    DB    DC    DD    DE    DF  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
     /*  E0    E1    E2    E3    E4    E5    E6    E7  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
     /*  E8    E9    EA    EB    EC    ED    EE    EF  */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
     /*  F0    F1    F2    F3    F4    F5    F6    F7  */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
     /*  F8    F9    FA    FB    FC    FD    FE    FF  */
	0,    0,    0,    0,    0,    0,    0,    0,
};
177*91e1e26aSAlexander Pyhalov 
178*91e1e26aSAlexander Pyhalov 
/*
 * Following "6" and "0x3f" came from 10xx xxxx bit representation of UTF-8
 * characters' second to sixth bytes.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f
/* NOTE(review): presumably a byte count for a BOM-detection read; confirm. */
#define	ICV_FETCH_UTF8_BOM_SIZE		6

#define	ICV_FETCH_UCS4_SIZE		4

/*
 * ICV_FETCH_UCS_SIZE and ICV_FETCH_UCS_SIZE_TWO (twice the former) depend
 * on which encoding selection macro is defined before this header is
 * included: 2 and 4 for UCS-2/UTF-16, 4 and 8 for UCS-4/UTF-32.  When none
 * of those macros is defined, neither name is defined.
 */
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif
197*91e1e26aSAlexander Pyhalov 
198*91e1e26aSAlexander Pyhalov 
/*
 * UTF-8 representations of critical values.
 *
 * Each constant is the UTF-8 byte sequence of the code point named in the
 * macro, packed into a single integer with the first byte most
 * significant, e.g. U+D800 is ED A0 80.  The last one is the six-byte
 * form FD BF BF BF BF BF and therefore needs a 64-bit constant.
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fffe		(0x00efbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x00efbfbfUL)
#define	ICV_UTF8_REPRESENTATION_7fffffff	(0x00fdbfbfbfbfbfULL)
207*91e1e26aSAlexander Pyhalov 
208*91e1e26aSAlexander Pyhalov /*
209*91e1e26aSAlexander Pyhalov  * common utility to convert utf8 string to unicode
210*91e1e26aSAlexander Pyhalov  */
211*91e1e26aSAlexander Pyhalov extern  int convert_utf8_to_ucs4(uchar_t *, int, unsigned int *);
212*91e1e26aSAlexander Pyhalov 
213*91e1e26aSAlexander Pyhalov extern  int is_valid_utf8_string(unsigned char *, int);
214*91e1e26aSAlexander Pyhalov 
215*91e1e26aSAlexander Pyhalov /* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
216*91e1e26aSAlexander Pyhalov typedef struct {
217*91e1e26aSAlexander Pyhalov    boolean     bom_written;
218*91e1e26aSAlexander Pyhalov    boolean     little_endian;
219*91e1e26aSAlexander Pyhalov } ucs_state_t;
220*91e1e26aSAlexander Pyhalov 
221*91e1e26aSAlexander Pyhalov #endif	/* COMMON_DEFS_H */
222