xref: /titanic_50/usr/src/lib/iconv_modules/inc/common_defs.h (revision 880d797826457b77414b37d531cc3e1aa166ecbe)
1*880d7978SAlexander Pyhalov /*
2*880d7978SAlexander Pyhalov  * CDDL HEADER START
3*880d7978SAlexander Pyhalov  *
4*880d7978SAlexander Pyhalov  * The contents of this file are subject to the terms of the
5*880d7978SAlexander Pyhalov  * Common Development and Distribution License (the "License").
6*880d7978SAlexander Pyhalov  * You may not use this file except in compliance with the License.
7*880d7978SAlexander Pyhalov  *
8*880d7978SAlexander Pyhalov  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9*880d7978SAlexander Pyhalov  * or http://www.opensolaris.org/os/licensing.
10*880d7978SAlexander Pyhalov  * See the License for the specific language governing permissions
11*880d7978SAlexander Pyhalov  * and limitations under the License.
12*880d7978SAlexander Pyhalov  *
13*880d7978SAlexander Pyhalov  * When distributing Covered Code, include this CDDL HEADER in each
14*880d7978SAlexander Pyhalov  * file and include the License file at src/OPENSOLARIS.LICENSE.
15*880d7978SAlexander Pyhalov  * If applicable, add the following below this CDDL HEADER, with the
16*880d7978SAlexander Pyhalov  * fields enclosed by brackets "[]" replaced with your own identifying
17*880d7978SAlexander Pyhalov  * information: Portions Copyright [yyyy] [name of copyright owner]
18*880d7978SAlexander Pyhalov  *
19*880d7978SAlexander Pyhalov  * CDDL HEADER END
20*880d7978SAlexander Pyhalov  */
21*880d7978SAlexander Pyhalov /*
22*880d7978SAlexander Pyhalov  * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc.  All rights reserved.
23*880d7978SAlexander Pyhalov  * Use is subject to license terms.
24*880d7978SAlexander Pyhalov  */
25*880d7978SAlexander Pyhalov 
26*880d7978SAlexander Pyhalov #ifndef	COMMON_DEFS_H
27*880d7978SAlexander Pyhalov #define	COMMON_DEFS_H
28*880d7978SAlexander Pyhalov 
29*880d7978SAlexander Pyhalov #include <sys/types.h>
30*880d7978SAlexander Pyhalov 
/*
 * Classification codes for input that cannot be converted one-to-one.
 * ICV_TYPE_ILLEGAL_CHAR is also stored in number_of_bytes_in_utf8_char[]
 * below to mark byte values that can never start a UTF-8 character.
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

/*
 * Replacement characters for the non-identical character cases.
 * 0x00efbfbd is the byte sequence EF BF BD, i.e. U+FFFD encoded in UTF-8;
 * 0xfffd is the same code point as a UCS-2 value.
 */
#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)

/* Short alias that keeps the table rows aligned; #undef'd after the table. */
#define	IL_				ICV_TYPE_ILLEGAL_CHAR

/*
 * Local boolean type.  The enumerators are named false/true, so this
 * definition cannot be compiled after <stdbool.h> has been included.
 */
typedef enum { false = 0, true = 1 } boolean;

/*
 * Length in bytes of a UTF-8 character, indexed by the value of its first
 * byte:
 *
 *	0x00 - 0x7F	1
 *	0x80 - 0xC1	ICV_TYPE_ILLEGAL_CHAR (not a valid first byte)
 *	0xC2 - 0xDF	2
 *	0xE0 - 0xEF	3
 *	0xF0 - 0xF4	4
 *	0xF5 - 0xFF	ICV_TYPE_ILLEGAL_CHAR (not a valid first byte)
 *
 * The element type is explicitly "signed char": the table stores the
 * negative value ICV_TYPE_ILLEGAL_CHAR, and the signedness of plain "char"
 * is implementation-defined.  With an unsigned plain char the illegal
 * entries would read back as 254 and never compare equal to
 * ICV_TYPE_ILLEGAL_CHAR.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
    /*  00 - 7F: single-byte (ASCII) characters  */
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

    /*  80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

    /*  C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
	IL_,IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
	 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /*  E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
	 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

    /*  F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
	 4,  4,  4,  4,  4, IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
};

#undef IL_
80*880d7978SAlexander Pyhalov 
/*
 * Bit-masks that select the payload bits of the first byte of a UTF-8
 * character.  The index is the length of the character in bytes, i.e. the
 * value looked up in number_of_bytes_in_utf8_char[]; index 0 is unused.
 */
static const char masks_tbl[7] = {
	0x00,	/* 0: no such length */
	0x7f,	/* 1: 0xxx xxxx */
	0x1f,	/* 2: 110x xxxx */
	0x0f,	/* 3: 1110 xxxx */
	0x07,	/* 4: 1111 0xxx */
	0x03,	/* 5: 1111 10xx */
	0x01	/* 6: 1111 110x */
};
87*880d7978SAlexander Pyhalov 
/*
 * Smallest value allowed for the second byte of a multibyte UTF-8
 * character, indexed by the value of the character's first byte.  Used
 * together with valid_max_2nd_byte[] for stricter illegal sequence checking.
 *
 * Entries are 0 for index values 0x00 - 0xC1 and 0xF5 - 0xFF.  The entries
 * that differ from the generic 0x80 lower bound are E0 (0xa0) and F0 (0x90).
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00 - 0F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10 - 1F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20 - 2F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30 - 3F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40 - 4F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50 - 5F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60 - 6F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70 - 7F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80 - 8F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 90 - 9F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* A0 - AF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* B0 - BF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 - CF: C0 and C1 are never valid first bytes */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D0 - DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E0 - EF: E0 has the raised lower bound 0xa0 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* F0 - FF: F0 has the raised lower bound 0x90; F5 - FF are 0 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
};
135*880d7978SAlexander Pyhalov 
/*
 * Largest value allowed for the second byte of a multibyte UTF-8 character,
 * indexed by the value of the character's first byte.
 *
 * Entries are 0 for index values 0x00 - 0xC1 and 0xF5 - 0xFF.  The entries
 * that differ from the generic 0xbf upper bound are ED (0x9f) and F4 (0x8f).
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00 - 0F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10 - 1F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20 - 2F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30 - 3F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40 - 4F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50 - 5F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60 - 6F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70 - 7F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80 - 8F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 90 - 9F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* A0 - AF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* B0 - BF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 - CF: C0 and C1 are never valid first bytes */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D0 - DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E0 - EF: ED has the lowered upper bound 0x9f */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

	/* F0 - FF: F4 has the lowered upper bound 0x8f; F5 - FF are 0 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
};
177*880d7978SAlexander Pyhalov 
178*880d7978SAlexander Pyhalov 
/*
 * The second and later bytes of a UTF-8 character have the bit layout
 * 10xx xxxx: each carries 6 payload bits, selected by the mask 0x3f.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f

/* NOTE(review): presumably the byte count fetched when probing for a BOM. */
#define	ICV_FETCH_UTF8_BOM_SIZE		6
186*880d7978SAlexander Pyhalov 
#define	ICV_FETCH_UCS4_SIZE		4

/*
 * ICV_FETCH_UCS_SIZE and ICV_FETCH_UCS_SIZE_TWO (twice that value) depend
 * on which encoding macro the including source file defines.  When none of
 * the macros tested below is defined, neither size macro is defined.
 */
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
/* UCS-2 / UTF-16 family */
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
/* UCS-4 / UTF-32 family */
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif
197*880d7978SAlexander Pyhalov 
198*880d7978SAlexander Pyhalov 
/*
 * UTF-8 representations of critical values.  Each constant holds the bytes
 * of the encoded character packed into one integer, first byte in the most
 * significant position.
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)	/* ED A0 80 */
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)	/* ED BF BF */
#define	ICV_UTF8_REPRESENTATION_fffe		(0x00efbfbeUL)	/* EF BF BE */
#define	ICV_UTF8_REPRESENTATION_ffff		(0x00efbfbfUL)	/* EF BF BF */
/* Six-byte form FD BF BF BF BF BF; needs a 64-bit constant. */
#define	ICV_UTF8_REPRESENTATION_7fffffff	(0x00fdbfbfbfbfbfULL)
207*880d7978SAlexander Pyhalov 
208*880d7978SAlexander Pyhalov /*
209*880d7978SAlexander Pyhalov  * common utility to convert utf8 string to unicode
210*880d7978SAlexander Pyhalov  */
211*880d7978SAlexander Pyhalov extern  int convert_utf8_to_ucs4(uchar_t *, int, unsigned int *);
212*880d7978SAlexander Pyhalov 
213*880d7978SAlexander Pyhalov extern  int is_valid_utf8_string(unsigned char *, int);
214*880d7978SAlexander Pyhalov 
215*880d7978SAlexander Pyhalov /* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
216*880d7978SAlexander Pyhalov typedef struct {
217*880d7978SAlexander Pyhalov    boolean     bom_written;
218*880d7978SAlexander Pyhalov    boolean     little_endian;
219*880d7978SAlexander Pyhalov } ucs_state_t;
220*880d7978SAlexander Pyhalov 
221*880d7978SAlexander Pyhalov #endif	/* COMMON_DEFS_H */
222