xref: /illumos-gate/usr/src/lib/iconv_modules/inc/common_defs.h (revision 20a7641f9918de8574b8b3b47dbe35c4bfc78df1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef	COMMON_DEFS_H
27 #define	COMMON_DEFS_H
28 
29 #include <sys/types.h>
30 
/*
 * Classification codes for input characters that cannot be converted
 * one-to-one.  Both are negative so that they can never be mistaken for a
 * byte count taken from number_of_bytes_in_utf8_char[] below.
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

/*
 * Replacement characters for non-identical character cases.  The UTF-8
 * value is U+FFFD encoded as the bytes EF BF BD packed into one integer;
 * the UCS-2 value is U+FFFD itself.
 */
#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)

/* Short alias used only to keep the table below readable; see #undef. */
#define	IL_				ICV_TYPE_ILLEGAL_CHAR

/*
 * Local boolean type.  The enumerators are named false/true, so this header
 * cannot be combined with <stdbool.h>, which defines macros of the same name.
 */
typedef enum { false = 0, true = 1 } boolean;

/*
 * Length in bytes of a UTF-8 character, indexed by the value of its first
 * byte.  Bytes that cannot start a character map to ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly "signed char": the sentinel is negative and
 * the signedness of plain "char" is implementation-defined, so with plain
 * "char" the sentinel would read back as 254 on platforms where char is
 * unsigned and a comparison against ICV_TYPE_ILLEGAL_CHAR would never match.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
	/* 00 - 7F: single byte characters */
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
	  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

	/* 80 - BF: not valid as a first byte */
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,

	/* C0 - CF: C0 and C1 are illegal, C2 and up start two byte forms */
	IL_,IL_,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* D0 - DF: two byte characters */
	  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

	/* E0 - EF: three byte characters */
	  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

	/* F0 - FF: F0 to F4 start four byte forms, F5 and up are illegal */
	  4,  4,  4,  4,  4,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_
};

#undef IL_
80 
/*
 * Bit-masks that select the payload bits of the first byte of a UTF-8
 * character.  The index is the length of the character in bytes, i.e. the
 * value looked up in number_of_bytes_in_utf8_char[]; index 0 selects no bits.
 */
static const char masks_tbl[7] = {
	0x00,	/* 0 bytes: nothing selected */
	0x7f,	/* 1 byte:  0xxxxxxx */
	0x1f,	/* 2 bytes: 110xxxxx */
	0x0f,	/* 3 bytes: 1110xxxx */
	0x07,	/* 4 bytes: 11110xxx */
	0x03,	/* 5 bytes: 111110xx */
	0x01	/* 6 bytes: 1111110x */
};
87 
/*
 * valid_min_2nd_byte[] and valid_max_2nd_byte[] give the smallest and the
 * largest value allowed for the second byte of a multibyte UTF-8 character,
 * for better illegal sequence checking.  Both are indexed by the value of
 * the first byte of the character.
 *
 * Entries for first bytes outside C2 - F4 are 0.  The lower bound is 0x80
 * except after E0 (0xa0) and F0 (0x90).
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00 - 0F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10 - 1F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20 - 2F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30 - 3F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40 - 4F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50 - 5F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60 - 6F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70 - 7F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80 - 8F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 90 - 9F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* A0 - AF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* B0 - BF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 - C7 */	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* C8 - CF */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D0 - D7 */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D8 - DF */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E0 - E7 */	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E8 - EF */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* F0 - F7 */	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* F8 - FF */	0,    0,    0,    0,    0,    0,    0,    0
};
135 
/*
 * Largest value allowed for the second byte of a multibyte UTF-8 character,
 * indexed by the value of the first byte.  Entries for first bytes outside
 * C2 - F4 are 0.  The upper bound is 0xbf except after ED (0x9f) and
 * F4 (0x8f).
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00 - 0F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10 - 1F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20 - 2F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30 - 3F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40 - 4F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50 - 5F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60 - 6F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70 - 7F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80 - 8F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 90 - 9F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* A0 - AF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* B0 - BF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 - C7 */	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* C8 - CF */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D0 - D7 */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D8 - DF */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E0 - E7 */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E8 - EF */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* F0 - F7 */	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* F8 - FF */	0,    0,    0,    0,    0,    0,    0,    0
};
177 
178 
/*
 * The second through sixth bytes of a UTF-8 character have the bit layout
 * 10xx xxxx: each one carries 6 payload bits, selected by the mask 0x3f.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f

/* NOTE(review): presumably the byte count fetched while probing for a BOM. */
#define	ICV_FETCH_UTF8_BOM_SIZE		6
186 
/* Fetch size for UCS-4; defined regardless of the selected encoding. */
#define	ICV_FETCH_UCS4_SIZE		4

/*
 * Fetch sizes for the encoding selected at compile time: 2 (and 4 for the
 * _TWO variant) when one of the UCS-2/UTF-16 macros is defined, otherwise
 * 4 (and 8) when one of the UCS-4/UTF-32 macros is defined.  When none of
 * those macros is defined, neither name is defined.
 */
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif
197 
198 
/*
 * UTF-8 representations of critical values.  Each constant holds the UTF-8
 * byte sequence of the code point named in the macro, packed into a single
 * integer with the first byte most significant:
 *
 *	d800		ED A0 80
 *	dfff		ED BF BF
 *	fffe		EF BF BE
 *	ffff		EF BF BF
 *	7fffffff	FD BF BF BF BF BF
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fffe		(0x00efbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x00efbfbfUL)
#define	ICV_UTF8_REPRESENTATION_7fffffff	(0x00fdbfbfbfbfbfULL)
207 
208 /*
209  * common utility to convert utf8 string to unicode
210  */
211 extern  int convert_utf8_to_ucs4(uchar_t *, int, unsigned int *);
212 
213 extern  int is_valid_utf8_string(unsigned char *, int);
214 
215 /* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
216 typedef struct {
217    boolean     bom_written;
218    boolean     little_endian;
219 } ucs_state_t;
220 
221 #endif	/* COMMON_DEFS_H */
222