xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/common_defs.h (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef	COMMON_DEFS_H
27 #define	COMMON_DEFS_H
28 
29 
30 
/*
 * NOTE(review): nothing in this header uses MAGIC_NUMBER.  It is presumably
 * a sentinel that the conversion modules keep in (or check against) their
 * state -- confirm against the callers.
 */
#define	MAGIC_NUMBER			201513
32 
33 
/*
 * ISO/IEC 10646-1/Unicode Byte Order Mark (U+FEFF).
 *
 * ICV_BOM_IN_BIG_ENDIAN is the BOM value itself.  The byte-swapped image of
 * the BOM is 0xfffe for a 16-bit unit and 0xfffe0000 for a 32-bit unit.
 */
#define	ICV_BOM_IN_BIG_ENDIAN		0x00feff
#define	ICV_BOM_IN_LITTLE_ENDIAN_UCS4	0xfffe0000

/*
 * ICV_BOM_IN_LITTLE_ENDIAN takes the 16-bit image when the including module
 * is built for UCS-2/UTF-16 and the 32-bit image in every other case.
 */
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_BOM_IN_LITTLE_ENDIAN	0x00fffe
#else
#define	ICV_BOM_IN_LITTLE_ENDIAN	ICV_BOM_IN_LITTLE_ENDIAN_UCS4
#endif
43 
44 
/*
 * The following type macros mark the error cases that a mapping table entry
 * can describe.  A valid character carries its byte length there instead,
 * which is always a positive integer, so the negative values below can never
 * be mistaken for a length.
 */
#define	ICV_TYPE_NON_IDENTICAL_CHAR	(-1)
#define	ICV_TYPE_ILLEGAL_CHAR		(-2)

/*
 * Replacement characters for the non-identical character cases:
 * '?' for ASCII, and U+FFFD REPLACEMENT CHARACTER for the Unicode forms
 * (0xefbfbd is the UTF-8 byte sequence EF BF BD of U+FFFD).
 */
#define	ICV_CHAR_ASCII_REPLACEMENT	('?')
#define	ICV_CHAR_UTF8_REPLACEMENT	(0x00efbfbd)
#define	ICV_CHAR_UCS2_REPLACEMENT	(0xfffd)
57 
58 
/*
 * Local boolean type.  The enumerators are named "false" and "true", the
 * same names <stdbool.h> defines as macros, so this header cannot be
 * combined with <stdbool.h> in one translation unit.
 */
typedef enum {
	false = 0,
	true = 1
} boolean;
60 
61 
/*
 * Entry of a mapping table that converts to UTF-8.
 * We only support characters in range of UTF-16.
 */
typedef struct {
	/* NOTE(review): presumably the UTF-8 bytes packed into one integer. */
	unsigned int	u8;
	/*
	 * Signed on purpose: per the ICV_TYPE_* comment in this file, a table
	 * entry holds either a positive byte length or a negative ICV_TYPE_*
	 * error marker -- presumably in this field; confirm against the tables.
	 */
	signed char	size;
} to_utf8_table_component_t;
67 
/*
 * Entry of a mapping table that converts from UTF-8.
 * NOTE(review): "sb" presumably stands for the single-byte code that the
 * UTF-8 value in "u8" maps to -- confirm against the tables that use it.
 */
typedef struct {
	unsigned int	u8;
	unsigned char	sb;
} to_sb_table_component_t;
72 
73 
/*
 * UCS-2/UCS-4/UTF-16/UTF-32 requires state management.
 *
 * NOTE(review): the code that maintains these flags is not in this header;
 * going by the names, "bom_written" records whether the byte order mark has
 * been handled already and "little_endian" records the byte order in effect.
 */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
} ucs_state_t;
79 
/*
 * A pair of ucs_state_t, one for each side of a conversion.
 * NOTE(review): presumably used when both the input and the output are
 * UCS/UTF-16/UTF-32 forms -- confirm against the callers.
 */
typedef struct {
	ucs_state_t	input;
	ucs_state_t	output;
} ucs_ucs_state_t;
84 
85 
/*
 * UTF-7 requires additional state data fields.
 *
 * The first two fields have the same names and types as those of
 * ucs_state_t.  NOTE(review): the remaining ones presumably carry the
 * partially consumed UTF-7 (base64) data between calls -- confirm against
 * the UTF-7 converters.
 */
typedef struct {
	boolean		bom_written;
	boolean		little_endian;
	boolean		in_the_middle_of_utf7_sequence;
	unsigned int	remnant;		/* leftover bits, see below */
	signed char	remnant_count;		/* in bits */
	unsigned char	prevch;			/* presumably previous byte */
} utf7_state_t;
95 
96 
97 /*
98  * Following vector shows the number of bytes in a UTF-8 character.
99  * Index will be the first byte of the character.
100  */
101 
102 #define	IL_				ICV_TYPE_ILLEGAL_CHAR
103 
104 static const char number_of_bytes_in_utf8_char[0x100] = {
105 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
106 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
107 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
108 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
109 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
110 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
111 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
112 	 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
113 
114     /*  80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F  */
115 	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
116 
117     /*  90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F  */
118 	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
119 
120     /*  A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF  */
121 	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
122 
123     /*  B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF  */
124 	IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
125 
126     /*  C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF  */
127 	IL_,IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
128 
129     /*  D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF  */
130 	 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
131 
132     /*  E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF  */
133 	 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
134 
135     /*  F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF  */
136 	 4,  4,  4,  4,  4, IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,IL_,
137 };
138 
139 #undef IL_
140 
/*
 * Bit-masks that select the used (payload) bits in the first byte of a
 * UTF-8 character.  The index is the number of bytes in the character, i.e.
 * the value looked up in number_of_bytes_in_utf8_char[].  Index 0 is not a
 * valid length and masks every bit off.
 */
static const char masks_tbl[7] = {
	0x00,	/* 0: not a valid length */
	0x7f,	/* 1: 0xxx xxxx */
	0x1f,	/* 2: 110x xxxx */
	0x0f,	/* 3: 1110 xxxx */
	0x07,	/* 4: 1111 0xxx */
	0x03,	/* 5: 1111 10xx */
	0x01	/* 6: 1111 110x */
};
147 
/*
 * The following two vectors provide the valid minimum and maximum values
 * for the 2nd byte of a multibyte UTF-8 character, for better illegal
 * sequence checking.  The index value must be the value of the first byte
 * of the UTF-8 character.
 *
 * valid_min_2nd_byte[] holds the minimum.  It is 0x80 for most lead bytes
 * but is raised to 0xa0 after 0xe0 and to 0x90 after 0xf0.  First bytes
 * that do not start a multibyte character have a 0 entry.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 - 0f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 - 1f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 - 2f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 - 3f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 - 4f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 - 5f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 - 6f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 - 7f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 - 8f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 - 9f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* a0 - af */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* b0 - bf */

	/* c0 - c7: c0 and c1 never start a valid character */
	0x00, 0x00, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* c8 - cf */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* d0 - d7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* d8 - df */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* e0 - e7: e0 needs a0 or above */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* e8 - ef */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* f0 - f7: f0 needs 90 or above; f5 - f7 are not valid first bytes */
	0x90, 0x80, 0x80, 0x80, 0x80, 0x00, 0x00, 0x00,
	/* f8 - ff */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
195 
/*
 * Valid maximum value for the 2nd byte of a multibyte UTF-8 character,
 * indexed by the value of the first byte.  It is 0xbf for most lead bytes
 * but is lowered to 0x9f after 0xed and to 0x8f after 0xf4.  First bytes
 * that do not start a multibyte character have a 0 entry.
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 00 - 0f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 10 - 1f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 20 - 2f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 30 - 3f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 40 - 4f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 50 - 5f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 60 - 6f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 70 - 7f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 80 - 8f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 90 - 9f */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* a0 - af */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* b0 - bf */

	/* c0 - c7: c0 and c1 never start a valid character */
	0x00, 0x00, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* c8 - cf */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* d0 - d7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* d8 - df */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* e0 - e7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* e8 - ef: ed allows 9f at most */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* f0 - f7: f4 allows 8f at most; f5 - f7 are not valid first bytes */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0x00, 0x00, 0x00,
	/* f8 - ff */
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
237 
238 
/*
 * The "6" and the "0x3f" below come from the 10xx xxxx bit layout of the
 * second and later bytes of a UTF-8 character: each such byte carries six
 * payload bits, and 0x3f selects them.
 */
#define	ICV_UTF8_BIT_SHIFT		6
#define	ICV_UTF8_BIT_MASK		0x3f
#define	ICV_FETCH_UTF8_BOM_SIZE		6

/*
 * ICV_FETCH_UCS_SIZE and ICV_FETCH_UCS_SIZE_TWO depend on which conversion
 * the including module is built for: 2 and 4 for UCS-2/UTF-16, 4 and 8 for
 * UCS-4/UTF-32.  They stay undefined when none of the selector macros is
 * defined.
 */
#define	ICV_FETCH_UCS4_SIZE		4
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
	defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define	ICV_FETCH_UCS_SIZE		2
#define	ICV_FETCH_UCS_SIZE_TWO		4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define	ICV_FETCH_UCS_SIZE		4
#define	ICV_FETCH_UCS_SIZE_TWO		8
#endif
257 
/*
 * UTF-8 representations of some useful Unicode values, i.e. the bytes of
 * the UTF-8 sequence packed into one integer with the first byte in the
 * most significant position.
 *
 * U+FFFE is 0x00efbfbe and U+FFFF is 0x00efbfbf in UTF-8, but the _fffe and
 * _ffff macros below hold those values masked with
 * ICV_UTF8_REPRESENTATION_ffff_mask (0x000fffff):
 *
 *	0x00efbfbe & 0x000fffff == 0x000fbfbe
 *	0x00efbfbf & 0x000fffff == 0x000fbfbf
 *
 * The four-byte UTF-8 forms of U+nFFFE and U+nFFFF in the other planes
 * (e.g. U+1FFFE, f0 9f bf be) reduce to the same two values under that mask.
 * NOTE(review): presumably callers mask a character before comparing so that
 * one test covers every plane -- confirm against the callers.
 */
#define	ICV_UTF8_REPRESENTATION_d800		(0x00eda080UL)
#define	ICV_UTF8_REPRESENTATION_dfff		(0x00edbfbfUL)
#define	ICV_UTF8_REPRESENTATION_fdd0		(0x00efb790UL)
#define	ICV_UTF8_REPRESENTATION_fdef		(0x00efb7afUL)

#define	ICV_UTF8_REPRESENTATION_fffe		(0x000fbfbeUL)
#define	ICV_UTF8_REPRESENTATION_ffff		(0x000fbfbfUL)
#define	ICV_UTF8_REPRESENTATION_ffff_mask	(0x000fffffUL)

#define	ICV_UTF8_REPRESENTATION_10fffd		(0xf48fbfbdUL)
274 
/*
 * UTF-32 and UCS-4 representations of some useful Unicode values for
 * non-character and out of bound invalid character detection.
 */

/*
 * U+FFFE and U+FFFF, plus a mask selecting the low 16 bits.
 * NOTE(review): presumably the mask is applied before comparing so that
 * U+nFFFE/U+nFFFF are detected in every plane -- confirm against the callers.
 */
#define	ICV_UTF32_NONCHAR_fffe			(0xfffeU)
#define	ICV_UTF32_NONCHAR_ffff			(0xffffU)
#define	ICV_UTF32_NONCHAR_mask			(0xffffU)

/* First and last code points of the UTF-16 surrogate range. */
#define	ICV_UTF32_SURROGATE_START_d800		(0xd800U)
#define	ICV_UTF32_SURROGATE_END_dfff		(0xdfffU)

/* First and last of the non-characters U+FDD0 through U+FDEF. */
#define	ICV_UTF32_ARABIC_NONCHAR_START_fdd0	(0xfdd0U)
#define	ICV_UTF32_ARABIC_NONCHAR_END_fdef	(0xfdefU)

/* Upper bounds: U+10FFFD for UTF-32, 0x7fffffff (31 bits) for UCS-4. */
#define	ICV_UTF32_LAST_VALID_CHAR		(0x10fffdU)

#define	ICV_UCS4_LAST_VALID_CHAR		(0x7fffffff)
292 
293 
294 #endif	/* COMMON_DEFS_H */
295