xref: /titanic_51/usr/src/common/unicode/uconv.c (revision 4ac394880b9cfb15287d3e33a9b92c51240c4776)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33  * the section 3C man pages.
34  * Interface stability: Committed
35  */
36 
37 #include <sys/types.h>
38 #ifdef	_KERNEL
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/systm.h>
42 #include <sys/debug.h>
43 #include <sys/kmem.h>
44 #include <sys/sunddi.h>
45 #else
46 #include <sys/u8_textprep.h>
47 #endif	/* _KERNEL */
48 #include <sys/byteorder.h>
49 #include <sys/errno.h>
50 
51 
/*
 * The minimum and maximum values of the UTF-16 high and low surrogate
 * ranges, followed by the constants used when a surrogate pair is combined
 * into or split from a single character value:
 *
 *	UCONV_U16_BIT_SHIFT	multiplier/divisor of 0x400, i.e., the
 *				arithmetic equivalent of a 10-bit shift;
 *	UCONV_U16_BIT_MASK	keeps the low 20 bits of a combined pair;
 *	UCONV_U16_START		the first character value outside of the BMP.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum values of the Unicode and the ASCII coding spaces. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/* Masks that select the input and the output endian bits of a flag value. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros: map the byte order of the build
 * target (and its opposite) onto the input and output endian flag values.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal byte ordering, and its
 * byte-swapped forms as a 16-bit and as a 32-bit quantity.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: each value is
 * the largest character value that fits into that many UTF-8 bytes.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/*
 * The common minimum and maximum values of the second and later bytes of
 * a multibyte UTF-8 character.
 */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" come from the "10xx xxxx" bit representation
 * of the second and later UTF-8 character bytes: each such byte carries six
 * payload bits.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
106 
/*
 * The following vector gives the number of bytes that still follow the
 * first byte of a UTF-8 character. The index is the value of that first
 * byte.
 *
 * The UTF-8 readers in this file consult the vector only for first bytes
 * above UCONV_ASCII_MAX and return EILSEQ when the entry is zero, so the
 * zero entries at 0x80 - 0xC1 and at 0xF5 - 0xFF mark bytes that cannot
 * start a character.
 */
static const uchar_t remaining_bytes_tbl[0x100] = {
/*	00 through BF */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
137 
/*
 * The following is a vector of bit-masks that extract the bits of the first
 * byte of a UTF-8 character that belong to the character value. The index
 * is the number of remaining bytes of the character as found in
 * remaining_bytes_tbl[] above.
 *
 * NOTE: remaining_bytes_tbl[] holds no value larger than 3 and index 0 is
 * never looked up by the converters in this file, so only entries 1 through
 * 3 are reachable from here.
 */
static const uchar_t masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
144 
/*
 * The following two vectors provide the valid minimum and maximum values
 * for the second byte of a multibyte UTF-8 character for better illegal
 * sequence checking. The index value must be the value of the first byte
 * of the UTF-8 character.
 *
 * This vector holds the minimum values. It raises the minimum above
 * UCONV_U8_BYTE_MIN for first bytes E0 (0xa0) and F0 (0x90). Entries for
 * values that remaining_bytes_tbl[] does not accept as a first byte are
 * zero and are not looked up by the converters in this file.
 */
static const uchar_t valid_min_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	C8    C9    CA    CB    CC    CD    CE    CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D0    D1    D2    D3    D4    D5    D6    D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D8    D9    DA    DB    DC    DD    DE    DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E0    E1    E2    E3    E4    E5    E6    E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E8    E9    EA    EB    EC    ED    EE    EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	F0    F1    F2    F3    F4    F5    F6    F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0
};
200 
/*
 * The valid maximum values for the second byte of a multibyte UTF-8
 * character. The index value must be the value of the first byte of the
 * UTF-8 character.
 *
 * The maximum is lowered below UCONV_U8_BYTE_MAX for first bytes ED (0x9f)
 * and F4 (0x8f). With the payload bits of ED that caps a three-byte
 * character at 0xd7ff, and with those of F4 it caps a four-byte character
 * at 0x10ffff, i.e., UCONV_UNICODE_MAX.
 */
static const uchar_t valid_max_2nd_byte[0x100] = {
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	C8    C9    CA    CB    CC    CD    CE    CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D0    D1    D2    D3    D4    D5    D6    D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D8    D9    DA    DB    DC    DD    DE    DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E0    E1    E2    E3    E4    E5    E6    E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E8    E9    EA    EB    EC    ED    EE    EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

/*	F0    F1    F2    F3    F4    F5    F6    F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0
};
250 
251 
252 static int
253 check_endian(int flag, int *in, int *out)
254 {
255 	*in = flag & UCONV_IN_ENDIAN_MASKS;
256 
257 	/* You cannot have both. */
258 	if (*in == UCONV_IN_ENDIAN_MASKS)
259 		return (EBADF);
260 
261 	if (*in == 0)
262 		*in = UCONV_IN_NAT_ENDIAN;
263 
264 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
265 
266 	/* You cannot have both. */
267 	if (*out == UCONV_OUT_ENDIAN_MASKS)
268 		return (EBADF);
269 
270 	if (*out == 0)
271 		*out = UCONV_OUT_NAT_ENDIAN;
272 
273 	return (0);
274 }
275 
276 static boolean_t
277 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
278 {
279 	if (u16l > 0) {
280 		if (*u16s == UCONV_BOM_NORMAL) {
281 			*in = UCONV_IN_NAT_ENDIAN;
282 			return (B_TRUE);
283 		}
284 		if (*u16s == UCONV_BOM_SWAPPED) {
285 			*in = UCONV_IN_REV_ENDIAN;
286 			return (B_TRUE);
287 		}
288 	}
289 
290 	return (B_FALSE);
291 }
292 
293 static boolean_t
294 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
295 {
296 	if (u32l > 0) {
297 		if (*u32s == UCONV_BOM_NORMAL) {
298 			*in = UCONV_IN_NAT_ENDIAN;
299 			return (B_TRUE);
300 		}
301 		if (*u32s == UCONV_BOM_SWAPPED_32) {
302 			*in = UCONV_IN_REV_ENDIAN;
303 			return (B_TRUE);
304 		}
305 	}
306 
307 	return (B_FALSE);
308 }
309 
310 int
311 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
312     uint32_t *u32s, size_t *utf32len, int flag)
313 {
314 	int inendian;
315 	int outendian;
316 	size_t u16l;
317 	size_t u32l;
318 	uint32_t hi;
319 	uint32_t lo;
320 	boolean_t do_not_ignore_null;
321 
322 	/*
323 	 * Do preliminary validity checks on parameters and collect info on
324 	 * endians.
325 	 */
326 	if (u16s == NULL || utf16len == NULL)
327 		return (EILSEQ);
328 
329 	if (u32s == NULL || utf32len == NULL)
330 		return (E2BIG);
331 
332 	if (check_endian(flag, &inendian, &outendian) != 0)
333 		return (EBADF);
334 
335 	/*
336 	 * Initialize input and output parameter buffer indices and
337 	 * temporary variables.
338 	 */
339 	u16l = u32l = 0;
340 	hi = 0;
341 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
342 
343 	/*
344 	 * Check on the BOM at the beginning of the input buffer if required
345 	 * and if there is indeed one, process it.
346 	 */
347 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
348 	    check_bom16(u16s, *utf16len, &inendian))
349 		u16l++;
350 
351 	/*
352 	 * Reset inendian and outendian so that after this point, those can be
353 	 * used as condition values.
354 	 */
355 	inendian &= UCONV_IN_NAT_ENDIAN;
356 	outendian &= UCONV_OUT_NAT_ENDIAN;
357 
358 	/*
359 	 * If there is something in the input buffer and if necessary and
360 	 * requested, save the BOM at the output buffer.
361 	 */
362 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
363 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
364 		    UCONV_BOM_SWAPPED_32;
365 
366 	/*
367 	 * Do conversion; if encounter a surrogate pair, assemble high and
368 	 * low pair values to form a UTF-32 character. If a half of a pair
369 	 * exists alone, then, either it is an illegal (EILSEQ) or
370 	 * invalid (EINVAL) value.
371 	 */
372 	for (; u16l < *utf16len; u16l++) {
373 		if (u16s[u16l] == 0 && do_not_ignore_null)
374 			break;
375 
376 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
377 
378 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
379 			if (hi)
380 				return (EILSEQ);
381 			hi = lo;
382 			continue;
383 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
384 			if (! hi)
385 				return (EILSEQ);
386 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
387 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
388 			    + UCONV_U16_START;
389 			hi = 0;
390 		} else if (hi) {
391 			return (EILSEQ);
392 		}
393 
394 		if (u32l >= *utf32len)
395 			return (E2BIG);
396 
397 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
398 	}
399 
400 	/*
401 	 * If high half didn't see low half, then, it's most likely the input
402 	 * parameter is incomplete.
403 	 */
404 	if (hi)
405 		return (EINVAL);
406 
407 	/*
408 	 * Save the number of consumed and saved characters. They do not
409 	 * include terminating NULL character (U+0000) at the end of
410 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
411 	 * the input buffer length is big enough to include the terminating
412 	 * NULL character).
413 	 */
414 	*utf16len = u16l;
415 	*utf32len = u32l;
416 
417 	return (0);
418 }
419 
420 int
421 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
422     uchar_t *u8s, size_t *utf8len, int flag)
423 {
424 	int inendian;
425 	int outendian;
426 	size_t u16l;
427 	size_t u8l;
428 	uint32_t hi;
429 	uint32_t lo;
430 	boolean_t do_not_ignore_null;
431 
432 	if (u16s == NULL || utf16len == NULL)
433 		return (EILSEQ);
434 
435 	if (u8s == NULL || utf8len == NULL)
436 		return (E2BIG);
437 
438 	if (check_endian(flag, &inendian, &outendian) != 0)
439 		return (EBADF);
440 
441 	u16l = u8l = 0;
442 	hi = 0;
443 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
444 
445 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
446 	    check_bom16(u16s, *utf16len, &inendian))
447 		u16l++;
448 
449 	inendian &= UCONV_IN_NAT_ENDIAN;
450 
451 	for (; u16l < *utf16len; u16l++) {
452 		if (u16s[u16l] == 0 && do_not_ignore_null)
453 			break;
454 
455 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
456 
457 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
458 			if (hi)
459 				return (EILSEQ);
460 			hi = lo;
461 			continue;
462 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
463 			if (! hi)
464 				return (EILSEQ);
465 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
466 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
467 			    + UCONV_U16_START;
468 			hi = 0;
469 		} else if (hi) {
470 			return (EILSEQ);
471 		}
472 
473 		/*
474 		 * Now we convert a UTF-32 character into a UTF-8 character.
475 		 * Unicode coding space is between U+0000 and U+10FFFF;
476 		 * anything bigger is an illegal character.
477 		 */
478 		if (lo <= UCONV_U8_ONE_BYTE) {
479 			if (u8l >= *utf8len)
480 				return (E2BIG);
481 			u8s[u8l++] = (uchar_t)lo;
482 		} else if (lo <= UCONV_U8_TWO_BYTES) {
483 			if ((u8l + 1) >= *utf8len)
484 				return (E2BIG);
485 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
486 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
487 		} else if (lo <= UCONV_U8_THREE_BYTES) {
488 			if ((u8l + 2) >= *utf8len)
489 				return (E2BIG);
490 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
491 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
492 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
493 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
494 			if ((u8l + 3) >= *utf8len)
495 				return (E2BIG);
496 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
497 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
498 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
499 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
500 		} else {
501 			return (EILSEQ);
502 		}
503 	}
504 
505 	if (hi)
506 		return (EINVAL);
507 
508 	*utf16len = u16l;
509 	*utf8len = u8l;
510 
511 	return (0);
512 }
513 
514 int
515 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
516     uint16_t *u16s, size_t *utf16len, int flag)
517 {
518 	int inendian;
519 	int outendian;
520 	size_t u16l;
521 	size_t u32l;
522 	uint32_t hi;
523 	uint32_t lo;
524 	boolean_t do_not_ignore_null;
525 
526 	if (u32s == NULL || utf32len == NULL)
527 		return (EILSEQ);
528 
529 	if (u16s == NULL || utf16len == NULL)
530 		return (E2BIG);
531 
532 	if (check_endian(flag, &inendian, &outendian) != 0)
533 		return (EBADF);
534 
535 	u16l = u32l = 0;
536 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
537 
538 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
539 	    check_bom32(u32s, *utf32len, &inendian))
540 		u32l++;
541 
542 	inendian &= UCONV_IN_NAT_ENDIAN;
543 	outendian &= UCONV_OUT_NAT_ENDIAN;
544 
545 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
546 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
547 		    UCONV_BOM_SWAPPED;
548 
549 	for (; u32l < *utf32len; u32l++) {
550 		if (u32s[u32l] == 0 && do_not_ignore_null)
551 			break;
552 
553 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
554 
555 		/*
556 		 * Anything bigger than the Unicode coding space, i.e.,
557 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
558 		 * character.
559 		 */
560 		if (hi > UCONV_UNICODE_MAX)
561 			return (EILSEQ);
562 
563 		/*
564 		 * Anything bigger than U+FFFF must be converted into
565 		 * a surrogate pair in UTF-16.
566 		 */
567 		if (hi >= UCONV_U16_START) {
568 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
569 			    UCONV_U16_LO_MIN;
570 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
571 			    UCONV_U16_HI_MIN;
572 
573 			if ((u16l + 1) >= *utf16len)
574 				return (E2BIG);
575 
576 			if (outendian) {
577 				u16s[u16l++] = (uint16_t)hi;
578 				u16s[u16l++] = (uint16_t)lo;
579 			} else {
580 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
581 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
582 			}
583 		} else {
584 			if (u16l >= *utf16len)
585 				return (E2BIG);
586 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
587 			    BSWAP_16(((uint16_t)hi));
588 		}
589 	}
590 
591 	*utf16len = u16l;
592 	*utf32len = u32l;
593 
594 	return (0);
595 }
596 
597 int
598 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
599     uchar_t *u8s, size_t *utf8len, int flag)
600 {
601 	int inendian;
602 	int outendian;
603 	size_t u32l;
604 	size_t u8l;
605 	uint32_t lo;
606 	boolean_t do_not_ignore_null;
607 
608 	if (u32s == NULL || utf32len == NULL)
609 		return (EILSEQ);
610 
611 	if (u8s == NULL || utf8len == NULL)
612 		return (E2BIG);
613 
614 	if (check_endian(flag, &inendian, &outendian) != 0)
615 		return (EBADF);
616 
617 	u32l = u8l = 0;
618 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
619 
620 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
621 	    check_bom32(u32s, *utf32len, &inendian))
622 		u32l++;
623 
624 	inendian &= UCONV_IN_NAT_ENDIAN;
625 
626 	for (; u32l < *utf32len; u32l++) {
627 		if (u32s[u32l] == 0 && do_not_ignore_null)
628 			break;
629 
630 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
631 
632 		if (lo <= UCONV_U8_ONE_BYTE) {
633 			if (u8l >= *utf8len)
634 				return (E2BIG);
635 			u8s[u8l++] = (uchar_t)lo;
636 		} else if (lo <= UCONV_U8_TWO_BYTES) {
637 			if ((u8l + 1) >= *utf8len)
638 				return (E2BIG);
639 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
640 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
641 		} else if (lo <= UCONV_U8_THREE_BYTES) {
642 			if ((u8l + 2) >= *utf8len)
643 				return (E2BIG);
644 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
645 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
646 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
647 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
648 			if ((u8l + 3) >= *utf8len)
649 				return (E2BIG);
650 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
651 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
652 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
653 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
654 		} else {
655 			return (EILSEQ);
656 		}
657 	}
658 
659 	*utf32len = u32l;
660 	*utf8len = u8l;
661 
662 	return (0);
663 }
664 
665 int
666 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
667     uint16_t *u16s, size_t *utf16len, int flag)
668 {
669 	int inendian;
670 	int outendian;
671 	size_t u16l;
672 	size_t u8l;
673 	uint32_t hi;
674 	uint32_t lo;
675 	int remaining_bytes;
676 	int first_b;
677 	boolean_t do_not_ignore_null;
678 
679 	if (u8s == NULL || utf8len == NULL)
680 		return (EILSEQ);
681 
682 	if (u16s == NULL || utf16len == NULL)
683 		return (E2BIG);
684 
685 	if (check_endian(flag, &inendian, &outendian) != 0)
686 		return (EBADF);
687 
688 	u16l = u8l = 0;
689 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
690 
691 	outendian &= UCONV_OUT_NAT_ENDIAN;
692 
693 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
694 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
695 		    UCONV_BOM_SWAPPED;
696 
697 	for (; u8l < *utf8len; ) {
698 		if (u8s[u8l] == 0 && do_not_ignore_null)
699 			break;
700 
701 		/*
702 		 * Collect a UTF-8 character and convert it to a UTF-32
703 		 * character. In doing so, we screen out illegally formed
704 		 * UTF-8 characters and treat such as illegal characters.
705 		 * The algorithm at below also screens out anything bigger
706 		 * than the U+10FFFF.
707 		 *
708 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
709 		 * more details on the illegal values of UTF-8 character
710 		 * bytes.
711 		 */
712 		hi = (uint32_t)u8s[u8l++];
713 
714 		if (hi > UCONV_ASCII_MAX) {
715 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
716 				return (EILSEQ);
717 
718 			first_b = hi;
719 			hi = hi & masks_tbl[remaining_bytes];
720 
721 			for (; remaining_bytes > 0; remaining_bytes--) {
722 				/*
723 				 * If we have no more bytes, the current
724 				 * UTF-8 character is incomplete.
725 				 */
726 				if (u8l >= *utf8len)
727 					return (EINVAL);
728 
729 				lo = (uint32_t)u8s[u8l++];
730 
731 				if (first_b) {
732 					if (lo < valid_min_2nd_byte[first_b] ||
733 					    lo > valid_max_2nd_byte[first_b])
734 						return (EILSEQ);
735 					first_b = 0;
736 				} else if (lo < UCONV_U8_BYTE_MIN ||
737 				    lo > UCONV_U8_BYTE_MAX) {
738 					return (EILSEQ);
739 				}
740 				hi = (hi << UCONV_U8_BIT_SHIFT) |
741 				    (lo & UCONV_U8_BIT_MASK);
742 			}
743 		}
744 
745 		if (hi >= UCONV_U16_START) {
746 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
747 			    UCONV_U16_LO_MIN;
748 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
749 			    UCONV_U16_HI_MIN;
750 
751 			if ((u16l + 1) >= *utf16len)
752 				return (E2BIG);
753 
754 			if (outendian) {
755 				u16s[u16l++] = (uint16_t)hi;
756 				u16s[u16l++] = (uint16_t)lo;
757 			} else {
758 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
759 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
760 			}
761 		} else {
762 			if (u16l >= *utf16len)
763 				return (E2BIG);
764 
765 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
766 			    BSWAP_16(((uint16_t)hi));
767 		}
768 	}
769 
770 	*utf16len = u16l;
771 	*utf8len = u8l;
772 
773 	return (0);
774 }
775 
776 int
777 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
778     uint32_t *u32s, size_t *utf32len, int flag)
779 {
780 	int inendian;
781 	int outendian;
782 	size_t u32l;
783 	size_t u8l;
784 	uint32_t hi;
785 	uint32_t c;
786 	int remaining_bytes;
787 	int first_b;
788 	boolean_t do_not_ignore_null;
789 
790 	if (u8s == NULL || utf8len == NULL)
791 		return (EILSEQ);
792 
793 	if (u32s == NULL || utf32len == NULL)
794 		return (E2BIG);
795 
796 	if (check_endian(flag, &inendian, &outendian) != 0)
797 		return (EBADF);
798 
799 	u32l = u8l = 0;
800 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
801 
802 	outendian &= UCONV_OUT_NAT_ENDIAN;
803 
804 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
805 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
806 		    UCONV_BOM_SWAPPED_32;
807 
808 	for (; u8l < *utf8len; ) {
809 		if (u8s[u8l] == 0 && do_not_ignore_null)
810 			break;
811 
812 		hi = (uint32_t)u8s[u8l++];
813 
814 		if (hi > UCONV_ASCII_MAX) {
815 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
816 				return (EILSEQ);
817 
818 			first_b = hi;
819 			hi = hi & masks_tbl[remaining_bytes];
820 
821 			for (; remaining_bytes > 0; remaining_bytes--) {
822 				if (u8l >= *utf8len)
823 					return (EINVAL);
824 
825 				c = (uint32_t)u8s[u8l++];
826 
827 				if (first_b) {
828 					if (c < valid_min_2nd_byte[first_b] ||
829 					    c > valid_max_2nd_byte[first_b])
830 						return (EILSEQ);
831 					first_b = 0;
832 				} else if (c < UCONV_U8_BYTE_MIN ||
833 				    c > UCONV_U8_BYTE_MAX) {
834 					return (EILSEQ);
835 				}
836 				hi = (hi << UCONV_U8_BIT_SHIFT) |
837 				    (c & UCONV_U8_BIT_MASK);
838 			}
839 		}
840 
841 		if (u32l >= *utf32len)
842 			return (E2BIG);
843 
844 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
845 	}
846 
847 	*utf32len = u32l;
848 	*utf8len = u8l;
849 
850 	return (0);
851 }
852