xref: /illumos-gate/usr/src/common/unicode/uconv.c (revision edb348833aaacfa1176e502ad38875fd0b2717ab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33  * the section 3C man pages.
34  * Interface stability: Committed
35  */
36 
37 #include <sys/types.h>
38 #ifdef	_KERNEL
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/systm.h>
42 #include <sys/debug.h>
43 #include <sys/kmem.h>
44 #include <sys/sunddi.h>
45 #else
46 #include <sys/u8_textprep.h>
47 #endif	/* _KERNEL */
48 #include <sys/byteorder.h>
49 #include <sys/errno.h>
50 
51 
/*
 * The max and min values of high and low surrogate pairs of UTF-16,
 * UTF-16 bit shift value, bit mask, and starting value outside of BMP.
 *
 * Note that UCONV_U16_BIT_SHIFT is not a shift count: it is the value
 * 0x400 (1 << 10), and the converters in this file multiply, divide and
 * take remainders by it to combine or split the two 10-bit halves of a
 * surrogate pair. UCONV_U16_BIT_MASK keeps the resulting 20 bits and
 * UCONV_U16_START (U+10000) is the offset added to, or subtracted from,
 * the scalar value.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/*
 * The mask values for input and output endians. The individual
 * UCONV_IN_* and UCONV_OUT_* flag bits are not defined in this file; they
 * come from an included header. A flag value equal to a whole mask means
 * that both big and little endian were requested, which check_endian()
 * rejects.
 */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros, chosen at compile time depending on
 * whether _BIG_ENDIAN is defined.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal and reversed byte orderings.
 * UCONV_BOM_SWAPPED is U+FEFF as it appears in a byte-swapped 16-bit unit;
 * UCONV_BOM_SWAPPED_32 is U+FEFF as it appears in a byte-swapped 32-bit
 * unit.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: each value is
 * the largest scalar value that fits in a UTF-8 character of that many
 * bytes.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/*
 * The common minimum and maximum values at the UTF-8 character bytes,
 * i.e., the range of a continuation byte ("10xx xxxx").
 */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes: each continuation byte carries six payload bits.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
106 
/*
 * The following vector gives the number of bytes that must still follow
 * the first byte of a UTF-8 character. The index is the value of that first
 * byte.
 *
 * A zero entry means either that the byte is a complete character by itself
 * (00 - 7F; the converters in this file handle those without consulting the
 * table) or that the byte can never start a character (80 - BF, C0, C1 and
 * F5 - FF). The converters return EILSEQ when they look up a zero for a
 * byte greater than 0x7F.
 */
static const uchar_t remaining_bytes_tbl[0x100] = {
	/* 00 - 7F: single byte (ASCII) characters. */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	/* 80 - BF: continuation bytes; never valid as a first byte. */
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,

/*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

/*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

/*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
};
137 
138 /*
139  * The following is a vector of bit-masks to get used bits in
140  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
141  * the character.
142  */
143 #ifdef	_KERNEL
144 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145 #else
146 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
147 #endif	/* _KERNEL */
148 
/*
 * The following two vectors provide the valid minimum and maximum values
 * for the second byte of a multibyte UTF-8 character, for better illegal
 * sequence checking. The index must be the value of the first byte of the
 * UTF-8 character.
 *
 * This vector holds the minimum. It is 0x80 for most first bytes; E0
 * requires at least 0xa0 and F0 requires at least 0x90, which rejects
 * over-long (non-shortest form) encodings. Entries for bytes that cannot
 * start a multibyte character are zero; the converters reject such bytes
 * through remaining_bytes_tbl before they consult this vector.
 */
static const uchar_t valid_min_2nd_byte[0x100] = {
	/* 00 - BF: not the first byte of a multibyte character. */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	C8    C9    CA    CB    CC    CD    CE    CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D0    D1    D2    D3    D4    D5    D6    D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D8    D9    DA    DB    DC    DD    DE    DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E0    E1    E2    E3    E4    E5    E6    E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E8    E9    EA    EB    EC    ED    EE    EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	F0    F1    F2    F3    F4    F5    F6    F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0
};
204 
/*
 * The valid maximum value for the second byte of a multibyte UTF-8
 * character, indexed by the value of the first byte.
 *
 * It is 0xbf for most first bytes. ED is limited to 0x9f, which keeps the
 * decoded value below the UTF-16 surrogate range (U+D800 - U+DFFF), and F4
 * is limited to 0x8f, which keeps the decoded value at or below U+10FFFF.
 * Entries for bytes that cannot start a multibyte character are zero; the
 * converters reject such bytes through remaining_bytes_tbl before they
 * consult this vector.
 */
static const uchar_t valid_max_2nd_byte[0x100] = {
	/* 00 - BF: not the first byte of a multibyte character. */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7 */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	C8    C9    CA    CB    CC    CD    CE    CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D0    D1    D2    D3    D4    D5    D6    D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D8    D9    DA    DB    DC    DD    DE    DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E0    E1    E2    E3    E4    E5    E6    E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E8    E9    EA    EB    EC    ED    EE    EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

/*	F0    F1    F2    F3    F4    F5    F6    F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF */
	0,    0,    0,    0,    0,    0,    0,    0
};
254 
255 
256 static int
257 check_endian(int flag, int *in, int *out)
258 {
259 	*in = flag & UCONV_IN_ENDIAN_MASKS;
260 
261 	/* You cannot have both. */
262 	if (*in == UCONV_IN_ENDIAN_MASKS)
263 		return (EBADF);
264 
265 	if (*in == 0)
266 		*in = UCONV_IN_NAT_ENDIAN;
267 
268 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
269 
270 	/* You cannot have both. */
271 	if (*out == UCONV_OUT_ENDIAN_MASKS)
272 		return (EBADF);
273 
274 	if (*out == 0)
275 		*out = UCONV_OUT_NAT_ENDIAN;
276 
277 	return (0);
278 }
279 
280 static boolean_t
281 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
282 {
283 	if (u16l > 0) {
284 		if (*u16s == UCONV_BOM_NORMAL) {
285 			*in = UCONV_IN_NAT_ENDIAN;
286 			return (B_TRUE);
287 		}
288 		if (*u16s == UCONV_BOM_SWAPPED) {
289 			*in = UCONV_IN_REV_ENDIAN;
290 			return (B_TRUE);
291 		}
292 	}
293 
294 	return (B_FALSE);
295 }
296 
297 static boolean_t
298 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
299 {
300 	if (u32l > 0) {
301 		if (*u32s == UCONV_BOM_NORMAL) {
302 			*in = UCONV_IN_NAT_ENDIAN;
303 			return (B_TRUE);
304 		}
305 		if (*u32s == UCONV_BOM_SWAPPED_32) {
306 			*in = UCONV_IN_REV_ENDIAN;
307 			return (B_TRUE);
308 		}
309 	}
310 
311 	return (B_FALSE);
312 }
313 
314 int
315 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
316     uint32_t *u32s, size_t *utf32len, int flag)
317 {
318 	int inendian;
319 	int outendian;
320 	size_t u16l;
321 	size_t u32l;
322 	uint32_t hi;
323 	uint32_t lo;
324 	boolean_t do_not_ignore_null;
325 
326 	/*
327 	 * Do preliminary validity checks on parameters and collect info on
328 	 * endians.
329 	 */
330 	if (u16s == NULL || utf16len == NULL)
331 		return (EILSEQ);
332 
333 	if (u32s == NULL || utf32len == NULL)
334 		return (E2BIG);
335 
336 	if (check_endian(flag, &inendian, &outendian) != 0)
337 		return (EBADF);
338 
339 	/*
340 	 * Initialize input and output parameter buffer indices and
341 	 * temporary variables.
342 	 */
343 	u16l = u32l = 0;
344 	hi = 0;
345 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
346 
347 	/*
348 	 * Check on the BOM at the beginning of the input buffer if required
349 	 * and if there is indeed one, process it.
350 	 */
351 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
352 	    check_bom16(u16s, *utf16len, &inendian))
353 		u16l++;
354 
355 	/*
356 	 * Reset inendian and outendian so that after this point, those can be
357 	 * used as condition values.
358 	 */
359 	inendian &= UCONV_IN_NAT_ENDIAN;
360 	outendian &= UCONV_OUT_NAT_ENDIAN;
361 
362 	/*
363 	 * If there is something in the input buffer and if necessary and
364 	 * requested, save the BOM at the output buffer.
365 	 */
366 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
367 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
368 		    UCONV_BOM_SWAPPED_32;
369 
370 	/*
371 	 * Do conversion; if encounter a surrogate pair, assemble high and
372 	 * low pair values to form a UTF-32 character. If a half of a pair
373 	 * exists alone, then, either it is an illegal (EILSEQ) or
374 	 * invalid (EINVAL) value.
375 	 */
376 	for (; u16l < *utf16len; u16l++) {
377 		if (u16s[u16l] == 0 && do_not_ignore_null)
378 			break;
379 
380 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
381 
382 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
383 			if (hi)
384 				return (EILSEQ);
385 			hi = lo;
386 			continue;
387 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
388 			if (! hi)
389 				return (EILSEQ);
390 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
391 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
392 			    + UCONV_U16_START;
393 			hi = 0;
394 		} else if (hi) {
395 			return (EILSEQ);
396 		}
397 
398 		if (u32l >= *utf32len)
399 			return (E2BIG);
400 
401 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
402 	}
403 
404 	/*
405 	 * If high half didn't see low half, then, it's most likely the input
406 	 * parameter is incomplete.
407 	 */
408 	if (hi)
409 		return (EINVAL);
410 
411 	/*
412 	 * Save the number of consumed and saved characters. They do not
413 	 * include terminating NULL character (U+0000) at the end of
414 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
415 	 * the input buffer length is big enough to include the terminating
416 	 * NULL character).
417 	 */
418 	*utf16len = u16l;
419 	*utf32len = u32l;
420 
421 	return (0);
422 }
423 
424 int
425 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
426     uchar_t *u8s, size_t *utf8len, int flag)
427 {
428 	int inendian;
429 	int outendian;
430 	size_t u16l;
431 	size_t u8l;
432 	uint32_t hi;
433 	uint32_t lo;
434 	boolean_t do_not_ignore_null;
435 
436 	if (u16s == NULL || utf16len == NULL)
437 		return (EILSEQ);
438 
439 	if (u8s == NULL || utf8len == NULL)
440 		return (E2BIG);
441 
442 	if (check_endian(flag, &inendian, &outendian) != 0)
443 		return (EBADF);
444 
445 	u16l = u8l = 0;
446 	hi = 0;
447 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
448 
449 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
450 	    check_bom16(u16s, *utf16len, &inendian))
451 		u16l++;
452 
453 	inendian &= UCONV_IN_NAT_ENDIAN;
454 
455 	for (; u16l < *utf16len; u16l++) {
456 		if (u16s[u16l] == 0 && do_not_ignore_null)
457 			break;
458 
459 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
460 
461 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
462 			if (hi)
463 				return (EILSEQ);
464 			hi = lo;
465 			continue;
466 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
467 			if (! hi)
468 				return (EILSEQ);
469 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
470 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
471 			    + UCONV_U16_START;
472 			hi = 0;
473 		} else if (hi) {
474 			return (EILSEQ);
475 		}
476 
477 		/*
478 		 * Now we convert a UTF-32 character into a UTF-8 character.
479 		 * Unicode coding space is between U+0000 and U+10FFFF;
480 		 * anything bigger is an illegal character.
481 		 */
482 		if (lo <= UCONV_U8_ONE_BYTE) {
483 			if (u8l >= *utf8len)
484 				return (E2BIG);
485 			u8s[u8l++] = (uchar_t)lo;
486 		} else if (lo <= UCONV_U8_TWO_BYTES) {
487 			if ((u8l + 1) >= *utf8len)
488 				return (E2BIG);
489 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
490 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
491 		} else if (lo <= UCONV_U8_THREE_BYTES) {
492 			if ((u8l + 2) >= *utf8len)
493 				return (E2BIG);
494 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
495 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
496 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
497 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
498 			if ((u8l + 3) >= *utf8len)
499 				return (E2BIG);
500 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
501 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
502 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
503 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
504 		} else {
505 			return (EILSEQ);
506 		}
507 	}
508 
509 	if (hi)
510 		return (EINVAL);
511 
512 	*utf16len = u16l;
513 	*utf8len = u8l;
514 
515 	return (0);
516 }
517 
518 int
519 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
520     uint16_t *u16s, size_t *utf16len, int flag)
521 {
522 	int inendian;
523 	int outendian;
524 	size_t u16l;
525 	size_t u32l;
526 	uint32_t hi;
527 	uint32_t lo;
528 	boolean_t do_not_ignore_null;
529 
530 	if (u32s == NULL || utf32len == NULL)
531 		return (EILSEQ);
532 
533 	if (u16s == NULL || utf16len == NULL)
534 		return (E2BIG);
535 
536 	if (check_endian(flag, &inendian, &outendian) != 0)
537 		return (EBADF);
538 
539 	u16l = u32l = 0;
540 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
541 
542 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
543 	    check_bom32(u32s, *utf32len, &inendian))
544 		u32l++;
545 
546 	inendian &= UCONV_IN_NAT_ENDIAN;
547 	outendian &= UCONV_OUT_NAT_ENDIAN;
548 
549 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
550 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
551 		    UCONV_BOM_SWAPPED;
552 
553 	for (; u32l < *utf32len; u32l++) {
554 		if (u32s[u32l] == 0 && do_not_ignore_null)
555 			break;
556 
557 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
558 
559 		/*
560 		 * Anything bigger than the Unicode coding space, i.e.,
561 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
562 		 * character.
563 		 */
564 		if (hi > UCONV_UNICODE_MAX)
565 			return (EILSEQ);
566 
567 		/*
568 		 * Anything bigger than U+FFFF must be converted into
569 		 * a surrogate pair in UTF-16.
570 		 */
571 		if (hi >= UCONV_U16_START) {
572 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
573 			    UCONV_U16_LO_MIN;
574 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
575 			    UCONV_U16_HI_MIN;
576 
577 			if ((u16l + 1) >= *utf16len)
578 				return (E2BIG);
579 
580 			if (outendian) {
581 				u16s[u16l++] = (uint16_t)hi;
582 				u16s[u16l++] = (uint16_t)lo;
583 			} else {
584 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
585 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
586 			}
587 		} else {
588 			if (u16l >= *utf16len)
589 				return (E2BIG);
590 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
591 			    BSWAP_16(((uint16_t)hi));
592 		}
593 	}
594 
595 	*utf16len = u16l;
596 	*utf32len = u32l;
597 
598 	return (0);
599 }
600 
601 int
602 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
603     uchar_t *u8s, size_t *utf8len, int flag)
604 {
605 	int inendian;
606 	int outendian;
607 	size_t u32l;
608 	size_t u8l;
609 	uint32_t lo;
610 	boolean_t do_not_ignore_null;
611 
612 	if (u32s == NULL || utf32len == NULL)
613 		return (EILSEQ);
614 
615 	if (u8s == NULL || utf8len == NULL)
616 		return (E2BIG);
617 
618 	if (check_endian(flag, &inendian, &outendian) != 0)
619 		return (EBADF);
620 
621 	u32l = u8l = 0;
622 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
623 
624 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
625 	    check_bom32(u32s, *utf32len, &inendian))
626 		u32l++;
627 
628 	inendian &= UCONV_IN_NAT_ENDIAN;
629 
630 	for (; u32l < *utf32len; u32l++) {
631 		if (u32s[u32l] == 0 && do_not_ignore_null)
632 			break;
633 
634 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
635 
636 		if (lo <= UCONV_U8_ONE_BYTE) {
637 			if (u8l >= *utf8len)
638 				return (E2BIG);
639 			u8s[u8l++] = (uchar_t)lo;
640 		} else if (lo <= UCONV_U8_TWO_BYTES) {
641 			if ((u8l + 1) >= *utf8len)
642 				return (E2BIG);
643 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
644 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
645 		} else if (lo <= UCONV_U8_THREE_BYTES) {
646 			if ((u8l + 2) >= *utf8len)
647 				return (E2BIG);
648 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
649 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
650 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
651 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
652 			if ((u8l + 3) >= *utf8len)
653 				return (E2BIG);
654 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
655 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
656 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
657 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
658 		} else {
659 			return (EILSEQ);
660 		}
661 	}
662 
663 	*utf32len = u32l;
664 	*utf8len = u8l;
665 
666 	return (0);
667 }
668 
669 int
670 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
671     uint16_t *u16s, size_t *utf16len, int flag)
672 {
673 	int inendian;
674 	int outendian;
675 	size_t u16l;
676 	size_t u8l;
677 	uint32_t hi;
678 	uint32_t lo;
679 	int remaining_bytes;
680 	int first_b;
681 	boolean_t do_not_ignore_null;
682 
683 	if (u8s == NULL || utf8len == NULL)
684 		return (EILSEQ);
685 
686 	if (u16s == NULL || utf16len == NULL)
687 		return (E2BIG);
688 
689 	if (check_endian(flag, &inendian, &outendian) != 0)
690 		return (EBADF);
691 
692 	u16l = u8l = 0;
693 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
694 
695 	outendian &= UCONV_OUT_NAT_ENDIAN;
696 
697 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
698 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
699 		    UCONV_BOM_SWAPPED;
700 
701 	for (; u8l < *utf8len; ) {
702 		if (u8s[u8l] == 0 && do_not_ignore_null)
703 			break;
704 
705 		/*
706 		 * Collect a UTF-8 character and convert it to a UTF-32
707 		 * character. In doing so, we screen out illegally formed
708 		 * UTF-8 characters and treat such as illegal characters.
709 		 * The algorithm at below also screens out anything bigger
710 		 * than the U+10FFFF.
711 		 *
712 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
713 		 * more details on the illegal values of UTF-8 character
714 		 * bytes.
715 		 */
716 		hi = (uint32_t)u8s[u8l++];
717 
718 		if (hi > UCONV_ASCII_MAX) {
719 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
720 				return (EILSEQ);
721 
722 			first_b = hi;
723 			hi = hi & u8_masks_tbl[remaining_bytes];
724 
725 			for (; remaining_bytes > 0; remaining_bytes--) {
726 				/*
727 				 * If we have no more bytes, the current
728 				 * UTF-8 character is incomplete.
729 				 */
730 				if (u8l >= *utf8len)
731 					return (EINVAL);
732 
733 				lo = (uint32_t)u8s[u8l++];
734 
735 				if (first_b) {
736 					if (lo < valid_min_2nd_byte[first_b] ||
737 					    lo > valid_max_2nd_byte[first_b])
738 						return (EILSEQ);
739 					first_b = 0;
740 				} else if (lo < UCONV_U8_BYTE_MIN ||
741 				    lo > UCONV_U8_BYTE_MAX) {
742 					return (EILSEQ);
743 				}
744 				hi = (hi << UCONV_U8_BIT_SHIFT) |
745 				    (lo & UCONV_U8_BIT_MASK);
746 			}
747 		}
748 
749 		if (hi >= UCONV_U16_START) {
750 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
751 			    UCONV_U16_LO_MIN;
752 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
753 			    UCONV_U16_HI_MIN;
754 
755 			if ((u16l + 1) >= *utf16len)
756 				return (E2BIG);
757 
758 			if (outendian) {
759 				u16s[u16l++] = (uint16_t)hi;
760 				u16s[u16l++] = (uint16_t)lo;
761 			} else {
762 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
763 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
764 			}
765 		} else {
766 			if (u16l >= *utf16len)
767 				return (E2BIG);
768 
769 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
770 			    BSWAP_16(((uint16_t)hi));
771 		}
772 	}
773 
774 	*utf16len = u16l;
775 	*utf8len = u8l;
776 
777 	return (0);
778 }
779 
780 int
781 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
782     uint32_t *u32s, size_t *utf32len, int flag)
783 {
784 	int inendian;
785 	int outendian;
786 	size_t u32l;
787 	size_t u8l;
788 	uint32_t hi;
789 	uint32_t c;
790 	int remaining_bytes;
791 	int first_b;
792 	boolean_t do_not_ignore_null;
793 
794 	if (u8s == NULL || utf8len == NULL)
795 		return (EILSEQ);
796 
797 	if (u32s == NULL || utf32len == NULL)
798 		return (E2BIG);
799 
800 	if (check_endian(flag, &inendian, &outendian) != 0)
801 		return (EBADF);
802 
803 	u32l = u8l = 0;
804 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
805 
806 	outendian &= UCONV_OUT_NAT_ENDIAN;
807 
808 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
809 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
810 		    UCONV_BOM_SWAPPED_32;
811 
812 	for (; u8l < *utf8len; ) {
813 		if (u8s[u8l] == 0 && do_not_ignore_null)
814 			break;
815 
816 		hi = (uint32_t)u8s[u8l++];
817 
818 		if (hi > UCONV_ASCII_MAX) {
819 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
820 				return (EILSEQ);
821 
822 			first_b = hi;
823 			hi = hi & u8_masks_tbl[remaining_bytes];
824 
825 			for (; remaining_bytes > 0; remaining_bytes--) {
826 				if (u8l >= *utf8len)
827 					return (EINVAL);
828 
829 				c = (uint32_t)u8s[u8l++];
830 
831 				if (first_b) {
832 					if (c < valid_min_2nd_byte[first_b] ||
833 					    c > valid_max_2nd_byte[first_b])
834 						return (EILSEQ);
835 					first_b = 0;
836 				} else if (c < UCONV_U8_BYTE_MIN ||
837 				    c > UCONV_U8_BYTE_MAX) {
838 					return (EILSEQ);
839 				}
840 				hi = (hi << UCONV_U8_BIT_SHIFT) |
841 				    (c & UCONV_U8_BIT_MASK);
842 			}
843 		}
844 
845 		if (u32l >= *utf32len)
846 			return (E2BIG);
847 
848 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
849 	}
850 
851 	*utf32len = u32l;
852 	*utf8len = u8l;
853 
854 	return (0);
855 }
856