xref: /illumos-gate/usr/src/common/unicode/uconv.c (revision c40a6cd785e883b3f052b122c332e21174fc1871)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
28  * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
29  * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
30  * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
31  * the section 3C man pages.
32  * Interface stability: Committed
33  */
34 
35 #include <sys/types.h>
36 #ifdef	_KERNEL
37 #include <sys/param.h>
38 #include <sys/sysmacros.h>
39 #include <sys/systm.h>
40 #include <sys/debug.h>
41 #include <sys/kmem.h>
42 #include <sys/sunddi.h>
43 #else
44 #include <sys/u8_textprep.h>
45 #endif	/* _KERNEL */
46 #include <sys/byteorder.h>
47 #include <sys/errno.h>
48 
49 
/*
 * UTF-16 surrogate parameters: the inclusive bounds of the high and the low
 * surrogate ranges, the multiplier/divisor that separates the high half of
 * a pair from the low half, the mask applied to a recombined pair, and
 * the first code point that lies beyond the BMP.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* Upper bounds of the Unicode coding space and of the ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/* All input endian flag bits, and all output endian flag bits. */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Endian flags expressed relative to the machine this file is compiled for:
 * "NAT" is the machine's own byte order and "REV" is the opposite one.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) as it reads in the native byte order, and as it
 * reads byte-swapped in a 16-bit and in a 32-bit unit.
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/* Largest UTF-32 value that fits in a UTF-8 character of each byte length. */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/* The generic valid range of the non-leading bytes of a UTF-8 character. */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * A non-leading UTF-8 byte has the bit pattern "10xx xxxx": it contributes
 * six payload bits, hence the shift count of 6 and the mask of 0x3f.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
105 /*
106  * The following vector shows remaining bytes in a UTF-8 character.
107  * Index will be the first byte of the character.
108  */
109 static const uchar_t remaining_bytes_tbl[0x100] = {
110 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
111 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
112 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
113 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
114 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
115 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
116 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
117 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
118 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
119 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
120 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
121 	0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
122 
123 /*	C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
124 	0,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
125 
126 /*	D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
127 	1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
128 
129 /*	E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
130 	2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
131 
132 /*	F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
133 	3,  3,  3,  3,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
134 };
135 
136 /*
137  * The following is a vector of bit-masks to get used bits in
138  * the first byte of a UTF-8 character.  Index is remaining bytes at above of
139  * the character.
140  */
141 #ifdef	_KERNEL
142 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
143 #else
144 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145 #endif	/* _KERNEL */
146 
147 /*
148  * The following two vectors are to provide valid minimum and
149  * maximum values for the 2'nd byte of a multibyte UTF-8 character for
150  * better illegal sequence checking. The index value must be the value of
151  * the first byte of the UTF-8 character.
152  */
153 static const uchar_t valid_min_2nd_byte[0x100] = {
154 	0,    0,    0,    0,    0,    0,    0,    0,
155 	0,    0,    0,    0,    0,    0,    0,    0,
156 	0,    0,    0,    0,    0,    0,    0,    0,
157 	0,    0,    0,    0,    0,    0,    0,    0,
158 	0,    0,    0,    0,    0,    0,    0,    0,
159 	0,    0,    0,    0,    0,    0,    0,    0,
160 	0,    0,    0,    0,    0,    0,    0,    0,
161 	0,    0,    0,    0,    0,    0,    0,    0,
162 	0,    0,    0,    0,    0,    0,    0,    0,
163 	0,    0,    0,    0,    0,    0,    0,    0,
164 	0,    0,    0,    0,    0,    0,    0,    0,
165 	0,    0,    0,    0,    0,    0,    0,    0,
166 	0,    0,    0,    0,    0,    0,    0,    0,
167 	0,    0,    0,    0,    0,    0,    0,    0,
168 	0,    0,    0,    0,    0,    0,    0,    0,
169 	0,    0,    0,    0,    0,    0,    0,    0,
170 	0,    0,    0,    0,    0,    0,    0,    0,
171 	0,    0,    0,    0,    0,    0,    0,    0,
172 	0,    0,    0,    0,    0,    0,    0,    0,
173 	0,    0,    0,    0,    0,    0,    0,    0,
174 	0,    0,    0,    0,    0,    0,    0,    0,
175 	0,    0,    0,    0,    0,    0,    0,    0,
176 	0,    0,    0,    0,    0,    0,    0,    0,
177 	0,    0,    0,    0,    0,    0,    0,    0,
178 
179 /*	C0    C1    C2    C3    C4    C5    C6    C7 */
180 	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
181 
182 /*	C8    C9    CA    CB    CC    CD    CE    CF */
183 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
184 
185 /*	D0    D1    D2    D3    D4    D5    D6    D7 */
186 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
187 
188 /*	D8    D9    DA    DB    DC    DD    DE    DF */
189 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
190 
191 /*	E0    E1    E2    E3    E4    E5    E6    E7 */
192 	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
193 
194 /*	E8    E9    EA    EB    EC    ED    EE    EF */
195 	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
196 
197 /*	F0    F1    F2    F3    F4    F5    F6    F7 */
198 	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
199 
200 	0,    0,    0,    0,    0,    0,    0,    0
201 };
202 
203 static const uchar_t valid_max_2nd_byte[0x100] = {
204 	0,    0,    0,    0,    0,    0,    0,    0,
205 	0,    0,    0,    0,    0,    0,    0,    0,
206 	0,    0,    0,    0,    0,    0,    0,    0,
207 	0,    0,    0,    0,    0,    0,    0,    0,
208 	0,    0,    0,    0,    0,    0,    0,    0,
209 	0,    0,    0,    0,    0,    0,    0,    0,
210 	0,    0,    0,    0,    0,    0,    0,    0,
211 	0,    0,    0,    0,    0,    0,    0,    0,
212 	0,    0,    0,    0,    0,    0,    0,    0,
213 	0,    0,    0,    0,    0,    0,    0,    0,
214 	0,    0,    0,    0,    0,    0,    0,    0,
215 	0,    0,    0,    0,    0,    0,    0,    0,
216 	0,    0,    0,    0,    0,    0,    0,    0,
217 	0,    0,    0,    0,    0,    0,    0,    0,
218 	0,    0,    0,    0,    0,    0,    0,    0,
219 	0,    0,    0,    0,    0,    0,    0,    0,
220 	0,    0,    0,    0,    0,    0,    0,    0,
221 	0,    0,    0,    0,    0,    0,    0,    0,
222 	0,    0,    0,    0,    0,    0,    0,    0,
223 	0,    0,    0,    0,    0,    0,    0,    0,
224 	0,    0,    0,    0,    0,    0,    0,    0,
225 	0,    0,    0,    0,    0,    0,    0,    0,
226 	0,    0,    0,    0,    0,    0,    0,    0,
227 	0,    0,    0,    0,    0,    0,    0,    0,
228 
229 /*	C0    C1    C2    C3    C4    C5    C6    C7 */
230 	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
231 
232 /*	C8    C9    CA    CB    CC    CD    CE    CF */
233 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
234 
235 /*	D0    D1    D2    D3    D4    D5    D6    D7 */
236 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
237 
238 /*	D8    D9    DA    DB    DC    DD    DE    DF */
239 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
240 
241 /*	E0    E1    E2    E3    E4    E5    E6    E7 */
242 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
243 
244 /*	E8    E9    EA    EB    EC    ED    EE    EF */
245 	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
246 
247 /*	F0    F1    F2    F3    F4    F5    F6    F7 */
248 	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
249 
250 	0,    0,    0,    0,    0,    0,    0,    0
251 };
252 
253 
254 static int
check_endian(int flag,int * in,int * out)255 check_endian(int flag, int *in, int *out)
256 {
257 	*in = flag & UCONV_IN_ENDIAN_MASKS;
258 
259 	/* You cannot have both. */
260 	if (*in == UCONV_IN_ENDIAN_MASKS)
261 		return (EBADF);
262 
263 	if (*in == 0)
264 		*in = UCONV_IN_NAT_ENDIAN;
265 
266 	*out = flag & UCONV_OUT_ENDIAN_MASKS;
267 
268 	/* You cannot have both. */
269 	if (*out == UCONV_OUT_ENDIAN_MASKS)
270 		return (EBADF);
271 
272 	if (*out == 0)
273 		*out = UCONV_OUT_NAT_ENDIAN;
274 
275 	return (0);
276 }
277 
278 static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)279 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
280 {
281 	if (u16l > 0) {
282 		if (*u16s == UCONV_BOM_NORMAL) {
283 			*in = UCONV_IN_NAT_ENDIAN;
284 			return (B_TRUE);
285 		}
286 		if (*u16s == UCONV_BOM_SWAPPED) {
287 			*in = UCONV_IN_REV_ENDIAN;
288 			return (B_TRUE);
289 		}
290 	}
291 
292 	return (B_FALSE);
293 }
294 
295 static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)296 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
297 {
298 	if (u32l > 0) {
299 		if (*u32s == UCONV_BOM_NORMAL) {
300 			*in = UCONV_IN_NAT_ENDIAN;
301 			return (B_TRUE);
302 		}
303 		if (*u32s == UCONV_BOM_SWAPPED_32) {
304 			*in = UCONV_IN_REV_ENDIAN;
305 			return (B_TRUE);
306 		}
307 	}
308 
309 	return (B_FALSE);
310 }
311 
312 int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)313 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
314     uint32_t *u32s, size_t *utf32len, int flag)
315 {
316 	int inendian;
317 	int outendian;
318 	size_t u16l;
319 	size_t u32l;
320 	uint32_t hi;
321 	uint32_t lo;
322 	boolean_t do_not_ignore_null;
323 
324 	/*
325 	 * Do preliminary validity checks on parameters and collect info on
326 	 * endians.
327 	 */
328 	if (u16s == NULL || utf16len == NULL)
329 		return (EILSEQ);
330 
331 	if (u32s == NULL || utf32len == NULL)
332 		return (E2BIG);
333 
334 	if (check_endian(flag, &inendian, &outendian) != 0)
335 		return (EBADF);
336 
337 	/*
338 	 * Initialize input and output parameter buffer indices and
339 	 * temporary variables.
340 	 */
341 	u16l = u32l = 0;
342 	hi = 0;
343 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
344 
345 	/*
346 	 * Check on the BOM at the beginning of the input buffer if required
347 	 * and if there is indeed one, process it.
348 	 */
349 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
350 	    check_bom16(u16s, *utf16len, &inendian))
351 		u16l++;
352 
353 	/*
354 	 * Reset inendian and outendian so that after this point, those can be
355 	 * used as condition values.
356 	 */
357 	inendian &= UCONV_IN_NAT_ENDIAN;
358 	outendian &= UCONV_OUT_NAT_ENDIAN;
359 
360 	/*
361 	 * If there is something in the input buffer and if necessary and
362 	 * requested, save the BOM at the output buffer.
363 	 */
364 	if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
365 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
366 		    UCONV_BOM_SWAPPED_32;
367 
368 	/*
369 	 * Do conversion; if encounter a surrogate pair, assemble high and
370 	 * low pair values to form a UTF-32 character. If a half of a pair
371 	 * exists alone, then, either it is an illegal (EILSEQ) or
372 	 * invalid (EINVAL) value.
373 	 */
374 	for (; u16l < *utf16len; u16l++) {
375 		if (u16s[u16l] == 0 && do_not_ignore_null)
376 			break;
377 
378 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
379 
380 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
381 			if (hi)
382 				return (EILSEQ);
383 			hi = lo;
384 			continue;
385 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
386 			if (! hi)
387 				return (EILSEQ);
388 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
389 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
390 			    + UCONV_U16_START;
391 			hi = 0;
392 		} else if (hi) {
393 			return (EILSEQ);
394 		}
395 
396 		if (u32l >= *utf32len)
397 			return (E2BIG);
398 
399 		u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
400 	}
401 
402 	/*
403 	 * If high half didn't see low half, then, it's most likely the input
404 	 * parameter is incomplete.
405 	 */
406 	if (hi)
407 		return (EINVAL);
408 
409 	/*
410 	 * Save the number of consumed and saved characters. They do not
411 	 * include terminating NULL character (U+0000) at the end of
412 	 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
413 	 * the input buffer length is big enough to include the terminating
414 	 * NULL character).
415 	 */
416 	*utf16len = u16l;
417 	*utf32len = u32l;
418 
419 	return (0);
420 }
421 
422 int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)423 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
424     uchar_t *u8s, size_t *utf8len, int flag)
425 {
426 	int inendian;
427 	int outendian;
428 	size_t u16l;
429 	size_t u8l;
430 	uint32_t hi;
431 	uint32_t lo;
432 	boolean_t do_not_ignore_null;
433 
434 	if (u16s == NULL || utf16len == NULL)
435 		return (EILSEQ);
436 
437 	if (u8s == NULL || utf8len == NULL)
438 		return (E2BIG);
439 
440 	if (check_endian(flag, &inendian, &outendian) != 0)
441 		return (EBADF);
442 
443 	u16l = u8l = 0;
444 	hi = 0;
445 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
446 
447 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
448 	    check_bom16(u16s, *utf16len, &inendian))
449 		u16l++;
450 
451 	inendian &= UCONV_IN_NAT_ENDIAN;
452 
453 	for (; u16l < *utf16len; u16l++) {
454 		if (u16s[u16l] == 0 && do_not_ignore_null)
455 			break;
456 
457 		lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
458 
459 		if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
460 			if (hi)
461 				return (EILSEQ);
462 			hi = lo;
463 			continue;
464 		} else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
465 			if (! hi)
466 				return (EILSEQ);
467 			lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
468 			    lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
469 			    + UCONV_U16_START;
470 			hi = 0;
471 		} else if (hi) {
472 			return (EILSEQ);
473 		}
474 
475 		/*
476 		 * Now we convert a UTF-32 character into a UTF-8 character.
477 		 * Unicode coding space is between U+0000 and U+10FFFF;
478 		 * anything bigger is an illegal character.
479 		 */
480 		if (lo <= UCONV_U8_ONE_BYTE) {
481 			if (u8l >= *utf8len)
482 				return (E2BIG);
483 			u8s[u8l++] = (uchar_t)lo;
484 		} else if (lo <= UCONV_U8_TWO_BYTES) {
485 			if ((u8l + 1) >= *utf8len)
486 				return (E2BIG);
487 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
488 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
489 		} else if (lo <= UCONV_U8_THREE_BYTES) {
490 			if ((u8l + 2) >= *utf8len)
491 				return (E2BIG);
492 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
493 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
494 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
495 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
496 			if ((u8l + 3) >= *utf8len)
497 				return (E2BIG);
498 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
499 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
500 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
501 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
502 		} else {
503 			return (EILSEQ);
504 		}
505 	}
506 
507 	if (hi)
508 		return (EINVAL);
509 
510 	*utf16len = u16l;
511 	*utf8len = u8l;
512 
513 	return (0);
514 }
515 
516 int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)517 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
518     uint16_t *u16s, size_t *utf16len, int flag)
519 {
520 	int inendian;
521 	int outendian;
522 	size_t u16l;
523 	size_t u32l;
524 	uint32_t hi;
525 	uint32_t lo;
526 	boolean_t do_not_ignore_null;
527 
528 	if (u32s == NULL || utf32len == NULL)
529 		return (EILSEQ);
530 
531 	if (u16s == NULL || utf16len == NULL)
532 		return (E2BIG);
533 
534 	if (check_endian(flag, &inendian, &outendian) != 0)
535 		return (EBADF);
536 
537 	u16l = u32l = 0;
538 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
539 
540 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
541 	    check_bom32(u32s, *utf32len, &inendian))
542 		u32l++;
543 
544 	inendian &= UCONV_IN_NAT_ENDIAN;
545 	outendian &= UCONV_OUT_NAT_ENDIAN;
546 
547 	if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
548 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
549 		    UCONV_BOM_SWAPPED;
550 
551 	for (; u32l < *utf32len; u32l++) {
552 		if (u32s[u32l] == 0 && do_not_ignore_null)
553 			break;
554 
555 		hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
556 
557 		/*
558 		 * Anything bigger than the Unicode coding space, i.e.,
559 		 * Unicode scalar value bigger than U+10FFFF, is an illegal
560 		 * character.
561 		 */
562 		if (hi > UCONV_UNICODE_MAX)
563 			return (EILSEQ);
564 
565 		/*
566 		 * Anything bigger than U+FFFF must be converted into
567 		 * a surrogate pair in UTF-16.
568 		 */
569 		if (hi >= UCONV_U16_START) {
570 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
571 			    UCONV_U16_LO_MIN;
572 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
573 			    UCONV_U16_HI_MIN;
574 
575 			if ((u16l + 1) >= *utf16len)
576 				return (E2BIG);
577 
578 			if (outendian) {
579 				u16s[u16l++] = (uint16_t)hi;
580 				u16s[u16l++] = (uint16_t)lo;
581 			} else {
582 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
583 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
584 			}
585 		} else {
586 			if (u16l >= *utf16len)
587 				return (E2BIG);
588 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
589 			    BSWAP_16(((uint16_t)hi));
590 		}
591 	}
592 
593 	*utf16len = u16l;
594 	*utf32len = u32l;
595 
596 	return (0);
597 }
598 
599 int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)600 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
601     uchar_t *u8s, size_t *utf8len, int flag)
602 {
603 	int inendian;
604 	int outendian;
605 	size_t u32l;
606 	size_t u8l;
607 	uint32_t lo;
608 	boolean_t do_not_ignore_null;
609 
610 	if (u32s == NULL || utf32len == NULL)
611 		return (EILSEQ);
612 
613 	if (u8s == NULL || utf8len == NULL)
614 		return (E2BIG);
615 
616 	if (check_endian(flag, &inendian, &outendian) != 0)
617 		return (EBADF);
618 
619 	u32l = u8l = 0;
620 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
621 
622 	if ((flag & UCONV_IN_ACCEPT_BOM) &&
623 	    check_bom32(u32s, *utf32len, &inendian))
624 		u32l++;
625 
626 	inendian &= UCONV_IN_NAT_ENDIAN;
627 
628 	for (; u32l < *utf32len; u32l++) {
629 		if (u32s[u32l] == 0 && do_not_ignore_null)
630 			break;
631 
632 		lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
633 
634 		if (lo <= UCONV_U8_ONE_BYTE) {
635 			if (u8l >= *utf8len)
636 				return (E2BIG);
637 			u8s[u8l++] = (uchar_t)lo;
638 		} else if (lo <= UCONV_U8_TWO_BYTES) {
639 			if ((u8l + 1) >= *utf8len)
640 				return (E2BIG);
641 			u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
642 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x003f));
643 		} else if (lo <= UCONV_U8_THREE_BYTES) {
644 			if ((u8l + 2) >= *utf8len)
645 				return (E2BIG);
646 			u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
647 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
648 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x0003f));
649 		} else if (lo <= UCONV_U8_FOUR_BYTES) {
650 			if ((u8l + 3) >= *utf8len)
651 				return (E2BIG);
652 			u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
653 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
654 			u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
655 			u8s[u8l++] = (uchar_t)(0x80 |  (lo & 0x000003f));
656 		} else {
657 			return (EILSEQ);
658 		}
659 	}
660 
661 	*utf32len = u32l;
662 	*utf8len = u8l;
663 
664 	return (0);
665 }
666 
667 int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)668 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
669     uint16_t *u16s, size_t *utf16len, int flag)
670 {
671 	int inendian;
672 	int outendian;
673 	size_t u16l;
674 	size_t u8l;
675 	uint32_t hi;
676 	uint32_t lo;
677 	int remaining_bytes;
678 	int first_b;
679 	boolean_t do_not_ignore_null;
680 
681 	if (u8s == NULL || utf8len == NULL)
682 		return (EILSEQ);
683 
684 	if (u16s == NULL || utf16len == NULL)
685 		return (E2BIG);
686 
687 	if (check_endian(flag, &inendian, &outendian) != 0)
688 		return (EBADF);
689 
690 	u16l = u8l = 0;
691 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
692 
693 	outendian &= UCONV_OUT_NAT_ENDIAN;
694 
695 	if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
696 		u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
697 		    UCONV_BOM_SWAPPED;
698 
699 	for (; u8l < *utf8len; ) {
700 		if (u8s[u8l] == 0 && do_not_ignore_null)
701 			break;
702 
703 		/*
704 		 * Collect a UTF-8 character and convert it to a UTF-32
705 		 * character. In doing so, we screen out illegally formed
706 		 * UTF-8 characters and treat such as illegal characters.
707 		 * The algorithm at below also screens out anything bigger
708 		 * than the U+10FFFF.
709 		 *
710 		 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
711 		 * more details on the illegal values of UTF-8 character
712 		 * bytes.
713 		 */
714 		hi = (uint32_t)u8s[u8l++];
715 
716 		if (hi > UCONV_ASCII_MAX) {
717 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
718 				return (EILSEQ);
719 
720 			first_b = hi;
721 			hi = hi & u8_masks_tbl[remaining_bytes];
722 
723 			for (; remaining_bytes > 0; remaining_bytes--) {
724 				/*
725 				 * If we have no more bytes, the current
726 				 * UTF-8 character is incomplete.
727 				 */
728 				if (u8l >= *utf8len)
729 					return (EINVAL);
730 
731 				lo = (uint32_t)u8s[u8l++];
732 
733 				if (first_b) {
734 					if (lo < valid_min_2nd_byte[first_b] ||
735 					    lo > valid_max_2nd_byte[first_b])
736 						return (EILSEQ);
737 					first_b = 0;
738 				} else if (lo < UCONV_U8_BYTE_MIN ||
739 				    lo > UCONV_U8_BYTE_MAX) {
740 					return (EILSEQ);
741 				}
742 				hi = (hi << UCONV_U8_BIT_SHIFT) |
743 				    (lo & UCONV_U8_BIT_MASK);
744 			}
745 		}
746 
747 		if (hi >= UCONV_U16_START) {
748 			lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
749 			    UCONV_U16_LO_MIN;
750 			hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
751 			    UCONV_U16_HI_MIN;
752 
753 			if ((u16l + 1) >= *utf16len)
754 				return (E2BIG);
755 
756 			if (outendian) {
757 				u16s[u16l++] = (uint16_t)hi;
758 				u16s[u16l++] = (uint16_t)lo;
759 			} else {
760 				u16s[u16l++] = BSWAP_16(((uint16_t)hi));
761 				u16s[u16l++] = BSWAP_16(((uint16_t)lo));
762 			}
763 		} else {
764 			if (u16l >= *utf16len)
765 				return (E2BIG);
766 
767 			u16s[u16l++] = (outendian) ? (uint16_t)hi :
768 			    BSWAP_16(((uint16_t)hi));
769 		}
770 	}
771 
772 	*utf16len = u16l;
773 	*utf8len = u8l;
774 
775 	return (0);
776 }
777 
778 int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)779 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
780     uint32_t *u32s, size_t *utf32len, int flag)
781 {
782 	int inendian;
783 	int outendian;
784 	size_t u32l;
785 	size_t u8l;
786 	uint32_t hi;
787 	uint32_t c;
788 	int remaining_bytes;
789 	int first_b;
790 	boolean_t do_not_ignore_null;
791 
792 	if (u8s == NULL || utf8len == NULL)
793 		return (EILSEQ);
794 
795 	if (u32s == NULL || utf32len == NULL)
796 		return (E2BIG);
797 
798 	if (check_endian(flag, &inendian, &outendian) != 0)
799 		return (EBADF);
800 
801 	u32l = u8l = 0;
802 	do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
803 
804 	outendian &= UCONV_OUT_NAT_ENDIAN;
805 
806 	if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
807 		u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
808 		    UCONV_BOM_SWAPPED_32;
809 
810 	for (; u8l < *utf8len; ) {
811 		if (u8s[u8l] == 0 && do_not_ignore_null)
812 			break;
813 
814 		hi = (uint32_t)u8s[u8l++];
815 
816 		if (hi > UCONV_ASCII_MAX) {
817 			if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
818 				return (EILSEQ);
819 
820 			first_b = hi;
821 			hi = hi & u8_masks_tbl[remaining_bytes];
822 
823 			for (; remaining_bytes > 0; remaining_bytes--) {
824 				if (u8l >= *utf8len)
825 					return (EINVAL);
826 
827 				c = (uint32_t)u8s[u8l++];
828 
829 				if (first_b) {
830 					if (c < valid_min_2nd_byte[first_b] ||
831 					    c > valid_max_2nd_byte[first_b])
832 						return (EILSEQ);
833 					first_b = 0;
834 				} else if (c < UCONV_U8_BYTE_MIN ||
835 				    c > UCONV_U8_BYTE_MAX) {
836 					return (EILSEQ);
837 				}
838 				hi = (hi << UCONV_U8_BIT_SHIFT) |
839 				    (c & UCONV_U8_BIT_MASK);
840 			}
841 		}
842 
843 		if (u32l >= *utf32len)
844 			return (E2BIG);
845 
846 		u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
847 	}
848 
849 	*utf32len = u32l;
850 	*utf8len = u8l;
851 
852 	return (0);
853 }
854