1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
28 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
29 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
30 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
31 * the section 3C man pages.
32 * Interface stability: Committed
33 */
34
35 #include <sys/types.h>
36 #ifdef _KERNEL
37 #include <sys/param.h>
38 #include <sys/sysmacros.h>
39 #include <sys/systm.h>
40 #include <sys/debug.h>
41 #include <sys/kmem.h>
42 #include <sys/sunddi.h>
43 #else
44 #include <sys/u8_textprep.h>
45 #endif /* _KERNEL */
46 #include <sys/byteorder.h>
47 #include <sys/errno.h>
48
49
/*
 * UTF-16 surrogate handling constants: the inclusive ranges of the high
 * (leading) and low (trailing) surrogate code units, plus the values used
 * to map between a surrogate pair and a code point outside of the BMP.
 *
 * Despite its name, UCONV_U16_BIT_SHIFT is not a shift count: it is 0x400
 * (2^10) and is used by the conversion functions as a multiplier, divisor
 * and modulus, which has the same effect as shifting by ten bits.
 * UCONV_U16_BIT_MASK keeps the 20 payload bits of a surrogate pair and
 * UCONV_U16_START is the first code point that needs a surrogate pair.
 */
#define	UCONV_U16_HI_MIN	(0xd800U)
#define	UCONV_U16_HI_MAX	(0xdbffU)
#define	UCONV_U16_LO_MIN	(0xdc00U)
#define	UCONV_U16_LO_MAX	(0xdfffU)
#define	UCONV_U16_BIT_SHIFT	(0x0400U)
#define	UCONV_U16_BIT_MASK	(0x0fffffU)
#define	UCONV_U16_START		(0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define	UCONV_UNICODE_MAX	(0x10ffffU)
#define	UCONV_ASCII_MAX		(0x7fU)

/*
 * Each mask is the union of both endian flags for one direction.
 * check_endian() uses them to isolate the endian bits of a flag argument
 * and to detect the contradictory case where both bits are set.
 */
#define	UCONV_IN_ENDIAN_MASKS	(UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define	UCONV_OUT_ENDIAN_MASKS	(UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros: the "NAT" names map to the endian
 * flag that matches the byte order this file is compiled for (selected by
 * _BIG_ENDIAN), and the "REV" names map to the opposite flag.
 */
#ifdef	_BIG_ENDIAN
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_BIG_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#else
#define	UCONV_IN_NAT_ENDIAN	UCONV_IN_LITTLE_ENDIAN
#define	UCONV_IN_REV_ENDIAN	UCONV_IN_BIG_ENDIAN
#define	UCONV_OUT_NAT_ENDIAN	UCONV_OUT_LITTLE_ENDIAN
#define	UCONV_OUT_REV_ENDIAN	UCONV_OUT_BIG_ENDIAN
#endif	/* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character, U+FEFF, as it reads in the native
 * byte order, and as it reads with its bytes reversed when held in a
 * 16-bit unit (UCONV_BOM_SWAPPED) or a 32-bit unit (UCONV_BOM_SWAPPED_32).
 */
#define	UCONV_BOM_NORMAL	(0xfeffU)
#define	UCONV_BOM_SWAPPED	(0xfffeU)
#define	UCONV_BOM_SWAPPED_32	(0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: each value is
 * the largest code point that fits in a UTF-8 character of that length.
 */
#define	UCONV_U8_ONE_BYTE	(0x7fU)
#define	UCONV_U8_TWO_BYTES	(0x7ffU)
#define	UCONV_U8_THREE_BYTES	(0xffffU)
#define	UCONV_U8_FOUR_BYTES	(0x10ffffU)

/*
 * The common minimum and maximum values of the trailing (second and later)
 * bytes of a multibyte UTF-8 character.
 */
#define	UCONV_U8_BYTE_MIN	(0x80U)
#define	UCONV_U8_BYTE_MAX	(0xbfU)

/*
 * The following "6" and "0x3f" come from the "10xx xxxx" bit representation
 * of the trailing bytes of a UTF-8 character: each such byte carries six
 * payload bits, selected by the mask.
 */
#define	UCONV_U8_BIT_SHIFT	6
#define	UCONV_U8_BIT_MASK	0x3f
104
/*
 * Number of bytes that must follow the first byte of a UTF-8 character.
 * The table is indexed by the value of that first byte.
 *
 * An entry of zero means "no more bytes". The UTF-8 decoders in this file
 * only consult the table for bytes above UCONV_ASCII_MAX and treat a zero
 * entry there as an illegal first byte (0x80 - 0xC1 and 0xF5 - 0xFF).
 * The largest entry is 3, i.e., no character is longer than four bytes.
 */
static const uchar_t remaining_bytes_tbl[0x100] = {
	/* 00 - BF: single byte characters and bytes that cannot be first */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

/*	C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

/*	D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

/*	E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

/*	F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
	3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
135
136 /*
137 * The following is a vector of bit-masks to get used bits in
138 * the first byte of a UTF-8 character. Index is remaining bytes at above of
139 * the character.
140 */
141 #ifdef _KERNEL
142 const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
143 #else
144 static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
145 #endif /* _KERNEL */
146
/*
 * The following vector provides the valid minimum value for the second
 * byte of a multibyte UTF-8 character, for better illegal sequence
 * checking. The index value must be the value of the first byte of the
 * UTF-8 character.
 *
 * For most first bytes the minimum is 0x80, the smallest trailing byte.
 * It is raised to 0xA0 after E0 and to 0x90 after F0; in the UTF-8
 * definition those are the bounds that exclude overlong (non-shortest)
 * three and four byte forms. Entries for bytes that cannot start a
 * multibyte character are zero; the decoders in this file reject such
 * first bytes via remaining_bytes_tbl[] before consulting this table.
 */
static const uchar_t valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0
};
202
/*
 * The valid maximum value for the second byte of a multibyte UTF-8
 * character, indexed by the value of the first byte of the character.
 *
 * For most first bytes the maximum is 0xBF, the largest trailing byte.
 * It is lowered to 0x9F after ED and to 0x8F after F4; in the UTF-8
 * definition those are the bounds that exclude the surrogate code points
 * (U+D800 - U+DFFF) and anything bigger than U+10FFFF, respectively.
 * Entries for bytes that cannot start a multibyte character are zero.
 */
static const uchar_t valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,
	0,    0,    0,    0,    0,    0,    0,    0,

/*	C0    C1    C2    C3    C4    C5    C6    C7    */
	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	C8    C9    CA    CB    CC    CD    CE    CF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D0    D1    D2    D3    D4    D5    D6    D7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	D8    D9    DA    DB    DC    DD    DE    DF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E0    E1    E2    E3    E4    E5    E6    E7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

/*	E8    E9    EA    EB    EC    ED    EE    EF    */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

/*	F0    F1    F2    F3    F4    F5    F6    F7    */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,

/*	F8    F9    FA    FB    FC    FD    FE    FF    */
	0,    0,    0,    0,    0,    0,    0,    0
};
252
253
254 static int
check_endian(int flag,int * in,int * out)255 check_endian(int flag, int *in, int *out)
256 {
257 *in = flag & UCONV_IN_ENDIAN_MASKS;
258
259 /* You cannot have both. */
260 if (*in == UCONV_IN_ENDIAN_MASKS)
261 return (EBADF);
262
263 if (*in == 0)
264 *in = UCONV_IN_NAT_ENDIAN;
265
266 *out = flag & UCONV_OUT_ENDIAN_MASKS;
267
268 /* You cannot have both. */
269 if (*out == UCONV_OUT_ENDIAN_MASKS)
270 return (EBADF);
271
272 if (*out == 0)
273 *out = UCONV_OUT_NAT_ENDIAN;
274
275 return (0);
276 }
277
278 static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)279 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
280 {
281 if (u16l > 0) {
282 if (*u16s == UCONV_BOM_NORMAL) {
283 *in = UCONV_IN_NAT_ENDIAN;
284 return (B_TRUE);
285 }
286 if (*u16s == UCONV_BOM_SWAPPED) {
287 *in = UCONV_IN_REV_ENDIAN;
288 return (B_TRUE);
289 }
290 }
291
292 return (B_FALSE);
293 }
294
295 static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)296 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
297 {
298 if (u32l > 0) {
299 if (*u32s == UCONV_BOM_NORMAL) {
300 *in = UCONV_IN_NAT_ENDIAN;
301 return (B_TRUE);
302 }
303 if (*u32s == UCONV_BOM_SWAPPED_32) {
304 *in = UCONV_IN_REV_ENDIAN;
305 return (B_TRUE);
306 }
307 }
308
309 return (B_FALSE);
310 }
311
312 int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)313 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
314 uint32_t *u32s, size_t *utf32len, int flag)
315 {
316 int inendian;
317 int outendian;
318 size_t u16l;
319 size_t u32l;
320 uint32_t hi;
321 uint32_t lo;
322 boolean_t do_not_ignore_null;
323
324 /*
325 * Do preliminary validity checks on parameters and collect info on
326 * endians.
327 */
328 if (u16s == NULL || utf16len == NULL)
329 return (EILSEQ);
330
331 if (u32s == NULL || utf32len == NULL)
332 return (E2BIG);
333
334 if (check_endian(flag, &inendian, &outendian) != 0)
335 return (EBADF);
336
337 /*
338 * Initialize input and output parameter buffer indices and
339 * temporary variables.
340 */
341 u16l = u32l = 0;
342 hi = 0;
343 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
344
345 /*
346 * Check on the BOM at the beginning of the input buffer if required
347 * and if there is indeed one, process it.
348 */
349 if ((flag & UCONV_IN_ACCEPT_BOM) &&
350 check_bom16(u16s, *utf16len, &inendian))
351 u16l++;
352
353 /*
354 * Reset inendian and outendian so that after this point, those can be
355 * used as condition values.
356 */
357 inendian &= UCONV_IN_NAT_ENDIAN;
358 outendian &= UCONV_OUT_NAT_ENDIAN;
359
360 /*
361 * If there is something in the input buffer and if necessary and
362 * requested, save the BOM at the output buffer.
363 */
364 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
365 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
366 UCONV_BOM_SWAPPED_32;
367
368 /*
369 * Do conversion; if encounter a surrogate pair, assemble high and
370 * low pair values to form a UTF-32 character. If a half of a pair
371 * exists alone, then, either it is an illegal (EILSEQ) or
372 * invalid (EINVAL) value.
373 */
374 for (; u16l < *utf16len; u16l++) {
375 if (u16s[u16l] == 0 && do_not_ignore_null)
376 break;
377
378 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
379
380 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
381 if (hi)
382 return (EILSEQ);
383 hi = lo;
384 continue;
385 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
386 if (! hi)
387 return (EILSEQ);
388 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
389 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
390 + UCONV_U16_START;
391 hi = 0;
392 } else if (hi) {
393 return (EILSEQ);
394 }
395
396 if (u32l >= *utf32len)
397 return (E2BIG);
398
399 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
400 }
401
402 /*
403 * If high half didn't see low half, then, it's most likely the input
404 * parameter is incomplete.
405 */
406 if (hi)
407 return (EINVAL);
408
409 /*
410 * Save the number of consumed and saved characters. They do not
411 * include terminating NULL character (U+0000) at the end of
412 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
413 * the input buffer length is big enough to include the terminating
414 * NULL character).
415 */
416 *utf16len = u16l;
417 *utf32len = u32l;
418
419 return (0);
420 }
421
422 int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)423 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
424 uchar_t *u8s, size_t *utf8len, int flag)
425 {
426 int inendian;
427 int outendian;
428 size_t u16l;
429 size_t u8l;
430 uint32_t hi;
431 uint32_t lo;
432 boolean_t do_not_ignore_null;
433
434 if (u16s == NULL || utf16len == NULL)
435 return (EILSEQ);
436
437 if (u8s == NULL || utf8len == NULL)
438 return (E2BIG);
439
440 if (check_endian(flag, &inendian, &outendian) != 0)
441 return (EBADF);
442
443 u16l = u8l = 0;
444 hi = 0;
445 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
446
447 if ((flag & UCONV_IN_ACCEPT_BOM) &&
448 check_bom16(u16s, *utf16len, &inendian))
449 u16l++;
450
451 inendian &= UCONV_IN_NAT_ENDIAN;
452
453 for (; u16l < *utf16len; u16l++) {
454 if (u16s[u16l] == 0 && do_not_ignore_null)
455 break;
456
457 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
458
459 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
460 if (hi)
461 return (EILSEQ);
462 hi = lo;
463 continue;
464 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
465 if (! hi)
466 return (EILSEQ);
467 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
468 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
469 + UCONV_U16_START;
470 hi = 0;
471 } else if (hi) {
472 return (EILSEQ);
473 }
474
475 /*
476 * Now we convert a UTF-32 character into a UTF-8 character.
477 * Unicode coding space is between U+0000 and U+10FFFF;
478 * anything bigger is an illegal character.
479 */
480 if (lo <= UCONV_U8_ONE_BYTE) {
481 if (u8l >= *utf8len)
482 return (E2BIG);
483 u8s[u8l++] = (uchar_t)lo;
484 } else if (lo <= UCONV_U8_TWO_BYTES) {
485 if ((u8l + 1) >= *utf8len)
486 return (E2BIG);
487 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
488 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
489 } else if (lo <= UCONV_U8_THREE_BYTES) {
490 if ((u8l + 2) >= *utf8len)
491 return (E2BIG);
492 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
493 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
494 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
495 } else if (lo <= UCONV_U8_FOUR_BYTES) {
496 if ((u8l + 3) >= *utf8len)
497 return (E2BIG);
498 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
499 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
500 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
501 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
502 } else {
503 return (EILSEQ);
504 }
505 }
506
507 if (hi)
508 return (EINVAL);
509
510 *utf16len = u16l;
511 *utf8len = u8l;
512
513 return (0);
514 }
515
516 int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)517 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
518 uint16_t *u16s, size_t *utf16len, int flag)
519 {
520 int inendian;
521 int outendian;
522 size_t u16l;
523 size_t u32l;
524 uint32_t hi;
525 uint32_t lo;
526 boolean_t do_not_ignore_null;
527
528 if (u32s == NULL || utf32len == NULL)
529 return (EILSEQ);
530
531 if (u16s == NULL || utf16len == NULL)
532 return (E2BIG);
533
534 if (check_endian(flag, &inendian, &outendian) != 0)
535 return (EBADF);
536
537 u16l = u32l = 0;
538 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
539
540 if ((flag & UCONV_IN_ACCEPT_BOM) &&
541 check_bom32(u32s, *utf32len, &inendian))
542 u32l++;
543
544 inendian &= UCONV_IN_NAT_ENDIAN;
545 outendian &= UCONV_OUT_NAT_ENDIAN;
546
547 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
548 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
549 UCONV_BOM_SWAPPED;
550
551 for (; u32l < *utf32len; u32l++) {
552 if (u32s[u32l] == 0 && do_not_ignore_null)
553 break;
554
555 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
556
557 /*
558 * Anything bigger than the Unicode coding space, i.e.,
559 * Unicode scalar value bigger than U+10FFFF, is an illegal
560 * character.
561 */
562 if (hi > UCONV_UNICODE_MAX)
563 return (EILSEQ);
564
565 /*
566 * Anything bigger than U+FFFF must be converted into
567 * a surrogate pair in UTF-16.
568 */
569 if (hi >= UCONV_U16_START) {
570 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
571 UCONV_U16_LO_MIN;
572 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
573 UCONV_U16_HI_MIN;
574
575 if ((u16l + 1) >= *utf16len)
576 return (E2BIG);
577
578 if (outendian) {
579 u16s[u16l++] = (uint16_t)hi;
580 u16s[u16l++] = (uint16_t)lo;
581 } else {
582 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
583 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
584 }
585 } else {
586 if (u16l >= *utf16len)
587 return (E2BIG);
588 u16s[u16l++] = (outendian) ? (uint16_t)hi :
589 BSWAP_16(((uint16_t)hi));
590 }
591 }
592
593 *utf16len = u16l;
594 *utf32len = u32l;
595
596 return (0);
597 }
598
599 int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)600 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
601 uchar_t *u8s, size_t *utf8len, int flag)
602 {
603 int inendian;
604 int outendian;
605 size_t u32l;
606 size_t u8l;
607 uint32_t lo;
608 boolean_t do_not_ignore_null;
609
610 if (u32s == NULL || utf32len == NULL)
611 return (EILSEQ);
612
613 if (u8s == NULL || utf8len == NULL)
614 return (E2BIG);
615
616 if (check_endian(flag, &inendian, &outendian) != 0)
617 return (EBADF);
618
619 u32l = u8l = 0;
620 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
621
622 if ((flag & UCONV_IN_ACCEPT_BOM) &&
623 check_bom32(u32s, *utf32len, &inendian))
624 u32l++;
625
626 inendian &= UCONV_IN_NAT_ENDIAN;
627
628 for (; u32l < *utf32len; u32l++) {
629 if (u32s[u32l] == 0 && do_not_ignore_null)
630 break;
631
632 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
633
634 if (lo <= UCONV_U8_ONE_BYTE) {
635 if (u8l >= *utf8len)
636 return (E2BIG);
637 u8s[u8l++] = (uchar_t)lo;
638 } else if (lo <= UCONV_U8_TWO_BYTES) {
639 if ((u8l + 1) >= *utf8len)
640 return (E2BIG);
641 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
642 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
643 } else if (lo <= UCONV_U8_THREE_BYTES) {
644 if ((u8l + 2) >= *utf8len)
645 return (E2BIG);
646 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
647 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
648 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
649 } else if (lo <= UCONV_U8_FOUR_BYTES) {
650 if ((u8l + 3) >= *utf8len)
651 return (E2BIG);
652 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
653 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
654 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
655 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
656 } else {
657 return (EILSEQ);
658 }
659 }
660
661 *utf32len = u32l;
662 *utf8len = u8l;
663
664 return (0);
665 }
666
667 int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)668 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
669 uint16_t *u16s, size_t *utf16len, int flag)
670 {
671 int inendian;
672 int outendian;
673 size_t u16l;
674 size_t u8l;
675 uint32_t hi;
676 uint32_t lo;
677 int remaining_bytes;
678 int first_b;
679 boolean_t do_not_ignore_null;
680
681 if (u8s == NULL || utf8len == NULL)
682 return (EILSEQ);
683
684 if (u16s == NULL || utf16len == NULL)
685 return (E2BIG);
686
687 if (check_endian(flag, &inendian, &outendian) != 0)
688 return (EBADF);
689
690 u16l = u8l = 0;
691 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
692
693 outendian &= UCONV_OUT_NAT_ENDIAN;
694
695 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
696 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
697 UCONV_BOM_SWAPPED;
698
699 for (; u8l < *utf8len; ) {
700 if (u8s[u8l] == 0 && do_not_ignore_null)
701 break;
702
703 /*
704 * Collect a UTF-8 character and convert it to a UTF-32
705 * character. In doing so, we screen out illegally formed
706 * UTF-8 characters and treat such as illegal characters.
707 * The algorithm at below also screens out anything bigger
708 * than the U+10FFFF.
709 *
710 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
711 * more details on the illegal values of UTF-8 character
712 * bytes.
713 */
714 hi = (uint32_t)u8s[u8l++];
715
716 if (hi > UCONV_ASCII_MAX) {
717 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
718 return (EILSEQ);
719
720 first_b = hi;
721 hi = hi & u8_masks_tbl[remaining_bytes];
722
723 for (; remaining_bytes > 0; remaining_bytes--) {
724 /*
725 * If we have no more bytes, the current
726 * UTF-8 character is incomplete.
727 */
728 if (u8l >= *utf8len)
729 return (EINVAL);
730
731 lo = (uint32_t)u8s[u8l++];
732
733 if (first_b) {
734 if (lo < valid_min_2nd_byte[first_b] ||
735 lo > valid_max_2nd_byte[first_b])
736 return (EILSEQ);
737 first_b = 0;
738 } else if (lo < UCONV_U8_BYTE_MIN ||
739 lo > UCONV_U8_BYTE_MAX) {
740 return (EILSEQ);
741 }
742 hi = (hi << UCONV_U8_BIT_SHIFT) |
743 (lo & UCONV_U8_BIT_MASK);
744 }
745 }
746
747 if (hi >= UCONV_U16_START) {
748 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
749 UCONV_U16_LO_MIN;
750 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
751 UCONV_U16_HI_MIN;
752
753 if ((u16l + 1) >= *utf16len)
754 return (E2BIG);
755
756 if (outendian) {
757 u16s[u16l++] = (uint16_t)hi;
758 u16s[u16l++] = (uint16_t)lo;
759 } else {
760 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
761 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
762 }
763 } else {
764 if (u16l >= *utf16len)
765 return (E2BIG);
766
767 u16s[u16l++] = (outendian) ? (uint16_t)hi :
768 BSWAP_16(((uint16_t)hi));
769 }
770 }
771
772 *utf16len = u16l;
773 *utf8len = u8l;
774
775 return (0);
776 }
777
778 int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)779 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
780 uint32_t *u32s, size_t *utf32len, int flag)
781 {
782 int inendian;
783 int outendian;
784 size_t u32l;
785 size_t u8l;
786 uint32_t hi;
787 uint32_t c;
788 int remaining_bytes;
789 int first_b;
790 boolean_t do_not_ignore_null;
791
792 if (u8s == NULL || utf8len == NULL)
793 return (EILSEQ);
794
795 if (u32s == NULL || utf32len == NULL)
796 return (E2BIG);
797
798 if (check_endian(flag, &inendian, &outendian) != 0)
799 return (EBADF);
800
801 u32l = u8l = 0;
802 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
803
804 outendian &= UCONV_OUT_NAT_ENDIAN;
805
806 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
807 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
808 UCONV_BOM_SWAPPED_32;
809
810 for (; u8l < *utf8len; ) {
811 if (u8s[u8l] == 0 && do_not_ignore_null)
812 break;
813
814 hi = (uint32_t)u8s[u8l++];
815
816 if (hi > UCONV_ASCII_MAX) {
817 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
818 return (EILSEQ);
819
820 first_b = hi;
821 hi = hi & u8_masks_tbl[remaining_bytes];
822
823 for (; remaining_bytes > 0; remaining_bytes--) {
824 if (u8l >= *utf8len)
825 return (EINVAL);
826
827 c = (uint32_t)u8s[u8l++];
828
829 if (first_b) {
830 if (c < valid_min_2nd_byte[first_b] ||
831 c > valid_max_2nd_byte[first_b])
832 return (EILSEQ);
833 first_b = 0;
834 } else if (c < UCONV_U8_BYTE_MIN ||
835 c > UCONV_U8_BYTE_MAX) {
836 return (EILSEQ);
837 }
838 hi = (hi << UCONV_U8_BIT_SHIFT) |
839 (c & UCONV_U8_BIT_MASK);
840 }
841 }
842
843 if (u32l >= *utf32len)
844 return (E2BIG);
845
846 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
847 }
848
849 *utf32len = u32l;
850 *utf8len = u8l;
851
852 return (0);
853 }
854