1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 /*
29 * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32.
30 * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517)
31 * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F),
32 * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also
33 * the section 3C man pages.
34 * Interface stability: Committed
35 */
36
37 #include <sys/types.h>
38 #ifdef _KERNEL
39 #include <sys/param.h>
40 #include <sys/sysmacros.h>
41 #include <sys/systm.h>
42 #include <sys/debug.h>
43 #include <sys/kmem.h>
44 #include <sys/sunddi.h>
45 #else
46 #include <sys/u8_textprep.h>
47 #endif /* _KERNEL */
48 #include <sys/byteorder.h>
49 #include <sys/errno.h>
50
51
/*
 * UTF-16 surrogate ranges: a high (leading) surrogate lies in
 * [UCONV_U16_HI_MIN, UCONV_U16_HI_MAX] and a low (trailing) surrogate in
 * [UCONV_U16_LO_MIN, UCONV_U16_LO_MAX].
 *
 * Despite its name, UCONV_U16_BIT_SHIFT is not a shift count: it is the
 * value 0x400 (2^10) that the converters below multiply, divide, and take
 * the remainder by in order to combine or split the two 10-bit halves of
 * a surrogate pair. UCONV_U16_BIT_MASK keeps the 20 bits assembled from
 * a pair, and UCONV_U16_START (U+10000) is the first code point outside
 * of the BMP.
 */
#define UCONV_U16_HI_MIN (0xd800U)
#define UCONV_U16_HI_MAX (0xdbffU)
#define UCONV_U16_LO_MIN (0xdc00U)
#define UCONV_U16_LO_MAX (0xdfffU)
#define UCONV_U16_BIT_SHIFT (0x0400U)
#define UCONV_U16_BIT_MASK (0x0fffffU)
#define UCONV_U16_START (0x010000U)

/* The maximum value of Unicode coding space and ASCII coding space. */
#define UCONV_UNICODE_MAX (0x10ffffU)
#define UCONV_ASCII_MAX (0x7fU)

/*
 * The mask values for input and output endians. Each mask is the union of
 * the big and little endian flag for one direction; the individual
 * UCONV_IN_xxx and UCONV_OUT_xxx flags are not defined in this file
 * (presumably they come from one of the headers included above).
 */
#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN)
#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN)

/*
 * Native and reversed endian macros. The xxx_NAT_ENDIAN flags name the
 * byte order of the build target (selected by _BIG_ENDIAN) and
 * the xxx_REV_ENDIAN flags name the opposite byte order.
 */
#ifdef _BIG_ENDIAN
#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN
#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN
#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN
#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN
#else
#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN
#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN
#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN
#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN
#endif /* _BIG_ENDIAN */

/*
 * The Byte Order Mark (BOM) character in normal and reversed byte orderings.
 * UCONV_BOM_SWAPPED is U+FEFF byte-swapped as a 16-bit unit and
 * UCONV_BOM_SWAPPED_32 is U+FEFF byte-swapped as a 32-bit unit.
 */
#define UCONV_BOM_NORMAL (0xfeffU)
#define UCONV_BOM_SWAPPED (0xfffeU)
#define UCONV_BOM_SWAPPED_32 (0xfffe0000U)

/*
 * UTF-32 boundaries based on UTF-8 character byte lengths: each value is
 * the largest scalar value that the encoders below emit with that many
 * UTF-8 bytes.
 */
#define UCONV_U8_ONE_BYTE (0x7fU)
#define UCONV_U8_TWO_BYTES (0x7ffU)
#define UCONV_U8_THREE_BYTES (0xffffU)
#define UCONV_U8_FOUR_BYTES (0x10ffffU)

/*
 * The common minimum and maximum values of the UTF-8 character bytes that
 * follow the first byte, i.e., the "10xx xxxx" bytes.
 */
#define UCONV_U8_BYTE_MIN (0x80U)
#define UCONV_U8_BYTE_MAX (0xbfU)

/*
 * The following "6" and "0x3f" came from "10xx xxxx" bit representation of
 * UTF-8 character bytes: each such byte carries six payload bits.
 */
#define UCONV_U8_BIT_SHIFT 6
#define UCONV_U8_BIT_MASK 0x3f
106
/*
 * The following vector gives the number of bytes that must follow
 * the first byte of a UTF-8 character. The index is the value of that
 * first byte.
 *
 * The UTF-8 decoders below only consult the table for bytes bigger than
 * UCONV_ASCII_MAX and treat a zero entry as an illegal first byte; that is
 * how the bytes 0x80 - 0xC1 and 0xF5 - 0xFF are rejected.
 */
static const uchar_t remaining_bytes_tbl[0x100] = {
	/* 00 - BF: ASCII bytes and bytes that cannot start a character */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */
	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */
	3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
137
/*
 * The following is a vector of bit-masks used to extract the payload bits
 * from the first byte of a UTF-8 character. The index is the number of
 * bytes that follow the first byte, i.e., the value found in
 * remaining_bytes_tbl for that first byte.
 *
 * The table has entries for up to five following bytes although
 * remaining_bytes_tbl never yields a value bigger than three.
 *
 * NOTE(review): the kernel build gives this table external linkage,
 * presumably because it is shared with other kernel code -- confirm before
 * changing its name or contents.
 */
#ifdef _KERNEL
const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
#else
static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
#endif /* _KERNEL */
148
/*
 * The following two vectors provide the valid minimum and maximum values
 * for the second byte of a multibyte UTF-8 character for better illegal
 * sequence checking. The index value must be the value of the first byte
 * of the UTF-8 character.
 *
 * This vector holds the minimum values. Most first bytes allow the common
 * minimum of 0x80; the first bytes 0xE0 and 0xF0 have the raised minimums
 * 0xA0 and 0x90, which rules out three and four byte sequences that encode
 * a value small enough to fit in a shorter sequence.
 *
 * Entries for bytes that cannot start a multibyte character are zero;
 * the decoders below reject such first bytes through remaining_bytes_tbl
 * before they consult this vector.
 */
static const uchar_t valid_min_2nd_byte[0x100] = {
	/* 00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 C1 C2 C3 C4 C5 C6 C7 */
	0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* C8 C9 CA CB CC CD CE CF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D0 D1 D2 D3 D4 D5 D6 D7 */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* D8 D9 DA DB DC DD DE DF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E0 E1 E2 E3 E4 E5 E6 E7 */
	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* E8 E9 EA EB EC ED EE EF */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* F0 F1 F2 F3 F4 F5 F6 F7 */
	0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0,

	/* F8 F9 FA FB FC FD FE FF */
	0, 0, 0, 0, 0, 0, 0, 0
};
204
/*
 * The valid maximum value for the second byte of a multibyte UTF-8
 * character, indexed by the value of the first byte of the character.
 *
 * Most first bytes allow the common maximum of 0xBF. The first byte 0xED
 * is limited to 0x9F, which keeps the UTF-16 surrogate values
 * (U+D800 - U+DFFF) out, and the first byte 0xF4 is limited to 0x8F, which
 * keeps out anything bigger than U+10FFFF.
 *
 * Entries for bytes that cannot start a multibyte character are zero;
 * the decoders below reject such first bytes through remaining_bytes_tbl
 * before they consult this vector.
 */
static const uchar_t valid_max_2nd_byte[0x100] = {
	/* 00 - BF */
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,

	/* C0 C1 C2 C3 C4 C5 C6 C7 */
	0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* C8 C9 CA CB CC CD CE CF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D0 D1 D2 D3 D4 D5 D6 D7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* D8 D9 DA DB DC DD DE DF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E0 E1 E2 E3 E4 E5 E6 E7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* E8 E9 EA EB EC ED EE EF */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,

	/* F0 F1 F2 F3 F4 F5 F6 F7 */
	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0,

	/* F8 F9 FA FB FC FD FE FF */
	0, 0, 0, 0, 0, 0, 0, 0
};
254
255
256 static int
check_endian(int flag,int * in,int * out)257 check_endian(int flag, int *in, int *out)
258 {
259 *in = flag & UCONV_IN_ENDIAN_MASKS;
260
261 /* You cannot have both. */
262 if (*in == UCONV_IN_ENDIAN_MASKS)
263 return (EBADF);
264
265 if (*in == 0)
266 *in = UCONV_IN_NAT_ENDIAN;
267
268 *out = flag & UCONV_OUT_ENDIAN_MASKS;
269
270 /* You cannot have both. */
271 if (*out == UCONV_OUT_ENDIAN_MASKS)
272 return (EBADF);
273
274 if (*out == 0)
275 *out = UCONV_OUT_NAT_ENDIAN;
276
277 return (0);
278 }
279
280 static boolean_t
check_bom16(const uint16_t * u16s,size_t u16l,int * in)281 check_bom16(const uint16_t *u16s, size_t u16l, int *in)
282 {
283 if (u16l > 0) {
284 if (*u16s == UCONV_BOM_NORMAL) {
285 *in = UCONV_IN_NAT_ENDIAN;
286 return (B_TRUE);
287 }
288 if (*u16s == UCONV_BOM_SWAPPED) {
289 *in = UCONV_IN_REV_ENDIAN;
290 return (B_TRUE);
291 }
292 }
293
294 return (B_FALSE);
295 }
296
297 static boolean_t
check_bom32(const uint32_t * u32s,size_t u32l,int * in)298 check_bom32(const uint32_t *u32s, size_t u32l, int *in)
299 {
300 if (u32l > 0) {
301 if (*u32s == UCONV_BOM_NORMAL) {
302 *in = UCONV_IN_NAT_ENDIAN;
303 return (B_TRUE);
304 }
305 if (*u32s == UCONV_BOM_SWAPPED_32) {
306 *in = UCONV_IN_REV_ENDIAN;
307 return (B_TRUE);
308 }
309 }
310
311 return (B_FALSE);
312 }
313
314 int
uconv_u16tou32(const uint16_t * u16s,size_t * utf16len,uint32_t * u32s,size_t * utf32len,int flag)315 uconv_u16tou32(const uint16_t *u16s, size_t *utf16len,
316 uint32_t *u32s, size_t *utf32len, int flag)
317 {
318 int inendian;
319 int outendian;
320 size_t u16l;
321 size_t u32l;
322 uint32_t hi;
323 uint32_t lo;
324 boolean_t do_not_ignore_null;
325
326 /*
327 * Do preliminary validity checks on parameters and collect info on
328 * endians.
329 */
330 if (u16s == NULL || utf16len == NULL)
331 return (EILSEQ);
332
333 if (u32s == NULL || utf32len == NULL)
334 return (E2BIG);
335
336 if (check_endian(flag, &inendian, &outendian) != 0)
337 return (EBADF);
338
339 /*
340 * Initialize input and output parameter buffer indices and
341 * temporary variables.
342 */
343 u16l = u32l = 0;
344 hi = 0;
345 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
346
347 /*
348 * Check on the BOM at the beginning of the input buffer if required
349 * and if there is indeed one, process it.
350 */
351 if ((flag & UCONV_IN_ACCEPT_BOM) &&
352 check_bom16(u16s, *utf16len, &inendian))
353 u16l++;
354
355 /*
356 * Reset inendian and outendian so that after this point, those can be
357 * used as condition values.
358 */
359 inendian &= UCONV_IN_NAT_ENDIAN;
360 outendian &= UCONV_OUT_NAT_ENDIAN;
361
362 /*
363 * If there is something in the input buffer and if necessary and
364 * requested, save the BOM at the output buffer.
365 */
366 if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
367 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
368 UCONV_BOM_SWAPPED_32;
369
370 /*
371 * Do conversion; if encounter a surrogate pair, assemble high and
372 * low pair values to form a UTF-32 character. If a half of a pair
373 * exists alone, then, either it is an illegal (EILSEQ) or
374 * invalid (EINVAL) value.
375 */
376 for (; u16l < *utf16len; u16l++) {
377 if (u16s[u16l] == 0 && do_not_ignore_null)
378 break;
379
380 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
381
382 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
383 if (hi)
384 return (EILSEQ);
385 hi = lo;
386 continue;
387 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
388 if (! hi)
389 return (EILSEQ);
390 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
391 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
392 + UCONV_U16_START;
393 hi = 0;
394 } else if (hi) {
395 return (EILSEQ);
396 }
397
398 if (u32l >= *utf32len)
399 return (E2BIG);
400
401 u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo);
402 }
403
404 /*
405 * If high half didn't see low half, then, it's most likely the input
406 * parameter is incomplete.
407 */
408 if (hi)
409 return (EINVAL);
410
411 /*
412 * Save the number of consumed and saved characters. They do not
413 * include terminating NULL character (U+0000) at the end of
414 * the input buffer (even when UCONV_IGNORE_NULL isn't specified and
415 * the input buffer length is big enough to include the terminating
416 * NULL character).
417 */
418 *utf16len = u16l;
419 *utf32len = u32l;
420
421 return (0);
422 }
423
424 int
uconv_u16tou8(const uint16_t * u16s,size_t * utf16len,uchar_t * u8s,size_t * utf8len,int flag)425 uconv_u16tou8(const uint16_t *u16s, size_t *utf16len,
426 uchar_t *u8s, size_t *utf8len, int flag)
427 {
428 int inendian;
429 int outendian;
430 size_t u16l;
431 size_t u8l;
432 uint32_t hi;
433 uint32_t lo;
434 boolean_t do_not_ignore_null;
435
436 if (u16s == NULL || utf16len == NULL)
437 return (EILSEQ);
438
439 if (u8s == NULL || utf8len == NULL)
440 return (E2BIG);
441
442 if (check_endian(flag, &inendian, &outendian) != 0)
443 return (EBADF);
444
445 u16l = u8l = 0;
446 hi = 0;
447 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
448
449 if ((flag & UCONV_IN_ACCEPT_BOM) &&
450 check_bom16(u16s, *utf16len, &inendian))
451 u16l++;
452
453 inendian &= UCONV_IN_NAT_ENDIAN;
454
455 for (; u16l < *utf16len; u16l++) {
456 if (u16s[u16l] == 0 && do_not_ignore_null)
457 break;
458
459 lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l]));
460
461 if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) {
462 if (hi)
463 return (EILSEQ);
464 hi = lo;
465 continue;
466 } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) {
467 if (! hi)
468 return (EILSEQ);
469 lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT +
470 lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK)
471 + UCONV_U16_START;
472 hi = 0;
473 } else if (hi) {
474 return (EILSEQ);
475 }
476
477 /*
478 * Now we convert a UTF-32 character into a UTF-8 character.
479 * Unicode coding space is between U+0000 and U+10FFFF;
480 * anything bigger is an illegal character.
481 */
482 if (lo <= UCONV_U8_ONE_BYTE) {
483 if (u8l >= *utf8len)
484 return (E2BIG);
485 u8s[u8l++] = (uchar_t)lo;
486 } else if (lo <= UCONV_U8_TWO_BYTES) {
487 if ((u8l + 1) >= *utf8len)
488 return (E2BIG);
489 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
490 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
491 } else if (lo <= UCONV_U8_THREE_BYTES) {
492 if ((u8l + 2) >= *utf8len)
493 return (E2BIG);
494 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
495 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
496 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
497 } else if (lo <= UCONV_U8_FOUR_BYTES) {
498 if ((u8l + 3) >= *utf8len)
499 return (E2BIG);
500 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
501 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
502 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
503 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
504 } else {
505 return (EILSEQ);
506 }
507 }
508
509 if (hi)
510 return (EINVAL);
511
512 *utf16len = u16l;
513 *utf8len = u8l;
514
515 return (0);
516 }
517
518 int
uconv_u32tou16(const uint32_t * u32s,size_t * utf32len,uint16_t * u16s,size_t * utf16len,int flag)519 uconv_u32tou16(const uint32_t *u32s, size_t *utf32len,
520 uint16_t *u16s, size_t *utf16len, int flag)
521 {
522 int inendian;
523 int outendian;
524 size_t u16l;
525 size_t u32l;
526 uint32_t hi;
527 uint32_t lo;
528 boolean_t do_not_ignore_null;
529
530 if (u32s == NULL || utf32len == NULL)
531 return (EILSEQ);
532
533 if (u16s == NULL || utf16len == NULL)
534 return (E2BIG);
535
536 if (check_endian(flag, &inendian, &outendian) != 0)
537 return (EBADF);
538
539 u16l = u32l = 0;
540 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
541
542 if ((flag & UCONV_IN_ACCEPT_BOM) &&
543 check_bom32(u32s, *utf32len, &inendian))
544 u32l++;
545
546 inendian &= UCONV_IN_NAT_ENDIAN;
547 outendian &= UCONV_OUT_NAT_ENDIAN;
548
549 if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
550 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
551 UCONV_BOM_SWAPPED;
552
553 for (; u32l < *utf32len; u32l++) {
554 if (u32s[u32l] == 0 && do_not_ignore_null)
555 break;
556
557 hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
558
559 /*
560 * Anything bigger than the Unicode coding space, i.e.,
561 * Unicode scalar value bigger than U+10FFFF, is an illegal
562 * character.
563 */
564 if (hi > UCONV_UNICODE_MAX)
565 return (EILSEQ);
566
567 /*
568 * Anything bigger than U+FFFF must be converted into
569 * a surrogate pair in UTF-16.
570 */
571 if (hi >= UCONV_U16_START) {
572 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
573 UCONV_U16_LO_MIN;
574 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
575 UCONV_U16_HI_MIN;
576
577 if ((u16l + 1) >= *utf16len)
578 return (E2BIG);
579
580 if (outendian) {
581 u16s[u16l++] = (uint16_t)hi;
582 u16s[u16l++] = (uint16_t)lo;
583 } else {
584 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
585 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
586 }
587 } else {
588 if (u16l >= *utf16len)
589 return (E2BIG);
590 u16s[u16l++] = (outendian) ? (uint16_t)hi :
591 BSWAP_16(((uint16_t)hi));
592 }
593 }
594
595 *utf16len = u16l;
596 *utf32len = u32l;
597
598 return (0);
599 }
600
601 int
uconv_u32tou8(const uint32_t * u32s,size_t * utf32len,uchar_t * u8s,size_t * utf8len,int flag)602 uconv_u32tou8(const uint32_t *u32s, size_t *utf32len,
603 uchar_t *u8s, size_t *utf8len, int flag)
604 {
605 int inendian;
606 int outendian;
607 size_t u32l;
608 size_t u8l;
609 uint32_t lo;
610 boolean_t do_not_ignore_null;
611
612 if (u32s == NULL || utf32len == NULL)
613 return (EILSEQ);
614
615 if (u8s == NULL || utf8len == NULL)
616 return (E2BIG);
617
618 if (check_endian(flag, &inendian, &outendian) != 0)
619 return (EBADF);
620
621 u32l = u8l = 0;
622 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
623
624 if ((flag & UCONV_IN_ACCEPT_BOM) &&
625 check_bom32(u32s, *utf32len, &inendian))
626 u32l++;
627
628 inendian &= UCONV_IN_NAT_ENDIAN;
629
630 for (; u32l < *utf32len; u32l++) {
631 if (u32s[u32l] == 0 && do_not_ignore_null)
632 break;
633
634 lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]);
635
636 if (lo <= UCONV_U8_ONE_BYTE) {
637 if (u8l >= *utf8len)
638 return (E2BIG);
639 u8s[u8l++] = (uchar_t)lo;
640 } else if (lo <= UCONV_U8_TWO_BYTES) {
641 if ((u8l + 1) >= *utf8len)
642 return (E2BIG);
643 u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6));
644 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f));
645 } else if (lo <= UCONV_U8_THREE_BYTES) {
646 if ((u8l + 2) >= *utf8len)
647 return (E2BIG);
648 u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12));
649 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6));
650 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f));
651 } else if (lo <= UCONV_U8_FOUR_BYTES) {
652 if ((u8l + 3) >= *utf8len)
653 return (E2BIG);
654 u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18));
655 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12));
656 u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6));
657 u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f));
658 } else {
659 return (EILSEQ);
660 }
661 }
662
663 *utf32len = u32l;
664 *utf8len = u8l;
665
666 return (0);
667 }
668
669 int
uconv_u8tou16(const uchar_t * u8s,size_t * utf8len,uint16_t * u16s,size_t * utf16len,int flag)670 uconv_u8tou16(const uchar_t *u8s, size_t *utf8len,
671 uint16_t *u16s, size_t *utf16len, int flag)
672 {
673 int inendian;
674 int outendian;
675 size_t u16l;
676 size_t u8l;
677 uint32_t hi;
678 uint32_t lo;
679 int remaining_bytes;
680 int first_b;
681 boolean_t do_not_ignore_null;
682
683 if (u8s == NULL || utf8len == NULL)
684 return (EILSEQ);
685
686 if (u16s == NULL || utf16len == NULL)
687 return (E2BIG);
688
689 if (check_endian(flag, &inendian, &outendian) != 0)
690 return (EBADF);
691
692 u16l = u8l = 0;
693 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
694
695 outendian &= UCONV_OUT_NAT_ENDIAN;
696
697 if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM))
698 u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL :
699 UCONV_BOM_SWAPPED;
700
701 for (; u8l < *utf8len; ) {
702 if (u8s[u8l] == 0 && do_not_ignore_null)
703 break;
704
705 /*
706 * Collect a UTF-8 character and convert it to a UTF-32
707 * character. In doing so, we screen out illegally formed
708 * UTF-8 characters and treat such as illegal characters.
709 * The algorithm at below also screens out anything bigger
710 * than the U+10FFFF.
711 *
712 * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for
713 * more details on the illegal values of UTF-8 character
714 * bytes.
715 */
716 hi = (uint32_t)u8s[u8l++];
717
718 if (hi > UCONV_ASCII_MAX) {
719 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
720 return (EILSEQ);
721
722 first_b = hi;
723 hi = hi & u8_masks_tbl[remaining_bytes];
724
725 for (; remaining_bytes > 0; remaining_bytes--) {
726 /*
727 * If we have no more bytes, the current
728 * UTF-8 character is incomplete.
729 */
730 if (u8l >= *utf8len)
731 return (EINVAL);
732
733 lo = (uint32_t)u8s[u8l++];
734
735 if (first_b) {
736 if (lo < valid_min_2nd_byte[first_b] ||
737 lo > valid_max_2nd_byte[first_b])
738 return (EILSEQ);
739 first_b = 0;
740 } else if (lo < UCONV_U8_BYTE_MIN ||
741 lo > UCONV_U8_BYTE_MAX) {
742 return (EILSEQ);
743 }
744 hi = (hi << UCONV_U8_BIT_SHIFT) |
745 (lo & UCONV_U8_BIT_MASK);
746 }
747 }
748
749 if (hi >= UCONV_U16_START) {
750 lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) +
751 UCONV_U16_LO_MIN;
752 hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) +
753 UCONV_U16_HI_MIN;
754
755 if ((u16l + 1) >= *utf16len)
756 return (E2BIG);
757
758 if (outendian) {
759 u16s[u16l++] = (uint16_t)hi;
760 u16s[u16l++] = (uint16_t)lo;
761 } else {
762 u16s[u16l++] = BSWAP_16(((uint16_t)hi));
763 u16s[u16l++] = BSWAP_16(((uint16_t)lo));
764 }
765 } else {
766 if (u16l >= *utf16len)
767 return (E2BIG);
768
769 u16s[u16l++] = (outendian) ? (uint16_t)hi :
770 BSWAP_16(((uint16_t)hi));
771 }
772 }
773
774 *utf16len = u16l;
775 *utf8len = u8l;
776
777 return (0);
778 }
779
780 int
uconv_u8tou32(const uchar_t * u8s,size_t * utf8len,uint32_t * u32s,size_t * utf32len,int flag)781 uconv_u8tou32(const uchar_t *u8s, size_t *utf8len,
782 uint32_t *u32s, size_t *utf32len, int flag)
783 {
784 int inendian;
785 int outendian;
786 size_t u32l;
787 size_t u8l;
788 uint32_t hi;
789 uint32_t c;
790 int remaining_bytes;
791 int first_b;
792 boolean_t do_not_ignore_null;
793
794 if (u8s == NULL || utf8len == NULL)
795 return (EILSEQ);
796
797 if (u32s == NULL || utf32len == NULL)
798 return (E2BIG);
799
800 if (check_endian(flag, &inendian, &outendian) != 0)
801 return (EBADF);
802
803 u32l = u8l = 0;
804 do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0);
805
806 outendian &= UCONV_OUT_NAT_ENDIAN;
807
808 if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM))
809 u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL :
810 UCONV_BOM_SWAPPED_32;
811
812 for (; u8l < *utf8len; ) {
813 if (u8s[u8l] == 0 && do_not_ignore_null)
814 break;
815
816 hi = (uint32_t)u8s[u8l++];
817
818 if (hi > UCONV_ASCII_MAX) {
819 if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0)
820 return (EILSEQ);
821
822 first_b = hi;
823 hi = hi & u8_masks_tbl[remaining_bytes];
824
825 for (; remaining_bytes > 0; remaining_bytes--) {
826 if (u8l >= *utf8len)
827 return (EINVAL);
828
829 c = (uint32_t)u8s[u8l++];
830
831 if (first_b) {
832 if (c < valid_min_2nd_byte[first_b] ||
833 c > valid_max_2nd_byte[first_b])
834 return (EILSEQ);
835 first_b = 0;
836 } else if (c < UCONV_U8_BYTE_MIN ||
837 c > UCONV_U8_BYTE_MAX) {
838 return (EILSEQ);
839 }
840 hi = (hi << UCONV_U8_BIT_SHIFT) |
841 (c & UCONV_U8_BIT_MASK);
842 }
843 }
844
845 if (u32l >= *utf32len)
846 return (E2BIG);
847
848 u32s[u32l++] = (outendian) ? hi : BSWAP_32(hi);
849 }
850
851 *utf32len = u32l;
852 *utf8len = u8l;
853
854 return (0);
855 }
856