1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * COPYRIGHT AND PERMISSION NOTICE
23 *
24 * Copyright (c) 1991-2005 Unicode, Inc. All rights reserved. Distributed
25 * under the Terms of Use in http://www.unicode.org/copyright.html.
26 *
27 * This file has been modified by Sun Microsystems, Inc.
28 */
29 /*
30 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
31 * Use is subject to license terms.
32 */
33
34
35 #include <sys/types.h>
36
37 #if defined(JFP_ICONV_FROMCODE_UTF32BE)||defined(JFP_ICONV_FROMCODE_UTF32LE)
38 #define JFP_ICONV_FROMCODE_UTF32
39 #endif
40
41 #if defined(JFP_ICONV_FROMCODE_UTF16BE)||defined(JFP_ICONV_FROMCODE_UTF16LE)
42 #define JFP_ICONV_FROMCODE_UTF16
43 #endif
44
45 #if defined(JFP_ICONV_FROMCODE_UCS2BE)||defined(JFP_ICONV_FROMCODE_UCS2LE)
46 #define JFP_ICONV_FROMCODE_UCS2
47 #endif
48
49 #if defined(JFP_ICONV_TOCODE_UTF32BE)||defined(JFP_ICONV_TOCODE_UTF32LE)
50 #define JFP_ICONV_TOCODE_UTF32
51 #endif
52
53 #if defined(JFP_ICONV_TOCODE_UTF16BE)||defined(JFP_ICONV_TOCODE_UTF16LE)
54 #define JFP_ICONV_TOCODE_UTF16
55 #endif
56
57 #if defined(JFP_ICONV_TOCODE_UCS2BE)||defined(JFP_ICONV_TOCODE_UCS2LE)
58 #define JFP_ICONV_TOCODE_UCS2
59 #endif
60
61
62 #define BOM 0xfeff
63 #define BSBOM16 0xfffe
64 #define BSBOM32 0xfffe0000
65 #define REPLACE 0xfffd
66 #define IFHISUR(x) ((0xd800 <= (x)) && ((x) <= 0xdbff))
67 #define IFLOSUR(x) ((0xdc00 <= (x)) && ((x) <= 0xdfff))
68
69 typedef struct {
70 boolean_t bom_written;
71 boolean_t little_endian;
72 } ucs_state_t;
73
74
75 #if defined(JFP_ICONV_FROMCODE_UTF32)
76
77 static size_t /* return #bytes read, or -1 */
read_unicode(unsigned int * p,unsigned char ** pip,size_t * pileft,ucs_state_t * state)78 read_unicode(
79 unsigned int *p, /* point variable to store UTF-32 */
80 unsigned char **pip, /* point pointer to input buf */
81 size_t *pileft, /* point #bytes left in input buf */
82 ucs_state_t *state) /* BOM state and endian */
83 {
84 unsigned char *ip = *pip;
85 size_t ileft = *pileft;
86 size_t rv = (size_t)0; /* return value */
87 unsigned char ic1, ic2, ic3, ic4; /* bytes read */
88 unsigned int u32; /* resulted UTF-32 */
89
90 NGET(ic1, "UTF32-1");
91 NGET(ic2, "UTF32-2");
92 NGET(ic3, "UTF32-3");
93 NGET(ic4, "UTF32-4");
94
95 if (state->bom_written == B_FALSE) {
96 u32 = 0U;
97 u32 |= (unsigned int)ic1 << 24;
98 u32 |= (unsigned int)ic2 << 16;
99 u32 |= (unsigned int)ic3 << 8;
100 u32 |= (unsigned int)ic4 << 0;
101 if (u32 == BOM) {
102 state->bom_written = B_TRUE;
103 state->little_endian = B_FALSE;
104 *p = BOM;
105 rv = (size_t)0;
106 goto ret;
107 } else if (u32 == BSBOM32) {
108 state->bom_written = B_TRUE;
109 state->little_endian = B_TRUE;
110 *p = BOM;
111 rv = (size_t)0;
112 goto ret;
113 } else {
114 state->bom_written = B_TRUE;
115 }
116 }
117
118 if (state->little_endian == B_TRUE) {
119 u32 = 0U;
120 u32 |= (unsigned int)ic1 << 0;
121 u32 |= (unsigned int)ic2 << 8;
122 u32 |= (unsigned int)ic3 << 16;
123 u32 |= (unsigned int)ic4 << 24;
124 } else {
125 u32 = 0U;
126 u32 |= (unsigned int)ic1 << 24;
127 u32 |= (unsigned int)ic2 << 16;
128 u32 |= (unsigned int)ic3 << 8;
129 u32 |= (unsigned int)ic4 << 0;
130 }
131
132 if (u32 == BSBOM32) {
133 RETERROR(EILSEQ, "byte-swapped BOM detected")
134 }
135
136 if ((u32 == 0xfffe) || (u32 == 0xffff) || (u32 > 0x10ffff)
137 || IFHISUR(u32) || IFLOSUR(u32)) {
138 RETERROR(EILSEQ, "illegal in UTF-32")
139 }
140
141 *p = u32;
142 rv = *pileft - ileft;
143
144 ret:
145 if (rv != (size_t)-1) {
146 /* update *pip and *pileft only on successful return */
147 *pip = ip;
148 *pileft = ileft;
149 }
150
151 return (rv);
152 }
153
154 #elif defined(JFP_ICONV_FROMCODE_UTF16) || defined(JFP_ICONV_FROMCODE_UCS2)
155
156 static size_t /* return #bytes read, or -1 */
read_unicode(unsigned int * p,unsigned char ** pip,size_t * pileft,ucs_state_t * state)157 read_unicode(
158 unsigned int *p, /* point variable to store UTF-32 */
159 unsigned char **pip, /* point pointer to input buf */
160 size_t *pileft, /* point #bytes left in input buf */
161 ucs_state_t *state) /* BOM state and endian */
162 {
163 unsigned char *ip = *pip;
164 size_t ileft = *pileft;
165 size_t rv = (size_t)0; /* return value */
166 unsigned char ic1, ic2; /* bytes read */
167 unsigned int u32; /* resulted UTF-32 */
168 #ifndef JFP_ICONV_FROMCODE_UCS2
169 unsigned int losur; /* low surrogate */
170 #endif
171
172 NGET(ic1, "UTF16-1"); /* read 1st byte */
173 NGET(ic2, "UTF16-2"); /* read 2nd byte */
174
175 if (state->bom_written == B_FALSE) {
176 u32 = 0U;
177 u32 |= (unsigned int)ic1 << 8;
178 u32 |= (unsigned int)ic2 << 0;
179 if (u32 == BOM) {
180 state->bom_written = B_TRUE;
181 state->little_endian = B_FALSE;
182 *p = BOM;
183 rv = (size_t)0;
184 goto ret;
185 } else if (u32 == BSBOM16) {
186 state->bom_written = B_TRUE;
187 state->little_endian = B_TRUE;
188 *p = BOM;
189 rv = (size_t)0;
190 goto ret;
191 } else {
192 state->bom_written = B_TRUE;
193 }
194 }
195
196 if (state->little_endian == B_TRUE) {
197 u32 = (((unsigned int)ic2) << 8) | ic1;
198 } else {
199 u32 = (((unsigned int)ic1) << 8) | ic2;
200 }
201
202 if (u32 == BSBOM16) {
203 RETERROR(EILSEQ, "byte-swapped BOM detected")
204 }
205
206 if ((u32 == 0xfffe) || (u32 == 0xffff) || (u32 > 0x10ffff)
207 || (IFLOSUR(u32))) {
208 RETERROR(EILSEQ, "illegal in UTF16")
209 }
210
211 if (IFHISUR(u32)) {
212 #if defined(JFP_ICONV_FROMCODE_UCS2)
213 RETERROR(EILSEQ, "surrogate is illegal in UCS2")
214 #else /* !defined(JFP_ICONV_FROMCODE_UCS2) */
215 NGET(ic1, "LOSUR-1");
216 NGET(ic2, "LOSUR-2");
217
218 if (state->little_endian == B_TRUE) {
219 losur = (((unsigned int)ic2) << 8) | ic1;
220 } else {
221 losur = (((unsigned int)ic1) << 8) | ic2;
222 }
223
224 if (IFLOSUR(losur)) {
225 u32 = ((u32 - 0xd800) * 0x400)
226 + (losur - 0xdc00) + 0x10000;
227 } else {
228 RETERROR(EILSEQ, "low-surrogate expected")
229 }
230 #endif /* defined(JFP_ICONV_FROMCODE_UCS2) */
231 }
232
233 *p = u32;
234 rv = *pileft - ileft;
235
236 ret:
237 if (rv != (size_t)-1) {
238 /* update *pip and *pileft only on successful return */
239 *pip = ip;
240 *pileft = ileft;
241 }
242
243 return (rv);
244 }
245
246 #else /* JFP_ICONV_FROMCODE_UTF8 (default) */
247
/*
 * Number of bytes that follow the first byte of a UTF-8 character,
 * indexed by the value of that first byte.  An entry of 0 for a byte of
 * 0x80 or above marks a byte that cannot start a character.
 */
static const char remaining_bytes_tbl[0x100] = {
	/* 0x00 - 0x7f: single byte characters */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0x80 - 0xbf: not valid as a first byte */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xc0 - 0xc1: not valid as a first byte */
	0, 0,

	/* 0xc2 - 0xdf: one byte follows */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* 0xe0 - 0xef: two bytes follow */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	/* 0xf0 - 0xf4: three bytes follow */
	3, 3, 3, 3, 3,

	/* 0xf5 - 0xff: not valid as a first byte */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
278
279
/*
 * Bit-masks that select the payload bits of the first byte of a UTF-8
 * character.  The index is the number of bytes that follow the first
 * byte, i.e. the value taken from remaining_bytes_tbl[].
 */
static const char masks_tbl[6] = {
	0x00,		/* 0 bytes follow */
	0x1f,		/* 1 byte follows */
	0x0f,		/* 2 bytes follow */
	0x07,		/* 3 bytes follow */
	0x03,		/* 4 bytes follow */
	0x01,		/* 5 bytes follow */
};
286
287
/*
 * Smallest value accepted for the 2nd byte of a multibyte UTF-8
 * character, indexed by the value of the 1st byte.  It is used together
 * with valid_max_2nd_byte[] for stricter illegal sequence checking than
 * the plain 0x80 - 0xbf test applied to later bytes.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 0x00 - 0xbf: not the 1st byte of a multibyte character */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xc0 - 0xc1 */
	0, 0,

	/* 0xc2 - 0xdf */
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80, 0x80,

	/* 0xe0: a smaller 2nd byte would encode a value below 0x800 */
	0xa0,

	/* 0xe1 - 0xef */
	0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80,
	0x80, 0x80, 0x80, 0x80, 0x80,

	/* 0xf0: a smaller 2nd byte would encode a value below 0x10000 */
	0x90,

	/* 0xf1 - 0xf4 */
	0x80, 0x80, 0x80, 0x80,

	/* 0xf5 - 0xff */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
335
/*
 * Largest value accepted for the 2nd byte of a multibyte UTF-8
 * character, indexed by the value of the 1st byte.
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 0x00 - 0xbf: not the 1st byte of a multibyte character */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xc0 - 0xc1 */
	0, 0,

	/* 0xc2 - 0xdf */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* 0xe0 - 0xec */
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,

	/* 0xed: a larger 2nd byte would encode a surrogate (0xd800+) */
	0x9f,

	/* 0xee - 0xef */
	0xbf, 0xbf,

	/* 0xf0 - 0xf3 */
	0xbf, 0xbf, 0xbf, 0xbf,

	/* 0xf4: a larger 2nd byte would encode a value above 0x10ffff */
	0x8f,

	/* 0xf5 - 0xff */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
377
378 static size_t
utf8_ucs(unsigned int * p,unsigned char ** pip,size_t * pileft)379 utf8_ucs(unsigned int *p, unsigned char **pip, size_t *pileft)
380 {
381 unsigned int l; /* to be copied to *p on successful return */
382 unsigned char ic; /* current byte */
383 unsigned char ic1; /* 1st byte */
384 unsigned char *ip = *pip; /* next byte to read */
385 size_t ileft = *pileft; /* number of bytes available */
386 size_t rv = (size_t)0; /* return value of this function */
387 int remaining_bytes;
388
389 NGET(ic, "no bytes available"); /* read 1st byte */
390 ic1 = ic;
391 l = ic1; /* get bits from 1st byte to UCS value */
392
393 if (ic1 < 0x80) {
394 /* successfully converted */
395 *p = l;
396 rv = *pileft - ileft;
397 goto ret;
398 }
399
400 remaining_bytes = remaining_bytes_tbl[ic1];
401
402 if (remaining_bytes != 0) {
403 l &= masks_tbl[remaining_bytes];
404
405 for (; remaining_bytes > 0; remaining_bytes--) {
406 if (ic1 != 0U) {
407 NGET(ic, "2nd byte of UTF-8");
408 if ((ic < valid_min_2nd_byte[ic1]) ||
409 (ic > valid_max_2nd_byte[ic1])) {
410 RETERROR(EILSEQ, "2nd byte is invalid")
411 }
412 ic1 = 0U; /* 2nd byte check done */
413 } else {
414 NGET(ic, "3rd or later byte of UTF-8");
415 if ((ic < 0x80) || (ic > 0xbf)) {
416 RETERROR(EILSEQ, "3rd or later byte is invalid")
417 }
418 }
419 l = (l << 6) | (ic & 0x3f);
420 }
421
422 /* successfully converted */
423 *p = l;
424 rv = *pileft - ileft;
425 goto ret;
426 } else {
427 RETERROR(EILSEQ, "1st byte is invalid")
428 }
429
430 ret:
431 if (rv != (size_t)-1) {
432 /*
433 * update *pip and *pileft on successful return
434 */
435 *pip = ip;
436 *pileft = ileft;
437 }
438
439 return (rv);
440 }
441
442 /* for UTF-8 */
443 static size_t /* return #bytes read, or -1 */
read_unicode(unsigned int * p,unsigned char ** pip,size_t * pileft,ucs_state_t * state)444 read_unicode(
445 unsigned int *p, /* point variable to store UTF-32 */
446 unsigned char **pip, /* point pointer to input buf */
447 size_t *pileft, /* point #bytes left in input buf */
448 ucs_state_t *state) /* BOM state and endian - unused */
449 {
450 return (utf8_ucs(p, pip, pileft));
451 }
452
453 #endif
454
455 #if defined(JFP_ICONV_TOCODE_UTF32)
456
457 static size_t
write_unicode(unsigned int u32,char ** pop,size_t * poleft,ucs_state_t * state,const char * msg)458 write_unicode(
459 unsigned int u32, /* UTF-32 to write */
460 char **pop, /* point pointer to output buf */
461 size_t *poleft, /* point #bytes left in output buf */
462 ucs_state_t *state, /* BOM state and endian */
463 const char *msg) /* debug message */
464 {
465 char *op = *pop;
466 size_t oleft = *poleft;
467 size_t rv = (size_t)0; /* return value */
468 unsigned char ic1, ic2, ic3, ic4; /* bytes to be written */
469
470 if (state->bom_written == B_FALSE) {
471 if (state->little_endian == B_TRUE) {
472 ic1 = (unsigned char)((BOM >> 0) & 0xff);
473 ic2 = (unsigned char)((BOM >> 8) & 0xff);
474 ic3 = (unsigned char)((BOM >> 16) & 0xff);
475 ic4 = (unsigned char)((BOM >> 24) & 0xff);
476 } else {
477 ic1 = (unsigned char)((BOM >> 24) & 0xff);
478 ic2 = (unsigned char)((BOM >> 16) & 0xff);
479 ic3 = (unsigned char)((BOM >> 8) & 0xff);
480 ic4 = (unsigned char)((BOM >> 0) & 0xff);
481 }
482 rv += 4;
483 NPUT(ic1, "BOM32-1")
484 NPUT(ic2, "BOM32-2")
485 NPUT(ic3, "BOM32-3")
486 NPUT(ic4, "BOM32-4")
487 }
488
489 if (state->little_endian == B_TRUE) {
490 ic1 = (unsigned char)((u32 >> 0) & 0xff);
491 ic2 = (unsigned char)((u32 >> 8) & 0xff);
492 ic3 = (unsigned char)((u32 >> 16) & 0xff);
493 ic4 = (unsigned char)((u32 >> 24) & 0xff);
494 rv += 4;
495 } else {
496 ic1 = (unsigned char)((u32 >> 24) & 0xff);
497 ic2 = (unsigned char)((u32 >> 16) & 0xff);
498 ic3 = (unsigned char)((u32 >> 8) & 0xff);
499 ic4 = (unsigned char)((u32 >> 0) & 0xff);
500 rv += 4;
501 }
502
503 NPUT(ic1, "UTF32-1")
504 NPUT(ic2, "UTF32-2")
505 NPUT(ic3, "UTF32-3")
506 NPUT(ic4, "UTF32-4")
507
508 ret:
509 if (rv != (size_t)-1) {
510 /* update *pop and *poleft only on successful return */
511 *pop = op;
512 *poleft = oleft;
513 if (state->bom_written == B_FALSE)
514 state->bom_written = B_TRUE;
515 }
516
517 return (rv);
518 }
519
520 #elif defined(JFP_ICONV_TOCODE_UTF16) || defined(JFP_ICONV_TOCODE_UCS2)
521
522 static size_t
write_unicode(unsigned int u32,char ** pop,size_t * poleft,ucs_state_t * state,const char * msg)523 write_unicode(
524 unsigned int u32, /* UTF-32 to write */
525 char **pop, /* point pointer to output buf */
526 size_t *poleft, /* point #bytes left in output buf */
527 ucs_state_t *state, /* BOM state and endian */
528 const char *msg) /* debug message */
529 {
530 char *op = *pop;
531 size_t oleft = *poleft;
532 size_t rv = (size_t)0; /* return value */
533 unsigned char ic1, ic2; /* bytes to be written */
534 unsigned int losur = 0U; /* Hi/Lo surrogates */
535
536 if (state->bom_written == B_FALSE) {
537 if (state->little_endian == B_TRUE) {
538 ic1 = (unsigned char)((BOM >> 0) & 0xff);
539 ic2 = (unsigned char)((BOM >> 8) & 0xff);
540 } else {
541 ic1 = (unsigned char)((BOM >> 8) & 0xff);
542 ic2 = (unsigned char)((BOM >> 0) & 0xff);
543 }
544 rv += 2;
545 NPUT(ic1, "BOM16-1")
546 NPUT(ic2, "BOM16-2")
547 }
548
549 if (u32 > 0xffff) {
550 #if defined(JFP_ICONV_TOCODE_UCS2)
551 u32 = REPLACE;
552 #else /* !defined(JFP_ICONV_TOCODE_UCS2) */
553 losur = ((u32 - 0x10000) % 0x400) + 0xdc00;
554 u32 = ((u32 - 0x10000) / 0x400) + 0xd800;
555 #endif /* defined(JFP_ICONV_TOCODE_UCS2) */
556 }
557
558 if (state->little_endian == B_TRUE) {
559 ic1 = (unsigned char)(u32 & 0xff);
560 ic2 = (unsigned char)((u32 >> 8) & 0xff);
561 rv += 2;
562 } else {
563 ic1 = (unsigned char)((u32 >> 8) & 0xff);
564 ic2 = (unsigned char)(u32 & 0xff);
565 rv += 2;
566 }
567
568 NPUT(ic1, "UTF16-1")
569 NPUT(ic2, "UTF16-2")
570
571 if (losur != 0U) {
572 if (state->little_endian == B_TRUE) {
573 ic1 = (unsigned char)(losur & 0xff);
574 ic2 = (unsigned char)((losur >> 8) & 0xff);
575 rv += 2;
576 } else {
577 ic1 = (unsigned char)((losur >> 8) & 0xff);
578 ic2 = (unsigned char)(losur & 0xff);
579 rv += 2;
580 }
581
582 NPUT(ic1, "LOSUR-1")
583 NPUT(ic2, "LOSUR-2")
584 }
585
586
587 ret:
588 if (rv != (size_t)-1) {
589 /* update *pop and *poleft only on successful return */
590 *pop = op;
591 *poleft = oleft;
592 if (state->bom_written == B_FALSE)
593 state->bom_written = B_TRUE;
594 }
595
596 return (rv);
597 }
598
599 #else /* JFP_ICONV_TOCODE_UTF8 (default) */
600
601 static size_t
write_unicode(unsigned int u32,char ** pop,size_t * poleft,ucs_state_t * state,const char * msg)602 write_unicode(
603 unsigned int u32, /* UTF-32 to write */
604 char **pop, /* point pointer to output buf */
605 size_t *poleft, /* point #bytes left in output buf */
606 ucs_state_t *state, /* BOM state and endian - unused */
607 const char *msg) /* debug message */
608 {
609 char *op = *pop;
610 size_t oleft = *poleft;
611 size_t rv = 0; /* return value */
612
613 if (u32 <= 0x7f) {
614 NPUT((unsigned char)(u32), msg);
615 rv = 1;
616 } else if (u32 <= 0x7ff) {
617 NPUT((unsigned char)((((u32)>>6) & 0x1f) | 0xc0), msg);
618 NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg);
619 rv = 2;
620 } else if ((u32 >= 0xd800) && (u32 <= 0xdfff)) {
621 RETERROR(EILSEQ, "surrogate in UTF-8")
622 } else if (u32 <= 0xffff) {
623 NPUT((unsigned char)((((u32)>>12) & 0x0f) | 0xe0), msg);
624 NPUT((unsigned char)((((u32)>>6) & 0x3f) | 0x80), msg);
625 NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg);
626 rv = 3;
627 } else if (u32 <= 0x10ffff) {
628 NPUT((unsigned char)((((u32)>>18) & 0x07) | 0xf0), msg);
629 NPUT((unsigned char)((((u32)>>12) & 0x3f) | 0x80), msg);
630 NPUT((unsigned char)((((u32)>>6) & 0x3f) | 0x80), msg);
631 NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg);
632 rv = 4;
633 } else {
634 RETERROR(EILSEQ, "beyond range of UTF-8")
635 }
636
637 ret:
638 if (rv != (size_t)-1) {
639 /* update *pop and *poleft only on successful return */
640 *pop = op;
641 *poleft = oleft;
642 }
643
644 return (rv);
645 }
646
647 #endif
648
/*
 * GETU(pu32): read one character through read_unicode() into *pu32.
 *
 * The expansion expects the following in the scope of the caller:
 * variables ip, ileft, cd and rv, and labels "ret" and "next".
 * It jumps to "ret" with rv set to (size_t)-1 when read_unicode() fails,
 * jumps to "next" with rv set to 0 when read_unicode() returns 0, and
 * falls through otherwise.
 */
#define	GETU(pu32) \
	switch (read_unicode((pu32), &ip, &ileft, (ucs_state_t *)cd)) { \
	case (size_t)0: \
		/* input was consumed inside read_unicode(); */ \
		/* nothing is left for the caller to evaluate */ \
		rv = (size_t)0; \
		goto next; \
	case (size_t)-1: \
		/* errno has been set in read_unicode() */ \
		rv = (size_t)-1; \
		goto ret; \
	default: \
		break; \
	}
663
664
/*
 * PUTU(u32, msg): write one character through write_unicode().
 *
 * The expansion expects variables op, oleft, cd and rv and a label "ret"
 * in the scope of the caller.  It jumps to "ret" with rv set to
 * (size_t)-1 when write_unicode() fails.
 */
#define	PUTU(u32, msg) \
	if ((size_t)-1 == \
	    write_unicode((u32), &op, &oleft, (ucs_state_t *)cd, (msg))) { \
		rv = (size_t)-1; \
		goto ret; \
	}
671
672 #include <stdlib.h>
673
674 static void
_icv_reset_unicode(void * cd)675 _icv_reset_unicode(void *cd)
676 {
677 ucs_state_t *state = (ucs_state_t *)cd;
678
679 #if defined(JFP_ICONV_FROMCODE_UTF32BE) || \
680 defined(JFP_ICONV_TOCODE_UTF32BE) || \
681 defined(JFP_ICONV_FROMCODE_UTF16BE) || \
682 defined(JFP_ICONV_TOCODE_UTF16BE) || \
683 defined(JFP_ICONV_FROMCODE_UCS2BE) || \
684 defined(JFP_ICONV_TOCODE_UCS2BE)
685 state->little_endian = B_FALSE;
686 state->bom_written = B_TRUE;
687 #elif defined(JFP_ICONV_FROMCODE_UTF32LE) || \
688 defined(JFP_ICONV_TOCODE_UTF32LE) || \
689 defined(JFP_ICONV_FROMCODE_UTF16LE) || \
690 defined(JFP_ICONV_TOCODE_UTF16LE) || \
691 defined(JFP_ICONV_FROMCODE_UCS2LE) || \
692 defined(JFP_ICONV_TOCODE_UCS2LE)
693 state->little_endian = B_TRUE;
694 state->bom_written = B_TRUE;
695 #elif defined(_LITTLE_ENDIAN)
696 state->little_endian = B_TRUE;
697 state->bom_written = B_FALSE;
698 #elif defined(_BIG_ENDIAN)
699 state->little_endian = B_FALSE;
700 state->bom_written = B_FALSE;
701 #endif
702
703 return;
704 }
705
706 static void *
_icv_open_unicode(size_t extsize)707 _icv_open_unicode(size_t extsize)
708 {
709 ucs_state_t *cd;
710
711 if ((cd = (ucs_state_t *)calloc(1,
712 sizeof (ucs_state_t) + extsize)) == NULL) {
713 errno = ENOMEM;
714 return ((void *)-1);
715 }
716
717 _icv_reset_unicode((void *)cd);
718
719 return ((void *)cd);
720 }
721
/*
 * Release a conversion descriptor obtained from _icv_open_unicode().
 *
 * Both NULL and (void *)-1, the value _icv_open_unicode() returns on
 * failure, are rejected with errno set to EBADF; neither is passed to
 * free().
 */
static void
_icv_close_unicode(void *cd)
{
	if ((cd == NULL) || (cd == (void *)-1)) {
		errno = EBADF;
		return;
	}

	free(cd);
}
732
733 static void *
_icv_get_ext(void * cd)734 _icv_get_ext(void *cd)
735 {
736 return ((void *)((unsigned char *)cd + sizeof (ucs_state_t)));
737 }
738