xref: /illumos-gate/usr/src/lib/iconv_modules/ja/common/jfp_iconv_unicode.h (revision a026698cee452cd5e158d158601d992ae9de1e82)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * COPYRIGHT AND PERMISSION NOTICE
23  *
24  * Copyright (c) 1991-2005 Unicode, Inc. All rights reserved. Distributed
25  * under the Terms of Use in http://www.unicode.org/copyright.html.
26  *
27  * This file has been modified by Sun Microsystems, Inc.
28  */
29 /*
30  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
31  * Use is subject to license terms.
32  */
33 
34 
35 #include	<sys/types.h>
36 
/*
 * Fold the endian-specific selectors (...BE / ...LE) into one generic
 * selector per encoding family, separately for the source ("FROMCODE")
 * and destination ("TOCODE") side.  The generic selectors pick which
 * read_unicode() / write_unicode() implementation is compiled below.
 */

/* source side */
#if defined(JFP_ICONV_FROMCODE_UTF32BE) || defined(JFP_ICONV_FROMCODE_UTF32LE)
#define	JFP_ICONV_FROMCODE_UTF32
#endif

#if defined(JFP_ICONV_FROMCODE_UTF16BE) || defined(JFP_ICONV_FROMCODE_UTF16LE)
#define	JFP_ICONV_FROMCODE_UTF16
#endif

#if defined(JFP_ICONV_FROMCODE_UCS2BE) || defined(JFP_ICONV_FROMCODE_UCS2LE)
#define	JFP_ICONV_FROMCODE_UCS2
#endif

/* destination side */
#if defined(JFP_ICONV_TOCODE_UTF32BE) || defined(JFP_ICONV_TOCODE_UTF32LE)
#define	JFP_ICONV_TOCODE_UTF32
#endif

#if defined(JFP_ICONV_TOCODE_UTF16BE) || defined(JFP_ICONV_TOCODE_UTF16LE)
#define	JFP_ICONV_TOCODE_UTF16
#endif

#if defined(JFP_ICONV_TOCODE_UCS2BE) || defined(JFP_ICONV_TOCODE_UCS2LE)
#define	JFP_ICONV_TOCODE_UCS2
#endif
60 
61 
/* Values with a special role in the readers and writers below. */
#define	BOM	0xfeff		/* byte order mark, U+FEFF */
#define	BSBOM16	0xfffe		/* BOM as it appears in a byte-swapped 16-bit unit */
#define	BSBOM32	0xfffe0000	/* BOM as it appears in a byte-swapped 32-bit unit */
#define	REPLACE	0xfffd		/* replacement character, U+FFFD */

/*
 * Surrogate range tests: D800..DBFF is the high (leading) half,
 * DC00..DFFF the low (trailing) half.  Both macros may evaluate their
 * argument twice, so pass an expression without side effects.
 */
#define	IFHISUR(x)	(((x) >= 0xd800) && ((x) <= 0xdbff))
#define	IFLOSUR(x)	(((x) >= 0xdc00) && ((x) <= 0xdfff))
68 
69 typedef struct {
70 	boolean_t         bom_written;
71 	boolean_t         little_endian;
72 } ucs_state_t;
73 
74 
75 #if	defined(JFP_ICONV_FROMCODE_UTF32)
76 
77 static size_t				/* return #bytes read, or -1 */
read_unicode(unsigned int * p,unsigned char ** pip,size_t * pileft,ucs_state_t * state)78 read_unicode(
79 	unsigned int	*p,		/* point variable to store UTF-32 */
80 	unsigned char	**pip,		/* point pointer to input buf */
81 	size_t		*pileft,	/* point #bytes left in input buf */
82 	ucs_state_t	*state)		/* BOM state and endian */
83 {
84 	unsigned char	*ip = *pip;
85 	size_t		ileft = *pileft;
86 	size_t		rv = (size_t)0; /* return value */
87 	unsigned char	ic1, ic2, ic3, ic4;	/* bytes read */
88 	unsigned int	u32;		/* resulted UTF-32 */
89 
90 	NGET(ic1, "UTF32-1");
91 	NGET(ic2, "UTF32-2");
92 	NGET(ic3, "UTF32-3");
93 	NGET(ic4, "UTF32-4");
94 
95 	if (state->bom_written == B_FALSE) {
96 		u32 = 0U;
97 		u32 |= (unsigned int)ic1 << 24;
98 		u32 |= (unsigned int)ic2 << 16;
99 		u32 |= (unsigned int)ic3 << 8;
100 		u32 |= (unsigned int)ic4 << 0;
101 		if (u32 == BOM) {
102 			state->bom_written = B_TRUE;
103 			state->little_endian = B_FALSE;
104 			*p = BOM;
105 			rv = (size_t)0;
106 			goto ret;
107 		} else if (u32 == BSBOM32) {
108 			state->bom_written = B_TRUE;
109 			state->little_endian = B_TRUE;
110 			*p = BOM;
111 			rv = (size_t)0;
112 			goto ret;
113 		} else {
114 			state->bom_written = B_TRUE;
115 		}
116 	}
117 
118 	if (state->little_endian == B_TRUE) {
119 		u32 = 0U;
120 		u32 |= (unsigned int)ic1 << 0;
121 		u32 |= (unsigned int)ic2 << 8;
122 		u32 |= (unsigned int)ic3 << 16;
123 		u32 |= (unsigned int)ic4 << 24;
124 	} else {
125 		u32 = 0U;
126 		u32 |= (unsigned int)ic1 << 24;
127 		u32 |= (unsigned int)ic2 << 16;
128 		u32 |= (unsigned int)ic3 << 8;
129 		u32 |= (unsigned int)ic4 << 0;
130 	}
131 
132 	if (u32 == BSBOM32) {
133 		RETERROR(EILSEQ, "byte-swapped BOM detected")
134 	}
135 
136 	if ((u32 == 0xfffe) || (u32 == 0xffff) || (u32 > 0x10ffff)
137 			|| IFHISUR(u32) || IFLOSUR(u32)) {
138 		RETERROR(EILSEQ, "illegal in UTF-32")
139 	}
140 
141 	*p = u32;
142 	rv = *pileft - ileft;
143 
144 ret:
145 	if (rv != (size_t)-1) {
146 		/* update *pip and *pileft only on successful return */
147 		*pip = ip;
148 		*pileft = ileft;
149 	}
150 
151 	return (rv);
152 }
153 
154 #elif	defined(JFP_ICONV_FROMCODE_UTF16) || defined(JFP_ICONV_FROMCODE_UCS2)
155 
156 static size_t				/* return #bytes read, or -1 */
read_unicode(unsigned int * p,unsigned char ** pip,size_t * pileft,ucs_state_t * state)157 read_unicode(
158 	unsigned int	*p,		/* point variable to store UTF-32 */
159 	unsigned char	**pip,		/* point pointer to input buf */
160 	size_t		*pileft,	/* point #bytes left in input buf */
161 	ucs_state_t	*state)		/* BOM state and endian */
162 {
163 	unsigned char	*ip = *pip;
164 	size_t		ileft = *pileft;
165 	size_t		rv = (size_t)0; /* return value */
166 	unsigned char	ic1, ic2;	/* bytes read */
167 	unsigned int	u32;		/* resulted UTF-32 */
168 #ifndef	JFP_ICONV_FROMCODE_UCS2
169 	unsigned int	losur;		/* low surrogate */
170 #endif
171 
172 	NGET(ic1, "UTF16-1");	/* read 1st byte */
173 	NGET(ic2, "UTF16-2");	/* read 2nd byte */
174 
175 	if (state->bom_written == B_FALSE) {
176 		u32 = 0U;
177 		u32 |= (unsigned int)ic1 << 8;
178 		u32 |= (unsigned int)ic2 << 0;
179 		if (u32 == BOM) {
180 			state->bom_written = B_TRUE;
181 			state->little_endian = B_FALSE;
182 			*p = BOM;
183 			rv = (size_t)0;
184 			goto ret;
185 		} else if (u32 == BSBOM16) {
186 			state->bom_written = B_TRUE;
187 			state->little_endian = B_TRUE;
188 			*p = BOM;
189 			rv = (size_t)0;
190 			goto ret;
191 		} else {
192 			state->bom_written = B_TRUE;
193 		}
194 	}
195 
196 	if (state->little_endian == B_TRUE) {
197 		u32 = (((unsigned int)ic2) << 8) | ic1;
198 	} else {
199 		u32 = (((unsigned int)ic1) << 8) | ic2;
200 	}
201 
202 	if (u32 == BSBOM16) {
203 		RETERROR(EILSEQ, "byte-swapped BOM detected")
204 	}
205 
206 	if ((u32 == 0xfffe) || (u32 == 0xffff) || (u32 > 0x10ffff)
207 			|| (IFLOSUR(u32))) {
208 		RETERROR(EILSEQ, "illegal in UTF16")
209 	}
210 
211 	if (IFHISUR(u32)) {
212 #if	defined(JFP_ICONV_FROMCODE_UCS2)
213 		RETERROR(EILSEQ, "surrogate is illegal in UCS2")
214 #else	/* !defined(JFP_ICONV_FROMCODE_UCS2) */
215 		NGET(ic1, "LOSUR-1");
216 		NGET(ic2, "LOSUR-2");
217 
218 		if (state->little_endian == B_TRUE) {
219 			losur = (((unsigned int)ic2) << 8) | ic1;
220 		} else {
221 			losur = (((unsigned int)ic1) << 8) | ic2;
222 		}
223 
224 		if (IFLOSUR(losur)) {
225 			u32 = ((u32 - 0xd800) * 0x400)
226 				+ (losur - 0xdc00) + 0x10000;
227 		} else {
228 			RETERROR(EILSEQ, "low-surrogate expected")
229 		}
230 #endif	/* defined(JFP_ICONV_FROMCODE_UCS2) */
231 	}
232 
233 	*p = u32;
234 	rv = *pileft - ileft;
235 
236 ret:
237 	if (rv != (size_t)-1) {
238 		/* update *pip and *pileft only on successful return */
239 		*pip = ip;
240 		*pileft = ileft;
241 	}
242 
243 	return (rv);
244 }
245 
246 #else	/* JFP_ICONV_FROMCODE_UTF8 (default) */
247 
/*
 * Number of bytes that follow the first byte of a UTF-8 character,
 * indexed by the value of that first byte.  utf8_ucs() handles bytes
 * below 0x80 before consulting this table and treats an entry of 0 as
 * an invalid first byte.
 */
static const char remaining_bytes_tbl[0x100] = {
	/* 00-0F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10-1F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20-2F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30-3F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40-4F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50-5F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60-6F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70-7F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80-8F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 90-9F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* A0-AF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* B0-BF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* C0-CF */	0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* D0-DF */	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* E0-EF */	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* F0-FF */	3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
278 
279 
/*
 * Bit masks selecting the payload bits of the first byte of a UTF-8
 * character.  The index is the number of bytes that follow the first
 * byte, i.e. the value taken from remaining_bytes_tbl[].
 */
static const char masks_tbl[6] = {
	0x00,	/* 0 bytes follow */
	0x1f,	/* 1 byte follows */
	0x0f,	/* 2 bytes follow */
	0x07,	/* 3 bytes follow */
	0x03,	/* 4 bytes follow */
	0x01	/* 5 bytes follow */
};
286 
287 
/*
 * Smallest value accepted for the second byte of a multibyte UTF-8
 * character, indexed by the value of the first byte.  Together with
 * valid_max_2nd_byte[] it gives utf8_ucs() a tighter illegal-sequence
 * check than the plain 0x80..0xbf test applied to later bytes.
 * Entries for bytes that cannot start a multibyte character are 0.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
	/* 00-0F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10-1F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20-2F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30-3F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40-4F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50-5F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60-6F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70-7F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80-8F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 90-9F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* A0-AF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* B0-BF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* C0-C7 */	0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* C8-CF */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D0-D7 */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* D8-DF */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E0-E7 */	0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* E8-EF */	0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
	/* F0-F7 */	0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
	/* F8-FF */	0,    0,    0,    0,    0,    0,    0,    0,
};
335 
/*
 * Largest value accepted for the second byte of a multibyte UTF-8
 * character, indexed by the value of the first byte.  utf8_ucs() rejects
 * a second byte that lies above this bound or below the matching entry
 * of valid_min_2nd_byte[].  Entries for bytes that cannot start a
 * multibyte character are 0.
 */
static const unsigned char valid_max_2nd_byte[0x100] = {
	/* 00-0F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10-1F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20-2F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30-3F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40-4F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50-5F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60-6F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70-7F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80-8F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 90-9F */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* A0-AF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* B0-BF */	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* C0-C7 */	0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* C8-CF */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D0-D7 */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* D8-DF */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E0-E7 */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
	/* E8-EF */	0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
	/* F0-F7 */	0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
	/* F8-FF */	0,    0,    0,    0,    0,    0,    0,    0,
};
377 
378 static size_t
utf8_ucs(unsigned int * p,unsigned char ** pip,size_t * pileft)379 utf8_ucs(unsigned int *p, unsigned char **pip, size_t *pileft)
380 {
381 	unsigned int	l;	/* to be copied to *p on successful return */
382 	unsigned char	ic;	/* current byte */
383 	unsigned char	ic1;	/* 1st byte */
384 	unsigned char	*ip = *pip;	/* next byte to read */
385 	size_t		ileft = *pileft; /* number of bytes available */
386 	size_t		rv = (size_t)0; /* return value of this function */
387 	int		remaining_bytes;
388 
389 	NGET(ic, "no bytes available");	/* read 1st byte */
390 	ic1 = ic;
391 	l = ic1; /* get bits from 1st byte to UCS value */
392 
393 	if (ic1 < 0x80) {
394 		/* successfully converted */
395 		*p = l;
396 		rv = *pileft - ileft;
397 		goto ret;
398 	}
399 
400 	remaining_bytes = remaining_bytes_tbl[ic1];
401 
402 	if (remaining_bytes != 0) {
403 		l &= masks_tbl[remaining_bytes];
404 
405 		for (; remaining_bytes > 0; remaining_bytes--) {
406 			if (ic1 != 0U) {
407 				NGET(ic, "2nd byte of UTF-8");
408 				if ((ic < valid_min_2nd_byte[ic1]) ||
409 					(ic > valid_max_2nd_byte[ic1])) {
410 					RETERROR(EILSEQ, "2nd byte is invalid")
411 				}
412 				ic1 = 0U; /* 2nd byte check done */
413 			} else {
414 				NGET(ic, "3rd or later byte of UTF-8");
415 				if ((ic < 0x80) || (ic > 0xbf)) {
416 				RETERROR(EILSEQ, "3rd or later byte is invalid")
417 				}
418 			}
419 			l = (l << 6) | (ic & 0x3f);
420 		}
421 
422 		/* successfully converted */
423 		*p = l;
424 		rv = *pileft - ileft;
425 		goto ret;
426 	} else {
427 		RETERROR(EILSEQ, "1st byte is invalid")
428 	}
429 
430 ret:
431 	if (rv != (size_t)-1) {
432 		/*
433 		 * update *pip and *pileft on successful return
434 		 */
435 		*pip = ip;
436 		*pileft = ileft;
437 	}
438 
439 	return (rv);
440 }
441 
442 /* for UTF-8 */
443 static size_t				/* return #bytes read, or -1 */
read_unicode(unsigned int * p,unsigned char ** pip,size_t * pileft,ucs_state_t * state)444 read_unicode(
445 	unsigned int	*p,		/* point variable to store UTF-32 */
446 	unsigned char	**pip,		/* point pointer to input buf */
447 	size_t		*pileft,	/* point #bytes left in input buf */
448 	ucs_state_t	*state)		/* BOM state and endian - unused */
449 {
450 	return (utf8_ucs(p, pip, pileft));
451 }
452 
453 #endif
454 
455 #if	defined(JFP_ICONV_TOCODE_UTF32)
456 
457 static size_t
write_unicode(unsigned int u32,char ** pop,size_t * poleft,ucs_state_t * state,const char * msg)458 write_unicode(
459 	unsigned int	u32,		/* UTF-32 to write */
460 	char		**pop,		/* point pointer to output buf */
461 	size_t		*poleft,	/* point #bytes left in output buf */
462 	ucs_state_t	*state,		/* BOM state and endian */
463 	const char	*msg)		/* debug message */
464 {
465 	char		*op = *pop;
466 	size_t		oleft = *poleft;
467 	size_t		rv = (size_t)0;		/* return value */
468 	unsigned char	ic1, ic2, ic3, ic4;	/* bytes to be written */
469 
470 	if (state->bom_written == B_FALSE) {
471 		if (state->little_endian == B_TRUE) {
472 			ic1 = (unsigned char)((BOM >> 0) & 0xff);
473 			ic2 = (unsigned char)((BOM >> 8) & 0xff);
474 			ic3 = (unsigned char)((BOM >> 16) & 0xff);
475 			ic4 = (unsigned char)((BOM >> 24) & 0xff);
476 		} else {
477 			ic1 = (unsigned char)((BOM >> 24) & 0xff);
478 			ic2 = (unsigned char)((BOM >> 16) & 0xff);
479 			ic3 = (unsigned char)((BOM >> 8) & 0xff);
480 			ic4 = (unsigned char)((BOM >> 0) & 0xff);
481 		}
482 		rv += 4;
483 		NPUT(ic1, "BOM32-1")
484 		NPUT(ic2, "BOM32-2")
485 		NPUT(ic3, "BOM32-3")
486 		NPUT(ic4, "BOM32-4")
487 	}
488 
489 	if (state->little_endian == B_TRUE) {
490 		ic1 = (unsigned char)((u32 >> 0) & 0xff);
491 		ic2 = (unsigned char)((u32 >> 8) & 0xff);
492 		ic3 = (unsigned char)((u32 >> 16) & 0xff);
493 		ic4 = (unsigned char)((u32 >> 24) & 0xff);
494 		rv += 4;
495 	} else {
496 		ic1 = (unsigned char)((u32 >> 24) & 0xff);
497 		ic2 = (unsigned char)((u32 >> 16) & 0xff);
498 		ic3 = (unsigned char)((u32 >> 8) & 0xff);
499 		ic4 = (unsigned char)((u32 >> 0) & 0xff);
500 		rv += 4;
501 	}
502 
503 	NPUT(ic1, "UTF32-1")
504 	NPUT(ic2, "UTF32-2")
505 	NPUT(ic3, "UTF32-3")
506 	NPUT(ic4, "UTF32-4")
507 
508 ret:
509 	if (rv != (size_t)-1) {
510 		/* update *pop and *poleft only on successful return */
511 		*pop = op;
512 		*poleft = oleft;
513 		if (state->bom_written == B_FALSE)
514 			state->bom_written = B_TRUE;
515 	}
516 
517 	return (rv);
518 }
519 
520 #elif	defined(JFP_ICONV_TOCODE_UTF16) || defined(JFP_ICONV_TOCODE_UCS2)
521 
522 static size_t
write_unicode(unsigned int u32,char ** pop,size_t * poleft,ucs_state_t * state,const char * msg)523 write_unicode(
524 	unsigned int	u32,		/* UTF-32 to write */
525 	char		**pop,		/* point pointer to output buf */
526 	size_t		*poleft,	/* point #bytes left in output buf */
527 	ucs_state_t	*state,		/* BOM state and endian */
528 	const char	*msg)		/* debug message */
529 {
530 	char		*op = *pop;
531 	size_t		oleft = *poleft;
532 	size_t		rv = (size_t)0;	/* return value */
533 	unsigned char	ic1, ic2;	/* bytes to be written */
534 	unsigned int	losur = 0U;		/* Hi/Lo surrogates */
535 
536 	if (state->bom_written == B_FALSE) {
537 		if (state->little_endian == B_TRUE) {
538 			ic1 = (unsigned char)((BOM >> 0) & 0xff);
539 			ic2 = (unsigned char)((BOM >> 8) & 0xff);
540 		} else {
541 			ic1 = (unsigned char)((BOM >> 8) & 0xff);
542 			ic2 = (unsigned char)((BOM >> 0) & 0xff);
543 		}
544 		rv += 2;
545 		NPUT(ic1, "BOM16-1")
546 		NPUT(ic2, "BOM16-2")
547 	}
548 
549 	if (u32 > 0xffff) {
550 #if	defined(JFP_ICONV_TOCODE_UCS2)
551 		u32 = REPLACE;
552 #else	/* !defined(JFP_ICONV_TOCODE_UCS2) */
553 		losur = ((u32 - 0x10000) % 0x400) + 0xdc00;
554 		u32 = ((u32 - 0x10000) / 0x400) + 0xd800;
555 #endif	/* defined(JFP_ICONV_TOCODE_UCS2) */
556 	}
557 
558 	if (state->little_endian == B_TRUE) {
559 		ic1 = (unsigned char)(u32 & 0xff);
560 		ic2 = (unsigned char)((u32 >> 8) & 0xff);
561 		rv += 2;
562 	} else {
563 		ic1 = (unsigned char)((u32 >> 8) & 0xff);
564 		ic2 = (unsigned char)(u32 & 0xff);
565 		rv += 2;
566 	}
567 
568 	NPUT(ic1, "UTF16-1")
569 	NPUT(ic2, "UTF16-2")
570 
571 	if (losur != 0U) {
572 		if (state->little_endian == B_TRUE) {
573 			ic1 = (unsigned char)(losur & 0xff);
574 			ic2 = (unsigned char)((losur >> 8) & 0xff);
575 			rv += 2;
576 		} else {
577 			ic1 = (unsigned char)((losur >> 8) & 0xff);
578 			ic2 = (unsigned char)(losur & 0xff);
579 			rv += 2;
580 		}
581 
582 		NPUT(ic1, "LOSUR-1")
583 		NPUT(ic2, "LOSUR-2")
584 	}
585 
586 
587 ret:
588 	if (rv != (size_t)-1) {
589 		/* update *pop and *poleft only on successful return */
590 		*pop = op;
591 		*poleft = oleft;
592 		if (state->bom_written == B_FALSE)
593 			state->bom_written = B_TRUE;
594 	}
595 
596 	return (rv);
597 }
598 
599 #else	/* JFP_ICONV_TOCODE_UTF8 (default) */
600 
601 static size_t
write_unicode(unsigned int u32,char ** pop,size_t * poleft,ucs_state_t * state,const char * msg)602 write_unicode(
603 	unsigned int	u32,		/* UTF-32 to write */
604 	char		**pop,		/* point pointer to output buf */
605 	size_t		*poleft,	/* point #bytes left in output buf */
606 	ucs_state_t	*state,		/* BOM state and endian - unused */
607 	const char	*msg)		/* debug message */
608 {
609 	char	*op = *pop;
610 	size_t	oleft = *poleft;
611 	size_t	rv = 0;			/* return value */
612 
613 	if (u32 <= 0x7f) {
614 		NPUT((unsigned char)(u32), msg);
615 		rv = 1;
616 	} else if (u32 <= 0x7ff) {
617 		NPUT((unsigned char)((((u32)>>6) & 0x1f) | 0xc0), msg);
618 		NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg);
619 		rv = 2;
620 	} else if ((u32 >= 0xd800) && (u32 <= 0xdfff)) {
621 		RETERROR(EILSEQ, "surrogate in UTF-8")
622 	} else if (u32 <= 0xffff) {
623 		NPUT((unsigned char)((((u32)>>12) & 0x0f) | 0xe0), msg);
624 		NPUT((unsigned char)((((u32)>>6) & 0x3f) | 0x80), msg);
625 		NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg);
626 		rv = 3;
627 	} else if (u32 <= 0x10ffff) {
628 		NPUT((unsigned char)((((u32)>>18) & 0x07) | 0xf0), msg);
629 		NPUT((unsigned char)((((u32)>>12) & 0x3f) | 0x80), msg);
630 		NPUT((unsigned char)((((u32)>>6) & 0x3f) | 0x80), msg);
631 		NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg);
632 		rv = 4;
633 	} else {
634 		RETERROR(EILSEQ, "beyond range of UTF-8")
635 	}
636 
637 ret:
638 	if (rv != (size_t)-1) {
639 		/* update *pop and *poleft only on successful return */
640 		*pop = op;
641 		*poleft = oleft;
642 	}
643 
644 	return (rv);
645 }
646 
647 #endif
648 
/*
 * GETU(pu32): read the next character into *pu32 with read_unicode().
 *
 * The macro expects the enclosing function to provide the variables
 * ip, ileft, cd and rv, plus the labels ret and next:
 *   - on (size_t)-1 it sets rv to (size_t)-1 and jumps to ret;
 *   - on 0 (the character was handled inside read_unicode()) it sets
 *     rv to 0 and jumps to next;
 *   - on any other value it falls through with *pu32 filled in.
 */
#define	GETU(pu32) \
	{ \
		size_t	getu_nread_ = read_unicode((pu32), &ip, &ileft, \
		    (ucs_state_t *)cd); \
		if (getu_nread_ == (size_t)-1) { \
			/* errno has been set in read_unicode() */ \
			rv = (size_t)-1; \
			goto ret; \
		} else if (getu_nread_ == (size_t)0) { \
			/* character read was handled in the read_unicode() */ \
			/* no further evaluation needed in caller side */ \
			rv = (size_t)0; \
			goto next; \
		} \
	}
663 
664 
/*
 * PUTU(u32, msg): write the code point u32 with write_unicode().
 *
 * The macro expects the enclosing function to provide the variables
 * op, oleft, cd and rv, plus the label ret.  When write_unicode()
 * returns (size_t)-1 it sets rv to (size_t)-1 and jumps to ret;
 * otherwise execution continues after the macro.
 *
 * The expansion is a bare if statement, so do not follow it directly
 * with an else.
 */
#define	PUTU(u32, msg)	\
	if ((size_t)-1 == \
	    write_unicode((u32), &op, &oleft, (ucs_state_t *)cd, (msg))) { \
		rv = (size_t)-1; \
		goto ret; \
	}
671 
672 #include	<stdlib.h>
673 
674 static void
_icv_reset_unicode(void * cd)675 _icv_reset_unicode(void *cd)
676 {
677 	ucs_state_t	*state = (ucs_state_t *)cd;
678 
679 #if	defined(JFP_ICONV_FROMCODE_UTF32BE) || \
680 	defined(JFP_ICONV_TOCODE_UTF32BE) || \
681 	defined(JFP_ICONV_FROMCODE_UTF16BE) || \
682 	defined(JFP_ICONV_TOCODE_UTF16BE) || \
683 	defined(JFP_ICONV_FROMCODE_UCS2BE) || \
684 	defined(JFP_ICONV_TOCODE_UCS2BE)
685 	state->little_endian = B_FALSE;
686 	state->bom_written = B_TRUE;
687 #elif	defined(JFP_ICONV_FROMCODE_UTF32LE) || \
688 	defined(JFP_ICONV_TOCODE_UTF32LE) || \
689 	defined(JFP_ICONV_FROMCODE_UTF16LE) || \
690 	defined(JFP_ICONV_TOCODE_UTF16LE) || \
691 	defined(JFP_ICONV_FROMCODE_UCS2LE) || \
692 	defined(JFP_ICONV_TOCODE_UCS2LE)
693 	state->little_endian = B_TRUE;
694 	state->bom_written = B_TRUE;
695 #elif	defined(_LITTLE_ENDIAN)
696 	state->little_endian = B_TRUE;
697 	state->bom_written = B_FALSE;
698 #elif	defined(_BIG_ENDIAN)
699 	state->little_endian = B_FALSE;
700 	state->bom_written = B_FALSE;
701 #endif
702 
703 	return;
704 }
705 
706 static void *
_icv_open_unicode(size_t extsize)707 _icv_open_unicode(size_t extsize)
708 {
709 	ucs_state_t	*cd;
710 
711 	if ((cd = (ucs_state_t *)calloc(1,
712 			sizeof (ucs_state_t) + extsize)) == NULL) {
713 		errno = ENOMEM;
714 		return ((void *)-1);
715 	}
716 
717 	_icv_reset_unicode((void *)cd);
718 
719 	return ((void *)cd);
720 }
721 
/*
 * Release a conversion descriptor obtained from _icv_open_unicode().
 *
 * A NULL descriptor is reported by setting errno to EBADF.  The same is
 * done for (void *)-1, the value _icv_open_unicode() returns on failure,
 * so that a failed open handed to close is never passed to free().
 */
static void
_icv_close_unicode(void *cd)
{
	if ((cd == NULL) || (cd == (void *)-1)) {
		errno = EBADF;
		return;
	}

	free(cd);
}
732 
733 static void *
_icv_get_ext(void * cd)734 _icv_get_ext(void *cd)
735 {
736 	return ((void *)((unsigned char *)cd + sizeof (ucs_state_t)));
737 }
738