xref: /illumos-gate/usr/src/uts/common/kiconv/kiconv_tc/kiconv_tc.c (revision 86ef0a63e1cfa5dc98606efef379365acca98063)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/systm.h>
30 #include <sys/debug.h>
31 #include <sys/kmem.h>
32 #include <sys/sunddi.h>
33 #include <sys/byteorder.h>
34 #include <sys/errno.h>
35 #include <sys/modctl.h>
36 #include <sys/u8_textprep.h>
37 #include <sys/kiconv.h>
38 #include <sys/kiconv_cck_common.h>
39 #include <sys/kiconv_tc.h>
40 #include <sys/kiconv_big5_utf8.h>
41 #include <sys/kiconv_euctw_utf8.h>
42 #include <sys/kiconv_hkscs_utf8.h>
43 #include <sys/kiconv_cp950hkscs_utf8.h>
44 #include <sys/kiconv_utf8_big5.h>
45 #include <sys/kiconv_utf8_euctw.h>
46 #include <sys/kiconv_utf8_cp950hkscs.h>
47 #include <sys/kiconv_utf8_hkscs.h>
48 
49 /* 4 HKSCS-2004 code points map to 2 Unicode code points separately. */
50 static uchar_t hkscs_special_sequence[][4] = {
51 	{ 0xc3, 0x8a, 0xcc, 0x84 },	/* 0x8862 */
52 	{ 0xc3, 0x8a, 0xcc, 0x8c },	/* 0x8864 */
53 	{ 0xc3, 0xaa, 0xcc, 0x84 },	/* 0x88a3 */
54 	{ 0xc3, 0xaa, 0xcc, 0x8c }	/* 0x88a5 */
55 };
56 
/*
 * HKSCS-2004 codes for U+00CA and U+00EA, each on its own and when
 * followed by U+0304 or U+030C.  The entries form two groups of three
 * (base, base + U+0304, base + U+030C); utf8_to_big5hkscs() computes the
 * group start and adds 1 or 2 when one of the two marks follows.
 */
static uint32_t ucs_special_sequence[] = {
	/* U+00CA group */
	0x8866,		/* U+00ca */
	0x8862,		/* U+00ca U+0304 */
	0x8864,		/* U+00ca U+030c */
	/* U+00EA group */
	0x88A7,		/* U+00ea */
	0x88A3,		/* U+00ea U+0304 */
	0x88A5		/* U+00ea U+030c */
};
66 
67 typedef int8_t (*kiconv_big5toutf8_t)(uint32_t value, uchar_t *ob,
68 	uchar_t *obtail, size_t *ret_val);
69 
70 static int8_t utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
71 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
72 static int8_t utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
73 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
74 static int8_t utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf,
75 	uchar_t *ibtail, uchar_t *ob, uchar_t *obtail, size_t *ret_val);
76 static int8_t utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
77 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
78 static int8_t big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
79 	size_t *ret_val);
80 static int8_t big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
81 	uchar_t *obtail, size_t *ret_val);
82 static int8_t cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
83 	uchar_t *obtail, size_t *ret_val);
84 static int8_t euctw_to_utf8(size_t plane_no, uint32_t euctw_val,
85 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
86 static uint32_t get_unicode_from_UDA(size_t plane_no, uchar_t byte1,
87 	uchar_t byte2);
88 
/*
 * Magic values returned by the open_fr_*() routines as the conversion
 * descriptor for the "to UTF-8" directions.  close_fr_tc() rejects any
 * descriptor value above KICONV_TC_MAX_MAGIC_ID.
 */
#define	KICONV_TC_BIG5		0x01
#define	KICONV_TC_BIG5HKSCS	0x02
#define	KICONV_TC_CP950HKSCS	0x03
#define	KICONV_TC_EUCTW		0x04
#define	KICONV_TC_MAX_MAGIC_ID	KICONV_TC_EUCTW
94 
95 static void *
open_fr_big5()96 open_fr_big5()
97 {
98 	return ((void *)KICONV_TC_BIG5);
99 }
100 
101 static void *
open_fr_big5hkscs()102 open_fr_big5hkscs()
103 {
104 	return ((void *)KICONV_TC_BIG5HKSCS);
105 }
106 
107 static void *
open_fr_cp950hkscs()108 open_fr_cp950hkscs()
109 {
110 	return ((void *)KICONV_TC_CP950HKSCS);
111 }
112 
113 static void *
open_fr_euctw()114 open_fr_euctw()
115 {
116 	return ((void *)KICONV_TC_EUCTW);
117 }
118 
119 static int
close_fr_tc(void * s)120 close_fr_tc(void *s)
121 {
122 	if ((uintptr_t)s > KICONV_TC_MAX_MAGIC_ID)
123 		return (EBADF);
124 
125 	return (0);
126 }
127 
128 /*
129  * Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) to UTF-8.
130  */
131 static size_t
kiconv_fr_big5_common(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_big5toutf8_t ptr_big5touf8)132 kiconv_fr_big5_common(void *kcd, char **inbuf, size_t *inbytesleft,
133     char **outbuf, size_t *outbytesleft, int *errno,
134     kiconv_big5toutf8_t ptr_big5touf8)
135 {
136 	uchar_t		*ib;
137 	uchar_t		*ob;
138 	uchar_t		*ibtail;
139 	uchar_t		*obtail;
140 	size_t		ret_val;
141 	int8_t		sz;
142 	uint32_t	big5_val;
143 
144 	/* Check on the kiconv code conversion descriptor. */
145 	if (kcd == NULL || kcd == (void *)-1) {
146 		*errno = EBADF;
147 		return ((size_t)-1);
148 	}
149 
150 	/* If this is a state reset request, process and return. */
151 	if (inbuf == NULL || *inbuf == NULL) {
152 		return (0);
153 	}
154 
155 	ret_val = 0;
156 	ib = (uchar_t *)*inbuf;
157 	ob = (uchar_t *)*outbuf;
158 	ibtail = ib + *inbytesleft;
159 	obtail = ob + *outbytesleft;
160 
161 	while (ib < ibtail) {
162 		if (KICONV_IS_ASCII(*ib)) {
163 			if (ob >= obtail) {
164 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
165 			}
166 
167 			*ob++ = *ib++;
168 			continue;
169 		}
170 
171 		/*
172 		 * Issue EILSEQ error if the first byte is not a
173 		 * valid BIG5/HKSCS leading byte.
174 		 */
175 		if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
176 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
177 		}
178 
179 		/*
180 		 * Issue EINVAL error if input buffer has an incomplete
181 		 * character at the end of the buffer.
182 		 */
183 		if (ibtail - ib < 2) {
184 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
185 		}
186 
187 		/*
188 		 * Issue EILSEQ error if the remaining bytes is not
189 		 * a valid BIG5/HKSCS byte.
190 		 */
191 		if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
192 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
193 		}
194 
195 		/* Now we have a valid BIG5/HKSCS character. */
196 		big5_val = (uint32_t)(*ib) << 8 | *(ib + 1);
197 		sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
198 
199 		if (sz < 0) {
200 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
201 		}
202 
203 		ib += 2;
204 		ob += sz;
205 	}
206 
207 	*inbuf = (char *)ib;
208 	*inbytesleft = ibtail - ib;
209 	*outbuf = (char *)ob;
210 	*outbytesleft = obtail - ob;
211 
212 	return (ret_val);
213 }
214 
215 /*
216  * String based Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS)
217  * to UTF-8.
218  */
219 static size_t
kiconvstr_fr_big5_common(uchar_t * ib,size_t * inlen,uchar_t * ob,size_t * outlen,int flag,int * errno,kiconv_big5toutf8_t ptr_big5touf8)220 kiconvstr_fr_big5_common(uchar_t *ib, size_t *inlen, uchar_t *ob,
221     size_t *outlen, int flag, int *errno,
222     kiconv_big5toutf8_t ptr_big5touf8)
223 {
224 	uchar_t		*oldib;
225 	uchar_t		*ibtail;
226 	uchar_t		*obtail;
227 	size_t		ret_val;
228 	int8_t		sz;
229 	uint32_t	big5_val;
230 	boolean_t	do_not_ignore_null;
231 
232 	ret_val = 0;
233 	ibtail = ib + *inlen;
234 	obtail = ob + *outlen;
235 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
236 
237 	while (ib < ibtail) {
238 		if (*ib == '\0' && do_not_ignore_null)
239 			break;
240 
241 		if (KICONV_IS_ASCII(*ib)) {
242 			if (ob >= obtail) {
243 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
244 			}
245 
246 			*ob++ = *ib++;
247 			continue;
248 		}
249 
250 		oldib = ib;
251 
252 		if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
253 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
254 		}
255 
256 		if (ibtail - ib < 2) {
257 			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
258 		}
259 
260 		if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
261 			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
262 		}
263 
264 		big5_val = *ib++;
265 		big5_val = (big5_val << 8) | *ib++;
266 		sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
267 
268 		if (sz < 0) {
269 			ib = oldib;
270 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
271 		}
272 
273 		ob += sz;
274 		continue;
275 
276 REPLACE_INVALID:
277 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
278 			ib = oldib;
279 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
280 		}
281 
282 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
283 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
284 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
285 		ret_val++;
286 	}
287 
288 	*inlen = ibtail - ib;
289 	*outlen = obtail - ob;
290 
291 	return (ret_val);
292 }
293 
294 /*
295  * Encoding convertor from BIG5 to UTF-8.
296  */
297 static size_t
kiconv_fr_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)298 kiconv_fr_big5(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf,
299     size_t *outbytesleft, int *errno)
300 {
301 	return (kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
302 	    outbytesleft, errno, big5_to_utf8));
303 }
304 
305 /*
306  * String based encoding convertor from BIG5 to UTF-8.
307  */
308 static size_t
kiconvstr_fr_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)309 kiconvstr_fr_big5(char *inarray, size_t *inlen, char *outarray,
310     size_t *outlen, int flag, int *errno)
311 {
312 	return (kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
313 	    (uchar_t *)outarray, outlen, flag, errno,
314 	    big5_to_utf8));
315 }
316 
317 /*
318  * Encoding convertor from BIG5-HKSCS to UTF-8.
319  */
320 static size_t
kiconv_fr_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)321 kiconv_fr_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
322     char **outbuf, size_t *outbytesleft, int *errno)
323 {
324 	return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
325 	    outbytesleft, errno, big5hkscs_to_utf8);
326 }
327 
328 /*
329  * String based encoding convertor from BIG5-HKSCS to UTF-8.
330  */
331 static size_t
kiconvstr_fr_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)332 kiconvstr_fr_big5hkscs(char *inarray, size_t *inlen, char *outarray,
333     size_t *outlen, int flag, int *errno)
334 {
335 	return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
336 	    (uchar_t *)outarray, outlen, flag, errno, big5hkscs_to_utf8);
337 }
338 
339 /*
340  * Encoding convertor from CP950-HKSCS to UTF-8.
341  */
342 static size_t
kiconv_fr_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)343 kiconv_fr_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
344     char **outbuf, size_t *outbytesleft, int *errno)
345 {
346 	return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
347 	    outbytesleft, errno, cp950hkscs_to_utf8);
348 }
349 
350 /*
351  * String based encoding convertor from CP950-HKSCS to UTF-8.
352  */
353 static size_t
kiconvstr_fr_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)354 kiconvstr_fr_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
355     size_t *outlen, int flag, int *errno)
356 {
357 	return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
358 	    (uchar_t *)outarray, outlen, flag, errno, cp950hkscs_to_utf8);
359 }
360 
361 /*
362  * Encoding convertor from EUC-TW to UTF-8.
363  */
364 static size_t
kiconv_fr_euctw(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)365 kiconv_fr_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
366     char **outbuf, size_t *outbytesleft, int *errno)
367 {
368 	uchar_t		*ib;
369 	uchar_t		*ob;
370 	uchar_t		*ibtail;
371 	uchar_t		*obtail;
372 	uchar_t		*oldib;
373 	size_t		ret_val;
374 	size_t		plane_no;
375 	int8_t		sz;
376 	uint32_t	euctw_val;
377 	boolean_t	isplane1;
378 
379 	/* Check on the kiconv code conversion descriptor. */
380 	if (kcd == NULL || kcd == (void *)-1) {
381 		*errno = EBADF;
382 		return ((size_t)-1);
383 	}
384 
385 	/* If this is a state reset request, process and return. */
386 	if (inbuf == NULL || *inbuf == NULL) {
387 		return (0);
388 	}
389 
390 	ret_val = 0;
391 	ib = (uchar_t *)*inbuf;
392 	ob = (uchar_t *)*outbuf;
393 	ibtail = ib + *inbytesleft;
394 	obtail = ob + *outbytesleft;
395 
396 	while (ib < ibtail) {
397 		if (KICONV_IS_ASCII(*ib)) {
398 			if (ob >= obtail) {
399 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
400 			}
401 
402 			*ob++ = *ib++;
403 			continue;
404 		}
405 
406 		/*
407 		 * Issue EILSEQ error if the first byte is not a
408 		 * valid EUC-TW leading byte.
409 		 */
410 		if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
411 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
412 		}
413 
414 		isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
415 		    B_FALSE : B_TRUE;
416 
417 		/*
418 		 * Issue EINVAL error if input buffer has an incomplete
419 		 * character at the end of the buffer.
420 		 */
421 		if (ibtail - ib < (isplane1 ? 2 : 4)) {
422 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
423 		}
424 
425 		oldib = ib;
426 		plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
427 
428 		/*
429 		 * Issue EILSEQ error if the remaining bytes are not
430 		 * valid EUC-TW bytes.
431 		 */
432 		if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
433 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
434 		}
435 
436 		if (! isplane1)
437 			ib += 2;
438 
439 		/* Now we have a valid EUC-TW character. */
440 		euctw_val = *ib++;
441 		euctw_val = (euctw_val << 8) | *ib++;
442 		sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
443 
444 		if (sz < 0) {
445 			ib = oldib;
446 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
447 		}
448 
449 		ob += sz;
450 	}
451 
452 	*inbuf = (char *)ib;
453 	*inbytesleft = ibtail - ib;
454 	*outbuf = (char *)ob;
455 	*outbytesleft = obtail - ob;
456 
457 	return (ret_val);
458 }
459 
460 /*
461  * String based encoding convertor from EUC-TW to UTF-8.
462  */
463 static size_t
kiconvstr_fr_euctw(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)464 kiconvstr_fr_euctw(char *inarray, size_t *inlen, char *outarray,
465     size_t *outlen, int flag, int *errno)
466 {
467 	uchar_t		*ib;
468 	uchar_t		*ob;
469 	uchar_t		*ibtail;
470 	uchar_t		*obtail;
471 	uchar_t		*oldib;
472 	size_t		ret_val;
473 	size_t		plane_no;
474 	int8_t		sz;
475 	uint32_t	euctw_val;
476 	boolean_t	isplane1;
477 	boolean_t	do_not_ignore_null;
478 
479 	ret_val = 0;
480 	ib = (uchar_t *)inarray;
481 	ob = (uchar_t *)outarray;
482 	ibtail = ib + *inlen;
483 	obtail = ob + *outlen;
484 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
485 
486 	while (ib < ibtail) {
487 		if (*ib == '\0' && do_not_ignore_null)
488 			break;
489 
490 		if (KICONV_IS_ASCII(*ib)) {
491 			if (ob >= obtail) {
492 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
493 			}
494 
495 			*ob++ = *ib++;
496 			continue;
497 		}
498 
499 		oldib = ib;
500 
501 		if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
502 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
503 		}
504 
505 		isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
506 		    B_FALSE : B_TRUE;
507 
508 		if (ibtail - ib < (isplane1 ? 2 : 4)) {
509 			if (flag & KICONV_REPLACE_INVALID) {
510 				ib = ibtail;
511 				goto REPLACE_INVALID;
512 			}
513 
514 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
515 		}
516 
517 		plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
518 
519 		if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
520 			KICONV_SET_ERRNO_WITH_FLAG(isplane1 ? 2 : 4, EILSEQ);
521 		}
522 
523 		if (! isplane1)
524 			ib += 2;
525 
526 		euctw_val = *ib++;
527 		euctw_val = (euctw_val << 8) | *ib++;
528 		sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
529 
530 		if (sz < 0) {
531 			ib = oldib;
532 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
533 		}
534 
535 		ob += sz;
536 		continue;
537 
538 REPLACE_INVALID:
539 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
540 			ib = oldib;
541 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
542 		}
543 
544 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
545 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
546 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
547 		ret_val++;
548 	}
549 
550 	*inlen = ibtail - ib;
551 	*outlen = obtail - ob;
552 
553 	return (ret_val);
554 }
555 
556 /*
557  * Encoding convertor from UTF-8 to BIG5.
558  */
559 static size_t
kiconv_to_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)560 kiconv_to_big5(void *kcd, char **inbuf, size_t *inbytesleft,
561     char **outbuf, size_t *outbytesleft, int *errno)
562 {
563 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
564 	    outbytesleft, errno, utf8_to_big5);
565 }
566 
567 /*
568  * String based encoding convertor from UTF-8 to BIG5.
569  */
570 static size_t
kiconvstr_to_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)571 kiconvstr_to_big5(char *inarray, size_t *inlen, char *outarray,
572     size_t *outlen, int flag, int *errno)
573 {
574 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
575 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5);
576 }
577 
578 /*
579  * Encoding convertor from UTF-8 to EUC-TW.
580  */
581 static size_t
kiconv_to_euctw(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)582 kiconv_to_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
583     char **outbuf, size_t *outbytesleft, int *errno)
584 {
585 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
586 	    outbytesleft, errno, utf8_to_euctw);
587 }
588 
589 /*
590  * String based encoding convertor from UTF-8 to EUC-TW.
591  */
592 static size_t
kiconvstr_to_euctw(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)593 kiconvstr_to_euctw(char *inarray, size_t *inlen, char *outarray,
594     size_t *outlen, int flag, int *errno)
595 {
596 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
597 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_euctw);
598 }
599 
600 /*
601  * Encoding convertor from UTF-8 to CP950HKSCS.
602  */
603 static size_t
kiconv_to_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)604 kiconv_to_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
605     char **outbuf, size_t *outbytesleft, int *errno)
606 {
607 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
608 	    outbytesleft, errno, utf8_to_cp950hkscs);
609 }
610 
611 /*
612  * String based encoding convertor from UTF-8 to CP950HKSCS.
613  */
614 static size_t
kiconvstr_to_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)615 kiconvstr_to_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
616     size_t *outlen, int flag, int *errno)
617 {
618 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
619 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_cp950hkscs);
620 }
621 
622 /*
623  * Encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
624  */
625 static size_t
kiconv_to_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)626 kiconv_to_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
627     char **outbuf, size_t *outbytesleft, int *errno)
628 {
629 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
630 	    outbytesleft, errno, utf8_to_big5hkscs);
631 }
632 
633 /*
634  * String based encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
635  */
636 static size_t
kiconvstr_to_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)637 kiconvstr_to_big5hkscs(char *inarray, size_t *inlen, char *outarray,
638     size_t *outlen, int flag, int *errno)
639 {
640 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
641 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5hkscs);
642 }
643 
644 /*
645  * Common convertor from single BIG5/CP950-HKSCS character to UTF-8.
646  * Return: > 0  - Converted successfully
647  *         = -1 - E2BIG
648  */
649 static int8_t
big5_to_utf8_common(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_array_t * table,size_t nitems)650 big5_to_utf8_common(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
651     size_t *ret_val, kiconv_table_array_t *table, size_t nitems)
652 {
653 	size_t	index;
654 	int8_t	sz;
655 	uchar_t	*u8;
656 
657 	index = kiconv_binsearch(big5_val, table, nitems);
658 	u8 = table[index].u8;
659 	sz = u8_number_of_bytes[u8[0]];
660 
661 	if (obtail - ob < sz) {
662 		*ret_val = (size_t)-1;
663 		return (-1);
664 	}
665 
666 	if (index == 0)
667 		(*ret_val)++;	/* Non-identical conversion */
668 
669 	for (index = 0; index < sz; index++)
670 		*ob++ = u8[index];
671 
672 	return (sz);
673 }
674 
675 /*
676  * Convert single BIG5 character to UTF-8.
677  */
678 static int8_t
big5_to_utf8(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)679 big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val)
680 {
681 	return (big5_to_utf8_common(big5_val, ob, obtail, ret_val,
682 	    kiconv_big5_utf8, KICONV_BIG5_UTF8_MAX));
683 }
684 
685 /*
686  * Convert single CP950-HKSCS character to UTF-8.
687  */
688 static int8_t
cp950hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)689 cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
690     size_t *ret_val)
691 {
692 	return (big5_to_utf8_common(hkscs_val, ob, obtail, ret_val,
693 	    kiconv_cp950hkscs_utf8, KICONV_CP950HKSCS_UTF8_MAX));
694 }
695 
696 /*
697  * Calculate unicode value for some CNS planes which fall in Unicode
698  * UDA range.
699  */
700 static uint32_t
get_unicode_from_UDA(size_t plane_no,uchar_t b1,uchar_t b2)701 get_unicode_from_UDA(size_t plane_no, uchar_t b1, uchar_t b2)
702 {
703 	/*
704 	 * CNS Plane 15 is pre-allocated, so need move Plane 16 to back 15
705 	 * to compute the Unicode value.
706 	 */
707 	if (plane_no == 16)
708 		--plane_no;
709 
710 	/* 0xF0000 + (plane_no - 12) * 8836 + (b1 - 0xA1) * 94 + (b2 - 0xA1) */
711 	return (8836 * plane_no + 94 * b1 + b2 + 0xD2611);
712 }
713 
714 /*
715  * Convert single EUC-TW character to UTF-8.
716  * Return: > 0  - Converted successfully
717  *         = -1 - E2BIG
718  */
719 static int8_t
euctw_to_utf8(size_t plane_no,uint32_t euctw_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)720 euctw_to_utf8(size_t plane_no, uint32_t euctw_val, uchar_t *ob,
721     uchar_t *obtail, size_t *ret_val)
722 {
723 	uint32_t u32;
724 	size_t	index;
725 	int8_t	sz;
726 	uchar_t	udc[4];
727 	uchar_t	*u8;
728 
729 	switch (plane_no) {
730 	case 1:
731 		index = kiconv_binsearch(euctw_val, kiconv_cns1_utf8,
732 		    KICONV_CNS1_UTF8_MAX);
733 		u8 = kiconv_cns1_utf8[index].u8;
734 		break;
735 	case 2:
736 		index = kiconv_binsearch(euctw_val, kiconv_cns2_utf8,
737 		    KICONV_CNS2_UTF8_MAX);
738 		u8 = kiconv_cns2_utf8[index].u8;
739 		break;
740 	case 3:
741 		index = kiconv_binsearch(euctw_val, kiconv_cns3_utf8,
742 		    KICONV_CNS3_UTF8_MAX);
743 		u8 = kiconv_cns3_utf8[index].u8;
744 		break;
745 	case 4:
746 		index = kiconv_binsearch(euctw_val, kiconv_cns4_utf8,
747 		    KICONV_CNS4_UTF8_MAX);
748 		u8 = kiconv_cns4_utf8[index].u8;
749 		break;
750 	case 5:
751 		index = kiconv_binsearch(euctw_val, kiconv_cns5_utf8,
752 		    KICONV_CNS5_UTF8_MAX);
753 		u8 = kiconv_cns5_utf8[index].u8;
754 		break;
755 	case 6:
756 		index = kiconv_binsearch(euctw_val, kiconv_cns6_utf8,
757 		    KICONV_CNS6_UTF8_MAX);
758 		u8 = kiconv_cns6_utf8[index].u8;
759 		break;
760 	case 7:
761 		index = kiconv_binsearch(euctw_val, kiconv_cns7_utf8,
762 		    KICONV_CNS7_UTF8_MAX);
763 		u8 = kiconv_cns7_utf8[index].u8;
764 		break;
765 	case 12:
766 	case 13:
767 	case 14:
768 	case 16:
769 		u32 = get_unicode_from_UDA(plane_no,
770 		    (euctw_val & 0xFF00) >> 8, euctw_val & 0xFF);
771 		/*
772 		 * As U+F0000 <= u32 <= U+F8A0F, so its UTF-8 sequence
773 		 * will occupy 4 bytes.
774 		 */
775 		udc[0] = 0xF3;
776 		udc[1] = (uchar_t)(0x80 | (u32 & 0x03F000) >> 12);
777 		udc[2] = (uchar_t)(0x80 | (u32 & 0x000FC0) >> 6);
778 		udc[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
779 		u8 = udc;
780 		index = 1;
781 		break;
782 	case 15:
783 		index = kiconv_binsearch(euctw_val, kiconv_cns15_utf8,
784 		    KICONV_CNS15_UTF8_MAX);
785 		u8 = kiconv_cns15_utf8[index].u8;
786 		break;
787 	default:
788 		index = 0;
789 		u8 = kiconv_cns1_utf8[index].u8;
790 	}
791 
792 	sz = u8_number_of_bytes[u8[0]];
793 	if (obtail - ob < sz) {
794 		*ret_val = (size_t)-1;
795 		return (-1);
796 	}
797 
798 	if (index == 0)
799 		(*ret_val)++;
800 
801 	for (index = 0; index < sz; index++)
802 		*ob++ = u8[index];
803 
804 	return (sz);
805 }
806 
807 /*
808  * Convert single HKSCS character to UTF-8.
809  * Return: > 0  - Converted successfully
810  *         = -1 - E2BIG
811  */
812 static int8_t
big5hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)813 big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
814     size_t *ret_val)
815 {
816 	size_t	index;
817 	int8_t	sz;
818 	uchar_t	*u8;
819 
820 	index = kiconv_binsearch(hkscs_val, kiconv_hkscs_utf8,
821 	    KICONV_HKSCS_UTF8_MAX);
822 	u8 = kiconv_hkscs_utf8[index].u8;
823 
824 	/*
825 	 * Single HKSCS-2004 character may map to 2 Unicode
826 	 * code points.
827 	 */
828 	if (u8[0] == 0xFF) {
829 		u8 = hkscs_special_sequence[u8[1]];
830 		sz = 4;
831 	} else {
832 		sz = u8_number_of_bytes[u8[0]];
833 	}
834 
835 	if (obtail - ob < sz) {
836 		*ret_val = (size_t)-1;
837 		return (-1);
838 	}
839 
840 	if (index == 0)
841 		(*ret_val)++;	/* Non-identical conversion. */
842 
843 	for (index = 0; index < sz; index++)
844 		*ob++ = u8[index];
845 
846 	return (sz);
847 }
848 
849 /*
850  * Convert single UTF-8 character to EUC-TW.
851  * Return: > 0  - Converted successfully
852  *         = -1 - E2BIG
853  */
854 /* ARGSUSED */
855 static int8_t
utf8_to_euctw(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)856 utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
857     uchar_t *ob, uchar_t *obtail, size_t *ret_val)
858 {
859 	size_t		index;
860 	size_t		plane_no;
861 	uchar_t		byte1;
862 	uchar_t		byte2;
863 
864 	if (utf8 >= KICONV_TC_UDA_UTF8_START &&
865 	    utf8 <= KICONV_TC_UDA_UTF8_END) {
866 		/*
867 		 * Calculate EUC-TW code if utf8 is in Unicode
868 		 * Private Plane 15.
869 		 */
870 		index = (((utf8 & 0x7000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
871 		    ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
872 		    KICONV_TC_UDA_UCS4_START;
873 		plane_no = 12 + index / 8836;
874 		byte1 = 0xA1 + (index % 8836) / 94;
875 		byte2 = 0xA1 + index % 94;
876 
877 		/* CNS Plane 15 is pre-allocated, so place it into Plane 16. */
878 		if (plane_no == 15)
879 			plane_no = 16;
880 	} else {
881 		uint32_t	euctw_val;
882 
883 		index = kiconv_binsearch(utf8, kiconv_utf8_euctw,
884 		    KICONV_UTF8_EUCTW_MAX);
885 
886 		if (index == 0) {
887 			if (ob >= obtail) {
888 				*ret_val = (size_t)-1;
889 				return (-1);
890 			}
891 
892 			*ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
893 			(*ret_val)++;
894 
895 			return (1);
896 		}
897 
898 		euctw_val = kiconv_utf8_euctw[index].value;
899 		byte1 = (euctw_val & 0xFF00) >> 8;
900 		byte2 = euctw_val & 0xFF;
901 		plane_no = euctw_val >> 16;
902 	}
903 
904 	if (obtail - ob < (plane_no == 1 ? 2 : 4)) {
905 		*ret_val = (size_t)-1;
906 		return (-1);
907 	}
908 
909 	if (plane_no != 1) {
910 		*ob++ = KICONV_TC_EUCTW_MBYTE;
911 		*ob++ = KICONV_TC_EUCTW_PMASK + plane_no;
912 	}
913 
914 	*ob++ = byte1;
915 	*ob = byte2;
916 
917 	return (plane_no == 1 ? 2 : 4);
918 }
919 
920 /*
921  * Convert single UTF-8 character to BIG5-HKSCS
922  * Return: > 0  - Converted successfully
923  *         = -1 - E2BIG
924  */
925 static int8_t
utf8_to_big5hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)926 utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
927     uchar_t *ob, uchar_t *obtail, size_t *ret_val)
928 {
929 	size_t		index;
930 	int8_t		hkscslen;
931 	uint32_t	hkscscode;
932 	boolean_t	special_sequence = B_FALSE;
933 
934 	index = kiconv_binsearch(utf8, kiconv_utf8_hkscs,
935 	    KICONV_UTF8_HKSCS_MAX);
936 	hkscscode = kiconv_utf8_hkscs[index].value;
937 
938 	/*
939 	 * There are 4 special code points in HKSCS-2004 which mapped
940 	 * to 2 UNICODE code points.
941 	 */
942 	if ((int32_t)hkscscode < 0) {
943 		size_t special_index = (-(int32_t)hkscscode - 1) * 3;
944 
945 		/* Check the following 2 bytes. */
946 		if (ibtail - *inbuf >= 2 && **inbuf == 0xcc &&
947 		    (*(*inbuf + 1) == 0x84 || *(*inbuf + 1) == 0x8c)) {
948 			special_index += (*(*inbuf + 1) == 0x84 ? 1 : 2);
949 			special_sequence = B_TRUE;
950 		}
951 
952 		hkscscode = ucs_special_sequence[special_index];
953 	}
954 
955 	hkscslen = (hkscscode <= 0xFF) ? 1 : 2;
956 	if (obtail - ob < hkscslen) {
957 		*ret_val = (size_t)-1;
958 		return (-1);
959 	}
960 
961 	if (index == 0)
962 		(*ret_val)++;
963 
964 	if (hkscslen > 1)
965 		*ob++ = (uchar_t)(hkscscode >> 8);
966 	*ob = (uchar_t)(hkscscode & 0xFF);
967 
968 	if (special_sequence) {		/* Advance for special sequence */
969 		(*inbuf) += 2;
970 	}
971 
972 	return (hkscslen);
973 }
974 
975 /*
976  * Common convertor for UTF-8 to BIG5/CP950-HKSCS.
977  * Return: > 0  - Converted successfully
978  *         = -1 - E2BIG
979  */
980 static int8_t
utf8_to_big5_common(uint32_t utf8,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_t * table,size_t nitems)981 utf8_to_big5_common(uint32_t utf8, uchar_t *ob, uchar_t *obtail,
982     size_t *ret_val, kiconv_table_t *table, size_t nitems)
983 {
984 	size_t		index;
985 	int8_t		big5len;
986 	uint32_t	big5code;
987 
988 	index = kiconv_binsearch(utf8, table, nitems);
989 	big5code = table[index].value;
990 	big5len = (big5code <= 0xFF) ? 1 : 2;
991 
992 	if (obtail - ob < big5len) {
993 		*ret_val = (size_t)-1;
994 		return (-1);
995 	}
996 
997 	if (index == 0)
998 		(*ret_val)++;
999 
1000 	if (big5len > 1)
1001 		*ob++ = (uchar_t)(big5code >> 8);
1002 	*ob = (uchar_t)(big5code & 0xFF);
1003 
1004 	return (big5len);
1005 }
1006 
1007 /*
1008  * Convert single UTF-8 character to BIG5.
1009  */
1010 /* ARGSUSED */
1011 static int8_t
utf8_to_big5(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1012 utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1013     uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1014 {
1015 	return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1016 	    kiconv_utf8_big5, KICONV_UTF8_BIG5_MAX));
1017 }
1018 
1019 /*
1020  * Convert single UTF-8 character to CP950-HKSCS for Windows compatibility.
1021  */
1022 /* ARGSUSED */
1023 static int8_t
utf8_to_cp950hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1024 utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1025     uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1026 {
1027 	return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1028 	    kiconv_utf8_cp950hkscs, KICONV_UTF8_CP950HKSCS));
1029 }
1030 
1031 static kiconv_ops_t kiconv_tc_ops_tbl[] = {
1032 	{
1033 		"big5", "utf-8", kiconv_open_to_cck, kiconv_to_big5,
1034 		kiconv_close_to_cck, kiconvstr_to_big5
1035 	},
1036 	{
1037 		"utf-8", "big5", open_fr_big5, kiconv_fr_big5,
1038 		close_fr_tc, kiconvstr_fr_big5
1039 	},
1040 
1041 	{
1042 		"big5-hkscs", "utf-8", kiconv_open_to_cck, kiconv_to_big5hkscs,
1043 		kiconv_close_to_cck, kiconvstr_to_big5hkscs
1044 	},
1045 	{
1046 		"utf-8", "big5-hkscs", open_fr_big5hkscs, kiconv_fr_big5hkscs,
1047 		close_fr_tc, kiconvstr_fr_big5hkscs
1048 	},
1049 
1050 	{
1051 		"euc-tw", "utf-8", kiconv_open_to_cck, kiconv_to_euctw,
1052 		kiconv_close_to_cck, kiconvstr_to_euctw
1053 	},
1054 	{
1055 		"utf-8", "euc-tw", open_fr_euctw, kiconv_fr_euctw,
1056 		close_fr_tc, kiconvstr_fr_euctw
1057 	},
1058 
1059 	{
1060 		"cp950-hkscs", "utf-8", kiconv_open_to_cck,
1061 		kiconv_to_cp950hkscs, kiconv_close_to_cck,
1062 		kiconvstr_to_cp950hkscs
1063 	},
1064 	{
1065 		"utf-8", "cp950-hkscs", open_fr_cp950hkscs,
1066 		kiconv_fr_cp950hkscs, close_fr_tc, kiconvstr_fr_cp950hkscs
1067 	},
1068 };
1069 
1070 static kiconv_module_info_t kiconv_tc_info = {
1071 	"kiconv_tc",		/* module name */
1072 	sizeof (kiconv_tc_ops_tbl) / sizeof (kiconv_tc_ops_tbl[0]),
1073 	kiconv_tc_ops_tbl,
1074 	0,
1075 	NULL,
1076 	NULL,
1077 	0
1078 };
1079 
1080 static struct modlkiconv modlkiconv_tc = {
1081 	&mod_kiconvops,
1082 	"kiconv Traditional Chinese module 1.0",
1083 	&kiconv_tc_info
1084 };
1085 
1086 static struct modlinkage modlinkage = {
1087 	MODREV_1,
1088 	(void *)&modlkiconv_tc,
1089 	NULL
1090 };
1091 
1092 int
_init(void)1093 _init(void)
1094 {
1095 	int err;
1096 
1097 	err = mod_install(&modlinkage);
1098 	if (err)
1099 		cmn_err(CE_WARN, "kiconv_tc: failed to load kernel module");
1100 
1101 	return (err);
1102 }
1103 
1104 int
_fini(void)1105 _fini(void)
1106 {
1107 	int err;
1108 
1109 	/*
1110 	 * If this module is being used, then, we cannot remove the module.
1111 	 * The following checking will catch pretty much all usual cases.
1112 	 *
1113 	 * Any remaining will be catached by the kiconv_unregister_module()
1114 	 * during mod_remove() at below.
1115 	 */
1116 	if (kiconv_module_ref_count(KICONV_MODULE_ID_TC))
1117 		return (EBUSY);
1118 
1119 	err = mod_remove(&modlinkage);
1120 	if (err)
1121 		cmn_err(CE_WARN, "kiconv_tc: failed to remove kernel module");
1122 
1123 	return (err);
1124 }
1125 
1126 int
_info(struct modinfo * modinfop)1127 _info(struct modinfo *modinfop)
1128 {
1129 	return (mod_info(&modlinkage, modinfop));
1130 }
1131