xref: /titanic_41/usr/src/uts/common/kiconv/kiconv_tc/kiconv_tc.c (revision 15d9d0b528387242011cdcc6190c9e598cfe3a07)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/sunddi.h>
35 #include <sys/byteorder.h>
36 #include <sys/errno.h>
37 #include <sys/modctl.h>
38 #include <sys/u8_textprep.h>
39 #include <sys/kiconv.h>
40 #include <sys/kiconv_cck_common.h>
41 #include <sys/kiconv_tc.h>
42 #include <sys/kiconv_big5_utf8.h>
43 #include <sys/kiconv_euctw_utf8.h>
44 #include <sys/kiconv_hkscs_utf8.h>
45 #include <sys/kiconv_cp950hkscs_utf8.h>
46 #include <sys/kiconv_utf8_big5.h>
47 #include <sys/kiconv_utf8_euctw.h>
48 #include <sys/kiconv_utf8_cp950hkscs.h>
49 #include <sys/kiconv_utf8_hkscs.h>
50 
51 /* 4 HKSCS-2004 code points map to 2 Unicode code points separately. */
52 static uchar_t hkscs_special_sequence[][4] = {
53 	{ 0xc3, 0x8a, 0xcc, 0x84 },	/* 0x8862 */
54 	{ 0xc3, 0x8a, 0xcc, 0x8c },	/* 0x8864 */
55 	{ 0xc3, 0xaa, 0xcc, 0x84 },	/* 0x88a3 */
56 	{ 0xc3, 0xaa, 0xcc, 0x8c } 	/* 0x88a5 */
57 };
58 
/*
 * HKSCS-2004 codes for U+00CA and U+00EA, alone or followed by a
 * combining mark.  The table holds two groups of three entries; within a
 * group the offset is 0 for the bare letter, 1 when it is followed by
 * U+0304 and 2 when it is followed by U+030C.  utf8_to_big5hkscs()
 * computes the index as (group * 3 + offset).
 *
 * The table is only ever read, so it is declared const.
 */
static const uint32_t ucs_special_sequence[] = {
	0x8866,		/* U+00ca */
	0x8862,		/* U+00ca U+0304 */
	0x8864,		/* U+00ca U+030c */
	0x88a7,		/* U+00ea */
	0x88a3,		/* U+00ea U+0304 */
	0x88a5		/* U+00ea U+030c */
};
68 
69 typedef int8_t (*kiconv_big5toutf8_t)(uint32_t value, uchar_t *ob,
70 	uchar_t *obtail, size_t *ret_val);
71 
72 static int8_t utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
73 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
74 static int8_t utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
75 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
76 static int8_t utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf,
77 	uchar_t *ibtail, uchar_t *ob, uchar_t *obtail, size_t *ret_val);
78 static int8_t utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
79 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
80 static int8_t big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
81 	size_t *ret_val);
82 static int8_t big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
83 	uchar_t *obtail, size_t *ret_val);
84 static int8_t cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
85 	uchar_t *obtail, size_t *ret_val);
86 static int8_t euctw_to_utf8(size_t plane_no, uint32_t euctw_val,
87 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
88 static uint32_t get_unicode_from_UDA(size_t plane_no, uchar_t byte1,
89 	uchar_t byte2);
90 
/*
 * Magic values returned as the conversion descriptor by the open_fr_*()
 * routines; close_fr_tc() rejects anything above the largest one.
 */
#define	KICONV_TC_BIG5		(0x01)
#define	KICONV_TC_BIG5HKSCS	(0x02)
#define	KICONV_TC_CP950HKSCS	(0x03)
#define	KICONV_TC_EUCTW		(0x04)
#define	KICONV_TC_MAX_MAGIC_ID	(KICONV_TC_EUCTW)
96 
97 static void *
open_fr_big5()98 open_fr_big5()
99 {
100 	return ((void *)KICONV_TC_BIG5);
101 }
102 
103 static void *
open_fr_big5hkscs()104 open_fr_big5hkscs()
105 {
106 	return ((void *)KICONV_TC_BIG5HKSCS);
107 }
108 
109 static void *
open_fr_cp950hkscs()110 open_fr_cp950hkscs()
111 {
112 	return ((void *)KICONV_TC_CP950HKSCS);
113 }
114 
115 static void *
open_fr_euctw()116 open_fr_euctw()
117 {
118 	return ((void *)KICONV_TC_EUCTW);
119 }
120 
121 static int
close_fr_tc(void * s)122 close_fr_tc(void *s)
123 {
124 	if ((uintptr_t)s > KICONV_TC_MAX_MAGIC_ID)
125 		return (EBADF);
126 
127 	return (0);
128 }
129 
130 /*
131  * Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) to UTF-8.
132  */
133 static size_t
kiconv_fr_big5_common(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_big5toutf8_t ptr_big5touf8)134 kiconv_fr_big5_common(void *kcd, char **inbuf, size_t *inbytesleft,
135 	char **outbuf, size_t *outbytesleft, int *errno,
136 	kiconv_big5toutf8_t ptr_big5touf8)
137 {
138 	uchar_t		*ib;
139 	uchar_t		*ob;
140 	uchar_t		*ibtail;
141 	uchar_t		*obtail;
142 	size_t		ret_val;
143 	int8_t		sz;
144 	uint32_t	big5_val;
145 
146 	/* Check on the kiconv code conversion descriptor. */
147 	if (kcd == NULL || kcd == (void *)-1) {
148 		*errno = EBADF;
149 		return ((size_t)-1);
150 	}
151 
152 	/* If this is a state reset request, process and return. */
153 	if (inbuf == NULL || *inbuf == NULL) {
154 		return (0);
155 	}
156 
157 	ret_val = 0;
158 	ib = (uchar_t *)*inbuf;
159 	ob = (uchar_t *)*outbuf;
160 	ibtail = ib + *inbytesleft;
161 	obtail = ob + *outbytesleft;
162 
163 	while (ib < ibtail) {
164 		if (KICONV_IS_ASCII(*ib)) {
165 			if (ob >= obtail) {
166 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
167 			}
168 
169 			*ob++ = *ib++;
170 			continue;
171 		}
172 
173 		/*
174 		 * Issue EILSEQ error if the first byte is not a
175 		 * valid BIG5/HKSCS leading byte.
176 		 */
177 		if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
178 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
179 		}
180 
181 		/*
182 		 * Issue EINVAL error if input buffer has an incomplete
183 		 * character at the end of the buffer.
184 		 */
185 		if (ibtail - ib < 2) {
186 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
187 		}
188 
189 		/*
190 		 * Issue EILSEQ error if the remaining bytes is not
191 		 * a valid BIG5/HKSCS byte.
192 		 */
193 		if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
194 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
195 		}
196 
197 		/* Now we have a valid BIG5/HKSCS character. */
198 		big5_val = (uint32_t)(*ib) << 8 | *(ib + 1);
199 		sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
200 
201 		if (sz < 0) {
202 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
203 		}
204 
205 		ib += 2;
206 		ob += sz;
207 	}
208 
209 	*inbuf = (char *)ib;
210 	*inbytesleft = ibtail - ib;
211 	*outbuf = (char *)ob;
212 	*outbytesleft = obtail - ob;
213 
214 	return (ret_val);
215 }
216 
217 /*
218  * String based Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS)
219  * to UTF-8.
220  */
221 static size_t
kiconvstr_fr_big5_common(uchar_t * ib,size_t * inlen,uchar_t * ob,size_t * outlen,int flag,int * errno,kiconv_big5toutf8_t ptr_big5touf8)222 kiconvstr_fr_big5_common(uchar_t *ib, size_t *inlen, uchar_t *ob,
223     size_t *outlen, int flag, int *errno,
224     kiconv_big5toutf8_t ptr_big5touf8)
225 {
226 	uchar_t		*oldib;
227 	uchar_t		*ibtail;
228 	uchar_t		*obtail;
229 	size_t		ret_val;
230 	int8_t		sz;
231 	uint32_t	big5_val;
232 	boolean_t	do_not_ignore_null;
233 
234 	ret_val = 0;
235 	ibtail = ib + *inlen;
236 	obtail = ob + *outlen;
237 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
238 
239 	while (ib < ibtail) {
240 		if (*ib == '\0' && do_not_ignore_null)
241 			break;
242 
243 		if (KICONV_IS_ASCII(*ib)) {
244 			if (ob >= obtail) {
245 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
246 			}
247 
248 			*ob++ = *ib++;
249 			continue;
250 		}
251 
252 		oldib = ib;
253 
254 		if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
255 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
256 		}
257 
258 		if (ibtail - ib < 2) {
259 			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
260 		}
261 
262 		if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
263 			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
264 		}
265 
266 		big5_val = *ib++;
267 		big5_val = (big5_val << 8) | *ib++;
268 		sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
269 
270 		if (sz < 0) {
271 			ib = oldib;
272 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
273 		}
274 
275 		ob += sz;
276 		continue;
277 
278 REPLACE_INVALID:
279 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
280 			ib = oldib;
281 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
282 		}
283 
284 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
285 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
286 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
287 		ret_val++;
288 	}
289 
290 	*inlen = ibtail - ib;
291 	*outlen = obtail - ob;
292 
293 	return (ret_val);
294 }
295 
296 /*
297  * Encoding convertor from BIG5 to UTF-8.
298  */
299 static size_t
kiconv_fr_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)300 kiconv_fr_big5(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf,
301 	size_t *outbytesleft, int *errno)
302 {
303 	return (kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
304 	    outbytesleft, errno, big5_to_utf8));
305 }
306 
307 /*
308  * String based encoding convertor from BIG5 to UTF-8.
309  */
310 static size_t
kiconvstr_fr_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)311 kiconvstr_fr_big5(char *inarray, size_t *inlen, char *outarray,
312     size_t *outlen, int flag, int *errno)
313 {
314 	return (kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
315 	    (uchar_t *)outarray, outlen, flag, errno,
316 	    big5_to_utf8));
317 }
318 
319 /*
320  * Encoding convertor from BIG5-HKSCS to UTF-8.
321  */
322 static size_t
kiconv_fr_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)323 kiconv_fr_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
324     char **outbuf, size_t *outbytesleft, int *errno)
325 {
326 	return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
327 	    outbytesleft, errno, big5hkscs_to_utf8);
328 }
329 
330 /*
331  * String based encoding convertor from BIG5-HKSCS to UTF-8.
332  */
333 static size_t
kiconvstr_fr_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)334 kiconvstr_fr_big5hkscs(char *inarray, size_t *inlen, char *outarray,
335 	size_t *outlen, int flag, int *errno)
336 {
337 	return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
338 	    (uchar_t *)outarray, outlen, flag, errno, big5hkscs_to_utf8);
339 }
340 
341 /*
342  * Encoding convertor from CP950-HKSCS to UTF-8.
343  */
344 static size_t
kiconv_fr_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)345 kiconv_fr_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
346     char **outbuf, size_t *outbytesleft, int *errno)
347 {
348 	return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
349 	    outbytesleft, errno, cp950hkscs_to_utf8);
350 }
351 
352 /*
353  * String based encoding convertor from CP950-HKSCS to UTF-8.
354  */
355 static size_t
kiconvstr_fr_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)356 kiconvstr_fr_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
357 	size_t *outlen, int flag, int *errno)
358 {
359 	return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
360 	    (uchar_t *)outarray, outlen, flag, errno, cp950hkscs_to_utf8);
361 }
362 
363 /*
364  * Encoding convertor from EUC-TW to UTF-8.
365  */
366 static size_t
kiconv_fr_euctw(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)367 kiconv_fr_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
368 	char **outbuf, size_t *outbytesleft, int *errno)
369 {
370 	uchar_t		*ib;
371 	uchar_t		*ob;
372 	uchar_t		*ibtail;
373 	uchar_t		*obtail;
374 	uchar_t		*oldib;
375 	size_t		ret_val;
376 	size_t		plane_no;
377 	int8_t		sz;
378 	uint32_t	euctw_val;
379 	boolean_t	isplane1;
380 
381 	/* Check on the kiconv code conversion descriptor. */
382 	if (kcd == NULL || kcd == (void *)-1) {
383 		*errno = EBADF;
384 		return ((size_t)-1);
385 	}
386 
387 	/* If this is a state reset request, process and return. */
388 	if (inbuf == NULL || *inbuf == NULL) {
389 		return (0);
390 	}
391 
392 	ret_val = 0;
393 	ib = (uchar_t *)*inbuf;
394 	ob = (uchar_t *)*outbuf;
395 	ibtail = ib + *inbytesleft;
396 	obtail = ob + *outbytesleft;
397 
398 	while (ib < ibtail) {
399 		if (KICONV_IS_ASCII(*ib)) {
400 			if (ob >= obtail) {
401 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
402 			}
403 
404 			*ob++ = *ib++;
405 			continue;
406 		}
407 
408 		/*
409 		 * Issue EILSEQ error if the first byte is not a
410 		 * valid EUC-TW leading byte.
411 		 */
412 		if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
413 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
414 		}
415 
416 		isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
417 		    B_FALSE : B_TRUE;
418 
419 		/*
420 		 * Issue EINVAL error if input buffer has an incomplete
421 		 * character at the end of the buffer.
422 		 */
423 		if (ibtail - ib < (isplane1 ? 2 : 4)) {
424 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
425 		}
426 
427 		oldib = ib;
428 		plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
429 
430 		/*
431 		 * Issue EILSEQ error if the remaining bytes are not
432 		 * valid EUC-TW bytes.
433 		 */
434 		if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
435 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
436 		}
437 
438 		if (! isplane1)
439 			ib += 2;
440 
441 		/* Now we have a valid EUC-TW character. */
442 		euctw_val = *ib++;
443 		euctw_val = (euctw_val << 8) | *ib++;
444 		sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
445 
446 		if (sz < 0) {
447 			ib = oldib;
448 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
449 		}
450 
451 		ob += sz;
452 	}
453 
454 	*inbuf = (char *)ib;
455 	*inbytesleft = ibtail - ib;
456 	*outbuf = (char *)ob;
457 	*outbytesleft = obtail - ob;
458 
459 	return (ret_val);
460 }
461 
462 /*
463  * String based encoding convertor from EUC-TW to UTF-8.
464  */
465 static size_t
kiconvstr_fr_euctw(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)466 kiconvstr_fr_euctw(char *inarray, size_t *inlen, char *outarray,
467 	size_t *outlen, int flag, int *errno)
468 {
469 	uchar_t		*ib;
470 	uchar_t		*ob;
471 	uchar_t		*ibtail;
472 	uchar_t		*obtail;
473 	uchar_t		*oldib;
474 	size_t		ret_val;
475 	size_t		plane_no;
476 	int8_t		sz;
477 	uint32_t	euctw_val;
478 	boolean_t	isplane1;
479 	boolean_t	do_not_ignore_null;
480 
481 	ret_val = 0;
482 	ib = (uchar_t *)inarray;
483 	ob = (uchar_t *)outarray;
484 	ibtail = ib + *inlen;
485 	obtail = ob + *outlen;
486 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
487 
488 	while (ib < ibtail) {
489 		if (*ib == '\0' && do_not_ignore_null)
490 			break;
491 
492 		if (KICONV_IS_ASCII(*ib)) {
493 			if (ob >= obtail) {
494 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
495 			}
496 
497 			*ob++ = *ib++;
498 			continue;
499 		}
500 
501 		oldib = ib;
502 
503 		if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
504 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
505 		}
506 
507 		isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
508 		    B_FALSE : B_TRUE;
509 
510 		if (ibtail - ib < (isplane1 ? 2 : 4)) {
511 			if (flag & KICONV_REPLACE_INVALID) {
512 				ib = ibtail;
513 				goto REPLACE_INVALID;
514 			}
515 
516 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
517 		}
518 
519 		plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
520 
521 		if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
522 			KICONV_SET_ERRNO_WITH_FLAG(isplane1 ? 2 : 4, EILSEQ);
523 		}
524 
525 		if (! isplane1)
526 			ib += 2;
527 
528 		euctw_val = *ib++;
529 		euctw_val = (euctw_val << 8) | *ib++;
530 		sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
531 
532 		if (sz < 0) {
533 			ib = oldib;
534 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
535 		}
536 
537 		ob += sz;
538 		continue;
539 
540 REPLACE_INVALID:
541 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
542 			ib = oldib;
543 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
544 		}
545 
546 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
547 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
548 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
549 		ret_val++;
550 	}
551 
552 	*inlen = ibtail - ib;
553 	*outlen = obtail - ob;
554 
555 	return (ret_val);
556 }
557 
558 /*
559  * Encoding convertor from UTF-8 to BIG5.
560  */
561 static size_t
kiconv_to_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)562 kiconv_to_big5(void *kcd, char **inbuf, size_t *inbytesleft,
563 	char **outbuf, size_t *outbytesleft, int *errno)
564 {
565 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
566 	    outbytesleft, errno, utf8_to_big5);
567 }
568 
569 /*
570  * String based encoding convertor from UTF-8 to BIG5.
571  */
572 static size_t
kiconvstr_to_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)573 kiconvstr_to_big5(char *inarray, size_t *inlen, char *outarray,
574 	size_t *outlen, int flag, int *errno)
575 {
576 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
577 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5);
578 }
579 
580 /*
581  * Encoding convertor from UTF-8 to EUC-TW.
582  */
583 static size_t
kiconv_to_euctw(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)584 kiconv_to_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
585 	char **outbuf, size_t *outbytesleft, int *errno)
586 {
587 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
588 	    outbytesleft, errno, utf8_to_euctw);
589 }
590 
591 /*
592  * String based encoding convertor from UTF-8 to EUC-TW.
593  */
594 static size_t
kiconvstr_to_euctw(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)595 kiconvstr_to_euctw(char *inarray, size_t *inlen, char *outarray,
596 	size_t *outlen, int flag, int *errno)
597 {
598 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
599 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_euctw);
600 }
601 
602 /*
603  * Encoding convertor from UTF-8 to CP950HKSCS.
604  */
605 static size_t
kiconv_to_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)606 kiconv_to_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
607 	char **outbuf, size_t *outbytesleft, int *errno)
608 {
609 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
610 	    outbytesleft, errno, utf8_to_cp950hkscs);
611 }
612 
613 /*
614  * String based encoding convertor from UTF-8 to CP950HKSCS.
615  */
616 static size_t
kiconvstr_to_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)617 kiconvstr_to_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
618 	size_t *outlen, int flag, int *errno)
619 {
620 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
621 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_cp950hkscs);
622 }
623 
624 /*
625  * Encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
626  */
627 static size_t
kiconv_to_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)628 kiconv_to_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
629 	char **outbuf, size_t *outbytesleft, int *errno)
630 {
631 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
632 	    outbytesleft, errno, utf8_to_big5hkscs);
633 }
634 
635 /*
636  * String based encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
637  */
638 static size_t
kiconvstr_to_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)639 kiconvstr_to_big5hkscs(char *inarray, size_t *inlen, char *outarray,
640     size_t *outlen, int flag, int *errno)
641 {
642 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
643 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5hkscs);
644 }
645 
646 /*
647  * Common convertor from single BIG5/CP950-HKSCS character to UTF-8.
648  * Return: > 0  - Converted successfully
649  *         = -1 - E2BIG
650  */
651 static int8_t
big5_to_utf8_common(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_array_t * table,size_t nitems)652 big5_to_utf8_common(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
653 	size_t *ret_val, kiconv_table_array_t *table, size_t nitems)
654 {
655 	size_t	index;
656 	int8_t	sz;
657 	uchar_t	*u8;
658 
659 	index = kiconv_binsearch(big5_val, table, nitems);
660 	u8 = table[index].u8;
661 	sz = u8_number_of_bytes[u8[0]];
662 
663 	if (obtail - ob < sz) {
664 		*ret_val = (size_t)-1;
665 		return (-1);
666 	}
667 
668 	if (index == 0)
669 		(*ret_val)++;	/* Non-identical conversion */
670 
671 	for (index = 0; index < sz; index++)
672 		*ob++ = u8[index];
673 
674 	return (sz);
675 }
676 
677 /*
678  * Convert single BIG5 character to UTF-8.
679  */
680 static int8_t
big5_to_utf8(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)681 big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val)
682 {
683 	return (big5_to_utf8_common(big5_val, ob, obtail, ret_val,
684 	    kiconv_big5_utf8, KICONV_BIG5_UTF8_MAX));
685 }
686 
687 /*
688  * Convert single CP950-HKSCS character to UTF-8.
689  */
690 static int8_t
cp950hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)691 cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
692 	size_t *ret_val)
693 {
694 	return (big5_to_utf8_common(hkscs_val, ob, obtail, ret_val,
695 	    kiconv_cp950hkscs_utf8, KICONV_CP950HKSCS_UTF8_MAX));
696 }
697 
698 /*
699  * Calculate unicode value for some CNS planes which fall in Unicode
700  * UDA range.
701  */
702 static uint32_t
get_unicode_from_UDA(size_t plane_no,uchar_t b1,uchar_t b2)703 get_unicode_from_UDA(size_t plane_no, uchar_t b1, uchar_t b2)
704 {
705 	/*
706 	 * CNS Plane 15 is pre-allocated, so need move Plane 16 to back 15
707 	 * to compute the Unicode value.
708 	 */
709 	if (plane_no == 16)
710 		--plane_no;
711 
712 	/* 0xF0000 + (plane_no - 12) * 8836 + (b1 - 0xA1) * 94 + (b2 - 0xA1) */
713 	return (8836 * plane_no + 94 * b1 + b2 + 0xD2611);
714 }
715 
716 /*
717  * Convert single EUC-TW character to UTF-8.
718  * Return: > 0  - Converted successfully
719  *         = -1 - E2BIG
720  */
721 static int8_t
euctw_to_utf8(size_t plane_no,uint32_t euctw_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)722 euctw_to_utf8(size_t plane_no, uint32_t euctw_val, uchar_t *ob,
723 	uchar_t *obtail, size_t *ret_val)
724 {
725 	uint32_t u32;
726 	size_t	index;
727 	int8_t	sz;
728 	uchar_t	udc[4];
729 	uchar_t	*u8;
730 
731 	switch (plane_no) {
732 	case 1:
733 		index = kiconv_binsearch(euctw_val, kiconv_cns1_utf8,
734 		    KICONV_CNS1_UTF8_MAX);
735 		u8 = kiconv_cns1_utf8[index].u8;
736 		break;
737 	case 2:
738 		index = kiconv_binsearch(euctw_val, kiconv_cns2_utf8,
739 		    KICONV_CNS2_UTF8_MAX);
740 		u8 = kiconv_cns2_utf8[index].u8;
741 		break;
742 	case 3:
743 		index = kiconv_binsearch(euctw_val, kiconv_cns3_utf8,
744 		    KICONV_CNS3_UTF8_MAX);
745 		u8 = kiconv_cns3_utf8[index].u8;
746 		break;
747 	case 4:
748 		index = kiconv_binsearch(euctw_val, kiconv_cns4_utf8,
749 		    KICONV_CNS4_UTF8_MAX);
750 		u8 = kiconv_cns4_utf8[index].u8;
751 		break;
752 	case 5:
753 		index = kiconv_binsearch(euctw_val, kiconv_cns5_utf8,
754 		    KICONV_CNS5_UTF8_MAX);
755 		u8 = kiconv_cns5_utf8[index].u8;
756 		break;
757 	case 6:
758 		index = kiconv_binsearch(euctw_val, kiconv_cns6_utf8,
759 		    KICONV_CNS6_UTF8_MAX);
760 		u8 = kiconv_cns6_utf8[index].u8;
761 		break;
762 	case 7:
763 		index = kiconv_binsearch(euctw_val, kiconv_cns7_utf8,
764 		    KICONV_CNS7_UTF8_MAX);
765 		u8 = kiconv_cns7_utf8[index].u8;
766 		break;
767 	case 12:
768 	case 13:
769 	case 14:
770 	case 16:
771 		u32 = get_unicode_from_UDA(plane_no,
772 		    (euctw_val & 0xFF00) >> 8, euctw_val & 0xFF);
773 		/*
774 		 * As U+F0000 <= u32 <= U+F8A0F, so its UTF-8 sequence
775 		 * will occupy 4 bytes.
776 		 */
777 		udc[0] = 0xF3;
778 		udc[1] = (uchar_t)(0x80 | (u32 & 0x03F000) >> 12);
779 		udc[2] = (uchar_t)(0x80 | (u32 & 0x000FC0) >> 6);
780 		udc[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
781 		u8 = udc;
782 		index = 1;
783 		break;
784 	case 15:
785 		index = kiconv_binsearch(euctw_val, kiconv_cns15_utf8,
786 		    KICONV_CNS15_UTF8_MAX);
787 		u8 = kiconv_cns15_utf8[index].u8;
788 		break;
789 	default:
790 		index = 0;
791 		u8 = kiconv_cns1_utf8[index].u8;
792 	}
793 
794 	sz = u8_number_of_bytes[u8[0]];
795 	if (obtail - ob < sz) {
796 		*ret_val = (size_t)-1;
797 		return (-1);
798 	}
799 
800 	if (index == 0)
801 		(*ret_val)++;
802 
803 	for (index = 0; index < sz; index++)
804 		*ob++ = u8[index];
805 
806 	return (sz);
807 }
808 
809 /*
810  * Convert single HKSCS character to UTF-8.
811  * Return: > 0  - Converted successfully
812  *         = -1 - E2BIG
813  */
814 static int8_t
big5hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)815 big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
816 	size_t *ret_val)
817 {
818 	size_t	index;
819 	int8_t	sz;
820 	uchar_t	*u8;
821 
822 	index = kiconv_binsearch(hkscs_val, kiconv_hkscs_utf8,
823 	    KICONV_HKSCS_UTF8_MAX);
824 	u8 = kiconv_hkscs_utf8[index].u8;
825 
826 	/*
827 	 * Single HKSCS-2004 character may map to 2 Unicode
828 	 * code points.
829 	 */
830 	if (u8[0] == 0xFF) {
831 		u8 = hkscs_special_sequence[u8[1]];
832 		sz = 4;
833 	} else {
834 		sz = u8_number_of_bytes[u8[0]];
835 	}
836 
837 	if (obtail - ob < sz) {
838 		*ret_val = (size_t)-1;
839 		return (-1);
840 	}
841 
842 	if (index == 0)
843 		(*ret_val)++;	/* Non-identical conversion. */
844 
845 	for (index = 0; index < sz; index++)
846 		*ob++ = u8[index];
847 
848 	return (sz);
849 }
850 
851 /*
852  * Convert single UTF-8 character to EUC-TW.
853  * Return: > 0  - Converted successfully
854  *         = -1 - E2BIG
855  */
856 /* ARGSUSED */
857 static int8_t
utf8_to_euctw(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)858 utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
859 	uchar_t *ob, uchar_t *obtail, size_t *ret_val)
860 {
861 	size_t		index;
862 	size_t		plane_no;
863 	uchar_t		byte1;
864 	uchar_t		byte2;
865 
866 	if (utf8 >= KICONV_TC_UDA_UTF8_START &&
867 	    utf8 <= KICONV_TC_UDA_UTF8_END) {
868 		/*
869 		 * Calculate EUC-TW code if utf8 is in Unicode
870 		 * Private Plane 15.
871 		 */
872 		index = (((utf8 & 0x7000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
873 		    ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
874 		    KICONV_TC_UDA_UCS4_START;
875 		plane_no = 12 + index / 8836;
876 		byte1 = 0xA1 + (index % 8836) / 94;
877 		byte2 = 0xA1 + index % 94;
878 
879 		/* CNS Plane 15 is pre-allocated, so place it into Plane 16. */
880 		if (plane_no == 15)
881 			plane_no = 16;
882 	} else {
883 		uint32_t	euctw_val;
884 
885 		index = kiconv_binsearch(utf8, kiconv_utf8_euctw,
886 		    KICONV_UTF8_EUCTW_MAX);
887 
888 		if (index == 0) {
889 			if (ob >= obtail) {
890 				*ret_val = (size_t)-1;
891 				return (-1);
892 			}
893 
894 			*ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
895 			(*ret_val)++;
896 
897 			return (1);
898 		}
899 
900 		euctw_val = kiconv_utf8_euctw[index].value;
901 		byte1 = (euctw_val & 0xFF00) >> 8;
902 		byte2 = euctw_val & 0xFF;
903 		plane_no = euctw_val >> 16;
904 	}
905 
906 	if (obtail - ob < (plane_no == 1 ? 2 : 4)) {
907 		*ret_val = (size_t)-1;
908 		return (-1);
909 	}
910 
911 	if (plane_no != 1) {
912 		*ob++ = KICONV_TC_EUCTW_MBYTE;
913 		*ob++ = KICONV_TC_EUCTW_PMASK + plane_no;
914 	}
915 
916 	*ob++ = byte1;
917 	*ob = byte2;
918 
919 	return (plane_no == 1 ? 2 : 4);
920 }
921 
922 /*
923  * Convert single UTF-8 character to BIG5-HKSCS
924  * Return: > 0  - Converted successfully
925  *         = -1 - E2BIG
926  */
927 static int8_t
utf8_to_big5hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)928 utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
929     uchar_t *ob, uchar_t *obtail, size_t *ret_val)
930 {
931 	size_t		index;
932 	int8_t		hkscslen;
933 	uint32_t	hkscscode;
934 	boolean_t	special_sequence = B_FALSE;
935 
936 	index = kiconv_binsearch(utf8, kiconv_utf8_hkscs,
937 	    KICONV_UTF8_HKSCS_MAX);
938 	hkscscode = kiconv_utf8_hkscs[index].value;
939 
940 	/*
941 	 * There are 4 special code points in HKSCS-2004 which mapped
942 	 * to 2 UNICODE code points.
943 	 */
944 	if ((int32_t)hkscscode < 0) {
945 		size_t special_index = (-(int32_t)hkscscode - 1) * 3;
946 
947 		/* Check the following 2 bytes. */
948 		if (ibtail - *inbuf >= 2 && **inbuf == 0xcc &&
949 		    (*(*inbuf + 1) == 0x84 || *(*inbuf + 1) == 0x8c)) {
950 			special_index += (*(*inbuf + 1) == 0x84 ? 1 : 2);
951 			special_sequence = B_TRUE;
952 		}
953 
954 		hkscscode = ucs_special_sequence[special_index];
955 	}
956 
957 	hkscslen = (hkscscode <= 0xFF) ? 1 : 2;
958 	if (obtail - ob < hkscslen) {
959 		*ret_val = (size_t)-1;
960 		return (-1);
961 	}
962 
963 	if (index == 0)
964 		(*ret_val)++;
965 
966 	if (hkscslen > 1)
967 		*ob++ = (uchar_t)(hkscscode >> 8);
968 	*ob = (uchar_t)(hkscscode & 0xFF);
969 
970 	if (special_sequence) {		/* Advance for special sequence */
971 		(*inbuf) += 2;
972 	}
973 
974 	return (hkscslen);
975 }
976 
977 /*
978  * Common convertor for UTF-8 to BIG5/CP950-HKSCS.
979  * Return: > 0  - Converted successfully
980  *         = -1 - E2BIG
981  */
982 static int8_t
utf8_to_big5_common(uint32_t utf8,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_t * table,size_t nitems)983 utf8_to_big5_common(uint32_t utf8, uchar_t *ob, uchar_t *obtail,
984 	size_t *ret_val, kiconv_table_t *table, size_t nitems)
985 {
986 	size_t		index;
987 	int8_t		big5len;
988 	uint32_t	big5code;
989 
990 	index = kiconv_binsearch(utf8, table, nitems);
991 	big5code = table[index].value;
992 	big5len = (big5code <= 0xFF) ? 1 : 2;
993 
994 	if (obtail - ob < big5len) {
995 		*ret_val = (size_t)-1;
996 		return (-1);
997 	}
998 
999 	if (index == 0)
1000 		(*ret_val)++;
1001 
1002 	if (big5len > 1)
1003 		*ob++ = (uchar_t)(big5code >> 8);
1004 	*ob = (uchar_t)(big5code & 0xFF);
1005 
1006 	return (big5len);
1007 }
1008 
1009 /*
1010  * Convert single UTF-8 character to BIG5.
1011  */
1012 /* ARGSUSED */
1013 static int8_t
utf8_to_big5(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1014 utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1015 	uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1016 {
1017 	return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1018 	    kiconv_utf8_big5, KICONV_UTF8_BIG5_MAX));
1019 }
1020 
1021 /*
1022  * Convert single UTF-8 character to CP950-HKSCS for Windows compatibility.
1023  */
1024 /* ARGSUSED */
1025 static int8_t
utf8_to_cp950hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1026 utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1027 	uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1028 {
1029 	return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1030 	    kiconv_utf8_cp950hkscs, KICONV_UTF8_CP950HKSCS));
1031 }
1032 
1033 static kiconv_ops_t kiconv_tc_ops_tbl[] = {
1034 	{
1035 		"big5", "utf-8", kiconv_open_to_cck, kiconv_to_big5,
1036 		kiconv_close_to_cck, kiconvstr_to_big5
1037 	},
1038 	{
1039 		"utf-8", "big5", open_fr_big5, kiconv_fr_big5,
1040 		close_fr_tc, kiconvstr_fr_big5
1041 	},
1042 
1043 	{
1044 		"big5-hkscs", "utf-8", kiconv_open_to_cck, kiconv_to_big5hkscs,
1045 		kiconv_close_to_cck, kiconvstr_to_big5hkscs
1046 	},
1047 	{
1048 		"utf-8", "big5-hkscs", open_fr_big5hkscs, kiconv_fr_big5hkscs,
1049 		close_fr_tc, kiconvstr_fr_big5hkscs
1050 	},
1051 
1052 	{
1053 		"euc-tw", "utf-8", kiconv_open_to_cck, kiconv_to_euctw,
1054 		kiconv_close_to_cck, kiconvstr_to_euctw
1055 	},
1056 	{
1057 		"utf-8", "euc-tw", open_fr_euctw, kiconv_fr_euctw,
1058 		close_fr_tc, kiconvstr_fr_euctw
1059 	},
1060 
1061 	{
1062 		"cp950-hkscs", "utf-8", kiconv_open_to_cck,
1063 		kiconv_to_cp950hkscs, kiconv_close_to_cck,
1064 		kiconvstr_to_cp950hkscs
1065 	},
1066 	{
1067 		"utf-8", "cp950-hkscs", open_fr_cp950hkscs,
1068 		kiconv_fr_cp950hkscs, close_fr_tc, kiconvstr_fr_cp950hkscs
1069 	},
1070 };
1071 
1072 static kiconv_module_info_t kiconv_tc_info = {
1073 	"kiconv_tc",		/* module name */
1074 	sizeof (kiconv_tc_ops_tbl) / sizeof (kiconv_tc_ops_tbl[0]),
1075 	kiconv_tc_ops_tbl,
1076 	0,
1077 	NULL,
1078 	NULL,
1079 	0
1080 };
1081 
1082 static struct modlkiconv modlkiconv_tc = {
1083 	&mod_kiconvops,
1084 	"kiconv Traditional Chinese module 1.0",
1085 	&kiconv_tc_info
1086 };
1087 
1088 static struct modlinkage modlinkage = {
1089 	MODREV_1,
1090 	(void *)&modlkiconv_tc,
1091 	NULL
1092 };
1093 
1094 int
_init(void)1095 _init(void)
1096 {
1097 	int err;
1098 
1099 	err = mod_install(&modlinkage);
1100 	if (err)
1101 		cmn_err(CE_WARN, "kiconv_tc: failed to load kernel module");
1102 
1103 	return (err);
1104 }
1105 
1106 int
_fini(void)1107 _fini(void)
1108 {
1109 	int err;
1110 
1111 	/*
1112 	 * If this module is being used, then, we cannot remove the module.
1113 	 * The following checking will catch pretty much all usual cases.
1114 	 *
1115 	 * Any remaining will be catached by the kiconv_unregister_module()
1116 	 * during mod_remove() at below.
1117 	 */
1118 	if (kiconv_module_ref_count(KICONV_MODULE_ID_TC))
1119 		return (EBUSY);
1120 
1121 	err = mod_remove(&modlinkage);
1122 	if (err)
1123 		cmn_err(CE_WARN, "kiconv_tc: failed to remove kernel module");
1124 
1125 	return (err);
1126 }
1127 
1128 int
_info(struct modinfo * modinfop)1129 _info(struct modinfo *modinfop)
1130 {
1131 	return (mod_info(&modlinkage, modinfop));
1132 }
1133