xref: /illumos-gate/usr/src/uts/common/kiconv/kiconv_sc/kiconv_sc.c (revision 89b2a9fbeabf42fa54594df0e5927bcc50a07cc9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/sunddi.h>
35 #include <sys/byteorder.h>
36 #include <sys/errno.h>
37 #include <sys/modctl.h>
38 #include <sys/kiconv.h>
39 #include <sys/u8_textprep.h>
40 #include <sys/kiconv_cck_common.h>
41 #include <sys/kiconv_sc.h>
42 #include <sys/kiconv_gb18030_utf8.h>
43 #include <sys/kiconv_gb2312_utf8.h>
44 #include <sys/kiconv_utf8_gb18030.h>
45 #include <sys/kiconv_utf8_gb2312.h>
46 
/*
 * Forward declarations of the single-character conversion helpers defined
 * later in this file.  Each one stores the converted bytes at ob (never
 * past obtail) and returns the number of bytes stored, or -1 when the
 * output space is too small.
 */

/* One two-byte GB2312 (EUC-CN) character to UTF-8. */
static int8_t gb2312_to_utf8(uchar_t byte1, uchar_t byte2, uchar_t *ob,
	uchar_t *obtail, size_t *ret_val);
/* One GBK (two-byte) or GB18030 (four-byte, isgbk4 set) character to UTF-8. */
static int8_t gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail,
	size_t *ret_val, boolean_t isgbk4);
/* One UTF-8 character to GB2312; inbuf and ibtail are unused (ARGSUSED). */
static int8_t utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret);
/* One UTF-8 character to GBK; inbuf and ibtail are unused (ARGSUSED). */
static int8_t utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret);
/* One UTF-8 character to GB18030; inbuf and ibtail are unused (ARGSUSED). */
static int8_t utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret);
57 
/*
 * Conversion descriptors for the "from" direction (GB18030, GBK or EUC-CN
 * to UTF-8).  These conversions need no per-descriptor state, so the open
 * routines simply hand back one of the small non-zero integers below,
 * disguised as a pointer, and close_fr_sc() only has to range-check it.
 */
#define	KICONV_SC_GB18030		(0x01)
#define	KICONV_SC_GBK			(0x02)
#define	KICONV_SC_EUCCN			(0x03)
#define	KICONV_SC_MAX_MAGIC_ID		(0x03)

/*
 * Open routine for GB18030 to UTF-8: returns the GB18030 magic id.
 */
static void *
open_fr_gb18030(void)
{
	return ((void *)(uintptr_t)KICONV_SC_GB18030);
}

/*
 * Open routine for GBK to UTF-8: returns the GBK magic id.
 */
static void *
open_fr_gbk(void)
{
	return ((void *)(uintptr_t)KICONV_SC_GBK);
}

/*
 * Open routine for EUC-CN to UTF-8: returns the EUC-CN magic id.
 */
static void *
open_fr_euccn(void)
{
	return ((void *)(uintptr_t)KICONV_SC_EUCCN);
}

/*
 * Close routine shared by all three "from" conversions.
 *
 * Return: 0     - s is one of the magic ids handed out by the open routines
 *         EBADF - s is anything else, including NULL and (void *)-1
 *
 * NULL is rejected as well because no open routine ever returns it and the
 * conversion routines in this file refuse a NULL descriptor with EBADF.
 */
static int
close_fr_sc(void *s)
{
	uintptr_t	id = (uintptr_t)s;

	if (id == 0 || id > KICONV_SC_MAX_MAGIC_ID)
		return (EBADF);

	return (0);
}
89 
90 /*
91  * Encoding convertor from UTF-8 to GB18030.
92  */
93 size_t
94 kiconv_to_gb18030(void *kcd, char **inbuf, size_t *inbytesleft,
95 	char **outbuf, size_t *outbytesleft, int *errno)
96 {
97 
98 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
99 	    outbytesleft, errno, utf8_to_gb18030);
100 }
101 
102 /*
103  * String based encoding convertor from UTF-8 to GB18030.
104  */
105 size_t
106 kiconvstr_to_gb18030(char *inarray, size_t *inlen, char *outarray,
107 	size_t *outlen, int flag, int *errno)
108 {
109 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
110 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb18030);
111 }
112 
113 /*
114  * Encoding convertor from GB18030 to UTF-8.
115  */
116 size_t
117 kiconv_fr_gb18030(void *kcd, char **inbuf, size_t *inbytesleft,
118 	char **outbuf, size_t *outbytesleft, int *errno)
119 {
120 	uchar_t		*ib;
121 	uchar_t		*ob;
122 	uchar_t		*ibtail;
123 	uchar_t		*obtail;
124 	size_t		ret_val;
125 	int8_t		sz;
126 	uint32_t	gb_val;
127 	boolean_t	isgbk4;
128 
129 	/* Check on the kiconv code conversion descriptor. */
130 	if (kcd == NULL || kcd == (void *)-1) {
131 		*errno = EBADF;
132 		return ((size_t)-1);
133 	}
134 
135 	/* If this is a state reset request, process and return. */
136 	if (inbuf == NULL || *inbuf == NULL) {
137 		return (0);
138 	}
139 
140 	ret_val = 0;
141 	ib = (uchar_t *)*inbuf;
142 	ob = (uchar_t *)*outbuf;
143 	ibtail = ib + *inbytesleft;
144 	obtail = ob + *outbytesleft;
145 
146 	while (ib < ibtail) {
147 		if (KICONV_IS_ASCII(*ib)) {
148 			if (ob >= obtail) {
149 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
150 			}
151 
152 			*ob++ = *ib++;
153 			continue;
154 		}
155 
156 		/*
157 		 * Issue EILSEQ error if the first byte is not a
158 		 * valid GB18030 leading byte.
159 		 */
160 		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
161 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
162 		}
163 
164 		isgbk4 = (ibtail - ib < 2) ? B_FALSE :
165 		    KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1));
166 
167 		if (isgbk4) {
168 			if (ibtail - ib < 4) {
169 				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
170 			}
171 
172 			if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) &&
173 			    KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) &&
174 			    KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) {
175 				KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
176 			}
177 
178 			gb_val = (uint32_t)(*ib) << 24 |
179 			    (uint32_t)(*(ib + 1)) << 16 |
180 			    (uint32_t)(*(ib + 2)) << 8 | *(ib + 3);
181 		} else {
182 			if (ibtail - ib < 2) {
183 				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
184 			}
185 
186 			if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
187 				KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
188 			}
189 
190 			gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
191 		}
192 
193 		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4);
194 		if (sz < 0) {
195 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
196 		}
197 
198 		ib += isgbk4 ? 4 : 2;
199 		ob += sz;
200 	}
201 
202 	*inbuf = (char *)ib;
203 	*inbytesleft = ibtail - ib;
204 	*outbuf = (char *)ob;
205 	*outbytesleft = obtail - ob;
206 
207 	return (ret_val);
208 }
209 
/*
 * String based encoding converter from GB18030 to UTF-8.
 *
 * Converts the *inlen bytes at inarray into outarray (capacity *outlen).
 * On return *inlen and *outlen hold the number of bytes left unconsumed
 * and unused respectively.
 *
 * flag bits examined here:
 *	KICONV_IGNORE_NULL	when clear, conversion stops at the first
 *				NUL byte found at a character boundary
 *	KICONV_REPLACE_INVALID	when set, a malformed or truncated character
 *				is replaced by the three-byte UTF-8
 *				replacement character and counted in the
 *				return value instead of ending the conversion
 *
 * Return: the number of non-identical conversions, or (size_t)-1 on error.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK and KICONV_SET_ERRNO_WITH_FLAG
 * are defined outside this file.  From their use here they presumably
 * store the error in *errno, set ret_val to (size_t)-1 and leave the loop;
 * the _WITH_FLAG form presumably first checks KICONV_REPLACE_INVALID and,
 * if set, advances ib by its first argument and jumps to REPLACE_INVALID.
 * Confirm against their definitions before restructuring this loop.
 */
size_t
kiconvstr_fr_gb18030(char *inarray, size_t *inlen, char *outarray,
	size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;
	boolean_t	isgbk4;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL terminates the input unless the caller said otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII is copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember where this character starts; see REPLACE_INVALID. */
		oldib = ib;

		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/*
		 * The second byte, when present, distinguishes a four-byte
		 * GB18030 character from a two-byte one.
		 */
		isgbk4 = (ibtail - ib < 2) ? B_FALSE :
		    KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1));

		if (isgbk4) {
			if (ibtail - ib < 4) {
				/*
				 * Truncated four-byte character: in replace
				 * mode swallow the whole remaining tail.
				 */
				if (flag & KICONV_REPLACE_INVALID) {
					ib = ibtail;
					goto REPLACE_INVALID;
				}

				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) &&
			    KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) &&
			    KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) {
				KICONV_SET_ERRNO_WITH_FLAG(4, EILSEQ);
			}

			/* Pack the four bytes big-endian for the table lookup. */
			gb_val = (uint32_t)(*ib) << 24 |
			    (uint32_t)(*(ib + 1)) << 16 |
			    (uint32_t)(*(ib + 2)) << 8 | *(ib + 3);
		} else {
			if (ibtail - ib < 2) {
				/* Lone leading byte at the very end of input. */
				if (flag & KICONV_REPLACE_INVALID) {
					ib = ibtail;
					goto REPLACE_INVALID;
				}

				KICONV_SET_ERRNO_AND_BREAK(EINVAL);
			}

			if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
				KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
			}

			gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
		}

		/* A negative size means the UTF-8 bytes did not fit. */
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += isgbk4 ? 4 : 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Reached only by goto, with ib already moved past the bad
		 * bytes.  If the replacement character does not fit, rewind
		 * ib so that *inlen reports the bad bytes as unconsumed.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
319 
320 /*
321  * Encoding convertor from UTF-8 to GBK.
322  */
323 size_t
324 kiconv_to_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
325 	char **outbuf, size_t *outbytesleft, int *errno)
326 {
327 
328 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
329 	    outbytesleft, errno, utf8_to_gbk);
330 }
331 
332 /*
333  * String based encoding convertor from UTF-8 to GBK.
334  */
335 size_t
336 kiconvstr_to_gbk(char *inarray, size_t *inlen, char *outarray,
337 	size_t *outlen, int flag, int *errno)
338 {
339 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
340 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gbk);
341 }
342 
343 /*
344  * Encoding convertor from GBK to UTF-8.
345  */
346 size_t
347 kiconv_fr_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
348 	char **outbuf, size_t *outbytesleft, int *errno)
349 {
350 	uchar_t		*ib;
351 	uchar_t		*ob;
352 	uchar_t		*ibtail;
353 	uchar_t		*obtail;
354 	size_t		ret_val;
355 	int8_t		sz;
356 	uint32_t	gb_val;
357 
358 	/* Check on the kiconv code conversion descriptor. */
359 	if (kcd == NULL || kcd == (void *)-1) {
360 		*errno = EBADF;
361 		return ((size_t)-1);
362 	}
363 
364 	/* If this is a state reset request, process and return. */
365 	if (inbuf == NULL || *inbuf == NULL) {
366 		return (0);
367 	}
368 
369 	ret_val = 0;
370 	ib = (uchar_t *)*inbuf;
371 	ob = (uchar_t *)*outbuf;
372 	ibtail = ib + *inbytesleft;
373 	obtail = ob + *outbytesleft;
374 
375 	while (ib < ibtail) {
376 		if (KICONV_IS_ASCII(*ib)) {
377 			if (ob >= obtail) {
378 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
379 			}
380 
381 			*ob++ = *ib++;
382 			continue;
383 		}
384 
385 		/*
386 		 * Issue EILSEQ error if the first byte is not a
387 		 * valid GBK leading byte.
388 		 */
389 		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
390 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
391 		}
392 
393 		/*
394 		 * Issue EINVAL error if input buffer has an incomplete
395 		 * character at the end of the buffer.
396 		 */
397 		if (ibtail - ib < 2) {
398 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
399 		}
400 
401 		/*
402 		 * Issue EILSEQ error if the remaining byte is not
403 		 * a valid GBK byte.
404 		 */
405 		if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
406 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
407 		}
408 
409 		/* Now we have a valid GBK character. */
410 		gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
411 		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE);
412 
413 		if (sz < 0) {
414 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
415 		}
416 
417 		ib += 2;
418 		ob += sz;
419 	}
420 
421 	*inbuf = (char *)ib;
422 	*inbytesleft = ibtail - ib;
423 	*outbuf = (char *)ob;
424 	*outbytesleft = obtail - ob;
425 
426 	return (ret_val);
427 }
428 
/*
 * String based encoding converter from GBK to UTF-8.
 *
 * Converts the *inlen bytes at inarray into outarray (capacity *outlen).
 * On return *inlen and *outlen hold the number of bytes left unconsumed
 * and unused respectively.
 *
 * flag bits examined here:
 *	KICONV_IGNORE_NULL	when clear, conversion stops at the first
 *				NUL byte found at a character boundary
 *	KICONV_REPLACE_INVALID	tested directly only inside
 *				KICONV_SET_ERRNO_WITH_FLAG (see note)
 *
 * Return: the number of non-identical conversions, or (size_t)-1 on error.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK and KICONV_SET_ERRNO_WITH_FLAG
 * are defined outside this file.  From their use here they presumably
 * store the error in *errno, set ret_val to (size_t)-1 and leave the loop;
 * the _WITH_FLAG form presumably first checks KICONV_REPLACE_INVALID and,
 * if set, advances ib by its first argument and jumps to REPLACE_INVALID.
 * Confirm against their definitions before restructuring this loop.
 */
size_t
kiconvstr_fr_gbk(char *inarray, size_t *inlen, char *outarray,
	size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	gb_val;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL terminates the input unless the caller said otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII is copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember where this character starts; see REPLACE_INVALID. */
		oldib = ib;

		/* Invalid leading byte. */
		if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* Leading byte is the last byte of the input. */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
		}

		/* Invalid trailing byte. */
		if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
		}

		/*
		 * Pack the two bytes big-endian for the table lookup.  The
		 * shift is done on the promoted int here, which yields the
		 * same value as casting before the shift.
		 */
		gb_val = (uint32_t)(*ib << 8) | *(ib + 1);
		sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE);

		/* A negative size means the UTF-8 bytes did not fit. */
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Reached only by goto.  If the replacement character does
		 * not fit, rewind ib to the start of the bad character so
		 * that *inlen reports it as unconsumed.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
508 
509 /*
510  * Encoding convertor from UTF-8 to EUC-CN.
511  */
512 size_t
513 kiconv_to_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
514 	char **outbuf, size_t *outbytesleft, int *errno)
515 {
516 	return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
517 	    outbytesleft, errno, utf8_to_gb2312);
518 }
519 
520 /*
521  * String based encoding convertor from UTF-8 to EUC-CN.
522  */
523 size_t
524 kiconvstr_to_euccn(char *inarray, size_t *inlen, char *outarray,
525 	size_t *outlen, int flag, int *errno)
526 {
527 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
528 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb2312);
529 }
530 
531 /*
532  * Encoding converto from EUC-CN to UTF-8 code.
533  */
534 size_t
535 kiconv_fr_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
536 	char **outbuf, size_t *outbytesleft, int *errno)
537 {
538 	uchar_t		*ib;
539 	uchar_t		*ob;
540 	uchar_t		*ibtail;
541 	uchar_t		*obtail;
542 	size_t		ret_val;
543 	int8_t		sz;
544 
545 	/* Check on the kiconv code conversion descriptor. */
546 	if (kcd == NULL || kcd == (void *)-1) {
547 		*errno = EBADF;
548 		return ((size_t)-1);
549 	}
550 
551 	/* If this is a state reset request, process and return. */
552 	if (inbuf == NULL || *inbuf == NULL) {
553 		return (0);
554 	}
555 
556 	ret_val = 0;
557 	ib = (uchar_t *)*inbuf;
558 	ob = (uchar_t *)*outbuf;
559 	ibtail = ib + *inbytesleft;
560 	obtail = ob + *outbytesleft;
561 
562 	while (ib < ibtail) {
563 		if (KICONV_IS_ASCII(*ib)) {
564 			if (ob >= obtail) {
565 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
566 			}
567 
568 			*ob++ = *ib++;
569 			continue;
570 		}
571 
572 		/*
573 		 * Issue EILSEQ error if the first byte is not a
574 		 * valid GB2312 leading byte.
575 		 */
576 		if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
577 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
578 		}
579 
580 		/*
581 		 * Issue EINVAL error if input buffer has an incomplete
582 		 * character at the end of the buffer.
583 		 */
584 		if (ibtail - ib < 2) {
585 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
586 		}
587 
588 		/*
589 		 * Issue EILSEQ error if the remaining byte is not
590 		 * a valid GB2312 byte.
591 		 */
592 		if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
593 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
594 		}
595 
596 		/* Now we have a valid GB2312 character */
597 		sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
598 		if (sz < 0) {
599 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
600 		}
601 
602 		ib += 2;
603 		ob += sz;
604 	}
605 
606 	*inbuf = (char *)ib;
607 	*inbytesleft = ibtail - ib;
608 	*outbuf = (char *)ob;
609 	*outbytesleft = obtail - ob;
610 
611 	return (ret_val);
612 }
613 
/*
 * String based encoding converter from EUC-CN to UTF-8.
 *
 * Converts the *inlen bytes at inarray into outarray (capacity *outlen).
 * On return *inlen and *outlen hold the number of bytes left unconsumed
 * and unused respectively.
 *
 * flag bits examined here:
 *	KICONV_IGNORE_NULL	when clear, conversion stops at the first
 *				NUL byte found at a character boundary
 *	KICONV_REPLACE_INVALID	tested directly only inside
 *				KICONV_SET_ERRNO_WITH_FLAG (see note)
 *
 * Return: the number of non-identical conversions, or (size_t)-1 on error.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK and KICONV_SET_ERRNO_WITH_FLAG
 * are defined outside this file.  From their use here they presumably
 * store the error in *errno, set ret_val to (size_t)-1 and leave the loop;
 * the _WITH_FLAG form presumably first checks KICONV_REPLACE_INVALID and,
 * if set, advances ib by its first argument and jumps to REPLACE_INVALID.
 * Confirm against their definitions before restructuring this loop.
 */
size_t
kiconvstr_fr_euccn(char *inarray, size_t *inlen, char *outarray,
    size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	int8_t		sz;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* A NUL terminates the input unless the caller said otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII is copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember where this character starts; see REPLACE_INVALID. */
		oldib = ib;

		/* Invalid first byte. */
		if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* First byte is the last byte of the input. */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
		}

		/* Invalid second byte. */
		if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
		}

		/* A negative size means the UTF-8 bytes did not fit. */
		sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
		if (sz < 0) {
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ib += 2;
		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Reached only by goto.  If the replacement character does
		 * not fit, rewind ib to the start of the bad character so
		 * that *inlen reports it as unconsumed.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
690 
691 /*
692  * Convert single GB2312 character to UTF-8.
693  * Return: > 0  - Converted successfully
694  *         = -1 - E2BIG
695  */
696 static int8_t
697 gb2312_to_utf8(uchar_t b1, uchar_t b2, uchar_t *ob, uchar_t *obtail,
698 	size_t *ret_val)
699 {
700 	size_t	index;
701 	int8_t	sz;
702 	uchar_t	*u8;
703 
704 	/* index = (b1 - KICONV_EUC_START) * 94 + b2 - KICONV_EUC_START; */
705 	index = b1 * 94 + b2 - 0x3BBF;
706 
707 	if (index >= KICONV_GB2312_UTF8_MAX)
708 		index = KICONV_GB2312_UTF8_MAX - 1;	/* Map to 0xEFBFBD */
709 
710 	u8 = kiconv_gb2312_utf8[index];
711 	sz = u8_number_of_bytes[u8[0]];
712 
713 	if (obtail - ob < sz) {
714 		*ret_val = (size_t)-1;
715 		return (-1);
716 	}
717 
718 	for (index = 0; index < sz; index++)
719 		*ob++ = u8[index];
720 
721 	/*
722 	 * As kiconv_gb2312_utf8 contain muliple KICONV_UTF8_REPLACEMENT_CHAR
723 	 * elements, so need to ckeck more.
724 	 */
725 	if (sz == KICONV_UTF8_REPLACEMENT_CHAR_LEN &&
726 	    u8[0] == KICONV_UTF8_REPLACEMENT_CHAR1 &&
727 	    u8[1] == KICONV_UTF8_REPLACEMENT_CHAR2 &&
728 	    u8[2] == KICONV_UTF8_REPLACEMENT_CHAR3)
729 		(*ret_val)++;
730 
731 	return (sz);
732 }
733 
734 /*
735  * Convert single GB18030 or GBK character to UTF-8.
736  * Return: > 0  - Converted successfully
737  *         = -1 - E2BIG
738  */
739 static int8_t
740 gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val,
741 	boolean_t isgbk4)
742 {
743 	size_t	index;
744 	int8_t	sz;
745 	uchar_t	u8array[4];
746 	uchar_t	*u8;
747 
748 	if (isgbk4) {
749 		if (gbk_val >= KICONV_SC_PLANE1_GB18030_START) {
750 			uint32_t	u32;
751 
752 			/*
753 			 * u32 = ((gbk_val >> 24) - 0x90) * 12600 +
754 			 *   (((gbk_val & 0xFF0000) >> 16) - 0x30) * 1260 +
755 			 *   (((gbk_val & 0xFF00) >> 8) - 0x81) * 10 +
756 			 *   (gbk_val & 0xFF - 0x30)+
757 			 *   KICONV_SC_PLANE1_UCS4_START;
758 			 */
759 			u32 = (gbk_val >> 24) * 12600 +
760 			    ((gbk_val & 0xFF0000) >> 16) * 1260 +
761 			    ((gbk_val & 0xFF00) >> 8) * 10 +
762 			    (gbk_val & 0xFF) - 0x1BA0FA;
763 			u8array[0] = (uchar_t)(0xF0 | ((u32 & 0x1C0000) >> 18));
764 			u8array[1] = (uchar_t)(0x80 | ((u32 & 0x03F000) >> 12));
765 			u8array[2] = (uchar_t)(0x80 | ((u32 & 0x000FC0) >> 6));
766 			u8array[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
767 			u8 = u8array;
768 			index = 1;
769 		} else {
770 			index = kiconv_binsearch(gbk_val,
771 			    kiconv_gbk4_utf8, KICONV_GBK4_UTF8_MAX);
772 			u8 = kiconv_gbk4_utf8[index].u8;
773 		}
774 	} else {
775 		index = kiconv_binsearch(gbk_val,
776 		    kiconv_gbk_utf8, KICONV_GBK_UTF8_MAX);
777 		u8 = kiconv_gbk_utf8[index].u8;
778 	}
779 
780 	sz = u8_number_of_bytes[u8[0]];
781 	if (obtail - ob < sz) {
782 		*ret_val = (size_t)-1;
783 		return (-1);
784 	}
785 
786 	if (index == 0)
787 		(*ret_val)++;	/* Non-identical conversion */
788 
789 	for (index = 0; index < sz; index++)
790 		*ob++ = u8[index];
791 
792 	return (sz);
793 }
794 
795 /*
796  * Convert single UTF-8 character to GB18030.
797  * Return: > 0  - Converted successfully
798  *         = -1 - E2BIG
799  */
800 /* ARGSUSED */
801 static int8_t
802 utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
803 	uchar_t *ob, uchar_t *obtail, size_t *ret)
804 {
805 	size_t 		index;
806 	int8_t		gbklen;
807 	uint32_t	gbkcode;
808 
809 	if (utf8 >= KICONV_SC_PLANE1_UTF8_START) {
810 		/* Four bytes GB18030 [0x90308130, 0xe339fe39] handling. */
811 		uint32_t	u32;
812 
813 		u32 = (((utf8 & 0x07000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
814 		    ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
815 		    KICONV_SC_PLANE1_UCS4_START;
816 		gbkcode = ((u32 / 12600 + 0x90) << 24) |
817 		    (((u32 % 12600) / 1260 + 0x30) << 16) |
818 		    (((u32 % 1260) / 10 + 0x81) << 8) | (u32 % 10 + 0x30);
819 		gbklen = 4;
820 		index = 1;
821 	} else {
822 		index = kiconv_binsearch(utf8, kiconv_utf8_gb18030,
823 		    KICONV_UTF8_GB18030_MAX);
824 		gbkcode = kiconv_utf8_gb18030[index].value;
825 		KICONV_SC_GET_GB_LEN(gbkcode, gbklen);
826 	}
827 
828 	if (obtail - ob < gbklen) {
829 		*ret = (size_t)-1;
830 		return (-1);
831 	}
832 
833 	if (index == 0)
834 		(*ret)++;		/* Non-identical conversion */
835 
836 	if (gbklen == 2) {
837 		*ob++ = (uchar_t)(gbkcode >> 8);
838 	} else if (gbklen == 4) {
839 		*ob++ = (uchar_t)(gbkcode >> 24);
840 		*ob++ = (uchar_t)(gbkcode >> 16);
841 		*ob++ = (uchar_t)(gbkcode >> 8);
842 	}
843 	*ob = (uchar_t)(gbkcode & 0xFF);
844 
845 	return (gbklen);
846 }
847 
848 /*
849  * Convert single UTF-8 character to GBK.
850  * Return: > 0  - Converted successfully
851  *         = -1 - E2BIG
852  */
853 /* ARGSUSED */
854 static int8_t
855 utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
856 	uchar_t *ob, uchar_t *obtail, size_t *ret)
857 {
858 	size_t 		index;
859 	int8_t		gbklen;
860 	uint32_t	gbkcode;
861 
862 	index = kiconv_binsearch(utf8, kiconv_utf8_gb18030,
863 	    KICONV_UTF8_GB18030_MAX);
864 	gbkcode = kiconv_utf8_gb18030[index].value;
865 	KICONV_SC_GET_GB_LEN(gbkcode, gbklen);
866 
867 	/* GBK and GB18030 share the same table, so check the length. */
868 	if (gbklen == 4) {
869 		index = 0;
870 		gbkcode = kiconv_utf8_gb18030[index].value;
871 		gbklen = 1;
872 	}
873 
874 	if (obtail - ob < gbklen) {
875 		*ret = (size_t)-1;
876 		return (-1);
877 	}
878 
879 	if (index == 0)
880 		(*ret)++;		/* Non-identical conversion */
881 
882 	if (gbklen > 1)
883 		*ob++ = (uchar_t)(gbkcode >> 8);
884 	*ob = (uchar_t)(gbkcode & 0xFF);
885 
886 	return (gbklen);
887 }
888 
889 /*
890  * Convert single UTF-8 character to GB2312.
891  * Return: > 0  - Converted successfully
892  *         = -1 - E2BIG
893  */
894 /* ARGSUSED */
895 static int8_t
896 utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *intail,
897 	uchar_t *ob, uchar_t *obtail, size_t *ret)
898 {
899 	size_t		index;
900 	int8_t		gblen;
901 	uint32_t	gbcode;
902 
903 	index = kiconv_binsearch(utf8, kiconv_utf8_gb2312,
904 	    KICONV_UTF8_GB2312_MAX);
905 	gbcode = kiconv_utf8_gb2312[index].value;
906 	gblen = (gbcode <= 0xFF) ? 1 : 2;
907 
908 	if (obtail - ob < gblen) {
909 		*ret = (size_t)-1;
910 		return (-1);
911 	}
912 
913 	if (index == 0)
914 		(*ret)++;
915 
916 	if (gblen > 1)
917 		*ob++ = (uchar_t)(gbcode >> 8);
918 	*ob = (uchar_t)(gbcode & 0xFF);
919 
920 	return (gblen);
921 }
922 
/*
 * Conversion table exported by this module: one entry per supported
 * (target, source) code set pair.  Each entry lists, in order, the two
 * code set names followed by the open, stream conversion, close and
 * string conversion routines.  Judging by the routines attached to each
 * entry, the first name is the code set converted to and the second the
 * code set converted from.
 */
static kiconv_ops_t kiconv_sc_ops_tbl[] = {
	{
		/* UTF-8 to GB18030 */
		"gb18030", "utf-8", kiconv_open_to_cck, kiconv_to_gb18030,
		kiconv_close_to_cck, kiconvstr_to_gb18030
	},
	{
		/* GB18030 to UTF-8 */
		"utf-8", "gb18030", open_fr_gb18030, kiconv_fr_gb18030,
		close_fr_sc, kiconvstr_fr_gb18030
	},
	{
		/* UTF-8 to GBK */
		"gbk", "utf-8", kiconv_open_to_cck, kiconv_to_gbk,
		kiconv_close_to_cck, kiconvstr_to_gbk
	},
	{
		/* GBK to UTF-8 */
		"utf-8", "gbk", open_fr_gbk, kiconv_fr_gbk,
		close_fr_sc, kiconvstr_fr_gbk
	},
	{
		/* UTF-8 to EUC-CN */
		"euccn", "utf-8", kiconv_open_to_cck, kiconv_to_euccn,
		kiconv_close_to_cck, kiconvstr_to_euccn
	},
	{
		/* EUC-CN to UTF-8 */
		"utf-8", "euccn", open_fr_euccn, kiconv_fr_euccn,
		close_fr_sc, kiconvstr_fr_euccn
	},
};
949 
/*
 * Module description handed to the kiconv framework through
 * modlkiconv_sc below.
 *
 * NOTE(review): the last four members are left 0/NULL; their meaning is
 * given by the kiconv_module_info_t definition, which is not in this
 * file - confirm before changing them.
 */
static kiconv_module_info_t kiconv_sc_info = {
	"kiconv_sc",		/* module name */
	/* number of entries in the conversion table */
	sizeof (kiconv_sc_ops_tbl) / sizeof (kiconv_sc_ops_tbl[0]),
	kiconv_sc_ops_tbl,	/* the conversion table itself */
	0,
	NULL,
	NULL,
	0
};
959 
/*
 * Linkage element tying this module to the kiconv module operations
 * (mod_kiconvops), with a description string and the module description
 * defined above.
 */
static struct modlkiconv modlkiconv_sc = {
	&mod_kiconvops,
	"kiconv Simplified Chinese module 1.0",
	&kiconv_sc_info
};
965 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info():
 * a NULL-terminated list holding the single kiconv linkage element.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlkiconv_sc,
	NULL
};
971 
972 int
973 _init(void)
974 {
975 	int err;
976 
977 	err = mod_install(&modlinkage);
978 	if (err)
979 		cmn_err(CE_WARN, "kiconv_sc: failed to load kernel module");
980 
981 	return (err);
982 }
983 
984 int
985 _fini(void)
986 {
987 	int err;
988 
989 	/*
990 	 * If this module is being used, then, we cannot remove the module.
991 	 * The following checking will catch pretty much all usual cases.
992 	 *
993 	 * Any remaining will be catached by the kiconv_unregister_module()
994 	 * during mod_remove() at below.
995 	 */
996 	if (kiconv_module_ref_count(KICONV_MODULE_ID_SC))
997 		return (EBUSY);
998 
999 	err = mod_remove(&modlinkage);
1000 	if (err)
1001 		cmn_err(CE_WARN, "kiconv_sc: failed to remove kernel module");
1002 
1003 	return (err);
1004 }
1005 
1006 int
1007 _info(struct modinfo *modinfop)
1008 {
1009 	return (mod_info(&modlinkage, modinfop));
1010 }
1011