1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/systm.h>
30 #include <sys/debug.h>
31 #include <sys/kmem.h>
32 #include <sys/sunddi.h>
33 #include <sys/byteorder.h>
34 #include <sys/errno.h>
35 #include <sys/modctl.h>
36 #include <sys/kiconv.h>
37 #include <sys/u8_textprep.h>
38 #include <sys/kiconv_cck_common.h>
39 #include <sys/kiconv_sc.h>
40 #include <sys/kiconv_gb18030_utf8.h>
41 #include <sys/kiconv_gb2312_utf8.h>
42 #include <sys/kiconv_utf8_gb18030.h>
43 #include <sys/kiconv_utf8_gb2312.h>
44
45 static int8_t gb2312_to_utf8(uchar_t byte1, uchar_t byte2, uchar_t *ob,
46 uchar_t *obtail, size_t *ret_val);
47 static int8_t gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail,
48 size_t *ret_val, boolean_t isgbk4);
49 static int8_t utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
50 uchar_t *ob, uchar_t *obtail, size_t *ret);
51 static int8_t utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
52 uchar_t *ob, uchar_t *obtail, size_t *ret);
53 static int8_t utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
54 uchar_t *ob, uchar_t *obtail, size_t *ret);
55
/*
 * Magic identifiers returned (cast to a pointer) by the open_fr_*()
 * routines.  close_fr_sc() rejects anything above
 * KICONV_SC_MAX_MAGIC_ID.
 */
#define	KICONV_SC_GB18030		(0x01)
#define	KICONV_SC_GBK			(0x02)
#define	KICONV_SC_EUCCN			(0x03)
#define	KICONV_SC_MAX_MAGIC_ID		KICONV_SC_EUCCN
60
61 static void *
open_fr_gb18030()62 open_fr_gb18030()
63 {
64 return ((void *)KICONV_SC_GB18030);
65 }
66
67 static void *
open_fr_gbk()68 open_fr_gbk()
69 {
70 return ((void *)KICONV_SC_GBK);
71 }
72
73 static void *
open_fr_euccn()74 open_fr_euccn()
75 {
76 return ((void *)KICONV_SC_EUCCN);
77 }
78
79 static int
close_fr_sc(void * s)80 close_fr_sc(void *s)
81 {
82 if ((uintptr_t)s > KICONV_SC_MAX_MAGIC_ID)
83 return (EBADF);
84
85 return (0);
86 }
87
88 /*
89 * Encoding convertor from UTF-8 to GB18030.
90 */
91 size_t
kiconv_to_gb18030(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)92 kiconv_to_gb18030(void *kcd, char **inbuf, size_t *inbytesleft,
93 char **outbuf, size_t *outbytesleft, int *errno)
94 {
95
96 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
97 outbytesleft, errno, utf8_to_gb18030);
98 }
99
100 /*
101 * String based encoding convertor from UTF-8 to GB18030.
102 */
103 size_t
kiconvstr_to_gb18030(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)104 kiconvstr_to_gb18030(char *inarray, size_t *inlen, char *outarray,
105 size_t *outlen, int flag, int *errno)
106 {
107 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
108 (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb18030);
109 }
110
111 /*
112 * Encoding convertor from GB18030 to UTF-8.
113 */
114 size_t
kiconv_fr_gb18030(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)115 kiconv_fr_gb18030(void *kcd, char **inbuf, size_t *inbytesleft,
116 char **outbuf, size_t *outbytesleft, int *errno)
117 {
118 uchar_t *ib;
119 uchar_t *ob;
120 uchar_t *ibtail;
121 uchar_t *obtail;
122 size_t ret_val;
123 int8_t sz;
124 uint32_t gb_val;
125 boolean_t isgbk4;
126
127 /* Check on the kiconv code conversion descriptor. */
128 if (kcd == NULL || kcd == (void *)-1) {
129 *errno = EBADF;
130 return ((size_t)-1);
131 }
132
133 /* If this is a state reset request, process and return. */
134 if (inbuf == NULL || *inbuf == NULL) {
135 return (0);
136 }
137
138 ret_val = 0;
139 ib = (uchar_t *)*inbuf;
140 ob = (uchar_t *)*outbuf;
141 ibtail = ib + *inbytesleft;
142 obtail = ob + *outbytesleft;
143
144 while (ib < ibtail) {
145 if (KICONV_IS_ASCII(*ib)) {
146 if (ob >= obtail) {
147 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
148 }
149
150 *ob++ = *ib++;
151 continue;
152 }
153
154 /*
155 * Issue EILSEQ error if the first byte is not a
156 * valid GB18030 leading byte.
157 */
158 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
159 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
160 }
161
162 isgbk4 = (ibtail - ib < 2) ? B_FALSE :
163 KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1));
164
165 if (isgbk4) {
166 if (ibtail - ib < 4) {
167 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
168 }
169
170 if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) &&
171 KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) &&
172 KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) {
173 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
174 }
175
176 gb_val = (uint32_t)(*ib) << 24 |
177 (uint32_t)(*(ib + 1)) << 16 |
178 (uint32_t)(*(ib + 2)) << 8 | *(ib + 3);
179 } else {
180 if (ibtail - ib < 2) {
181 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
182 }
183
184 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
185 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
186 }
187
188 gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
189 }
190
191 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4);
192 if (sz < 0) {
193 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
194 }
195
196 ib += isgbk4 ? 4 : 2;
197 ob += sz;
198 }
199
200 *inbuf = (char *)ib;
201 *inbytesleft = ibtail - ib;
202 *outbuf = (char *)ob;
203 *outbytesleft = obtail - ob;
204
205 return (ret_val);
206 }
207
208 /*
209 * String based encoding convertor from GB18030 to UTF-8.
210 */
211 size_t
kiconvstr_fr_gb18030(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)212 kiconvstr_fr_gb18030(char *inarray, size_t *inlen, char *outarray,
213 size_t *outlen, int flag, int *errno)
214 {
215 uchar_t *ib;
216 uchar_t *ob;
217 uchar_t *ibtail;
218 uchar_t *obtail;
219 uchar_t *oldib;
220 size_t ret_val;
221 int8_t sz;
222 uint32_t gb_val;
223 boolean_t isgbk4;
224 boolean_t do_not_ignore_null;
225
226 ret_val = 0;
227 ib = (uchar_t *)inarray;
228 ob = (uchar_t *)outarray;
229 ibtail = ib + *inlen;
230 obtail = ob + *outlen;
231 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
232
233 while (ib < ibtail) {
234 if (*ib == '\0' && do_not_ignore_null)
235 break;
236
237 if (KICONV_IS_ASCII(*ib)) {
238 if (ob >= obtail) {
239 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
240 }
241
242 *ob++ = *ib++;
243 continue;
244 }
245
246 oldib = ib;
247
248 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
249 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
250 }
251
252 isgbk4 = (ibtail - ib < 2) ? B_FALSE :
253 KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1));
254
255 if (isgbk4) {
256 if (ibtail - ib < 4) {
257 if (flag & KICONV_REPLACE_INVALID) {
258 ib = ibtail;
259 goto REPLACE_INVALID;
260 }
261
262 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
263 }
264
265 if (! (KICONV_SC_IS_GB18030_2nd_BYTE(*(ib + 1)) &&
266 KICONV_SC_IS_GB18030_3rd_BYTE(*(ib + 2)) &&
267 KICONV_SC_IS_GB18030_4th_BYTE(*(ib + 3)))) {
268 KICONV_SET_ERRNO_WITH_FLAG(4, EILSEQ);
269 }
270
271 gb_val = (uint32_t)(*ib) << 24 |
272 (uint32_t)(*(ib + 1)) << 16 |
273 (uint32_t)(*(ib + 2)) << 8 | *(ib + 3);
274 } else {
275 if (ibtail - ib < 2) {
276 if (flag & KICONV_REPLACE_INVALID) {
277 ib = ibtail;
278 goto REPLACE_INVALID;
279 }
280
281 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
282 }
283
284 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
285 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
286 }
287
288 gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
289 }
290
291 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, isgbk4);
292 if (sz < 0) {
293 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
294 }
295
296 ib += isgbk4 ? 4 : 2;
297 ob += sz;
298 continue;
299
300 REPLACE_INVALID:
301 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
302 ib = oldib;
303 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
304 }
305
306 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
307 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
308 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
309 ret_val++;
310 }
311
312 *inlen = ibtail - ib;
313 *outlen = obtail - ob;
314
315 return (ret_val);
316 }
317
318 /*
319 * Encoding convertor from UTF-8 to GBK.
320 */
321 size_t
kiconv_to_gbk(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)322 kiconv_to_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
323 char **outbuf, size_t *outbytesleft, int *errno)
324 {
325
326 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
327 outbytesleft, errno, utf8_to_gbk);
328 }
329
330 /*
331 * String based encoding convertor from UTF-8 to GBK.
332 */
333 size_t
kiconvstr_to_gbk(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)334 kiconvstr_to_gbk(char *inarray, size_t *inlen, char *outarray,
335 size_t *outlen, int flag, int *errno)
336 {
337 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
338 (uchar_t *)outarray, outlen, flag, errno, utf8_to_gbk);
339 }
340
341 /*
342 * Encoding convertor from GBK to UTF-8.
343 */
344 size_t
kiconv_fr_gbk(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)345 kiconv_fr_gbk(void *kcd, char **inbuf, size_t *inbytesleft,
346 char **outbuf, size_t *outbytesleft, int *errno)
347 {
348 uchar_t *ib;
349 uchar_t *ob;
350 uchar_t *ibtail;
351 uchar_t *obtail;
352 size_t ret_val;
353 int8_t sz;
354 uint32_t gb_val;
355
356 /* Check on the kiconv code conversion descriptor. */
357 if (kcd == NULL || kcd == (void *)-1) {
358 *errno = EBADF;
359 return ((size_t)-1);
360 }
361
362 /* If this is a state reset request, process and return. */
363 if (inbuf == NULL || *inbuf == NULL) {
364 return (0);
365 }
366
367 ret_val = 0;
368 ib = (uchar_t *)*inbuf;
369 ob = (uchar_t *)*outbuf;
370 ibtail = ib + *inbytesleft;
371 obtail = ob + *outbytesleft;
372
373 while (ib < ibtail) {
374 if (KICONV_IS_ASCII(*ib)) {
375 if (ob >= obtail) {
376 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
377 }
378
379 *ob++ = *ib++;
380 continue;
381 }
382
383 /*
384 * Issue EILSEQ error if the first byte is not a
385 * valid GBK leading byte.
386 */
387 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
388 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
389 }
390
391 /*
392 * Issue EINVAL error if input buffer has an incomplete
393 * character at the end of the buffer.
394 */
395 if (ibtail - ib < 2) {
396 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
397 }
398
399 /*
400 * Issue EILSEQ error if the remaining byte is not
401 * a valid GBK byte.
402 */
403 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
404 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
405 }
406
407 /* Now we have a valid GBK character. */
408 gb_val = (uint32_t)(*ib) << 8 | *(ib + 1);
409 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE);
410
411 if (sz < 0) {
412 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
413 }
414
415 ib += 2;
416 ob += sz;
417 }
418
419 *inbuf = (char *)ib;
420 *inbytesleft = ibtail - ib;
421 *outbuf = (char *)ob;
422 *outbytesleft = obtail - ob;
423
424 return (ret_val);
425 }
426
427 /*
428 * String based encoding convertor from GBK to UTF-8.
429 */
430 size_t
kiconvstr_fr_gbk(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)431 kiconvstr_fr_gbk(char *inarray, size_t *inlen, char *outarray,
432 size_t *outlen, int flag, int *errno)
433 {
434 uchar_t *ib;
435 uchar_t *ob;
436 uchar_t *ibtail;
437 uchar_t *obtail;
438 uchar_t *oldib;
439 size_t ret_val;
440 int8_t sz;
441 uint32_t gb_val;
442 boolean_t do_not_ignore_null;
443
444 ret_val = 0;
445 ib = (uchar_t *)inarray;
446 ob = (uchar_t *)outarray;
447 ibtail = ib + *inlen;
448 obtail = ob + *outlen;
449 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
450
451 while (ib < ibtail) {
452 if (*ib == '\0' && do_not_ignore_null)
453 break;
454
455 if (KICONV_IS_ASCII(*ib)) {
456 if (ob >= obtail) {
457 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
458 }
459
460 *ob++ = *ib++;
461 continue;
462 }
463
464 oldib = ib;
465
466 if (! KICONV_SC_IS_GBK_1st_BYTE(*ib)) {
467 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
468 }
469
470 if (ibtail - ib < 2) {
471 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
472 }
473
474 if (! KICONV_SC_IS_GBK_2nd_BYTE(*(ib + 1))) {
475 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
476 }
477
478 gb_val = (uint32_t)(*ib << 8) | *(ib + 1);
479 sz = gbk_to_utf8(gb_val, ob, obtail, &ret_val, B_FALSE);
480
481 if (sz < 0) {
482 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
483 }
484
485 ib += 2;
486 ob += sz;
487 continue;
488
489 REPLACE_INVALID:
490 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
491 ib = oldib;
492 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
493 }
494
495 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
496 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
497 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
498 ret_val++;
499 }
500
501 *inlen = ibtail - ib;
502 *outlen = obtail - ob;
503
504 return (ret_val);
505 }
506
507 /*
508 * Encoding convertor from UTF-8 to EUC-CN.
509 */
510 size_t
kiconv_to_euccn(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)511 kiconv_to_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
512 char **outbuf, size_t *outbytesleft, int *errno)
513 {
514 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
515 outbytesleft, errno, utf8_to_gb2312);
516 }
517
518 /*
519 * String based encoding convertor from UTF-8 to EUC-CN.
520 */
521 size_t
kiconvstr_to_euccn(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)522 kiconvstr_to_euccn(char *inarray, size_t *inlen, char *outarray,
523 size_t *outlen, int flag, int *errno)
524 {
525 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
526 (uchar_t *)outarray, outlen, flag, errno, utf8_to_gb2312);
527 }
528
529 /*
530 * Encoding converto from EUC-CN to UTF-8 code.
531 */
532 size_t
kiconv_fr_euccn(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)533 kiconv_fr_euccn(void *kcd, char **inbuf, size_t *inbytesleft,
534 char **outbuf, size_t *outbytesleft, int *errno)
535 {
536 uchar_t *ib;
537 uchar_t *ob;
538 uchar_t *ibtail;
539 uchar_t *obtail;
540 size_t ret_val;
541 int8_t sz;
542
543 /* Check on the kiconv code conversion descriptor. */
544 if (kcd == NULL || kcd == (void *)-1) {
545 *errno = EBADF;
546 return ((size_t)-1);
547 }
548
549 /* If this is a state reset request, process and return. */
550 if (inbuf == NULL || *inbuf == NULL) {
551 return (0);
552 }
553
554 ret_val = 0;
555 ib = (uchar_t *)*inbuf;
556 ob = (uchar_t *)*outbuf;
557 ibtail = ib + *inbytesleft;
558 obtail = ob + *outbytesleft;
559
560 while (ib < ibtail) {
561 if (KICONV_IS_ASCII(*ib)) {
562 if (ob >= obtail) {
563 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
564 }
565
566 *ob++ = *ib++;
567 continue;
568 }
569
570 /*
571 * Issue EILSEQ error if the first byte is not a
572 * valid GB2312 leading byte.
573 */
574 if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
575 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
576 }
577
578 /*
579 * Issue EINVAL error if input buffer has an incomplete
580 * character at the end of the buffer.
581 */
582 if (ibtail - ib < 2) {
583 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
584 }
585
586 /*
587 * Issue EILSEQ error if the remaining byte is not
588 * a valid GB2312 byte.
589 */
590 if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
591 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
592 }
593
594 /* Now we have a valid GB2312 character */
595 sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
596 if (sz < 0) {
597 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
598 }
599
600 ib += 2;
601 ob += sz;
602 }
603
604 *inbuf = (char *)ib;
605 *inbytesleft = ibtail - ib;
606 *outbuf = (char *)ob;
607 *outbytesleft = obtail - ob;
608
609 return (ret_val);
610 }
611
612 /*
613 * String based encoding convertor from EUC-CN to UTF-8.
614 */
615 size_t
kiconvstr_fr_euccn(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)616 kiconvstr_fr_euccn(char *inarray, size_t *inlen, char *outarray,
617 size_t *outlen, int flag, int *errno)
618 {
619 uchar_t *ib;
620 uchar_t *ob;
621 uchar_t *ibtail;
622 uchar_t *obtail;
623 uchar_t *oldib;
624 size_t ret_val;
625 int8_t sz;
626 boolean_t do_not_ignore_null;
627
628 ret_val = 0;
629 ib = (uchar_t *)inarray;
630 ob = (uchar_t *)outarray;
631 ibtail = ib + *inlen;
632 obtail = ob + *outlen;
633 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
634
635 while (ib < ibtail) {
636 if (*ib == '\0' && do_not_ignore_null)
637 break;
638
639 if (KICONV_IS_ASCII(*ib)) {
640 if (ob >= obtail) {
641 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
642 }
643
644 *ob++ = *ib++;
645 continue;
646 }
647
648 oldib = ib;
649
650 if (! KICONV_SC_IS_GB2312_BYTE(*ib)) {
651 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
652 }
653
654 if (ibtail - ib < 2) {
655 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
656 }
657
658 if (! KICONV_SC_IS_GB2312_BYTE(*(ib + 1))) {
659 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
660 }
661
662 sz = gb2312_to_utf8(*ib, *(ib + 1), ob, obtail, &ret_val);
663 if (sz < 0) {
664 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
665 }
666
667 ib += 2;
668 ob += sz;
669 continue;
670
671 REPLACE_INVALID:
672 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
673 ib = oldib;
674 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
675 }
676
677 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
678 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
679 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
680 ret_val++;
681 }
682
683 *inlen = ibtail - ib;
684 *outlen = obtail - ob;
685
686 return (ret_val);
687 }
688
689 /*
690 * Convert single GB2312 character to UTF-8.
691 * Return: > 0 - Converted successfully
692 * = -1 - E2BIG
693 */
694 static int8_t
gb2312_to_utf8(uchar_t b1,uchar_t b2,uchar_t * ob,uchar_t * obtail,size_t * ret_val)695 gb2312_to_utf8(uchar_t b1, uchar_t b2, uchar_t *ob, uchar_t *obtail,
696 size_t *ret_val)
697 {
698 size_t index;
699 int8_t sz;
700 uchar_t *u8;
701
702 /* index = (b1 - KICONV_EUC_START) * 94 + b2 - KICONV_EUC_START; */
703 index = b1 * 94 + b2 - 0x3BBF;
704
705 if (index >= KICONV_GB2312_UTF8_MAX)
706 index = KICONV_GB2312_UTF8_MAX - 1; /* Map to 0xEFBFBD */
707
708 u8 = kiconv_gb2312_utf8[index];
709 sz = u8_number_of_bytes[u8[0]];
710
711 if (obtail - ob < sz) {
712 *ret_val = (size_t)-1;
713 return (-1);
714 }
715
716 for (index = 0; index < sz; index++)
717 *ob++ = u8[index];
718
719 /*
720 * As kiconv_gb2312_utf8 contain muliple KICONV_UTF8_REPLACEMENT_CHAR
721 * elements, so need to ckeck more.
722 */
723 if (sz == KICONV_UTF8_REPLACEMENT_CHAR_LEN &&
724 u8[0] == KICONV_UTF8_REPLACEMENT_CHAR1 &&
725 u8[1] == KICONV_UTF8_REPLACEMENT_CHAR2 &&
726 u8[2] == KICONV_UTF8_REPLACEMENT_CHAR3)
727 (*ret_val)++;
728
729 return (sz);
730 }
731
732 /*
733 * Convert single GB18030 or GBK character to UTF-8.
734 * Return: > 0 - Converted successfully
735 * = -1 - E2BIG
736 */
737 static int8_t
gbk_to_utf8(uint32_t gbk_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val,boolean_t isgbk4)738 gbk_to_utf8(uint32_t gbk_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val,
739 boolean_t isgbk4)
740 {
741 size_t index;
742 int8_t sz;
743 uchar_t u8array[4];
744 uchar_t *u8;
745
746 if (isgbk4) {
747 if (gbk_val >= KICONV_SC_PLANE1_GB18030_START) {
748 uint32_t u32;
749
750 /*
751 * u32 = ((gbk_val >> 24) - 0x90) * 12600 +
752 * (((gbk_val & 0xFF0000) >> 16) - 0x30) * 1260 +
753 * (((gbk_val & 0xFF00) >> 8) - 0x81) * 10 +
754 * (gbk_val & 0xFF - 0x30)+
755 * KICONV_SC_PLANE1_UCS4_START;
756 */
757 u32 = (gbk_val >> 24) * 12600 +
758 ((gbk_val & 0xFF0000) >> 16) * 1260 +
759 ((gbk_val & 0xFF00) >> 8) * 10 +
760 (gbk_val & 0xFF) - 0x1BA0FA;
761 u8array[0] = (uchar_t)(0xF0 | ((u32 & 0x1C0000) >> 18));
762 u8array[1] = (uchar_t)(0x80 | ((u32 & 0x03F000) >> 12));
763 u8array[2] = (uchar_t)(0x80 | ((u32 & 0x000FC0) >> 6));
764 u8array[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
765 u8 = u8array;
766 index = 1;
767 } else {
768 index = kiconv_binsearch(gbk_val,
769 kiconv_gbk4_utf8, KICONV_GBK4_UTF8_MAX);
770 u8 = kiconv_gbk4_utf8[index].u8;
771 }
772 } else {
773 index = kiconv_binsearch(gbk_val,
774 kiconv_gbk_utf8, KICONV_GBK_UTF8_MAX);
775 u8 = kiconv_gbk_utf8[index].u8;
776 }
777
778 sz = u8_number_of_bytes[u8[0]];
779 if (obtail - ob < sz) {
780 *ret_val = (size_t)-1;
781 return (-1);
782 }
783
784 if (index == 0)
785 (*ret_val)++; /* Non-identical conversion */
786
787 for (index = 0; index < sz; index++)
788 *ob++ = u8[index];
789
790 return (sz);
791 }
792
793 /*
794 * Convert single UTF-8 character to GB18030.
795 * Return: > 0 - Converted successfully
796 * = -1 - E2BIG
797 */
798 /* ARGSUSED */
799 static int8_t
utf8_to_gb18030(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret)800 utf8_to_gb18030(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
801 uchar_t *ob, uchar_t *obtail, size_t *ret)
802 {
803 size_t index;
804 int8_t gbklen;
805 uint32_t gbkcode;
806
807 if (utf8 >= KICONV_SC_PLANE1_UTF8_START) {
808 /* Four bytes GB18030 [0x90308130, 0xe339fe39] handling. */
809 uint32_t u32;
810
811 u32 = (((utf8 & 0x07000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
812 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
813 KICONV_SC_PLANE1_UCS4_START;
814 gbkcode = ((u32 / 12600 + 0x90) << 24) |
815 (((u32 % 12600) / 1260 + 0x30) << 16) |
816 (((u32 % 1260) / 10 + 0x81) << 8) | (u32 % 10 + 0x30);
817 gbklen = 4;
818 index = 1;
819 } else {
820 index = kiconv_binsearch(utf8, kiconv_utf8_gb18030,
821 KICONV_UTF8_GB18030_MAX);
822 gbkcode = kiconv_utf8_gb18030[index].value;
823 KICONV_SC_GET_GB_LEN(gbkcode, gbklen);
824 }
825
826 if (obtail - ob < gbklen) {
827 *ret = (size_t)-1;
828 return (-1);
829 }
830
831 if (index == 0)
832 (*ret)++; /* Non-identical conversion */
833
834 if (gbklen == 2) {
835 *ob++ = (uchar_t)(gbkcode >> 8);
836 } else if (gbklen == 4) {
837 *ob++ = (uchar_t)(gbkcode >> 24);
838 *ob++ = (uchar_t)(gbkcode >> 16);
839 *ob++ = (uchar_t)(gbkcode >> 8);
840 }
841 *ob = (uchar_t)(gbkcode & 0xFF);
842
843 return (gbklen);
844 }
845
846 /*
847 * Convert single UTF-8 character to GBK.
848 * Return: > 0 - Converted successfully
849 * = -1 - E2BIG
850 */
851 /* ARGSUSED */
852 static int8_t
utf8_to_gbk(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret)853 utf8_to_gbk(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
854 uchar_t *ob, uchar_t *obtail, size_t *ret)
855 {
856 size_t index;
857 int8_t gbklen;
858 uint32_t gbkcode;
859
860 index = kiconv_binsearch(utf8, kiconv_utf8_gb18030,
861 KICONV_UTF8_GB18030_MAX);
862 gbkcode = kiconv_utf8_gb18030[index].value;
863 KICONV_SC_GET_GB_LEN(gbkcode, gbklen);
864
865 /* GBK and GB18030 share the same table, so check the length. */
866 if (gbklen == 4) {
867 index = 0;
868 gbkcode = kiconv_utf8_gb18030[index].value;
869 gbklen = 1;
870 }
871
872 if (obtail - ob < gbklen) {
873 *ret = (size_t)-1;
874 return (-1);
875 }
876
877 if (index == 0)
878 (*ret)++; /* Non-identical conversion */
879
880 if (gbklen > 1)
881 *ob++ = (uchar_t)(gbkcode >> 8);
882 *ob = (uchar_t)(gbkcode & 0xFF);
883
884 return (gbklen);
885 }
886
887 /*
888 * Convert single UTF-8 character to GB2312.
889 * Return: > 0 - Converted successfully
890 * = -1 - E2BIG
891 */
892 /* ARGSUSED */
893 static int8_t
utf8_to_gb2312(uint32_t utf8,uchar_t ** inbuf,uchar_t * intail,uchar_t * ob,uchar_t * obtail,size_t * ret)894 utf8_to_gb2312(uint32_t utf8, uchar_t **inbuf, uchar_t *intail,
895 uchar_t *ob, uchar_t *obtail, size_t *ret)
896 {
897 size_t index;
898 int8_t gblen;
899 uint32_t gbcode;
900
901 index = kiconv_binsearch(utf8, kiconv_utf8_gb2312,
902 KICONV_UTF8_GB2312_MAX);
903 gbcode = kiconv_utf8_gb2312[index].value;
904 gblen = (gbcode <= 0xFF) ? 1 : 2;
905
906 if (obtail - ob < gblen) {
907 *ret = (size_t)-1;
908 return (-1);
909 }
910
911 if (index == 0)
912 (*ret)++;
913
914 if (gblen > 1)
915 *ob++ = (uchar_t)(gbcode >> 8);
916 *ob = (uchar_t)(gbcode & 0xFF);
917
918 return (gblen);
919 }
920
921 static kiconv_ops_t kiconv_sc_ops_tbl[] = {
922 {
923 "gb18030", "utf-8", kiconv_open_to_cck, kiconv_to_gb18030,
924 kiconv_close_to_cck, kiconvstr_to_gb18030
925 },
926 {
927 "utf-8", "gb18030", open_fr_gb18030, kiconv_fr_gb18030,
928 close_fr_sc, kiconvstr_fr_gb18030
929 },
930 {
931 "gbk", "utf-8", kiconv_open_to_cck, kiconv_to_gbk,
932 kiconv_close_to_cck, kiconvstr_to_gbk
933 },
934 {
935 "utf-8", "gbk", open_fr_gbk, kiconv_fr_gbk,
936 close_fr_sc, kiconvstr_fr_gbk
937 },
938 {
939 "euccn", "utf-8", kiconv_open_to_cck, kiconv_to_euccn,
940 kiconv_close_to_cck, kiconvstr_to_euccn
941 },
942 {
943 "utf-8", "euccn", open_fr_euccn, kiconv_fr_euccn,
944 close_fr_sc, kiconvstr_fr_euccn
945 },
946 };
947
948 static kiconv_module_info_t kiconv_sc_info = {
949 "kiconv_sc", /* module name */
950 sizeof (kiconv_sc_ops_tbl) / sizeof (kiconv_sc_ops_tbl[0]),
951 kiconv_sc_ops_tbl,
952 0,
953 NULL,
954 NULL,
955 0
956 };
957
958 static struct modlkiconv modlkiconv_sc = {
959 &mod_kiconvops,
960 "kiconv Simplified Chinese module 1.0",
961 &kiconv_sc_info
962 };
963
964 static struct modlinkage modlinkage = {
965 MODREV_1,
966 (void *)&modlkiconv_sc,
967 NULL
968 };
969
970 int
_init(void)971 _init(void)
972 {
973 int err;
974
975 err = mod_install(&modlinkage);
976 if (err)
977 cmn_err(CE_WARN, "kiconv_sc: failed to load kernel module");
978
979 return (err);
980 }
981
982 int
_fini(void)983 _fini(void)
984 {
985 int err;
986
987 /*
988 * If this module is being used, then, we cannot remove the module.
989 * The following checking will catch pretty much all usual cases.
990 *
991 * Any remaining will be catached by the kiconv_unregister_module()
992 * during mod_remove() at below.
993 */
994 if (kiconv_module_ref_count(KICONV_MODULE_ID_SC))
995 return (EBUSY);
996
997 err = mod_remove(&modlinkage);
998 if (err)
999 cmn_err(CE_WARN, "kiconv_sc: failed to remove kernel module");
1000
1001 return (err);
1002 }
1003
1004 int
_info(struct modinfo * modinfop)1005 _info(struct modinfo *modinfop)
1006 {
1007 return (mod_info(&modlinkage, modinfop));
1008 }
1009