1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/systm.h>
30 #include <sys/debug.h>
31 #include <sys/kmem.h>
32 #include <sys/sunddi.h>
33 #include <sys/byteorder.h>
34 #include <sys/errno.h>
35 #include <sys/modctl.h>
36 #include <sys/u8_textprep.h>
37 #include <sys/kiconv.h>
38 #include <sys/kiconv_cck_common.h>
39 #include <sys/kiconv_tc.h>
40 #include <sys/kiconv_big5_utf8.h>
41 #include <sys/kiconv_euctw_utf8.h>
42 #include <sys/kiconv_hkscs_utf8.h>
43 #include <sys/kiconv_cp950hkscs_utf8.h>
44 #include <sys/kiconv_utf8_big5.h>
45 #include <sys/kiconv_utf8_euctw.h>
46 #include <sys/kiconv_utf8_cp950hkscs.h>
47 #include <sys/kiconv_utf8_hkscs.h>
48
49 /* 4 HKSCS-2004 code points map to 2 Unicode code points separately. */
50 static uchar_t hkscs_special_sequence[][4] = {
51 { 0xc3, 0x8a, 0xcc, 0x84 }, /* 0x8862 */
52 { 0xc3, 0x8a, 0xcc, 0x8c }, /* 0x8864 */
53 { 0xc3, 0xaa, 0xcc, 0x84 }, /* 0x88a3 */
54 { 0xc3, 0xaa, 0xcc, 0x8c } /* 0x88a5 */
55 };
56
/*
 * HKSCS-2004 codes for U+00CA / U+00EA, alone and followed by a combining
 * mark.  Four of the six entries are Unicode code point pairs that collapse
 * into a single HKSCS-2004 code point.
 *
 * The table is laid out in groups of three per base letter, which is how
 * utf8_to_big5hkscs() indexes it:
 *	group * 3 + 0	base letter alone
 *	group * 3 + 1	base letter followed by U+0304 (UTF-8: 0xCC 0x84)
 *	group * 3 + 2	base letter followed by U+030C (UTF-8: 0xCC 0x8C)
 *
 * The table is never written to, so it is declared const.
 */
static const uint32_t ucs_special_sequence[] = {
	0x8866,		/* U+00ca */
	0x8862,		/* U+00ca U+0304 */
	0x8864,		/* U+00ca U+030c */
	0x88a7,		/* U+00ea */
	0x88a3,		/* U+00ea U+0304 */
	0x88a5		/* U+00ea U+030c */
};
66
67 typedef int8_t (*kiconv_big5toutf8_t)(uint32_t value, uchar_t *ob,
68 uchar_t *obtail, size_t *ret_val);
69
70 static int8_t utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
71 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
72 static int8_t utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
73 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
74 static int8_t utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf,
75 uchar_t *ibtail, uchar_t *ob, uchar_t *obtail, size_t *ret_val);
76 static int8_t utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
77 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
78 static int8_t big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
79 size_t *ret_val);
80 static int8_t big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
81 uchar_t *obtail, size_t *ret_val);
82 static int8_t cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
83 uchar_t *obtail, size_t *ret_val);
84 static int8_t euctw_to_utf8(size_t plane_no, uint32_t euctw_val,
85 uchar_t *ob, uchar_t *obtail, size_t *ret_val);
86 static uint32_t get_unicode_from_UDA(size_t plane_no, uchar_t byte1,
87 uchar_t byte2);
88
/*
 * Magic ids that the open_fr_*() routines hand back as the conversion
 * descriptor.  close_fr_tc() rejects any descriptor whose value is above
 * KICONV_TC_MAX_MAGIC_ID, so that macro must track the largest id.
 */
#define	KICONV_TC_BIG5			0x01
#define	KICONV_TC_BIG5HKSCS		0x02
#define	KICONV_TC_CP950HKSCS		0x03
#define	KICONV_TC_EUCTW			0x04
#define	KICONV_TC_MAX_MAGIC_ID		KICONV_TC_EUCTW
94
95 static void *
open_fr_big5()96 open_fr_big5()
97 {
98 return ((void *)KICONV_TC_BIG5);
99 }
100
101 static void *
open_fr_big5hkscs()102 open_fr_big5hkscs()
103 {
104 return ((void *)KICONV_TC_BIG5HKSCS);
105 }
106
107 static void *
open_fr_cp950hkscs()108 open_fr_cp950hkscs()
109 {
110 return ((void *)KICONV_TC_CP950HKSCS);
111 }
112
113 static void *
open_fr_euctw()114 open_fr_euctw()
115 {
116 return ((void *)KICONV_TC_EUCTW);
117 }
118
119 static int
close_fr_tc(void * s)120 close_fr_tc(void *s)
121 {
122 if ((uintptr_t)s > KICONV_TC_MAX_MAGIC_ID)
123 return (EBADF);
124
125 return (0);
126 }
127
128 /*
129 * Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) to UTF-8.
130 */
131 static size_t
kiconv_fr_big5_common(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_big5toutf8_t ptr_big5touf8)132 kiconv_fr_big5_common(void *kcd, char **inbuf, size_t *inbytesleft,
133 char **outbuf, size_t *outbytesleft, int *errno,
134 kiconv_big5toutf8_t ptr_big5touf8)
135 {
136 uchar_t *ib;
137 uchar_t *ob;
138 uchar_t *ibtail;
139 uchar_t *obtail;
140 size_t ret_val;
141 int8_t sz;
142 uint32_t big5_val;
143
144 /* Check on the kiconv code conversion descriptor. */
145 if (kcd == NULL || kcd == (void *)-1) {
146 *errno = EBADF;
147 return ((size_t)-1);
148 }
149
150 /* If this is a state reset request, process and return. */
151 if (inbuf == NULL || *inbuf == NULL) {
152 return (0);
153 }
154
155 ret_val = 0;
156 ib = (uchar_t *)*inbuf;
157 ob = (uchar_t *)*outbuf;
158 ibtail = ib + *inbytesleft;
159 obtail = ob + *outbytesleft;
160
161 while (ib < ibtail) {
162 if (KICONV_IS_ASCII(*ib)) {
163 if (ob >= obtail) {
164 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
165 }
166
167 *ob++ = *ib++;
168 continue;
169 }
170
171 /*
172 * Issue EILSEQ error if the first byte is not a
173 * valid BIG5/HKSCS leading byte.
174 */
175 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
176 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
177 }
178
179 /*
180 * Issue EINVAL error if input buffer has an incomplete
181 * character at the end of the buffer.
182 */
183 if (ibtail - ib < 2) {
184 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
185 }
186
187 /*
188 * Issue EILSEQ error if the remaining bytes is not
189 * a valid BIG5/HKSCS byte.
190 */
191 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
192 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
193 }
194
195 /* Now we have a valid BIG5/HKSCS character. */
196 big5_val = (uint32_t)(*ib) << 8 | *(ib + 1);
197 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
198
199 if (sz < 0) {
200 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
201 }
202
203 ib += 2;
204 ob += sz;
205 }
206
207 *inbuf = (char *)ib;
208 *inbytesleft = ibtail - ib;
209 *outbuf = (char *)ob;
210 *outbytesleft = obtail - ob;
211
212 return (ret_val);
213 }
214
215 /*
216 * String based Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS)
217 * to UTF-8.
218 */
219 static size_t
kiconvstr_fr_big5_common(uchar_t * ib,size_t * inlen,uchar_t * ob,size_t * outlen,int flag,int * errno,kiconv_big5toutf8_t ptr_big5touf8)220 kiconvstr_fr_big5_common(uchar_t *ib, size_t *inlen, uchar_t *ob,
221 size_t *outlen, int flag, int *errno,
222 kiconv_big5toutf8_t ptr_big5touf8)
223 {
224 uchar_t *oldib;
225 uchar_t *ibtail;
226 uchar_t *obtail;
227 size_t ret_val;
228 int8_t sz;
229 uint32_t big5_val;
230 boolean_t do_not_ignore_null;
231
232 ret_val = 0;
233 ibtail = ib + *inlen;
234 obtail = ob + *outlen;
235 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
236
237 while (ib < ibtail) {
238 if (*ib == '\0' && do_not_ignore_null)
239 break;
240
241 if (KICONV_IS_ASCII(*ib)) {
242 if (ob >= obtail) {
243 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
244 }
245
246 *ob++ = *ib++;
247 continue;
248 }
249
250 oldib = ib;
251
252 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
253 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
254 }
255
256 if (ibtail - ib < 2) {
257 KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
258 }
259
260 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
261 KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
262 }
263
264 big5_val = *ib++;
265 big5_val = (big5_val << 8) | *ib++;
266 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
267
268 if (sz < 0) {
269 ib = oldib;
270 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
271 }
272
273 ob += sz;
274 continue;
275
276 REPLACE_INVALID:
277 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
278 ib = oldib;
279 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
280 }
281
282 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
283 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
284 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
285 ret_val++;
286 }
287
288 *inlen = ibtail - ib;
289 *outlen = obtail - ob;
290
291 return (ret_val);
292 }
293
294 /*
295 * Encoding convertor from BIG5 to UTF-8.
296 */
297 static size_t
kiconv_fr_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)298 kiconv_fr_big5(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf,
299 size_t *outbytesleft, int *errno)
300 {
301 return (kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
302 outbytesleft, errno, big5_to_utf8));
303 }
304
305 /*
306 * String based encoding convertor from BIG5 to UTF-8.
307 */
308 static size_t
kiconvstr_fr_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)309 kiconvstr_fr_big5(char *inarray, size_t *inlen, char *outarray,
310 size_t *outlen, int flag, int *errno)
311 {
312 return (kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
313 (uchar_t *)outarray, outlen, flag, errno,
314 big5_to_utf8));
315 }
316
317 /*
318 * Encoding convertor from BIG5-HKSCS to UTF-8.
319 */
320 static size_t
kiconv_fr_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)321 kiconv_fr_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
322 char **outbuf, size_t *outbytesleft, int *errno)
323 {
324 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
325 outbytesleft, errno, big5hkscs_to_utf8);
326 }
327
328 /*
329 * String based encoding convertor from BIG5-HKSCS to UTF-8.
330 */
331 static size_t
kiconvstr_fr_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)332 kiconvstr_fr_big5hkscs(char *inarray, size_t *inlen, char *outarray,
333 size_t *outlen, int flag, int *errno)
334 {
335 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
336 (uchar_t *)outarray, outlen, flag, errno, big5hkscs_to_utf8);
337 }
338
339 /*
340 * Encoding convertor from CP950-HKSCS to UTF-8.
341 */
342 static size_t
kiconv_fr_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)343 kiconv_fr_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
344 char **outbuf, size_t *outbytesleft, int *errno)
345 {
346 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
347 outbytesleft, errno, cp950hkscs_to_utf8);
348 }
349
350 /*
351 * String based encoding convertor from CP950-HKSCS to UTF-8.
352 */
353 static size_t
kiconvstr_fr_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)354 kiconvstr_fr_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
355 size_t *outlen, int flag, int *errno)
356 {
357 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
358 (uchar_t *)outarray, outlen, flag, errno, cp950hkscs_to_utf8);
359 }
360
361 /*
362 * Encoding convertor from EUC-TW to UTF-8.
363 */
364 static size_t
kiconv_fr_euctw(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)365 kiconv_fr_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
366 char **outbuf, size_t *outbytesleft, int *errno)
367 {
368 uchar_t *ib;
369 uchar_t *ob;
370 uchar_t *ibtail;
371 uchar_t *obtail;
372 uchar_t *oldib;
373 size_t ret_val;
374 size_t plane_no;
375 int8_t sz;
376 uint32_t euctw_val;
377 boolean_t isplane1;
378
379 /* Check on the kiconv code conversion descriptor. */
380 if (kcd == NULL || kcd == (void *)-1) {
381 *errno = EBADF;
382 return ((size_t)-1);
383 }
384
385 /* If this is a state reset request, process and return. */
386 if (inbuf == NULL || *inbuf == NULL) {
387 return (0);
388 }
389
390 ret_val = 0;
391 ib = (uchar_t *)*inbuf;
392 ob = (uchar_t *)*outbuf;
393 ibtail = ib + *inbytesleft;
394 obtail = ob + *outbytesleft;
395
396 while (ib < ibtail) {
397 if (KICONV_IS_ASCII(*ib)) {
398 if (ob >= obtail) {
399 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
400 }
401
402 *ob++ = *ib++;
403 continue;
404 }
405
406 /*
407 * Issue EILSEQ error if the first byte is not a
408 * valid EUC-TW leading byte.
409 */
410 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
411 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
412 }
413
414 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
415 B_FALSE : B_TRUE;
416
417 /*
418 * Issue EINVAL error if input buffer has an incomplete
419 * character at the end of the buffer.
420 */
421 if (ibtail - ib < (isplane1 ? 2 : 4)) {
422 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
423 }
424
425 oldib = ib;
426 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
427
428 /*
429 * Issue EILSEQ error if the remaining bytes are not
430 * valid EUC-TW bytes.
431 */
432 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
433 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
434 }
435
436 if (! isplane1)
437 ib += 2;
438
439 /* Now we have a valid EUC-TW character. */
440 euctw_val = *ib++;
441 euctw_val = (euctw_val << 8) | *ib++;
442 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
443
444 if (sz < 0) {
445 ib = oldib;
446 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
447 }
448
449 ob += sz;
450 }
451
452 *inbuf = (char *)ib;
453 *inbytesleft = ibtail - ib;
454 *outbuf = (char *)ob;
455 *outbytesleft = obtail - ob;
456
457 return (ret_val);
458 }
459
460 /*
461 * String based encoding convertor from EUC-TW to UTF-8.
462 */
463 static size_t
kiconvstr_fr_euctw(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)464 kiconvstr_fr_euctw(char *inarray, size_t *inlen, char *outarray,
465 size_t *outlen, int flag, int *errno)
466 {
467 uchar_t *ib;
468 uchar_t *ob;
469 uchar_t *ibtail;
470 uchar_t *obtail;
471 uchar_t *oldib;
472 size_t ret_val;
473 size_t plane_no;
474 int8_t sz;
475 uint32_t euctw_val;
476 boolean_t isplane1;
477 boolean_t do_not_ignore_null;
478
479 ret_val = 0;
480 ib = (uchar_t *)inarray;
481 ob = (uchar_t *)outarray;
482 ibtail = ib + *inlen;
483 obtail = ob + *outlen;
484 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
485
486 while (ib < ibtail) {
487 if (*ib == '\0' && do_not_ignore_null)
488 break;
489
490 if (KICONV_IS_ASCII(*ib)) {
491 if (ob >= obtail) {
492 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
493 }
494
495 *ob++ = *ib++;
496 continue;
497 }
498
499 oldib = ib;
500
501 if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
502 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
503 }
504
505 isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
506 B_FALSE : B_TRUE;
507
508 if (ibtail - ib < (isplane1 ? 2 : 4)) {
509 if (flag & KICONV_REPLACE_INVALID) {
510 ib = ibtail;
511 goto REPLACE_INVALID;
512 }
513
514 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
515 }
516
517 plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;
518
519 if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
520 KICONV_SET_ERRNO_WITH_FLAG(isplane1 ? 2 : 4, EILSEQ);
521 }
522
523 if (! isplane1)
524 ib += 2;
525
526 euctw_val = *ib++;
527 euctw_val = (euctw_val << 8) | *ib++;
528 sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);
529
530 if (sz < 0) {
531 ib = oldib;
532 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
533 }
534
535 ob += sz;
536 continue;
537
538 REPLACE_INVALID:
539 if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
540 ib = oldib;
541 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
542 }
543
544 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
545 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
546 *ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
547 ret_val++;
548 }
549
550 *inlen = ibtail - ib;
551 *outlen = obtail - ob;
552
553 return (ret_val);
554 }
555
556 /*
557 * Encoding convertor from UTF-8 to BIG5.
558 */
559 static size_t
kiconv_to_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)560 kiconv_to_big5(void *kcd, char **inbuf, size_t *inbytesleft,
561 char **outbuf, size_t *outbytesleft, int *errno)
562 {
563 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
564 outbytesleft, errno, utf8_to_big5);
565 }
566
567 /*
568 * String based encoding convertor from UTF-8 to BIG5.
569 */
570 static size_t
kiconvstr_to_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)571 kiconvstr_to_big5(char *inarray, size_t *inlen, char *outarray,
572 size_t *outlen, int flag, int *errno)
573 {
574 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
575 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5);
576 }
577
578 /*
579 * Encoding convertor from UTF-8 to EUC-TW.
580 */
581 static size_t
kiconv_to_euctw(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)582 kiconv_to_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
583 char **outbuf, size_t *outbytesleft, int *errno)
584 {
585 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
586 outbytesleft, errno, utf8_to_euctw);
587 }
588
589 /*
590 * String based encoding convertor from UTF-8 to EUC-TW.
591 */
592 static size_t
kiconvstr_to_euctw(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)593 kiconvstr_to_euctw(char *inarray, size_t *inlen, char *outarray,
594 size_t *outlen, int flag, int *errno)
595 {
596 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
597 (uchar_t *)outarray, outlen, flag, errno, utf8_to_euctw);
598 }
599
600 /*
601 * Encoding convertor from UTF-8 to CP950HKSCS.
602 */
603 static size_t
kiconv_to_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)604 kiconv_to_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
605 char **outbuf, size_t *outbytesleft, int *errno)
606 {
607 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
608 outbytesleft, errno, utf8_to_cp950hkscs);
609 }
610
611 /*
612 * String based encoding convertor from UTF-8 to CP950HKSCS.
613 */
614 static size_t
kiconvstr_to_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)615 kiconvstr_to_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
616 size_t *outlen, int flag, int *errno)
617 {
618 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
619 (uchar_t *)outarray, outlen, flag, errno, utf8_to_cp950hkscs);
620 }
621
622 /*
623 * Encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
624 */
625 static size_t
kiconv_to_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)626 kiconv_to_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
627 char **outbuf, size_t *outbytesleft, int *errno)
628 {
629 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
630 outbytesleft, errno, utf8_to_big5hkscs);
631 }
632
633 /*
634 * String based encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
635 */
636 static size_t
kiconvstr_to_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)637 kiconvstr_to_big5hkscs(char *inarray, size_t *inlen, char *outarray,
638 size_t *outlen, int flag, int *errno)
639 {
640 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
641 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5hkscs);
642 }
643
644 /*
645 * Common convertor from single BIG5/CP950-HKSCS character to UTF-8.
646 * Return: > 0 - Converted successfully
647 * = -1 - E2BIG
648 */
649 static int8_t
big5_to_utf8_common(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_array_t * table,size_t nitems)650 big5_to_utf8_common(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
651 size_t *ret_val, kiconv_table_array_t *table, size_t nitems)
652 {
653 size_t index;
654 int8_t sz;
655 uchar_t *u8;
656
657 index = kiconv_binsearch(big5_val, table, nitems);
658 u8 = table[index].u8;
659 sz = u8_number_of_bytes[u8[0]];
660
661 if (obtail - ob < sz) {
662 *ret_val = (size_t)-1;
663 return (-1);
664 }
665
666 if (index == 0)
667 (*ret_val)++; /* Non-identical conversion */
668
669 for (index = 0; index < sz; index++)
670 *ob++ = u8[index];
671
672 return (sz);
673 }
674
675 /*
676 * Convert single BIG5 character to UTF-8.
677 */
678 static int8_t
big5_to_utf8(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)679 big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val)
680 {
681 return (big5_to_utf8_common(big5_val, ob, obtail, ret_val,
682 kiconv_big5_utf8, KICONV_BIG5_UTF8_MAX));
683 }
684
685 /*
686 * Convert single CP950-HKSCS character to UTF-8.
687 */
688 static int8_t
cp950hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)689 cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
690 size_t *ret_val)
691 {
692 return (big5_to_utf8_common(hkscs_val, ob, obtail, ret_val,
693 kiconv_cp950hkscs_utf8, KICONV_CP950HKSCS_UTF8_MAX));
694 }
695
696 /*
697 * Calculate unicode value for some CNS planes which fall in Unicode
698 * UDA range.
699 */
700 static uint32_t
get_unicode_from_UDA(size_t plane_no,uchar_t b1,uchar_t b2)701 get_unicode_from_UDA(size_t plane_no, uchar_t b1, uchar_t b2)
702 {
703 /*
704 * CNS Plane 15 is pre-allocated, so need move Plane 16 to back 15
705 * to compute the Unicode value.
706 */
707 if (plane_no == 16)
708 --plane_no;
709
710 /* 0xF0000 + (plane_no - 12) * 8836 + (b1 - 0xA1) * 94 + (b2 - 0xA1) */
711 return (8836 * plane_no + 94 * b1 + b2 + 0xD2611);
712 }
713
714 /*
715 * Convert single EUC-TW character to UTF-8.
716 * Return: > 0 - Converted successfully
717 * = -1 - E2BIG
718 */
719 static int8_t
euctw_to_utf8(size_t plane_no,uint32_t euctw_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)720 euctw_to_utf8(size_t plane_no, uint32_t euctw_val, uchar_t *ob,
721 uchar_t *obtail, size_t *ret_val)
722 {
723 uint32_t u32;
724 size_t index;
725 int8_t sz;
726 uchar_t udc[4];
727 uchar_t *u8;
728
729 switch (plane_no) {
730 case 1:
731 index = kiconv_binsearch(euctw_val, kiconv_cns1_utf8,
732 KICONV_CNS1_UTF8_MAX);
733 u8 = kiconv_cns1_utf8[index].u8;
734 break;
735 case 2:
736 index = kiconv_binsearch(euctw_val, kiconv_cns2_utf8,
737 KICONV_CNS2_UTF8_MAX);
738 u8 = kiconv_cns2_utf8[index].u8;
739 break;
740 case 3:
741 index = kiconv_binsearch(euctw_val, kiconv_cns3_utf8,
742 KICONV_CNS3_UTF8_MAX);
743 u8 = kiconv_cns3_utf8[index].u8;
744 break;
745 case 4:
746 index = kiconv_binsearch(euctw_val, kiconv_cns4_utf8,
747 KICONV_CNS4_UTF8_MAX);
748 u8 = kiconv_cns4_utf8[index].u8;
749 break;
750 case 5:
751 index = kiconv_binsearch(euctw_val, kiconv_cns5_utf8,
752 KICONV_CNS5_UTF8_MAX);
753 u8 = kiconv_cns5_utf8[index].u8;
754 break;
755 case 6:
756 index = kiconv_binsearch(euctw_val, kiconv_cns6_utf8,
757 KICONV_CNS6_UTF8_MAX);
758 u8 = kiconv_cns6_utf8[index].u8;
759 break;
760 case 7:
761 index = kiconv_binsearch(euctw_val, kiconv_cns7_utf8,
762 KICONV_CNS7_UTF8_MAX);
763 u8 = kiconv_cns7_utf8[index].u8;
764 break;
765 case 12:
766 case 13:
767 case 14:
768 case 16:
769 u32 = get_unicode_from_UDA(plane_no,
770 (euctw_val & 0xFF00) >> 8, euctw_val & 0xFF);
771 /*
772 * As U+F0000 <= u32 <= U+F8A0F, so its UTF-8 sequence
773 * will occupy 4 bytes.
774 */
775 udc[0] = 0xF3;
776 udc[1] = (uchar_t)(0x80 | (u32 & 0x03F000) >> 12);
777 udc[2] = (uchar_t)(0x80 | (u32 & 0x000FC0) >> 6);
778 udc[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
779 u8 = udc;
780 index = 1;
781 break;
782 case 15:
783 index = kiconv_binsearch(euctw_val, kiconv_cns15_utf8,
784 KICONV_CNS15_UTF8_MAX);
785 u8 = kiconv_cns15_utf8[index].u8;
786 break;
787 default:
788 index = 0;
789 u8 = kiconv_cns1_utf8[index].u8;
790 }
791
792 sz = u8_number_of_bytes[u8[0]];
793 if (obtail - ob < sz) {
794 *ret_val = (size_t)-1;
795 return (-1);
796 }
797
798 if (index == 0)
799 (*ret_val)++;
800
801 for (index = 0; index < sz; index++)
802 *ob++ = u8[index];
803
804 return (sz);
805 }
806
807 /*
808 * Convert single HKSCS character to UTF-8.
809 * Return: > 0 - Converted successfully
810 * = -1 - E2BIG
811 */
812 static int8_t
big5hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)813 big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
814 size_t *ret_val)
815 {
816 size_t index;
817 int8_t sz;
818 uchar_t *u8;
819
820 index = kiconv_binsearch(hkscs_val, kiconv_hkscs_utf8,
821 KICONV_HKSCS_UTF8_MAX);
822 u8 = kiconv_hkscs_utf8[index].u8;
823
824 /*
825 * Single HKSCS-2004 character may map to 2 Unicode
826 * code points.
827 */
828 if (u8[0] == 0xFF) {
829 u8 = hkscs_special_sequence[u8[1]];
830 sz = 4;
831 } else {
832 sz = u8_number_of_bytes[u8[0]];
833 }
834
835 if (obtail - ob < sz) {
836 *ret_val = (size_t)-1;
837 return (-1);
838 }
839
840 if (index == 0)
841 (*ret_val)++; /* Non-identical conversion. */
842
843 for (index = 0; index < sz; index++)
844 *ob++ = u8[index];
845
846 return (sz);
847 }
848
849 /*
850 * Convert single UTF-8 character to EUC-TW.
851 * Return: > 0 - Converted successfully
852 * = -1 - E2BIG
853 */
854 /* ARGSUSED */
855 static int8_t
utf8_to_euctw(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)856 utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
857 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
858 {
859 size_t index;
860 size_t plane_no;
861 uchar_t byte1;
862 uchar_t byte2;
863
864 if (utf8 >= KICONV_TC_UDA_UTF8_START &&
865 utf8 <= KICONV_TC_UDA_UTF8_END) {
866 /*
867 * Calculate EUC-TW code if utf8 is in Unicode
868 * Private Plane 15.
869 */
870 index = (((utf8 & 0x7000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
871 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
872 KICONV_TC_UDA_UCS4_START;
873 plane_no = 12 + index / 8836;
874 byte1 = 0xA1 + (index % 8836) / 94;
875 byte2 = 0xA1 + index % 94;
876
877 /* CNS Plane 15 is pre-allocated, so place it into Plane 16. */
878 if (plane_no == 15)
879 plane_no = 16;
880 } else {
881 uint32_t euctw_val;
882
883 index = kiconv_binsearch(utf8, kiconv_utf8_euctw,
884 KICONV_UTF8_EUCTW_MAX);
885
886 if (index == 0) {
887 if (ob >= obtail) {
888 *ret_val = (size_t)-1;
889 return (-1);
890 }
891
892 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
893 (*ret_val)++;
894
895 return (1);
896 }
897
898 euctw_val = kiconv_utf8_euctw[index].value;
899 byte1 = (euctw_val & 0xFF00) >> 8;
900 byte2 = euctw_val & 0xFF;
901 plane_no = euctw_val >> 16;
902 }
903
904 if (obtail - ob < (plane_no == 1 ? 2 : 4)) {
905 *ret_val = (size_t)-1;
906 return (-1);
907 }
908
909 if (plane_no != 1) {
910 *ob++ = KICONV_TC_EUCTW_MBYTE;
911 *ob++ = KICONV_TC_EUCTW_PMASK + plane_no;
912 }
913
914 *ob++ = byte1;
915 *ob = byte2;
916
917 return (plane_no == 1 ? 2 : 4);
918 }
919
920 /*
921 * Convert single UTF-8 character to BIG5-HKSCS
922 * Return: > 0 - Converted successfully
923 * = -1 - E2BIG
924 */
925 static int8_t
utf8_to_big5hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)926 utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
927 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
928 {
929 size_t index;
930 int8_t hkscslen;
931 uint32_t hkscscode;
932 boolean_t special_sequence = B_FALSE;
933
934 index = kiconv_binsearch(utf8, kiconv_utf8_hkscs,
935 KICONV_UTF8_HKSCS_MAX);
936 hkscscode = kiconv_utf8_hkscs[index].value;
937
938 /*
939 * There are 4 special code points in HKSCS-2004 which mapped
940 * to 2 UNICODE code points.
941 */
942 if ((int32_t)hkscscode < 0) {
943 size_t special_index = (-(int32_t)hkscscode - 1) * 3;
944
945 /* Check the following 2 bytes. */
946 if (ibtail - *inbuf >= 2 && **inbuf == 0xcc &&
947 (*(*inbuf + 1) == 0x84 || *(*inbuf + 1) == 0x8c)) {
948 special_index += (*(*inbuf + 1) == 0x84 ? 1 : 2);
949 special_sequence = B_TRUE;
950 }
951
952 hkscscode = ucs_special_sequence[special_index];
953 }
954
955 hkscslen = (hkscscode <= 0xFF) ? 1 : 2;
956 if (obtail - ob < hkscslen) {
957 *ret_val = (size_t)-1;
958 return (-1);
959 }
960
961 if (index == 0)
962 (*ret_val)++;
963
964 if (hkscslen > 1)
965 *ob++ = (uchar_t)(hkscscode >> 8);
966 *ob = (uchar_t)(hkscscode & 0xFF);
967
968 if (special_sequence) { /* Advance for special sequence */
969 (*inbuf) += 2;
970 }
971
972 return (hkscslen);
973 }
974
975 /*
976 * Common convertor for UTF-8 to BIG5/CP950-HKSCS.
977 * Return: > 0 - Converted successfully
978 * = -1 - E2BIG
979 */
980 static int8_t
utf8_to_big5_common(uint32_t utf8,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_t * table,size_t nitems)981 utf8_to_big5_common(uint32_t utf8, uchar_t *ob, uchar_t *obtail,
982 size_t *ret_val, kiconv_table_t *table, size_t nitems)
983 {
984 size_t index;
985 int8_t big5len;
986 uint32_t big5code;
987
988 index = kiconv_binsearch(utf8, table, nitems);
989 big5code = table[index].value;
990 big5len = (big5code <= 0xFF) ? 1 : 2;
991
992 if (obtail - ob < big5len) {
993 *ret_val = (size_t)-1;
994 return (-1);
995 }
996
997 if (index == 0)
998 (*ret_val)++;
999
1000 if (big5len > 1)
1001 *ob++ = (uchar_t)(big5code >> 8);
1002 *ob = (uchar_t)(big5code & 0xFF);
1003
1004 return (big5len);
1005 }
1006
1007 /*
1008 * Convert single UTF-8 character to BIG5.
1009 */
1010 /* ARGSUSED */
1011 static int8_t
utf8_to_big5(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1012 utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1013 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1014 {
1015 return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1016 kiconv_utf8_big5, KICONV_UTF8_BIG5_MAX));
1017 }
1018
1019 /*
1020 * Convert single UTF-8 character to CP950-HKSCS for Windows compatibility.
1021 */
1022 /* ARGSUSED */
1023 static int8_t
utf8_to_cp950hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1024 utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1025 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1026 {
1027 return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1028 kiconv_utf8_cp950hkscs, KICONV_UTF8_CP950HKSCS));
1029 }
1030
1031 static kiconv_ops_t kiconv_tc_ops_tbl[] = {
1032 {
1033 "big5", "utf-8", kiconv_open_to_cck, kiconv_to_big5,
1034 kiconv_close_to_cck, kiconvstr_to_big5
1035 },
1036 {
1037 "utf-8", "big5", open_fr_big5, kiconv_fr_big5,
1038 close_fr_tc, kiconvstr_fr_big5
1039 },
1040
1041 {
1042 "big5-hkscs", "utf-8", kiconv_open_to_cck, kiconv_to_big5hkscs,
1043 kiconv_close_to_cck, kiconvstr_to_big5hkscs
1044 },
1045 {
1046 "utf-8", "big5-hkscs", open_fr_big5hkscs, kiconv_fr_big5hkscs,
1047 close_fr_tc, kiconvstr_fr_big5hkscs
1048 },
1049
1050 {
1051 "euc-tw", "utf-8", kiconv_open_to_cck, kiconv_to_euctw,
1052 kiconv_close_to_cck, kiconvstr_to_euctw
1053 },
1054 {
1055 "utf-8", "euc-tw", open_fr_euctw, kiconv_fr_euctw,
1056 close_fr_tc, kiconvstr_fr_euctw
1057 },
1058
1059 {
1060 "cp950-hkscs", "utf-8", kiconv_open_to_cck,
1061 kiconv_to_cp950hkscs, kiconv_close_to_cck,
1062 kiconvstr_to_cp950hkscs
1063 },
1064 {
1065 "utf-8", "cp950-hkscs", open_fr_cp950hkscs,
1066 kiconv_fr_cp950hkscs, close_fr_tc, kiconvstr_fr_cp950hkscs
1067 },
1068 };
1069
1070 static kiconv_module_info_t kiconv_tc_info = {
1071 "kiconv_tc", /* module name */
1072 sizeof (kiconv_tc_ops_tbl) / sizeof (kiconv_tc_ops_tbl[0]),
1073 kiconv_tc_ops_tbl,
1074 0,
1075 NULL,
1076 NULL,
1077 0
1078 };
1079
1080 static struct modlkiconv modlkiconv_tc = {
1081 &mod_kiconvops,
1082 "kiconv Traditional Chinese module 1.0",
1083 &kiconv_tc_info
1084 };
1085
1086 static struct modlinkage modlinkage = {
1087 MODREV_1,
1088 (void *)&modlkiconv_tc,
1089 NULL
1090 };
1091
1092 int
_init(void)1093 _init(void)
1094 {
1095 int err;
1096
1097 err = mod_install(&modlinkage);
1098 if (err)
1099 cmn_err(CE_WARN, "kiconv_tc: failed to load kernel module");
1100
1101 return (err);
1102 }
1103
1104 int
_fini(void)1105 _fini(void)
1106 {
1107 int err;
1108
1109 /*
1110 * If this module is being used, then, we cannot remove the module.
1111 * The following checking will catch pretty much all usual cases.
1112 *
1113 * Any remaining will be catached by the kiconv_unregister_module()
1114 * during mod_remove() at below.
1115 */
1116 if (kiconv_module_ref_count(KICONV_MODULE_ID_TC))
1117 return (EBUSY);
1118
1119 err = mod_remove(&modlinkage);
1120 if (err)
1121 cmn_err(CE_WARN, "kiconv_tc: failed to remove kernel module");
1122
1123 return (err);
1124 }
1125
1126 int
_info(struct modinfo * modinfop)1127 _info(struct modinfo *modinfop)
1128 {
1129 return (mod_info(&modlinkage, modinfop));
1130 }
1131