1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/sunddi.h>
35 #include <sys/byteorder.h>
36 #include <sys/errno.h>
37 #include <sys/modctl.h>
38 #include <sys/u8_textprep.h>
39 #include <sys/kiconv.h>
40 #include <sys/kiconv_cck_common.h>
41 #include <sys/kiconv_tc.h>
42 #include <sys/kiconv_big5_utf8.h>
43 #include <sys/kiconv_euctw_utf8.h>
44 #include <sys/kiconv_hkscs_utf8.h>
45 #include <sys/kiconv_cp950hkscs_utf8.h>
46 #include <sys/kiconv_utf8_big5.h>
47 #include <sys/kiconv_utf8_euctw.h>
48 #include <sys/kiconv_utf8_cp950hkscs.h>
49 #include <sys/kiconv_utf8_hkscs.h>
50
/*
 * Four HKSCS-2004 code points each map to a sequence of two Unicode code
 * points rather than to a single one.  Each row holds the 4-byte UTF-8
 * encoding of such a pair (a 2-byte base letter followed by a 2-byte
 * combining mark); the trailing comment on each row names the HKSCS code
 * it stands for.
 *
 * big5hkscs_to_utf8() selects a row with the second byte of a table entry
 * whose first byte is 0xFF.
 */
static uchar_t hkscs_special_sequence[][4] = {
	{ 0xc3, 0x8a, 0xcc, 0x84 },	/* 0x8862 */
	{ 0xc3, 0x8a, 0xcc, 0x8c },	/* 0x8864 */
	{ 0xc3, 0xaa, 0xcc, 0x84 },	/* 0x88a3 */
	{ 0xc3, 0xaa, 0xcc, 0x8c }	/* 0x88a5 */
};
58
/*
 * HKSCS-2004 codes for the two base letters U+00ca and U+00ea, and for the
 * four two-code-point sequences formed by following either letter with
 * U+0304 or U+030c.
 *
 * The table is laid out as two groups of three: the bare letter, then the
 * letter + U+0304, then the letter + U+030c.  utf8_to_big5hkscs() picks the
 * group start (0 or 3) from the mapping table and adds 1 or 2 when the
 * matching combining mark follows in the input.
 */
static uint32_t ucs_special_sequence[] = {
	0x8866,		/* U+00ca */
	0x8862,		/* U+00ca U+0304 */
	0x8864,		/* U+00ca U+030c */
	0x88a7,		/* U+00ea */
	0x88a3,		/* U+00ea U+0304 */
	0x88a5		/* U+00ea U+030c */
};
68
/*
 * Per-character mapper used by the common BIG5-family converters: takes the
 * two-byte code packed into `value', writes its UTF-8 form at `ob' (never
 * past `obtail') and returns the number of bytes written, or -1 when the
 * output does not fit.  *ret_val is bumped for non-identical conversions and
 * set to (size_t)-1 on failure.
 */
typedef int8_t (*kiconv_big5toutf8_t)(uint32_t value, uchar_t *ob,
	uchar_t *obtail, size_t *ret_val);

/* Single-character UTF-8 -> legacy encoding converters. */
static int8_t utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
static int8_t utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
static int8_t utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf,
	uchar_t *ibtail, uchar_t *ob, uchar_t *obtail, size_t *ret_val);
static int8_t utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
	uchar_t *ob, uchar_t *obtail, size_t *ret_val);

/* Single-character legacy encoding -> UTF-8 converters. */
static int8_t big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
	size_t *ret_val);
static int8_t big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
	uchar_t *obtail, size_t *ret_val);
static int8_t cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob,
	uchar_t *obtail, size_t *ret_val);
static int8_t euctw_to_utf8(size_t plane_no, uint32_t euctw_val,
	uchar_t *ob, uchar_t *obtail, size_t *ret_val);

/* Computes the Unicode value for CNS planes 12, 13, 14 and 16. */
static uint32_t get_unicode_from_UDA(size_t plane_no, uchar_t byte1,
	uchar_t byte2);
90
/*
 * Magic IDs handed back (cast to void *) by the open_fr_*() routines in
 * place of an allocated conversion descriptor.  close_fr_tc() treats any
 * value above KICONV_TC_MAX_MAGIC_ID as an invalid descriptor.
 */
#define	KICONV_TC_BIG5		(0x01)
#define	KICONV_TC_BIG5HKSCS	(0x02)
#define	KICONV_TC_CP950HKSCS	(0x03)
#define	KICONV_TC_EUCTW		(0x04)
#define	KICONV_TC_MAX_MAGIC_ID	(0x04)
96
97 static void *
open_fr_big5()98 open_fr_big5()
99 {
100 return ((void *)KICONV_TC_BIG5);
101 }
102
103 static void *
open_fr_big5hkscs()104 open_fr_big5hkscs()
105 {
106 return ((void *)KICONV_TC_BIG5HKSCS);
107 }
108
109 static void *
open_fr_cp950hkscs()110 open_fr_cp950hkscs()
111 {
112 return ((void *)KICONV_TC_CP950HKSCS);
113 }
114
115 static void *
open_fr_euctw()116 open_fr_euctw()
117 {
118 return ((void *)KICONV_TC_EUCTW);
119 }
120
121 static int
close_fr_tc(void * s)122 close_fr_tc(void *s)
123 {
124 if ((uintptr_t)s > KICONV_TC_MAX_MAGIC_ID)
125 return (EBADF);
126
127 return (0);
128 }
129
130 /*
131 * Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS) to UTF-8.
132 */
133 static size_t
kiconv_fr_big5_common(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_big5toutf8_t ptr_big5touf8)134 kiconv_fr_big5_common(void *kcd, char **inbuf, size_t *inbytesleft,
135 char **outbuf, size_t *outbytesleft, int *errno,
136 kiconv_big5toutf8_t ptr_big5touf8)
137 {
138 uchar_t *ib;
139 uchar_t *ob;
140 uchar_t *ibtail;
141 uchar_t *obtail;
142 size_t ret_val;
143 int8_t sz;
144 uint32_t big5_val;
145
146 /* Check on the kiconv code conversion descriptor. */
147 if (kcd == NULL || kcd == (void *)-1) {
148 *errno = EBADF;
149 return ((size_t)-1);
150 }
151
152 /* If this is a state reset request, process and return. */
153 if (inbuf == NULL || *inbuf == NULL) {
154 return (0);
155 }
156
157 ret_val = 0;
158 ib = (uchar_t *)*inbuf;
159 ob = (uchar_t *)*outbuf;
160 ibtail = ib + *inbytesleft;
161 obtail = ob + *outbytesleft;
162
163 while (ib < ibtail) {
164 if (KICONV_IS_ASCII(*ib)) {
165 if (ob >= obtail) {
166 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
167 }
168
169 *ob++ = *ib++;
170 continue;
171 }
172
173 /*
174 * Issue EILSEQ error if the first byte is not a
175 * valid BIG5/HKSCS leading byte.
176 */
177 if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
178 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
179 }
180
181 /*
182 * Issue EINVAL error if input buffer has an incomplete
183 * character at the end of the buffer.
184 */
185 if (ibtail - ib < 2) {
186 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
187 }
188
189 /*
190 * Issue EILSEQ error if the remaining bytes is not
191 * a valid BIG5/HKSCS byte.
192 */
193 if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
194 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
195 }
196
197 /* Now we have a valid BIG5/HKSCS character. */
198 big5_val = (uint32_t)(*ib) << 8 | *(ib + 1);
199 sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);
200
201 if (sz < 0) {
202 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
203 }
204
205 ib += 2;
206 ob += sz;
207 }
208
209 *inbuf = (char *)ib;
210 *inbytesleft = ibtail - ib;
211 *outbuf = (char *)ob;
212 *outbytesleft = obtail - ob;
213
214 return (ret_val);
215 }
216
/*
 * String based Common convertor from BIG5/HKSCS(BIG5-HKSCS or CP950-HKSCS)
 * to UTF-8.
 *
 * ib/inlen	input bytes and their count; *inlen is set to the number of
 *		unconsumed bytes on return
 * ob/outlen	output buffer and its size; *outlen is set to the space left
 * flag		KICONV_IGNORE_NULL keeps converting past NUL bytes; the
 *		KICONV_SET_ERRNO_WITH_FLAG() macro also consults it
 * errno	receives the error code on failure
 * ptr_big5touf8 per-character mapper for the specific encoding
 *
 * Returns the count accumulated in ret_val (bumped once per replacement
 * character written here, and by the mapper), or (size_t)-1 on error.
 *
 * NOTE(review): KICONV_SET_ERRNO_WITH_FLAG() and KICONV_SET_ERRNO_AND_BREAK()
 * are defined elsewhere; they appear to rely on the local names `ib', `flag',
 * `errno' and `ret_val' and on the REPLACE_INVALID label below -- confirm
 * before renaming or restructuring anything in this function.
 */
static size_t
kiconvstr_fr_big5_common(uchar_t *ib, size_t *inlen, uchar_t *ob,
	size_t *outlen, int flag, int *errno,
	kiconv_big5toutf8_t ptr_big5touf8)
{
	uchar_t		*oldib;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	size_t		ret_val;
	int8_t		sz;
	uint32_t	big5_val;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* Unless told otherwise, a NUL byte terminates the input. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII is copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember the character start so E2BIG can rewind to it. */
		oldib = ib;

		/* Invalid lead byte. */
		if (! KICONV_TC_IS_BIG5_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* Lead byte with no trailing byte left in the input. */
		if (ibtail - ib < 2) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
		}

		/* Invalid trailing byte. */
		if (! KICONV_TC_IS_BIG5_2nd_BYTE(*(ib + 1))) {
			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
		}

		/* Consume both bytes and pack them into one code value. */
		big5_val = *ib++;
		big5_val = (big5_val << 8) | *ib++;
		sz = ptr_big5touf8(big5_val, ob, obtail, &ret_val);

		if (sz < 0) {
			/* Output full: leave this character unconsumed. */
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Reached only by goto: substitute the UTF-8 replacement
		 * character for the invalid input, or rewind and fail with
		 * E2BIG if it does not fit.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
295
296 /*
297 * Encoding convertor from BIG5 to UTF-8.
298 */
299 static size_t
kiconv_fr_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)300 kiconv_fr_big5(void *kcd, char **inbuf, size_t *inbytesleft, char **outbuf,
301 size_t *outbytesleft, int *errno)
302 {
303 return (kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
304 outbytesleft, errno, big5_to_utf8));
305 }
306
307 /*
308 * String based encoding convertor from BIG5 to UTF-8.
309 */
310 static size_t
kiconvstr_fr_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)311 kiconvstr_fr_big5(char *inarray, size_t *inlen, char *outarray,
312 size_t *outlen, int flag, int *errno)
313 {
314 return (kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
315 (uchar_t *)outarray, outlen, flag, errno,
316 big5_to_utf8));
317 }
318
319 /*
320 * Encoding convertor from BIG5-HKSCS to UTF-8.
321 */
322 static size_t
kiconv_fr_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)323 kiconv_fr_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
324 char **outbuf, size_t *outbytesleft, int *errno)
325 {
326 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
327 outbytesleft, errno, big5hkscs_to_utf8);
328 }
329
330 /*
331 * String based encoding convertor from BIG5-HKSCS to UTF-8.
332 */
333 static size_t
kiconvstr_fr_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)334 kiconvstr_fr_big5hkscs(char *inarray, size_t *inlen, char *outarray,
335 size_t *outlen, int flag, int *errno)
336 {
337 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
338 (uchar_t *)outarray, outlen, flag, errno, big5hkscs_to_utf8);
339 }
340
341 /*
342 * Encoding convertor from CP950-HKSCS to UTF-8.
343 */
344 static size_t
kiconv_fr_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)345 kiconv_fr_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
346 char **outbuf, size_t *outbytesleft, int *errno)
347 {
348 return kiconv_fr_big5_common(kcd, inbuf, inbytesleft, outbuf,
349 outbytesleft, errno, cp950hkscs_to_utf8);
350 }
351
352 /*
353 * String based encoding convertor from CP950-HKSCS to UTF-8.
354 */
355 static size_t
kiconvstr_fr_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)356 kiconvstr_fr_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
357 size_t *outlen, int flag, int *errno)
358 {
359 return kiconvstr_fr_big5_common((uchar_t *)inarray, inlen,
360 (uchar_t *)outarray, outlen, flag, errno, cp950hkscs_to_utf8);
361 }
362
/*
 * Encoding convertor from EUC-TW to UTF-8.
 *
 * kcd			conversion descriptor; NULL or (void *)-1 is rejected
 * inbuf/inbytesleft	input cursor and remaining byte count, updated on return
 * outbuf/outbytesleft	output cursor and remaining space, updated on return
 * errno		receives the error code when (size_t)-1 is returned
 *
 * A character is 2 bytes long, or 4 bytes when it starts with
 * KICONV_TC_EUCTW_MBYTE (that byte is followed by a plane byte and then the
 * 2-byte code).  Returns the count accumulated in ret_val by
 * euctw_to_utf8(), or (size_t)-1 with *errno set.
 *
 * NOTE(review): KICONV_TC_IS_VALID_EUCTW_SEQ() is defined elsewhere; since
 * plane_no and isplane1 are both set immediately before it is used, it may
 * read those locals -- confirm before renaming or reordering.
 */
static size_t
kiconv_fr_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
	char **outbuf, size_t *outbytesleft, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	size_t		plane_no;
	int8_t		sz;
	uint32_t	euctw_val;
	boolean_t	isplane1;

	/* Check on the kiconv code conversion descriptor. */
	if (kcd == NULL || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (inbuf == NULL || *inbuf == NULL) {
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	while (ib < ibtail) {
		/* ASCII is copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is not a
		 * valid EUC-TW leading byte.
		 */
		if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/* Anything not introduced by the MBYTE marker is plane 1. */
		isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
		    B_FALSE : B_TRUE;

		/*
		 * Issue EINVAL error if input buffer has an incomplete
		 * character at the end of the buffer.
		 */
		if (ibtail - ib < (isplane1 ? 2 : 4)) {
			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		/* Remember the character start so E2BIG can rewind to it. */
		oldib = ib;
		plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;

		/*
		 * Issue EILSEQ error if the remaining bytes are not
		 * valid EUC-TW bytes.
		 */
		if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/* Skip the MBYTE marker and the plane byte. */
		if (! isplane1)
			ib += 2;

		/* Now we have a valid EUC-TW character. */
		euctw_val = *ib++;
		euctw_val = (euctw_val << 8) | *ib++;
		sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);

		if (sz < 0) {
			/* Output full: leave this character unconsumed. */
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ob += sz;
	}

	/* Report how far we got on both sides. */
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}
461
/*
 * String based encoding convertor from EUC-TW to UTF-8.
 *
 * inarray/inlen	input bytes and their count; *inlen is set to the
 *			number of unconsumed bytes on return
 * outarray/outlen	output buffer and its size; *outlen is set to the
 *			space left on return
 * flag			KICONV_IGNORE_NULL keeps converting past NUL bytes;
 *			KICONV_REPLACE_INVALID substitutes the UTF-8
 *			replacement character for a truncated trailing
 *			character (and is consulted by
 *			KICONV_SET_ERRNO_WITH_FLAG())
 * errno		receives the error code on failure
 *
 * Returns the count accumulated in ret_val, or (size_t)-1 on error.
 *
 * NOTE(review): the KICONV_* helper macros are defined elsewhere and appear
 * to rely on local names (`ib', `flag', `errno', `ret_val', possibly
 * `plane_no' and `isplane1') and on the REPLACE_INVALID label -- confirm
 * before renaming or reordering anything here.
 */
static size_t
kiconvstr_fr_euctw(char *inarray, size_t *inlen, char *outarray,
	size_t *outlen, int flag, int *errno)
{
	uchar_t		*ib;
	uchar_t		*ob;
	uchar_t		*ibtail;
	uchar_t		*obtail;
	uchar_t		*oldib;
	size_t		ret_val;
	size_t		plane_no;
	int8_t		sz;
	uint32_t	euctw_val;
	boolean_t	isplane1;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ib = (uchar_t *)inarray;
	ob = (uchar_t *)outarray;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	while (ib < ibtail) {
		/* Unless told otherwise, a NUL byte terminates the input. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* ASCII is copied through unchanged. */
		if (KICONV_IS_ASCII(*ib)) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/* Remember the character start so E2BIG can rewind to it. */
		oldib = ib;

		/* Invalid lead byte. */
		if (! KICONV_TC_IS_EUCTW_1st_BYTE(*ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/* Anything not introduced by the MBYTE marker is plane 1. */
		isplane1 = (*ib == KICONV_TC_EUCTW_MBYTE) ?
		    B_FALSE : B_TRUE;

		/*
		 * Truncated character at the end of the input: either
		 * swallow the remaining bytes and emit one replacement
		 * character, or fail with EINVAL.
		 */
		if (ibtail - ib < (isplane1 ? 2 : 4)) {
			if (flag & KICONV_REPLACE_INVALID) {
				ib = ibtail;
				goto REPLACE_INVALID;
			}

			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		plane_no = isplane1 ? 1 : *(ib + 1) - KICONV_TC_EUCTW_PMASK;

		/* Invalid trailing bytes. */
		if (! KICONV_TC_IS_VALID_EUCTW_SEQ(ib)) {
			KICONV_SET_ERRNO_WITH_FLAG(isplane1 ? 2 : 4, EILSEQ);
		}

		/* Skip the MBYTE marker and the plane byte. */
		if (! isplane1)
			ib += 2;

		/* Consume the 2-byte code and pack it into one value. */
		euctw_val = *ib++;
		euctw_val = (euctw_val << 8) | *ib++;
		sz = euctw_to_utf8(plane_no, euctw_val, ob, obtail, &ret_val);

		if (sz < 0) {
			/* Output full: leave this character unconsumed. */
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ob += sz;
		continue;

REPLACE_INVALID:
		/*
		 * Reached only by goto: substitute the UTF-8 replacement
		 * character for the invalid input, or rewind and fail with
		 * E2BIG if it does not fit.
		 */
		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
		ret_val++;
	}

	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
557
558 /*
559 * Encoding convertor from UTF-8 to BIG5.
560 */
561 static size_t
kiconv_to_big5(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)562 kiconv_to_big5(void *kcd, char **inbuf, size_t *inbytesleft,
563 char **outbuf, size_t *outbytesleft, int *errno)
564 {
565 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
566 outbytesleft, errno, utf8_to_big5);
567 }
568
569 /*
570 * String based encoding convertor from UTF-8 to BIG5.
571 */
572 static size_t
kiconvstr_to_big5(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)573 kiconvstr_to_big5(char *inarray, size_t *inlen, char *outarray,
574 size_t *outlen, int flag, int *errno)
575 {
576 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
577 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5);
578 }
579
580 /*
581 * Encoding convertor from UTF-8 to EUC-TW.
582 */
583 static size_t
kiconv_to_euctw(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)584 kiconv_to_euctw(void *kcd, char **inbuf, size_t *inbytesleft,
585 char **outbuf, size_t *outbytesleft, int *errno)
586 {
587 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
588 outbytesleft, errno, utf8_to_euctw);
589 }
590
591 /*
592 * String based encoding convertor from UTF-8 to EUC-TW.
593 */
594 static size_t
kiconvstr_to_euctw(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)595 kiconvstr_to_euctw(char *inarray, size_t *inlen, char *outarray,
596 size_t *outlen, int flag, int *errno)
597 {
598 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
599 (uchar_t *)outarray, outlen, flag, errno, utf8_to_euctw);
600 }
601
602 /*
603 * Encoding convertor from UTF-8 to CP950HKSCS.
604 */
605 static size_t
kiconv_to_cp950hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)606 kiconv_to_cp950hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
607 char **outbuf, size_t *outbytesleft, int *errno)
608 {
609 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
610 outbytesleft, errno, utf8_to_cp950hkscs);
611 }
612
613 /*
614 * String based encoding convertor from UTF-8 to CP950HKSCS.
615 */
616 static size_t
kiconvstr_to_cp950hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)617 kiconvstr_to_cp950hkscs(char *inarray, size_t *inlen, char *outarray,
618 size_t *outlen, int flag, int *errno)
619 {
620 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
621 (uchar_t *)outarray, outlen, flag, errno, utf8_to_cp950hkscs);
622 }
623
624 /*
625 * Encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
626 */
627 static size_t
kiconv_to_big5hkscs(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno)628 kiconv_to_big5hkscs(void *kcd, char **inbuf, size_t *inbytesleft,
629 char **outbuf, size_t *outbytesleft, int *errno)
630 {
631 return kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
632 outbytesleft, errno, utf8_to_big5hkscs);
633 }
634
635 /*
636 * String based encoding convertor from UTF-8 to BIG5HKSCS(HKSCS-2004).
637 */
638 static size_t
kiconvstr_to_big5hkscs(char * inarray,size_t * inlen,char * outarray,size_t * outlen,int flag,int * errno)639 kiconvstr_to_big5hkscs(char *inarray, size_t *inlen, char *outarray,
640 size_t *outlen, int flag, int *errno)
641 {
642 return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
643 (uchar_t *)outarray, outlen, flag, errno, utf8_to_big5hkscs);
644 }
645
646 /*
647 * Common convertor from single BIG5/CP950-HKSCS character to UTF-8.
648 * Return: > 0 - Converted successfully
649 * = -1 - E2BIG
650 */
651 static int8_t
big5_to_utf8_common(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_array_t * table,size_t nitems)652 big5_to_utf8_common(uint32_t big5_val, uchar_t *ob, uchar_t *obtail,
653 size_t *ret_val, kiconv_table_array_t *table, size_t nitems)
654 {
655 size_t index;
656 int8_t sz;
657 uchar_t *u8;
658
659 index = kiconv_binsearch(big5_val, table, nitems);
660 u8 = table[index].u8;
661 sz = u8_number_of_bytes[u8[0]];
662
663 if (obtail - ob < sz) {
664 *ret_val = (size_t)-1;
665 return (-1);
666 }
667
668 if (index == 0)
669 (*ret_val)++; /* Non-identical conversion */
670
671 for (index = 0; index < sz; index++)
672 *ob++ = u8[index];
673
674 return (sz);
675 }
676
677 /*
678 * Convert single BIG5 character to UTF-8.
679 */
680 static int8_t
big5_to_utf8(uint32_t big5_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)681 big5_to_utf8(uint32_t big5_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val)
682 {
683 return (big5_to_utf8_common(big5_val, ob, obtail, ret_val,
684 kiconv_big5_utf8, KICONV_BIG5_UTF8_MAX));
685 }
686
687 /*
688 * Convert single CP950-HKSCS character to UTF-8.
689 */
690 static int8_t
cp950hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)691 cp950hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
692 size_t *ret_val)
693 {
694 return (big5_to_utf8_common(hkscs_val, ob, obtail, ret_val,
695 kiconv_cp950hkscs_utf8, KICONV_CP950HKSCS_UTF8_MAX));
696 }
697
698 /*
699 * Calculate unicode value for some CNS planes which fall in Unicode
700 * UDA range.
701 */
702 static uint32_t
get_unicode_from_UDA(size_t plane_no,uchar_t b1,uchar_t b2)703 get_unicode_from_UDA(size_t plane_no, uchar_t b1, uchar_t b2)
704 {
705 /*
706 * CNS Plane 15 is pre-allocated, so need move Plane 16 to back 15
707 * to compute the Unicode value.
708 */
709 if (plane_no == 16)
710 --plane_no;
711
712 /* 0xF0000 + (plane_no - 12) * 8836 + (b1 - 0xA1) * 94 + (b2 - 0xA1) */
713 return (8836 * plane_no + 94 * b1 + b2 + 0xD2611);
714 }
715
716 /*
717 * Convert single EUC-TW character to UTF-8.
718 * Return: > 0 - Converted successfully
719 * = -1 - E2BIG
720 */
721 static int8_t
euctw_to_utf8(size_t plane_no,uint32_t euctw_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)722 euctw_to_utf8(size_t plane_no, uint32_t euctw_val, uchar_t *ob,
723 uchar_t *obtail, size_t *ret_val)
724 {
725 uint32_t u32;
726 size_t index;
727 int8_t sz;
728 uchar_t udc[4];
729 uchar_t *u8;
730
731 switch (plane_no) {
732 case 1:
733 index = kiconv_binsearch(euctw_val, kiconv_cns1_utf8,
734 KICONV_CNS1_UTF8_MAX);
735 u8 = kiconv_cns1_utf8[index].u8;
736 break;
737 case 2:
738 index = kiconv_binsearch(euctw_val, kiconv_cns2_utf8,
739 KICONV_CNS2_UTF8_MAX);
740 u8 = kiconv_cns2_utf8[index].u8;
741 break;
742 case 3:
743 index = kiconv_binsearch(euctw_val, kiconv_cns3_utf8,
744 KICONV_CNS3_UTF8_MAX);
745 u8 = kiconv_cns3_utf8[index].u8;
746 break;
747 case 4:
748 index = kiconv_binsearch(euctw_val, kiconv_cns4_utf8,
749 KICONV_CNS4_UTF8_MAX);
750 u8 = kiconv_cns4_utf8[index].u8;
751 break;
752 case 5:
753 index = kiconv_binsearch(euctw_val, kiconv_cns5_utf8,
754 KICONV_CNS5_UTF8_MAX);
755 u8 = kiconv_cns5_utf8[index].u8;
756 break;
757 case 6:
758 index = kiconv_binsearch(euctw_val, kiconv_cns6_utf8,
759 KICONV_CNS6_UTF8_MAX);
760 u8 = kiconv_cns6_utf8[index].u8;
761 break;
762 case 7:
763 index = kiconv_binsearch(euctw_val, kiconv_cns7_utf8,
764 KICONV_CNS7_UTF8_MAX);
765 u8 = kiconv_cns7_utf8[index].u8;
766 break;
767 case 12:
768 case 13:
769 case 14:
770 case 16:
771 u32 = get_unicode_from_UDA(plane_no,
772 (euctw_val & 0xFF00) >> 8, euctw_val & 0xFF);
773 /*
774 * As U+F0000 <= u32 <= U+F8A0F, so its UTF-8 sequence
775 * will occupy 4 bytes.
776 */
777 udc[0] = 0xF3;
778 udc[1] = (uchar_t)(0x80 | (u32 & 0x03F000) >> 12);
779 udc[2] = (uchar_t)(0x80 | (u32 & 0x000FC0) >> 6);
780 udc[3] = (uchar_t)(0x80 | (u32 & 0x00003F));
781 u8 = udc;
782 index = 1;
783 break;
784 case 15:
785 index = kiconv_binsearch(euctw_val, kiconv_cns15_utf8,
786 KICONV_CNS15_UTF8_MAX);
787 u8 = kiconv_cns15_utf8[index].u8;
788 break;
789 default:
790 index = 0;
791 u8 = kiconv_cns1_utf8[index].u8;
792 }
793
794 sz = u8_number_of_bytes[u8[0]];
795 if (obtail - ob < sz) {
796 *ret_val = (size_t)-1;
797 return (-1);
798 }
799
800 if (index == 0)
801 (*ret_val)++;
802
803 for (index = 0; index < sz; index++)
804 *ob++ = u8[index];
805
806 return (sz);
807 }
808
809 /*
810 * Convert single HKSCS character to UTF-8.
811 * Return: > 0 - Converted successfully
812 * = -1 - E2BIG
813 */
814 static int8_t
big5hkscs_to_utf8(uint32_t hkscs_val,uchar_t * ob,uchar_t * obtail,size_t * ret_val)815 big5hkscs_to_utf8(uint32_t hkscs_val, uchar_t *ob, uchar_t *obtail,
816 size_t *ret_val)
817 {
818 size_t index;
819 int8_t sz;
820 uchar_t *u8;
821
822 index = kiconv_binsearch(hkscs_val, kiconv_hkscs_utf8,
823 KICONV_HKSCS_UTF8_MAX);
824 u8 = kiconv_hkscs_utf8[index].u8;
825
826 /*
827 * Single HKSCS-2004 character may map to 2 Unicode
828 * code points.
829 */
830 if (u8[0] == 0xFF) {
831 u8 = hkscs_special_sequence[u8[1]];
832 sz = 4;
833 } else {
834 sz = u8_number_of_bytes[u8[0]];
835 }
836
837 if (obtail - ob < sz) {
838 *ret_val = (size_t)-1;
839 return (-1);
840 }
841
842 if (index == 0)
843 (*ret_val)++; /* Non-identical conversion. */
844
845 for (index = 0; index < sz; index++)
846 *ob++ = u8[index];
847
848 return (sz);
849 }
850
851 /*
852 * Convert single UTF-8 character to EUC-TW.
853 * Return: > 0 - Converted successfully
854 * = -1 - E2BIG
855 */
856 /* ARGSUSED */
857 static int8_t
utf8_to_euctw(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)858 utf8_to_euctw(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
859 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
860 {
861 size_t index;
862 size_t plane_no;
863 uchar_t byte1;
864 uchar_t byte2;
865
866 if (utf8 >= KICONV_TC_UDA_UTF8_START &&
867 utf8 <= KICONV_TC_UDA_UTF8_END) {
868 /*
869 * Calculate EUC-TW code if utf8 is in Unicode
870 * Private Plane 15.
871 */
872 index = (((utf8 & 0x7000000) >> 6) | ((utf8 & 0x3F0000) >> 4) |
873 ((utf8 & 0x3F00) >> 2) | (utf8 & 0x3F)) -
874 KICONV_TC_UDA_UCS4_START;
875 plane_no = 12 + index / 8836;
876 byte1 = 0xA1 + (index % 8836) / 94;
877 byte2 = 0xA1 + index % 94;
878
879 /* CNS Plane 15 is pre-allocated, so place it into Plane 16. */
880 if (plane_no == 15)
881 plane_no = 16;
882 } else {
883 uint32_t euctw_val;
884
885 index = kiconv_binsearch(utf8, kiconv_utf8_euctw,
886 KICONV_UTF8_EUCTW_MAX);
887
888 if (index == 0) {
889 if (ob >= obtail) {
890 *ret_val = (size_t)-1;
891 return (-1);
892 }
893
894 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
895 (*ret_val)++;
896
897 return (1);
898 }
899
900 euctw_val = kiconv_utf8_euctw[index].value;
901 byte1 = (euctw_val & 0xFF00) >> 8;
902 byte2 = euctw_val & 0xFF;
903 plane_no = euctw_val >> 16;
904 }
905
906 if (obtail - ob < (plane_no == 1 ? 2 : 4)) {
907 *ret_val = (size_t)-1;
908 return (-1);
909 }
910
911 if (plane_no != 1) {
912 *ob++ = KICONV_TC_EUCTW_MBYTE;
913 *ob++ = KICONV_TC_EUCTW_PMASK + plane_no;
914 }
915
916 *ob++ = byte1;
917 *ob = byte2;
918
919 return (plane_no == 1 ? 2 : 4);
920 }
921
922 /*
923 * Convert single UTF-8 character to BIG5-HKSCS
924 * Return: > 0 - Converted successfully
925 * = -1 - E2BIG
926 */
927 static int8_t
utf8_to_big5hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)928 utf8_to_big5hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
929 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
930 {
931 size_t index;
932 int8_t hkscslen;
933 uint32_t hkscscode;
934 boolean_t special_sequence = B_FALSE;
935
936 index = kiconv_binsearch(utf8, kiconv_utf8_hkscs,
937 KICONV_UTF8_HKSCS_MAX);
938 hkscscode = kiconv_utf8_hkscs[index].value;
939
940 /*
941 * There are 4 special code points in HKSCS-2004 which mapped
942 * to 2 UNICODE code points.
943 */
944 if ((int32_t)hkscscode < 0) {
945 size_t special_index = (-(int32_t)hkscscode - 1) * 3;
946
947 /* Check the following 2 bytes. */
948 if (ibtail - *inbuf >= 2 && **inbuf == 0xcc &&
949 (*(*inbuf + 1) == 0x84 || *(*inbuf + 1) == 0x8c)) {
950 special_index += (*(*inbuf + 1) == 0x84 ? 1 : 2);
951 special_sequence = B_TRUE;
952 }
953
954 hkscscode = ucs_special_sequence[special_index];
955 }
956
957 hkscslen = (hkscscode <= 0xFF) ? 1 : 2;
958 if (obtail - ob < hkscslen) {
959 *ret_val = (size_t)-1;
960 return (-1);
961 }
962
963 if (index == 0)
964 (*ret_val)++;
965
966 if (hkscslen > 1)
967 *ob++ = (uchar_t)(hkscscode >> 8);
968 *ob = (uchar_t)(hkscscode & 0xFF);
969
970 if (special_sequence) { /* Advance for special sequence */
971 (*inbuf) += 2;
972 }
973
974 return (hkscslen);
975 }
976
977 /*
978 * Common convertor for UTF-8 to BIG5/CP950-HKSCS.
979 * Return: > 0 - Converted successfully
980 * = -1 - E2BIG
981 */
982 static int8_t
utf8_to_big5_common(uint32_t utf8,uchar_t * ob,uchar_t * obtail,size_t * ret_val,kiconv_table_t * table,size_t nitems)983 utf8_to_big5_common(uint32_t utf8, uchar_t *ob, uchar_t *obtail,
984 size_t *ret_val, kiconv_table_t *table, size_t nitems)
985 {
986 size_t index;
987 int8_t big5len;
988 uint32_t big5code;
989
990 index = kiconv_binsearch(utf8, table, nitems);
991 big5code = table[index].value;
992 big5len = (big5code <= 0xFF) ? 1 : 2;
993
994 if (obtail - ob < big5len) {
995 *ret_val = (size_t)-1;
996 return (-1);
997 }
998
999 if (index == 0)
1000 (*ret_val)++;
1001
1002 if (big5len > 1)
1003 *ob++ = (uchar_t)(big5code >> 8);
1004 *ob = (uchar_t)(big5code & 0xFF);
1005
1006 return (big5len);
1007 }
1008
1009 /*
1010 * Convert single UTF-8 character to BIG5.
1011 */
1012 /* ARGSUSED */
1013 static int8_t
utf8_to_big5(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1014 utf8_to_big5(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1015 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1016 {
1017 return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1018 kiconv_utf8_big5, KICONV_UTF8_BIG5_MAX));
1019 }
1020
1021 /*
1022 * Convert single UTF-8 character to CP950-HKSCS for Windows compatibility.
1023 */
1024 /* ARGSUSED */
1025 static int8_t
utf8_to_cp950hkscs(uint32_t utf8,uchar_t ** inbuf,uchar_t * ibtail,uchar_t * ob,uchar_t * obtail,size_t * ret_val)1026 utf8_to_cp950hkscs(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
1027 uchar_t *ob, uchar_t *obtail, size_t *ret_val)
1028 {
1029 return (utf8_to_big5_common(utf8, ob, obtail, ret_val,
1030 kiconv_utf8_cp950hkscs, KICONV_UTF8_CP950HKSCS));
1031 }
1032
/*
 * Conversion table for this module.  Each entry lists, in order: the target
 * code name, the source code name, and the open, stream-convert, close and
 * string-convert routines for that pair.  Entries come in pairs, one per
 * direction, for each of the four Traditional Chinese encodings.
 */
static kiconv_ops_t kiconv_tc_ops_tbl[] = {
	/* BIG5 <-> UTF-8 */
	{
		"big5", "utf-8", kiconv_open_to_cck, kiconv_to_big5,
		kiconv_close_to_cck, kiconvstr_to_big5
	},
	{
		"utf-8", "big5", open_fr_big5, kiconv_fr_big5,
		close_fr_tc, kiconvstr_fr_big5
	},

	/* BIG5-HKSCS <-> UTF-8 */
	{
		"big5-hkscs", "utf-8", kiconv_open_to_cck, kiconv_to_big5hkscs,
		kiconv_close_to_cck, kiconvstr_to_big5hkscs
	},
	{
		"utf-8", "big5-hkscs", open_fr_big5hkscs, kiconv_fr_big5hkscs,
		close_fr_tc, kiconvstr_fr_big5hkscs
	},

	/* EUC-TW <-> UTF-8 */
	{
		"euc-tw", "utf-8", kiconv_open_to_cck, kiconv_to_euctw,
		kiconv_close_to_cck, kiconvstr_to_euctw
	},
	{
		"utf-8", "euc-tw", open_fr_euctw, kiconv_fr_euctw,
		close_fr_tc, kiconvstr_fr_euctw
	},

	/* CP950-HKSCS <-> UTF-8 */
	{
		"cp950-hkscs", "utf-8", kiconv_open_to_cck,
		kiconv_to_cp950hkscs, kiconv_close_to_cck,
		kiconvstr_to_cp950hkscs
	},
	{
		"utf-8", "cp950-hkscs", open_fr_cp950hkscs,
		kiconv_fr_cp950hkscs, close_fr_tc, kiconvstr_fr_cp950hkscs
	},
};
1071
/*
 * Module description handed to the kiconv framework: the module name, the
 * number of entries in kiconv_tc_ops_tbl, and the table itself.
 * NOTE(review): the trailing four fields are left 0/NULL; their meaning is
 * defined by kiconv_module_info_t elsewhere -- confirm before changing.
 */
static kiconv_module_info_t kiconv_tc_info = {
	"kiconv_tc",		/* module name */
	sizeof (kiconv_tc_ops_tbl) / sizeof (kiconv_tc_ops_tbl[0]),
	kiconv_tc_ops_tbl,
	0,
	NULL,
	NULL,
	0
};
1081
/*
 * Loadable-module linkage element for this kiconv module: ties the
 * mod_kiconvops operations and a descriptive string to kiconv_tc_info.
 */
static struct modlkiconv modlkiconv_tc = {
	&mod_kiconvops,
	"kiconv Traditional Chinese module 1.0",
	&kiconv_tc_info
};
1087
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info():
 * revision MODREV_1 with modlkiconv_tc as its only element, followed by
 * NULL.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modlkiconv_tc,
	NULL
};
1093
1094 int
_init(void)1095 _init(void)
1096 {
1097 int err;
1098
1099 err = mod_install(&modlinkage);
1100 if (err)
1101 cmn_err(CE_WARN, "kiconv_tc: failed to load kernel module");
1102
1103 return (err);
1104 }
1105
1106 int
_fini(void)1107 _fini(void)
1108 {
1109 int err;
1110
1111 /*
1112 * If this module is being used, then, we cannot remove the module.
1113 * The following checking will catch pretty much all usual cases.
1114 *
1115 * Any remaining will be catached by the kiconv_unregister_module()
1116 * during mod_remove() at below.
1117 */
1118 if (kiconv_module_ref_count(KICONV_MODULE_ID_TC))
1119 return (EBUSY);
1120
1121 err = mod_remove(&modlinkage);
1122 if (err)
1123 cmn_err(CE_WARN, "kiconv_tc: failed to remove kernel module");
1124
1125 return (err);
1126 }
1127
1128 int
_info(struct modinfo * modinfop)1129 _info(struct modinfo *modinfop)
1130 {
1131 return (mod_info(&modlinkage, modinfop));
1132 }
1133