xref: /illumos-gate/usr/src/uts/common/kiconv/kiconv_ko/kiconv_ko.c (revision 54034eb2d6e7d811adf4a1fe5105eac6fea6b0b5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/sysmacros.h>
32 #include <sys/systm.h>
33 #include <sys/debug.h>
34 #include <sys/kmem.h>
35 #include <sys/sunddi.h>
36 #include <sys/byteorder.h>
37 #include <sys/errno.h>
38 #include <sys/modctl.h>
39 #include <sys/u8_textprep.h>
40 #include <sys/kiconv.h>
41 #include <sys/kiconv_cck_common.h>
42 #include <sys/kiconv_ko.h>
43 #include <sys/kiconv_uhc_utf8.h>
44 #include <sys/kiconv_utf8_uhc.h>
45 #include <sys/kiconv_euckr_utf8.h>
46 #include <sys/kiconv_utf8_euckr.h>
47 
48 static int8_t utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
49 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
50 static int8_t utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
51 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
52 static int8_t ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail,
53 	size_t *ret_val, kiconv_table_array_t *table, size_t nitems);
54 
55 
/*
 * Magic values returned as conversion descriptors by open_fr_euckr() and
 * open_fr_uhc().  KICONV_KO_MAX_MAGIC_ID is the largest value in use and
 * is what close_fr_ko() range-checks a descriptor against.
 */
#define	KICONV_KO_EUCKR		(0x01)
#define	KICONV_KO_UHC		(0x02)
#define	KICONV_KO_MAX_MAGIC_ID	(KICONV_KO_UHC)
59 
60 static void *
61 open_fr_euckr()
62 {
63 	return ((void *)KICONV_KO_EUCKR);
64 }
65 
66 static void *
67 open_fr_uhc()
68 {
69 	return ((void *)KICONV_KO_UHC);
70 }
71 
72 static int
73 close_fr_ko(void *s)
74 {
75 	if ((uintptr_t)s > KICONV_KO_MAX_MAGIC_ID)
76 		return (EBADF);
77 
78 	return (0);
79 }
80 
81 /*
82  * Encoding convertor from EUC-KR to UTF-8.
83  */
84 static size_t
85 kiconv_fr_euckr(void *kcd, char **inbuf, size_t *inbufleft,
86 	char **outbuf, size_t *outbufleft, int *errno)
87 {
88 	uchar_t		*ib;
89 	uchar_t		*ob;
90 	uchar_t		*ibtail;
91 	uchar_t		*obtail;
92 	size_t		ret_val;
93 	int8_t		sz;
94 	uint32_t	euckr_val;
95 
96 	/* Check on the kiconv code conversion descriptor. */
97 	if (kcd == NULL || kcd == (void *)-1) {
98 		*errno = EBADF;
99 		return ((size_t)-1);
100 	}
101 
102 	/* If this is a state reset request, process and return. */
103 	if (inbuf == NULL || *inbuf == NULL) {
104 		return (0);
105 	}
106 
107 	ret_val = 0;
108 	ib = (uchar_t *)*inbuf;
109 	ob = (uchar_t *)*outbuf;
110 	ibtail = ib + *inbufleft;
111 	obtail = ob + *outbufleft;
112 
113 	while (ib < ibtail) {
114 		if (KICONV_IS_ASCII(*ib)) {
115 			if (ob >= obtail) {
116 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
117 			}
118 
119 			*ob++ = *ib++;
120 			continue;
121 		}
122 
123 		/*
124 		 * Issue EILSEQ error if the first byte is not a
125 		 * valid EUC-KR leading byte.
126 		 */
127 		if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) {
128 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
129 		}
130 
131 		/*
132 		 * Issue EINVAL error if input buffer has an incomplete
133 		 * character at the end of the buffer.
134 		 */
135 		if (ibtail - ib < 2) {
136 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
137 		}
138 
139 		/*
140 		 * Issue EILSEQ error if the remaining byte is not
141 		 * a valid EUC-KR byte.
142 		 */
143 		if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) {
144 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
145 		}
146 
147 		euckr_val = (uint32_t)(*ib) << 8 | *(ib + 1);
148 		sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val,
149 		    kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX);
150 
151 		if (sz < 0) {
152 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
153 		}
154 
155 		ib += 2;
156 		ob += sz;
157 	}
158 
159 	*inbuf = (char *)ib;
160 	*inbufleft = ibtail - ib;
161 	*outbuf = (char *)ob;
162 	*outbufleft = obtail - ob;
163 
164 	return (ret_val);
165 }
166 
167 /*
168  * String based encoding convertor from EUC-KR to UTF-8.
169  */
170 static size_t
171 kiconvstr_fr_euckr(char *inarray, size_t *inlen, char *outarray,
172 	size_t *outlen, int flag, int *errno)
173 {
174 	uchar_t		*ib;
175 	uchar_t		*ob;
176 	uchar_t		*ibtail;
177 	uchar_t		*obtail;
178 	uchar_t		*oldib;
179 	size_t		ret_val;
180 	int8_t		sz;
181 	uint32_t	euckr_val;
182 	boolean_t	do_not_ignore_null;
183 
184 	ret_val = 0;
185 	ib = (uchar_t *)inarray;
186 	ob = (uchar_t *)outarray;
187 	ibtail = ib + *inlen;
188 	obtail = ob + *outlen;
189 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
190 
191 	while (ib < ibtail) {
192 		if (*ib == '\0' && do_not_ignore_null)
193 			break;
194 
195 		if (KICONV_IS_ASCII(*ib)) {
196 			if (ob >= obtail) {
197 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
198 			}
199 
200 			*ob++ = *ib++;
201 			continue;
202 		}
203 
204 		oldib = ib;
205 
206 		if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) {
207 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
208 		}
209 
210 		if (ibtail - ib < 2) {
211 			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
212 		}
213 
214 		if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) {
215 			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
216 		}
217 
218 		euckr_val = *ib++;
219 		euckr_val = (euckr_val << 8) | *ib++;
220 		sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val,
221 		    kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX);
222 
223 		if (sz < 0) {
224 			ib = oldib;
225 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
226 		}
227 
228 		ob += sz;
229 		continue;
230 
231 REPLACE_INVALID:
232 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
233 			ib = oldib;
234 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
235 		}
236 
237 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
238 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
239 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
240 		ret_val++;
241 	}
242 
243 	*inlen = ibtail - ib;
244 	*outlen = obtail - ob;
245 
246 	return (ret_val);
247 }
248 
249 /*
250  * Encoding convertor from Unified Hangul Code to UTF-8.
251  */
252 static size_t
253 kiconv_fr_uhc(void *kcd, char **inbuf, size_t *inbufleft,
254 	char **outbuf, size_t *outbufleft, int *errno)
255 {
256 	uchar_t		*ib;
257 	uchar_t		*ob;
258 	uchar_t		*ibtail;
259 	uchar_t		*obtail;
260 	size_t		ret_val;
261 	int8_t		sz;
262 	uint32_t	uhc_val;
263 
264 	/* Check on the kiconv code conversion descriptor. */
265 	if (kcd == NULL || kcd == (void *)-1) {
266 		*errno = EBADF;
267 		return ((size_t)-1);
268 	}
269 
270 	/* If this is a state reset request, process and return. */
271 	if (inbuf == NULL || *inbuf == NULL) {
272 		return (0);
273 	}
274 
275 	ret_val = 0;
276 	ib = (uchar_t *)*inbuf;
277 	ob = (uchar_t *)*outbuf;
278 	ibtail = ib + *inbufleft;
279 	obtail = ob + *outbufleft;
280 
281 	while (ib < ibtail) {
282 		if (KICONV_IS_ASCII(*ib)) {
283 			if (ob >= obtail) {
284 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
285 			}
286 
287 			*ob++ = *ib++;
288 			continue;
289 		}
290 
291 		/*
292 		 * Issue EILSEQ error if the first byte is not a
293 		 * valid UHC leading byte.
294 		 */
295 		if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) {
296 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
297 		}
298 
299 		/*
300 		 * Issue EINVAL error if input buffer has an incomplete
301 		 * character at the end of the buffer.
302 		 */
303 		if (ibtail - ib < 2) {
304 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
305 		}
306 
307 		/*
308 		 * Issue EILSEQ error if the remaining byte is not
309 		 * a valid UHC byte.
310 		 */
311 		if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) {
312 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
313 		}
314 
315 		uhc_val = (uint32_t)(*ib) << 8 | *(ib + 1);
316 		sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val,
317 		    kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX);
318 
319 		if (sz < 0) {
320 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
321 		}
322 
323 		ib += 2;
324 		ob += sz;
325 	}
326 
327 	*inbuf = (char *)ib;
328 	*inbufleft = ibtail - ib;
329 	*outbuf = (char *)ob;
330 	*outbufleft = obtail - ob;
331 
332 	return (ret_val);
333 }
334 
335 /*
336  * String based encoding convertor from Unified Hangul Code to UTF-8.
337  */
338 static size_t
339 kiconvstr_fr_uhc(char *inarray, size_t *inlen, char *outarray,
340 	size_t *outlen, int flag, int *errno)
341 {
342 	uchar_t		*ib;
343 	uchar_t		*ob;
344 	uchar_t		*ibtail;
345 	uchar_t		*obtail;
346 	uchar_t		*oldib;
347 	size_t		ret_val;
348 	int8_t		sz;
349 	uint32_t	uhc_val;
350 	boolean_t	do_not_ignore_null;
351 
352 	ret_val = 0;
353 	ib = (uchar_t *)inarray;
354 	ob = (uchar_t *)outarray;
355 	ibtail = ib + *inlen;
356 	obtail = ob + *outlen;
357 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
358 
359 	while (ib < ibtail) {
360 		if (*ib == '\0' && do_not_ignore_null)
361 			break;
362 
363 		if (KICONV_IS_ASCII(*ib)) {
364 			if (ob >= obtail) {
365 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
366 			}
367 
368 			*ob++ = *ib++;
369 			continue;
370 		}
371 
372 		oldib = ib;
373 
374 		if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) {
375 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
376 		}
377 
378 		if (ibtail - ib < 2) {
379 			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
380 		}
381 
382 		if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) {
383 			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
384 		}
385 
386 		uhc_val = *ib++;
387 		uhc_val = (uhc_val << 8) | *ib++;
388 		sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val,
389 		    kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX);
390 
391 		if (sz < 0) {
392 			ib = oldib;
393 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
394 		}
395 
396 		ob += sz;
397 		continue;
398 
399 REPLACE_INVALID:
400 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
401 			ib = oldib;
402 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
403 		}
404 
405 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
406 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
407 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
408 		ret_val++;
409 	}
410 
411 	*inlen = ibtail - ib;
412 	*outlen = obtail - ob;
413 
414 	return (ret_val);
415 }
416 
417 /*
418  * Encoding convertor from UTF-8 to EUC-KR.
419  */
420 static size_t
421 kiconv_to_euckr(void *kcd, char **inbuf, size_t *inbytesleft,
422 	char **outbuf, size_t *outbytesleft, int *errno)
423 {
424 	return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
425 	    outbytesleft, errno, utf8_to_euckr));
426 }
427 
428 /*
429  * Encoding convertor from UTF-8 to Unified Hangul Code.
430  */
431 static size_t
432 kiconv_to_uhc(void *kcd, char **inbuf, size_t *inbytesleft,
433 	char **outbuf, size_t *outbytesleft, int *errno)
434 {
435 	return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
436 	    outbytesleft, errno, utf8_to_uhc));
437 }
438 
439 /*
440  * String based encoding convertor from UTF-8 to EUC-KR.
441  */
442 static size_t
443 kiconvstr_to_euckr(char *inarray, size_t *inlen, char *outarray,
444 	size_t *outlen, int flag, int *errno)
445 {
446 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
447 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_euckr);
448 }
449 
450 /*
451  * String based encoding convertor from UTF-8 to Unified Hangul Code.
452  */
453 static size_t
454 kiconvstr_to_uhc(char *inarray, size_t *inlen, char *outarray,
455 	size_t *outlen, int flag, int *errno)
456 {
457 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
458 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_uhc);
459 }
460 
461 /*
462  * Convert an UTF-8 character to a character of ko encodings
463  * (EUC-KR or UHC).
464  */
465 static int8_t
466 utf8_to_ko(uint32_t utf8, uchar_t *ob, uchar_t *obtail, size_t *ret_val,
467 	kiconv_table_t *table, size_t nitems)
468 {
469 	size_t	index;
470 	size_t	kocode;
471 	int8_t  kolen;
472 
473 	if (KICONV_KO_IS_UDC_IN_UTF8(utf8)) {
474 		/* User Definable Area handing. */
475 		kocode = (((utf8 & 0xF0000) >> 4) | ((utf8 & 0x3F00) >> 2) |
476 		    (utf8 & 0x3F)) - KICONV_KO_UDA_UCS4_START;
477 		if (kocode < KICONV_KO_UDA_RANGE) {
478 			kocode = (KICONV_KO_UDA_EUC_SEG1 << 8) |
479 			    (kocode + KICONV_KO_UDA_OFFSET_START);
480 		} else {
481 			/* 0x43 = 0xA1 - 0x5E */
482 			kocode = (KICONV_KO_UDA_EUC_SEG2 << 8) |
483 			    (kocode + 0x43);
484 		}
485 
486 		index = 1;
487 	} else {
488 		index = kiconv_binsearch(utf8, table, nitems);
489 		kocode = table[index].value;
490 	}
491 
492 	kolen = (kocode <= 0xFF) ? 1 : 2;
493 
494 	if (obtail - ob < kolen) {
495 		*ret_val = (size_t)-1;
496 		return (-1);
497 	}
498 
499 	if (index == 0)
500 		(*ret_val)++;
501 
502 	if (kolen > 1)
503 		*ob++ = (uchar_t)(kocode >> 8);
504 	*ob = (uchar_t)(kocode & 0xFF);
505 
506 	return (kolen);
507 }
508 
509 /*
510  * Convert an UTF-8 character to Unified Hangual Code.
511  */
512 /* ARGSUSED */
513 static int8_t
514 utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
515 	uchar_t *ob, uchar_t *obtail, size_t *ret_val)
516 {
517 	return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_uhc,
518 	    KICONV_UTF8_UHC_MAX));
519 }
520 
521 /*
522  * Convert an UTF-8 character to EUC-KR.
523  */
524 /* ARGSUSED */
525 static int8_t
526 utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
527 	uchar_t *ob, uchar_t *obtail, size_t *ret_val)
528 {
529 	return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_euckr,
530 	    KICONV_UTF8_EUCKR_MAX));
531 }
532 
533 /*
534  * Convert a single ko encoding (EUC-KR or UHC) character to UTF-8.
535  */
536 static int8_t
537 ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val,
538 	kiconv_table_array_t *table, size_t nitems)
539 {
540 	size_t	index;
541 	int8_t	sz;
542 	uchar_t	udc[3];
543 	uchar_t	*u8;
544 
545 	if (KICONV_KO_IS_UDC_IN_EUC(ko_val)) {
546 		/* UDA(User Definable Area) handling. */
547 		uint32_t u32;
548 
549 		u32 = (ko_val & 0xFF) + (((ko_val & 0xFF00) == 0xC900) ?
550 		    KICONV_KO_UDA_OFFSET_1 : KICONV_KO_UDA_OFFSET_2);
551 		udc[0] = 0xEF;
552 		udc[1] = (uchar_t)(0x80 | (u32 & 0x00000FC0) >> 6);
553 		udc[2] = (uchar_t)(0x80 | (u32 & 0x0000003F));
554 		u8 = udc;
555 		index = 1;
556 	} else {
557 		index = kiconv_binsearch(ko_val, table, nitems);
558 		u8 = table[index].u8;
559 	}
560 
561 	sz = u8_number_of_bytes[u8[0]];
562 
563 	if (obtail - ob < sz) {
564 		*ret_val = (size_t)-1;
565 		return (-1);
566 	}
567 
568 	if (index == 0)
569 		(*ret_val)++;	/* Non-identical conversion */
570 
571 	for (index = 0; index < sz; index++)
572 		*ob++ = u8[index];
573 
574 	return (sz);
575 }
576 
577 static kiconv_ops_t kiconv_ko_ops_tbl[] = {
578 	{
579 		"euc-kr", "utf-8", kiconv_open_to_cck, kiconv_to_euckr,
580 		kiconv_close_to_cck, kiconvstr_to_euckr
581 	},
582 	{
583 		"utf-8", "euc-kr", open_fr_euckr, kiconv_fr_euckr,
584 		close_fr_ko, kiconvstr_fr_euckr
585 	},
586 	{
587 		"unifiedhangul", "utf-8", kiconv_open_to_cck, kiconv_to_uhc,
588 		kiconv_close_to_cck, kiconvstr_to_uhc
589 	},
590 	{
591 		"utf-8", "unifiedhangul", open_fr_uhc, kiconv_fr_uhc,
592 		close_fr_ko, kiconvstr_fr_uhc
593 	}
594 };
595 
596 static kiconv_module_info_t kiconv_ko_info = {
597 	"kiconv_ko",		/* module name */
598 	sizeof (kiconv_ko_ops_tbl) / sizeof (kiconv_ko_ops_tbl[0]),
599 	kiconv_ko_ops_tbl,
600 	0,
601 	NULL,
602 	NULL,
603 	0
604 };
605 
606 static struct modlkiconv modlkiconv_ko = {
607 	&mod_kiconvops,
608 	"kiconv korean module 1.0",
609 	&kiconv_ko_info
610 };
611 
612 static struct modlinkage modlinkage = {
613 	MODREV_1,
614 	(void *)&modlkiconv_ko,
615 	NULL
616 };
617 
618 int
619 _init(void)
620 {
621 	int err;
622 
623 	err = mod_install(&modlinkage);
624 	if (err)
625 		cmn_err(CE_WARN, "kiconv_ko: failed to load kernel module");
626 
627 	return (err);
628 }
629 
630 int
631 _fini(void)
632 {
633 	int err;
634 
635 	/*
636 	 * If this module is being used, then, we cannot remove the module.
637 	 * The following checking will catch pretty much all usual cases.
638 	 *
639 	 * Any remaining will be catached by the kiconv_unregister_module()
640 	 * during mod_remove() at below.
641 	 */
642 	if (kiconv_module_ref_count(KICONV_MODULE_ID_KO))
643 		return (EBUSY);
644 
645 	err = mod_remove(&modlinkage);
646 	if (err)
647 		cmn_err(CE_WARN, "kiconv_ko: failed to remove kernel module");
648 
649 	return (err);
650 }
651 
652 int
653 _info(struct modinfo *modinfop)
654 {
655 	return (mod_info(&modlinkage, modinfop));
656 }
657