xref: /illumos-gate/usr/src/uts/common/kiconv/kiconv_ko/kiconv_ko.c (revision dd72704bd9e794056c558153663c739e2012d721)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/sysmacros.h>
30 #include <sys/systm.h>
31 #include <sys/debug.h>
32 #include <sys/kmem.h>
33 #include <sys/sunddi.h>
34 #include <sys/byteorder.h>
35 #include <sys/errno.h>
36 #include <sys/modctl.h>
37 #include <sys/u8_textprep.h>
38 #include <sys/kiconv.h>
39 #include <sys/kiconv_cck_common.h>
40 #include <sys/kiconv_ko.h>
41 #include <sys/kiconv_uhc_utf8.h>
42 #include <sys/kiconv_utf8_uhc.h>
43 #include <sys/kiconv_euckr_utf8.h>
44 #include <sys/kiconv_utf8_euckr.h>
45 
46 static int8_t utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
47 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
48 static int8_t utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
49 	uchar_t *ob, uchar_t *obtail, size_t *ret_val);
50 static int8_t ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail,
51 	size_t *ret_val, kiconv_table_array_t *table, size_t nitems);
52 
53 
54 #define	KICONV_KO_EUCKR		(0x01)
55 #define	KICONV_KO_UHC		(0x02)
56 #define	KICONV_KO_MAX_MAGIC_ID	(0x02)
57 
58 static void *
59 open_fr_euckr()
60 {
61 	return ((void *)KICONV_KO_EUCKR);
62 }
63 
64 static void *
65 open_fr_uhc()
66 {
67 	return ((void *)KICONV_KO_UHC);
68 }
69 
70 static int
71 close_fr_ko(void *s)
72 {
73 	if ((uintptr_t)s > KICONV_KO_MAX_MAGIC_ID)
74 		return (EBADF);
75 
76 	return (0);
77 }
78 
79 /*
80  * Encoding convertor from EUC-KR to UTF-8.
81  */
82 static size_t
83 kiconv_fr_euckr(void *kcd, char **inbuf, size_t *inbufleft,
84     char **outbuf, size_t *outbufleft, int *errno)
85 {
86 	uchar_t		*ib;
87 	uchar_t		*ob;
88 	uchar_t		*ibtail;
89 	uchar_t		*obtail;
90 	size_t		ret_val;
91 	int8_t		sz;
92 	uint32_t	euckr_val;
93 
94 	/* Check on the kiconv code conversion descriptor. */
95 	if (kcd == NULL || kcd == (void *)-1) {
96 		*errno = EBADF;
97 		return ((size_t)-1);
98 	}
99 
100 	/* If this is a state reset request, process and return. */
101 	if (inbuf == NULL || *inbuf == NULL) {
102 		return (0);
103 	}
104 
105 	ret_val = 0;
106 	ib = (uchar_t *)*inbuf;
107 	ob = (uchar_t *)*outbuf;
108 	ibtail = ib + *inbufleft;
109 	obtail = ob + *outbufleft;
110 
111 	while (ib < ibtail) {
112 		if (KICONV_IS_ASCII(*ib)) {
113 			if (ob >= obtail) {
114 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
115 			}
116 
117 			*ob++ = *ib++;
118 			continue;
119 		}
120 
121 		/*
122 		 * Issue EILSEQ error if the first byte is not a
123 		 * valid EUC-KR leading byte.
124 		 */
125 		if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) {
126 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
127 		}
128 
129 		/*
130 		 * Issue EINVAL error if input buffer has an incomplete
131 		 * character at the end of the buffer.
132 		 */
133 		if (ibtail - ib < 2) {
134 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
135 		}
136 
137 		/*
138 		 * Issue EILSEQ error if the remaining byte is not
139 		 * a valid EUC-KR byte.
140 		 */
141 		if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) {
142 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
143 		}
144 
145 		euckr_val = (uint32_t)(*ib) << 8 | *(ib + 1);
146 		sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val,
147 		    kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX);
148 
149 		if (sz < 0) {
150 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
151 		}
152 
153 		ib += 2;
154 		ob += sz;
155 	}
156 
157 	*inbuf = (char *)ib;
158 	*inbufleft = ibtail - ib;
159 	*outbuf = (char *)ob;
160 	*outbufleft = obtail - ob;
161 
162 	return (ret_val);
163 }
164 
165 /*
166  * String based encoding convertor from EUC-KR to UTF-8.
167  */
168 static size_t
169 kiconvstr_fr_euckr(char *inarray, size_t *inlen, char *outarray,
170     size_t *outlen, int flag, int *errno)
171 {
172 	uchar_t		*ib;
173 	uchar_t		*ob;
174 	uchar_t		*ibtail;
175 	uchar_t		*obtail;
176 	uchar_t		*oldib;
177 	size_t		ret_val;
178 	int8_t		sz;
179 	uint32_t	euckr_val;
180 	boolean_t	do_not_ignore_null;
181 
182 	ret_val = 0;
183 	ib = (uchar_t *)inarray;
184 	ob = (uchar_t *)outarray;
185 	ibtail = ib + *inlen;
186 	obtail = ob + *outlen;
187 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
188 
189 	while (ib < ibtail) {
190 		if (*ib == '\0' && do_not_ignore_null)
191 			break;
192 
193 		if (KICONV_IS_ASCII(*ib)) {
194 			if (ob >= obtail) {
195 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
196 			}
197 
198 			*ob++ = *ib++;
199 			continue;
200 		}
201 
202 		oldib = ib;
203 
204 		if (! KICONV_KO_IS_EUCKR_BYTE(*ib)) {
205 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
206 		}
207 
208 		if (ibtail - ib < 2) {
209 			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
210 		}
211 
212 		if (! KICONV_KO_IS_EUCKR_BYTE(*(ib + 1))) {
213 			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
214 		}
215 
216 		euckr_val = *ib++;
217 		euckr_val = (euckr_val << 8) | *ib++;
218 		sz = ko_to_utf8(euckr_val, ob, obtail, &ret_val,
219 		    kiconv_euckr_utf8, KICONV_EUCKR_UTF8_MAX);
220 
221 		if (sz < 0) {
222 			ib = oldib;
223 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
224 		}
225 
226 		ob += sz;
227 		continue;
228 
229 REPLACE_INVALID:
230 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
231 			ib = oldib;
232 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
233 		}
234 
235 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
236 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
237 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
238 		ret_val++;
239 	}
240 
241 	*inlen = ibtail - ib;
242 	*outlen = obtail - ob;
243 
244 	return (ret_val);
245 }
246 
247 /*
248  * Encoding convertor from Unified Hangul Code to UTF-8.
249  */
250 static size_t
251 kiconv_fr_uhc(void *kcd, char **inbuf, size_t *inbufleft,
252     char **outbuf, size_t *outbufleft, int *errno)
253 {
254 	uchar_t		*ib;
255 	uchar_t		*ob;
256 	uchar_t		*ibtail;
257 	uchar_t		*obtail;
258 	size_t		ret_val;
259 	int8_t		sz;
260 	uint32_t	uhc_val;
261 
262 	/* Check on the kiconv code conversion descriptor. */
263 	if (kcd == NULL || kcd == (void *)-1) {
264 		*errno = EBADF;
265 		return ((size_t)-1);
266 	}
267 
268 	/* If this is a state reset request, process and return. */
269 	if (inbuf == NULL || *inbuf == NULL) {
270 		return (0);
271 	}
272 
273 	ret_val = 0;
274 	ib = (uchar_t *)*inbuf;
275 	ob = (uchar_t *)*outbuf;
276 	ibtail = ib + *inbufleft;
277 	obtail = ob + *outbufleft;
278 
279 	while (ib < ibtail) {
280 		if (KICONV_IS_ASCII(*ib)) {
281 			if (ob >= obtail) {
282 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
283 			}
284 
285 			*ob++ = *ib++;
286 			continue;
287 		}
288 
289 		/*
290 		 * Issue EILSEQ error if the first byte is not a
291 		 * valid UHC leading byte.
292 		 */
293 		if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) {
294 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
295 		}
296 
297 		/*
298 		 * Issue EINVAL error if input buffer has an incomplete
299 		 * character at the end of the buffer.
300 		 */
301 		if (ibtail - ib < 2) {
302 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
303 		}
304 
305 		/*
306 		 * Issue EILSEQ error if the remaining byte is not
307 		 * a valid UHC byte.
308 		 */
309 		if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) {
310 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
311 		}
312 
313 		uhc_val = (uint32_t)(*ib) << 8 | *(ib + 1);
314 		sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val,
315 		    kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX);
316 
317 		if (sz < 0) {
318 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
319 		}
320 
321 		ib += 2;
322 		ob += sz;
323 	}
324 
325 	*inbuf = (char *)ib;
326 	*inbufleft = ibtail - ib;
327 	*outbuf = (char *)ob;
328 	*outbufleft = obtail - ob;
329 
330 	return (ret_val);
331 }
332 
333 /*
334  * String based encoding convertor from Unified Hangul Code to UTF-8.
335  */
336 static size_t
337 kiconvstr_fr_uhc(char *inarray, size_t *inlen, char *outarray,
338     size_t *outlen, int flag, int *errno)
339 {
340 	uchar_t		*ib;
341 	uchar_t		*ob;
342 	uchar_t		*ibtail;
343 	uchar_t		*obtail;
344 	uchar_t		*oldib;
345 	size_t		ret_val;
346 	int8_t		sz;
347 	uint32_t	uhc_val;
348 	boolean_t	do_not_ignore_null;
349 
350 	ret_val = 0;
351 	ib = (uchar_t *)inarray;
352 	ob = (uchar_t *)outarray;
353 	ibtail = ib + *inlen;
354 	obtail = ob + *outlen;
355 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
356 
357 	while (ib < ibtail) {
358 		if (*ib == '\0' && do_not_ignore_null)
359 			break;
360 
361 		if (KICONV_IS_ASCII(*ib)) {
362 			if (ob >= obtail) {
363 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
364 			}
365 
366 			*ob++ = *ib++;
367 			continue;
368 		}
369 
370 		oldib = ib;
371 
372 		if (! KICONV_KO_IS_UHC_1st_BYTE(*ib)) {
373 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
374 		}
375 
376 		if (ibtail - ib < 2) {
377 			KICONV_SET_ERRNO_WITH_FLAG(1, EINVAL);
378 		}
379 
380 		if (! KICONV_KO_IS_UHC_2nd_BYTE(*(ib + 1))) {
381 			KICONV_SET_ERRNO_WITH_FLAG(2, EILSEQ);
382 		}
383 
384 		uhc_val = *ib++;
385 		uhc_val = (uhc_val << 8) | *ib++;
386 		sz = ko_to_utf8(uhc_val, ob, obtail, &ret_val,
387 		    kiconv_uhc_utf8, KICONV_UHC_UTF8_MAX);
388 
389 		if (sz < 0) {
390 			ib = oldib;
391 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
392 		}
393 
394 		ob += sz;
395 		continue;
396 
397 REPLACE_INVALID:
398 		if (obtail - ob < KICONV_UTF8_REPLACEMENT_CHAR_LEN) {
399 			ib = oldib;
400 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
401 		}
402 
403 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR1;
404 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR2;
405 		*ob++ = KICONV_UTF8_REPLACEMENT_CHAR3;
406 		ret_val++;
407 	}
408 
409 	*inlen = ibtail - ib;
410 	*outlen = obtail - ob;
411 
412 	return (ret_val);
413 }
414 
415 /*
416  * Encoding convertor from UTF-8 to EUC-KR.
417  */
418 static size_t
419 kiconv_to_euckr(void *kcd, char **inbuf, size_t *inbytesleft,
420     char **outbuf, size_t *outbytesleft, int *errno)
421 {
422 	return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
423 	    outbytesleft, errno, utf8_to_euckr));
424 }
425 
426 /*
427  * Encoding convertor from UTF-8 to Unified Hangul Code.
428  */
429 static size_t
430 kiconv_to_uhc(void *kcd, char **inbuf, size_t *inbytesleft,
431     char **outbuf, size_t *outbytesleft, int *errno)
432 {
433 	return (kiconv_utf8_to_cck(kcd, inbuf, inbytesleft, outbuf,
434 	    outbytesleft, errno, utf8_to_uhc));
435 }
436 
437 /*
438  * String based encoding convertor from UTF-8 to EUC-KR.
439  */
440 static size_t
441 kiconvstr_to_euckr(char *inarray, size_t *inlen, char *outarray,
442     size_t *outlen, int flag, int *errno)
443 {
444 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
445 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_euckr);
446 }
447 
448 /*
449  * String based encoding convertor from UTF-8 to Unified Hangul Code.
450  */
451 static size_t
452 kiconvstr_to_uhc(char *inarray, size_t *inlen, char *outarray,
453     size_t *outlen, int flag, int *errno)
454 {
455 	return kiconvstr_utf8_to_cck((uchar_t *)inarray, inlen,
456 	    (uchar_t *)outarray, outlen, flag, errno, utf8_to_uhc);
457 }
458 
459 /*
460  * Convert an UTF-8 character to a character of ko encodings
461  * (EUC-KR or UHC).
462  */
463 static int8_t
464 utf8_to_ko(uint32_t utf8, uchar_t *ob, uchar_t *obtail, size_t *ret_val,
465     kiconv_table_t *table, size_t nitems)
466 {
467 	size_t	index;
468 	size_t	kocode;
469 	int8_t  kolen;
470 
471 	if (KICONV_KO_IS_UDC_IN_UTF8(utf8)) {
472 		/* User Definable Area handing. */
473 		kocode = (((utf8 & 0xF0000) >> 4) | ((utf8 & 0x3F00) >> 2) |
474 		    (utf8 & 0x3F)) - KICONV_KO_UDA_UCS4_START;
475 		if (kocode < KICONV_KO_UDA_RANGE) {
476 			kocode = (KICONV_KO_UDA_EUC_SEG1 << 8) |
477 			    (kocode + KICONV_KO_UDA_OFFSET_START);
478 		} else {
479 			/* 0x43 = 0xA1 - 0x5E */
480 			kocode = (KICONV_KO_UDA_EUC_SEG2 << 8) |
481 			    (kocode + 0x43);
482 		}
483 
484 		index = 1;
485 	} else {
486 		index = kiconv_binsearch(utf8, table, nitems);
487 		kocode = table[index].value;
488 	}
489 
490 	kolen = (kocode <= 0xFF) ? 1 : 2;
491 
492 	if (obtail - ob < kolen) {
493 		*ret_val = (size_t)-1;
494 		return (-1);
495 	}
496 
497 	if (index == 0)
498 		(*ret_val)++;
499 
500 	if (kolen > 1)
501 		*ob++ = (uchar_t)(kocode >> 8);
502 	*ob = (uchar_t)(kocode & 0xFF);
503 
504 	return (kolen);
505 }
506 
507 /*
508  * Convert an UTF-8 character to Unified Hangual Code.
509  */
510 /* ARGSUSED */
511 static int8_t
512 utf8_to_uhc(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
513     uchar_t *ob, uchar_t *obtail, size_t *ret_val)
514 {
515 	return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_uhc,
516 	    KICONV_UTF8_UHC_MAX));
517 }
518 
519 /*
520  * Convert an UTF-8 character to EUC-KR.
521  */
522 /* ARGSUSED */
523 static int8_t
524 utf8_to_euckr(uint32_t utf8, uchar_t **inbuf, uchar_t *ibtail,
525     uchar_t *ob, uchar_t *obtail, size_t *ret_val)
526 {
527 	return (utf8_to_ko(utf8, ob, obtail, ret_val, kiconv_utf8_euckr,
528 	    KICONV_UTF8_EUCKR_MAX));
529 }
530 
531 /*
532  * Convert a single ko encoding (EUC-KR or UHC) character to UTF-8.
533  */
534 static int8_t
535 ko_to_utf8(uint32_t ko_val, uchar_t *ob, uchar_t *obtail, size_t *ret_val,
536     kiconv_table_array_t *table, size_t nitems)
537 {
538 	size_t	index;
539 	int8_t	sz;
540 	uchar_t	udc[3];
541 	uchar_t	*u8;
542 
543 	if (KICONV_KO_IS_UDC_IN_EUC(ko_val)) {
544 		/* UDA(User Definable Area) handling. */
545 		uint32_t u32;
546 
547 		u32 = (ko_val & 0xFF) + (((ko_val & 0xFF00) == 0xC900) ?
548 		    KICONV_KO_UDA_OFFSET_1 : KICONV_KO_UDA_OFFSET_2);
549 		udc[0] = 0xEF;
550 		udc[1] = (uchar_t)(0x80 | (u32 & 0x00000FC0) >> 6);
551 		udc[2] = (uchar_t)(0x80 | (u32 & 0x0000003F));
552 		u8 = udc;
553 		index = 1;
554 	} else {
555 		index = kiconv_binsearch(ko_val, table, nitems);
556 		u8 = table[index].u8;
557 	}
558 
559 	sz = u8_number_of_bytes[u8[0]];
560 
561 	if (obtail - ob < sz) {
562 		*ret_val = (size_t)-1;
563 		return (-1);
564 	}
565 
566 	if (index == 0)
567 		(*ret_val)++;	/* Non-identical conversion */
568 
569 	for (index = 0; index < sz; index++)
570 		*ob++ = u8[index];
571 
572 	return (sz);
573 }
574 
575 static kiconv_ops_t kiconv_ko_ops_tbl[] = {
576 	{
577 		"euc-kr", "utf-8", kiconv_open_to_cck, kiconv_to_euckr,
578 		kiconv_close_to_cck, kiconvstr_to_euckr
579 	},
580 	{
581 		"utf-8", "euc-kr", open_fr_euckr, kiconv_fr_euckr,
582 		close_fr_ko, kiconvstr_fr_euckr
583 	},
584 	{
585 		"unifiedhangul", "utf-8", kiconv_open_to_cck, kiconv_to_uhc,
586 		kiconv_close_to_cck, kiconvstr_to_uhc
587 	},
588 	{
589 		"utf-8", "unifiedhangul", open_fr_uhc, kiconv_fr_uhc,
590 		close_fr_ko, kiconvstr_fr_uhc
591 	}
592 };
593 
594 static kiconv_module_info_t kiconv_ko_info = {
595 	"kiconv_ko",		/* module name */
596 	sizeof (kiconv_ko_ops_tbl) / sizeof (kiconv_ko_ops_tbl[0]),
597 	kiconv_ko_ops_tbl,
598 	0,
599 	NULL,
600 	NULL,
601 	0
602 };
603 
604 static struct modlkiconv modlkiconv_ko = {
605 	&mod_kiconvops,
606 	"kiconv korean module 1.0",
607 	&kiconv_ko_info
608 };
609 
610 static struct modlinkage modlinkage = {
611 	MODREV_1,
612 	(void *)&modlkiconv_ko,
613 	NULL
614 };
615 
616 int
617 _init(void)
618 {
619 	int err;
620 
621 	err = mod_install(&modlinkage);
622 	if (err)
623 		cmn_err(CE_WARN, "kiconv_ko: failed to load kernel module");
624 
625 	return (err);
626 }
627 
628 int
629 _fini(void)
630 {
631 	int err;
632 
633 	/*
634 	 * If this module is being used, then, we cannot remove the module.
635 	 * The following checking will catch pretty much all usual cases.
636 	 *
637 	 * Any remaining will be catached by the kiconv_unregister_module()
638 	 * during mod_remove() at below.
639 	 */
640 	if (kiconv_module_ref_count(KICONV_MODULE_ID_KO))
641 		return (EBUSY);
642 
643 	err = mod_remove(&modlinkage);
644 	if (err)
645 		cmn_err(CE_WARN, "kiconv_ko: failed to remove kernel module");
646 
647 	return (err);
648 }
649 
650 int
651 _info(struct modinfo *modinfop)
652 {
653 	return (mod_info(&modlinkage, modinfop));
654 }
655