xref: /freebsd/sys/libkern/iconv_ucs.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2003, 2005 Ryuichiro Imura
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/kernel.h>
32 #include <sys/systm.h>
33 #include <sys/malloc.h>
34 #include <sys/iconv.h>
35 
36 #include "iconv_converter_if.h"
37 
38 /*
39  * "UCS" converter
40  */
41 
/*
 * Conversion-type flags kept in iconv_ucs.convtype.
 *
 * FROM_* bits describe the source encoding, TO_* bits the destination.
 * A UTF-16 little-endian endpoint carries both its UTF16 and its LE bit.
 */
#define	KICONV_UCS_COMBINE	0x1	/* a second charset pair was given */
#define	KICONV_UCS_FROM_UTF8	0x2	/* source is UTF-8 */
#define	KICONV_UCS_TO_UTF8	0x4	/* destination is UTF-8 */
#define	KICONV_UCS_FROM_LE	0x8	/* source is little-endian */
#define	KICONV_UCS_TO_LE	0x10	/* destination is little-endian */
#define	KICONV_UCS_FROM_UTF16	0x20	/* source may hold surrogate pairs */
#define	KICONV_UCS_TO_UTF16	0x40	/* destination may hold surrogate pairs */
#define	KICONV_UCS_UCS4		0x80	/* 4-byte (surrogate pair) units enabled */

#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"

/*
 * Encodings recognised by name in iconv_ucs_open(), with the flag bits
 * to set when the encoding appears on the source or destination side.
 * The list ends with an entry whose name is NULL.
 */
static struct {
	const char *name;
	int from_flag;
	int to_flag;
} unicode_family[] = {
	{
		.name = "UTF-8",
		.from_flag = KICONV_UCS_FROM_UTF8,
		.to_flag = KICONV_UCS_TO_UTF8
	},
	{
		.name = "UCS-2LE",
		.from_flag = KICONV_UCS_FROM_LE,
		.to_flag = KICONV_UCS_TO_LE
	},
	{
		.name = "UTF-16BE",
		.from_flag = KICONV_UCS_FROM_UTF16,
		.to_flag = KICONV_UCS_TO_UTF16
	},
	{
		.name = "UTF-16LE",
		.from_flag = KICONV_UCS_FROM_UTF16 | KICONV_UCS_FROM_LE,
		.to_flag = KICONV_UCS_TO_UTF16 | KICONV_UCS_TO_LE
	},
	{
		.name = NULL,
		.from_flag = 0,
		.to_flag = 0
	}
};
65 
66 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
67 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
68 static uint32_t encode_surrogate(uint32_t code);
69 static uint32_t decode_surrogate(const u_char *ucs);
70 
71 #ifdef MODULE_DEPEND
72 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
73 #endif
74 
75 /*
76  * UCS converter instance
77  */
78 struct iconv_ucs {
79 	KOBJ_FIELDS;
80 	int			convtype;
81 	struct iconv_cspair *	d_csp;
82 	struct iconv_cspair *	d_cspf;
83 	void *			f_ctp;
84 	void *			t_ctp;
85 	void *			ctype;
86 };
87 
88 static int
89 iconv_ucs_open(struct iconv_converter_class *dcp,
90 	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
91 {
92 	struct iconv_ucs *dp;
93 	int i;
94 	const char *from, *to;
95 
96 	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
97 	to = csp->cp_to;
98 	from = cspf ? cspf->cp_from : csp->cp_from;
99 
100 	dp->convtype = 0;
101 
102 	if (cspf)
103 		dp->convtype |= KICONV_UCS_COMBINE;
104 	for (i = 0; unicode_family[i].name; i++) {
105 		if (strcasecmp(from, unicode_family[i].name) == 0)
106 			dp->convtype |= unicode_family[i].from_flag;
107 		if (strcasecmp(to, unicode_family[i].name) == 0)
108 			dp->convtype |= unicode_family[i].to_flag;
109 	}
110 	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
111 		dp->convtype |= KICONV_UCS_UCS4;
112 	else
113 		dp->convtype &= ~KICONV_UCS_UCS4;
114 
115 	dp->f_ctp = dp->t_ctp = NULL;
116 	if (dp->convtype & KICONV_UCS_COMBINE) {
117 		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
118 		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
119 			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
120 		}
121 		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
122 		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
123 			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
124 		}
125 	}
126 
127 	dp->ctype = NULL;
128 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
129 		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
130 
131 	dp->d_csp = csp;
132 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
133 		if (cspf) {
134 			dp->d_cspf = cspf;
135 			cspf->cp_refcount++;
136 		} else
137 			csp->cp_refcount++;
138 	}
139 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
140 		csp->cp_refcount++;
141 	*dpp = (void*)dp;
142 	return 0;
143 }
144 
145 static int
146 iconv_ucs_close(void *data)
147 {
148 	struct iconv_ucs *dp = data;
149 
150 	if (dp->f_ctp)
151 		iconv_close(dp->f_ctp);
152 	if (dp->t_ctp)
153 		iconv_close(dp->t_ctp);
154 	if (dp->ctype)
155 		iconv_close(dp->ctype);
156 	if (dp->d_cspf)
157 		dp->d_cspf->cp_refcount--;
158 	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
159 		dp->d_csp->cp_refcount--;
160 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
161 		dp->d_csp->cp_refcount--;
162 	kobj_delete((struct kobj*)data, M_ICONV);
163 	return 0;
164 }
165 
166 static int
167 iconv_ucs_conv(void *d2p, const char **inbuf,
168 	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
169 	int convchar, int casetype)
170 {
171 	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
172 	int ret = 0, i;
173 	size_t in, on, ir, or, inlen, outlen, ucslen;
174 	const char *src, *p;
175 	char *dst;
176 	u_char ucs[4], *q;
177 	uint32_t code;
178 
179 	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
180 		return 0;
181 	ir = in = *inbytesleft;
182 	or = on = *outbytesleft;
183 	src = *inbuf;
184 	dst = *outbuf;
185 
186 	while (ir > 0 && or > 0) {
187 		/*
188 		 * The first half of conversion.
189 		 * (convert any code into ENCODING_UNICODE)
190 		 */
191 		code = 0;
192 		p = src;
193 		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
194 			/* convert UTF-8 to ENCODING_UNICODE */
195 			inlen = 0;
196 			code = utf8_to_ucs4(p, &inlen, ir);
197 			if (code == 0) {
198 				ret = -1;
199 				break;
200 			}
201 
202 			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
203 				code = towlower(code, dp->ctype);
204 			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
205 				code = towupper(code, dp->ctype);
206 			}
207 
208 			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
209 				/* reserved for utf-16 surrogate pair */
210 				/* invalid unicode */
211 				ret = -1;
212 				break;
213 			}
214 
215 			if (inlen == 4) {
216 				if (dp->convtype & KICONV_UCS_UCS4) {
217 					ucslen = 4;
218 					code = encode_surrogate(code);
219 				} else {
220 					/* can't handle with ucs-2 */
221 					ret = -1;
222 					break;
223 				}
224 			} else {
225 				ucslen = 2;
226 			}
227 
228 			/* save UCS-4 into ucs[] */
229 			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
230 				*q++ = (code >> (i << 3)) & 0xff;
231 
232 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
233 			/* convert local code to ENCODING_UNICODE */
234 			ucslen = 4;
235 			inlen = ir;
236 			q = ucs;
237 			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
238 			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
239 			if (ret)
240 				break;
241 			inlen = ir - inlen;
242 			ucslen = 4 - ucslen;
243 
244 		} else {
245 			/* src code is a proper subset of ENCODING_UNICODE */
246 			q = ucs;
247 			if (dp->convtype & KICONV_UCS_FROM_LE) {
248 				*q = *(p + 1);
249 				*(q + 1) = *p;
250 				p += 2;
251 			} else {
252 				*q = *p++;
253 				*(q + 1) = *p++;
254 			}
255 			if ((*q & 0xfc) == 0xd8) {
256 				if (dp->convtype & KICONV_UCS_UCS4 &&
257 				    dp->convtype & KICONV_UCS_FROM_UTF16) {
258 					inlen = ucslen = 4;
259 				} else {
260 					/* invalid unicode */
261 					ret = -1;
262 					break;
263 				}
264 			} else {
265 				inlen = ucslen = 2;
266 			}
267 			if (ir < inlen) {
268 				ret = -1;
269 				break;
270 			}
271 			if (ucslen == 4) {
272 				q += 2;
273 				if (dp->convtype & KICONV_UCS_FROM_LE) {
274 					*q = *(p + 1);
275 					*(q + 1) = *p;
276 				} else {
277 					*q = *p++;
278 					*(q + 1) = *p;
279 				}
280 				if ((*q & 0xfc) != 0xdc) {
281 					/* invalid unicode */
282 					ret = -1;
283 					break;
284 				}
285 			}
286 		}
287 
288 		/*
289 		 * The second half of conversion.
290 		 * (convert ENCODING_UNICODE into any code)
291 		 */
292 		p = ucs;
293 		if (dp->convtype & KICONV_UCS_TO_UTF8) {
294 			q = (u_char *)dst;
295 			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
296 				/* decode surrogate pair */
297 				code = decode_surrogate(p);
298 			} else {
299 				code = (ucs[0] << 8) | ucs[1];
300 			}
301 
302 			if (casetype == KICONV_LOWER && dp->ctype) {
303 				code = towlower(code, dp->ctype);
304 			} else if (casetype == KICONV_UPPER && dp->ctype) {
305 				code = towupper(code, dp->ctype);
306 			}
307 
308 			outlen = 0;
309 			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
310 				ret = -1;
311 				break;
312 			}
313 
314 			src += inlen;
315 			ir -= inlen;
316 			dst += outlen;
317 			or -= outlen;
318 
319 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
320 			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
321 			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
322 			if (ret)
323 				break;
324 
325 			src += inlen;
326 			ir -= inlen;
327 
328 		} else {
329 			/* dst code is a proper subset of ENCODING_UNICODE */
330 			if (or < ucslen) {
331 				ret = -1;
332 				break;
333 			}
334 			src += inlen;
335 			ir -= inlen;
336 			or -= ucslen;
337 			if (dp->convtype & KICONV_UCS_TO_LE) {
338 				*dst++ = *(p + 1);
339 				*dst++ = *p;
340 				p += 2;
341 			} else {
342 				*dst++ = *p++;
343 				*dst++ = *p++;
344 			}
345 			if (ucslen == 4) {
346 				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
347 				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
348 					ret = -1;
349 					break;
350 				}
351 				if (dp->convtype & KICONV_UCS_TO_LE) {
352 					*dst++ = *(p + 1);
353 					*dst++ = *p;
354 				} else {
355 					*dst++ = *p++;
356 					*dst++ = *p;
357 				}
358 			}
359 		}
360 
361 		if (convchar == 1)
362 			break;
363 	}
364 
365 	*inbuf += in - ir;
366 	*outbuf += on - or;
367 	*inbytesleft -= in - ir;
368 	*outbytesleft -= on - or;
369 	return (ret);
370 }
371 
372 static int
373 iconv_ucs_init(struct iconv_converter_class *dcp)
374 {
375 	int error;
376 
377 	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
378 	if (error)
379 		return (error);
380 	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
381 	if (error)
382 		return (error);
383 	return (0);
384 }
385 
/*
 * Converter class teardown hook.  This converter has nothing to release
 * here, so the class argument is unused and the result is always 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{

	return (0);
}
391 
392 static const char *
393 iconv_ucs_name(struct iconv_converter_class *dcp)
394 {
395 	return (ENCODING_UNICODE);
396 }
397 
398 static kobj_method_t iconv_ucs_methods[] = {
399 	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
400 	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
401 	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
402 	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
403 	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
404 	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
405 	{0, 0}
406 };
407 
408 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
409 
/*
 * Decode one UTF-8 sequence of up to four bytes.
 *
 * src:       start of the sequence.
 * utf8width: receives the number of bytes consumed; written only on
 *            success.
 * srclen:    number of bytes available at src.
 *
 * Returns the decoded code point, or 0 on failure: empty or truncated
 * input, an invalid lead byte, a bad continuation byte, or an overlong
 * (non-shortest-form) encoding.  Because 0 is the failure value, an
 * encoded NUL is reported as a failure too.  Surrogates and values above
 * 0x10ffff are not rejected here; that is left to the caller.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that legitimately needs w bytes */
	static const uint32_t min_code[5] = { 0, 0, 0x80, 0x800, 0x10000 };
	size_t i, w;
	uint32_t ucs4;
	unsigned char lead;

	/* never look at src[0] when there is no input at all */
	if (srclen == 0)
		return (0);

	/*
	 * classify the lead byte and keep its payload bits
	 */
	lead = (unsigned char)*src;
	if ((lead & 0x80) == 0) {
		/*
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		ucs4 = lead & 0x7f;
	} else if ((lead & 0xe0) == 0xc0) {
		/*
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		ucs4 = lead & 0x1f;
	} else if ((lead & 0xf0) == 0xe0) {
		/*
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		ucs4 = lead & 0x0f;
	} else if ((lead & 0xf8) == 0xf0) {
		/*
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		ucs4 = lead & 0x07;
	} else {
		/* stray continuation byte, or a lead byte beyond 4-byte forms */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * fold in the continuation bytes, six payload bits each
	 */
	for (i = 1 ; i < w ; i++) {
		unsigned char cont = (unsigned char)src[i];

		if ((cont & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		ucs4 = (ucs4 << 6) | (cont & 0x3f);
	}

	/*
	 * Reject overlong encodings: a value that would have fit in a
	 * shorter sequence must not be accepted in a longer one, otherwise
	 * one character has several byte spellings.
	 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
479 
/*
 * Encode one code point as UTF-8.
 *
 * ucs4:      code point to encode; must not exceed 0x10ffff.
 * dst:       output buffer.
 * utf8width: receives the number of bytes written; written only on
 *            success.
 * dstlen:    space available at dst.
 *
 * Returns dst (as u_char *) on success, or NULL when the code point is
 * beyond the Unicode range or the buffer is too small; nothing is written
 * in that case.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and the marker bits of the lead byte
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0x00;	/* "0xxxxxxx" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "110xxxxx" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "1110xxxx" */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* "11110xxx" */
	} else {
		/* above U+10FFFF: not a Unicode code point */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8, filling continuation bytes from the last one
	 * backwards so the lead byte gets whatever bits remain
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* low 6 bits, tagged "10" */
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
523 
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair,
 * returned as one 32-bit value: high surrogate in the upper 16 bits,
 * low surrogate in the lower 16 bits.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	/* upper ten bits of the offset, placed in bits 16..25 */
	const uint32_t high = (offset << 6) & 0x3ff0000;
	/* lower ten bits of the offset, placed in bits 0..9 */
	const uint32_t low = offset & 0x3ff;

	return (high | low | 0xd800dc00);
}
530 
/*
 * Rebuild a code point from a big-endian surrogate pair held in four
 * bytes: ucs[0..1] is the high surrogate and ucs[2..3] the low one.
 * Only the ten payload bits of each half are used; the surrogate marker
 * bits are not checked here.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t high10, low10;

	high10 = ((ucs[0] & 0x3) << 8) | ucs[1];
	low10 = ((ucs[2] & 0x3) << 8) | ucs[3];

	return (((high10 << 10) | low10) + 0x10000);
}
537