xref: /freebsd/sys/libkern/iconv_ucs.c (revision eb69d1f144a6fcc765d1b9d44a5ae8082353e70b)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2003, 2005 Ryuichiro Imura
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/systm.h>
35 #include <sys/malloc.h>
36 #include <sys/iconv.h>
37 
38 #include "iconv_converter_if.h"
39 
40 /*
41  * "UCS" converter
42  */
43 
/*
 * Bit flags stored in iconv_ucs.convtype.  The FROM and TO flags are
 * looked up in unicode_family[] by charset name when a converter is opened.
 */
#define	KICONV_UCS_COMBINE	0x01	/* a second charset pair was supplied */
#define	KICONV_UCS_FROM_UTF8	0x02	/* source is UTF-8 */
#define	KICONV_UCS_TO_UTF8	0x04	/* destination is UTF-8 */
#define	KICONV_UCS_FROM_LE	0x08	/* source code units are little-endian */
#define	KICONV_UCS_TO_LE	0x10	/* destination code units are little-endian */
#define	KICONV_UCS_FROM_UTF16	0x20	/* source is UTF-16 */
#define	KICONV_UCS_TO_UTF16	0x40	/* destination is UTF-16 */
#define	KICONV_UCS_UCS4		0x80	/* surrogate pairs may be handled */

/* Charset names this converter compares against. */
#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"
55 
56 static struct {
57 	const char *name;
58 	int from_flag, to_flag;
59 } unicode_family[] = {
60 	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
61 	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
62 	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
63 	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
64 	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
65 	{ NULL,		0,	0 }
66 };
67 
68 static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
69 static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
70 static uint32_t encode_surrogate(uint32_t code);
71 static uint32_t decode_surrogate(const u_char *ucs);
72 
73 #ifdef MODULE_DEPEND
74 MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
75 #endif
76 
77 /*
78  * UCS converter instance
79  */
80 struct iconv_ucs {
81 	KOBJ_FIELDS;
82 	int			convtype;
83 	struct iconv_cspair *	d_csp;
84 	struct iconv_cspair *	d_cspf;
85 	void *			f_ctp;
86 	void *			t_ctp;
87 	void *			ctype;
88 };
89 
90 static int
91 iconv_ucs_open(struct iconv_converter_class *dcp,
92 	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
93 {
94 	struct iconv_ucs *dp;
95 	int i;
96 	const char *from, *to;
97 
98 	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
99 	to = csp->cp_to;
100 	from = cspf ? cspf->cp_from : csp->cp_from;
101 
102 	dp->convtype = 0;
103 
104 	if (cspf)
105 		dp->convtype |= KICONV_UCS_COMBINE;
106 	for (i = 0; unicode_family[i].name; i++) {
107 		if (strcasecmp(from, unicode_family[i].name) == 0)
108 			dp->convtype |= unicode_family[i].from_flag;
109 		if (strcasecmp(to, unicode_family[i].name) == 0)
110 			dp->convtype |= unicode_family[i].to_flag;
111 	}
112 	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
113 		dp->convtype |= KICONV_UCS_UCS4;
114 	else
115 		dp->convtype &= ~KICONV_UCS_UCS4;
116 
117 	dp->f_ctp = dp->t_ctp = NULL;
118 	if (dp->convtype & KICONV_UCS_COMBINE) {
119 		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
120 		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
121 			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
122 		}
123 		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
124 		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
125 			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
126 		}
127 	}
128 
129 	dp->ctype = NULL;
130 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
131 		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
132 
133 	dp->d_csp = csp;
134 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
135 		if (cspf) {
136 			dp->d_cspf = cspf;
137 			cspf->cp_refcount++;
138 		} else
139 			csp->cp_refcount++;
140 	}
141 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
142 		csp->cp_refcount++;
143 	*dpp = (void*)dp;
144 	return 0;
145 }
146 
147 static int
148 iconv_ucs_close(void *data)
149 {
150 	struct iconv_ucs *dp = data;
151 
152 	if (dp->f_ctp)
153 		iconv_close(dp->f_ctp);
154 	if (dp->t_ctp)
155 		iconv_close(dp->t_ctp);
156 	if (dp->ctype)
157 		iconv_close(dp->ctype);
158 	if (dp->d_cspf)
159 		dp->d_cspf->cp_refcount--;
160 	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
161 		dp->d_csp->cp_refcount--;
162 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
163 		dp->d_csp->cp_refcount--;
164 	kobj_delete((struct kobj*)data, M_ICONV);
165 	return 0;
166 }
167 
168 static int
169 iconv_ucs_conv(void *d2p, const char **inbuf,
170 	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
171 	int convchar, int casetype)
172 {
173 	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
174 	int ret = 0, i;
175 	size_t in, on, ir, or, inlen, outlen, ucslen;
176 	const char *src, *p;
177 	char *dst;
178 	u_char ucs[4], *q;
179 	uint32_t code;
180 
181 	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
182 		return 0;
183 	ir = in = *inbytesleft;
184 	or = on = *outbytesleft;
185 	src = *inbuf;
186 	dst = *outbuf;
187 
188 	while (ir > 0 && or > 0) {
189 
190 		/*
191 		 * The first half of conversion.
192 		 * (convert any code into ENCODING_UNICODE)
193 		 */
194 		code = 0;
195 		p = src;
196 		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
197 			/* convert UTF-8 to ENCODING_UNICODE */
198 			inlen = 0;
199 			code = utf8_to_ucs4(p, &inlen, ir);
200 			if (code == 0) {
201 				ret = -1;
202 				break;
203 			}
204 
205 			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
206 				code = towlower(code, dp->ctype);
207 			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
208 				code = towupper(code, dp->ctype);
209 			}
210 
211 			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
212 				/* reserved for utf-16 surrogate pair */
213 				/* invalid unicode */
214 				ret = -1;
215 				break;
216 			}
217 
218 			if (inlen == 4) {
219 				if (dp->convtype & KICONV_UCS_UCS4) {
220 					ucslen = 4;
221 					code = encode_surrogate(code);
222 				} else {
223 					/* can't handle with ucs-2 */
224 					ret = -1;
225 					break;
226 				}
227 			} else {
228 				ucslen = 2;
229 			}
230 
231 			/* save UCS-4 into ucs[] */
232 			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
233 				*q++ = (code >> (i << 3)) & 0xff;
234 
235 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
236 			/* convert local code to ENCODING_UNICODE */
237 			ucslen = 4;
238 			inlen = ir;
239 			q = ucs;
240 			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
241 			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
242 			if (ret)
243 				break;
244 			inlen = ir - inlen;
245 			ucslen = 4 - ucslen;
246 
247 		} else {
248 			/* src code is a proper subset of ENCODING_UNICODE */
249 			q = ucs;
250 			if (dp->convtype & KICONV_UCS_FROM_LE) {
251 				*q = *(p + 1);
252 				*(q + 1) = *p;
253 				p += 2;
254 			} else {
255 				*q = *p++;
256 				*(q + 1) = *p++;
257 			}
258 			if ((*q & 0xfc) == 0xd8) {
259 				if (dp->convtype & KICONV_UCS_UCS4 &&
260 				    dp->convtype & KICONV_UCS_FROM_UTF16) {
261 					inlen = ucslen = 4;
262 				} else {
263 					/* invalid unicode */
264 					ret = -1;
265 					break;
266 				}
267 			} else {
268 				inlen = ucslen = 2;
269 			}
270 			if (ir < inlen) {
271 				ret = -1;
272 				break;
273 			}
274 			if (ucslen == 4) {
275 				q += 2;
276 				if (dp->convtype & KICONV_UCS_FROM_LE) {
277 					*q = *(p + 1);
278 					*(q + 1) = *p;
279 				} else {
280 					*q = *p++;
281 					*(q + 1) = *p;
282 				}
283 				if ((*q & 0xfc) != 0xdc) {
284 					/* invalid unicode */
285 					ret = -1;
286 					break;
287 				}
288 			}
289 		}
290 
291 		/*
292 		 * The second half of conversion.
293 		 * (convert ENCODING_UNICODE into any code)
294 		 */
295 		p = ucs;
296 		if (dp->convtype & KICONV_UCS_TO_UTF8) {
297 			q = (u_char *)dst;
298 			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
299 				/* decode surrogate pair */
300 				code = decode_surrogate(p);
301 			} else {
302 				code = (ucs[0] << 8) | ucs[1];
303 			}
304 
305 			if (casetype == KICONV_LOWER && dp->ctype) {
306 				code = towlower(code, dp->ctype);
307 			} else if (casetype == KICONV_UPPER && dp->ctype) {
308 				code = towupper(code, dp->ctype);
309 			}
310 
311 			outlen = 0;
312 			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
313 				ret = -1;
314 				break;
315 			}
316 
317 			src += inlen;
318 			ir -= inlen;
319 			dst += outlen;
320 			or -= outlen;
321 
322 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
323 			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
324 			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
325 			if (ret)
326 				break;
327 
328 			src += inlen;
329 			ir -= inlen;
330 
331 		} else {
332 			/* dst code is a proper subset of ENCODING_UNICODE */
333 			if (or < ucslen) {
334 				ret = -1;
335 				break;
336 			}
337 			src += inlen;
338 			ir -= inlen;
339 			or -= ucslen;
340 			if (dp->convtype & KICONV_UCS_TO_LE) {
341 				*dst++ = *(p + 1);
342 				*dst++ = *p;
343 				p += 2;
344 			} else {
345 				*dst++ = *p++;
346 				*dst++ = *p++;
347 			}
348 			if (ucslen == 4) {
349 				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
350 				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
351 					ret = -1;
352 					break;
353 				}
354 				if (dp->convtype & KICONV_UCS_TO_LE) {
355 					*dst++ = *(p + 1);
356 					*dst++ = *p;
357 				} else {
358 					*dst++ = *p++;
359 					*dst++ = *p;
360 				}
361 			}
362 		}
363 
364 		if (convchar == 1)
365 			break;
366 	}
367 
368 	*inbuf += in - ir;
369 	*outbuf += on - or;
370 	*inbytesleft -= in - ir;
371 	*outbytesleft -= on - or;
372 	return (ret);
373 }
374 
375 static int
376 iconv_ucs_init(struct iconv_converter_class *dcp)
377 {
378 	int error;
379 
380 	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
381 	if (error)
382 		return (error);
383 	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
384 	if (error)
385 		return (error);
386 	return (0);
387 }
388 
/*
 * Class teardown hook.  It does nothing and always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{

	return (0);
}
394 
395 static const char *
396 iconv_ucs_name(struct iconv_converter_class *dcp)
397 {
398 	return (ENCODING_UNICODE);
399 }
400 
401 static kobj_method_t iconv_ucs_methods[] = {
402 	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
403 	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
404 	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
405 	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
406 	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
407 	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
408 	{0, 0}
409 };
410 
411 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
412 
/*
 * Decode one UTF-8 sequence of up to four bytes.
 *
 * src		start of the sequence
 * utf8width	receives the number of bytes consumed (1..4); it is only
 *		written when the sequence was accepted
 * srclen	number of bytes available at src
 *
 * Returns the decoded code point, or 0 on failure: empty input, an
 * illegal lead byte, a truncated sequence, a bad continuation byte or
 * an overlong encoding.  A NUL byte also decodes to 0 and therefore
 * cannot be told apart from a failure by the return value alone.
 *
 * Surrogate and upper range checks are not done here.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that legitimately needs w bytes, by w */
	static const uint32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	size_t i, w;
	uint32_t ucs4;
	unsigned char c;

	/* nothing to look at */
	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	c = (unsigned char)*src;
	if ((c & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		ucs4 = c & 0x7f;
	} else if ((c & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		ucs4 = c & 0x1f;
	} else if ((c & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		ucs4 = c & 0x0f;
	} else if ((c & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		ucs4 = c & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	/* truncated sequence */
	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1 ; i < w ; i++) {
		c = (unsigned char)src[i];
		if ((c & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 = (ucs4 << 6) | (c & 0x3f);
	}

	/*
	 * Reject overlong forms.  Besides being invalid UTF-8, a 4-byte
	 * sequence carrying a value below 0x10000 would be handed to the
	 * surrogate encoder, which assumes code >= 0x10000.
	 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
482 
/*
 * Encode one code point as UTF-8.
 *
 * ucs4		code point to encode
 * dst		output buffer
 * utf8width	receives the number of bytes written (1..4); it is only
 *		written on success
 * dstlen	room available at dst
 *
 * Returns dst on success.  Returns NULL, writing nothing, when ucs4 is
 * above U+10FFFF or when dstlen is too small for the encoding.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0x00;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "110" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "1110" */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* "11110" */
	} else {
		/* beyond the last code point, U+10FFFF */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8, last byte first
	 */
	p = (u_char *)dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* take the low 6 bits and tag them as a continuation byte */
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	/* what is left of ucs4 fits below the lead bits */
	*p = ucs4 | lead;

	*utf8width = w;

	return (p);
}
526 
/*
 * Turn a code point above 0xffff into a UTF-16 surrogate pair packed
 * into 32 bits: high surrogate in the upper 16 bits, low surrogate in
 * the lower 16 bits.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset, hi, lo;

	offset = code - 0x10000;
	hi = (offset >> 10) & 0x3ff;	/* upper 10 bits of the offset */
	lo = offset & 0x3ff;		/* lower 10 bits of the offset */

	return (((0xd800u | hi) << 16) | (0xdc00u | lo));
}
533 
/*
 * Rebuild a code point from a surrogate pair stored in four bytes,
 * high surrogate first, each surrogate most significant byte first.
 * Only the ten payload bits of each surrogate are used.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t hi, lo;

	hi = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	lo = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((hi << 10) | lo) + 0x10000);
}
540 
541