1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2003, 2005 Ryuichiro Imura
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/param.h>
30 #include <sys/kernel.h>
31 #include <sys/systm.h>
32 #include <sys/malloc.h>
33 #include <sys/iconv.h>
34
35 #include "iconv_converter_if.h"
36
37 /*
38 * "UCS" converter
39 */
40
/*
 * Bits kept in iconv_ucs.convtype.  Each one is switched on by
 * iconv_ucs_open() and tested by iconv_ucs_conv().
 */
#define	KICONV_UCS_COMBINE	(1 << 0)	/* a second cspair (cspf) was supplied */
#define	KICONV_UCS_FROM_UTF8	(1 << 1)	/* source encoding is UTF-8 */
#define	KICONV_UCS_TO_UTF8	(1 << 2)	/* target encoding is UTF-8 */
#define	KICONV_UCS_FROM_LE	(1 << 3)	/* source 16-bit units are little-endian */
#define	KICONV_UCS_TO_LE	(1 << 4)	/* target 16-bit units are little-endian */
#define	KICONV_UCS_FROM_UTF16	(1 << 5)	/* source may carry surrogate pairs */
#define	KICONV_UCS_TO_UTF16	(1 << 6)	/* target may carry surrogate pairs */
#define	KICONV_UCS_UCS4		(1 << 7)	/* pivot encoding is UTF-16: 4-byte units allowed */

/* Encoding names this converter compares against. */
#define	ENCODING_UTF16		"UTF-16BE"
#define	ENCODING_UTF8		"UTF-8"
52
53 static struct {
54 const char *name;
55 int from_flag, to_flag;
56 } unicode_family[] = {
57 { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },
58 { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
59 { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },
60 { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
61 KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
62 { NULL, 0, 0 }
63 };
64
/* Forward declarations of the file-local UTF-8 / surrogate helpers. */
static uint32_t	 utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);
static uint32_t	 encode_surrogate(uint32_t code);
static uint32_t	 decode_surrogate(const u_char *ucs);
69
/*
 * Declare the dependency of iconv_ucs on libiconv (version arguments
 * 2, 2, 2) when the build environment provides MODULE_DEPEND.
 */
#if defined(MODULE_DEPEND)
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
73
74 /*
75 * UCS converter instance
76 */
77 struct iconv_ucs {
78 KOBJ_FIELDS;
79 int convtype;
80 struct iconv_cspair * d_csp;
81 struct iconv_cspair * d_cspf;
82 void * f_ctp;
83 void * t_ctp;
84 void * ctype;
85 };
86
87 static int
iconv_ucs_open(struct iconv_converter_class * dcp,struct iconv_cspair * csp,struct iconv_cspair * cspf,void ** dpp)88 iconv_ucs_open(struct iconv_converter_class *dcp,
89 struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
90 {
91 struct iconv_ucs *dp;
92 int i;
93 const char *from, *to;
94
95 dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
96 to = csp->cp_to;
97 from = cspf ? cspf->cp_from : csp->cp_from;
98
99 dp->convtype = 0;
100
101 if (cspf)
102 dp->convtype |= KICONV_UCS_COMBINE;
103 for (i = 0; unicode_family[i].name; i++) {
104 if (strcasecmp(from, unicode_family[i].name) == 0)
105 dp->convtype |= unicode_family[i].from_flag;
106 if (strcasecmp(to, unicode_family[i].name) == 0)
107 dp->convtype |= unicode_family[i].to_flag;
108 }
109 if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
110 dp->convtype |= KICONV_UCS_UCS4;
111 else
112 dp->convtype &= ~KICONV_UCS_UCS4;
113
114 dp->f_ctp = dp->t_ctp = NULL;
115 if (dp->convtype & KICONV_UCS_COMBINE) {
116 if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
117 (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
118 iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
119 }
120 if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
121 (dp->convtype & KICONV_UCS_TO_LE) == 0) {
122 iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
123 }
124 }
125
126 dp->ctype = NULL;
127 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
128 iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
129
130 dp->d_csp = csp;
131 if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
132 if (cspf) {
133 dp->d_cspf = cspf;
134 cspf->cp_refcount++;
135 } else
136 csp->cp_refcount++;
137 }
138 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
139 csp->cp_refcount++;
140 *dpp = (void*)dp;
141 return 0;
142 }
143
144 static int
iconv_ucs_close(void * data)145 iconv_ucs_close(void *data)
146 {
147 struct iconv_ucs *dp = data;
148
149 if (dp->f_ctp)
150 iconv_close(dp->f_ctp);
151 if (dp->t_ctp)
152 iconv_close(dp->t_ctp);
153 if (dp->ctype)
154 iconv_close(dp->ctype);
155 if (dp->d_cspf)
156 dp->d_cspf->cp_refcount--;
157 else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
158 dp->d_csp->cp_refcount--;
159 if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
160 dp->d_csp->cp_refcount--;
161 kobj_delete((struct kobj*)data, M_ICONV);
162 return 0;
163 }
164
165 static int
iconv_ucs_conv(void * d2p,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int convchar,int casetype)166 iconv_ucs_conv(void *d2p, const char **inbuf,
167 size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
168 int convchar, int casetype)
169 {
170 struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
171 int ret = 0, i;
172 size_t in, on, ir, or, inlen, outlen, ucslen;
173 const char *src, *p;
174 char *dst;
175 u_char ucs[4], *q;
176 uint32_t code;
177
178 if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
179 return 0;
180 ir = in = *inbytesleft;
181 or = on = *outbytesleft;
182 src = *inbuf;
183 dst = *outbuf;
184
185 while (ir > 0 && or > 0) {
186 /*
187 * The first half of conversion.
188 * (convert any code into ENCODING_UNICODE)
189 */
190 code = 0;
191 p = src;
192 if (dp->convtype & KICONV_UCS_FROM_UTF8) {
193 /* convert UTF-8 to ENCODING_UNICODE */
194 inlen = 0;
195 code = utf8_to_ucs4(p, &inlen, ir);
196 if (code == 0) {
197 ret = -1;
198 break;
199 }
200
201 if (casetype == KICONV_FROM_LOWER && dp->ctype) {
202 code = towlower(code, dp->ctype);
203 } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
204 code = towupper(code, dp->ctype);
205 }
206
207 if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
208 /* reserved for utf-16 surrogate pair */
209 /* invalid unicode */
210 ret = -1;
211 break;
212 }
213
214 if (inlen == 4) {
215 if (dp->convtype & KICONV_UCS_UCS4) {
216 ucslen = 4;
217 code = encode_surrogate(code);
218 } else {
219 /* can't handle with ucs-2 */
220 ret = -1;
221 break;
222 }
223 } else {
224 ucslen = 2;
225 }
226
227 /* save UCS-4 into ucs[] */
228 for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
229 *q++ = (code >> (i << 3)) & 0xff;
230
231 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
232 /* convert local code to ENCODING_UNICODE */
233 ucslen = 4;
234 inlen = ir;
235 q = ucs;
236 ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
237 &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
238 if (ret)
239 break;
240 inlen = ir - inlen;
241 ucslen = 4 - ucslen;
242
243 } else {
244 /* src code is a proper subset of ENCODING_UNICODE */
245 q = ucs;
246 if (dp->convtype & KICONV_UCS_FROM_LE) {
247 *q = *(p + 1);
248 *(q + 1) = *p;
249 p += 2;
250 } else {
251 *q = *p++;
252 *(q + 1) = *p++;
253 }
254 if ((*q & 0xfc) == 0xd8) {
255 if (dp->convtype & KICONV_UCS_UCS4 &&
256 dp->convtype & KICONV_UCS_FROM_UTF16) {
257 inlen = ucslen = 4;
258 } else {
259 /* invalid unicode */
260 ret = -1;
261 break;
262 }
263 } else {
264 inlen = ucslen = 2;
265 }
266 if (ir < inlen) {
267 ret = -1;
268 break;
269 }
270 if (ucslen == 4) {
271 q += 2;
272 if (dp->convtype & KICONV_UCS_FROM_LE) {
273 *q = *(p + 1);
274 *(q + 1) = *p;
275 } else {
276 *q = *p++;
277 *(q + 1) = *p;
278 }
279 if ((*q & 0xfc) != 0xdc) {
280 /* invalid unicode */
281 ret = -1;
282 break;
283 }
284 }
285 }
286
287 /*
288 * The second half of conversion.
289 * (convert ENCODING_UNICODE into any code)
290 */
291 p = ucs;
292 if (dp->convtype & KICONV_UCS_TO_UTF8) {
293 q = (u_char *)dst;
294 if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
295 /* decode surrogate pair */
296 code = decode_surrogate(p);
297 } else {
298 code = (ucs[0] << 8) | ucs[1];
299 }
300
301 if (casetype == KICONV_LOWER && dp->ctype) {
302 code = towlower(code, dp->ctype);
303 } else if (casetype == KICONV_UPPER && dp->ctype) {
304 code = towupper(code, dp->ctype);
305 }
306
307 outlen = 0;
308 if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
309 ret = -1;
310 break;
311 }
312
313 src += inlen;
314 ir -= inlen;
315 dst += outlen;
316 or -= outlen;
317
318 } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
319 ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
320 &or, casetype & (KICONV_LOWER | KICONV_UPPER));
321 if (ret)
322 break;
323
324 src += inlen;
325 ir -= inlen;
326
327 } else {
328 /* dst code is a proper subset of ENCODING_UNICODE */
329 if (or < ucslen) {
330 ret = -1;
331 break;
332 }
333 src += inlen;
334 ir -= inlen;
335 or -= ucslen;
336 if (dp->convtype & KICONV_UCS_TO_LE) {
337 *dst++ = *(p + 1);
338 *dst++ = *p;
339 p += 2;
340 } else {
341 *dst++ = *p++;
342 *dst++ = *p++;
343 }
344 if (ucslen == 4) {
345 if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
346 (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
347 ret = -1;
348 break;
349 }
350 if (dp->convtype & KICONV_UCS_TO_LE) {
351 *dst++ = *(p + 1);
352 *dst++ = *p;
353 } else {
354 *dst++ = *p++;
355 *dst++ = *p;
356 }
357 }
358 }
359
360 if (convchar == 1)
361 break;
362 }
363
364 *inbuf += in - ir;
365 *outbuf += on - or;
366 *inbytesleft -= in - ir;
367 *outbytesleft -= on - or;
368 return (ret);
369 }
370
371 static int
iconv_ucs_init(struct iconv_converter_class * dcp)372 iconv_ucs_init(struct iconv_converter_class *dcp)
373 {
374 int error;
375
376 error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
377 if (error)
378 return (error);
379 error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
380 if (error)
381 return (error);
382 return (0);
383 }
384
/*
 * Class teardown hook.  Nothing is released here; the class argument is
 * unused and the result is always 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;

	return (0);
}
390
391 static const char *
iconv_ucs_name(struct iconv_converter_class * dcp)392 iconv_ucs_name(struct iconv_converter_class *dcp)
393 {
394 return (ENCODING_UNICODE);
395 }
396
397 static kobj_method_t iconv_ucs_methods[] = {
398 KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
399 KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
400 KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
401 KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
402 KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
403 KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
404 {0, 0}
405 };
406
407 KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
408
/*
 * Decode one UTF-8 sequence of at most four bytes.
 *
 * src		first byte of the sequence.
 * utf8width	receives the sequence length (1..4) on success; left
 *		untouched on failure.
 * srclen	number of readable bytes at src.
 *
 * Returns the decoded value, or 0 on failure.  Failure covers: empty
 * input, a lead byte that does not start a 1..4 byte sequence, a
 * sequence longer than srclen, a continuation byte not of the form
 * 10xxxxxx, and an overlong encoding (a value that fits in a shorter
 * sequence, e.g. C0 AF for '/').  Because 0 doubles as the failure
 * value, an encoded NUL is reported as a failure too.
 *
 * Surrogate and > 0x10FFFF range checks are not done here; they are
 * left to the caller.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest value that legitimately needs a sequence of width w */
	static const uint32_t min_value[5] = {
		0, 0, 0x80, 0x800, 0x10000
	};
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w;
	uint32_t ucs4;

	/* nothing to read: fail before touching *src */
	if (srclen == 0)
		return (0);

	/*
	 * classify the lead byte and keep its payload bits
	 */
	if ((s[0] & 0x80) == 0) {
		/*
		 * utf-8: 0xxxxxxx
		 * ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		ucs4 = s[0] & 0x7f;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/*
		 * utf-8: 110xxxxx 10yyyyyy
		 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		ucs4 = s[0] & 0x1f;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/*
		 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		ucs4 = s[0] & 0x0f;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/*
		 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		ucs4 = s[0] & 0x07;
	} else {
		/* stray continuation byte, or a 5+ byte lead */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * fold in the continuation bytes, six payload bits each
	 */
	for (i = 1; i < w; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
	}

	/*
	 * Reject overlong forms: accepting them would let characters such
	 * as '/' slip past byte-level checks performed on the UTF-8 text.
	 */
	if (ucs4 < min_value[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
478
/*
 * Encode one code point as UTF-8.
 *
 * ucs4		value to encode.
 * dst		output buffer.
 * utf8width	receives the number of bytes written (1..4) on success;
 *		left untouched on failure.
 * dstlen	room available at dst.
 *
 * Returns dst (as u_char *) on success.  Returns NULL, writing nothing,
 * when ucs4 is above 0x10FFFF (the largest Unicode code point) or when
 * the encoding does not fit in dstlen bytes.
 *
 * Values in the surrogate range 0xD800..0xDFFF are not filtered here.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and the marker bits of the lead byte
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0x00;	/* 0xxxxxxx */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* 110xxxxx */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* 1110xxxx */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* 11110xxx */
	} else {
		/* beyond the Unicode code space */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8, last byte first
	 */
	p = (u_char *)dst;
	for (i = w - 1; i >= 1; i--) {
		/* low 6 bits go into a 10xxxxxx continuation byte */
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	/* whatever is left fits beside the lead marker */
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
522
/*
 * Pack a supplementary code point into a UTF-16 surrogate pair held in
 * one 32-bit word: high surrogate in the upper 16 bits, low surrogate in
 * the lower 16 bits.  No range check is made on the argument.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset, high_bits, low_bits;

	offset = code - 0x10000;
	/* bits 10..19 of the offset land in bits 16..25 of the result */
	high_bits = (offset << 6) & 0x3ff0000;
	/* bits 0..9 of the offset stay where they are */
	low_bits = offset & 0x3ff;

	return (high_bits | low_bits | 0xd800dc00);
}
529
/*
 * Rebuild a code point from a surrogate pair stored as four bytes:
 * ucs[0..1] is the first 16-bit unit, ucs[2..3] the second, each with
 * its more significant byte first.  Only the low 10 bits of each unit
 * are used; the surrogate markers themselves are not verified.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t upper, lower;

	/* 10 payload bits of each unit */
	upper = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	lower = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((upper << 10) | lower) + 0x10000);
}
536