xref: /illumos-gate/usr/src/cmd/localedef/wide.c (revision 6b5e5868e7ebf1aff3a5abd7d0c4ef0e5fbf3648)
1 /*
2  * This file and its contents are supplied under the terms of the
3  * Common Development and Distribution License ("CDDL"), version 1.0.
4  * You may only use this file in accordance with the terms version 1.0
5  * of the CDDL.
6  *
7  * A full copy of the text of the CDDL should have accompanied this
8  * source.  A copy of the CDDL is also available via the Internet at
9  * http://www.illumos.org/license/CDDL.
10  */
11 
12 /*
13  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
14  */
15 
16 /*
17  * The functions in this file convert from the standard multibyte forms
18  * to the wide character forms used internally by libc.  Unfortunately,
19  * this approach means that we need a method for each and every encoding.
20  */
21 
22 #include <stdlib.h>
23 #include <wchar.h>
24 #include <string.h>
25 #include <sys/types.h>
26 #include "localedef.h"
27 
/*
 * Multibyte -> wide decoders, one per supported encoding.  Each takes the
 * destination wide character, the multibyte input and the number of input
 * bytes available.
 */
static int towide_none(wchar_t *wc, const char *mb, int n);
static int towide_utf8(wchar_t *wc, const char *mb, int n);
static int towide_big5(wchar_t *wc, const char *mb, int n);
static int towide_gbk(wchar_t *wc, const char *mb, int n);
static int towide_gb2312(wchar_t *wc, const char *mb, int n);
static int towide_gb18030(wchar_t *wc, const char *mb, int n);
static int towide_mskanji(wchar_t *wc, const char *mb, int n);
static int towide_euccn(wchar_t *wc, const char *mb, int n);
static int towide_eucjp(wchar_t *wc, const char *mb, int n);
static int towide_euckr(wchar_t *wc, const char *mb, int n);
static int towide_euctw(wchar_t *wc, const char *mb, int n);

/* Wide -> multibyte encoders. */
static int tomb_none(char *mb, wchar_t wc);
static int tomb_utf8(char *mb, wchar_t wc);
static int tomb_mbs(char *mb, wchar_t wc);

/* Currently selected converters and encoding name; see set_wide_encoding(). */
static int (*_towide)(wchar_t *wc, const char *mb, int n) = towide_none;
static int (*_tomb)(char *mb, wchar_t wc) = tomb_none;
static const char *_encoding = "NONE";
47 
48 /*
49  * Table of supported encodings.  We only bother to list the multibyte
50  * encodings here, because single byte locales are handed by "NONE".
51  */
52 static struct {
53 	const char *name;
54 	/* the name that the underlying libc implemenation uses */
55 	const char *cname;
56 	int (*towide)(wchar_t *, const char *, int);
57 	int (*tomb)(char *, wchar_t);
58 } mb_encodings[] = {
59 	{ "UTF-8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
60 	{ "UTF8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
61 	{ "utf8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
62 	{ "utf-8",	"UTF-8",	towide_utf8,	tomb_utf8 },
63 
64 	{ "EUC-CN",	"EUC-CN",	towide_euccn,	tomb_mbs },
65 	{ "eucCN",	"EUC-CN",	towide_euccn,	tomb_mbs },
66 
67 	{ "EUC-JP",	"EUC-JP",	towide_eucjp,	tomb_mbs },
68 	{ "eucJP",	"EUC-JP",	towide_eucjp,	tomb_mbs },
69 
70 	{ "EUC-KR",	"EUC-KR",	towide_euckr,	tomb_mbs },
71 	{ "eucKR",	"EUC-KR",	towide_euckr,	tomb_mbs },
72 
73 	{ "EUC-TW",	"EUC-TW",	towide_euctw,	tomb_mbs },
74 	{ "eucTW",	"EUC-TW",	towide_euctw,	tomb_mbs },
75 
76 	{ "MS_Kanji",	"MSKanji",	towide_mskanji,	tomb_mbs },
77 	{ "MSKanji",	"MSKanji",	towide_mskanji,	tomb_mbs },
78 	{ "PCK",	"MSKanji",	towide_mskanji,	tomb_mbs },
79 	{ "SJIS",	"MSKanji",	towide_mskanji,	tomb_mbs },
80 	{ "Shift_JIS",	"MSKanji",	towide_mskanji,	tomb_mbs },
81 
82 	{ "BIG5",	"BIG5",		towide_big5,	tomb_mbs },
83 	{ "big5",	"BIG5",		towide_big5,	tomb_mbs },
84 	{ "Big5",	"BIG5",		towide_big5,	tomb_mbs },
85 
86 	{ "GBK",	"GBK",		towide_gbk,	tomb_mbs },
87 
88 	{ "GB18030",	"GB18030",	towide_gb18030,	tomb_mbs },
89 
90 	{ "GB2312",	"GB2312",	towide_gb2312,	tomb_mbs },
91 
92 	{ "ASCII",	"ASCII",	towide_none,	tomb_none },
93 	{ "US-ASCII",	"ASCII",	towide_none,	tomb_none },
94 	{ "646",	"ASCII",	towide_none,	tomb_none },
95 
96 	{ NULL, NULL },
97 };
98 
/*
 * Render a multibyte sequence for use in a diagnostic message.
 *
 * If the first byte is a printable (graphical) ASCII character, only that
 * single character is returned.  Otherwise every byte up to the terminating
 * NUL is rendered as a "\xNN" escape, truncated to what fits in the buffer.
 *
 * The result lives in a static buffer that is overwritten by the next call.
 */
static char *
show_mb(const char *mb)
{
	static char buf[64];
	/*
	 * Walk the input as unsigned bytes: a plain (signed) char >= 0x80
	 * would be sign-extended when promoted for "%02x" and print as
	 * "ffffffNN" instead of "NN".
	 */
	const unsigned char *p = (const unsigned char *)mb;
	size_t used = 0;

	/* ASCII stuff we just print */
	if (*p < 0x80 && isgraph(*p)) {
		buf[0] = (char)*p;
		buf[1] = 0;
		return (buf);
	}

	buf[0] = 0;
	for (; *p != 0; p++) {
		size_t room = sizeof (buf) - used;
		int w = snprintf(buf + used, room, "\\x%02x", *p);

		/* stop once the buffer is full; snprintf kept it terminated */
		if (w < 0 || (size_t)w >= room)
			break;
		used += (size_t)w;
	}
	return (buf);
}
119 
/* Most recent conversion error message (heap allocated), or NULL if none. */
static char	*widemsg;

/*
 * Record a printf-style formatted conversion error in widemsg, replacing
 * (and freeing) any previously recorded message.  Nothing is printed here;
 * callers decide whether and when to report the message.
 *
 * If the message cannot be formatted, widemsg is left NULL.
 */
void
werr(const char *fmt, ...)
{
	char	*msg = NULL;
	va_list	va;

	va_start(va, fmt);
	/* on failure the contents of msg are not defined, so reset it */
	if (vasprintf(&msg, fmt, va) < 0)
		msg = NULL;
	va_end(va);

	free(widemsg);
	widemsg = msg;
}
135 
136 /*
137  * This is used for 8-bit encodings.
138  */
139 int
140 towide_none(wchar_t *c, const char *mb, int n)
141 {
142 	if (mb_cur_max != 1) {
143 		werr("invalid or unsupported multibyte locale");
144 		return (-1);
145 	}
146 	if (n < 1) {
147 		werr("no character data");
148 		return (-1);
149 	}
150 	*c = (uint8_t)*mb;
151 	return (1);
152 }
153 
154 int
155 tomb_none(char *mb, wchar_t wc)
156 {
157 	if (mb_cur_max != 1) {
158 		werr("invalid or unsupported multibyte locale");
159 		return (-1);
160 	}
161 	*(uint8_t *)mb = (wc & 0xff);
162 	mb[1] = 0;
163 	return (1);
164 }
165 
/*
 * UTF-8 stores wide characters in UTF-32 form.
 *
 * Decodes one UTF-8 sequence of at most n bytes from mb into *wc.
 * Returns the number of bytes consumed (1-4), or -1 after recording a
 * message with werr() when the input is empty, truncated, uses a lead byte
 * outside the 1-4 byte forms, has a bad continuation byte, or is an
 * overlong ("redundant") encoding of a smaller value.
 */
int
towide_utf8(wchar_t *wc, const char *mb, int n)
{
	wchar_t	c;
	int	nb;
	int	lv;	/* lowest legal value */
	int	i;
	const uint8_t *s = (const uint8_t *)mb;

	if (n < 1) {
		werr("no utf8 data");
		return (-1);
	}
	c = *s;

	if ((c & 0x80) == 0) {
		/* 7-bit ASCII */
		*wc = c;
		return (1);
	} else if ((c & 0xe0) == 0xc0) {
		/* u80-u7ff - two bytes encoded */
		nb = 2;
		lv = 0x80;
		c &= ~0xe0;
	} else if ((c & 0xf0) == 0xe0) {
		/* u800-uffff - three bytes encoded */
		nb = 3;
		lv = 0x800;
		c &= ~0xf0;
	} else if ((c & 0xf8) == 0xf0) {
		/*
		 * u10000-u1fffff - four bytes encoded.  Anything below
		 * 0x10000 fits in three bytes, so a four byte form of it
		 * is an overlong encoding and must be rejected.
		 */
		nb = 4;
		lv = 0x10000;
		c &= ~0xf8;
	} else {
		/* 5 and 6 byte encodings are not legal unicode */
		werr("utf8 encoding too large (%s)", show_mb(mb));
		return (-1);
	}
	if (nb > n) {
		werr("incomplete utf8 sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* fold in six payload bits from each continuation byte */
	for (i = 1; i < nb; i++) {
		if (((s[i]) & 0xc0) != 0x80) {
			werr("illegal utf8 byte (%x)", s[i]);
			return (-1);
		}
		c <<= 6;
		c |= (s[i] & 0x3f);
	}

	if (c < lv) {
		werr("illegal redundant utf8 encoding (%s)", show_mb(mb));
		return (-1);
	}
	*wc = c;
	return (nb);
}
229 
/*
 * Encode wc as a NUL terminated UTF-8 sequence in mb.
 *
 * Returns the number of bytes written, not counting the terminator (1-4),
 * or -1 after werr() if wc is larger than 0x1fffff.  The caller must supply
 * room for up to five bytes.
 */
int
tomb_utf8(char *mb, wchar_t wc)
{
	uint8_t	*out = (uint8_t *)mb;
	uint8_t	lead;
	int	len;
	int	pos;

	/* values up to 0x7f are emitted as a single byte */
	if (wc <= 0x7f) {
		out[0] = wc & 0x7f;
		out[1] = 0;
		return (1);
	}

	if (wc > 0x1fffff) {
		werr("illegal uf8 char (%x)", wc);
		return (-1);
	}

	/* choose the sequence length and the matching lead byte marker */
	if (wc > 0xffff) {
		len = 4;
		lead = 0xf0;
	} else if (wc > 0x7ff) {
		len = 3;
		lead = 0xe0;
	} else {
		len = 2;
		lead = 0xc0;
	}

	/* continuation bytes are written last to first, six bits apiece */
	for (pos = len - 1; pos > 0; pos--) {
		out[pos] = (wc & 0x3f) | 0x80;
		wc >>= 6;
	}
	out[0] = lead | wc;
	out[len] = 0;
	return (len);
}
264 
/*
 * Several encodings share a simplistic dual byte encoding.  In these
 * forms, they all indicate that a two byte sequence is to be used if
 * the first byte has its high bit set.  They all store this simple
 * encoding as a 16-bit value, although a great many of the possible
 * code points are not used in most character sets.  This gives a possible
 * set of just over 32,000 valid code points.
 *
 * 0x00 - 0x7f		- 1 byte encoding
 * 0x80 - 0x7fff	- illegal
 * 0x8000 - 0xffff	- 2 byte encoding
 *
 * Returns the number of bytes consumed (1 or 2), or -1 after werr() when
 * fewer bytes are available than the sequence needs.
 */
static int
towide_dbcs(wchar_t *wc, const char *mb, int n)
{
	const uint8_t *s = (const uint8_t *)mb;
	wchar_t	c;

	/* validate the length before touching the input at all */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];
	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = c;
		return (1);
	}
	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* Store both bytes as a single 16-bit wide. */
	c <<= 8;
	c |= s[1];
	*wc = c;
	return (2);
}
304 
/*
 * Generic wide -> multibyte encoder used by most multibyte locales: the
 * 32-bit value is written most significant byte first, with leading zero
 * bytes dropped (a value of zero still produces one byte).
 *
 * A terminating NUL is appended, so mb must hold up to five bytes.
 * Returns the number of bytes written, not counting the terminator (1-4).
 */
int
tomb_mbs(char *mb, wchar_t wc)
{
	uint8_t	*out = (uint8_t *)mb;
	int	len;
	int	pos;

	/* the highest non-zero byte decides how many bytes are emitted */
	if ((wc & 0xff000000U) != 0)
		len = 4;
	else if ((wc & 0x00ff0000U) != 0)
		len = 3;
	else if ((wc & 0x0000ff00U) != 0)
		len = 2;
	else
		len = 1;

	/* fill from the last byte backwards so the result is big-endian */
	for (pos = len - 1; pos >= 0; pos--) {
		out[pos] = wc & 0xff;
		wc >>= 8;
	}

	/* ensure null termination */
	out[len] = 0;
	return (len);
}
335 
336 
/*
 * big5 is a simple dual byte character set, so decoding is delegated
 * entirely to the shared DBCS decoder.
 */
int
towide_big5(wchar_t *wc, const char *mb, int n)
{
	int	consumed;

	consumed = towide_dbcs(wc, mb, n);
	return (consumed);
}
345 
/*
 * GBK encodes wides in the same way that big5 does: the high order
 * bit of the first byte indicates a double byte character.  Decoding is
 * delegated to the shared DBCS decoder.
 */
int
towide_gbk(wchar_t *wc, const char *mb, int n)
{
	int	consumed;

	consumed = towide_dbcs(wc, mb, n);
	return (consumed);
}
355 
/*
 * GB2312 is another DBCS.  It is cleaner than the others in that the
 * second byte never encodes ASCII.  This decoder does not rely on that
 * property; it simply delegates to the shared DBCS decoder.
 */
int
towide_gb2312(wchar_t *wc, const char *mb, int n)
{
	int	consumed;

	consumed = towide_dbcs(wc, mb, n);
	return (consumed);
}
365 
/*
 * GB18030.  This encodes as 8, 16, or 32-bits.
 * 7-bit values are in 1 byte,  4 byte sequences are used when
 * the second byte encodes 0x30-39 and all other sequences are 2 bytes.
 *
 * The wide character is the input bytes packed big-endian.  Returns the
 * number of bytes consumed (1, 2 or 4), or -1 after werr() when fewer
 * bytes are available than the sequence needs.
 */
int
towide_gb18030(wchar_t *wc, const char *mb, int n)
{
	const uint8_t *s = (const uint8_t *)mb;
	wchar_t	c;

	/* validate the length before touching the input at all */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];
	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = c;
		return (1);
	}
	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* pull in the second byte */
	c <<= 8;
	c |= s[1];

	/* a second byte in the digit range selects the four byte form */
	if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) {
		if (n < 4) {
			werr("incomplete 4-byte character sequence (%s)",
			    show_mb(mb));
			return (-1);
		}
		c <<= 8;
		c |= s[2];
		c <<= 8;
		c |= s[3];
		*wc = c;
		return (4);
	}

	*wc = c;
	return (2);
}
413 
/*
 * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it
 * also has a range of single byte characters above 0x80.  (0xa1-0xdf).
 *
 * Returns the number of bytes consumed (1 or 2), or -1 after werr() when
 * fewer bytes are available than the sequence needs.
 */
int
towide_mskanji(wchar_t *wc, const char *mb, int n)
{
	const uint8_t *s = (const uint8_t *)mb;
	wchar_t	c;

	/* validate the length before touching the input at all */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];
	if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) {
		/*
		 * Single byte: 7-bit, or the 0xa1-0xdf single byte range.
		 * This is a successful one byte conversion, so report one
		 * byte consumed rather than an error.
		 */
		*wc = c;
		return (1);
	}

	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* Store both bytes as a single 16-bit wide. */
	c <<= 8;
	c |= s[1];
	*wc = c;
	return (2);
}
446 
/*
 * EUC forms.  EUC encodings are "variable".  FreeBSD carries some additional
 * variable data to encode these, but we're going to treat each as independent
 * instead.  It's the only way we can sensibly move forward.
 *
 * Note that the way in which the different EUC forms vary is how wide
 * CS2 and CS3 are and what the first byte of them is.
 *
 * cs2/cs3 are the lead bytes that introduce code sets 2 and 3 (0 when the
 * code set is unused) and cs2width/cs3width are the total sequence widths
 * in bytes, lead byte included.  The wide character is the sequence bytes
 * packed big-endian.
 *
 * Returns the number of bytes consumed, or -1 after werr() when the input
 * is empty, truncated, or starts with a byte that introduces no code set.
 */
static int
towide_euc_impl(wchar_t *wc, const char *mb, int n,
    uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
{
	const uint8_t *s = (const uint8_t *)mb;
	int i;
	int width;
	wchar_t	c;

	/* validate the length before touching the input at all */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];

	/*
	 * All variations of EUC encode 7-bit ASCII as one byte, and use
	 * additional bytes for more than that.
	 */
	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = c;
		return (1);
	}

	/*
	 * All EUC variants reserve 0xa1-0xff to identify CS1, which
	 * is always two bytes wide.  Note that unused CS will be zero,
	 * and that cannot be true because we know that the high order
	 * bit must be set.
	 */
	if (c >= 0xa1) {
		width = 2;
	} else if (c == cs2) {
		width = cs2width;
	} else if (c == cs3) {
		width = cs3width;
	} else {
		/*
		 * 0x80-0xa0 that is neither the CS2 nor the CS3 lead byte:
		 * there is no width to use, so reject the input instead of
		 * continuing with an uninitialized value.
		 */
		werr("invalid EUC lead byte (%s)", show_mb(mb));
		return (-1);
	}

	if (n < width) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	for (i = 1; i < width; i++) {
		/* pull in the next byte */
		c <<= 8;
		c |= s[i];
	}

	*wc = c;
	return (width);
}
508 
/*
 * EUC-CN decoder.  As configured here:
 *
 * Code set 0 (ASCII):		one byte
 * Code set 1:			two bytes, lead byte 0xA1-0xFF
 * Code set 2:			four bytes, lead byte 0x8E
 * Code set 3:			unused
 *
 * NOTE(review): these are the same parameters as EUC-TW.  EUC-CN is
 * normally described as having no code set 2; confirm whether the 0x8E
 * handling is intended here.
 */
int
towide_euccn(wchar_t *wc, const char *mb, int n)
{
	const uint8_t	cs2_lead = 0x8e;
	const uint8_t	cs2_width = 4;

	return (towide_euc_impl(wc, mb, n, cs2_lead, cs2_width, 0, 0));
}
522 
/*
 * EUC-JP encodes as follows:
 *
 * Code set 0 (ASCII or JIS X 0201-1976 Roman):	0x21-0x7E
 * Code set 1 (JIS X 0208):			0xA1A1-0xFEFE
 * Code set 2 (half-width katakana):		0x8EA1-0x8EDF
 * Code set 3 (JIS X 0212-1990):		0x8FA1A1-0x8FFEFE
 *
 * Hence CS2 is two bytes introduced by 0x8E and CS3 is three bytes
 * introduced by 0x8F.
 */
int
towide_eucjp(wchar_t *wc, const char *mb, int n)
{
	const uint8_t	cs2_lead = 0x8e;
	const uint8_t	cs2_width = 2;
	const uint8_t	cs3_lead = 0x8f;
	const uint8_t	cs3_width = 3;

	return (towide_euc_impl(wc, mb, n, cs2_lead, cs2_width,
	    cs3_lead, cs3_width));
}
536 
/*
 * EUC-KR encodes as follows:
 *
 * Code set 0 (ASCII or KS C 5636-1993):	0x21-0x7E
 * Code set 1 (KS C 5601-1992):			0xA1A1-0xFEFE
 * Code set 2:					unused
 * Code set 3:					unused
 *
 * Both optional code sets are therefore passed as zero.
 */
int
towide_euckr(wchar_t *wc, const char *mb, int n)
{
	const uint8_t	unused = 0;

	return (towide_euc_impl(wc, mb, n, unused, unused, unused, unused));
}
550 
/*
 * EUC-TW encodes as follows:
 *
 * Code set 0 (ASCII):				0x21-0x7E
 * Code set 1 (CNS 11643-1992 Plane 1):		0xA1A1-0xFEFE
 * Code set 2 (CNS 11643-1992 Planes 1-16):	0x8EA1A1A1-0x8EB0FEFE
 * Code set 3:					unused
 *
 * Hence CS2 is four bytes introduced by 0x8E and CS3 is passed as zero.
 */
int
towide_euctw(wchar_t *wc, const char *mb, int n)
{
	const uint8_t	cs2_lead = 0x8e;
	const uint8_t	cs2_width = 4;

	return (towide_euc_impl(wc, mb, n, cs2_lead, cs2_width, 0, 0));
}
564 
565 /*
566  * Public entry points.
567  */
568 
/*
 * Convert the leading character of the NUL terminated string mb to a wide
 * character using the currently selected decoder.  The terminating NUL is
 * counted as available input.
 *
 * Returns whatever the decoder returns.  This won't fail hard: no error is
 * printed here on failure.
 */
int
to_wide(wchar_t *wc, const char *mb)
{
	int	avail;

	avail = strlen(mb) + 1;
	return (_towide(wc, mb, avail));
}
575 
576 int
577 to_mb(char *mb, wchar_t wc)
578 {
579 	int	rv;
580 
581 	if ((rv = _tomb(mb, wc)) < 0) {
582 		errf(widemsg);
583 		free(widemsg);
584 		widemsg = NULL;
585 	}
586 	return (rv);
587 }
588 
589 char *
590 to_mb_string(const wchar_t *wcs)
591 {
592 	char	*mbs;
593 	char	*ptr;
594 	int	len;
595 
596 	mbs = malloc((wcslen(wcs) * mb_cur_max) + 1);
597 	if (mbs == NULL) {
598 		errf("out of memory");
599 		return (NULL);
600 	}
601 	ptr = mbs;
602 	while (*wcs) {
603 		if ((len = to_mb(ptr, *wcs)) < 0) {
604 			INTERR;
605 			free(mbs);
606 			return (NULL);
607 		}
608 		wcs++;
609 		ptr += len;
610 	}
611 	*ptr = 0;
612 	return (mbs);
613 }
614 
615 void
616 set_wide_encoding(const char *encoding)
617 {
618 	int i;
619 
620 	_towide = towide_none;
621 	_tomb = tomb_none;
622 	_encoding = "NONE";
623 
624 	for (i = 0; mb_encodings[i].name; i++) {
625 		if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
626 			_towide = mb_encodings[i].towide;
627 			_tomb = mb_encodings[i].tomb;
628 			_encoding = mb_encodings[i].cname;
629 		}
630 	}
631 }
632 
633 const char *
634 get_wide_encoding(void)
635 {
636 	return (_encoding);
637 }
638