xref: /titanic_53/usr/src/cmd/localedef/wide.c (revision 6b5e5868e7ebf1aff3a5abd7d0c4ef0e5fbf3648)
1*6b5e5868SGarrett D'Amore /*
2*6b5e5868SGarrett D'Amore  * This file and its contents are supplied under the terms of the
3*6b5e5868SGarrett D'Amore  * Common Development and Distribution License ("CDDL"), version 1.0.
4*6b5e5868SGarrett D'Amore  * You may only use this file in accordance with the terms version 1.0
5*6b5e5868SGarrett D'Amore  * of the CDDL.
6*6b5e5868SGarrett D'Amore  *
7*6b5e5868SGarrett D'Amore  * A full copy of the text of the CDDL should have accompanied this
8*6b5e5868SGarrett D'Amore  * source.  A copy of the CDDL is also available via the Internet at
9*6b5e5868SGarrett D'Amore  * http://www.illumos.org/license/CDDL.
10*6b5e5868SGarrett D'Amore  */
11*6b5e5868SGarrett D'Amore 
12*6b5e5868SGarrett D'Amore /*
13*6b5e5868SGarrett D'Amore  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
14*6b5e5868SGarrett D'Amore  */
15*6b5e5868SGarrett D'Amore 
16*6b5e5868SGarrett D'Amore /*
17*6b5e5868SGarrett D'Amore  * The functions in this file convert from the standard multibyte forms
18*6b5e5868SGarrett D'Amore  * to the wide character forms used internally by libc.  Unfortunately,
19*6b5e5868SGarrett D'Amore  * this approach means that we need a method for each and every encoding.
20*6b5e5868SGarrett D'Amore  */
21*6b5e5868SGarrett D'Amore 
22*6b5e5868SGarrett D'Amore #include <stdlib.h>
23*6b5e5868SGarrett D'Amore #include <wchar.h>
24*6b5e5868SGarrett D'Amore #include <string.h>
25*6b5e5868SGarrett D'Amore #include <sys/types.h>
26*6b5e5868SGarrett D'Amore #include "localedef.h"
27*6b5e5868SGarrett D'Amore 
/*
 * Decoders: convert the multibyte sequence at "mb" (with "n" bytes
 * available) into a single wide character stored through "wc".
 */
static int towide_none(wchar_t *wc, const char *mb, int n);
static int towide_utf8(wchar_t *wc, const char *mb, int n);
static int towide_big5(wchar_t *wc, const char *mb, int n);
static int towide_gbk(wchar_t *wc, const char *mb, int n);
static int towide_gb2312(wchar_t *wc, const char *mb, int n);
static int towide_gb18030(wchar_t *wc, const char *mb, int n);
static int towide_mskanji(wchar_t *wc, const char *mb, int n);
static int towide_euccn(wchar_t *wc, const char *mb, int n);
static int towide_eucjp(wchar_t *wc, const char *mb, int n);
static int towide_euckr(wchar_t *wc, const char *mb, int n);
static int towide_euctw(wchar_t *wc, const char *mb, int n);

/*
 * Encoders: write the multibyte form of "wc", followed by a NUL, to "mb".
 */
static int tomb_none(char *mb, wchar_t wc);
static int tomb_utf8(char *mb, wchar_t wc);
static int tomb_mbs(char *mb, wchar_t wc);

/*
 * Converters and name for the encoding currently in effect.  These start
 * out as the single byte "NONE" handlers and are replaced by
 * set_wide_encoding().
 */
static int (*_towide)(wchar_t *, const char *, int) = towide_none;
static int (*_tomb)(char *, wchar_t) = tomb_none;
static const char *_encoding = "NONE";
47*6b5e5868SGarrett D'Amore 
48*6b5e5868SGarrett D'Amore /*
49*6b5e5868SGarrett D'Amore  * Table of supported encodings.  We only bother to list the multibyte
50*6b5e5868SGarrett D'Amore  * encodings here, because single byte locales are handed by "NONE".
51*6b5e5868SGarrett D'Amore  */
52*6b5e5868SGarrett D'Amore static struct {
53*6b5e5868SGarrett D'Amore 	const char *name;
54*6b5e5868SGarrett D'Amore 	/* the name that the underlying libc implemenation uses */
55*6b5e5868SGarrett D'Amore 	const char *cname;
56*6b5e5868SGarrett D'Amore 	int (*towide)(wchar_t *, const char *, int);
57*6b5e5868SGarrett D'Amore 	int (*tomb)(char *, wchar_t);
58*6b5e5868SGarrett D'Amore } mb_encodings[] = {
59*6b5e5868SGarrett D'Amore 	{ "UTF-8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
60*6b5e5868SGarrett D'Amore 	{ "UTF8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
61*6b5e5868SGarrett D'Amore 	{ "utf8",	"UTF-8",	 towide_utf8,	tomb_utf8 },
62*6b5e5868SGarrett D'Amore 	{ "utf-8",	"UTF-8",	towide_utf8,	tomb_utf8 },
63*6b5e5868SGarrett D'Amore 
64*6b5e5868SGarrett D'Amore 	{ "EUC-CN",	"EUC-CN",	towide_euccn,	tomb_mbs },
65*6b5e5868SGarrett D'Amore 	{ "eucCN",	"EUC-CN",	towide_euccn,	tomb_mbs },
66*6b5e5868SGarrett D'Amore 
67*6b5e5868SGarrett D'Amore 	{ "EUC-JP",	"EUC-JP",	towide_eucjp,	tomb_mbs },
68*6b5e5868SGarrett D'Amore 	{ "eucJP",	"EUC-JP",	towide_eucjp,	tomb_mbs },
69*6b5e5868SGarrett D'Amore 
70*6b5e5868SGarrett D'Amore 	{ "EUC-KR",	"EUC-KR",	towide_euckr,	tomb_mbs },
71*6b5e5868SGarrett D'Amore 	{ "eucKR",	"EUC-KR",	towide_euckr,	tomb_mbs },
72*6b5e5868SGarrett D'Amore 
73*6b5e5868SGarrett D'Amore 	{ "EUC-TW",	"EUC-TW",	towide_euctw,	tomb_mbs },
74*6b5e5868SGarrett D'Amore 	{ "eucTW",	"EUC-TW",	towide_euctw,	tomb_mbs },
75*6b5e5868SGarrett D'Amore 
76*6b5e5868SGarrett D'Amore 	{ "MS_Kanji",	"MSKanji",	towide_mskanji,	tomb_mbs },
77*6b5e5868SGarrett D'Amore 	{ "MSKanji",	"MSKanji",	towide_mskanji,	tomb_mbs },
78*6b5e5868SGarrett D'Amore 	{ "PCK",	"MSKanji",	towide_mskanji,	tomb_mbs },
79*6b5e5868SGarrett D'Amore 	{ "SJIS",	"MSKanji",	towide_mskanji,	tomb_mbs },
80*6b5e5868SGarrett D'Amore 	{ "Shift_JIS",	"MSKanji",	towide_mskanji,	tomb_mbs },
81*6b5e5868SGarrett D'Amore 
82*6b5e5868SGarrett D'Amore 	{ "BIG5",	"BIG5",		towide_big5,	tomb_mbs },
83*6b5e5868SGarrett D'Amore 	{ "big5",	"BIG5",		towide_big5,	tomb_mbs },
84*6b5e5868SGarrett D'Amore 	{ "Big5",	"BIG5",		towide_big5,	tomb_mbs },
85*6b5e5868SGarrett D'Amore 
86*6b5e5868SGarrett D'Amore 	{ "GBK",	"GBK",		towide_gbk,	tomb_mbs },
87*6b5e5868SGarrett D'Amore 
88*6b5e5868SGarrett D'Amore 	{ "GB18030",	"GB18030",	towide_gb18030,	tomb_mbs },
89*6b5e5868SGarrett D'Amore 
90*6b5e5868SGarrett D'Amore 	{ "GB2312",	"GB2312",	towide_gb2312,	tomb_mbs },
91*6b5e5868SGarrett D'Amore 
92*6b5e5868SGarrett D'Amore 	{ "ASCII",	"ASCII",	towide_none,	tomb_none },
93*6b5e5868SGarrett D'Amore 	{ "US-ASCII",	"ASCII",	towide_none,	tomb_none },
94*6b5e5868SGarrett D'Amore 	{ "646",	"ASCII",	towide_none,	tomb_none },
95*6b5e5868SGarrett D'Amore 
96*6b5e5868SGarrett D'Amore 	{ NULL, NULL },
97*6b5e5868SGarrett D'Amore };
98*6b5e5868SGarrett D'Amore 
/*
 * Render a multibyte sequence in a form suitable for diagnostics.
 *
 * If the first byte is a graphic (printable, non-space) ASCII character,
 * only that single character is returned.  Otherwise every byte up to the
 * terminating NUL is rendered as a "\xNN" escape.
 *
 * The result lives in a static buffer, so it is overwritten by the next
 * call.  Output that does not fit in the buffer is truncated.
 */
static char *
show_mb(const char *mb)
{
	static char buf[64];
	/*
	 * Look at the input as unsigned bytes.  Plain char may be signed, in
	 * which case a byte such as 0xe4 would be promoted to a negative int
	 * and printed by %x as "ffffffe4" rather than "e4".
	 */
	const unsigned char *p = (const unsigned char *)mb;
	size_t used = 0;

	/* ASCII stuff we just print */
	if (*p < 0x80 && isgraph(*p)) {
		buf[0] = (char)*p;
		buf[1] = 0;
		return (buf);
	}

	buf[0] = 0;
	for (; *p != 0; p++) {
		size_t room = sizeof (buf) - used;
		int w = snprintf(buf + used, room, "\\x%02x",
		    (unsigned int)*p);

		/* stop once the buffer is full; snprintf already terminated */
		if (w < 0 || (size_t)w >= room)
			break;
		used += (size_t)w;
	}
	return (buf);
}
119*6b5e5868SGarrett D'Amore 
/*
 * The most recently recorded conversion error message, or NULL when there
 * is none.  The string is heap allocated by werr(); it is released by the
 * next werr() call or by to_mb() once the message has been reported.
 */
static char	*widemsg;

/*
 * Record a conversion error.
 *
 * Formats the printf-style arguments into a freshly allocated string and
 * stores it in widemsg, freeing whatever message was there before.  Nothing
 * is printed here.
 *
 * If the message cannot be formatted or memory cannot be allocated,
 * widemsg is set to NULL rather than being left pointing at an
 * indeterminate value.
 */
void
werr(const char *fmt, ...)
{
	char	*msg = NULL;
	va_list	va;
	int	len;

	/* first pass: find out how long the formatted message is */
	va_start(va, fmt);
	len = vsnprintf(NULL, 0, fmt, va);
	va_end(va);

	/* second pass: format into a buffer of exactly the right size */
	if (len >= 0 && (msg = malloc((size_t)len + 1)) != NULL) {
		va_start(va, fmt);
		(void) vsnprintf(msg, (size_t)len + 1, fmt, va);
		va_end(va);
	}

	free(widemsg);
	widemsg = msg;
}
135*6b5e5868SGarrett D'Amore 
136*6b5e5868SGarrett D'Amore /*
137*6b5e5868SGarrett D'Amore  * This is used for 8-bit encodings.
138*6b5e5868SGarrett D'Amore  */
139*6b5e5868SGarrett D'Amore int
140*6b5e5868SGarrett D'Amore towide_none(wchar_t *c, const char *mb, int n)
141*6b5e5868SGarrett D'Amore {
142*6b5e5868SGarrett D'Amore 	if (mb_cur_max != 1) {
143*6b5e5868SGarrett D'Amore 		werr("invalid or unsupported multibyte locale");
144*6b5e5868SGarrett D'Amore 		return (-1);
145*6b5e5868SGarrett D'Amore 	}
146*6b5e5868SGarrett D'Amore 	if (n < 1) {
147*6b5e5868SGarrett D'Amore 		werr("no character data");
148*6b5e5868SGarrett D'Amore 		return (-1);
149*6b5e5868SGarrett D'Amore 	}
150*6b5e5868SGarrett D'Amore 	*c = (uint8_t)*mb;
151*6b5e5868SGarrett D'Amore 	return (1);
152*6b5e5868SGarrett D'Amore }
153*6b5e5868SGarrett D'Amore 
154*6b5e5868SGarrett D'Amore int
155*6b5e5868SGarrett D'Amore tomb_none(char *mb, wchar_t wc)
156*6b5e5868SGarrett D'Amore {
157*6b5e5868SGarrett D'Amore 	if (mb_cur_max != 1) {
158*6b5e5868SGarrett D'Amore 		werr("invalid or unsupported multibyte locale");
159*6b5e5868SGarrett D'Amore 		return (-1);
160*6b5e5868SGarrett D'Amore 	}
161*6b5e5868SGarrett D'Amore 	*(uint8_t *)mb = (wc & 0xff);
162*6b5e5868SGarrett D'Amore 	mb[1] = 0;
163*6b5e5868SGarrett D'Amore 	return (1);
164*6b5e5868SGarrett D'Amore }
165*6b5e5868SGarrett D'Amore 
/*
 * UTF-8 stores wide characters in UTF-32 form.
 *
 * Decodes the one to four byte UTF-8 sequence at "mb" ("n" bytes are
 * available) and stores the resulting code point through "wc".
 *
 * Returns the number of bytes consumed, or -1 after recording a message
 * with werr() when:
 *  - no input is available,
 *  - the first byte is not the lead byte of a 1-4 byte sequence,
 *  - fewer than the required number of bytes are available,
 *  - a trailing byte is not of the form 10xxxxxx, or
 *  - the value could have been encoded in fewer bytes (overlong form).
 */
int
towide_utf8(wchar_t *wc, const char *mb, int n)
{
	wchar_t	c;
	int	nb;
	int	lv;	/* lowest legal value */
	int	i;
	const uint8_t *s = (const uint8_t *)mb;

	if (n < 1) {
		werr("no utf8 data");
		return (-1);
	}
	c = *s;

	if ((c & 0x80) == 0) {
		/* 7-bit ASCII */
		*wc = c;
		return (1);
	} else if ((c & 0xe0) == 0xc0) {
		/* u80-u7ff - two bytes encoded */
		nb = 2;
		lv = 0x80;
		c &= ~0xe0;
	} else if ((c & 0xf0) == 0xe0) {
		/* u800-uffff - three bytes encoded */
		nb = 3;
		lv = 0x800;
		c &= ~0xf0;
	} else if ((c & 0xf8) == 0xf0) {
		/*
		 * u10000-u1fffff - four bytes encoded.  Anything below
		 * 0x10000 fits in three bytes, so a four byte form of such
		 * a value is an overlong encoding and must be rejected.
		 */
		nb = 4;
		lv = 0x10000;
		c &= ~0xf8;
	} else {
		/* 5 and 6 byte encodings are not legal unicode */
		werr("utf8 encoding too large (%s)", show_mb(mb));
		return (-1);
	}
	if (nb > n) {
		werr("incomplete utf8 sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* fold in six payload bits from each trailing byte */
	for (i = 1; i < nb; i++) {
		if (((s[i]) & 0xc0) != 0x80) {
			werr("illegal utf8 byte (%x)", s[i]);
			return (-1);
		}
		c <<= 6;
		c |= (s[i] & 0x3f);
	}

	if (c < lv) {
		werr("illegal redundant utf8 encoding (%s)", show_mb(mb));
		return (-1);
	}
	*wc = c;
	return (nb);
}
229*6b5e5868SGarrett D'Amore 
/*
 * Encode the code point "wc" as UTF-8.
 *
 * Writes one to four bytes followed by a terminating NUL to "mb", which
 * must therefore have room for up to five bytes.
 *
 * Returns the number of bytes written (not counting the NUL), or -1 after
 * recording a message with werr() when the value is larger than 0x1fffff.
 * The value is examined as unsigned, so a negative "wc" is reported as
 * illegal as well instead of being emitted as a single byte.
 */
int
tomb_utf8(char *mb, wchar_t wc)
{
	uint8_t *s = (uint8_t *)mb;
	uint32_t u = (uint32_t)wc;
	uint8_t msk;
	int cnt;
	int i;

	if (u <= 0x7f) {
		s[0] = u & 0x7f;
		s[1] = 0;
		return (1);
	}
	if (u <= 0x7ff) {
		cnt = 2;
		msk = 0xc0;
	} else if (u <= 0xffff) {
		cnt = 3;
		msk = 0xe0;
	} else if (u <= 0x1fffff) {
		cnt = 4;
		msk = 0xf0;
	} else {
		werr("illegal utf8 char (%x)", (unsigned int)u);
		return (-1);
	}

	/* trailing bytes carry six bits each, filled from the last byte back */
	for (i = cnt - 1; i; i--) {
		s[i] = (u & 0x3f) | 0x80;
		u >>= 6;
	}
	/* whatever is left goes into the lead byte under the length marker */
	s[0] = (msk) | u;
	s[cnt] = 0;
	return (cnt);
}
264*6b5e5868SGarrett D'Amore 
/*
 * Several encodings share a simplistic dual byte encoding.  In these
 * forms, they all indicate that a two byte sequence is to be used if
 * the first byte has its high bit set.  They all store this simple
 * encoding as a 16-bit value, although a great many of the possible
 * code points are not used in most character sets.  This gives a possible
 * set of just over 32,000 valid code points.
 *
 * 0x00 - 0x7f		- 1 byte encoding
 * 0x80 - 0x7fff	- illegal
 * 0x8000 - 0xffff	- 2 byte encoding
 *
 * "n" is the number of bytes available at "mb".  Returns the number of
 * bytes consumed (1 or 2), or -1 after recording a message with werr()
 * when there are not enough bytes.
 */
static int
towide_dbcs(wchar_t *wc, const char *mb, int n)
{
	const uint8_t *s = (const uint8_t *)mb;
	wchar_t	c;

	/* do not look at the input until we know there is some */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];
	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = c;
		return (1);
	}
	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* Store both bytes as a single 16-bit wide. */
	c <<= 8;
	c |= s[1];
	*wc = c;
	return (2);
}
304*6b5e5868SGarrett D'Amore 
/*
 * Most multibyte locales just convert the wide character to the multibyte
 * form by stripping leading null bytes, and writing the 32-bit quantity
 * in big-endian order.
 *
 * Writes one to four bytes followed by a terminating NUL to "mb", which
 * must therefore have room for up to five bytes.  Returns the number of
 * bytes written, not counting the NUL.  A value of zero is written as a
 * single zero byte.
 */
int
tomb_mbs(char *mb, wchar_t wc)
{
	uint8_t *s = (uint8_t *)mb;
	/*
	 * Work on an unsigned copy: four byte values whose first byte has
	 * the high bit set are negative when wchar_t is a signed 32-bit
	 * type, and right shifting a negative value is implementation
	 * defined.
	 */
	uint32_t v = (uint32_t)wc;
	int len;
	int i;

	if ((v & 0xff000000U) != 0) {
		len = 4;
	} else if ((v & 0x00ff0000U) != 0) {
		len = 3;
	} else if ((v & 0x0000ff00U) != 0) {
		len = 2;
	} else {
		len = 1;
	}

	/* emit least significant byte last (big-endian order) */
	for (i = len - 1; i >= 0; i--) {
		s[i] = v & 0xff;
		v >>= 8;
	}
	/* ensure null termination */
	s[len] = 0;
	return (len);
}
335*6b5e5868SGarrett D'Amore 
336*6b5e5868SGarrett D'Amore 
/*
 * Big5 decoder.  Big5 is handled as a plain dual byte character set, so
 * all of the work is done by towide_dbcs(); its result is returned
 * unchanged.
 */
int
towide_big5(wchar_t *wc, const char *mb, int n)
{
	int consumed;

	consumed = towide_dbcs(wc, mb, n);
	return (consumed);
}
345*6b5e5868SGarrett D'Amore 
/*
 * GBK decoder.  GBK encodes wides the same way Big5 does: the high order
 * bit of the first byte indicates a double byte character.  The work is
 * done by towide_dbcs(); its result is returned unchanged.
 */
int
towide_gbk(wchar_t *wc, const char *mb, int n)
{
	int consumed;

	consumed = towide_dbcs(wc, mb, n);
	return (consumed);
}
355*6b5e5868SGarrett D'Amore 
/*
 * GB2312 decoder.  GB2312 is another DBCS.  It is cleaner than the others
 * in that the second byte of a pair does not fall in the ASCII range.
 * The work is done by towide_dbcs(), which does not check the second
 * byte; its result is returned unchanged.
 */
int
towide_gb2312(wchar_t *wc, const char *mb, int n)
{
	int consumed;

	consumed = towide_dbcs(wc, mb, n);
	return (consumed);
}
365*6b5e5868SGarrett D'Amore 
/*
 * GB18030.  This encodes as 8, 16, or 32-bits.
 * 7-bit values are in 1 byte,  4 byte sequences are used when
 * the second byte encodes 0x30-39 and all other sequences are 2 bytes.
 *
 * "n" is the number of bytes available at "mb".  The bytes of the sequence
 * are packed, most significant first, into the value stored through "wc".
 * Returns the number of bytes consumed (1, 2 or 4), or -1 after recording
 * a message with werr() when there are not enough bytes.
 */
int
towide_gb18030(wchar_t *wc, const char *mb, int n)
{
	const uint8_t *s = (const uint8_t *)mb;
	/*
	 * Accumulate in an unsigned type: a four byte sequence always has
	 * the top bit of the 32-bit result set, and shifting that into a
	 * signed wchar_t would overflow.
	 */
	uint32_t c;

	/* do not look at the input until we know there is some */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];
	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = (wchar_t)c;
		return (1);
	}
	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* pull in the second byte */
	c = (c << 8) | s[1];

	/* a second byte in the range of ASCII digits marks a 4 byte form */
	if ((s[1] >= 0x30) && (s[1] <= 0x39)) {
		if (n < 4) {
			werr("incomplete 4-byte character sequence (%s)",
			    show_mb(mb));
			return (-1);
		}
		c = (c << 8) | s[2];
		c = (c << 8) | s[3];
		*wc = (wchar_t)c;
		return (4);
	}

	*wc = (wchar_t)c;
	return (2);
}
413*6b5e5868SGarrett D'Amore 
/*
 * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it
 * also has a range of single byte characters above 0x80.  (0xa1-0xdf).
 *
 * "n" is the number of bytes available at "mb".  Returns the number of
 * bytes consumed (1 or 2), or -1 after recording a message with werr()
 * when there are not enough bytes.
 */
int
towide_mskanji(wchar_t *wc, const char *mb, int n)
{
	const uint8_t *s = (const uint8_t *)mb;
	wchar_t	c;

	/* do not look at the input until we know there is some */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];
	if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) {
		/*
		 * Single byte: 7-bit, or the 0xa1-0xdf single byte range.
		 * This is a successful conversion of one byte, so report
		 * 1 consumed rather than the -1 failure indication.
		 */
		*wc = c;
		return (1);
	}

	if (n < 2) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	/* Store both bytes as a single 16-bit wide. */
	c <<= 8;
	c |= s[1];
	*wc = c;
	return (2);
}
446*6b5e5868SGarrett D'Amore 
/*
 * EUC forms.  EUC encodings are "variable".  FreeBSD carries some additional
 * variable data to encode these, but we're going to treat each as independent
 * instead.  It's the only way we can sensibly move forward.
 *
 * Note that the way in which the different EUC forms vary is how wide
 * CS2 and CS3 are and what the first byte of them is.
 *
 * wc		receives the bytes of the sequence packed most significant
 *		byte first
 * mb, n	input bytes and how many of them are available
 * cs2, cs3	lead byte that introduces code set 2 / 3, or 0 if unused
 * cs2width,	total width in bytes (lead byte included) of a code set
 * cs3width	2 / 3 sequence
 *
 * Returns the number of bytes consumed, or -1 after recording a message
 * with werr() when there are not enough bytes.
 */
static int
towide_euc_impl(wchar_t *wc, const char *mb, int n,
    uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
{
	const uint8_t *s = (const uint8_t *)mb;
	/*
	 * Accumulate in an unsigned type: a four byte sequence sets the top
	 * bit of the 32-bit result, and shifting that into a signed wchar_t
	 * would overflow.
	 */
	uint32_t c;
	int width;
	int i;

	/* do not look at the input until we know there is some */
	if (n < 1) {
		werr("no character data");
		return (-1);
	}

	c = s[0];

	/*
	 * All variations of EUC encode 7-bit ASCII as one byte, and use
	 * additional bytes for more than that.
	 */
	if ((c & 0x80) == 0) {
		/* 7-bit */
		*wc = (wchar_t)c;
		return (1);
	}

	/*
	 * All EUC variants reserve 0xa1-0xff to identify CS1, which
	 * is always two bytes wide.  Note that unused CS will be zero,
	 * and that cannot be true because we know that the high order
	 * bit must be set.
	 */
	if (c >= 0xa1) {
		width = 2;
	} else if (c == cs2) {
		width = cs2width;
	} else if (c == cs3) {
		width = cs3width;
	} else {
		/*
		 * A byte in 0x80-0xa0 that introduces neither CS2 nor CS3.
		 * The width used to be left uninitialized on this path.
		 * NOTE(review): it is now taken as a single byte character;
		 * confirm that this matches how libc decodes such bytes.
		 */
		width = 1;
	}

	if (n < width) {
		werr("incomplete character sequence (%s)", show_mb(mb));
		return (-1);
	}

	for (i = 1; i < width; i++) {
		/* pull in the next byte */
		c = (c << 8) | s[i];
	}

	*wc = (wchar_t)c;
	return (width);
}
508*6b5e5868SGarrett D'Amore 
/*
 * EUC-CN encodes as follows:
 *
 * Code set 0 (ASCII):				0x21-0x7E
 * Code set 1 (CNS 11643-1992 Plane 1):		0xA1A1-0xFEFE
 * Code set 2 (CNS 11643-1992 Planes 1-16):	0x8EA1A1A1-0x8EB0FEFE
 * Code set 3:					unused
 *
 * NOTE(review): this table is word for word the one given for EUC-TW;
 * confirm the character set names against the EUC-CN definition.
 */
int
towide_euccn(wchar_t *wc, const char *mb, int n)
{
	const uint8_t cs2lead = 0x8e;	/* introduces code set 2 */
	const uint8_t cs2bytes = 4;	/* lead byte plus three more */

	/* code set 3 is unused, hence the zeros */
	return (towide_euc_impl(wc, mb, n, cs2lead, cs2bytes, 0, 0));
}
522*6b5e5868SGarrett D'Amore 
/*
 * EUC-JP encodes as follows:
 *
 * Code set 0 (ASCII or JIS X 0201-1976 Roman):	0x21-0x7E
 * Code set 1 (JIS X 0208):			0xA1A1-0xFEFE
 * Code set 2 (half-width katakana):		0x8EA1-0x8EDF
 * Code set 3 (JIS X 0212-1990):		0x8FA1A1-0x8FFEFE
 */
int
towide_eucjp(wchar_t *wc, const char *mb, int n)
{
	const uint8_t cs2lead = 0x8e;	/* introduces code set 2 */
	const uint8_t cs2bytes = 2;	/* lead byte plus one more */
	const uint8_t cs3lead = 0x8f;	/* introduces code set 3 */
	const uint8_t cs3bytes = 3;	/* lead byte plus two more */

	return (towide_euc_impl(wc, mb, n, cs2lead, cs2bytes,
	    cs3lead, cs3bytes));
}
536*6b5e5868SGarrett D'Amore 
/*
 * EUC-KR encodes as follows:
 *
 * Code set 0 (ASCII or KS C 5636-1993):	0x21-0x7E
 * Code set 1 (KS C 5601-1992):			0xA1A1-0xFEFE
 * Code set 2:					unused
 * Code set 3:					unused
 */
int
towide_euckr(wchar_t *wc, const char *mb, int n)
{
	const uint8_t unused = 0;	/* neither code set 2 nor 3 exists */

	return (towide_euc_impl(wc, mb, n, unused, unused, unused, unused));
}
550*6b5e5868SGarrett D'Amore 
/*
 * EUC-TW encodes as follows:
 *
 * Code set 0 (ASCII):				0x21-0x7E
 * Code set 1 (CNS 11643-1992 Plane 1):		0xA1A1-0xFEFE
 * Code set 2 (CNS 11643-1992 Planes 1-16):	0x8EA1A1A1-0x8EB0FEFE
 * Code set 3:					unused
 */
int
towide_euctw(wchar_t *wc, const char *mb, int n)
{
	const uint8_t cs2lead = 0x8e;	/* introduces code set 2 */
	const uint8_t cs2bytes = 4;	/* lead byte plus three more */

	/* code set 3 is unused, hence the zeros */
	return (towide_euc_impl(wc, mb, n, cs2lead, cs2bytes, 0, 0));
}
564*6b5e5868SGarrett D'Amore 
565*6b5e5868SGarrett D'Amore /*
566*6b5e5868SGarrett D'Amore  * Public entry points.
567*6b5e5868SGarrett D'Amore  */
568*6b5e5868SGarrett D'Amore 
/*
 * Convert the first character of the NUL terminated multibyte string "mb"
 * to a wide character using the decoder of the current encoding.
 *
 * The decoder is told that strlen(mb) + 1 bytes are available, i.e. the
 * terminating NUL is counted.  The decoder's result is returned as is.
 * This won't fail hard: nothing is reported from here on failure.
 */
int
to_wide(wchar_t *wc, const char *mb)
{
	int avail = strlen(mb) + 1;

	return (_towide(wc, mb, avail));
}
575*6b5e5868SGarrett D'Amore 
576*6b5e5868SGarrett D'Amore int
577*6b5e5868SGarrett D'Amore to_mb(char *mb, wchar_t wc)
578*6b5e5868SGarrett D'Amore {
579*6b5e5868SGarrett D'Amore 	int	rv;
580*6b5e5868SGarrett D'Amore 
581*6b5e5868SGarrett D'Amore 	if ((rv = _tomb(mb, wc)) < 0) {
582*6b5e5868SGarrett D'Amore 		errf(widemsg);
583*6b5e5868SGarrett D'Amore 		free(widemsg);
584*6b5e5868SGarrett D'Amore 		widemsg = NULL;
585*6b5e5868SGarrett D'Amore 	}
586*6b5e5868SGarrett D'Amore 	return (rv);
587*6b5e5868SGarrett D'Amore }
588*6b5e5868SGarrett D'Amore 
589*6b5e5868SGarrett D'Amore char *
590*6b5e5868SGarrett D'Amore to_mb_string(const wchar_t *wcs)
591*6b5e5868SGarrett D'Amore {
592*6b5e5868SGarrett D'Amore 	char	*mbs;
593*6b5e5868SGarrett D'Amore 	char	*ptr;
594*6b5e5868SGarrett D'Amore 	int	len;
595*6b5e5868SGarrett D'Amore 
596*6b5e5868SGarrett D'Amore 	mbs = malloc((wcslen(wcs) * mb_cur_max) + 1);
597*6b5e5868SGarrett D'Amore 	if (mbs == NULL) {
598*6b5e5868SGarrett D'Amore 		errf("out of memory");
599*6b5e5868SGarrett D'Amore 		return (NULL);
600*6b5e5868SGarrett D'Amore 	}
601*6b5e5868SGarrett D'Amore 	ptr = mbs;
602*6b5e5868SGarrett D'Amore 	while (*wcs) {
603*6b5e5868SGarrett D'Amore 		if ((len = to_mb(ptr, *wcs)) < 0) {
604*6b5e5868SGarrett D'Amore 			INTERR;
605*6b5e5868SGarrett D'Amore 			free(mbs);
606*6b5e5868SGarrett D'Amore 			return (NULL);
607*6b5e5868SGarrett D'Amore 		}
608*6b5e5868SGarrett D'Amore 		wcs++;
609*6b5e5868SGarrett D'Amore 		ptr += len;
610*6b5e5868SGarrett D'Amore 	}
611*6b5e5868SGarrett D'Amore 	*ptr = 0;
612*6b5e5868SGarrett D'Amore 	return (mbs);
613*6b5e5868SGarrett D'Amore }
614*6b5e5868SGarrett D'Amore 
615*6b5e5868SGarrett D'Amore void
616*6b5e5868SGarrett D'Amore set_wide_encoding(const char *encoding)
617*6b5e5868SGarrett D'Amore {
618*6b5e5868SGarrett D'Amore 	int i;
619*6b5e5868SGarrett D'Amore 
620*6b5e5868SGarrett D'Amore 	_towide = towide_none;
621*6b5e5868SGarrett D'Amore 	_tomb = tomb_none;
622*6b5e5868SGarrett D'Amore 	_encoding = "NONE";
623*6b5e5868SGarrett D'Amore 
624*6b5e5868SGarrett D'Amore 	for (i = 0; mb_encodings[i].name; i++) {
625*6b5e5868SGarrett D'Amore 		if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
626*6b5e5868SGarrett D'Amore 			_towide = mb_encodings[i].towide;
627*6b5e5868SGarrett D'Amore 			_tomb = mb_encodings[i].tomb;
628*6b5e5868SGarrett D'Amore 			_encoding = mb_encodings[i].cname;
629*6b5e5868SGarrett D'Amore 		}
630*6b5e5868SGarrett D'Amore 	}
631*6b5e5868SGarrett D'Amore }
632*6b5e5868SGarrett D'Amore 
633*6b5e5868SGarrett D'Amore const char *
634*6b5e5868SGarrett D'Amore get_wide_encoding(void)
635*6b5e5868SGarrett D'Amore {
636*6b5e5868SGarrett D'Amore 	return (_encoding);
637*6b5e5868SGarrett D'Amore }
638