xref: /illumos-gate/usr/src/lib/libc/port/locale/euc.c (revision 9d04e500a2b8dcd13abf3d813c6796aecd7a9689)
1 /*
2  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
4  * Copyright (c) 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Paul Borman at Krystal Technologies.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 4. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include "lint.h"
36 #include <errno.h>
37 #include <limits.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include <sys/types.h>
42 #include <sys/euc.h>
43 #include "runetype.h"
44 #include "mblocal.h"
45 
46 static size_t	_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
47     const char *_RESTRICT_KYWD,
48     size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
49 static size_t	_EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
50     mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
51 
52 static size_t	_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
53 		    const char *_RESTRICT_KYWD,
54 		    size_t, mbstate_t *_RESTRICT_KYWD);
55 static size_t	_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
56 		    const char *_RESTRICT_KYWD,
57 		    size_t, mbstate_t *_RESTRICT_KYWD);
58 static size_t	_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
59 		    const char *_RESTRICT_KYWD,
60 		    size_t, mbstate_t *_RESTRICT_KYWD);
61 static size_t	_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
62 		    const char *_RESTRICT_KYWD,
63 		    size_t, mbstate_t *_RESTRICT_KYWD);
64 static size_t	_EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
65 		    mbstate_t *_RESTRICT_KYWD);
66 static size_t	_EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
67 		    mbstate_t *_RESTRICT_KYWD);
68 static size_t	_EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
69 		    mbstate_t *_RESTRICT_KYWD);
70 static size_t	_EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
71 		    mbstate_t *_RESTRICT_KYWD);
72 static int	_EUC_mbsinit(const mbstate_t *);
73 
/*
 * Conversion state for the EUC decoder.  Callers hand in an mbstate_t,
 * which the routines in this file reinterpret as this structure.
 */
typedef struct {
	/* Bytes of a partially decoded character gathered so far. */
	wchar_t	ch;
	/* Not referenced by the conversion routines in this file. */
	int	set;
	/* Bytes still needed to finish the character; 0 when idle. */
	int	want;
} _EucState;
79 
80 static int
81 _EUC_mbsinit(const mbstate_t *ps)
82 {
83 
84 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
85 }
86 
87 /*
88  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
89  */
90 int
91 _EUC_CN_init(_RuneLocale *rl)
92 {
93 	__mbrtowc = _EUC_CN_mbrtowc;
94 	__wcrtomb = _EUC_CN_wcrtomb;
95 	__mbsinit = _EUC_mbsinit;
96 
97 	_CurrentRuneLocale = rl;
98 
99 	__ctype[520] = 4;
100 	charset_is_ascii = 0;
101 	return (0);
102 }
103 
104 static size_t
105 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
106     size_t n, mbstate_t *_RESTRICT_KYWD ps)
107 {
108 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
109 }
110 
111 static size_t
112 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
113     mbstate_t *_RESTRICT_KYWD ps)
114 {
115 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
116 }
117 
118 /*
119  * EUC-KR uses only CS0 and CS1.
120  */
121 int
122 _EUC_KR_init(_RuneLocale *rl)
123 {
124 	__mbrtowc = _EUC_KR_mbrtowc;
125 	__wcrtomb = _EUC_KR_wcrtomb;
126 	__mbsinit = _EUC_mbsinit;
127 
128 	_CurrentRuneLocale = rl;
129 
130 	__ctype[520] = 2;
131 	charset_is_ascii = 0;
132 	return (0);
133 }
134 
135 static size_t
136 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
137     size_t n, mbstate_t *_RESTRICT_KYWD ps)
138 {
139 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0));
140 }
141 
142 static size_t
143 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
144     mbstate_t *_RESTRICT_KYWD ps)
145 {
146 	return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
147 }
148 
149 /*
150  * EUC-JP uses CS0, CS1, CS2, and CS3.
151  */
152 int
153 _EUC_JP_init(_RuneLocale *rl)
154 {
155 	__mbrtowc = _EUC_JP_mbrtowc;
156 	__wcrtomb = _EUC_JP_wcrtomb;
157 	__mbsinit = _EUC_mbsinit;
158 
159 	_CurrentRuneLocale = rl;
160 
161 	__ctype[520] = 3;
162 	charset_is_ascii = 0;
163 	return (0);
164 }
165 
166 static size_t
167 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
168     size_t n, mbstate_t *_RESTRICT_KYWD ps)
169 {
170 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
171 }
172 
173 static size_t
174 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
175     mbstate_t *_RESTRICT_KYWD ps)
176 {
177 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
178 }
179 
180 /*
181  * EUC-TW uses CS0, CS1, and CS2.
182  */
183 int
184 _EUC_TW_init(_RuneLocale *rl)
185 {
186 	__mbrtowc = _EUC_TW_mbrtowc;
187 	__wcrtomb = _EUC_TW_wcrtomb;
188 	__mbsinit = _EUC_mbsinit;
189 
190 	_CurrentRuneLocale = rl;
191 
192 	__ctype[520] = 4;
193 	charset_is_ascii = 0;
194 	return (0);
195 }
196 
197 static size_t
198 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
199     size_t n, mbstate_t *_RESTRICT_KYWD ps)
200 {
201 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
202 }
203 
204 static size_t
205 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
206     mbstate_t *_RESTRICT_KYWD ps)
207 {
208 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
209 }
210 
211 /*
212  * Common EUC code.
213  */
214 
215 static size_t
216 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
217     size_t n, mbstate_t *_RESTRICT_KYWD ps,
218     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
219 {
220 	_EucState *es;
221 	int i, want;
222 	wchar_t wc;
223 	unsigned char ch;
224 
225 	es = (_EucState *)ps;
226 
227 	if (es->want < 0 || es->want > MB_CUR_MAX) {
228 		errno = EINVAL;
229 		return ((size_t)-1);
230 	}
231 
232 	if (s == NULL) {
233 		s = "";
234 		n = 1;
235 		pwc = NULL;
236 	}
237 
238 	if (n == 0)
239 		/* Incomplete multibyte sequence */
240 		return ((size_t)-2);
241 
242 	if (es->want == 0) {
243 		/* Fast path for plain ASCII (CS0) */
244 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
245 			if (pwc != NULL)
246 				*pwc = ch;
247 			return (ch != '\0' ? 1 : 0);
248 		}
249 
250 		if (ch >= 0xa1) {
251 			/* CS1 */
252 			want = 2;
253 		} else if (ch == cs2) {
254 			want = cs2width;
255 		} else if (ch == cs3) {
256 			want = cs3width;
257 		} else {
258 			errno = EILSEQ;
259 			return ((size_t)-1);
260 		}
261 
262 
263 		es->want = want;
264 		es->ch = 0;
265 	} else {
266 		want = es->want;
267 		wc = es->ch;
268 	}
269 
270 	for (i = 0; i < MIN(want, n); i++) {
271 		wc <<= 8;
272 		wc |= *s;
273 		s++;
274 	}
275 	if (i < want) {
276 		/* Incomplete multibyte sequence */
277 		es->want = want - i;
278 		es->ch = wc;
279 		return ((size_t)-2);
280 	}
281 	if (pwc != NULL)
282 		*pwc = wc;
283 	es->want = 0;
284 	return (wc == L'\0' ? 0 : want);
285 }
286 
287 static size_t
288 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
289     mbstate_t *_RESTRICT_KYWD ps,
290     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
291 {
292 	_EucState *es;
293 	int i, len;
294 	wchar_t nm;
295 
296 	es = (_EucState *)ps;
297 
298 	if (es->want != 0) {
299 		errno = EINVAL;
300 		return ((size_t)-1);
301 	}
302 
303 	if (s == NULL)
304 		/* Reset to initial shift state (no-op) */
305 		return (1);
306 
307 	if ((wc & ~0x7f) == 0) {
308 		/* Fast path for plain ASCII (CS0) */
309 		*s = (char)wc;
310 		return (1);
311 	}
312 
313 	/* Determine the "length" */
314 	if ((unsigned)wc > 0xffffff) {
315 		len = 4;
316 	} else if ((unsigned)wc > 0xffff) {
317 		len = 3;
318 	} else if ((unsigned)wc > 0xff) {
319 		len = 2;
320 	} else {
321 		len = 1;
322 	}
323 
324 	if (len > MB_CUR_MAX) {
325 		errno = EILSEQ;
326 		return ((size_t)-1);
327 	}
328 
329 	/* This first check excludes CS1, which is implicitly valid. */
330 	if ((wc < 0xa100) || (wc > 0xffff)) {
331 		/* Check for valid CS2 or CS3 */
332 		nm = (wc >> ((len - 1) * 8));
333 		if (nm == cs2) {
334 			if (len != cs2width) {
335 				errno = EILSEQ;
336 				return ((size_t)-1);
337 			}
338 		} else if (nm == cs3) {
339 			if (len != cs3width) {
340 				errno = EILSEQ;
341 				return ((size_t)-1);
342 			}
343 		} else {
344 			errno = EILSEQ;
345 			return ((size_t)-1);
346 		}
347 	}
348 
349 	/* Stash the bytes, least significant last */
350 	for (i = len - 1; i >= 0; i--) {
351 		s[i] = (wc & 0xff);
352 		wc >>= 8;
353 	}
354 	return (len);
355 }
356