xref: /illumos-gate/usr/src/lib/libc/port/locale/euc.c (revision 379728489ed47862c4927c75771e767b9476c9c4)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
5  * Copyright (c) 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Paul Borman at Krystal Technologies.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include "lint.h"
37 #include <errno.h>
38 #include <limits.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <wchar.h>
42 #include <sys/types.h>
43 #include <sys/euc.h>
44 #include "mblocal.h"
45 #include "lctype.h"
46 
47 static size_t	_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
48     const char *_RESTRICT_KYWD,
49     size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t,
50     boolean_t);
51 static size_t	_EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
52     mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
53 
54 static size_t	_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
55 		    const char *_RESTRICT_KYWD,
56 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
57 static size_t	_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
58 		    const char *_RESTRICT_KYWD,
59 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
60 static size_t	_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
61 		    const char *_RESTRICT_KYWD,
62 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
63 static size_t	_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
64 		    const char *_RESTRICT_KYWD,
65 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
66 
67 static size_t	_EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
68 		    mbstate_t *_RESTRICT_KYWD);
69 static size_t	_EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
70 		    mbstate_t *_RESTRICT_KYWD);
71 static size_t	_EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
72 		    mbstate_t *_RESTRICT_KYWD);
73 static size_t	_EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
74 		    mbstate_t *_RESTRICT_KYWD);
75 
76 static size_t	_EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
77 		    const char **_RESTRICT_KYWD, size_t, size_t,
78 		    mbstate_t *_RESTRICT_KYWD);
79 static size_t	_EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
80 		    const char **_RESTRICT_KYWD, size_t, size_t,
81 		    mbstate_t *_RESTRICT_KYWD);
82 static size_t	_EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
83 		    const char **_RESTRICT_KYWD, size_t, size_t,
84 		    mbstate_t *_RESTRICT_KYWD);
85 static size_t	_EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
86 		    const char **_RESTRICT_KYWD, size_t, size_t,
87 		    mbstate_t *_RESTRICT_KYWD);
88 
89 static size_t	_EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD,
90 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
91 		    mbstate_t *_RESTRICT_KYWD);
92 static size_t	_EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD,
93 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
94 		    mbstate_t *_RESTRICT_KYWD);
95 static size_t	_EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD,
96 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
97 		    mbstate_t *_RESTRICT_KYWD);
98 static size_t	_EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD,
99 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
100 		    mbstate_t *_RESTRICT_KYWD);
101 
102 static int	_EUC_mbsinit(const mbstate_t *);
103 
104 int
105 _EUC_mbsinit(const mbstate_t *ps)
106 {
107 
108 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
109 }
110 
111 /*
112  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
113  */
114 void
115 _EUC_CN_init(struct lc_ctype *lct)
116 {
117 	lct->lc_mbrtowc = _EUC_CN_mbrtowc;
118 	lct->lc_wcrtomb = _EUC_CN_wcrtomb;
119 	lct->lc_mbsnrtowcs = _EUC_CN_mbsnrtowcs;
120 	lct->lc_wcsnrtombs = _EUC_CN_wcsnrtombs;
121 	lct->lc_mbsinit = _EUC_mbsinit;
122 
123 	lct->lc_max_mblen = 4;
124 	lct->lc_is_ascii = 0;
125 }
126 
127 static size_t
128 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
129     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
130 {
131 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0, zero));
132 }
133 
134 static size_t
135 _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
136     const char **_RESTRICT_KYWD src,
137     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
138 {
139 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
140 }
141 
142 static size_t
143 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
144     mbstate_t *_RESTRICT_KYWD ps)
145 {
146 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
147 }
148 
149 static size_t
150 _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
151     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
152 {
153 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
154 }
155 
156 /*
157  * EUC-KR uses only CS0 and CS1.
158  */
159 void
160 _EUC_KR_init(struct lc_ctype *lct)
161 {
162 	lct->lc_mbrtowc = _EUC_KR_mbrtowc;
163 	lct->lc_wcrtomb = _EUC_KR_wcrtomb;
164 	lct->lc_mbsnrtowcs = _EUC_KR_mbsnrtowcs;
165 	lct->lc_wcsnrtombs = _EUC_KR_wcsnrtombs;
166 	lct->lc_mbsinit = _EUC_mbsinit;
167 
168 	lct->lc_max_mblen = 2;
169 	lct->lc_is_ascii = 0;
170 }
171 
172 static size_t
173 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
174     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
175 {
176 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0, zero));
177 }
178 
179 static size_t
180 _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
181     const char **_RESTRICT_KYWD src,
182     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
183 {
184 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
185 }
186 
187 static size_t
188 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
189     mbstate_t *_RESTRICT_KYWD ps)
190 {
191 	return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
192 }
193 
194 static size_t
195 _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
196     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
197 {
198 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
199 }
200 
201 /*
202  * EUC-JP uses CS0, CS1, CS2, and CS3.
203  */
204 void
205 _EUC_JP_init(struct lc_ctype *lct)
206 {
207 	lct->lc_mbrtowc = _EUC_JP_mbrtowc;
208 	lct->lc_wcrtomb = _EUC_JP_wcrtomb;
209 	lct->lc_mbsnrtowcs = _EUC_JP_mbsnrtowcs;
210 	lct->lc_wcsnrtombs = _EUC_JP_wcsnrtombs;
211 	lct->lc_mbsinit = _EUC_mbsinit;
212 
213 	lct->lc_max_mblen = 3;
214 	lct->lc_is_ascii = 0;
215 }
216 
217 static size_t
218 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
219     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
220 {
221 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3, zero));
222 }
223 
224 static size_t
225 _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
226     const char **_RESTRICT_KYWD src,
227     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
228 {
229 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
230 }
231 
232 static size_t
233 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
234     mbstate_t *_RESTRICT_KYWD ps)
235 {
236 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
237 }
238 
239 static size_t
240 _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
241     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
242 {
243 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
244 }
245 
246 /*
247  * EUC-TW uses CS0, CS1, and CS2.
248  */
249 void
250 _EUC_TW_init(struct lc_ctype *lct)
251 {
252 	lct->lc_mbrtowc = _EUC_TW_mbrtowc;
253 	lct->lc_wcrtomb = _EUC_TW_wcrtomb;
254 	lct->lc_mbsnrtowcs = _EUC_TW_mbsnrtowcs;
255 	lct->lc_wcsnrtombs = _EUC_TW_wcsnrtombs;
256 	lct->lc_mbsinit = _EUC_mbsinit;
257 
258 	lct->lc_max_mblen = 4;
259 	lct->lc_is_ascii = 0;
260 }
261 
262 static size_t
263 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
264     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
265 {
266 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0, zero));
267 }
268 
269 static size_t
270 _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
271     const char **_RESTRICT_KYWD src,
272     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
273 {
274 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
275 }
276 
277 static size_t
278 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
279     mbstate_t *_RESTRICT_KYWD ps)
280 {
281 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
282 }
283 
284 static size_t
285 _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
286     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
287 {
288 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
289 }
290 
291 /*
292  * Common EUC code.
293  */
294 
295 static size_t
296 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
297     size_t n, mbstate_t *_RESTRICT_KYWD ps,
298     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width,
299     boolean_t zero)
300 {
301 	_EucState *es;
302 	int i, want;
303 	wchar_t wc = 0;
304 	unsigned char ch, chs;
305 
306 	es = (_EucState *)ps;
307 
308 	if (es->want < 0 || es->want > MB_CUR_MAX) {
309 		errno = EINVAL;
310 		return ((size_t)-1);
311 	}
312 
313 	if (s == NULL) {
314 		s = "";
315 		n = 1;
316 		pwc = NULL;
317 	}
318 
319 	if (n == 0)
320 		/* Incomplete multibyte sequence */
321 		return ((size_t)-2);
322 
323 	if (es->want == 0) {
324 		/* Fast path for plain ASCII (CS0) */
325 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
326 			if (pwc != NULL)
327 				*pwc = ch;
328 			if (zero || ch != '\0') {
329 				return (1);
330 			} else {
331 				return (0);
332 			}
333 		}
334 
335 		if (ch >= 0xa1) {
336 			/* CS1 */
337 			want = 2;
338 		} else if (ch == cs2) {
339 			want = cs2width;
340 		} else if (ch == cs3) {
341 			want = cs3width;
342 		} else {
343 			errno = EILSEQ;
344 			return ((size_t)-1);
345 		}
346 
347 
348 		es->want = want;
349 		es->ch = 0;
350 	} else {
351 		want = es->want;
352 		wc = es->ch;
353 	}
354 
355 	for (i = 0; i < MIN(want, n); i++) {
356 		wc <<= 8;
357 		chs = *s;
358 		wc |= chs;
359 		s++;
360 	}
361 	if (i < want) {
362 		/* Incomplete multibyte sequence */
363 		es->want = want - i;
364 		es->ch = wc;
365 		return ((size_t)-2);
366 	}
367 	if (pwc != NULL)
368 		*pwc = wc;
369 	es->want = 0;
370 	if (zero || wc != L'\0') {
371 		return (want);
372 	} else {
373 		return (0);
374 	}
375 }
376 
377 static size_t
378 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
379     mbstate_t *_RESTRICT_KYWD ps,
380     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
381 {
382 	_EucState *es;
383 	int i, len;
384 	wchar_t nm;
385 
386 	es = (_EucState *)ps;
387 
388 	if (es->want != 0) {
389 		errno = EINVAL;
390 		return ((size_t)-1);
391 	}
392 
393 	if (s == NULL)
394 		/* Reset to initial shift state (no-op) */
395 		return (1);
396 
397 	if ((wc & ~0x7f) == 0) {
398 		/* Fast path for plain ASCII (CS0) */
399 		*s = (char)wc;
400 		return (1);
401 	}
402 
403 	/* Determine the "length" */
404 	if ((unsigned)wc > 0xffffff) {
405 		len = 4;
406 	} else if ((unsigned)wc > 0xffff) {
407 		len = 3;
408 	} else if ((unsigned)wc > 0xff) {
409 		len = 2;
410 	} else {
411 		len = 1;
412 	}
413 
414 	if (len > MB_CUR_MAX) {
415 		errno = EILSEQ;
416 		return ((size_t)-1);
417 	}
418 
419 	/* This first check excludes CS1, which is implicitly valid. */
420 	if ((wc < 0xa100) || (wc > 0xffff)) {
421 		/* Check for valid CS2 or CS3 */
422 		nm = (wc >> ((len - 1) * 8));
423 		if (nm == cs2) {
424 			if (len != cs2width) {
425 				errno = EILSEQ;
426 				return ((size_t)-1);
427 			}
428 		} else if (nm == cs3) {
429 			if (len != cs3width) {
430 				errno = EILSEQ;
431 				return ((size_t)-1);
432 			}
433 		} else {
434 			errno = EILSEQ;
435 			return ((size_t)-1);
436 		}
437 	}
438 
439 	/* Stash the bytes, least significant last */
440 	for (i = len - 1; i >= 0; i--) {
441 		s[i] = (wc & 0xff);
442 		wc >>= 8;
443 	}
444 	return (len);
445 }
446