xref: /illumos-gate/usr/src/lib/libc/port/locale/euc.c (revision d1aea6f139360e9e7f1504facb24f8521047b15c)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
5  * Copyright (c) 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Paul Borman at Krystal Technologies.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 4. Neither the name of the University nor the names of its contributors
20  *    may be used to endorse or promote products derived from this software
21  *    without specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 #include "lint.h"
37 #include <errno.h>
38 #include <limits.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <wchar.h>
42 #include <sys/types.h>
43 #include <sys/euc.h>
44 #include "mblocal.h"
45 #include "lctype.h"
46 
/*
 * Shared conversion routines.  The trailing uint8_t arguments are, in order,
 * the CS2 lead byte, the CS2 width, the CS3 lead byte and the CS3 width.
 */
static size_t	_EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD,
    const char *_RESTRICT_KYWD,
    size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);
static size_t	_EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t,
    mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t);

/* Per-locale multibyte -> wide character converters. */
static size_t	_EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD,
		    const char *_RESTRICT_KYWD,
		    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD,
		    const char *_RESTRICT_KYWD,
		    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD,
		    const char *_RESTRICT_KYWD,
		    size_t, mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD,
		    const char *_RESTRICT_KYWD,
		    size_t, mbstate_t *_RESTRICT_KYWD);

/* Per-locale wide -> multibyte character converters. */
static size_t	_EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
		    mbstate_t *_RESTRICT_KYWD);

/* Per-locale multibyte -> wide string converters. */
static size_t	_EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
		    const char **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
		    const char **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
		    const char **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
		    const char **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);

/* Per-locale wide -> multibyte string converters. */
static size_t	_EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD,
		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD,
		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD,
		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD,
		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);

/* Initial-state test shared by every EUC locale. */
static int	_EUC_mbsinit(const mbstate_t *);
102 
/*
 * Conversion state overlaid on the caller's mbstate_t.  It records a
 * multibyte character that has only been partially consumed.
 */
typedef struct {
	wchar_t	ch;	/* bytes accumulated so far, most significant first */
	int	set;	/* not referenced by the code in this file */
	int	want;	/* bytes still needed; 0 means initial state */
} _EucState;
108 
109 int
110 _EUC_mbsinit(const mbstate_t *ps)
111 {
112 
113 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
114 }
115 
116 /*
117  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
118  */
119 void
120 _EUC_CN_init(struct lc_ctype *lct)
121 {
122 	lct->lc_mbrtowc = _EUC_CN_mbrtowc;
123 	lct->lc_wcrtomb = _EUC_CN_wcrtomb;
124 	lct->lc_mbsnrtowcs = _EUC_CN_mbsnrtowcs;
125 	lct->lc_wcsnrtombs = _EUC_CN_wcsnrtombs;
126 	lct->lc_mbsinit = _EUC_mbsinit;
127 
128 	lct->lc_max_mblen = 4;
129 	lct->lc_is_ascii = 0;
130 }
131 
132 static size_t
133 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
134     size_t n, mbstate_t *_RESTRICT_KYWD ps)
135 {
136 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
137 }
138 
139 static size_t
140 _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
141     const char **_RESTRICT_KYWD src,
142     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
143 {
144 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
145 }
146 
147 static size_t
148 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
149     mbstate_t *_RESTRICT_KYWD ps)
150 {
151 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
152 }
153 
154 static size_t
155 _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
156     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
157 {
158 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
159 }
160 
161 /*
162  * EUC-KR uses only CS0 and CS1.
163  */
164 void
165 _EUC_KR_init(struct lc_ctype *lct)
166 {
167 	lct->lc_mbrtowc = _EUC_KR_mbrtowc;
168 	lct->lc_wcrtomb = _EUC_KR_wcrtomb;
169 	lct->lc_mbsnrtowcs = _EUC_KR_mbsnrtowcs;
170 	lct->lc_wcsnrtombs = _EUC_KR_wcsnrtombs;
171 	lct->lc_mbsinit = _EUC_mbsinit;
172 
173 	lct->lc_max_mblen = 2;
174 	lct->lc_is_ascii = 0;
175 }
176 
177 static size_t
178 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
179     size_t n, mbstate_t *_RESTRICT_KYWD ps)
180 {
181 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0));
182 }
183 
184 static size_t
185 _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
186     const char **_RESTRICT_KYWD src,
187     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
188 {
189 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
190 }
191 
192 static size_t
193 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
194     mbstate_t *_RESTRICT_KYWD ps)
195 {
196 	return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0));
197 }
198 
199 static size_t
200 _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
201     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
202 {
203 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
204 }
205 
206 /*
207  * EUC-JP uses CS0, CS1, CS2, and CS3.
208  */
209 void
210 _EUC_JP_init(struct lc_ctype *lct)
211 {
212 	lct->lc_mbrtowc = _EUC_JP_mbrtowc;
213 	lct->lc_wcrtomb = _EUC_JP_wcrtomb;
214 	lct->lc_mbsnrtowcs = _EUC_JP_mbsnrtowcs;
215 	lct->lc_wcsnrtombs = _EUC_JP_wcsnrtombs;
216 	lct->lc_mbsinit = _EUC_mbsinit;
217 
218 	lct->lc_max_mblen = 3;
219 	lct->lc_is_ascii = 0;
220 }
221 
222 static size_t
223 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
224     size_t n, mbstate_t *_RESTRICT_KYWD ps)
225 {
226 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
227 }
228 
229 static size_t
230 _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
231     const char **_RESTRICT_KYWD src,
232     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
233 {
234 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
235 }
236 
237 static size_t
238 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
239     mbstate_t *_RESTRICT_KYWD ps)
240 {
241 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
242 }
243 
244 static size_t
245 _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
246     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
247 {
248 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
249 }
250 
251 /*
252  * EUC-TW uses CS0, CS1, and CS2.
253  */
254 void
255 _EUC_TW_init(struct lc_ctype *lct)
256 {
257 	lct->lc_mbrtowc = _EUC_TW_mbrtowc;
258 	lct->lc_wcrtomb = _EUC_TW_wcrtomb;
259 	lct->lc_mbsnrtowcs = _EUC_TW_mbsnrtowcs;
260 	lct->lc_wcsnrtombs = _EUC_TW_wcsnrtombs;
261 	lct->lc_mbsinit = _EUC_mbsinit;
262 
263 	lct->lc_max_mblen = 4;
264 	lct->lc_is_ascii = 0;
265 }
266 
267 static size_t
268 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
269     size_t n, mbstate_t *_RESTRICT_KYWD ps)
270 {
271 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
272 }
273 
274 static size_t
275 _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
276     const char **_RESTRICT_KYWD src,
277     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
278 {
279 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
280 }
281 
282 static size_t
283 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
284     mbstate_t *_RESTRICT_KYWD ps)
285 {
286 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
287 }
288 
289 static size_t
290 _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
291     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
292 {
293 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
294 }
295 
296 /*
297  * Common EUC code.
298  */
299 
300 static size_t
301 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
302     size_t n, mbstate_t *_RESTRICT_KYWD ps,
303     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
304 {
305 	_EucState *es;
306 	int i, want;
307 	wchar_t wc = 0;
308 	unsigned char ch, chs;
309 
310 	es = (_EucState *)ps;
311 
312 	if (es->want < 0 || es->want > MB_CUR_MAX) {
313 		errno = EINVAL;
314 		return ((size_t)-1);
315 	}
316 
317 	if (s == NULL) {
318 		s = "";
319 		n = 1;
320 		pwc = NULL;
321 	}
322 
323 	if (n == 0)
324 		/* Incomplete multibyte sequence */
325 		return ((size_t)-2);
326 
327 	if (es->want == 0) {
328 		/* Fast path for plain ASCII (CS0) */
329 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
330 			if (pwc != NULL)
331 				*pwc = ch;
332 			return (ch != '\0' ? 1 : 0);
333 		}
334 
335 		if (ch >= 0xa1) {
336 			/* CS1 */
337 			want = 2;
338 		} else if (ch == cs2) {
339 			want = cs2width;
340 		} else if (ch == cs3) {
341 			want = cs3width;
342 		} else {
343 			errno = EILSEQ;
344 			return ((size_t)-1);
345 		}
346 
347 
348 		es->want = want;
349 		es->ch = 0;
350 	} else {
351 		want = es->want;
352 		wc = es->ch;
353 	}
354 
355 	for (i = 0; i < MIN(want, n); i++) {
356 		wc <<= 8;
357 		chs = *s;
358 		wc |= chs;
359 		s++;
360 	}
361 	if (i < want) {
362 		/* Incomplete multibyte sequence */
363 		es->want = want - i;
364 		es->ch = wc;
365 		return ((size_t)-2);
366 	}
367 	if (pwc != NULL)
368 		*pwc = wc;
369 	es->want = 0;
370 	return (wc == L'\0' ? 0 : want);
371 }
372 
373 static size_t
374 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc,
375     mbstate_t *_RESTRICT_KYWD ps,
376     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
377 {
378 	_EucState *es;
379 	int i, len;
380 	wchar_t nm;
381 
382 	es = (_EucState *)ps;
383 
384 	if (es->want != 0) {
385 		errno = EINVAL;
386 		return ((size_t)-1);
387 	}
388 
389 	if (s == NULL)
390 		/* Reset to initial shift state (no-op) */
391 		return (1);
392 
393 	if ((wc & ~0x7f) == 0) {
394 		/* Fast path for plain ASCII (CS0) */
395 		*s = (char)wc;
396 		return (1);
397 	}
398 
399 	/* Determine the "length" */
400 	if ((unsigned)wc > 0xffffff) {
401 		len = 4;
402 	} else if ((unsigned)wc > 0xffff) {
403 		len = 3;
404 	} else if ((unsigned)wc > 0xff) {
405 		len = 2;
406 	} else {
407 		len = 1;
408 	}
409 
410 	if (len > MB_CUR_MAX) {
411 		errno = EILSEQ;
412 		return ((size_t)-1);
413 	}
414 
415 	/* This first check excludes CS1, which is implicitly valid. */
416 	if ((wc < 0xa100) || (wc > 0xffff)) {
417 		/* Check for valid CS2 or CS3 */
418 		nm = (wc >> ((len - 1) * 8));
419 		if (nm == cs2) {
420 			if (len != cs2width) {
421 				errno = EILSEQ;
422 				return ((size_t)-1);
423 			}
424 		} else if (nm == cs3) {
425 			if (len != cs3width) {
426 				errno = EILSEQ;
427 				return ((size_t)-1);
428 			}
429 		} else {
430 			errno = EILSEQ;
431 			return ((size_t)-1);
432 		}
433 	}
434 
435 	/* Stash the bytes, least significant last */
436 	for (i = len - 1; i >= 0; i--) {
437 		s[i] = (wc & 0xff);
438 		wc >>= 8;
439 	}
440 	return (len);
441 }
442