xref: /freebsd/lib/libc/locale/euc.c (revision 4b9d605768acabc460aa6dcfe8a1f8db35b16794)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
7  * Copyright (c) 1993
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Paul Borman at Krystal Technologies.
12  *
13  * Copyright (c) 2011 The FreeBSD Foundation
14  *
15  * Portions of this software were developed by David Chisnall
16  * under sponsorship from the FreeBSD Foundation.
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  * 1. Redistributions of source code must retain the above copyright
22  *    notice, this list of conditions and the following disclaimer.
23  * 2. Redistributions in binary form must reproduce the above copyright
24  *    notice, this list of conditions and the following disclaimer in the
25  *    documentation and/or other materials provided with the distribution.
26  * 3. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  */
42 
43 #include <sys/param.h>
44 #include <errno.h>
45 #include <limits.h>
46 #include <runetype.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <wchar.h>
50 #include "mblocal.h"
51 
52 extern int __mb_sb_limit;
53 
54 static size_t	_EUC_mbrtowc_impl(wchar_t * __restrict, const char * __restrict,
55     size_t, mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t);
56 static size_t	_EUC_wcrtomb_impl(char * __restrict, wchar_t,
57     mbstate_t * __restrict, uint8_t, uint8_t, uint8_t, uint8_t);
58 
59 static size_t	_EUC_CN_mbrtowc(wchar_t * __restrict, const char * __restrict,
60 		    size_t, mbstate_t * __restrict);
61 static size_t	_EUC_JP_mbrtowc(wchar_t * __restrict, const char * __restrict,
62 		    size_t, mbstate_t * __restrict);
63 static size_t	_EUC_KR_mbrtowc(wchar_t * __restrict, const char * __restrict,
64 		    size_t, mbstate_t * __restrict);
65 static size_t	_EUC_TW_mbrtowc(wchar_t * __restrict, const char * __restrict,
66 		    size_t, mbstate_t * __restrict);
67 
68 static size_t	_EUC_CN_wcrtomb(char * __restrict, wchar_t,
69 		    mbstate_t * __restrict);
70 static size_t	_EUC_JP_wcrtomb(char * __restrict, wchar_t,
71 		    mbstate_t * __restrict);
72 static size_t	_EUC_KR_wcrtomb(char * __restrict, wchar_t,
73 		    mbstate_t * __restrict);
74 static size_t	_EUC_TW_wcrtomb(char * __restrict, wchar_t,
75 		    mbstate_t * __restrict);
76 
77 static size_t	_EUC_CN_mbsnrtowcs(wchar_t * __restrict,
78 		    const char ** __restrict, size_t, size_t,
79 		    mbstate_t * __restrict);
80 static size_t	_EUC_JP_mbsnrtowcs(wchar_t * __restrict,
81 		    const char ** __restrict, size_t, size_t,
82 		    mbstate_t * __restrict);
83 static size_t	_EUC_KR_mbsnrtowcs(wchar_t * __restrict,
84 		    const char ** __restrict, size_t, size_t,
85 		    mbstate_t * __restrict);
86 static size_t	_EUC_TW_mbsnrtowcs(wchar_t * __restrict,
87 		    const char ** __restrict, size_t, size_t,
88 		    mbstate_t * __restrict);
89 
90 static size_t	_EUC_CN_wcsnrtombs(char * __restrict,
91 		    const wchar_t ** __restrict, size_t, size_t,
92 		    mbstate_t * __restrict);
93 static size_t	_EUC_JP_wcsnrtombs(char * __restrict,
94 		    const wchar_t ** __restrict, size_t, size_t,
95 		    mbstate_t * __restrict);
96 static size_t	_EUC_KR_wcsnrtombs(char * __restrict,
97 		    const wchar_t ** __restrict, size_t, size_t,
98 		    mbstate_t * __restrict);
99 static size_t	_EUC_TW_wcsnrtombs(char * __restrict,
100 		    const wchar_t ** __restrict, size_t, size_t,
101 		    mbstate_t * __restrict);
102 
103 static int	_EUC_mbsinit(const mbstate_t *);
104 
/*
 * Per-conversion state for the EUC codecs.  The functions in this file
 * obtain it by casting the caller-supplied mbstate_t pointer.
 */
105 typedef struct {
106 	wchar_t	ch;	/* bytes of a partially decoded character, packed so far */
107 	int	set;	/* not referenced by the code visible in this file */
108 	int	want;	/* bytes still missing from the pending character; 0 = none */
109 } _EucState;
110 
111 static int
112 _EUC_mbsinit(const mbstate_t *ps)
113 {
114 
115 	return (ps == NULL || ((const _EucState *)ps)->want == 0);
116 }
117 
118 /*
119  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
120  */
121 int
122 _EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl)
123 {
124 	l->__mbrtowc = _EUC_CN_mbrtowc;
125 	l->__wcrtomb = _EUC_CN_wcrtomb;
126 	l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs;
127 	l->__wcsnrtombs = _EUC_CN_wcsnrtombs;
128 	l->__mbsinit = _EUC_mbsinit;
129 
130 	l->runes = rl;
131 	l->__mb_cur_max = 4;
132 	l->__mb_sb_limit = 128;
133 	return (0);
134 }
135 
136 static size_t
137 _EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
138     size_t n, mbstate_t * __restrict ps)
139 {
140 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
141 }
142 
143 static size_t
144 _EUC_CN_mbsnrtowcs(wchar_t * __restrict dst,
145     const char ** __restrict src,
146     size_t nms, size_t len, mbstate_t * __restrict ps)
147 {
148 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
149 }
150 
151 static size_t
152 _EUC_CN_wcrtomb(char * __restrict s, wchar_t wc,
153     mbstate_t * __restrict ps)
154 {
155 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
156 }
157 
158 static size_t
159 _EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
160 	size_t nwc, size_t len, mbstate_t * __restrict ps)
161 {
162 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
163 }
164 
165 /*
166  * EUC-KR uses only CS0 and CS1.
167  */
168 int
169 _EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl)
170 {
171 	l->__mbrtowc = _EUC_KR_mbrtowc;
172 	l->__wcrtomb = _EUC_KR_wcrtomb;
173 	l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs;
174 	l->__wcsnrtombs = _EUC_KR_wcsnrtombs;
175 	l->__mbsinit = _EUC_mbsinit;
176 
177 	l->runes = rl;
178 	l->__mb_cur_max = 2;
179 	l->__mb_sb_limit = 128;
180 	return (0);
181 }
182 
/*
 * Single-character decoder for EUC-KR: forwards to the common EUC decoder
 * with neither CS2 nor CS3 enabled (all four code-set arguments are 0).
 */
static size_t
_EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
    size_t n, mbstate_t * __restrict ps)
{
	size_t consumed;

	consumed = _EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0);
	return (consumed);
}
189 
190 static size_t
191 _EUC_KR_mbsnrtowcs(wchar_t * __restrict dst,
192     const char ** __restrict src,
193     size_t nms, size_t len, mbstate_t * __restrict ps)
194 {
195 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
196 }
197 
/*
 * Single-character encoder for EUC-KR: forwards to the common EUC encoder
 * with neither CS2 nor CS3 enabled (all four code-set arguments are 0).
 */
static size_t
_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc,
	mbstate_t * __restrict ps)
{
	size_t produced;

	produced = _EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0);
	return (produced);
}
204 
205 static size_t
206 _EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
207 	size_t nwc, size_t len, mbstate_t * __restrict ps)
208 {
209 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
210 }
211 
212 /*
213  * EUC-JP uses CS0, CS1, CS2, and CS3.
214  */
215 int
216 _EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl)
217 {
218 	l->__mbrtowc = _EUC_JP_mbrtowc;
219 	l->__wcrtomb = _EUC_JP_wcrtomb;
220 	l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs;
221 	l->__wcsnrtombs = _EUC_JP_wcsnrtombs;
222 	l->__mbsinit = _EUC_mbsinit;
223 
224 	l->runes = rl;
225 	l->__mb_cur_max = 3;
226 	l->__mb_sb_limit = 128;
227 	return (0);
228 }
229 
230 static size_t
231 _EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
232     size_t n, mbstate_t * __restrict ps)
233 {
234 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
235 }
236 
237 static size_t
238 _EUC_JP_mbsnrtowcs(wchar_t * __restrict dst,
239     const char ** __restrict src,
240     size_t nms, size_t len, mbstate_t * __restrict ps)
241 {
242 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
243 }
244 
245 static size_t
246 _EUC_JP_wcrtomb(char * __restrict s, wchar_t wc,
247     mbstate_t * __restrict ps)
248 {
249 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
250 }
251 
252 static size_t
253 _EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
254 	size_t nwc, size_t len, mbstate_t * __restrict ps)
255 {
256 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
257 }
258 
259 /*
260  * EUC-TW uses CS0, CS1, and CS2.
261  */
262 int
263 _EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl)
264 {
265 	l->__mbrtowc = _EUC_TW_mbrtowc;
266 	l->__wcrtomb = _EUC_TW_wcrtomb;
267 	l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs;
268 	l->__wcsnrtombs = _EUC_TW_wcsnrtombs;
269 	l->__mbsinit = _EUC_mbsinit;
270 
271 	l->runes = rl;
272 	l->__mb_cur_max = 4;
273 	l->__mb_sb_limit = 128;
274 	return (0);
275 }
276 
277 static size_t
278 _EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
279 	size_t n, mbstate_t * __restrict ps)
280 {
281 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
282 }
283 
284 static size_t
285 _EUC_TW_mbsnrtowcs(wchar_t * __restrict dst,
286 	const char ** __restrict src,
287 	size_t nms, size_t len, mbstate_t * __restrict ps)
288 {
289 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
290 }
291 
292 static size_t
293 _EUC_TW_wcrtomb(char * __restrict s, wchar_t wc,
294 	mbstate_t * __restrict ps)
295 {
296 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
297 }
298 
299 static size_t
300 _EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
301 	size_t nwc, size_t len, mbstate_t * __restrict ps)
302 {
303 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
304 }
305 
306 /*
307  * Common EUC code.
308  */
309 
310 static size_t
311 _EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s,
312 	size_t n, mbstate_t * __restrict ps,
313 	uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
314 {
315 	_EucState *es;
316 	int i, want;
317 	wchar_t wc = 0;
318 	unsigned char ch, chs;
319 
320 	es = (_EucState *)ps;
321 
322 	if (es->want < 0 || es->want > MB_CUR_MAX) {
323 		errno = EINVAL;
324 		return ((size_t)-1);
325 	}
326 
327 	if (s == NULL) {
328 		s = "";
329 		n = 1;
330 		pwc = NULL;
331 	}
332 
333 	if (n == 0)
334 		/* Incomplete multibyte sequence */
335 		return ((size_t)-2);
336 
337 	if (es->want == 0) {
338 		/* Fast path for plain ASCII (CS0) */
339 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
340 			if (pwc != NULL)
341 				*pwc = ch;
342 			return (ch != '\0' ? 1 : 0);
343 		}
344 
345 		if (ch >= 0xa1) {
346 			/* CS1 */
347 			want = 2;
348 		} else if (ch == cs2) {
349 			want = cs2width;
350 		} else if (ch == cs3) {
351 			want = cs3width;
352 		} else {
353 			errno = EILSEQ;
354 			return ((size_t)-1);
355 		}
356 
357 
358 		es->want = want;
359 		es->ch = 0;
360 	} else {
361 		want = es->want;
362 		wc = es->ch;
363 	}
364 
365 	for (i = 0; i < MIN(want, n); i++) {
366 		wc <<= 8;
367 		chs = *s;
368 		wc |= chs;
369 		s++;
370 	}
371 	if (i < want) {
372 		/* Incomplete multibyte sequence */
373 		es->want = want - i;
374 		es->ch = wc;
375 		errno = EILSEQ;
376 		return ((size_t)-2);
377 	}
378 	if (pwc != NULL)
379 		*pwc = wc;
380 	es->want = 0;
381 	return (wc == L'\0' ? 0 : want);
382 }
383 
384 static size_t
385 _EUC_wcrtomb_impl(char * __restrict s, wchar_t wc,
386     mbstate_t * __restrict ps,
387     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
388 {
389 	_EucState *es;
390 	int i, len;
391 	wchar_t nm;
392 
393 	es = (_EucState *)ps;
394 
395 	if (es->want != 0) {
396 		errno = EINVAL;
397 		return ((size_t)-1);
398 	}
399 
400 	if (s == NULL)
401 		/* Reset to initial shift state (no-op) */
402 		return (1);
403 
404 	if ((wc & ~0x7f) == 0) {
405 		/* Fast path for plain ASCII (CS0) */
406 		*s = (char)wc;
407 		return (1);
408 	}
409 
410 	/* Determine the "length" */
411 	if ((unsigned)wc > 0xffffff) {
412 		len = 4;
413 	} else if ((unsigned)wc > 0xffff) {
414 		len = 3;
415 	} else if ((unsigned)wc > 0xff) {
416 		len = 2;
417 	} else {
418 		len = 1;
419 	}
420 
421 	if (len > MB_CUR_MAX) {
422 		errno = EILSEQ;
423 		return ((size_t)-1);
424 	}
425 
426 	/* This first check excludes CS1, which is implicitly valid. */
427 	if ((wc < 0xa100) || (wc > 0xffff)) {
428 		/* Check for valid CS2 or CS3 */
429 		nm = (wc >> ((len - 1) * 8));
430 		if (nm == cs2) {
431 			if (len != cs2width) {
432 				errno = EILSEQ;
433 				return ((size_t)-1);
434 			}
435 		} else if (nm == cs3) {
436 			if (len != cs3width) {
437 				errno = EILSEQ;
438 				return ((size_t)-1);
439 			}
440 		} else {
441 			errno = EILSEQ;
442 			return ((size_t)-1);
443 		}
444 	}
445 
446 	/* Stash the bytes, least significant last */
447 	for (i = len - 1; i >= 0; i--) {
448 		s[i] = (wc & 0xff);
449 		wc >>= 8;
450 	}
451 	return (len);
452 }
453