xref: /freebsd/lib/libc/locale/euc.c (revision b9f654b163bce26de79705e77b872427c9f2afa1)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
5  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
6  * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved.
7  * Copyright (c) 1993
8  *	The Regents of the University of California.  All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * Paul Borman at Krystal Technologies.
12  *
13  * Copyright (c) 2011 The FreeBSD Foundation
14  * All rights reserved.
15  * Portions of this software were developed by David Chisnall
16  * under sponsorship from the FreeBSD Foundation.
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  * 1. Redistributions of source code must retain the above copyright
22  *    notice, this list of conditions and the following disclaimer.
23  * 2. Redistributions in binary form must reproduce the above copyright
24  *    notice, this list of conditions and the following disclaimer in the
25  *    documentation and/or other materials provided with the distribution.
26  * 3. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  */
42 
43 #if defined(LIBC_SCCS) && !defined(lint)
44 static char sccsid[] = "@(#)euc.c	8.1 (Berkeley) 6/4/93";
45 #endif /* LIBC_SCCS and not lint */
46 #include <sys/param.h>
47 __FBSDID("$FreeBSD$");
48 
49 #include <errno.h>
50 #include <limits.h>
51 #include <runetype.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <wchar.h>
55 #include "mblocal.h"
56 
57 extern int __mb_sb_limit;
58 
/*
 * Shared conversion engines.  The four trailing uint8_t arguments are the
 * CS2 lead byte, the CS2 sequence width, the CS3 lead byte and the CS3
 * sequence width, in that order.
 */
static size_t	_EUC_mbrtowc_impl(wchar_t * __restrict,
		    const char * __restrict, size_t, mbstate_t * __restrict,
		    uint8_t, uint8_t, uint8_t, uint8_t);
static size_t	_EUC_wcrtomb_impl(char * __restrict, wchar_t,
		    mbstate_t * __restrict,
		    uint8_t, uint8_t, uint8_t, uint8_t);

/* Per-variant multibyte -> wide character converters. */
static size_t	_EUC_CN_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static size_t	_EUC_JP_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static size_t	_EUC_KR_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static size_t	_EUC_TW_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);

/* Per-variant wide character -> multibyte converters. */
static size_t	_EUC_CN_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_EUC_JP_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_EUC_KR_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_EUC_TW_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);

/* Per-variant multibyte string -> wide string converters. */
static size_t	_EUC_CN_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_JP_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_KR_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_TW_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);

/* Per-variant wide string -> multibyte string converters. */
static size_t	_EUC_CN_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_JP_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_KR_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_EUC_TW_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);

/* Conversion-state query shared by all variants. */
static int	_EUC_mbsinit(const mbstate_t *);
109 
/*
 * Conversion state kept inside the caller's mbstate_t while a multibyte
 * character is being decoded.
 */
typedef struct {
	wchar_t	ch;	/* bytes of the partial character gathered so far */
	int	set;	/* not referenced by the converters in this file */
	int	want;	/* bytes still missing; 0 means no character pending */
} _EucState;

/*
 * Report whether `ps' describes the initial conversion state.
 *
 * Returns non-zero when `ps' is NULL or when no multibyte character is
 * partially decoded (want == 0); returns 0 otherwise.
 */
static int
_EUC_mbsinit(const mbstate_t *ps)
{
	const _EucState *es;

	if (ps == NULL)
		return (1);

	es = (const _EucState *)ps;
	return (es->want == 0);
}
122 
123 /*
124  * EUC-CN uses CS0, CS1 and CS2 (4 bytes).
125  */
126 int
127 _EUC_CN_init(struct xlocale_ctype *l, _RuneLocale *rl)
128 {
129 	l->__mbrtowc = _EUC_CN_mbrtowc;
130 	l->__wcrtomb = _EUC_CN_wcrtomb;
131 	l->__mbsnrtowcs = _EUC_CN_mbsnrtowcs;
132 	l->__wcsnrtombs = _EUC_CN_wcsnrtombs;
133 	l->__mbsinit = _EUC_mbsinit;
134 
135 	l->runes = rl;
136 	l->__mb_cur_max = 4;
137 	l->__mb_sb_limit = 128;
138 	return (0);
139 }
140 
141 static size_t
142 _EUC_CN_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
143     size_t n, mbstate_t * __restrict ps)
144 {
145 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
146 }
147 
148 static size_t
149 _EUC_CN_mbsnrtowcs(wchar_t * __restrict dst,
150     const char ** __restrict src,
151     size_t nms, size_t len, mbstate_t * __restrict ps)
152 {
153 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc));
154 }
155 
156 static size_t
157 _EUC_CN_wcrtomb(char * __restrict s, wchar_t wc,
158     mbstate_t * __restrict ps)
159 {
160 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
161 }
162 
163 static size_t
164 _EUC_CN_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
165 	size_t nwc, size_t len, mbstate_t * __restrict ps)
166 {
167 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb));
168 }
169 
170 /*
171  * EUC-KR uses only CS0 and CS1.
172  */
173 int
174 _EUC_KR_init(struct xlocale_ctype *l, _RuneLocale *rl)
175 {
176 	l->__mbrtowc = _EUC_KR_mbrtowc;
177 	l->__wcrtomb = _EUC_KR_wcrtomb;
178 	l->__mbsnrtowcs = _EUC_KR_mbsnrtowcs;
179 	l->__wcsnrtombs = _EUC_KR_wcsnrtombs;
180 	l->__mbsinit = _EUC_mbsinit;
181 
182 	l->runes = rl;
183 	l->__mb_cur_max = 2;
184 	l->__mb_sb_limit = 128;
185 	return (0);
186 }
187 
/*
 * EUC-KR multibyte-to-wide converter: forwards to the shared decoder with
 * neither a CS2 nor a CS3 lead byte configured.
 */
static size_t
_EUC_KR_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
    size_t n, mbstate_t * __restrict ps)
{
	size_t consumed;

	consumed = _EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0);
	return (consumed);
}
194 
195 static size_t
196 _EUC_KR_mbsnrtowcs(wchar_t * __restrict dst,
197     const char ** __restrict src,
198     size_t nms, size_t len, mbstate_t * __restrict ps)
199 {
200 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc));
201 }
202 
/*
 * EUC-KR wide-to-multibyte converter: forwards to the shared encoder with
 * neither a CS2 nor a CS3 lead byte configured.
 */
static size_t
_EUC_KR_wcrtomb(char * __restrict s, wchar_t wc,
	mbstate_t * __restrict ps)
{
	size_t produced;

	produced = _EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0);
	return (produced);
}
209 
210 static size_t
211 _EUC_KR_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
212 	size_t nwc, size_t len, mbstate_t * __restrict ps)
213 {
214 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb));
215 }
216 
217 /*
218  * EUC-JP uses CS0, CS1, CS2, and CS3.
219  */
220 int
221 _EUC_JP_init(struct xlocale_ctype *l, _RuneLocale *rl)
222 {
223 	l->__mbrtowc = _EUC_JP_mbrtowc;
224 	l->__wcrtomb = _EUC_JP_wcrtomb;
225 	l->__mbsnrtowcs = _EUC_JP_mbsnrtowcs;
226 	l->__wcsnrtombs = _EUC_JP_wcsnrtombs;
227 	l->__mbsinit = _EUC_mbsinit;
228 
229 	l->runes = rl;
230 	l->__mb_cur_max = 3;
231 	l->__mb_sb_limit = 128;
232 	return (0);
233 }
234 
235 static size_t
236 _EUC_JP_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
237     size_t n, mbstate_t * __restrict ps)
238 {
239 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3));
240 }
241 
242 static size_t
243 _EUC_JP_mbsnrtowcs(wchar_t * __restrict dst,
244     const char ** __restrict src,
245     size_t nms, size_t len, mbstate_t * __restrict ps)
246 {
247 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc));
248 }
249 
250 static size_t
251 _EUC_JP_wcrtomb(char * __restrict s, wchar_t wc,
252     mbstate_t * __restrict ps)
253 {
254 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3));
255 }
256 
257 static size_t
258 _EUC_JP_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
259 	size_t nwc, size_t len, mbstate_t * __restrict ps)
260 {
261 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb));
262 }
263 
264 /*
265  * EUC-TW uses CS0, CS1, and CS2.
266  */
267 int
268 _EUC_TW_init(struct xlocale_ctype *l, _RuneLocale *rl)
269 {
270 	l->__mbrtowc = _EUC_TW_mbrtowc;
271 	l->__wcrtomb = _EUC_TW_wcrtomb;
272 	l->__mbsnrtowcs = _EUC_TW_mbsnrtowcs;
273 	l->__wcsnrtombs = _EUC_TW_wcsnrtombs;
274 	l->__mbsinit = _EUC_mbsinit;
275 
276 	l->runes = rl;
277 	l->__mb_cur_max = 4;
278 	l->__mb_sb_limit = 128;
279 	return (0);
280 }
281 
282 static size_t
283 _EUC_TW_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
284 	size_t n, mbstate_t * __restrict ps)
285 {
286 	return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0));
287 }
288 
289 static size_t
290 _EUC_TW_mbsnrtowcs(wchar_t * __restrict dst,
291 	const char ** __restrict src,
292 	size_t nms, size_t len, mbstate_t * __restrict ps)
293 {
294 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc));
295 }
296 
297 static size_t
298 _EUC_TW_wcrtomb(char * __restrict s, wchar_t wc,
299 	mbstate_t * __restrict ps)
300 {
301 	return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0));
302 }
303 
304 static size_t
305 _EUC_TW_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
306 	size_t nwc, size_t len, mbstate_t * __restrict ps)
307 {
308 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb));
309 }
310 
311 /*
312  * Common EUC code.
313  */
314 
315 static size_t
316 _EUC_mbrtowc_impl(wchar_t * __restrict pwc, const char * __restrict s,
317 	size_t n, mbstate_t * __restrict ps,
318 	uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
319 {
320 	_EucState *es;
321 	int i, want;
322 	wchar_t wc = 0;
323 	unsigned char ch, chs;
324 
325 	es = (_EucState *)ps;
326 
327 	if (es->want < 0 || es->want > MB_CUR_MAX) {
328 		errno = EINVAL;
329 		return ((size_t)-1);
330 	}
331 
332 	if (s == NULL) {
333 		s = "";
334 		n = 1;
335 		pwc = NULL;
336 	}
337 
338 	if (n == 0)
339 		/* Incomplete multibyte sequence */
340 		return ((size_t)-2);
341 
342 	if (es->want == 0) {
343 		/* Fast path for plain ASCII (CS0) */
344 		if (((ch = (unsigned char)*s) & 0x80) == 0) {
345 			if (pwc != NULL)
346 				*pwc = ch;
347 			return (ch != '\0' ? 1 : 0);
348 		}
349 
350 		if (ch >= 0xa1) {
351 			/* CS1 */
352 			want = 2;
353 		} else if (ch == cs2) {
354 			want = cs2width;
355 		} else if (ch == cs3) {
356 			want = cs3width;
357 		} else {
358 			errno = EILSEQ;
359 			return ((size_t)-1);
360 		}
361 
362 
363 		es->want = want;
364 		es->ch = 0;
365 	} else {
366 		want = es->want;
367 		wc = es->ch;
368 	}
369 
370 	for (i = 0; i < MIN(want, n); i++) {
371 		wc <<= 8;
372 		chs = *s;
373 		wc |= chs;
374 		s++;
375 	}
376 	if (i < want) {
377 		/* Incomplete multibyte sequence */
378 		es->want = want - i;
379 		es->ch = wc;
380 		errno = EILSEQ;
381 		return ((size_t)-2);
382 	}
383 	if (pwc != NULL)
384 		*pwc = wc;
385 	es->want = 0;
386 	return (wc == L'\0' ? 0 : want);
387 }
388 
389 static size_t
390 _EUC_wcrtomb_impl(char * __restrict s, wchar_t wc,
391     mbstate_t * __restrict ps,
392     uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
393 {
394 	_EucState *es;
395 	int i, len;
396 	wchar_t nm;
397 
398 	es = (_EucState *)ps;
399 
400 	if (es->want != 0) {
401 		errno = EINVAL;
402 		return ((size_t)-1);
403 	}
404 
405 	if (s == NULL)
406 		/* Reset to initial shift state (no-op) */
407 		return (1);
408 
409 	if ((wc & ~0x7f) == 0) {
410 		/* Fast path for plain ASCII (CS0) */
411 		*s = (char)wc;
412 		return (1);
413 	}
414 
415 	/* Determine the "length" */
416 	if ((unsigned)wc > 0xffffff) {
417 		len = 4;
418 	} else if ((unsigned)wc > 0xffff) {
419 		len = 3;
420 	} else if ((unsigned)wc > 0xff) {
421 		len = 2;
422 	} else {
423 		len = 1;
424 	}
425 
426 	if (len > MB_CUR_MAX) {
427 		errno = EILSEQ;
428 		return ((size_t)-1);
429 	}
430 
431 	/* This first check excludes CS1, which is implicitly valid. */
432 	if ((wc < 0xa100) || (wc > 0xffff)) {
433 		/* Check for valid CS2 or CS3 */
434 		nm = (wc >> ((len - 1) * 8));
435 		if (nm == cs2) {
436 			if (len != cs2width) {
437 				errno = EILSEQ;
438 				return ((size_t)-1);
439 			}
440 		} else if (nm == cs3) {
441 			if (len != cs3width) {
442 				errno = EILSEQ;
443 				return ((size_t)-1);
444 			}
445 		} else {
446 			errno = EILSEQ;
447 			return ((size_t)-1);
448 		}
449 	}
450 
451 	/* Stash the bytes, least significant last */
452 	for (i = len - 1; i >= 0; i--) {
453 		s[i] = (wc & 0xff);
454 		wc >>= 8;
455 	}
456 	return (len);
457 }
458