/* xref: /freebsd/lib/libc/locale/gb18030.c (revision 1d386b48a555f61cb7325543adbbb5c3f3407a66) */
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Copyright (c) 2011 The FreeBSD Foundation
 *
 * Portions of this software were developed by David Chisnall
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * PRC National Standard GB 18030-2000 encoding of Chinese text.
 *
 * See gb18030(5) for details.
 */

#include <sys/param.h>
#include <errno.h>
#include <runetype.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include "mblocal.h"

/*
 * Forward declarations of the file-local conversion routines that
 * _GB18030_init() installs into the locale's method table.
 */
static size_t	_GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static int	_GB18030_mbsinit(const mbstate_t *);
static size_t	_GB18030_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_GB18030_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_GB18030_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);

/*
 * Conversion state stored in the caller's mbstate_t: the bytes of a
 * multibyte character that have been received so far but do not yet form
 * a complete character.
 */
typedef struct {
	int		count;		/* number of valid entries in bytes[] */
	unsigned char	bytes[4];	/* buffered bytes of a partial character */
} _GB18030State;

68a0308108SAndrey A. Chernov int
_GB18030_init(struct xlocale_ctype * l,_RuneLocale * rl)693c87aa1dSDavid Chisnall _GB18030_init(struct xlocale_ctype *l, _RuneLocale *rl)
70a0308108SAndrey A. Chernov {
714f6d4aa3STim J. Robbins 
723c87aa1dSDavid Chisnall 	l->__mbrtowc = _GB18030_mbrtowc;
733c87aa1dSDavid Chisnall 	l->__wcrtomb = _GB18030_wcrtomb;
743c87aa1dSDavid Chisnall 	l->__mbsinit = _GB18030_mbsinit;
757b247341SBaptiste Daroussin 	l->__mbsnrtowcs = _GB18030_mbsnrtowcs;
767b247341SBaptiste Daroussin 	l->__wcsnrtombs = _GB18030_wcsnrtombs;
773c87aa1dSDavid Chisnall 	l->runes = rl;
783c87aa1dSDavid Chisnall 	l->__mb_cur_max = 4;
793c87aa1dSDavid Chisnall 	l->__mb_sb_limit = 128;
804f6d4aa3STim J. Robbins 
81a0308108SAndrey A. Chernov 	return (0);
82a0308108SAndrey A. Chernov }
83a0308108SAndrey A. Chernov 
84e94c6cb4SAlexey Zelkin static int
_GB18030_mbsinit(const mbstate_t * ps)85ca2dae42STim J. Robbins _GB18030_mbsinit(const mbstate_t *ps)
86ca2dae42STim J. Robbins {
87ca2dae42STim J. Robbins 
88fa02ee78STim J. Robbins 	return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
89ca2dae42STim J. Robbins }
90ca2dae42STim J. Robbins 
91e94c6cb4SAlexey Zelkin static size_t
_GB18030_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)924f6d4aa3STim J. Robbins _GB18030_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s,
93ca2dae42STim J. Robbins     size_t n, mbstate_t * __restrict ps)
94a0308108SAndrey A. Chernov {
95ca2dae42STim J. Robbins 	_GB18030State *gs;
964f6d4aa3STim J. Robbins 	wchar_t wch;
97ca2dae42STim J. Robbins 	int ch, len, ocount;
98ca2dae42STim J. Robbins 	size_t ncopy;
994f6d4aa3STim J. Robbins 
100ca2dae42STim J. Robbins 	gs = (_GB18030State *)ps;
101ca2dae42STim J. Robbins 
102fc813796STim J. Robbins 	if (gs->count < 0 || gs->count > sizeof(gs->bytes)) {
103fc813796STim J. Robbins 		errno = EINVAL;
104fc813796STim J. Robbins 		return ((size_t)-1);
105fc813796STim J. Robbins 	}
106fc813796STim J. Robbins 
107ca2dae42STim J. Robbins 	if (s == NULL) {
108ca2dae42STim J. Robbins 		s = "";
109ca2dae42STim J. Robbins 		n = 1;
110ca2dae42STim J. Robbins 		pwc = NULL;
111ca2dae42STim J. Robbins 	}
112ca2dae42STim J. Robbins 
113ca2dae42STim J. Robbins 	ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof(gs->bytes) - gs->count);
114ca2dae42STim J. Robbins 	memcpy(gs->bytes + gs->count, s, ncopy);
115ca2dae42STim J. Robbins 	ocount = gs->count;
116ca2dae42STim J. Robbins 	gs->count += ncopy;
117ca2dae42STim J. Robbins 	s = (char *)gs->bytes;
118ca2dae42STim J. Robbins 	n = gs->count;
119ca2dae42STim J. Robbins 
1204f6d4aa3STim J. Robbins 	if (n == 0)
1214f6d4aa3STim J. Robbins 		/* Incomplete multibyte sequence */
1224f6d4aa3STim J. Robbins 		return ((size_t)-2);
1234f6d4aa3STim J. Robbins 
1244f6d4aa3STim J. Robbins 	/*
1254f6d4aa3STim J. Robbins 	 * Single byte:		[00-7f]
1264f6d4aa3STim J. Robbins 	 * Two byte:		[81-fe][40-7e,80-fe]
1274f6d4aa3STim J. Robbins 	 * Four byte:		[81-fe][30-39][81-fe][30-39]
1284f6d4aa3STim J. Robbins 	 */
1294f6d4aa3STim J. Robbins 	ch = (unsigned char)*s++;
1304f6d4aa3STim J. Robbins 	if (ch <= 0x7f) {
1314f6d4aa3STim J. Robbins 		len = 1;
1324f6d4aa3STim J. Robbins 		wch = ch;
1334f6d4aa3STim J. Robbins 	} else if (ch >= 0x81 && ch <= 0xfe) {
1344f6d4aa3STim J. Robbins 		wch = ch;
1354f6d4aa3STim J. Robbins 		if (n < 2)
1364f6d4aa3STim J. Robbins 			return ((size_t)-2);
1374f6d4aa3STim J. Robbins 		ch = (unsigned char)*s++;
1384f6d4aa3STim J. Robbins 		if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
1394f6d4aa3STim J. Robbins 			wch = (wch << 8) | ch;
1404f6d4aa3STim J. Robbins 			len = 2;
1414f6d4aa3STim J. Robbins 		} else if (ch >= 0x30 && ch <= 0x39) {
1424f6d4aa3STim J. Robbins 			/*
1434f6d4aa3STim J. Robbins 			 * Strip high bit off the wide character we will
1444f6d4aa3STim J. Robbins 			 * eventually output so that it is positive when
1454f6d4aa3STim J. Robbins 			 * cast to wint_t on 32-bit twos-complement machines.
1464f6d4aa3STim J. Robbins 			 */
1474f6d4aa3STim J. Robbins 			wch = ((wch & 0x7f) << 8) | ch;
1484f6d4aa3STim J. Robbins 			if (n < 3)
1494f6d4aa3STim J. Robbins 				return ((size_t)-2);
1504f6d4aa3STim J. Robbins 			ch = (unsigned char)*s++;
1514f6d4aa3STim J. Robbins 			if (ch < 0x81 || ch > 0xfe)
1524f6d4aa3STim J. Robbins 				goto ilseq;
1534f6d4aa3STim J. Robbins 			wch = (wch << 8) | ch;
1544f6d4aa3STim J. Robbins 			if (n < 4)
1554f6d4aa3STim J. Robbins 				return ((size_t)-2);
1564f6d4aa3STim J. Robbins 			ch = (unsigned char)*s++;
1574f6d4aa3STim J. Robbins 			if (ch < 0x30 || ch > 0x39)
1584f6d4aa3STim J. Robbins 				goto ilseq;
1594f6d4aa3STim J. Robbins 			wch = (wch << 8) | ch;
1604f6d4aa3STim J. Robbins 			len = 4;
1614f6d4aa3STim J. Robbins 		} else
1624f6d4aa3STim J. Robbins 			goto ilseq;
1634f6d4aa3STim J. Robbins 	} else
1644f6d4aa3STim J. Robbins 		goto ilseq;
1654f6d4aa3STim J. Robbins 
1664f6d4aa3STim J. Robbins 	if (pwc != NULL)
1674f6d4aa3STim J. Robbins 		*pwc = wch;
168ca2dae42STim J. Robbins 	gs->count = 0;
169ca2dae42STim J. Robbins 	return (wch == L'\0' ? 0 : len - ocount);
1704f6d4aa3STim J. Robbins ilseq:
1714f6d4aa3STim J. Robbins 	errno = EILSEQ;
1724f6d4aa3STim J. Robbins 	return ((size_t)-1);
173a0308108SAndrey A. Chernov }
174a0308108SAndrey A. Chernov 
175e94c6cb4SAlexey Zelkin static size_t
_GB18030_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)176fc813796STim J. Robbins _GB18030_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
177a0308108SAndrey A. Chernov {
178fc813796STim J. Robbins 	_GB18030State *gs;
1794f6d4aa3STim J. Robbins 	size_t len;
1804f6d4aa3STim J. Robbins 	int c;
181a0308108SAndrey A. Chernov 
182fc813796STim J. Robbins 	gs = (_GB18030State *)ps;
183fc813796STim J. Robbins 
184fc813796STim J. Robbins 	if (gs->count != 0) {
185fc813796STim J. Robbins 		errno = EINVAL;
186fc813796STim J. Robbins 		return ((size_t)-1);
187fc813796STim J. Robbins 	}
188fc813796STim J. Robbins 
1894f6d4aa3STim J. Robbins 	if (s == NULL)
1904f6d4aa3STim J. Robbins 		/* Reset to initial shift state (no-op) */
191a0308108SAndrey A. Chernov 		return (1);
1924f6d4aa3STim J. Robbins 	if ((wc & ~0x7fffffff) != 0)
1934f6d4aa3STim J. Robbins 		goto ilseq;
1944f6d4aa3STim J. Robbins 	if (wc & 0x7f000000) {
1954f6d4aa3STim J. Robbins 		/* Replace high bit that mbrtowc() removed. */
1964f6d4aa3STim J. Robbins 		wc |= 0x80000000;
1974f6d4aa3STim J. Robbins 		c = (wc >> 24) & 0xff;
1984f6d4aa3STim J. Robbins 		if (c < 0x81 || c > 0xfe)
1994f6d4aa3STim J. Robbins 			goto ilseq;
2004f6d4aa3STim J. Robbins 		*s++ = c;
2014f6d4aa3STim J. Robbins 		c = (wc >> 16) & 0xff;
2024f6d4aa3STim J. Robbins 		if (c < 0x30 || c > 0x39)
2034f6d4aa3STim J. Robbins 			goto ilseq;
2044f6d4aa3STim J. Robbins 		*s++ = c;
2054f6d4aa3STim J. Robbins 		c = (wc >> 8) & 0xff;
2064f6d4aa3STim J. Robbins 		if (c < 0x81 || c > 0xfe)
2074f6d4aa3STim J. Robbins 			goto ilseq;
2084f6d4aa3STim J. Robbins 		*s++ = c;
2094f6d4aa3STim J. Robbins 		c = wc & 0xff;
2104f6d4aa3STim J. Robbins 		if (c < 0x30 || c > 0x39)
2114f6d4aa3STim J. Robbins 			goto ilseq;
2124f6d4aa3STim J. Robbins 		*s++ = c;
2134f6d4aa3STim J. Robbins 		len = 4;
2144f6d4aa3STim J. Robbins 	} else if (wc & 0x00ff0000)
2154f6d4aa3STim J. Robbins 		goto ilseq;
2164f6d4aa3STim J. Robbins 	else if (wc & 0x0000ff00) {
2174f6d4aa3STim J. Robbins 		c = (wc >> 8) & 0xff;
2184f6d4aa3STim J. Robbins 		if (c < 0x81 || c > 0xfe)
2194f6d4aa3STim J. Robbins 			goto ilseq;
2204f6d4aa3STim J. Robbins 		*s++ = c;
2214f6d4aa3STim J. Robbins 		c = wc & 0xff;
2224f6d4aa3STim J. Robbins 		if (c < 0x40 || c == 0x7f || c == 0xff)
2234f6d4aa3STim J. Robbins 			goto ilseq;
2244f6d4aa3STim J. Robbins 		*s++ = c;
2254f6d4aa3STim J. Robbins 		len = 2;
2264f6d4aa3STim J. Robbins 	} else if (wc <= 0x7f) {
2274f6d4aa3STim J. Robbins 		*s++ = wc;
2284f6d4aa3STim J. Robbins 		len = 1;
2294f6d4aa3STim J. Robbins 	} else
2304f6d4aa3STim J. Robbins 		goto ilseq;
2314f6d4aa3STim J. Robbins 
2324f6d4aa3STim J. Robbins 	return (len);
2334f6d4aa3STim J. Robbins ilseq:
2344f6d4aa3STim J. Robbins 	errno = EILSEQ;
2354f6d4aa3STim J. Robbins 	return ((size_t)-1);
236a0308108SAndrey A. Chernov }
2377b247341SBaptiste Daroussin 
2387b247341SBaptiste Daroussin static size_t
_GB18030_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nms,size_t len,mbstate_t * __restrict ps)2397b247341SBaptiste Daroussin _GB18030_mbsnrtowcs(wchar_t * __restrict dst,
2407b247341SBaptiste Daroussin     const char ** __restrict src, size_t nms, size_t len,
2417b247341SBaptiste Daroussin     mbstate_t * __restrict ps)
2427b247341SBaptiste Daroussin {
2437b247341SBaptiste Daroussin 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB18030_mbrtowc));
2447b247341SBaptiste Daroussin }
2457b247341SBaptiste Daroussin 
2467b247341SBaptiste Daroussin static size_t
_GB18030_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)2477b247341SBaptiste Daroussin _GB18030_wcsnrtombs(char * __restrict dst,
2487b247341SBaptiste Daroussin     const wchar_t ** __restrict src, size_t nwc, size_t len,
2497b247341SBaptiste Daroussin     mbstate_t * __restrict ps)
2507b247341SBaptiste Daroussin {
2517b247341SBaptiste Daroussin 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB18030_wcrtomb));
2527b247341SBaptiste Daroussin }
253