xref: /illumos-gate/usr/src/lib/libc/port/locale/gb18030.c (revision 66582b606a8194f7f3ba5b3a3a6dca5b0d346361)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * PRC National Standard GB 18030-2000 encoding of Chinese text.
31  *
32  * See gb18030(5) for details.
33  */
34 
35 #include "lint.h"
36 #include <sys/types.h>
37 #include <errno.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42 #include "lctype.h"
43 
44 
45 static size_t	_GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
46 		    const char *_RESTRICT_KYWD,
47 		    size_t, mbstate_t *_RESTRICT_KYWD);
48 static int	_GB18030_mbsinit(const mbstate_t *);
49 static size_t	_GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
50 		    mbstate_t *_RESTRICT_KYWD);
51 static size_t	_GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
52 		    const char **_RESTRICT_KYWD, size_t, size_t,
53 		    mbstate_t *_RESTRICT_KYWD);
54 static size_t	_GB18030_wcsnrtombs(char *_RESTRICT_KYWD,
55 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
56 		    mbstate_t *_RESTRICT_KYWD);
57 
58 
59 typedef struct {
60 	int	count;
61 	uchar_t	bytes[4];
62 } _GB18030State;
63 
64 void
65 _GB18030_init(struct lc_ctype *lct)
66 {
67 
68 	lct->lc_mbrtowc = _GB18030_mbrtowc;
69 	lct->lc_wcrtomb = _GB18030_wcrtomb;
70 	lct->lc_mbsinit = _GB18030_mbsinit;
71 	lct->lc_mbsnrtowcs = _GB18030_mbsnrtowcs;
72 	lct->lc_wcsnrtombs = _GB18030_wcsnrtombs;
73 	lct->lc_max_mblen = 4;
74 	lct->lc_is_ascii = 0;
75 }
76 
77 static int
78 _GB18030_mbsinit(const mbstate_t *ps)
79 {
80 
81 	return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
82 }
83 
84 static size_t
85 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
86     size_t n, mbstate_t *_RESTRICT_KYWD ps)
87 {
88 	_GB18030State *gs;
89 	wchar_t wch;
90 	int ch, len, ocount;
91 	size_t ncopy;
92 
93 	gs = (_GB18030State *)ps;
94 
95 	if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
96 		errno = EINVAL;
97 		return ((size_t)-1);
98 	}
99 
100 	if (s == NULL) {
101 		s = "";
102 		n = 1;
103 		pwc = NULL;
104 	}
105 
106 	ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
107 	(void) memcpy(gs->bytes + gs->count, s, ncopy);
108 	ocount = gs->count;
109 	gs->count += ncopy;
110 	s = (char *)gs->bytes;
111 	n = gs->count;
112 
113 	if (n == 0)
114 		/* Incomplete multibyte sequence */
115 		return ((size_t)-2);
116 
117 	/*
118 	 * Single byte:		[00-7f]
119 	 * Two byte:		[81-fe][40-7e,80-fe]
120 	 * Four byte:		[81-fe][30-39][81-fe][30-39]
121 	 */
122 	ch = (unsigned char)*s++;
123 	if (ch <= 0x7f) {
124 		len = 1;
125 		wch = ch;
126 	} else if (ch >= 0x81 && ch <= 0xfe) {
127 		wch = ch;
128 		if (n < 2)
129 			return ((size_t)-2);
130 		ch = (unsigned char)*s++;
131 		if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
132 			wch = (wch << 8) | ch;
133 			len = 2;
134 		} else if (ch >= 0x30 && ch <= 0x39) {
135 			/*
136 			 * Strip high bit off the wide character we will
137 			 * eventually output so that it is positive when
138 			 * cast to wint_t on 32-bit twos-complement machines.
139 			 */
140 			wch = ((wch & 0x7f) << 8) | ch;
141 			if (n < 3)
142 				return ((size_t)-2);
143 			ch = (unsigned char)*s++;
144 			if (ch < 0x81 || ch > 0xfe)
145 				goto ilseq;
146 			wch = (wch << 8) | ch;
147 			if (n < 4)
148 				return ((size_t)-2);
149 			ch = (unsigned char)*s++;
150 			if (ch < 0x30 || ch > 0x39)
151 				goto ilseq;
152 			wch = (wch << 8) | ch;
153 			len = 4;
154 		} else
155 			goto ilseq;
156 	} else
157 		goto ilseq;
158 
159 	if (pwc != NULL)
160 		*pwc = wch;
161 	gs->count = 0;
162 	return (wch == L'\0' ? 0 : len - ocount);
163 ilseq:
164 	errno = EILSEQ;
165 	return ((size_t)-1);
166 }
167 
168 static size_t
169 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
170     mbstate_t *_RESTRICT_KYWD ps)
171 {
172 	_GB18030State *gs;
173 	size_t len;
174 	int c;
175 
176 	gs = (_GB18030State *)ps;
177 
178 	if (gs->count != 0) {
179 		errno = EINVAL;
180 		return ((size_t)-1);
181 	}
182 
183 	if (s == NULL)
184 		/* Reset to initial shift state (no-op) */
185 		return (1);
186 	if ((wc & ~0x7fffffff) != 0)
187 		goto ilseq;
188 	if (wc & 0x7f000000) {
189 		/* Replace high bit that mbrtowc() removed. */
190 		wc |= 0x80000000;
191 		c = (wc >> 24) & 0xff;
192 		if (c < 0x81 || c > 0xfe)
193 			goto ilseq;
194 		*s++ = c;
195 		c = (wc >> 16) & 0xff;
196 		if (c < 0x30 || c > 0x39)
197 			goto ilseq;
198 		*s++ = c;
199 		c = (wc >> 8) & 0xff;
200 		if (c < 0x81 || c > 0xfe)
201 			goto ilseq;
202 		*s++ = c;
203 		c = wc & 0xff;
204 		if (c < 0x30 || c > 0x39)
205 			goto ilseq;
206 		*s++ = c;
207 		len = 4;
208 	} else if (wc & 0x00ff0000)
209 		goto ilseq;
210 	else if (wc & 0x0000ff00) {
211 		c = (wc >> 8) & 0xff;
212 		if (c < 0x81 || c > 0xfe)
213 			goto ilseq;
214 		*s++ = c;
215 		c = wc & 0xff;
216 		if (c < 0x40 || c == 0x7f || c == 0xff)
217 			goto ilseq;
218 		*s++ = c;
219 		len = 2;
220 	} else if (wc <= 0x7f) {
221 		*s++ = wc;
222 		len = 1;
223 	} else
224 		goto ilseq;
225 
226 	return (len);
227 ilseq:
228 	errno = EILSEQ;
229 	return ((size_t)-1);
230 }
231 
232 static size_t
233 _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
234     const char **_RESTRICT_KYWD src, size_t nms, size_t len,
235     mbstate_t *_RESTRICT_KYWD ps)
236 {
237 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB18030_mbrtowc));
238 }
239 
240 static size_t
241 _GB18030_wcsnrtombs(char *_RESTRICT_KYWD dst,
242     const wchar_t **_RESTRICT_KYWD src, size_t nwc, size_t len,
243     mbstate_t *_RESTRICT_KYWD ps)
244 {
245 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB18030_wcrtomb));
246 }
247