xref: /illumos-gate/usr/src/lib/libc/port/locale/gb18030.c (revision fec047081731fd77caf46ec0471c501b2cb33894)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * PRC National Standard GB 18030-2000 encoding of Chinese text.
31  *
32  * See gb18030(7) for details.
33  */
34 
35 #include "lint.h"
36 #include <sys/types.h>
37 #include <errno.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42 #include "lctype.h"
43 
44 
45 static size_t	_GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
46 		    const char *_RESTRICT_KYWD,
47 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
48 static int	_GB18030_mbsinit(const mbstate_t *);
49 static size_t	_GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
50 		    mbstate_t *_RESTRICT_KYWD);
51 static size_t	_GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
52 		    const char **_RESTRICT_KYWD, size_t, size_t,
53 		    mbstate_t *_RESTRICT_KYWD);
54 static size_t	_GB18030_wcsnrtombs(char *_RESTRICT_KYWD,
55 		    const wchar_t **_RESTRICT_KYWD, size_t, size_t,
56 		    mbstate_t *_RESTRICT_KYWD);
57 
58 void
59 _GB18030_init(struct lc_ctype *lct)
60 {
61 
62 	lct->lc_mbrtowc = _GB18030_mbrtowc;
63 	lct->lc_wcrtomb = _GB18030_wcrtomb;
64 	lct->lc_mbsinit = _GB18030_mbsinit;
65 	lct->lc_mbsnrtowcs = _GB18030_mbsnrtowcs;
66 	lct->lc_wcsnrtombs = _GB18030_wcsnrtombs;
67 	lct->lc_max_mblen = 4;
68 	lct->lc_is_ascii = 0;
69 }
70 
71 static int
72 _GB18030_mbsinit(const mbstate_t *ps)
73 {
74 
75 	return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
76 }
77 
78 static size_t
79 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
80     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
81 {
82 	_GB18030State *gs;
83 	wchar_t wch;
84 	int ch, len, ocount;
85 	size_t ncopy;
86 
87 	gs = (_GB18030State *)ps;
88 
89 	if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
90 		errno = EINVAL;
91 		return ((size_t)-1);
92 	}
93 
94 	if (s == NULL) {
95 		s = "";
96 		n = 1;
97 		pwc = NULL;
98 	}
99 
100 	ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
101 	(void) memcpy(gs->bytes + gs->count, s, ncopy);
102 	ocount = gs->count;
103 	gs->count += ncopy;
104 	s = (char *)gs->bytes;
105 	n = gs->count;
106 
107 	if (n == 0)
108 		/* Incomplete multibyte sequence */
109 		return ((size_t)-2);
110 
111 	/*
112 	 * Single byte:		[00-7f]
113 	 * Two byte:		[81-fe][40-7e,80-fe]
114 	 * Four byte:		[81-fe][30-39][81-fe][30-39]
115 	 */
116 	ch = (unsigned char)*s++;
117 	if (ch <= 0x7f) {
118 		len = 1;
119 		wch = ch;
120 	} else if (ch >= 0x81 && ch <= 0xfe) {
121 		wch = ch;
122 		if (n < 2)
123 			return ((size_t)-2);
124 		ch = (unsigned char)*s++;
125 		if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
126 			wch = (wch << 8) | ch;
127 			len = 2;
128 		} else if (ch >= 0x30 && ch <= 0x39) {
129 			/*
130 			 * Strip high bit off the wide character we will
131 			 * eventually output so that it is positive when
132 			 * cast to wint_t on 32-bit twos-complement machines.
133 			 */
134 			wch = ((wch & 0x7f) << 8) | ch;
135 			if (n < 3)
136 				return ((size_t)-2);
137 			ch = (unsigned char)*s++;
138 			if (ch < 0x81 || ch > 0xfe)
139 				goto ilseq;
140 			wch = (wch << 8) | ch;
141 			if (n < 4)
142 				return ((size_t)-2);
143 			ch = (unsigned char)*s++;
144 			if (ch < 0x30 || ch > 0x39)
145 				goto ilseq;
146 			wch = (wch << 8) | ch;
147 			len = 4;
148 		} else
149 			goto ilseq;
150 	} else
151 		goto ilseq;
152 
153 	if (pwc != NULL)
154 		*pwc = wch;
155 	gs->count = 0;
156 	if (zero || wch != L'\0') {
157 		return (len - ocount);
158 	} else {
159 		return (0);
160 	}
161 ilseq:
162 	errno = EILSEQ;
163 	return ((size_t)-1);
164 }
165 
166 static size_t
167 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
168     mbstate_t *_RESTRICT_KYWD ps)
169 {
170 	_GB18030State *gs;
171 	size_t len;
172 	int c;
173 
174 	gs = (_GB18030State *)ps;
175 
176 	if (gs->count != 0) {
177 		errno = EINVAL;
178 		return ((size_t)-1);
179 	}
180 
181 	if (s == NULL)
182 		/* Reset to initial shift state (no-op) */
183 		return (1);
184 	if ((wc & ~0x7fffffff) != 0)
185 		goto ilseq;
186 	if (wc & 0x7f000000) {
187 		/* Replace high bit that mbrtowc() removed. */
188 		wc |= 0x80000000;
189 		c = (wc >> 24) & 0xff;
190 		if (c < 0x81 || c > 0xfe)
191 			goto ilseq;
192 		*s++ = c;
193 		c = (wc >> 16) & 0xff;
194 		if (c < 0x30 || c > 0x39)
195 			goto ilseq;
196 		*s++ = c;
197 		c = (wc >> 8) & 0xff;
198 		if (c < 0x81 || c > 0xfe)
199 			goto ilseq;
200 		*s++ = c;
201 		c = wc & 0xff;
202 		if (c < 0x30 || c > 0x39)
203 			goto ilseq;
204 		*s++ = c;
205 		len = 4;
206 	} else if (wc & 0x00ff0000)
207 		goto ilseq;
208 	else if (wc & 0x0000ff00) {
209 		c = (wc >> 8) & 0xff;
210 		if (c < 0x81 || c > 0xfe)
211 			goto ilseq;
212 		*s++ = c;
213 		c = wc & 0xff;
214 		if (c < 0x40 || c == 0x7f || c == 0xff)
215 			goto ilseq;
216 		*s++ = c;
217 		len = 2;
218 	} else if (wc <= 0x7f) {
219 		*s++ = wc;
220 		len = 1;
221 	} else
222 		goto ilseq;
223 
224 	return (len);
225 ilseq:
226 	errno = EILSEQ;
227 	return ((size_t)-1);
228 }
229 
230 static size_t
231 _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
232     const char **_RESTRICT_KYWD src, size_t nms, size_t len,
233     mbstate_t *_RESTRICT_KYWD ps)
234 {
235 	return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB18030_mbrtowc));
236 }
237 
238 static size_t
239 _GB18030_wcsnrtombs(char *_RESTRICT_KYWD dst,
240     const wchar_t **_RESTRICT_KYWD src, size_t nwc, size_t len,
241     mbstate_t *_RESTRICT_KYWD ps)
242 {
243 	return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB18030_wcrtomb));
244 }
245