xref: /illumos-gate/usr/src/lib/libc/port/locale/gb18030.c (revision 5027ae116939e77882ac5560962b15d8a69c9235)
1 /*
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 2002-2004 Tim J. Robbins
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * PRC National Standard GB 18030-2000 encoding of Chinese text.
30  *
31  * See gb18030(5) for details.
32  */
33 
34 #include "lint.h"
35 #include <sys/types.h>
36 #include <errno.h>
37 #include "runetype.h"
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42 
43 
44 static size_t	_GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
45 		    const char *_RESTRICT_KYWD,
46 		    size_t, mbstate_t *_RESTRICT_KYWD);
47 static int	_GB18030_mbsinit(const mbstate_t *);
48 static size_t	_GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
49 		    mbstate_t *_RESTRICT_KYWD);
50 
51 typedef struct {
52 	int	count;
53 	uchar_t	bytes[4];
54 } _GB18030State;
55 
56 int
57 _GB18030_init(_RuneLocale *rl)
58 {
59 
60 	__mbrtowc = _GB18030_mbrtowc;
61 	__wcrtomb = _GB18030_wcrtomb;
62 	__mbsinit = _GB18030_mbsinit;
63 	_CurrentRuneLocale = rl;
64 	__ctype[520] = 4;
65 	charset_is_ascii = 0;
66 
67 	return (0);
68 }
69 
70 static int
71 _GB18030_mbsinit(const mbstate_t *ps)
72 {
73 
74 	return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
75 }
76 
77 static size_t
78 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
79     size_t n, mbstate_t *_RESTRICT_KYWD ps)
80 {
81 	_GB18030State *gs;
82 	wchar_t wch;
83 	int ch, len, ocount;
84 	size_t ncopy;
85 
86 	gs = (_GB18030State *)ps;
87 
88 	if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
89 		errno = EINVAL;
90 		return ((size_t)-1);
91 	}
92 
93 	if (s == NULL) {
94 		s = "";
95 		n = 1;
96 		pwc = NULL;
97 	}
98 
99 	ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
100 	(void) memcpy(gs->bytes + gs->count, s, ncopy);
101 	ocount = gs->count;
102 	gs->count += ncopy;
103 	s = (char *)gs->bytes;
104 	n = gs->count;
105 
106 	if (n == 0)
107 		/* Incomplete multibyte sequence */
108 		return ((size_t)-2);
109 
110 	/*
111 	 * Single byte:		[00-7f]
112 	 * Two byte:		[81-fe][40-7e,80-fe]
113 	 * Four byte:		[81-fe][30-39][81-fe][30-39]
114 	 */
115 	ch = (unsigned char)*s++;
116 	if (ch <= 0x7f) {
117 		len = 1;
118 		wch = ch;
119 	} else if (ch >= 0x81 && ch <= 0xfe) {
120 		wch = ch;
121 		if (n < 2)
122 			return ((size_t)-2);
123 		ch = (unsigned char)*s++;
124 		if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
125 			wch = (wch << 8) | ch;
126 			len = 2;
127 		} else if (ch >= 0x30 && ch <= 0x39) {
128 			/*
129 			 * Strip high bit off the wide character we will
130 			 * eventually output so that it is positive when
131 			 * cast to wint_t on 32-bit twos-complement machines.
132 			 */
133 			wch = ((wch & 0x7f) << 8) | ch;
134 			if (n < 3)
135 				return ((size_t)-2);
136 			ch = (unsigned char)*s++;
137 			if (ch < 0x81 || ch > 0xfe)
138 				goto ilseq;
139 			wch = (wch << 8) | ch;
140 			if (n < 4)
141 				return ((size_t)-2);
142 			ch = (unsigned char)*s++;
143 			if (ch < 0x30 || ch > 0x39)
144 				goto ilseq;
145 			wch = (wch << 8) | ch;
146 			len = 4;
147 		} else
148 			goto ilseq;
149 	} else
150 		goto ilseq;
151 
152 	if (pwc != NULL)
153 		*pwc = wch;
154 	gs->count = 0;
155 	return (wch == L'\0' ? 0 : len - ocount);
156 ilseq:
157 	errno = EILSEQ;
158 	return ((size_t)-1);
159 }
160 
161 static size_t
162 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
163     mbstate_t *_RESTRICT_KYWD ps)
164 {
165 	_GB18030State *gs;
166 	size_t len;
167 	int c;
168 
169 	gs = (_GB18030State *)ps;
170 
171 	if (gs->count != 0) {
172 		errno = EINVAL;
173 		return ((size_t)-1);
174 	}
175 
176 	if (s == NULL)
177 		/* Reset to initial shift state (no-op) */
178 		return (1);
179 	if ((wc & ~0x7fffffff) != 0)
180 		goto ilseq;
181 	if (wc & 0x7f000000) {
182 		/* Replace high bit that mbrtowc() removed. */
183 		wc |= 0x80000000;
184 		c = (wc >> 24) & 0xff;
185 		if (c < 0x81 || c > 0xfe)
186 			goto ilseq;
187 		*s++ = c;
188 		c = (wc >> 16) & 0xff;
189 		if (c < 0x30 || c > 0x39)
190 			goto ilseq;
191 		*s++ = c;
192 		c = (wc >> 8) & 0xff;
193 		if (c < 0x81 || c > 0xfe)
194 			goto ilseq;
195 		*s++ = c;
196 		c = wc & 0xff;
197 		if (c < 0x30 || c > 0x39)
198 			goto ilseq;
199 		*s++ = c;
200 		len = 4;
201 	} else if (wc & 0x00ff0000)
202 		goto ilseq;
203 	else if (wc & 0x0000ff00) {
204 		c = (wc >> 8) & 0xff;
205 		if (c < 0x81 || c > 0xfe)
206 			goto ilseq;
207 		*s++ = c;
208 		c = wc & 0xff;
209 		if (c < 0x40 || c == 0x7f || c == 0xff)
210 			goto ilseq;
211 		*s++ = c;
212 		len = 2;
213 	} else if (wc <= 0x7f) {
214 		*s++ = wc;
215 		len = 1;
216 	} else
217 		goto ilseq;
218 
219 	return (len);
220 ilseq:
221 	errno = EILSEQ;
222 	return ((size_t)-1);
223 }
224