1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2010 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 2002-2004 Tim J. Robbins
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 /*
30 * PRC National Standard GB 18030-2000 encoding of Chinese text.
31 *
32 * See gb18030(5) for details.
33 */
34
35 #include "lint.h"
36 #include <sys/types.h>
37 #include <errno.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42 #include "lctype.h"
43
44
45 static size_t _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD,
46 const char *_RESTRICT_KYWD,
47 size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
48 static int _GB18030_mbsinit(const mbstate_t *);
49 static size_t _GB18030_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
50 mbstate_t *_RESTRICT_KYWD);
51 static size_t _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
52 const char **_RESTRICT_KYWD, size_t, size_t,
53 mbstate_t *_RESTRICT_KYWD);
54 static size_t _GB18030_wcsnrtombs(char *_RESTRICT_KYWD,
55 const wchar_t **_RESTRICT_KYWD, size_t, size_t,
56 mbstate_t *_RESTRICT_KYWD);
57
58 void
_GB18030_init(struct lc_ctype * lct)59 _GB18030_init(struct lc_ctype *lct)
60 {
61
62 lct->lc_mbrtowc = _GB18030_mbrtowc;
63 lct->lc_wcrtomb = _GB18030_wcrtomb;
64 lct->lc_mbsinit = _GB18030_mbsinit;
65 lct->lc_mbsnrtowcs = _GB18030_mbsnrtowcs;
66 lct->lc_wcsnrtombs = _GB18030_wcsnrtombs;
67 lct->lc_max_mblen = 4;
68 lct->lc_is_ascii = 0;
69 }
70
71 static int
_GB18030_mbsinit(const mbstate_t * ps)72 _GB18030_mbsinit(const mbstate_t *ps)
73 {
74
75 return (ps == NULL || ((const _GB18030State *)ps)->count == 0);
76 }
77
78 static size_t
_GB18030_mbrtowc(wchar_t * _RESTRICT_KYWD pwc,const char * _RESTRICT_KYWD s,size_t n,mbstate_t * _RESTRICT_KYWD ps,boolean_t zero)79 _GB18030_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
80 size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
81 {
82 _GB18030State *gs;
83 wchar_t wch;
84 int ch, len, ocount;
85 size_t ncopy;
86
87 gs = (_GB18030State *)ps;
88
89 if (gs->count < 0 || gs->count > sizeof (gs->bytes)) {
90 errno = EINVAL;
91 return ((size_t)-1);
92 }
93
94 if (s == NULL) {
95 s = "";
96 n = 1;
97 pwc = NULL;
98 }
99
100 ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof (gs->bytes) - gs->count);
101 (void) memcpy(gs->bytes + gs->count, s, ncopy);
102 ocount = gs->count;
103 gs->count += ncopy;
104 s = (char *)gs->bytes;
105 n = gs->count;
106
107 if (n == 0)
108 /* Incomplete multibyte sequence */
109 return ((size_t)-2);
110
111 /*
112 * Single byte: [00-7f]
113 * Two byte: [81-fe][40-7e,80-fe]
114 * Four byte: [81-fe][30-39][81-fe][30-39]
115 */
116 ch = (unsigned char)*s++;
117 if (ch <= 0x7f) {
118 len = 1;
119 wch = ch;
120 } else if (ch >= 0x81 && ch <= 0xfe) {
121 wch = ch;
122 if (n < 2)
123 return ((size_t)-2);
124 ch = (unsigned char)*s++;
125 if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) {
126 wch = (wch << 8) | ch;
127 len = 2;
128 } else if (ch >= 0x30 && ch <= 0x39) {
129 /*
130 * Strip high bit off the wide character we will
131 * eventually output so that it is positive when
132 * cast to wint_t on 32-bit twos-complement machines.
133 */
134 wch = ((wch & 0x7f) << 8) | ch;
135 if (n < 3)
136 return ((size_t)-2);
137 ch = (unsigned char)*s++;
138 if (ch < 0x81 || ch > 0xfe)
139 goto ilseq;
140 wch = (wch << 8) | ch;
141 if (n < 4)
142 return ((size_t)-2);
143 ch = (unsigned char)*s++;
144 if (ch < 0x30 || ch > 0x39)
145 goto ilseq;
146 wch = (wch << 8) | ch;
147 len = 4;
148 } else
149 goto ilseq;
150 } else
151 goto ilseq;
152
153 if (pwc != NULL)
154 *pwc = wch;
155 gs->count = 0;
156 if (zero || wch != L'\0') {
157 return (len - ocount);
158 } else {
159 return (0);
160 }
161 ilseq:
162 errno = EILSEQ;
163 return ((size_t)-1);
164 }
165
166 static size_t
_GB18030_wcrtomb(char * _RESTRICT_KYWD s,wchar_t wc,mbstate_t * _RESTRICT_KYWD ps)167 _GB18030_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc,
168 mbstate_t *_RESTRICT_KYWD ps)
169 {
170 _GB18030State *gs;
171 size_t len;
172 int c;
173
174 gs = (_GB18030State *)ps;
175
176 if (gs->count != 0) {
177 errno = EINVAL;
178 return ((size_t)-1);
179 }
180
181 if (s == NULL)
182 /* Reset to initial shift state (no-op) */
183 return (1);
184 if ((wc & ~0x7fffffff) != 0)
185 goto ilseq;
186 if (wc & 0x7f000000) {
187 /* Replace high bit that mbrtowc() removed. */
188 wc |= 0x80000000;
189 c = (wc >> 24) & 0xff;
190 if (c < 0x81 || c > 0xfe)
191 goto ilseq;
192 *s++ = c;
193 c = (wc >> 16) & 0xff;
194 if (c < 0x30 || c > 0x39)
195 goto ilseq;
196 *s++ = c;
197 c = (wc >> 8) & 0xff;
198 if (c < 0x81 || c > 0xfe)
199 goto ilseq;
200 *s++ = c;
201 c = wc & 0xff;
202 if (c < 0x30 || c > 0x39)
203 goto ilseq;
204 *s++ = c;
205 len = 4;
206 } else if (wc & 0x00ff0000)
207 goto ilseq;
208 else if (wc & 0x0000ff00) {
209 c = (wc >> 8) & 0xff;
210 if (c < 0x81 || c > 0xfe)
211 goto ilseq;
212 *s++ = c;
213 c = wc & 0xff;
214 if (c < 0x40 || c == 0x7f || c == 0xff)
215 goto ilseq;
216 *s++ = c;
217 len = 2;
218 } else if (wc <= 0x7f) {
219 *s++ = wc;
220 len = 1;
221 } else
222 goto ilseq;
223
224 return (len);
225 ilseq:
226 errno = EILSEQ;
227 return ((size_t)-1);
228 }
229
230 static size_t
_GB18030_mbsnrtowcs(wchar_t * _RESTRICT_KYWD dst,const char ** _RESTRICT_KYWD src,size_t nms,size_t len,mbstate_t * _RESTRICT_KYWD ps)231 _GB18030_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst,
232 const char **_RESTRICT_KYWD src, size_t nms, size_t len,
233 mbstate_t *_RESTRICT_KYWD ps)
234 {
235 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _GB18030_mbrtowc));
236 }
237
238 static size_t
_GB18030_wcsnrtombs(char * _RESTRICT_KYWD dst,const wchar_t ** _RESTRICT_KYWD src,size_t nwc,size_t len,mbstate_t * _RESTRICT_KYWD ps)239 _GB18030_wcsnrtombs(char *_RESTRICT_KYWD dst,
240 const wchar_t **_RESTRICT_KYWD src, size_t nwc, size_t len,
241 mbstate_t *_RESTRICT_KYWD ps)
242 {
243 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _GB18030_wcrtomb));
244 }
245