1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 2002-2004 Tim J. Robbins
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include "lint.h"
30 #include <errno.h>
31 #include <limits.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "mblocal.h"
36 #include "lctype.h"
37
38 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
39 const char *_RESTRICT_KYWD,
40 size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
41 static int _UTF8_mbsinit(const mbstate_t *);
42 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
43 const char **_RESTRICT_KYWD, size_t, size_t,
44 mbstate_t *_RESTRICT_KYWD);
45 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
46 mbstate_t *_RESTRICT_KYWD);
47 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
48 const wchar_t **_RESTRICT_KYWD,
49 size_t, size_t, mbstate_t *_RESTRICT_KYWD);
50
51 void
_UTF8_init(struct lc_ctype * lct)52 _UTF8_init(struct lc_ctype *lct)
53 {
54 lct->lc_mbrtowc = _UTF8_mbrtowc;
55 lct->lc_wcrtomb = _UTF8_wcrtomb;
56 lct->lc_mbsinit = _UTF8_mbsinit;
57 lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
58 lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
59 lct->lc_is_ascii = 0;
60 lct->lc_max_mblen = 4;
61 }
62
63 static int
_UTF8_mbsinit(const mbstate_t * ps)64 _UTF8_mbsinit(const mbstate_t *ps)
65 {
66
67 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
68 }
69
70 static size_t
_UTF8_mbrtowc(wchar_t * _RESTRICT_KYWD pwc,const char * _RESTRICT_KYWD s,size_t n,mbstate_t * _RESTRICT_KYWD ps,boolean_t zero)71 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
72 size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
73 {
74 _UTF8State *us;
75 int ch, i, mask, want;
76 wchar_t lbound, wch;
77
78 us = (_UTF8State *)ps;
79
80 if (us->want < 0 || us->want > 6) {
81 errno = EINVAL;
82 return ((size_t)-1);
83 }
84
85 if (s == NULL) {
86 s = "";
87 n = 1;
88 pwc = NULL;
89 }
90
91 if (n == 0)
92 /* Incomplete multibyte sequence */
93 return ((size_t)-2);
94
95 if (us->want == 0) {
96 /*
97 * Determine the number of octets that make up this character
98 * from the first octet, and a mask that extracts the
99 * interesting bits of the first octet. We already know
100 * the character is at least two bytes long.
101 *
102 * We also specify a lower bound for the character code to
103 * detect redundant, non-"shortest form" encodings. For
104 * example, the sequence C0 80 is _not_ a legal representation
105 * of the null character. This enforces a 1-to-1 mapping
106 * between character codes and their multibyte representations.
107 */
108 ch = (unsigned char)*s;
109 if ((ch & 0x80) == 0) {
110 /* Fast path for plain ASCII characters. */
111 if (pwc != NULL)
112 *pwc = ch;
113 if (zero || ch != '\0') {
114 return (1);
115 } else {
116 return (0);
117 }
118 }
119 if ((ch & 0xe0) == 0xc0) {
120 mask = 0x1f;
121 want = 2;
122 lbound = 0x80;
123 } else if ((ch & 0xf0) == 0xe0) {
124 mask = 0x0f;
125 want = 3;
126 lbound = 0x800;
127 } else if ((ch & 0xf8) == 0xf0) {
128 mask = 0x07;
129 want = 4;
130 lbound = 0x10000;
131 #if 0
132 /* These would be illegal in the UTF-8 space */
133
134 } else if ((ch & 0xfc) == 0xf8) {
135 mask = 0x03;
136 want = 5;
137 lbound = 0x200000;
138 } else if ((ch & 0xfe) == 0xfc) {
139 mask = 0x01;
140 want = 6;
141 lbound = 0x4000000;
142 #endif
143 } else {
144 /*
145 * Malformed input; input is not UTF-8.
146 */
147 errno = EILSEQ;
148 return ((size_t)-1);
149 }
150 } else {
151 want = us->want;
152 lbound = us->lbound;
153 }
154
155 /*
156 * Decode the octet sequence representing the character in chunks
157 * of 6 bits, most significant first.
158 */
159 if (us->want == 0)
160 wch = (unsigned char)*s++ & mask;
161 else
162 wch = us->ch;
163
164 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
165 if ((*s & 0xc0) != 0x80) {
166 /*
167 * Malformed input; bad characters in the middle
168 * of a character.
169 */
170 errno = EILSEQ;
171 return ((size_t)-1);
172 }
173 wch <<= 6;
174 wch |= *s++ & 0x3f;
175 }
176 if (i < want) {
177 /* Incomplete multibyte sequence. */
178 us->want = want - i;
179 us->lbound = lbound;
180 us->ch = wch;
181 return ((size_t)-2);
182 }
183 if (wch < lbound) {
184 /*
185 * Malformed input; redundant encoding.
186 */
187 errno = EILSEQ;
188 return ((size_t)-1);
189 }
190 if (pwc != NULL)
191 *pwc = wch;
192 us->want = 0;
193 if (zero || wch != L'\0') {
194 return (want);
195 } else {
196 return (0);
197 }
198 }
199
200 static size_t
_UTF8_mbsnrtowcs(wchar_t * _RESTRICT_KYWD dst,const char ** _RESTRICT_KYWD src,size_t nms,size_t len,mbstate_t * _RESTRICT_KYWD ps)201 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
202 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
203 {
204 _UTF8State *us;
205 const char *s;
206 size_t nchr;
207 wchar_t wc;
208 size_t nb;
209
210 us = (_UTF8State *)ps;
211
212 s = *src;
213 nchr = 0;
214
215 if (dst == NULL) {
216 /*
217 * The fast path in the loop below is not safe if an ASCII
218 * character appears as anything but the first byte of a
219 * multibyte sequence. Check now to avoid doing it in the loop.
220 */
221 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
222 errno = EILSEQ;
223 return ((size_t)-1);
224 }
225 for (;;) {
226 if (nms > 0 && (signed char)*s > 0) {
227 /*
228 * Fast path for plain ASCII characters
229 * excluding NUL.
230 */
231 nb = 1;
232 } else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps,
233 B_FALSE)) == (size_t)-1) {
234 /* Invalid sequence - mbrtowc() sets errno. */
235 return ((size_t)-1);
236 } else if (nb == 0 || nb == (size_t)-2) {
237 return (nchr);
238 }
239 s += nb;
240 nms -= nb;
241 nchr++;
242 }
243 /*NOTREACHED*/
244 }
245
246 /*
247 * The fast path in the loop below is not safe if an ASCII
248 * character appears as anything but the first byte of a
249 * multibyte sequence. Check now to avoid doing it in the loop.
250 */
251 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
252 errno = EILSEQ;
253 return ((size_t)-1);
254 }
255 while (len-- > 0) {
256 if (nms > 0 && (signed char)*s > 0) {
257 /*
258 * Fast path for plain ASCII characters
259 * excluding NUL.
260 */
261 *dst = (wchar_t)*s;
262 nb = 1;
263 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, B_FALSE)) ==
264 (size_t)-1) {
265 *src = s;
266 return ((size_t)-1);
267 } else if (nb == (size_t)-2) {
268 *src = s + nms;
269 return (nchr);
270 } else if (nb == 0) {
271 *src = NULL;
272 return (nchr);
273 }
274 s += nb;
275 nms -= nb;
276 nchr++;
277 dst++;
278 }
279 *src = s;
280 return (nchr);
281 }
282
283 static size_t
_UTF8_wcrtomb(char * _RESTRICT_KYWD s,wchar_t wc,mbstate_t * _RESTRICT_KYWD ps)284 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
285 {
286 _UTF8State *us;
287 unsigned char lead;
288 int i, len;
289
290 us = (_UTF8State *)ps;
291
292 if (us->want != 0) {
293 errno = EINVAL;
294 return ((size_t)-1);
295 }
296
297 if (s == NULL)
298 /* Reset to initial shift state (no-op) */
299 return (1);
300
301 /*
302 * Determine the number of octets needed to represent this character.
303 * We always output the shortest sequence possible. Also specify the
304 * first few bits of the first octet, which contains the information
305 * about the sequence length.
306 */
307 if ((wc & ~0x7f) == 0) {
308 /* Fast path for plain ASCII characters. */
309 *s = (char)wc;
310 return (1);
311 } else if ((wc & ~0x7ff) == 0) {
312 lead = 0xc0;
313 len = 2;
314 } else if ((wc & ~0xffff) == 0) {
315 lead = 0xe0;
316 len = 3;
317 } else if ((wc & ~0x1fffff) == 0) {
318 lead = 0xf0;
319 len = 4;
320 #if 0
321 /* Again, 5 and 6 byte encodings are simply not permitted */
322 } else if ((wc & ~0x3ffffff) == 0) {
323 lead = 0xf8;
324 len = 5;
325 } else if ((wc & ~0x7fffffff) == 0) {
326 lead = 0xfc;
327 len = 6;
328 #endif
329 } else {
330 errno = EILSEQ;
331 return ((size_t)-1);
332 }
333
334 /*
335 * Output the octets representing the character in chunks
336 * of 6 bits, least significant last. The first octet is
337 * a special case because it contains the sequence length
338 * information.
339 */
340 for (i = len - 1; i > 0; i--) {
341 s[i] = (wc & 0x3f) | 0x80;
342 wc >>= 6;
343 }
344 *s = (wc & 0xff) | lead;
345
346 return (len);
347 }
348
349 static size_t
_UTF8_wcsnrtombs(char * _RESTRICT_KYWD dst,const wchar_t ** _RESTRICT_KYWD src,size_t nwc,size_t len,mbstate_t * _RESTRICT_KYWD ps)350 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
351 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
352 {
353 _UTF8State *us;
354 char buf[MB_LEN_MAX];
355 const wchar_t *s;
356 size_t nbytes;
357 size_t nb;
358
359 us = (_UTF8State *)ps;
360
361 if (us->want != 0) {
362 errno = EINVAL;
363 return ((size_t)-1);
364 }
365
366 s = *src;
367 nbytes = 0;
368
369 if (dst == NULL) {
370 while (nwc-- > 0) {
371 if (0 <= *s && *s < 0x80)
372 /* Fast path for plain ASCII characters. */
373 nb = 1;
374 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
375 (size_t)-1)
376 /* Invalid character - wcrtomb() sets errno. */
377 return ((size_t)-1);
378 if (*s == L'\0')
379 return (nbytes + nb - 1);
380 s++;
381 nbytes += nb;
382 }
383 return (nbytes);
384 }
385
386 while (len > 0 && nwc-- > 0) {
387 if (0 <= *s && *s < 0x80) {
388 /* Fast path for plain ASCII characters. */
389 nb = 1;
390 *dst = *s;
391 } else if (len > (size_t)MB_CUR_MAX) {
392 /* Enough space to translate in-place. */
393 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
394 *src = s;
395 return ((size_t)-1);
396 }
397 } else {
398 /*
399 * May not be enough space; use temp. buffer.
400 */
401 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
402 *src = s;
403 return ((size_t)-1);
404 }
405 if (nb > (int)len)
406 /* MB sequence for character won't fit. */
407 break;
408 (void) memcpy(dst, buf, nb);
409 }
410 if (*s == L'\0') {
411 *src = NULL;
412 return (nbytes + nb - 1);
413 }
414 s++;
415 dst += nb;
416 len -= nb;
417 nbytes += nb;
418 }
419 *src = s;
420 return (nbytes);
421 }
422