1 /*
2 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
4 * Copyright (c) 2002-2004 Tim J. Robbins
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include "lint.h"
30 #include <errno.h>
31 #include <limits.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "mblocal.h"
36 #include "lctype.h"
37
38 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
39 const char *_RESTRICT_KYWD,
40 size_t, mbstate_t *_RESTRICT_KYWD);
41 static int _UTF8_mbsinit(const mbstate_t *);
42 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
43 const char **_RESTRICT_KYWD, size_t, size_t,
44 mbstate_t *_RESTRICT_KYWD);
45 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
46 mbstate_t *_RESTRICT_KYWD);
47 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
48 const wchar_t **_RESTRICT_KYWD,
49 size_t, size_t, mbstate_t *_RESTRICT_KYWD);
50
/*
 * Decoder state carried between calls.  The conversion routines in this
 * file access it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	/* Code point bits gathered so far from a partially read character. */
	wchar_t	ch;
	/* Continuation octets still expected; 0 means the initial state. */
	int	want;
	/* Smallest code point the finished character is allowed to have. */
	wchar_t	lbound;
} _UTF8State;
56
57 void
_UTF8_init(struct lc_ctype * lct)58 _UTF8_init(struct lc_ctype *lct)
59 {
60 lct->lc_mbrtowc = _UTF8_mbrtowc;
61 lct->lc_wcrtomb = _UTF8_wcrtomb;
62 lct->lc_mbsinit = _UTF8_mbsinit;
63 lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
64 lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
65 lct->lc_is_ascii = 0;
66 lct->lc_max_mblen = 4;
67 }
68
69 static int
_UTF8_mbsinit(const mbstate_t * ps)70 _UTF8_mbsinit(const mbstate_t *ps)
71 {
72
73 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
74 }
75
76 static size_t
_UTF8_mbrtowc(wchar_t * _RESTRICT_KYWD pwc,const char * _RESTRICT_KYWD s,size_t n,mbstate_t * _RESTRICT_KYWD ps)77 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
78 size_t n, mbstate_t *_RESTRICT_KYWD ps)
79 {
80 _UTF8State *us;
81 int ch, i, mask, want;
82 wchar_t lbound, wch;
83
84 us = (_UTF8State *)ps;
85
86 if (us->want < 0 || us->want > 6) {
87 errno = EINVAL;
88 return ((size_t)-1);
89 }
90
91 if (s == NULL) {
92 s = "";
93 n = 1;
94 pwc = NULL;
95 }
96
97 if (n == 0)
98 /* Incomplete multibyte sequence */
99 return ((size_t)-2);
100
101 if (us->want == 0) {
102 /*
103 * Determine the number of octets that make up this character
104 * from the first octet, and a mask that extracts the
105 * interesting bits of the first octet. We already know
106 * the character is at least two bytes long.
107 *
108 * We also specify a lower bound for the character code to
109 * detect redundant, non-"shortest form" encodings. For
110 * example, the sequence C0 80 is _not_ a legal representation
111 * of the null character. This enforces a 1-to-1 mapping
112 * between character codes and their multibyte representations.
113 */
114 ch = (unsigned char)*s;
115 if ((ch & 0x80) == 0) {
116 /* Fast path for plain ASCII characters. */
117 if (pwc != NULL)
118 *pwc = ch;
119 return (ch != '\0' ? 1 : 0);
120 }
121 if ((ch & 0xe0) == 0xc0) {
122 mask = 0x1f;
123 want = 2;
124 lbound = 0x80;
125 } else if ((ch & 0xf0) == 0xe0) {
126 mask = 0x0f;
127 want = 3;
128 lbound = 0x800;
129 } else if ((ch & 0xf8) == 0xf0) {
130 mask = 0x07;
131 want = 4;
132 lbound = 0x10000;
133 #if 0
134 /* These would be illegal in the UTF-8 space */
135
136 } else if ((ch & 0xfc) == 0xf8) {
137 mask = 0x03;
138 want = 5;
139 lbound = 0x200000;
140 } else if ((ch & 0xfe) == 0xfc) {
141 mask = 0x01;
142 want = 6;
143 lbound = 0x4000000;
144 #endif
145 } else {
146 /*
147 * Malformed input; input is not UTF-8.
148 */
149 errno = EILSEQ;
150 return ((size_t)-1);
151 }
152 } else {
153 want = us->want;
154 lbound = us->lbound;
155 }
156
157 /*
158 * Decode the octet sequence representing the character in chunks
159 * of 6 bits, most significant first.
160 */
161 if (us->want == 0)
162 wch = (unsigned char)*s++ & mask;
163 else
164 wch = us->ch;
165
166 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
167 if ((*s & 0xc0) != 0x80) {
168 /*
169 * Malformed input; bad characters in the middle
170 * of a character.
171 */
172 errno = EILSEQ;
173 return ((size_t)-1);
174 }
175 wch <<= 6;
176 wch |= *s++ & 0x3f;
177 }
178 if (i < want) {
179 /* Incomplete multibyte sequence. */
180 us->want = want - i;
181 us->lbound = lbound;
182 us->ch = wch;
183 return ((size_t)-2);
184 }
185 if (wch < lbound) {
186 /*
187 * Malformed input; redundant encoding.
188 */
189 errno = EILSEQ;
190 return ((size_t)-1);
191 }
192 if (pwc != NULL)
193 *pwc = wch;
194 us->want = 0;
195 return (wch == L'\0' ? 0 : want);
196 }
197
198 static size_t
_UTF8_mbsnrtowcs(wchar_t * _RESTRICT_KYWD dst,const char ** _RESTRICT_KYWD src,size_t nms,size_t len,mbstate_t * _RESTRICT_KYWD ps)199 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
200 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
201 {
202 _UTF8State *us;
203 const char *s;
204 size_t nchr;
205 wchar_t wc;
206 size_t nb;
207
208 us = (_UTF8State *)ps;
209
210 s = *src;
211 nchr = 0;
212
213 if (dst == NULL) {
214 /*
215 * The fast path in the loop below is not safe if an ASCII
216 * character appears as anything but the first byte of a
217 * multibyte sequence. Check now to avoid doing it in the loop.
218 */
219 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
220 errno = EILSEQ;
221 return ((size_t)-1);
222 }
223 for (;;) {
224 if (nms > 0 && (signed char)*s > 0)
225 /*
226 * Fast path for plain ASCII characters
227 * excluding NUL.
228 */
229 nb = 1;
230 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
231 (size_t)-1)
232 /* Invalid sequence - mbrtowc() sets errno. */
233 return ((size_t)-1);
234 else if (nb == 0 || nb == (size_t)-2)
235 return (nchr);
236 s += nb;
237 nms -= nb;
238 nchr++;
239 }
240 /*NOTREACHED*/
241 }
242
243 /*
244 * The fast path in the loop below is not safe if an ASCII
245 * character appears as anything but the first byte of a
246 * multibyte sequence. Check now to avoid doing it in the loop.
247 */
248 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
249 errno = EILSEQ;
250 return ((size_t)-1);
251 }
252 while (len-- > 0) {
253 if (nms > 0 && (signed char)*s > 0) {
254 /*
255 * Fast path for plain ASCII characters
256 * excluding NUL.
257 */
258 *dst = (wchar_t)*s;
259 nb = 1;
260 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
261 (size_t)-1) {
262 *src = s;
263 return ((size_t)-1);
264 } else if (nb == (size_t)-2) {
265 *src = s + nms;
266 return (nchr);
267 } else if (nb == 0) {
268 *src = NULL;
269 return (nchr);
270 }
271 s += nb;
272 nms -= nb;
273 nchr++;
274 dst++;
275 }
276 *src = s;
277 return (nchr);
278 }
279
280 static size_t
_UTF8_wcrtomb(char * _RESTRICT_KYWD s,wchar_t wc,mbstate_t * _RESTRICT_KYWD ps)281 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
282 {
283 _UTF8State *us;
284 unsigned char lead;
285 int i, len;
286
287 us = (_UTF8State *)ps;
288
289 if (us->want != 0) {
290 errno = EINVAL;
291 return ((size_t)-1);
292 }
293
294 if (s == NULL)
295 /* Reset to initial shift state (no-op) */
296 return (1);
297
298 /*
299 * Determine the number of octets needed to represent this character.
300 * We always output the shortest sequence possible. Also specify the
301 * first few bits of the first octet, which contains the information
302 * about the sequence length.
303 */
304 if ((wc & ~0x7f) == 0) {
305 /* Fast path for plain ASCII characters. */
306 *s = (char)wc;
307 return (1);
308 } else if ((wc & ~0x7ff) == 0) {
309 lead = 0xc0;
310 len = 2;
311 } else if ((wc & ~0xffff) == 0) {
312 lead = 0xe0;
313 len = 3;
314 } else if ((wc & ~0x1fffff) == 0) {
315 lead = 0xf0;
316 len = 4;
317 #if 0
318 /* Again, 5 and 6 byte encodings are simply not permitted */
319 } else if ((wc & ~0x3ffffff) == 0) {
320 lead = 0xf8;
321 len = 5;
322 } else if ((wc & ~0x7fffffff) == 0) {
323 lead = 0xfc;
324 len = 6;
325 #endif
326 } else {
327 errno = EILSEQ;
328 return ((size_t)-1);
329 }
330
331 /*
332 * Output the octets representing the character in chunks
333 * of 6 bits, least significant last. The first octet is
334 * a special case because it contains the sequence length
335 * information.
336 */
337 for (i = len - 1; i > 0; i--) {
338 s[i] = (wc & 0x3f) | 0x80;
339 wc >>= 6;
340 }
341 *s = (wc & 0xff) | lead;
342
343 return (len);
344 }
345
346 static size_t
_UTF8_wcsnrtombs(char * _RESTRICT_KYWD dst,const wchar_t ** _RESTRICT_KYWD src,size_t nwc,size_t len,mbstate_t * _RESTRICT_KYWD ps)347 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
348 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
349 {
350 _UTF8State *us;
351 char buf[MB_LEN_MAX];
352 const wchar_t *s;
353 size_t nbytes;
354 size_t nb;
355
356 us = (_UTF8State *)ps;
357
358 if (us->want != 0) {
359 errno = EINVAL;
360 return ((size_t)-1);
361 }
362
363 s = *src;
364 nbytes = 0;
365
366 if (dst == NULL) {
367 while (nwc-- > 0) {
368 if (0 <= *s && *s < 0x80)
369 /* Fast path for plain ASCII characters. */
370 nb = 1;
371 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
372 (size_t)-1)
373 /* Invalid character - wcrtomb() sets errno. */
374 return ((size_t)-1);
375 if (*s == L'\0')
376 return (nbytes + nb - 1);
377 s++;
378 nbytes += nb;
379 }
380 return (nbytes);
381 }
382
383 while (len > 0 && nwc-- > 0) {
384 if (0 <= *s && *s < 0x80) {
385 /* Fast path for plain ASCII characters. */
386 nb = 1;
387 *dst = *s;
388 } else if (len > (size_t)MB_CUR_MAX) {
389 /* Enough space to translate in-place. */
390 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
391 *src = s;
392 return ((size_t)-1);
393 }
394 } else {
395 /*
396 * May not be enough space; use temp. buffer.
397 */
398 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
399 *src = s;
400 return ((size_t)-1);
401 }
402 if (nb > (int)len)
403 /* MB sequence for character won't fit. */
404 break;
405 (void) memcpy(dst, buf, nb);
406 }
407 if (*s == L'\0') {
408 *src = NULL;
409 return (nbytes + nb - 1);
410 }
411 s++;
412 dst += nb;
413 len -= nb;
414 nbytes += nb;
415 }
416 *src = s;
417 return (nbytes);
418 }
419