1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
5 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
6 * Copyright (c) 2002-2004 Tim J. Robbins
7 * All rights reserved.
8 *
9 * Copyright (c) 2011 The FreeBSD Foundation
10 *
11 * Portions of this software were developed by David Chisnall
12 * under sponsorship from the FreeBSD Foundation.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36 #include <sys/param.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <runetype.h>
40 #include <stdlib.h>
41 #include <string.h>
42 #include <wchar.h>
43 #include "mblocal.h"
44
/*
 * Declared with external linkage.  The code visible in this file does not
 * touch this variable directly; _UTF8_init() sets the per-locale
 * l->__mb_sb_limit field instead.
 */
extern int __mb_sb_limit;

/*
 * Forward declarations of the file-local UTF-8 conversion routines.  They
 * are defined further down and are installed into a locale's ctype
 * component by _UTF8_init().
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
		    size_t, size_t, mbstate_t * __restrict);
57
/*
 * Decoder state carried between calls.  The conversion routines in this
 * file obtain it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	wchar_t	ch;	/* bits of the character accumulated so far */
	int	want;	/* continuation bytes still needed; 0 = initial state */
	wchar_t	lbound;	/* lowest code point valid for the sequence length */
} _UTF8State;
63
64 int
_UTF8_init(struct xlocale_ctype * l,_RuneLocale * rl)65 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
66 {
67
68 l->__mbrtowc = _UTF8_mbrtowc;
69 l->__wcrtomb = _UTF8_wcrtomb;
70 l->__mbsinit = _UTF8_mbsinit;
71 l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
72 l->__wcsnrtombs = _UTF8_wcsnrtombs;
73 l->runes = rl;
74 l->__mb_cur_max = 4;
75 /*
76 * UCS-4 encoding used as the internal representation, so
77 * slots 0x0080-0x00FF are occuped and must be excluded
78 * from the single byte ctype by setting the limit.
79 */
80 l->__mb_sb_limit = 128;
81
82 return (0);
83 }
84
85 static int
_UTF8_mbsinit(const mbstate_t * ps)86 _UTF8_mbsinit(const mbstate_t *ps)
87 {
88
89 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
90 }
91
92 static size_t
_UTF8_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)93 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
94 mbstate_t * __restrict ps)
95 {
96 _UTF8State *us;
97 int ch, i, mask, want;
98 wchar_t lbound, wch;
99
100 us = (_UTF8State *)ps;
101
102 if (us->want < 0 || us->want > 6) {
103 errno = EINVAL;
104 return ((size_t)-1);
105 }
106
107 if (s == NULL) {
108 s = "";
109 n = 1;
110 pwc = NULL;
111 }
112
113 if (n == 0)
114 /* Incomplete multibyte sequence */
115 return ((size_t)-2);
116
117 if (us->want == 0) {
118 /*
119 * Determine the number of octets that make up this character
120 * from the first octet, and a mask that extracts the
121 * interesting bits of the first octet. We already know
122 * the character is at least two bytes long.
123 *
124 * We also specify a lower bound for the character code to
125 * detect redundant, non-"shortest form" encodings. For
126 * example, the sequence C0 80 is _not_ a legal representation
127 * of the null character. This enforces a 1-to-1 mapping
128 * between character codes and their multibyte representations.
129 */
130 ch = (unsigned char)*s;
131 if ((ch & 0x80) == 0) {
132 /* Fast path for plain ASCII characters. */
133 if (pwc != NULL)
134 *pwc = ch;
135 return (ch != '\0' ? 1 : 0);
136 }
137 if ((ch & 0xe0) == 0xc0) {
138 mask = 0x1f;
139 want = 2;
140 lbound = 0x80;
141 } else if ((ch & 0xf0) == 0xe0) {
142 mask = 0x0f;
143 want = 3;
144 lbound = 0x800;
145 } else if ((ch & 0xf8) == 0xf0) {
146 mask = 0x07;
147 want = 4;
148 lbound = 0x10000;
149 } else {
150 /*
151 * Malformed input; input is not UTF-8.
152 */
153 errno = EILSEQ;
154 return ((size_t)-1);
155 }
156 } else {
157 want = us->want;
158 lbound = us->lbound;
159 }
160
161 /*
162 * Decode the octet sequence representing the character in chunks
163 * of 6 bits, most significant first.
164 */
165 if (us->want == 0)
166 wch = (unsigned char)*s++ & mask;
167 else
168 wch = us->ch;
169
170 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
171 if ((*s & 0xc0) != 0x80) {
172 /*
173 * Malformed input; bad characters in the middle
174 * of a character.
175 */
176 errno = EILSEQ;
177 return ((size_t)-1);
178 }
179 wch <<= 6;
180 wch |= *s++ & 0x3f;
181 }
182 if (i < want) {
183 /* Incomplete multibyte sequence. */
184 us->want = want - i;
185 us->lbound = lbound;
186 us->ch = wch;
187 return ((size_t)-2);
188 }
189 if (wch < lbound) {
190 /*
191 * Malformed input; redundant encoding.
192 */
193 errno = EILSEQ;
194 return ((size_t)-1);
195 }
196 if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) {
197 /*
198 * Malformed input; invalid code points.
199 */
200 errno = EILSEQ;
201 return ((size_t)-1);
202 }
203 if (pwc != NULL)
204 *pwc = wch;
205 us->want = 0;
206 return (wch == L'\0' ? 0 : want);
207 }
208
209 static size_t
_UTF8_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nms,size_t len,mbstate_t * __restrict ps)210 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
211 size_t nms, size_t len, mbstate_t * __restrict ps)
212 {
213 _UTF8State *us;
214 const char *s;
215 size_t nchr;
216 wchar_t wc;
217 size_t nb;
218
219 us = (_UTF8State *)ps;
220
221 s = *src;
222 nchr = 0;
223
224 if (dst == NULL) {
225 /*
226 * The fast path in the loop below is not safe if an ASCII
227 * character appears as anything but the first byte of a
228 * multibyte sequence. Check now to avoid doing it in the loop.
229 */
230 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
231 errno = EILSEQ;
232 return ((size_t)-1);
233 }
234 for (;;) {
235 if (nms > 0 && (signed char)*s > 0)
236 /*
237 * Fast path for plain ASCII characters
238 * excluding NUL.
239 */
240 nb = 1;
241 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
242 (size_t)-1)
243 /* Invalid sequence - mbrtowc() sets errno. */
244 return ((size_t)-1);
245 else if (nb == 0 || nb == (size_t)-2)
246 return (nchr);
247 s += nb;
248 nms -= nb;
249 nchr++;
250 }
251 /*NOTREACHED*/
252 }
253
254 /*
255 * The fast path in the loop below is not safe if an ASCII
256 * character appears as anything but the first byte of a
257 * multibyte sequence. Check now to avoid doing it in the loop.
258 */
259 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
260 errno = EILSEQ;
261 return ((size_t)-1);
262 }
263 while (len-- > 0) {
264 if (nms > 0 && (signed char)*s > 0) {
265 /*
266 * Fast path for plain ASCII characters
267 * excluding NUL.
268 */
269 *dst = (wchar_t)*s;
270 nb = 1;
271 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
272 (size_t)-1) {
273 *src = s;
274 return ((size_t)-1);
275 } else if (nb == (size_t)-2) {
276 *src = s + nms;
277 return (nchr);
278 } else if (nb == 0) {
279 *src = NULL;
280 return (nchr);
281 }
282 s += nb;
283 nms -= nb;
284 nchr++;
285 dst++;
286 }
287 *src = s;
288 return (nchr);
289 }
290
291 static size_t
_UTF8_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)292 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
293 {
294 _UTF8State *us;
295 unsigned char lead;
296 int i, len;
297
298 us = (_UTF8State *)ps;
299
300 if (us->want != 0) {
301 errno = EINVAL;
302 return ((size_t)-1);
303 }
304
305 if (s == NULL)
306 /* Reset to initial shift state (no-op) */
307 return (1);
308
309 /*
310 * Determine the number of octets needed to represent this character.
311 * We always output the shortest sequence possible. Also specify the
312 * first few bits of the first octet, which contains the information
313 * about the sequence length.
314 */
315 if ((wc & ~0x7f) == 0) {
316 /* Fast path for plain ASCII characters. */
317 *s = (char)wc;
318 return (1);
319 } else if ((wc & ~0x7ff) == 0) {
320 lead = 0xc0;
321 len = 2;
322 } else if ((wc & ~0xffff) == 0) {
323 if (wc >= 0xd800 && wc <= 0xdfff) {
324 errno = EILSEQ;
325 return ((size_t)-1);
326 }
327 lead = 0xe0;
328 len = 3;
329 } else if (wc >= 0 && wc <= 0x10ffff) {
330 lead = 0xf0;
331 len = 4;
332 } else {
333 errno = EILSEQ;
334 return ((size_t)-1);
335 }
336
337 /*
338 * Output the octets representing the character in chunks
339 * of 6 bits, least significant last. The first octet is
340 * a special case because it contains the sequence length
341 * information.
342 */
343 for (i = len - 1; i > 0; i--) {
344 s[i] = (wc & 0x3f) | 0x80;
345 wc >>= 6;
346 }
347 *s = (wc & 0xff) | lead;
348
349 return (len);
350 }
351
352 static size_t
_UTF8_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)353 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
354 size_t nwc, size_t len, mbstate_t * __restrict ps)
355 {
356 _UTF8State *us;
357 char buf[MB_LEN_MAX];
358 const wchar_t *s;
359 size_t nbytes;
360 size_t nb;
361
362 us = (_UTF8State *)ps;
363
364 if (us->want != 0) {
365 errno = EINVAL;
366 return ((size_t)-1);
367 }
368
369 s = *src;
370 nbytes = 0;
371
372 if (dst == NULL) {
373 while (nwc-- > 0) {
374 if (0 <= *s && *s < 0x80)
375 /* Fast path for plain ASCII characters. */
376 nb = 1;
377 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
378 (size_t)-1)
379 /* Invalid character - wcrtomb() sets errno. */
380 return ((size_t)-1);
381 if (*s == L'\0')
382 return (nbytes + nb - 1);
383 s++;
384 nbytes += nb;
385 }
386 return (nbytes);
387 }
388
389 while (len > 0 && nwc-- > 0) {
390 if (0 <= *s && *s < 0x80) {
391 /* Fast path for plain ASCII characters. */
392 nb = 1;
393 *dst = *s;
394 } else if (len > (size_t)MB_CUR_MAX) {
395 /* Enough space to translate in-place. */
396 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
397 *src = s;
398 return ((size_t)-1);
399 }
400 } else {
401 /*
402 * May not be enough space; use temp. buffer.
403 */
404 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
405 *src = s;
406 return ((size_t)-1);
407 }
408 if (nb > (int)len)
409 /* MB sequence for character won't fit. */
410 break;
411 memcpy(dst, buf, nb);
412 }
413 if (*s == L'\0') {
414 *src = NULL;
415 return (nbytes + nb - 1);
416 }
417 s++;
418 dst += nb;
419 len -= nb;
420 nbytes += nb;
421 }
422 *src = s;
423 return (nbytes);
424 }
425