xref: /freebsd/lib/libc/locale/utf8.c (revision 0572ccaa4543b0abef8ef81e384c1d04de9f3da1)
1 /*-
2  * Copyright (c) 2002-2004 Tim J. Robbins
3  * All rights reserved.
4  *
5  * Copyright (c) 2011 The FreeBSD Foundation
6  * All rights reserved.
7  * Portions of this software were developed by David Chisnall
8  * under sponsorship from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/param.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <errno.h>
36 #include <limits.h>
37 #include <runetype.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42 
extern int __mb_sb_limit;

/*
 * Forward declarations of the UTF-8 conversion methods defined later in
 * this file.  They are listed in the order in which _UTF8_init() installs
 * them into the locale's method table.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static size_t	_UTF8_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
		    size_t, size_t, mbstate_t * __restrict);
55 
/*
 * Conversion state for a partially decoded UTF-8 sequence.  The functions
 * in this file access it by casting the caller's mbstate_t pointer, so the
 * member order and types must not change.
 */
typedef struct {
	wchar_t	ch;	/* Code point bits accumulated so far. */
	int	want;	/* Octets still required; 0 means initial state. */
	wchar_t	lbound;	/* Lowest code point this sequence may encode. */
} _UTF8State;
61 
62 int
63 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
64 {
65 
66 	l->__mbrtowc = _UTF8_mbrtowc;
67 	l->__wcrtomb = _UTF8_wcrtomb;
68 	l->__mbsinit = _UTF8_mbsinit;
69 	l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
70 	l->__wcsnrtombs = _UTF8_wcsnrtombs;
71 	l->runes = rl;
72 	l->__mb_cur_max = 6;
73 	/*
74 	 * UCS-4 encoding used as the internal representation, so
75 	 * slots 0x0080-0x00FF are occuped and must be excluded
76 	 * from the single byte ctype by setting the limit.
77 	 */
78 	l->__mb_sb_limit = 128;
79 
80 	return (0);
81 }
82 
83 static int
84 _UTF8_mbsinit(const mbstate_t *ps)
85 {
86 
87 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
88 }
89 
90 static size_t
91 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
92     mbstate_t * __restrict ps)
93 {
94 	_UTF8State *us;
95 	int ch, i, mask, want;
96 	wchar_t lbound, wch;
97 
98 	us = (_UTF8State *)ps;
99 
100 	if (us->want < 0 || us->want > 6) {
101 		errno = EINVAL;
102 		return ((size_t)-1);
103 	}
104 
105 	if (s == NULL) {
106 		s = "";
107 		n = 1;
108 		pwc = NULL;
109 	}
110 
111 	if (n == 0)
112 		/* Incomplete multibyte sequence */
113 		return ((size_t)-2);
114 
115 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
116 		/* Fast path for plain ASCII characters. */
117 		if (pwc != NULL)
118 			*pwc = ch;
119 		return (ch != '\0' ? 1 : 0);
120 	}
121 
122 	if (us->want == 0) {
123 		/*
124 		 * Determine the number of octets that make up this character
125 		 * from the first octet, and a mask that extracts the
126 		 * interesting bits of the first octet. We already know
127 		 * the character is at least two bytes long.
128 		 *
129 		 * We also specify a lower bound for the character code to
130 		 * detect redundant, non-"shortest form" encodings. For
131 		 * example, the sequence C0 80 is _not_ a legal representation
132 		 * of the null character. This enforces a 1-to-1 mapping
133 		 * between character codes and their multibyte representations.
134 		 */
135 		ch = (unsigned char)*s;
136 		if ((ch & 0x80) == 0) {
137 			mask = 0x7f;
138 			want = 1;
139 			lbound = 0;
140 		} else if ((ch & 0xe0) == 0xc0) {
141 			mask = 0x1f;
142 			want = 2;
143 			lbound = 0x80;
144 		} else if ((ch & 0xf0) == 0xe0) {
145 			mask = 0x0f;
146 			want = 3;
147 			lbound = 0x800;
148 		} else if ((ch & 0xf8) == 0xf0) {
149 			mask = 0x07;
150 			want = 4;
151 			lbound = 0x10000;
152 		} else if ((ch & 0xfc) == 0xf8) {
153 			mask = 0x03;
154 			want = 5;
155 			lbound = 0x200000;
156 		} else if ((ch & 0xfe) == 0xfc) {
157 			mask = 0x01;
158 			want = 6;
159 			lbound = 0x4000000;
160 		} else {
161 			/*
162 			 * Malformed input; input is not UTF-8.
163 			 */
164 			errno = EILSEQ;
165 			return ((size_t)-1);
166 		}
167 	} else {
168 		want = us->want;
169 		lbound = us->lbound;
170 	}
171 
172 	/*
173 	 * Decode the octet sequence representing the character in chunks
174 	 * of 6 bits, most significant first.
175 	 */
176 	if (us->want == 0)
177 		wch = (unsigned char)*s++ & mask;
178 	else
179 		wch = us->ch;
180 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
181 		if ((*s & 0xc0) != 0x80) {
182 			/*
183 			 * Malformed input; bad characters in the middle
184 			 * of a character.
185 			 */
186 			errno = EILSEQ;
187 			return ((size_t)-1);
188 		}
189 		wch <<= 6;
190 		wch |= *s++ & 0x3f;
191 	}
192 	if (i < want) {
193 		/* Incomplete multibyte sequence. */
194 		us->want = want - i;
195 		us->lbound = lbound;
196 		us->ch = wch;
197 		return ((size_t)-2);
198 	}
199 	if (wch < lbound) {
200 		/*
201 		 * Malformed input; redundant encoding.
202 		 */
203 		errno = EILSEQ;
204 		return ((size_t)-1);
205 	}
206 	if (wch >= 0xd800 && wch <= 0xdfff) {
207 		/*
208 		 * Malformed input; invalid code points.
209 		 */
210 		errno = EILSEQ;
211 		return ((size_t)-1);
212 	}
213 	if (pwc != NULL)
214 		*pwc = wch;
215 	us->want = 0;
216 	return (wch == L'\0' ? 0 : want);
217 }
218 
219 static size_t
220 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
221     size_t nms, size_t len, mbstate_t * __restrict ps)
222 {
223 	_UTF8State *us;
224 	const char *s;
225 	size_t nchr;
226 	wchar_t wc;
227 	size_t nb;
228 
229 	us = (_UTF8State *)ps;
230 
231 	s = *src;
232 	nchr = 0;
233 
234 	if (dst == NULL) {
235 		/*
236 		 * The fast path in the loop below is not safe if an ASCII
237 		 * character appears as anything but the first byte of a
238 		 * multibyte sequence. Check now to avoid doing it in the loop.
239 		 */
240 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
241 			errno = EILSEQ;
242 			return ((size_t)-1);
243 		}
244 		for (;;) {
245 			if (nms > 0 && (signed char)*s > 0)
246 				/*
247 				 * Fast path for plain ASCII characters
248 				 * excluding NUL.
249 				 */
250 				nb = 1;
251 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
252 			    (size_t)-1)
253 				/* Invalid sequence - mbrtowc() sets errno. */
254 				return ((size_t)-1);
255 			else if (nb == 0 || nb == (size_t)-2)
256 				return (nchr);
257 			s += nb;
258 			nms -= nb;
259 			nchr++;
260 		}
261 		/*NOTREACHED*/
262 	}
263 
264 	/*
265 	 * The fast path in the loop below is not safe if an ASCII
266 	 * character appears as anything but the first byte of a
267 	 * multibyte sequence. Check now to avoid doing it in the loop.
268 	 */
269 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
270 		errno = EILSEQ;
271 		return ((size_t)-1);
272 	}
273 	while (len-- > 0) {
274 		if (nms > 0 && (signed char)*s > 0) {
275 			/*
276 			 * Fast path for plain ASCII characters
277 			 * excluding NUL.
278 			 */
279 			*dst = (wchar_t)*s;
280 			nb = 1;
281 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
282 		    (size_t)-1) {
283 			*src = s;
284 			return ((size_t)-1);
285 		} else if (nb == (size_t)-2) {
286 			*src = s + nms;
287 			return (nchr);
288 		} else if (nb == 0) {
289 			*src = NULL;
290 			return (nchr);
291 		}
292 		s += nb;
293 		nms -= nb;
294 		nchr++;
295 		dst++;
296 	}
297 	*src = s;
298 	return (nchr);
299 }
300 
301 static size_t
302 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
303 {
304 	_UTF8State *us;
305 	unsigned char lead;
306 	int i, len;
307 
308 	us = (_UTF8State *)ps;
309 
310 	if (us->want != 0) {
311 		errno = EINVAL;
312 		return ((size_t)-1);
313 	}
314 
315 	if (s == NULL)
316 		/* Reset to initial shift state (no-op) */
317 		return (1);
318 
319 	if ((wc & ~0x7f) == 0) {
320 		/* Fast path for plain ASCII characters. */
321 		*s = (char)wc;
322 		return (1);
323 	}
324 
325 	/*
326 	 * Determine the number of octets needed to represent this character.
327 	 * We always output the shortest sequence possible. Also specify the
328 	 * first few bits of the first octet, which contains the information
329 	 * about the sequence length.
330 	 */
331 	if ((wc & ~0x7f) == 0) {
332 		lead = 0;
333 		len = 1;
334 	} else if ((wc & ~0x7ff) == 0) {
335 		lead = 0xc0;
336 		len = 2;
337 	} else if ((wc & ~0xffff) == 0) {
338 		lead = 0xe0;
339 		len = 3;
340 	} else if ((wc & ~0x1fffff) == 0) {
341 		lead = 0xf0;
342 		len = 4;
343 	} else if ((wc & ~0x3ffffff) == 0) {
344 		lead = 0xf8;
345 		len = 5;
346 	} else if ((wc & ~0x7fffffff) == 0) {
347 		lead = 0xfc;
348 		len = 6;
349 	} else {
350 		errno = EILSEQ;
351 		return ((size_t)-1);
352 	}
353 
354 	/*
355 	 * Output the octets representing the character in chunks
356 	 * of 6 bits, least significant last. The first octet is
357 	 * a special case because it contains the sequence length
358 	 * information.
359 	 */
360 	for (i = len - 1; i > 0; i--) {
361 		s[i] = (wc & 0x3f) | 0x80;
362 		wc >>= 6;
363 	}
364 	*s = (wc & 0xff) | lead;
365 
366 	return (len);
367 }
368 
369 static size_t
370 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
371     size_t nwc, size_t len, mbstate_t * __restrict ps)
372 {
373 	_UTF8State *us;
374 	char buf[MB_LEN_MAX];
375 	const wchar_t *s;
376 	size_t nbytes;
377 	size_t nb;
378 
379 	us = (_UTF8State *)ps;
380 
381 	if (us->want != 0) {
382 		errno = EINVAL;
383 		return ((size_t)-1);
384 	}
385 
386 	s = *src;
387 	nbytes = 0;
388 
389 	if (dst == NULL) {
390 		while (nwc-- > 0) {
391 			if (0 <= *s && *s < 0x80)
392 				/* Fast path for plain ASCII characters. */
393 				nb = 1;
394 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
395 			    (size_t)-1)
396 				/* Invalid character - wcrtomb() sets errno. */
397 				return ((size_t)-1);
398 			if (*s == L'\0')
399 				return (nbytes + nb - 1);
400 			s++;
401 			nbytes += nb;
402 		}
403 		return (nbytes);
404 	}
405 
406 	while (len > 0 && nwc-- > 0) {
407 		if (0 <= *s && *s < 0x80) {
408 			/* Fast path for plain ASCII characters. */
409 			nb = 1;
410 			*dst = *s;
411 		} else if (len > (size_t)MB_CUR_MAX) {
412 			/* Enough space to translate in-place. */
413 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
414 				*src = s;
415 				return ((size_t)-1);
416 			}
417 		} else {
418 			/*
419 			 * May not be enough space; use temp. buffer.
420 			 */
421 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
422 				*src = s;
423 				return ((size_t)-1);
424 			}
425 			if (nb > (int)len)
426 				/* MB sequence for character won't fit. */
427 				break;
428 			memcpy(dst, buf, nb);
429 		}
430 		if (*s == L'\0') {
431 			*src = NULL;
432 			return (nbytes + nb - 1);
433 		}
434 		s++;
435 		dst += nb;
436 		len -= nb;
437 		nbytes += nb;
438 	}
439 	*src = s;
440 	return (nbytes);
441 }
442