xref: /freebsd/lib/libc/locale/utf8.c (revision b1d046441de9053152c7cf03d6b60d9882687e1b)
1 /*-
2  * Copyright (c) 2002-2004 Tim J. Robbins
3  * All rights reserved.
4  *
5  * Copyright (c) 2011 The FreeBSD Foundation
6  * All rights reserved.
7  * Portions of this software were developed by David Chisnall
8  * under sponsorship from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/param.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <errno.h>
36 #include <limits.h>
37 #include <runetype.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <wchar.h>
41 #include "mblocal.h"
42 
/*
 * Single-byte ctype limit variable.  It is only declared here; no
 * definition and no direct use appears in this file.
 */
extern int __mb_sb_limit;

/*
 * UTF-8 conversion routines.  _UTF8_init() stores pointers to these in
 * the locale's ctype component.
 */
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);

/*
 * Conversion state kept inside the caller's mbstate_t between calls.
 */
typedef struct {
	/* Bits of the character decoded so far. */
	wchar_t	ch;
	/* Octets still needed to finish the character; 0 = initial state. */
	int	want;
	/* Smallest code the pending sequence may legally encode. */
	wchar_t	lbound;
} _UTF8State;
61 
62 int
63 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
64 {
65 
66 	l->__mbrtowc = _UTF8_mbrtowc;
67 	l->__wcrtomb = _UTF8_wcrtomb;
68 	l->__mbsinit = _UTF8_mbsinit;
69 	l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
70 	l->__wcsnrtombs = _UTF8_wcsnrtombs;
71 	l->runes = rl;
72 	l->__mb_cur_max = 6;
73 	/*
74 	 * UCS-4 encoding used as the internal representation, so
75 	 * slots 0x0080-0x00FF are occuped and must be excluded
76 	 * from the single byte ctype by setting the limit.
77 	 */
78 	l->__mb_sb_limit = 128;
79 
80 	return (0);
81 }
82 
83 static int
84 _UTF8_mbsinit(const mbstate_t *ps)
85 {
86 
87 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
88 }
89 
90 static size_t
91 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
92     mbstate_t * __restrict ps)
93 {
94 	_UTF8State *us;
95 	int ch, i, mask, want;
96 	wchar_t lbound, wch;
97 
98 	us = (_UTF8State *)ps;
99 
100 	if (us->want < 0 || us->want > 6) {
101 		errno = EINVAL;
102 		return ((size_t)-1);
103 	}
104 
105 	if (s == NULL) {
106 		s = "";
107 		n = 1;
108 		pwc = NULL;
109 	}
110 
111 	if (n == 0)
112 		/* Incomplete multibyte sequence */
113 		return ((size_t)-2);
114 
115 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
116 		/* Fast path for plain ASCII characters. */
117 		if (pwc != NULL)
118 			*pwc = ch;
119 		return (ch != '\0' ? 1 : 0);
120 	}
121 
122 	if (us->want == 0) {
123 		/*
124 		 * Determine the number of octets that make up this character
125 		 * from the first octet, and a mask that extracts the
126 		 * interesting bits of the first octet. We already know
127 		 * the character is at least two bytes long.
128 		 *
129 		 * We also specify a lower bound for the character code to
130 		 * detect redundant, non-"shortest form" encodings. For
131 		 * example, the sequence C0 80 is _not_ a legal representation
132 		 * of the null character. This enforces a 1-to-1 mapping
133 		 * between character codes and their multibyte representations.
134 		 */
135 		ch = (unsigned char)*s;
136 		if ((ch & 0x80) == 0) {
137 			mask = 0x7f;
138 			want = 1;
139 			lbound = 0;
140 		} else if ((ch & 0xe0) == 0xc0) {
141 			mask = 0x1f;
142 			want = 2;
143 			lbound = 0x80;
144 		} else if ((ch & 0xf0) == 0xe0) {
145 			mask = 0x0f;
146 			want = 3;
147 			lbound = 0x800;
148 		} else if ((ch & 0xf8) == 0xf0) {
149 			mask = 0x07;
150 			want = 4;
151 			lbound = 0x10000;
152 		} else if ((ch & 0xfc) == 0xf8) {
153 			mask = 0x03;
154 			want = 5;
155 			lbound = 0x200000;
156 		} else if ((ch & 0xfe) == 0xfc) {
157 			mask = 0x01;
158 			want = 6;
159 			lbound = 0x4000000;
160 		} else {
161 			/*
162 			 * Malformed input; input is not UTF-8.
163 			 */
164 			errno = EILSEQ;
165 			return ((size_t)-1);
166 		}
167 	} else {
168 		want = us->want;
169 		lbound = us->lbound;
170 	}
171 
172 	/*
173 	 * Decode the octet sequence representing the character in chunks
174 	 * of 6 bits, most significant first.
175 	 */
176 	if (us->want == 0)
177 		wch = (unsigned char)*s++ & mask;
178 	else
179 		wch = us->ch;
180 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
181 		if ((*s & 0xc0) != 0x80) {
182 			/*
183 			 * Malformed input; bad characters in the middle
184 			 * of a character.
185 			 */
186 			errno = EILSEQ;
187 			return ((size_t)-1);
188 		}
189 		wch <<= 6;
190 		wch |= *s++ & 0x3f;
191 	}
192 	if (i < want) {
193 		/* Incomplete multibyte sequence. */
194 		us->want = want - i;
195 		us->lbound = lbound;
196 		us->ch = wch;
197 		return ((size_t)-2);
198 	}
199 	if (wch < lbound) {
200 		/*
201 		 * Malformed input; redundant encoding.
202 		 */
203 		errno = EILSEQ;
204 		return ((size_t)-1);
205 	}
206 	if (pwc != NULL)
207 		*pwc = wch;
208 	us->want = 0;
209 	return (wch == L'\0' ? 0 : want);
210 }
211 
212 static size_t
213 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
214     size_t nms, size_t len, mbstate_t * __restrict ps)
215 {
216 	_UTF8State *us;
217 	const char *s;
218 	size_t nchr;
219 	wchar_t wc;
220 	size_t nb;
221 
222 	us = (_UTF8State *)ps;
223 
224 	s = *src;
225 	nchr = 0;
226 
227 	if (dst == NULL) {
228 		/*
229 		 * The fast path in the loop below is not safe if an ASCII
230 		 * character appears as anything but the first byte of a
231 		 * multibyte sequence. Check now to avoid doing it in the loop.
232 		 */
233 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
234 			errno = EILSEQ;
235 			return ((size_t)-1);
236 		}
237 		for (;;) {
238 			if (nms > 0 && (signed char)*s > 0)
239 				/*
240 				 * Fast path for plain ASCII characters
241 				 * excluding NUL.
242 				 */
243 				nb = 1;
244 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
245 			    (size_t)-1)
246 				/* Invalid sequence - mbrtowc() sets errno. */
247 				return ((size_t)-1);
248 			else if (nb == 0 || nb == (size_t)-2)
249 				return (nchr);
250 			s += nb;
251 			nms -= nb;
252 			nchr++;
253 		}
254 		/*NOTREACHED*/
255 	}
256 
257 	/*
258 	 * The fast path in the loop below is not safe if an ASCII
259 	 * character appears as anything but the first byte of a
260 	 * multibyte sequence. Check now to avoid doing it in the loop.
261 	 */
262 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
263 		errno = EILSEQ;
264 		return ((size_t)-1);
265 	}
266 	while (len-- > 0) {
267 		if (nms > 0 && (signed char)*s > 0) {
268 			/*
269 			 * Fast path for plain ASCII characters
270 			 * excluding NUL.
271 			 */
272 			*dst = (wchar_t)*s;
273 			nb = 1;
274 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
275 		    (size_t)-1) {
276 			*src = s;
277 			return ((size_t)-1);
278 		} else if (nb == (size_t)-2) {
279 			*src = s + nms;
280 			return (nchr);
281 		} else if (nb == 0) {
282 			*src = NULL;
283 			return (nchr);
284 		}
285 		s += nb;
286 		nms -= nb;
287 		nchr++;
288 		dst++;
289 	}
290 	*src = s;
291 	return (nchr);
292 }
293 
294 static size_t
295 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
296 {
297 	_UTF8State *us;
298 	unsigned char lead;
299 	int i, len;
300 
301 	us = (_UTF8State *)ps;
302 
303 	if (us->want != 0) {
304 		errno = EINVAL;
305 		return ((size_t)-1);
306 	}
307 
308 	if (s == NULL)
309 		/* Reset to initial shift state (no-op) */
310 		return (1);
311 
312 	if ((wc & ~0x7f) == 0) {
313 		/* Fast path for plain ASCII characters. */
314 		*s = (char)wc;
315 		return (1);
316 	}
317 
318 	/*
319 	 * Determine the number of octets needed to represent this character.
320 	 * We always output the shortest sequence possible. Also specify the
321 	 * first few bits of the first octet, which contains the information
322 	 * about the sequence length.
323 	 */
324 	if ((wc & ~0x7f) == 0) {
325 		lead = 0;
326 		len = 1;
327 	} else if ((wc & ~0x7ff) == 0) {
328 		lead = 0xc0;
329 		len = 2;
330 	} else if ((wc & ~0xffff) == 0) {
331 		lead = 0xe0;
332 		len = 3;
333 	} else if ((wc & ~0x1fffff) == 0) {
334 		lead = 0xf0;
335 		len = 4;
336 	} else if ((wc & ~0x3ffffff) == 0) {
337 		lead = 0xf8;
338 		len = 5;
339 	} else if ((wc & ~0x7fffffff) == 0) {
340 		lead = 0xfc;
341 		len = 6;
342 	} else {
343 		errno = EILSEQ;
344 		return ((size_t)-1);
345 	}
346 
347 	/*
348 	 * Output the octets representing the character in chunks
349 	 * of 6 bits, least significant last. The first octet is
350 	 * a special case because it contains the sequence length
351 	 * information.
352 	 */
353 	for (i = len - 1; i > 0; i--) {
354 		s[i] = (wc & 0x3f) | 0x80;
355 		wc >>= 6;
356 	}
357 	*s = (wc & 0xff) | lead;
358 
359 	return (len);
360 }
361 
362 static size_t
363 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
364     size_t nwc, size_t len, mbstate_t * __restrict ps)
365 {
366 	_UTF8State *us;
367 	char buf[MB_LEN_MAX];
368 	const wchar_t *s;
369 	size_t nbytes;
370 	size_t nb;
371 
372 	us = (_UTF8State *)ps;
373 
374 	if (us->want != 0) {
375 		errno = EINVAL;
376 		return ((size_t)-1);
377 	}
378 
379 	s = *src;
380 	nbytes = 0;
381 
382 	if (dst == NULL) {
383 		while (nwc-- > 0) {
384 			if (0 <= *s && *s < 0x80)
385 				/* Fast path for plain ASCII characters. */
386 				nb = 1;
387 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
388 			    (size_t)-1)
389 				/* Invalid character - wcrtomb() sets errno. */
390 				return ((size_t)-1);
391 			if (*s == L'\0')
392 				return (nbytes + nb - 1);
393 			s++;
394 			nbytes += nb;
395 		}
396 		return (nbytes);
397 	}
398 
399 	while (len > 0 && nwc-- > 0) {
400 		if (0 <= *s && *s < 0x80) {
401 			/* Fast path for plain ASCII characters. */
402 			nb = 1;
403 			*dst = *s;
404 		} else if (len > (size_t)MB_CUR_MAX) {
405 			/* Enough space to translate in-place. */
406 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
407 				*src = s;
408 				return ((size_t)-1);
409 			}
410 		} else {
411 			/*
412 			 * May not be enough space; use temp. buffer.
413 			 */
414 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
415 				*src = s;
416 				return ((size_t)-1);
417 			}
418 			if (nb > (int)len)
419 				/* MB sequence for character won't fit. */
420 				break;
421 			memcpy(dst, buf, nb);
422 		}
423 		if (*s == L'\0') {
424 			*src = NULL;
425 			return (nbytes + nb - 1);
426 		}
427 		s++;
428 		dst += nb;
429 		len -= nb;
430 		nbytes += nb;
431 	}
432 	*src = s;
433 	return (nbytes);
434 }
435