xref: /freebsd/lib/libc/locale/utf8.c (revision 94942af266ac119ede0ca836f9aa5a5ac0582938)
1 /*-
2  * Copyright (c) 2002-2004 Tim J. Robbins
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/param.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <errno.h>
31 #include <limits.h>
32 #include <runetype.h>
33 #include <stdlib.h>
34 #include <string.h>
35 #include <wchar.h>
36 #include "mblocal.h"
37 
/*
 * Forward declarations of the UTF-8 conversion routines defined in this
 * file; _UTF8_init() stores their addresses in the locale's hooks.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict,
		    const char * __restrict, size_t,
		    mbstate_t * __restrict);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcsnrtombs(char * __restrict,
		    const wchar_t ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
48 
/*
 * Conversion state kept between calls; the routines in this file access
 * it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	wchar_t	ch;		/* bits accumulated from a partial sequence */
	int	want;		/* octets still missing; 0 = initial state */
	wchar_t	lbound;		/* smallest value valid for this length */
} _UTF8State;
54 
55 int
56 _UTF8_init(_RuneLocale *rl)
57 {
58 
59 	__mbrtowc = _UTF8_mbrtowc;
60 	__wcrtomb = _UTF8_wcrtomb;
61 	__mbsinit = _UTF8_mbsinit;
62 	__mbsnrtowcs = _UTF8_mbsnrtowcs;
63 	__wcsnrtombs = _UTF8_wcsnrtombs;
64 	_CurrentRuneLocale = rl;
65 	__mb_cur_max = 6;
66 
67 	return (0);
68 }
69 
70 static int
71 _UTF8_mbsinit(const mbstate_t *ps)
72 {
73 
74 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
75 }
76 
77 static size_t
78 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
79     mbstate_t * __restrict ps)
80 {
81 	_UTF8State *us;
82 	int ch, i, mask, want;
83 	wchar_t lbound, wch;
84 
85 	us = (_UTF8State *)ps;
86 
87 	if (us->want < 0 || us->want > 6) {
88 		errno = EINVAL;
89 		return ((size_t)-1);
90 	}
91 
92 	if (s == NULL) {
93 		s = "";
94 		n = 1;
95 		pwc = NULL;
96 	}
97 
98 	if (n == 0)
99 		/* Incomplete multibyte sequence */
100 		return ((size_t)-2);
101 
102 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
103 		/* Fast path for plain ASCII characters. */
104 		if (pwc != NULL)
105 			*pwc = ch;
106 		return (ch != '\0' ? 1 : 0);
107 	}
108 
109 	if (us->want == 0) {
110 		/*
111 		 * Determine the number of octets that make up this character
112 		 * from the first octet, and a mask that extracts the
113 		 * interesting bits of the first octet. We already know
114 		 * the character is at least two bytes long.
115 		 *
116 		 * We also specify a lower bound for the character code to
117 		 * detect redundant, non-"shortest form" encodings. For
118 		 * example, the sequence C0 80 is _not_ a legal representation
119 		 * of the null character. This enforces a 1-to-1 mapping
120 		 * between character codes and their multibyte representations.
121 		 */
122 		ch = (unsigned char)*s;
123 		if ((ch & 0x80) == 0) {
124 			mask = 0x7f;
125 			want = 1;
126 			lbound = 0;
127 		} else if ((ch & 0xe0) == 0xc0) {
128 			mask = 0x1f;
129 			want = 2;
130 			lbound = 0x80;
131 		} else if ((ch & 0xf0) == 0xe0) {
132 			mask = 0x0f;
133 			want = 3;
134 			lbound = 0x800;
135 		} else if ((ch & 0xf8) == 0xf0) {
136 			mask = 0x07;
137 			want = 4;
138 			lbound = 0x10000;
139 		} else if ((ch & 0xfc) == 0xf8) {
140 			mask = 0x03;
141 			want = 5;
142 			lbound = 0x200000;
143 		} else if ((ch & 0xfe) == 0xfc) {
144 			mask = 0x01;
145 			want = 6;
146 			lbound = 0x4000000;
147 		} else {
148 			/*
149 			 * Malformed input; input is not UTF-8.
150 			 */
151 			errno = EILSEQ;
152 			return ((size_t)-1);
153 		}
154 	} else {
155 		want = us->want;
156 		lbound = us->lbound;
157 	}
158 
159 	/*
160 	 * Decode the octet sequence representing the character in chunks
161 	 * of 6 bits, most significant first.
162 	 */
163 	if (us->want == 0)
164 		wch = (unsigned char)*s++ & mask;
165 	else
166 		wch = us->ch;
167 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
168 		if ((*s & 0xc0) != 0x80) {
169 			/*
170 			 * Malformed input; bad characters in the middle
171 			 * of a character.
172 			 */
173 			errno = EILSEQ;
174 			return ((size_t)-1);
175 		}
176 		wch <<= 6;
177 		wch |= *s++ & 0x3f;
178 	}
179 	if (i < want) {
180 		/* Incomplete multibyte sequence. */
181 		us->want = want - i;
182 		us->lbound = lbound;
183 		us->ch = wch;
184 		return ((size_t)-2);
185 	}
186 	if (wch < lbound) {
187 		/*
188 		 * Malformed input; redundant encoding.
189 		 */
190 		errno = EILSEQ;
191 		return ((size_t)-1);
192 	}
193 	if (pwc != NULL)
194 		*pwc = wch;
195 	us->want = 0;
196 	return (wch == L'\0' ? 0 : want);
197 }
198 
199 static size_t
200 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
201     size_t nms, size_t len, mbstate_t * __restrict ps)
202 {
203 	_UTF8State *us;
204 	const char *s;
205 	size_t nchr;
206 	wchar_t wc;
207 	size_t nb;
208 
209 	us = (_UTF8State *)ps;
210 
211 	s = *src;
212 	nchr = 0;
213 
214 	if (dst == NULL) {
215 		/*
216 		 * The fast path in the loop below is not safe if an ASCII
217 		 * character appears as anything but the first byte of a
218 		 * multibyte sequence. Check now to avoid doing it in the loop.
219 		 */
220 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
221 			errno = EILSEQ;
222 			return ((size_t)-1);
223 		}
224 		for (;;) {
225 			if (nms > 0 && (signed char)*s > 0)
226 				/*
227 				 * Fast path for plain ASCII characters
228 				 * excluding NUL.
229 				 */
230 				nb = 1;
231 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
232 			    (size_t)-1)
233 				/* Invalid sequence - mbrtowc() sets errno. */
234 				return ((size_t)-1);
235 			else if (nb == 0 || nb == (size_t)-2)
236 				return (nchr);
237 			s += nb;
238 			nms -= nb;
239 			nchr++;
240 		}
241 		/*NOTREACHED*/
242 	}
243 
244 	/*
245 	 * The fast path in the loop below is not safe if an ASCII
246 	 * character appears as anything but the first byte of a
247 	 * multibyte sequence. Check now to avoid doing it in the loop.
248 	 */
249 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
250 		errno = EILSEQ;
251 		return ((size_t)-1);
252 	}
253 	while (len-- > 0) {
254 		if (nms > 0 && (signed char)*s > 0) {
255 			/*
256 			 * Fast path for plain ASCII characters
257 			 * excluding NUL.
258 			 */
259 			*dst = (wchar_t)*s;
260 			nb = 1;
261 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
262 		    (size_t)-1) {
263 			*src = s;
264 			return ((size_t)-1);
265 		} else if (nb == (size_t)-2) {
266 			*src = s + nms;
267 			return (nchr);
268 		} else if (nb == 0) {
269 			*src = NULL;
270 			return (nchr);
271 		}
272 		s += nb;
273 		nms -= nb;
274 		nchr++;
275 		dst++;
276 	}
277 	*src = s;
278 	return (nchr);
279 }
280 
281 static size_t
282 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
283 {
284 	_UTF8State *us;
285 	unsigned char lead;
286 	int i, len;
287 
288 	us = (_UTF8State *)ps;
289 
290 	if (us->want != 0) {
291 		errno = EINVAL;
292 		return ((size_t)-1);
293 	}
294 
295 	if (s == NULL)
296 		/* Reset to initial shift state (no-op) */
297 		return (1);
298 
299 	if ((wc & ~0x7f) == 0) {
300 		/* Fast path for plain ASCII characters. */
301 		*s = (char)wc;
302 		return (1);
303 	}
304 
305 	/*
306 	 * Determine the number of octets needed to represent this character.
307 	 * We always output the shortest sequence possible. Also specify the
308 	 * first few bits of the first octet, which contains the information
309 	 * about the sequence length.
310 	 */
311 	if ((wc & ~0x7f) == 0) {
312 		lead = 0;
313 		len = 1;
314 	} else if ((wc & ~0x7ff) == 0) {
315 		lead = 0xc0;
316 		len = 2;
317 	} else if ((wc & ~0xffff) == 0) {
318 		lead = 0xe0;
319 		len = 3;
320 	} else if ((wc & ~0x1fffff) == 0) {
321 		lead = 0xf0;
322 		len = 4;
323 	} else if ((wc & ~0x3ffffff) == 0) {
324 		lead = 0xf8;
325 		len = 5;
326 	} else if ((wc & ~0x7fffffff) == 0) {
327 		lead = 0xfc;
328 		len = 6;
329 	} else {
330 		errno = EILSEQ;
331 		return ((size_t)-1);
332 	}
333 
334 	/*
335 	 * Output the octets representing the character in chunks
336 	 * of 6 bits, least significant last. The first octet is
337 	 * a special case because it contains the sequence length
338 	 * information.
339 	 */
340 	for (i = len - 1; i > 0; i--) {
341 		s[i] = (wc & 0x3f) | 0x80;
342 		wc >>= 6;
343 	}
344 	*s = (wc & 0xff) | lead;
345 
346 	return (len);
347 }
348 
349 static size_t
350 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
351     size_t nwc, size_t len, mbstate_t * __restrict ps)
352 {
353 	_UTF8State *us;
354 	char buf[MB_LEN_MAX];
355 	const wchar_t *s;
356 	size_t nbytes;
357 	size_t nb;
358 
359 	us = (_UTF8State *)ps;
360 
361 	if (us->want != 0) {
362 		errno = EINVAL;
363 		return ((size_t)-1);
364 	}
365 
366 	s = *src;
367 	nbytes = 0;
368 
369 	if (dst == NULL) {
370 		while (nwc-- > 0) {
371 			if (0 <= *s && *s < 0x80)
372 				/* Fast path for plain ASCII characters. */
373 				nb = 1;
374 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
375 			    (size_t)-1)
376 				/* Invalid character - wcrtomb() sets errno. */
377 				return ((size_t)-1);
378 			if (*s == L'\0')
379 				return (nbytes + nb - 1);
380 			s++;
381 			nbytes += nb;
382 		}
383 		return (nbytes);
384 	}
385 
386 	while (len > 0 && nwc-- > 0) {
387 		if (0 <= *s && *s < 0x80) {
388 			/* Fast path for plain ASCII characters. */
389 			nb = 1;
390 			*dst = *s;
391 		} else if (len > (size_t)MB_CUR_MAX) {
392 			/* Enough space to translate in-place. */
393 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
394 				*src = s;
395 				return ((size_t)-1);
396 			}
397 		} else {
398 			/*
399 			 * May not be enough space; use temp. buffer.
400 			 */
401 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
402 				*src = s;
403 				return ((size_t)-1);
404 			}
405 			if (nb > (int)len)
406 				/* MB sequence for character won't fit. */
407 				break;
408 			memcpy(dst, buf, nb);
409 		}
410 		if (*s == L'\0') {
411 			*src = NULL;
412 			return (nbytes + nb - 1);
413 		}
414 		s++;
415 		dst += nb;
416 		len -= nb;
417 		nbytes += nb;
418 	}
419 	*src = s;
420 	return (nbytes);
421 }
422