xref: /illumos-gate/usr/src/lib/libc/port/locale/utf8.c (revision eb6b10e69fa5ba733da194d3ad71a0e63338be29)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "lint.h"
30 #include <errno.h>
31 #include <limits.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "mblocal.h"
36 #include "lctype.h"
37 
/*
 * Forward declarations for the UTF-8 conversion routines defined later in
 * this file.  _UTF8_init() stores pointers to them in a struct lc_ctype.
 */
static size_t	_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
		    const char *_RESTRICT_KYWD,
		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
		    const char **_RESTRICT_KYWD, size_t, size_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
		    mbstate_t *_RESTRICT_KYWD);
static size_t	_UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
		    const wchar_t **_RESTRICT_KYWD,
		    size_t, size_t, mbstate_t *_RESTRICT_KYWD);
50 
/*
 * Decoder state carried between calls.  The conversion routines in this
 * file access it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	wchar_t	ch;	/* bits of the character accumulated so far */
	int	want;	/* continuation octets still needed; 0 when idle */
	wchar_t	lbound;	/* smallest value the pending sequence may encode */
} _UTF8State;
56 
57 void
58 _UTF8_init(struct lc_ctype *lct)
59 {
60 	lct->lc_mbrtowc = _UTF8_mbrtowc;
61 	lct->lc_wcrtomb = _UTF8_wcrtomb;
62 	lct->lc_mbsinit = _UTF8_mbsinit;
63 	lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
64 	lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
65 	lct->lc_is_ascii = 0;
66 	lct->lc_max_mblen = 4;
67 }
68 
69 static int
70 _UTF8_mbsinit(const mbstate_t *ps)
71 {
72 
73 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
74 }
75 
76 static size_t
77 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
78     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
79 {
80 	_UTF8State *us;
81 	int ch, i, mask, want;
82 	wchar_t lbound, wch;
83 
84 	us = (_UTF8State *)ps;
85 
86 	if (us->want < 0 || us->want > 6) {
87 		errno = EINVAL;
88 		return ((size_t)-1);
89 	}
90 
91 	if (s == NULL) {
92 		s = "";
93 		n = 1;
94 		pwc = NULL;
95 	}
96 
97 	if (n == 0)
98 		/* Incomplete multibyte sequence */
99 		return ((size_t)-2);
100 
101 	if (us->want == 0) {
102 		/*
103 		 * Determine the number of octets that make up this character
104 		 * from the first octet, and a mask that extracts the
105 		 * interesting bits of the first octet. We already know
106 		 * the character is at least two bytes long.
107 		 *
108 		 * We also specify a lower bound for the character code to
109 		 * detect redundant, non-"shortest form" encodings. For
110 		 * example, the sequence C0 80 is _not_ a legal representation
111 		 * of the null character. This enforces a 1-to-1 mapping
112 		 * between character codes and their multibyte representations.
113 		 */
114 		ch = (unsigned char)*s;
115 		if ((ch & 0x80) == 0) {
116 			/* Fast path for plain ASCII characters. */
117 			if (pwc != NULL)
118 				*pwc = ch;
119 			if (zero || ch != '\0') {
120 				return (1);
121 			} else {
122 				return (0);
123 			}
124 		}
125 		if ((ch & 0xe0) == 0xc0) {
126 			mask = 0x1f;
127 			want = 2;
128 			lbound = 0x80;
129 		} else if ((ch & 0xf0) == 0xe0) {
130 			mask = 0x0f;
131 			want = 3;
132 			lbound = 0x800;
133 		} else if ((ch & 0xf8) == 0xf0) {
134 			mask = 0x07;
135 			want = 4;
136 			lbound = 0x10000;
137 #if 0
138 		/* These would be illegal in the UTF-8 space */
139 
140 		} else if ((ch & 0xfc) == 0xf8) {
141 			mask = 0x03;
142 			want = 5;
143 			lbound = 0x200000;
144 		} else if ((ch & 0xfe) == 0xfc) {
145 			mask = 0x01;
146 			want = 6;
147 			lbound = 0x4000000;
148 #endif
149 		} else {
150 			/*
151 			 * Malformed input; input is not UTF-8.
152 			 */
153 			errno = EILSEQ;
154 			return ((size_t)-1);
155 		}
156 	} else {
157 		want = us->want;
158 		lbound = us->lbound;
159 	}
160 
161 	/*
162 	 * Decode the octet sequence representing the character in chunks
163 	 * of 6 bits, most significant first.
164 	 */
165 	if (us->want == 0)
166 		wch = (unsigned char)*s++ & mask;
167 	else
168 		wch = us->ch;
169 
170 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
171 		if ((*s & 0xc0) != 0x80) {
172 			/*
173 			 * Malformed input; bad characters in the middle
174 			 * of a character.
175 			 */
176 			errno = EILSEQ;
177 			return ((size_t)-1);
178 		}
179 		wch <<= 6;
180 		wch |= *s++ & 0x3f;
181 	}
182 	if (i < want) {
183 		/* Incomplete multibyte sequence. */
184 		us->want = want - i;
185 		us->lbound = lbound;
186 		us->ch = wch;
187 		return ((size_t)-2);
188 	}
189 	if (wch < lbound) {
190 		/*
191 		 * Malformed input; redundant encoding.
192 		 */
193 		errno = EILSEQ;
194 		return ((size_t)-1);
195 	}
196 	if (pwc != NULL)
197 		*pwc = wch;
198 	us->want = 0;
199 	if (zero || wch != L'\0') {
200 		return (want);
201 	} else {
202 		return (0);
203 	}
204 }
205 
206 static size_t
207 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
208     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
209 {
210 	_UTF8State *us;
211 	const char *s;
212 	size_t nchr;
213 	wchar_t wc;
214 	size_t nb;
215 
216 	us = (_UTF8State *)ps;
217 
218 	s = *src;
219 	nchr = 0;
220 
221 	if (dst == NULL) {
222 		/*
223 		 * The fast path in the loop below is not safe if an ASCII
224 		 * character appears as anything but the first byte of a
225 		 * multibyte sequence. Check now to avoid doing it in the loop.
226 		 */
227 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
228 			errno = EILSEQ;
229 			return ((size_t)-1);
230 		}
231 		for (;;) {
232 			if (nms > 0 && (signed char)*s > 0) {
233 				/*
234 				 * Fast path for plain ASCII characters
235 				 * excluding NUL.
236 				 */
237 				nb = 1;
238 			} else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps,
239 			    B_FALSE)) == (size_t)-1) {
240 				/* Invalid sequence - mbrtowc() sets errno. */
241 				return ((size_t)-1);
242 			} else if (nb == 0 || nb == (size_t)-2) {
243 				return (nchr);
244 			}
245 			s += nb;
246 			nms -= nb;
247 			nchr++;
248 		}
249 		/*NOTREACHED*/
250 	}
251 
252 	/*
253 	 * The fast path in the loop below is not safe if an ASCII
254 	 * character appears as anything but the first byte of a
255 	 * multibyte sequence. Check now to avoid doing it in the loop.
256 	 */
257 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
258 		errno = EILSEQ;
259 		return ((size_t)-1);
260 	}
261 	while (len-- > 0) {
262 		if (nms > 0 && (signed char)*s > 0) {
263 			/*
264 			 * Fast path for plain ASCII characters
265 			 * excluding NUL.
266 			 */
267 			*dst = (wchar_t)*s;
268 			nb = 1;
269 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, B_FALSE)) ==
270 		    (size_t)-1) {
271 			*src = s;
272 			return ((size_t)-1);
273 		} else if (nb == (size_t)-2) {
274 			*src = s + nms;
275 			return (nchr);
276 		} else if (nb == 0) {
277 			*src = NULL;
278 			return (nchr);
279 		}
280 		s += nb;
281 		nms -= nb;
282 		nchr++;
283 		dst++;
284 	}
285 	*src = s;
286 	return (nchr);
287 }
288 
289 static size_t
290 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
291 {
292 	_UTF8State *us;
293 	unsigned char lead;
294 	int i, len;
295 
296 	us = (_UTF8State *)ps;
297 
298 	if (us->want != 0) {
299 		errno = EINVAL;
300 		return ((size_t)-1);
301 	}
302 
303 	if (s == NULL)
304 		/* Reset to initial shift state (no-op) */
305 		return (1);
306 
307 	/*
308 	 * Determine the number of octets needed to represent this character.
309 	 * We always output the shortest sequence possible. Also specify the
310 	 * first few bits of the first octet, which contains the information
311 	 * about the sequence length.
312 	 */
313 	if ((wc & ~0x7f) == 0) {
314 		/* Fast path for plain ASCII characters. */
315 		*s = (char)wc;
316 		return (1);
317 	} else if ((wc & ~0x7ff) == 0) {
318 		lead = 0xc0;
319 		len = 2;
320 	} else if ((wc & ~0xffff) == 0) {
321 		lead = 0xe0;
322 		len = 3;
323 	} else if ((wc & ~0x1fffff) == 0) {
324 		lead = 0xf0;
325 		len = 4;
326 #if 0
327 	/* Again, 5 and 6 byte encodings are simply not permitted */
328 	} else if ((wc & ~0x3ffffff) == 0) {
329 		lead = 0xf8;
330 		len = 5;
331 	} else if ((wc & ~0x7fffffff) == 0) {
332 		lead = 0xfc;
333 		len = 6;
334 #endif
335 	} else {
336 		errno = EILSEQ;
337 		return ((size_t)-1);
338 	}
339 
340 	/*
341 	 * Output the octets representing the character in chunks
342 	 * of 6 bits, least significant last. The first octet is
343 	 * a special case because it contains the sequence length
344 	 * information.
345 	 */
346 	for (i = len - 1; i > 0; i--) {
347 		s[i] = (wc & 0x3f) | 0x80;
348 		wc >>= 6;
349 	}
350 	*s = (wc & 0xff) | lead;
351 
352 	return (len);
353 }
354 
355 static size_t
356 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
357     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
358 {
359 	_UTF8State *us;
360 	char buf[MB_LEN_MAX];
361 	const wchar_t *s;
362 	size_t nbytes;
363 	size_t nb;
364 
365 	us = (_UTF8State *)ps;
366 
367 	if (us->want != 0) {
368 		errno = EINVAL;
369 		return ((size_t)-1);
370 	}
371 
372 	s = *src;
373 	nbytes = 0;
374 
375 	if (dst == NULL) {
376 		while (nwc-- > 0) {
377 			if (0 <= *s && *s < 0x80)
378 				/* Fast path for plain ASCII characters. */
379 				nb = 1;
380 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
381 			    (size_t)-1)
382 				/* Invalid character - wcrtomb() sets errno. */
383 				return ((size_t)-1);
384 			if (*s == L'\0')
385 				return (nbytes + nb - 1);
386 			s++;
387 			nbytes += nb;
388 		}
389 		return (nbytes);
390 	}
391 
392 	while (len > 0 && nwc-- > 0) {
393 		if (0 <= *s && *s < 0x80) {
394 			/* Fast path for plain ASCII characters. */
395 			nb = 1;
396 			*dst = *s;
397 		} else if (len > (size_t)MB_CUR_MAX) {
398 			/* Enough space to translate in-place. */
399 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
400 				*src = s;
401 				return ((size_t)-1);
402 			}
403 		} else {
404 			/*
405 			 * May not be enough space; use temp. buffer.
406 			 */
407 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
408 				*src = s;
409 				return ((size_t)-1);
410 			}
411 			if (nb > (int)len)
412 				/* MB sequence for character won't fit. */
413 				break;
414 			(void) memcpy(dst, buf, nb);
415 		}
416 		if (*s == L'\0') {
417 			*src = NULL;
418 			return (nbytes + nb - 1);
419 		}
420 		s++;
421 		dst += nb;
422 		len -= nb;
423 		nbytes += nb;
424 	}
425 	*src = s;
426 	return (nbytes);
427 }
428