xref: /illumos-gate/usr/src/lib/libc/port/locale/utf8.c (revision 4585130b259133a26efae68275dbe56b08366deb)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "lint.h"
30 #include <errno.h>
31 #include <limits.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "mblocal.h"
36 #include "lctype.h"
37 
38 static size_t	_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
39 		    const char *_RESTRICT_KYWD,
40 		    size_t, mbstate_t *_RESTRICT_KYWD);
41 static int	_UTF8_mbsinit(const mbstate_t *);
42 static size_t	_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
43 		    const char **_RESTRICT_KYWD, size_t, size_t,
44 		    mbstate_t *_RESTRICT_KYWD);
45 static size_t	_UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
46 		    mbstate_t *_RESTRICT_KYWD);
47 static size_t	_UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
48 		    const wchar_t **_RESTRICT_KYWD,
49 		    size_t, size_t, mbstate_t *_RESTRICT_KYWD);
50 
/*
 * Per-conversion state for the UTF-8 decoder.  The routines in this file
 * access it by casting the caller's mbstate_t pointer to _UTF8State *.
 *
 *	ch	bits of the character accumulated so far from a sequence
 *		that was cut short by the end of the input
 *	want	number of bytes still needed to finish that sequence;
 *		zero means no sequence is pending (the initial state)
 *	lbound	smallest value the finished character may have; anything
 *		lower is rejected as a non-shortest-form encoding
 */
51 typedef struct {
52 	wchar_t	ch;
53 	int	want;
54 	wchar_t	lbound;
55 } _UTF8State;
56 
57 void
58 _UTF8_init(struct lc_ctype *lct)
59 {
60 	lct->lc_mbrtowc = _UTF8_mbrtowc;
61 	lct->lc_wcrtomb = _UTF8_wcrtomb;
62 	lct->lc_mbsinit = _UTF8_mbsinit;
63 	lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
64 	lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
65 	lct->lc_is_ascii = 0;
66 	lct->lc_max_mblen = 4;
67 }
68 
69 static int
70 _UTF8_mbsinit(const mbstate_t *ps)
71 {
72 
73 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
74 }
75 
76 static size_t
77 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
78     size_t n, mbstate_t *_RESTRICT_KYWD ps)
79 {
80 	_UTF8State *us;
81 	int ch, i, mask, want;
82 	wchar_t lbound, wch;
83 
84 	us = (_UTF8State *)ps;
85 
86 	if (us->want < 0 || us->want > 6) {
87 		errno = EINVAL;
88 		return ((size_t)-1);
89 	}
90 
91 	if (s == NULL) {
92 		s = "";
93 		n = 1;
94 		pwc = NULL;
95 	}
96 
97 	if (n == 0)
98 		/* Incomplete multibyte sequence */
99 		return ((size_t)-2);
100 
101 	if (us->want == 0) {
102 		/*
103 		 * Determine the number of octets that make up this character
104 		 * from the first octet, and a mask that extracts the
105 		 * interesting bits of the first octet. We already know
106 		 * the character is at least two bytes long.
107 		 *
108 		 * We also specify a lower bound for the character code to
109 		 * detect redundant, non-"shortest form" encodings. For
110 		 * example, the sequence C0 80 is _not_ a legal representation
111 		 * of the null character. This enforces a 1-to-1 mapping
112 		 * between character codes and their multibyte representations.
113 		 */
114 		ch = (unsigned char)*s;
115 		if ((ch & 0x80) == 0) {
116 			/* Fast path for plain ASCII characters. */
117 			if (pwc != NULL)
118 				*pwc = ch;
119 			return (ch != '\0' ? 1 : 0);
120 		}
121 		if ((ch & 0xe0) == 0xc0) {
122 			mask = 0x1f;
123 			want = 2;
124 			lbound = 0x80;
125 		} else if ((ch & 0xf0) == 0xe0) {
126 			mask = 0x0f;
127 			want = 3;
128 			lbound = 0x800;
129 		} else if ((ch & 0xf8) == 0xf0) {
130 			mask = 0x07;
131 			want = 4;
132 			lbound = 0x10000;
133 #if 0
134 		/* These would be illegal in the UTF-8 space */
135 
136 		} else if ((ch & 0xfc) == 0xf8) {
137 			mask = 0x03;
138 			want = 5;
139 			lbound = 0x200000;
140 		} else if ((ch & 0xfe) == 0xfc) {
141 			mask = 0x01;
142 			want = 6;
143 			lbound = 0x4000000;
144 #endif
145 		} else {
146 			/*
147 			 * Malformed input; input is not UTF-8.
148 			 */
149 			errno = EILSEQ;
150 			return ((size_t)-1);
151 		}
152 	} else {
153 		want = us->want;
154 		lbound = us->lbound;
155 	}
156 
157 	/*
158 	 * Decode the octet sequence representing the character in chunks
159 	 * of 6 bits, most significant first.
160 	 */
161 	if (us->want == 0)
162 		wch = (unsigned char)*s++ & mask;
163 	else
164 		wch = us->ch;
165 
166 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
167 		if ((*s & 0xc0) != 0x80) {
168 			/*
169 			 * Malformed input; bad characters in the middle
170 			 * of a character.
171 			 */
172 			errno = EILSEQ;
173 			return ((size_t)-1);
174 		}
175 		wch <<= 6;
176 		wch |= *s++ & 0x3f;
177 	}
178 	if (i < want) {
179 		/* Incomplete multibyte sequence. */
180 		us->want = want - i;
181 		us->lbound = lbound;
182 		us->ch = wch;
183 		return ((size_t)-2);
184 	}
185 	if (wch < lbound) {
186 		/*
187 		 * Malformed input; redundant encoding.
188 		 */
189 		errno = EILSEQ;
190 		return ((size_t)-1);
191 	}
192 	if (pwc != NULL)
193 		*pwc = wch;
194 	us->want = 0;
195 	return (wch == L'\0' ? 0 : want);
196 }
197 
198 static size_t
199 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
200     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
201 {
202 	_UTF8State *us;
203 	const char *s;
204 	size_t nchr;
205 	wchar_t wc;
206 	size_t nb;
207 
208 	us = (_UTF8State *)ps;
209 
210 	s = *src;
211 	nchr = 0;
212 
213 	if (dst == NULL) {
214 		/*
215 		 * The fast path in the loop below is not safe if an ASCII
216 		 * character appears as anything but the first byte of a
217 		 * multibyte sequence. Check now to avoid doing it in the loop.
218 		 */
219 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
220 			errno = EILSEQ;
221 			return ((size_t)-1);
222 		}
223 		for (;;) {
224 			if (nms > 0 && (signed char)*s > 0)
225 				/*
226 				 * Fast path for plain ASCII characters
227 				 * excluding NUL.
228 				 */
229 				nb = 1;
230 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
231 			    (size_t)-1)
232 				/* Invalid sequence - mbrtowc() sets errno. */
233 				return ((size_t)-1);
234 			else if (nb == 0 || nb == (size_t)-2)
235 				return (nchr);
236 			s += nb;
237 			nms -= nb;
238 			nchr++;
239 		}
240 		/*NOTREACHED*/
241 	}
242 
243 	/*
244 	 * The fast path in the loop below is not safe if an ASCII
245 	 * character appears as anything but the first byte of a
246 	 * multibyte sequence. Check now to avoid doing it in the loop.
247 	 */
248 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
249 		errno = EILSEQ;
250 		return ((size_t)-1);
251 	}
252 	while (len-- > 0) {
253 		if (nms > 0 && (signed char)*s > 0) {
254 			/*
255 			 * Fast path for plain ASCII characters
256 			 * excluding NUL.
257 			 */
258 			*dst = (wchar_t)*s;
259 			nb = 1;
260 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
261 		    (size_t)-1) {
262 			*src = s;
263 			return ((size_t)-1);
264 		} else if (nb == (size_t)-2) {
265 			*src = s + nms;
266 			return (nchr);
267 		} else if (nb == 0) {
268 			*src = NULL;
269 			return (nchr);
270 		}
271 		s += nb;
272 		nms -= nb;
273 		nchr++;
274 		dst++;
275 	}
276 	*src = s;
277 	return (nchr);
278 }
279 
280 static size_t
281 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
282 {
283 	_UTF8State *us;
284 	unsigned char lead;
285 	int i, len;
286 
287 	us = (_UTF8State *)ps;
288 
289 	if (us->want != 0) {
290 		errno = EINVAL;
291 		return ((size_t)-1);
292 	}
293 
294 	if (s == NULL)
295 		/* Reset to initial shift state (no-op) */
296 		return (1);
297 
298 	/*
299 	 * Determine the number of octets needed to represent this character.
300 	 * We always output the shortest sequence possible. Also specify the
301 	 * first few bits of the first octet, which contains the information
302 	 * about the sequence length.
303 	 */
304 	if ((wc & ~0x7f) == 0) {
305 		/* Fast path for plain ASCII characters. */
306 		*s = (char)wc;
307 		return (1);
308 	} else if ((wc & ~0x7ff) == 0) {
309 		lead = 0xc0;
310 		len = 2;
311 	} else if ((wc & ~0xffff) == 0) {
312 		lead = 0xe0;
313 		len = 3;
314 	} else if ((wc & ~0x1fffff) == 0) {
315 		lead = 0xf0;
316 		len = 4;
317 #if 0
318 	/* Again, 5 and 6 byte encodings are simply not permitted */
319 	} else if ((wc & ~0x3ffffff) == 0) {
320 		lead = 0xf8;
321 		len = 5;
322 	} else if ((wc & ~0x7fffffff) == 0) {
323 		lead = 0xfc;
324 		len = 6;
325 #endif
326 	} else {
327 		errno = EILSEQ;
328 		return ((size_t)-1);
329 	}
330 
331 	/*
332 	 * Output the octets representing the character in chunks
333 	 * of 6 bits, least significant last. The first octet is
334 	 * a special case because it contains the sequence length
335 	 * information.
336 	 */
337 	for (i = len - 1; i > 0; i--) {
338 		s[i] = (wc & 0x3f) | 0x80;
339 		wc >>= 6;
340 	}
341 	*s = (wc & 0xff) | lead;
342 
343 	return (len);
344 }
345 
346 static size_t
347 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
348     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
349 {
350 	_UTF8State *us;
351 	char buf[MB_LEN_MAX];
352 	const wchar_t *s;
353 	size_t nbytes;
354 	size_t nb;
355 
356 	us = (_UTF8State *)ps;
357 
358 	if (us->want != 0) {
359 		errno = EINVAL;
360 		return ((size_t)-1);
361 	}
362 
363 	s = *src;
364 	nbytes = 0;
365 
366 	if (dst == NULL) {
367 		while (nwc-- > 0) {
368 			if (0 <= *s && *s < 0x80)
369 				/* Fast path for plain ASCII characters. */
370 				nb = 1;
371 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
372 			    (size_t)-1)
373 				/* Invalid character - wcrtomb() sets errno. */
374 				return ((size_t)-1);
375 			if (*s == L'\0')
376 				return (nbytes + nb - 1);
377 			s++;
378 			nbytes += nb;
379 		}
380 		return (nbytes);
381 	}
382 
383 	while (len > 0 && nwc-- > 0) {
384 		if (0 <= *s && *s < 0x80) {
385 			/* Fast path for plain ASCII characters. */
386 			nb = 1;
387 			*dst = *s;
388 		} else if (len > (size_t)MB_CUR_MAX) {
389 			/* Enough space to translate in-place. */
390 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
391 				*src = s;
392 				return ((size_t)-1);
393 			}
394 		} else {
395 			/*
396 			 * May not be enough space; use temp. buffer.
397 			 */
398 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
399 				*src = s;
400 				return ((size_t)-1);
401 			}
402 			if (nb > (int)len)
403 				/* MB sequence for character won't fit. */
404 				break;
405 			(void) memcpy(dst, buf, nb);
406 		}
407 		if (*s == L'\0') {
408 			*src = NULL;
409 			return (nbytes + nb - 1);
410 		}
411 		s++;
412 		dst += nb;
413 		len -= nb;
414 		nbytes += nb;
415 	}
416 	*src = s;
417 	return (nbytes);
418 }
419