xref: /illumos-gate/usr/src/lib/libc/port/locale/utf8.c (revision 99ea293e719ac006d413e4fde6ac0d5cd4dd6c59)
1 /*
2  * Copyright 2013 Garrett D'Amore <garrett@damore.org>
3  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include "lint.h"
30 #include <errno.h>
31 #include <limits.h>
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "mblocal.h"
36 #include "lctype.h"
37 
38 static size_t	_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
39 		    const char *_RESTRICT_KYWD,
40 		    size_t, mbstate_t *_RESTRICT_KYWD, boolean_t);
41 static int	_UTF8_mbsinit(const mbstate_t *);
42 static size_t	_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
43 		    const char **_RESTRICT_KYWD, size_t, size_t,
44 		    mbstate_t *_RESTRICT_KYWD);
45 static size_t	_UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
46 		    mbstate_t *_RESTRICT_KYWD);
47 static size_t	_UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
48 		    const wchar_t **_RESTRICT_KYWD,
49 		    size_t, size_t, mbstate_t *_RESTRICT_KYWD);
50 
51 void
52 _UTF8_init(struct lc_ctype *lct)
53 {
54 	lct->lc_mbrtowc = _UTF8_mbrtowc;
55 	lct->lc_wcrtomb = _UTF8_wcrtomb;
56 	lct->lc_mbsinit = _UTF8_mbsinit;
57 	lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
58 	lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
59 	lct->lc_is_ascii = 0;
60 	lct->lc_max_mblen = 4;
61 }
62 
63 static int
64 _UTF8_mbsinit(const mbstate_t *ps)
65 {
66 
67 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
68 }
69 
70 static size_t
71 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
72     size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero)
73 {
74 	_UTF8State *us;
75 	int ch, i, mask, want;
76 	wchar_t lbound, wch;
77 
78 	us = (_UTF8State *)ps;
79 
80 	if (us->want < 0 || us->want > 6) {
81 		errno = EINVAL;
82 		return ((size_t)-1);
83 	}
84 
85 	if (s == NULL) {
86 		s = "";
87 		n = 1;
88 		pwc = NULL;
89 	}
90 
91 	if (n == 0)
92 		/* Incomplete multibyte sequence */
93 		return ((size_t)-2);
94 
95 	if (us->want == 0) {
96 		/*
97 		 * Determine the number of octets that make up this character
98 		 * from the first octet, and a mask that extracts the
99 		 * interesting bits of the first octet. We already know
100 		 * the character is at least two bytes long.
101 		 *
102 		 * We also specify a lower bound for the character code to
103 		 * detect redundant, non-"shortest form" encodings. For
104 		 * example, the sequence C0 80 is _not_ a legal representation
105 		 * of the null character. This enforces a 1-to-1 mapping
106 		 * between character codes and their multibyte representations.
107 		 */
108 		ch = (unsigned char)*s;
109 		if ((ch & 0x80) == 0) {
110 			/* Fast path for plain ASCII characters. */
111 			if (pwc != NULL)
112 				*pwc = ch;
113 			if (zero || ch != '\0') {
114 				return (1);
115 			} else {
116 				return (0);
117 			}
118 		}
119 		if ((ch & 0xe0) == 0xc0) {
120 			mask = 0x1f;
121 			want = 2;
122 			lbound = 0x80;
123 		} else if ((ch & 0xf0) == 0xe0) {
124 			mask = 0x0f;
125 			want = 3;
126 			lbound = 0x800;
127 		} else if ((ch & 0xf8) == 0xf0) {
128 			mask = 0x07;
129 			want = 4;
130 			lbound = 0x10000;
131 #if 0
132 		/* These would be illegal in the UTF-8 space */
133 
134 		} else if ((ch & 0xfc) == 0xf8) {
135 			mask = 0x03;
136 			want = 5;
137 			lbound = 0x200000;
138 		} else if ((ch & 0xfe) == 0xfc) {
139 			mask = 0x01;
140 			want = 6;
141 			lbound = 0x4000000;
142 #endif
143 		} else {
144 			/*
145 			 * Malformed input; input is not UTF-8.
146 			 */
147 			errno = EILSEQ;
148 			return ((size_t)-1);
149 		}
150 	} else {
151 		want = us->want;
152 		lbound = us->lbound;
153 	}
154 
155 	/*
156 	 * Decode the octet sequence representing the character in chunks
157 	 * of 6 bits, most significant first.
158 	 */
159 	if (us->want == 0)
160 		wch = (unsigned char)*s++ & mask;
161 	else
162 		wch = us->ch;
163 
164 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
165 		if ((*s & 0xc0) != 0x80) {
166 			/*
167 			 * Malformed input; bad characters in the middle
168 			 * of a character.
169 			 */
170 			errno = EILSEQ;
171 			return ((size_t)-1);
172 		}
173 		wch <<= 6;
174 		wch |= *s++ & 0x3f;
175 	}
176 	if (i < want) {
177 		/* Incomplete multibyte sequence. */
178 		us->want = want - i;
179 		us->lbound = lbound;
180 		us->ch = wch;
181 		return ((size_t)-2);
182 	}
183 	if (wch < lbound) {
184 		/*
185 		 * Malformed input; redundant encoding.
186 		 */
187 		errno = EILSEQ;
188 		return ((size_t)-1);
189 	}
190 	if (pwc != NULL)
191 		*pwc = wch;
192 	us->want = 0;
193 	if (zero || wch != L'\0') {
194 		return (want);
195 	} else {
196 		return (0);
197 	}
198 }
199 
200 static size_t
201 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
202     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
203 {
204 	_UTF8State *us;
205 	const char *s;
206 	size_t nchr;
207 	wchar_t wc;
208 	size_t nb;
209 
210 	us = (_UTF8State *)ps;
211 
212 	s = *src;
213 	nchr = 0;
214 
215 	if (dst == NULL) {
216 		/*
217 		 * The fast path in the loop below is not safe if an ASCII
218 		 * character appears as anything but the first byte of a
219 		 * multibyte sequence. Check now to avoid doing it in the loop.
220 		 */
221 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
222 			errno = EILSEQ;
223 			return ((size_t)-1);
224 		}
225 		for (;;) {
226 			if (nms > 0 && (signed char)*s > 0) {
227 				/*
228 				 * Fast path for plain ASCII characters
229 				 * excluding NUL.
230 				 */
231 				nb = 1;
232 			} else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps,
233 			    B_FALSE)) == (size_t)-1) {
234 				/* Invalid sequence - mbrtowc() sets errno. */
235 				return ((size_t)-1);
236 			} else if (nb == 0 || nb == (size_t)-2) {
237 				return (nchr);
238 			}
239 			s += nb;
240 			nms -= nb;
241 			nchr++;
242 		}
243 		/*NOTREACHED*/
244 	}
245 
246 	/*
247 	 * The fast path in the loop below is not safe if an ASCII
248 	 * character appears as anything but the first byte of a
249 	 * multibyte sequence. Check now to avoid doing it in the loop.
250 	 */
251 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
252 		errno = EILSEQ;
253 		return ((size_t)-1);
254 	}
255 	while (len-- > 0) {
256 		if (nms > 0 && (signed char)*s > 0) {
257 			/*
258 			 * Fast path for plain ASCII characters
259 			 * excluding NUL.
260 			 */
261 			*dst = (wchar_t)*s;
262 			nb = 1;
263 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, B_FALSE)) ==
264 		    (size_t)-1) {
265 			*src = s;
266 			return ((size_t)-1);
267 		} else if (nb == (size_t)-2) {
268 			*src = s + nms;
269 			return (nchr);
270 		} else if (nb == 0) {
271 			*src = NULL;
272 			return (nchr);
273 		}
274 		s += nb;
275 		nms -= nb;
276 		nchr++;
277 		dst++;
278 	}
279 	*src = s;
280 	return (nchr);
281 }
282 
283 static size_t
284 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
285 {
286 	_UTF8State *us;
287 	unsigned char lead;
288 	int i, len;
289 
290 	us = (_UTF8State *)ps;
291 
292 	if (us->want != 0) {
293 		errno = EINVAL;
294 		return ((size_t)-1);
295 	}
296 
297 	if (s == NULL)
298 		/* Reset to initial shift state (no-op) */
299 		return (1);
300 
301 	/*
302 	 * Determine the number of octets needed to represent this character.
303 	 * We always output the shortest sequence possible. Also specify the
304 	 * first few bits of the first octet, which contains the information
305 	 * about the sequence length.
306 	 */
307 	if ((wc & ~0x7f) == 0) {
308 		/* Fast path for plain ASCII characters. */
309 		*s = (char)wc;
310 		return (1);
311 	} else if ((wc & ~0x7ff) == 0) {
312 		lead = 0xc0;
313 		len = 2;
314 	} else if ((wc & ~0xffff) == 0) {
315 		lead = 0xe0;
316 		len = 3;
317 	} else if ((wc & ~0x1fffff) == 0) {
318 		lead = 0xf0;
319 		len = 4;
320 #if 0
321 	/* Again, 5 and 6 byte encodings are simply not permitted */
322 	} else if ((wc & ~0x3ffffff) == 0) {
323 		lead = 0xf8;
324 		len = 5;
325 	} else if ((wc & ~0x7fffffff) == 0) {
326 		lead = 0xfc;
327 		len = 6;
328 #endif
329 	} else {
330 		errno = EILSEQ;
331 		return ((size_t)-1);
332 	}
333 
334 	/*
335 	 * Output the octets representing the character in chunks
336 	 * of 6 bits, least significant last. The first octet is
337 	 * a special case because it contains the sequence length
338 	 * information.
339 	 */
340 	for (i = len - 1; i > 0; i--) {
341 		s[i] = (wc & 0x3f) | 0x80;
342 		wc >>= 6;
343 	}
344 	*s = (wc & 0xff) | lead;
345 
346 	return (len);
347 }
348 
349 static size_t
350 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
351     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
352 {
353 	_UTF8State *us;
354 	char buf[MB_LEN_MAX];
355 	const wchar_t *s;
356 	size_t nbytes;
357 	size_t nb;
358 
359 	us = (_UTF8State *)ps;
360 
361 	if (us->want != 0) {
362 		errno = EINVAL;
363 		return ((size_t)-1);
364 	}
365 
366 	s = *src;
367 	nbytes = 0;
368 
369 	if (dst == NULL) {
370 		while (nwc-- > 0) {
371 			if (0 <= *s && *s < 0x80)
372 				/* Fast path for plain ASCII characters. */
373 				nb = 1;
374 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
375 			    (size_t)-1)
376 				/* Invalid character - wcrtomb() sets errno. */
377 				return ((size_t)-1);
378 			if (*s == L'\0')
379 				return (nbytes + nb - 1);
380 			s++;
381 			nbytes += nb;
382 		}
383 		return (nbytes);
384 	}
385 
386 	while (len > 0 && nwc-- > 0) {
387 		if (0 <= *s && *s < 0x80) {
388 			/* Fast path for plain ASCII characters. */
389 			nb = 1;
390 			*dst = *s;
391 		} else if (len > (size_t)MB_CUR_MAX) {
392 			/* Enough space to translate in-place. */
393 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
394 				*src = s;
395 				return ((size_t)-1);
396 			}
397 		} else {
398 			/*
399 			 * May not be enough space; use temp. buffer.
400 			 */
401 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
402 				*src = s;
403 				return ((size_t)-1);
404 			}
405 			if (nb > (int)len)
406 				/* MB sequence for character won't fit. */
407 				break;
408 			(void) memcpy(dst, buf, nb);
409 		}
410 		if (*s == L'\0') {
411 			*src = NULL;
412 			return (nbytes + nb - 1);
413 		}
414 		s++;
415 		dst += nb;
416 		len -= nb;
417 		nbytes += nb;
418 	}
419 	*src = s;
420 	return (nbytes);
421 }
422