xref: /illumos-gate/usr/src/lib/libc/port/locale/utf8.c (revision 83eaeac78ef2f69de16c2fecd3077c0ee9269743)
1 /*
2  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
3  * Copyright (c) 2002-2004 Tim J. Robbins
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include "lint.h"
29 #include <errno.h>
30 #include <limits.h>
31 #include "runetype.h"
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include "mblocal.h"
36 
37 static size_t	_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
38 		    const char *_RESTRICT_KYWD,
39 		    size_t, mbstate_t *_RESTRICT_KYWD);
40 static int	_UTF8_mbsinit(const mbstate_t *);
41 static size_t	_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
42 		    const char **_RESTRICT_KYWD, size_t, size_t,
43 		    mbstate_t *_RESTRICT_KYWD);
44 static size_t	_UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
45 		    mbstate_t *_RESTRICT_KYWD);
46 static size_t	_UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
47 		    const wchar_t **_RESTRICT_KYWD,
48 		    size_t, size_t, mbstate_t *_RESTRICT_KYWD);
49 
/*
 * Conversion state kept inside the caller's mbstate_t (the functions in
 * this file cast the mbstate_t pointer to this type). It records a
 * multibyte sequence that was only partially supplied to _UTF8_mbrtowc().
 */
typedef struct {
	wchar_t	ch;		/* bits decoded so far from the partial sequence */
	int	want;		/* bytes still needed; 0 means initial state */
	wchar_t	lbound;		/* smallest value the finished character may have */
} _UTF8State;
55 
56 int
57 _UTF8_init(_RuneLocale *rl)
58 {
59 	__mbrtowc = _UTF8_mbrtowc;
60 	__wcrtomb = _UTF8_wcrtomb;
61 	__mbsinit = _UTF8_mbsinit;
62 	__mbsnrtowcs = _UTF8_mbsnrtowcs;
63 	__wcsnrtombs = _UTF8_wcsnrtombs;
64 	_CurrentRuneLocale = rl;
65 
66 	charset_is_ascii = 0;
67 
68 	/*
69 	 * In theory up to 6 bytes can be used for the encoding,
70 	 * but only encodings with more than 4 bytes are illegal.
71 	 */
72 	__ctype[520] = 4;
73 	/*
74 	 * Note that the other CSWIDTH members are nonsensical for this
75 	 * this coding.  They only are valid with EUC codings.
76 	 */
77 
78 	return (0);
79 }
80 
81 static int
82 _UTF8_mbsinit(const mbstate_t *ps)
83 {
84 
85 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
86 }
87 
88 static size_t
89 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
90     size_t n, mbstate_t *_RESTRICT_KYWD ps)
91 {
92 	_UTF8State *us;
93 	int ch, i, mask, want;
94 	wchar_t lbound, wch;
95 
96 	us = (_UTF8State *)ps;
97 
98 	if (us->want < 0 || us->want > 6) {
99 		errno = EINVAL;
100 		return ((size_t)-1);
101 	}
102 
103 	if (s == NULL) {
104 		s = "";
105 		n = 1;
106 		pwc = NULL;
107 	}
108 
109 	if (n == 0)
110 		/* Incomplete multibyte sequence */
111 		return ((size_t)-2);
112 
113 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
114 		/* Fast path for plain ASCII characters. */
115 		if (pwc != NULL)
116 			*pwc = ch;
117 		return (ch != '\0' ? 1 : 0);
118 	}
119 
120 	if (us->want == 0) {
121 		/*
122 		 * Determine the number of octets that make up this character
123 		 * from the first octet, and a mask that extracts the
124 		 * interesting bits of the first octet. We already know
125 		 * the character is at least two bytes long.
126 		 *
127 		 * We also specify a lower bound for the character code to
128 		 * detect redundant, non-"shortest form" encodings. For
129 		 * example, the sequence C0 80 is _not_ a legal representation
130 		 * of the null character. This enforces a 1-to-1 mapping
131 		 * between character codes and their multibyte representations.
132 		 */
133 		ch = (unsigned char)*s;
134 		if ((ch & 0x80) == 0) {
135 			mask = 0x7f;
136 			want = 1;
137 			lbound = 0;
138 		} else if ((ch & 0xe0) == 0xc0) {
139 			mask = 0x1f;
140 			want = 2;
141 			lbound = 0x80;
142 		} else if ((ch & 0xf0) == 0xe0) {
143 			mask = 0x0f;
144 			want = 3;
145 			lbound = 0x800;
146 		} else if ((ch & 0xf8) == 0xf0) {
147 			mask = 0x07;
148 			want = 4;
149 			lbound = 0x10000;
150 #if 0
151 		/* These would be illegal in the UTF-8 space */
152 
153 		} else if ((ch & 0xfc) == 0xf8) {
154 			mask = 0x03;
155 			want = 5;
156 			lbound = 0x200000;
157 		} else if ((ch & 0xfe) == 0xfc) {
158 			mask = 0x01;
159 			want = 6;
160 			lbound = 0x4000000;
161 #endif
162 		} else {
163 			/*
164 			 * Malformed input; input is not UTF-8.
165 			 */
166 			errno = EILSEQ;
167 			return ((size_t)-1);
168 		}
169 	} else {
170 		want = us->want;
171 		lbound = us->lbound;
172 	}
173 
174 	/*
175 	 * Decode the octet sequence representing the character in chunks
176 	 * of 6 bits, most significant first.
177 	 */
178 	if (us->want == 0)
179 		wch = (unsigned char)*s++ & mask;
180 	else
181 		wch = us->ch;
182 
183 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
184 		if ((*s & 0xc0) != 0x80) {
185 			/*
186 			 * Malformed input; bad characters in the middle
187 			 * of a character.
188 			 */
189 			errno = EILSEQ;
190 			return ((size_t)-1);
191 		}
192 		wch <<= 6;
193 		wch |= *s++ & 0x3f;
194 	}
195 	if (i < want) {
196 		/* Incomplete multibyte sequence. */
197 		us->want = want - i;
198 		us->lbound = lbound;
199 		us->ch = wch;
200 		return ((size_t)-2);
201 	}
202 	if (wch < lbound) {
203 		/*
204 		 * Malformed input; redundant encoding.
205 		 */
206 		errno = EILSEQ;
207 		return ((size_t)-1);
208 	}
209 	if (pwc != NULL)
210 		*pwc = wch;
211 	us->want = 0;
212 	return (wch == L'\0' ? 0 : want);
213 }
214 
215 static size_t
216 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
217     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
218 {
219 	_UTF8State *us;
220 	const char *s;
221 	size_t nchr;
222 	wchar_t wc;
223 	size_t nb;
224 
225 	us = (_UTF8State *)ps;
226 
227 	s = *src;
228 	nchr = 0;
229 
230 	if (dst == NULL) {
231 		/*
232 		 * The fast path in the loop below is not safe if an ASCII
233 		 * character appears as anything but the first byte of a
234 		 * multibyte sequence. Check now to avoid doing it in the loop.
235 		 */
236 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
237 			errno = EILSEQ;
238 			return ((size_t)-1);
239 		}
240 		for (;;) {
241 			if (nms > 0 && (signed char)*s > 0)
242 				/*
243 				 * Fast path for plain ASCII characters
244 				 * excluding NUL.
245 				 */
246 				nb = 1;
247 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
248 			    (size_t)-1)
249 				/* Invalid sequence - mbrtowc() sets errno. */
250 				return ((size_t)-1);
251 			else if (nb == 0 || nb == (size_t)-2)
252 				return (nchr);
253 			s += nb;
254 			nms -= nb;
255 			nchr++;
256 		}
257 		/*NOTREACHED*/
258 	}
259 
260 	/*
261 	 * The fast path in the loop below is not safe if an ASCII
262 	 * character appears as anything but the first byte of a
263 	 * multibyte sequence. Check now to avoid doing it in the loop.
264 	 */
265 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
266 		errno = EILSEQ;
267 		return ((size_t)-1);
268 	}
269 	while (len-- > 0) {
270 		if (nms > 0 && (signed char)*s > 0) {
271 			/*
272 			 * Fast path for plain ASCII characters
273 			 * excluding NUL.
274 			 */
275 			*dst = (wchar_t)*s;
276 			nb = 1;
277 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
278 		    (size_t)-1) {
279 			*src = s;
280 			return ((size_t)-1);
281 		} else if (nb == (size_t)-2) {
282 			*src = s + nms;
283 			return (nchr);
284 		} else if (nb == 0) {
285 			*src = NULL;
286 			return (nchr);
287 		}
288 		s += nb;
289 		nms -= nb;
290 		nchr++;
291 		dst++;
292 	}
293 	*src = s;
294 	return (nchr);
295 }
296 
297 static size_t
298 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
299 {
300 	_UTF8State *us;
301 	unsigned char lead;
302 	int i, len;
303 
304 	us = (_UTF8State *)ps;
305 
306 	if (us->want != 0) {
307 		errno = EINVAL;
308 		return ((size_t)-1);
309 	}
310 
311 	if (s == NULL)
312 		/* Reset to initial shift state (no-op) */
313 		return (1);
314 
315 	if ((wc & ~0x7f) == 0) {
316 		/* Fast path for plain ASCII characters. */
317 		*s = (char)wc;
318 		return (1);
319 	}
320 
321 	/*
322 	 * Determine the number of octets needed to represent this character.
323 	 * We always output the shortest sequence possible. Also specify the
324 	 * first few bits of the first octet, which contains the information
325 	 * about the sequence length.
326 	 */
327 	if ((wc & ~0x7f) == 0) {
328 		lead = 0;
329 		len = 1;
330 	} else if ((wc & ~0x7ff) == 0) {
331 		lead = 0xc0;
332 		len = 2;
333 	} else if ((wc & ~0xffff) == 0) {
334 		lead = 0xe0;
335 		len = 3;
336 	} else if ((wc & ~0x1fffff) == 0) {
337 		lead = 0xf0;
338 		len = 4;
339 #if 0
340 	/* Again, 5 and 6 byte encodings are simply not permitted */
341 	} else if ((wc & ~0x3ffffff) == 0) {
342 		lead = 0xf8;
343 		len = 5;
344 	} else if ((wc & ~0x7fffffff) == 0) {
345 		lead = 0xfc;
346 		len = 6;
347 #endif
348 	} else {
349 		errno = EILSEQ;
350 		return ((size_t)-1);
351 	}
352 
353 	/*
354 	 * Output the octets representing the character in chunks
355 	 * of 6 bits, least significant last. The first octet is
356 	 * a special case because it contains the sequence length
357 	 * information.
358 	 */
359 	for (i = len - 1; i > 0; i--) {
360 		s[i] = (wc & 0x3f) | 0x80;
361 		wc >>= 6;
362 	}
363 	*s = (wc & 0xff) | lead;
364 
365 	return (len);
366 }
367 
368 static size_t
369 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
370     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
371 {
372 	_UTF8State *us;
373 	char buf[MB_LEN_MAX];
374 	const wchar_t *s;
375 	size_t nbytes;
376 	size_t nb;
377 
378 	us = (_UTF8State *)ps;
379 
380 	if (us->want != 0) {
381 		errno = EINVAL;
382 		return ((size_t)-1);
383 	}
384 
385 	s = *src;
386 	nbytes = 0;
387 
388 	if (dst == NULL) {
389 		while (nwc-- > 0) {
390 			if (0 <= *s && *s < 0x80)
391 				/* Fast path for plain ASCII characters. */
392 				nb = 1;
393 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
394 			    (size_t)-1)
395 				/* Invalid character - wcrtomb() sets errno. */
396 				return ((size_t)-1);
397 			if (*s == L'\0')
398 				return (nbytes + nb - 1);
399 			s++;
400 			nbytes += nb;
401 		}
402 		return (nbytes);
403 	}
404 
405 	while (len > 0 && nwc-- > 0) {
406 		if (0 <= *s && *s < 0x80) {
407 			/* Fast path for plain ASCII characters. */
408 			nb = 1;
409 			*dst = *s;
410 		} else if (len > (size_t)MB_CUR_MAX) {
411 			/* Enough space to translate in-place. */
412 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
413 				*src = s;
414 				return ((size_t)-1);
415 			}
416 		} else {
417 			/*
418 			 * May not be enough space; use temp. buffer.
419 			 */
420 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
421 				*src = s;
422 				return ((size_t)-1);
423 			}
424 			if (nb > (int)len)
425 				/* MB sequence for character won't fit. */
426 				break;
427 			(void) memcpy(dst, buf, nb);
428 		}
429 		if (*s == L'\0') {
430 			*src = NULL;
431 			return (nbytes + nb - 1);
432 		}
433 		s++;
434 		dst += nb;
435 		len -= nb;
436 		nbytes += nb;
437 	}
438 	*src = s;
439 	return (nbytes);
440 }
441