xref: /illumos-gate/usr/src/lib/libc/port/locale/utf8.c (revision 1356956d0b091aa705b3e6330c7224417baa3a89)
1 /*
2  * Copyright (c) 2002-2004 Tim J. Robbins
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
28  * Use is subject to license terms.
29  */
30 
31 #include "lint.h"
32 #include <errno.h>
33 #include <limits.h>
34 #include "runetype.h"
35 #include <stdlib.h>
36 #include <string.h>
37 #include <wchar.h>
38 #include "mblocal.h"
39 
40 static size_t	_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
41 		    const char *_RESTRICT_KYWD,
42 		    size_t, mbstate_t *_RESTRICT_KYWD);
43 static int	_UTF8_mbsinit(const mbstate_t *);
44 static size_t	_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
45 		    const char **_RESTRICT_KYWD, size_t, size_t,
46 		    mbstate_t *_RESTRICT_KYWD);
47 static size_t	_UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
48 		    mbstate_t *_RESTRICT_KYWD);
49 static size_t	_UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
50 		    const wchar_t **_RESTRICT_KYWD,
51 		    size_t, size_t, mbstate_t *_RESTRICT_KYWD);
52 
/*
 * Conversion state carried between calls; the routines in this file access
 * it by casting the caller's mbstate_t pointer to this type.
 */
typedef struct {
	/* Bits of the partially decoded character gathered so far. */
	wchar_t	ch;
	/* Bytes still needed to finish the character; 0 means initial state. */
	int	want;
	/* Smallest value the finished character may have (overlong check). */
	wchar_t	lbound;
} _UTF8State;
58 
59 int
60 _UTF8_init(_RuneLocale *rl)
61 {
62 	__mbrtowc = _UTF8_mbrtowc;
63 	__wcrtomb = _UTF8_wcrtomb;
64 	__mbsinit = _UTF8_mbsinit;
65 	__mbsnrtowcs = _UTF8_mbsnrtowcs;
66 	__wcsnrtombs = _UTF8_wcsnrtombs;
67 	_CurrentRuneLocale = rl;
68 
69 	charset_is_ascii = 0;
70 
71 	/*
72 	 * In theory up to 6 bytes can be used for the encoding,
73 	 * but only encodings with more than 4 bytes are illegal.
74 	 */
75 	__ctype[520] = 4;
76 	/*
77 	 * Note that the other CSWIDTH members are nonsensical for this
78 	 * this coding.  They only are valid with EUC codings.
79 	 */
80 
81 	return (0);
82 }
83 
84 static int
85 _UTF8_mbsinit(const mbstate_t *ps)
86 {
87 
88 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
89 }
90 
91 static size_t
92 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
93     size_t n, mbstate_t *_RESTRICT_KYWD ps)
94 {
95 	_UTF8State *us;
96 	int ch, i, mask, want;
97 	wchar_t lbound, wch;
98 
99 	us = (_UTF8State *)ps;
100 
101 	if (us->want < 0 || us->want > 6) {
102 		errno = EINVAL;
103 		return ((size_t)-1);
104 	}
105 
106 	if (s == NULL) {
107 		s = "";
108 		n = 1;
109 		pwc = NULL;
110 	}
111 
112 	if (n == 0)
113 		/* Incomplete multibyte sequence */
114 		return ((size_t)-2);
115 
116 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
117 		/* Fast path for plain ASCII characters. */
118 		if (pwc != NULL)
119 			*pwc = ch;
120 		return (ch != '\0' ? 1 : 0);
121 	}
122 
123 	if (us->want == 0) {
124 		/*
125 		 * Determine the number of octets that make up this character
126 		 * from the first octet, and a mask that extracts the
127 		 * interesting bits of the first octet. We already know
128 		 * the character is at least two bytes long.
129 		 *
130 		 * We also specify a lower bound for the character code to
131 		 * detect redundant, non-"shortest form" encodings. For
132 		 * example, the sequence C0 80 is _not_ a legal representation
133 		 * of the null character. This enforces a 1-to-1 mapping
134 		 * between character codes and their multibyte representations.
135 		 */
136 		ch = (unsigned char)*s;
137 		if ((ch & 0x80) == 0) {
138 			mask = 0x7f;
139 			want = 1;
140 			lbound = 0;
141 		} else if ((ch & 0xe0) == 0xc0) {
142 			mask = 0x1f;
143 			want = 2;
144 			lbound = 0x80;
145 		} else if ((ch & 0xf0) == 0xe0) {
146 			mask = 0x0f;
147 			want = 3;
148 			lbound = 0x800;
149 		} else if ((ch & 0xf8) == 0xf0) {
150 			mask = 0x07;
151 			want = 4;
152 			lbound = 0x10000;
153 #if 0
154 		/* These would be illegal in the UTF-8 space */
155 
156 		} else if ((ch & 0xfc) == 0xf8) {
157 			mask = 0x03;
158 			want = 5;
159 			lbound = 0x200000;
160 		} else if ((ch & 0xfe) == 0xfc) {
161 			mask = 0x01;
162 			want = 6;
163 			lbound = 0x4000000;
164 #endif
165 		} else {
166 			/*
167 			 * Malformed input; input is not UTF-8.
168 			 */
169 			errno = EILSEQ;
170 			return ((size_t)-1);
171 		}
172 	} else {
173 		want = us->want;
174 		lbound = us->lbound;
175 	}
176 
177 	/*
178 	 * Decode the octet sequence representing the character in chunks
179 	 * of 6 bits, most significant first.
180 	 */
181 	if (us->want == 0)
182 		wch = (unsigned char)*s++ & mask;
183 	else
184 		wch = us->ch;
185 
186 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
187 		if ((*s & 0xc0) != 0x80) {
188 			/*
189 			 * Malformed input; bad characters in the middle
190 			 * of a character.
191 			 */
192 			errno = EILSEQ;
193 			return ((size_t)-1);
194 		}
195 		wch <<= 6;
196 		wch |= *s++ & 0x3f;
197 	}
198 	if (i < want) {
199 		/* Incomplete multibyte sequence. */
200 		us->want = want - i;
201 		us->lbound = lbound;
202 		us->ch = wch;
203 		return ((size_t)-2);
204 	}
205 	if (wch < lbound) {
206 		/*
207 		 * Malformed input; redundant encoding.
208 		 */
209 		errno = EILSEQ;
210 		return ((size_t)-1);
211 	}
212 	if (pwc != NULL)
213 		*pwc = wch;
214 	us->want = 0;
215 	return (wch == L'\0' ? 0 : want);
216 }
217 
218 static size_t
219 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
220     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
221 {
222 	_UTF8State *us;
223 	const char *s;
224 	size_t nchr;
225 	wchar_t wc;
226 	size_t nb;
227 
228 	us = (_UTF8State *)ps;
229 
230 	s = *src;
231 	nchr = 0;
232 
233 	if (dst == NULL) {
234 		/*
235 		 * The fast path in the loop below is not safe if an ASCII
236 		 * character appears as anything but the first byte of a
237 		 * multibyte sequence. Check now to avoid doing it in the loop.
238 		 */
239 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
240 			errno = EILSEQ;
241 			return ((size_t)-1);
242 		}
243 		for (;;) {
244 			if (nms > 0 && (signed char)*s > 0)
245 				/*
246 				 * Fast path for plain ASCII characters
247 				 * excluding NUL.
248 				 */
249 				nb = 1;
250 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
251 			    (size_t)-1)
252 				/* Invalid sequence - mbrtowc() sets errno. */
253 				return ((size_t)-1);
254 			else if (nb == 0 || nb == (size_t)-2)
255 				return (nchr);
256 			s += nb;
257 			nms -= nb;
258 			nchr++;
259 		}
260 		/*NOTREACHED*/
261 	}
262 
263 	/*
264 	 * The fast path in the loop below is not safe if an ASCII
265 	 * character appears as anything but the first byte of a
266 	 * multibyte sequence. Check now to avoid doing it in the loop.
267 	 */
268 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
269 		errno = EILSEQ;
270 		return ((size_t)-1);
271 	}
272 	while (len-- > 0) {
273 		if (nms > 0 && (signed char)*s > 0) {
274 			/*
275 			 * Fast path for plain ASCII characters
276 			 * excluding NUL.
277 			 */
278 			*dst = (wchar_t)*s;
279 			nb = 1;
280 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
281 		    (size_t)-1) {
282 			*src = s;
283 			return ((size_t)-1);
284 		} else if (nb == (size_t)-2) {
285 			*src = s + nms;
286 			return (nchr);
287 		} else if (nb == 0) {
288 			*src = NULL;
289 			return (nchr);
290 		}
291 		s += nb;
292 		nms -= nb;
293 		nchr++;
294 		dst++;
295 	}
296 	*src = s;
297 	return (nchr);
298 }
299 
300 static size_t
301 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
302 {
303 	_UTF8State *us;
304 	unsigned char lead;
305 	int i, len;
306 
307 	us = (_UTF8State *)ps;
308 
309 	if (us->want != 0) {
310 		errno = EINVAL;
311 		return ((size_t)-1);
312 	}
313 
314 	if (s == NULL)
315 		/* Reset to initial shift state (no-op) */
316 		return (1);
317 
318 	if ((wc & ~0x7f) == 0) {
319 		/* Fast path for plain ASCII characters. */
320 		*s = (char)wc;
321 		return (1);
322 	}
323 
324 	/*
325 	 * Determine the number of octets needed to represent this character.
326 	 * We always output the shortest sequence possible. Also specify the
327 	 * first few bits of the first octet, which contains the information
328 	 * about the sequence length.
329 	 */
330 	if ((wc & ~0x7f) == 0) {
331 		lead = 0;
332 		len = 1;
333 	} else if ((wc & ~0x7ff) == 0) {
334 		lead = 0xc0;
335 		len = 2;
336 	} else if ((wc & ~0xffff) == 0) {
337 		lead = 0xe0;
338 		len = 3;
339 	} else if ((wc & ~0x1fffff) == 0) {
340 		lead = 0xf0;
341 		len = 4;
342 #if 0
343 	/* Again, 5 and 6 byte encodings are simply not permitted */
344 	} else if ((wc & ~0x3ffffff) == 0) {
345 		lead = 0xf8;
346 		len = 5;
347 	} else if ((wc & ~0x7fffffff) == 0) {
348 		lead = 0xfc;
349 		len = 6;
350 #endif
351 	} else {
352 		errno = EILSEQ;
353 		return ((size_t)-1);
354 	}
355 
356 	/*
357 	 * Output the octets representing the character in chunks
358 	 * of 6 bits, least significant last. The first octet is
359 	 * a special case because it contains the sequence length
360 	 * information.
361 	 */
362 	for (i = len - 1; i > 0; i--) {
363 		s[i] = (wc & 0x3f) | 0x80;
364 		wc >>= 6;
365 	}
366 	*s = (wc & 0xff) | lead;
367 
368 	return (len);
369 }
370 
371 static size_t
372 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
373     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
374 {
375 	_UTF8State *us;
376 	char buf[MB_LEN_MAX];
377 	const wchar_t *s;
378 	size_t nbytes;
379 	size_t nb;
380 
381 	us = (_UTF8State *)ps;
382 
383 	if (us->want != 0) {
384 		errno = EINVAL;
385 		return ((size_t)-1);
386 	}
387 
388 	s = *src;
389 	nbytes = 0;
390 
391 	if (dst == NULL) {
392 		while (nwc-- > 0) {
393 			if (0 <= *s && *s < 0x80)
394 				/* Fast path for plain ASCII characters. */
395 				nb = 1;
396 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
397 			    (size_t)-1)
398 				/* Invalid character - wcrtomb() sets errno. */
399 				return ((size_t)-1);
400 			if (*s == L'\0')
401 				return (nbytes + nb - 1);
402 			s++;
403 			nbytes += nb;
404 		}
405 		return (nbytes);
406 	}
407 
408 	while (len > 0 && nwc-- > 0) {
409 		if (0 <= *s && *s < 0x80) {
410 			/* Fast path for plain ASCII characters. */
411 			nb = 1;
412 			*dst = *s;
413 		} else if (len > (size_t)MB_CUR_MAX) {
414 			/* Enough space to translate in-place. */
415 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
416 				*src = s;
417 				return ((size_t)-1);
418 			}
419 		} else {
420 			/*
421 			 * May not be enough space; use temp. buffer.
422 			 */
423 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
424 				*src = s;
425 				return ((size_t)-1);
426 			}
427 			if (nb > (int)len)
428 				/* MB sequence for character won't fit. */
429 				break;
430 			(void) memcpy(dst, buf, nb);
431 		}
432 		if (*s == L'\0') {
433 			*src = NULL;
434 			return (nbytes + nb - 1);
435 		}
436 		s++;
437 		dst += nb;
438 		len -= nb;
439 		nbytes += nb;
440 	}
441 	*src = s;
442 	return (nbytes);
443 }
444