xref: /titanic_44/usr/src/lib/libc/port/locale/utf8.c (revision 2d08521bd15501c8370ba2153b9cca4f094979d0)
/*
 * Copyright 2013 Garrett D'Amore <garrett@damore.org>
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
284297a3b0SGarrett D'Amore 
294297a3b0SGarrett D'Amore #include "lint.h"
304297a3b0SGarrett D'Amore #include <errno.h>
314297a3b0SGarrett D'Amore #include <limits.h>
324297a3b0SGarrett D'Amore #include <stdlib.h>
334297a3b0SGarrett D'Amore #include <string.h>
344297a3b0SGarrett D'Amore #include <wchar.h>
354297a3b0SGarrett D'Amore #include "mblocal.h"
36*2d08521bSGarrett D'Amore #include "lctype.h"
374297a3b0SGarrett D'Amore 
384297a3b0SGarrett D'Amore static size_t	_UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD,
394297a3b0SGarrett D'Amore 		    const char *_RESTRICT_KYWD,
404297a3b0SGarrett D'Amore 		    size_t, mbstate_t *_RESTRICT_KYWD);
414297a3b0SGarrett D'Amore static int	_UTF8_mbsinit(const mbstate_t *);
424297a3b0SGarrett D'Amore static size_t	_UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD,
434297a3b0SGarrett D'Amore 		    const char **_RESTRICT_KYWD, size_t, size_t,
444297a3b0SGarrett D'Amore 		    mbstate_t *_RESTRICT_KYWD);
454297a3b0SGarrett D'Amore static size_t	_UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t,
464297a3b0SGarrett D'Amore 		    mbstate_t *_RESTRICT_KYWD);
474297a3b0SGarrett D'Amore static size_t	_UTF8_wcsnrtombs(char *_RESTRICT_KYWD,
484297a3b0SGarrett D'Amore 		    const wchar_t **_RESTRICT_KYWD,
494297a3b0SGarrett D'Amore 		    size_t, size_t, mbstate_t *_RESTRICT_KYWD);
504297a3b0SGarrett D'Amore 
/*
 * Private view of an mbstate_t used by the UTF-8 routines in this file.
 * The routines cast the caller's mbstate_t pointer to this type.
 */
typedef struct {
	/* Bits of the character decoded so far (valid while want != 0). */
	wchar_t	ch;
	/* Continuation bytes still expected; 0 means the initial state. */
	int	want;
	/* Smallest code point allowed for the sequence being decoded. */
	wchar_t	lbound;
} _UTF8State;
564297a3b0SGarrett D'Amore 
57*2d08521bSGarrett D'Amore void
_UTF8_init(struct lc_ctype * lct)58*2d08521bSGarrett D'Amore _UTF8_init(struct lc_ctype *lct)
594297a3b0SGarrett D'Amore {
60*2d08521bSGarrett D'Amore 	lct->lc_mbrtowc = _UTF8_mbrtowc;
61*2d08521bSGarrett D'Amore 	lct->lc_wcrtomb = _UTF8_wcrtomb;
62*2d08521bSGarrett D'Amore 	lct->lc_mbsinit = _UTF8_mbsinit;
63*2d08521bSGarrett D'Amore 	lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs;
64*2d08521bSGarrett D'Amore 	lct->lc_wcsnrtombs = _UTF8_wcsnrtombs;
65*2d08521bSGarrett D'Amore 	lct->lc_is_ascii = 0;
66*2d08521bSGarrett D'Amore 	lct->lc_max_mblen = 4;
674297a3b0SGarrett D'Amore }
684297a3b0SGarrett D'Amore 
694297a3b0SGarrett D'Amore static int
_UTF8_mbsinit(const mbstate_t * ps)704297a3b0SGarrett D'Amore _UTF8_mbsinit(const mbstate_t *ps)
714297a3b0SGarrett D'Amore {
724297a3b0SGarrett D'Amore 
734297a3b0SGarrett D'Amore 	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
744297a3b0SGarrett D'Amore }
754297a3b0SGarrett D'Amore 
764297a3b0SGarrett D'Amore static size_t
_UTF8_mbrtowc(wchar_t * _RESTRICT_KYWD pwc,const char * _RESTRICT_KYWD s,size_t n,mbstate_t * _RESTRICT_KYWD ps)774297a3b0SGarrett D'Amore _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s,
784297a3b0SGarrett D'Amore     size_t n, mbstate_t *_RESTRICT_KYWD ps)
794297a3b0SGarrett D'Amore {
804297a3b0SGarrett D'Amore 	_UTF8State *us;
814297a3b0SGarrett D'Amore 	int ch, i, mask, want;
824297a3b0SGarrett D'Amore 	wchar_t lbound, wch;
834297a3b0SGarrett D'Amore 
844297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
854297a3b0SGarrett D'Amore 
864297a3b0SGarrett D'Amore 	if (us->want < 0 || us->want > 6) {
874297a3b0SGarrett D'Amore 		errno = EINVAL;
884297a3b0SGarrett D'Amore 		return ((size_t)-1);
894297a3b0SGarrett D'Amore 	}
904297a3b0SGarrett D'Amore 
914297a3b0SGarrett D'Amore 	if (s == NULL) {
924297a3b0SGarrett D'Amore 		s = "";
934297a3b0SGarrett D'Amore 		n = 1;
944297a3b0SGarrett D'Amore 		pwc = NULL;
954297a3b0SGarrett D'Amore 	}
964297a3b0SGarrett D'Amore 
974297a3b0SGarrett D'Amore 	if (n == 0)
984297a3b0SGarrett D'Amore 		/* Incomplete multibyte sequence */
994297a3b0SGarrett D'Amore 		return ((size_t)-2);
1004297a3b0SGarrett D'Amore 
1014297a3b0SGarrett D'Amore 	if (us->want == 0) {
1024297a3b0SGarrett D'Amore 		/*
1034297a3b0SGarrett D'Amore 		 * Determine the number of octets that make up this character
1044297a3b0SGarrett D'Amore 		 * from the first octet, and a mask that extracts the
1054297a3b0SGarrett D'Amore 		 * interesting bits of the first octet. We already know
1064297a3b0SGarrett D'Amore 		 * the character is at least two bytes long.
1074297a3b0SGarrett D'Amore 		 *
1084297a3b0SGarrett D'Amore 		 * We also specify a lower bound for the character code to
1094297a3b0SGarrett D'Amore 		 * detect redundant, non-"shortest form" encodings. For
1104297a3b0SGarrett D'Amore 		 * example, the sequence C0 80 is _not_ a legal representation
1114297a3b0SGarrett D'Amore 		 * of the null character. This enforces a 1-to-1 mapping
1124297a3b0SGarrett D'Amore 		 * between character codes and their multibyte representations.
1134297a3b0SGarrett D'Amore 		 */
1144297a3b0SGarrett D'Amore 		ch = (unsigned char)*s;
1154297a3b0SGarrett D'Amore 		if ((ch & 0x80) == 0) {
116475b496bSGarrett D'Amore 			/* Fast path for plain ASCII characters. */
117475b496bSGarrett D'Amore 			if (pwc != NULL)
118475b496bSGarrett D'Amore 				*pwc = ch;
119475b496bSGarrett D'Amore 			return (ch != '\0' ? 1 : 0);
120475b496bSGarrett D'Amore 		}
121475b496bSGarrett D'Amore 		if ((ch & 0xe0) == 0xc0) {
1224297a3b0SGarrett D'Amore 			mask = 0x1f;
1234297a3b0SGarrett D'Amore 			want = 2;
1244297a3b0SGarrett D'Amore 			lbound = 0x80;
1254297a3b0SGarrett D'Amore 		} else if ((ch & 0xf0) == 0xe0) {
1264297a3b0SGarrett D'Amore 			mask = 0x0f;
1274297a3b0SGarrett D'Amore 			want = 3;
1284297a3b0SGarrett D'Amore 			lbound = 0x800;
1294297a3b0SGarrett D'Amore 		} else if ((ch & 0xf8) == 0xf0) {
1304297a3b0SGarrett D'Amore 			mask = 0x07;
1314297a3b0SGarrett D'Amore 			want = 4;
1324297a3b0SGarrett D'Amore 			lbound = 0x10000;
1334297a3b0SGarrett D'Amore #if 0
1344297a3b0SGarrett D'Amore 		/* These would be illegal in the UTF-8 space */
1354297a3b0SGarrett D'Amore 
1364297a3b0SGarrett D'Amore 		} else if ((ch & 0xfc) == 0xf8) {
1374297a3b0SGarrett D'Amore 			mask = 0x03;
1384297a3b0SGarrett D'Amore 			want = 5;
1394297a3b0SGarrett D'Amore 			lbound = 0x200000;
1404297a3b0SGarrett D'Amore 		} else if ((ch & 0xfe) == 0xfc) {
1414297a3b0SGarrett D'Amore 			mask = 0x01;
1424297a3b0SGarrett D'Amore 			want = 6;
1434297a3b0SGarrett D'Amore 			lbound = 0x4000000;
1444297a3b0SGarrett D'Amore #endif
1454297a3b0SGarrett D'Amore 		} else {
1464297a3b0SGarrett D'Amore 			/*
1474297a3b0SGarrett D'Amore 			 * Malformed input; input is not UTF-8.
1484297a3b0SGarrett D'Amore 			 */
1494297a3b0SGarrett D'Amore 			errno = EILSEQ;
1504297a3b0SGarrett D'Amore 			return ((size_t)-1);
1514297a3b0SGarrett D'Amore 		}
1524297a3b0SGarrett D'Amore 	} else {
1534297a3b0SGarrett D'Amore 		want = us->want;
1544297a3b0SGarrett D'Amore 		lbound = us->lbound;
1554297a3b0SGarrett D'Amore 	}
1564297a3b0SGarrett D'Amore 
1574297a3b0SGarrett D'Amore 	/*
1584297a3b0SGarrett D'Amore 	 * Decode the octet sequence representing the character in chunks
1594297a3b0SGarrett D'Amore 	 * of 6 bits, most significant first.
1604297a3b0SGarrett D'Amore 	 */
1614297a3b0SGarrett D'Amore 	if (us->want == 0)
1624297a3b0SGarrett D'Amore 		wch = (unsigned char)*s++ & mask;
1634297a3b0SGarrett D'Amore 	else
1644297a3b0SGarrett D'Amore 		wch = us->ch;
1654297a3b0SGarrett D'Amore 
1664297a3b0SGarrett D'Amore 	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
1674297a3b0SGarrett D'Amore 		if ((*s & 0xc0) != 0x80) {
1684297a3b0SGarrett D'Amore 			/*
1694297a3b0SGarrett D'Amore 			 * Malformed input; bad characters in the middle
1704297a3b0SGarrett D'Amore 			 * of a character.
1714297a3b0SGarrett D'Amore 			 */
1724297a3b0SGarrett D'Amore 			errno = EILSEQ;
1734297a3b0SGarrett D'Amore 			return ((size_t)-1);
1744297a3b0SGarrett D'Amore 		}
1754297a3b0SGarrett D'Amore 		wch <<= 6;
1764297a3b0SGarrett D'Amore 		wch |= *s++ & 0x3f;
1774297a3b0SGarrett D'Amore 	}
1784297a3b0SGarrett D'Amore 	if (i < want) {
1794297a3b0SGarrett D'Amore 		/* Incomplete multibyte sequence. */
1804297a3b0SGarrett D'Amore 		us->want = want - i;
1814297a3b0SGarrett D'Amore 		us->lbound = lbound;
1824297a3b0SGarrett D'Amore 		us->ch = wch;
1834297a3b0SGarrett D'Amore 		return ((size_t)-2);
1844297a3b0SGarrett D'Amore 	}
1854297a3b0SGarrett D'Amore 	if (wch < lbound) {
1864297a3b0SGarrett D'Amore 		/*
1874297a3b0SGarrett D'Amore 		 * Malformed input; redundant encoding.
1884297a3b0SGarrett D'Amore 		 */
1894297a3b0SGarrett D'Amore 		errno = EILSEQ;
1904297a3b0SGarrett D'Amore 		return ((size_t)-1);
1914297a3b0SGarrett D'Amore 	}
1924297a3b0SGarrett D'Amore 	if (pwc != NULL)
1934297a3b0SGarrett D'Amore 		*pwc = wch;
1944297a3b0SGarrett D'Amore 	us->want = 0;
1954297a3b0SGarrett D'Amore 	return (wch == L'\0' ? 0 : want);
1964297a3b0SGarrett D'Amore }
1974297a3b0SGarrett D'Amore 
1984297a3b0SGarrett D'Amore static size_t
_UTF8_mbsnrtowcs(wchar_t * _RESTRICT_KYWD dst,const char ** _RESTRICT_KYWD src,size_t nms,size_t len,mbstate_t * _RESTRICT_KYWD ps)1994297a3b0SGarrett D'Amore _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src,
2004297a3b0SGarrett D'Amore     size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps)
2014297a3b0SGarrett D'Amore {
2024297a3b0SGarrett D'Amore 	_UTF8State *us;
2034297a3b0SGarrett D'Amore 	const char *s;
2044297a3b0SGarrett D'Amore 	size_t nchr;
2054297a3b0SGarrett D'Amore 	wchar_t wc;
2064297a3b0SGarrett D'Amore 	size_t nb;
2074297a3b0SGarrett D'Amore 
2084297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
2094297a3b0SGarrett D'Amore 
2104297a3b0SGarrett D'Amore 	s = *src;
2114297a3b0SGarrett D'Amore 	nchr = 0;
2124297a3b0SGarrett D'Amore 
2134297a3b0SGarrett D'Amore 	if (dst == NULL) {
2144297a3b0SGarrett D'Amore 		/*
2154297a3b0SGarrett D'Amore 		 * The fast path in the loop below is not safe if an ASCII
2164297a3b0SGarrett D'Amore 		 * character appears as anything but the first byte of a
2174297a3b0SGarrett D'Amore 		 * multibyte sequence. Check now to avoid doing it in the loop.
2184297a3b0SGarrett D'Amore 		 */
2194297a3b0SGarrett D'Amore 		if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
2204297a3b0SGarrett D'Amore 			errno = EILSEQ;
2214297a3b0SGarrett D'Amore 			return ((size_t)-1);
2224297a3b0SGarrett D'Amore 		}
2234297a3b0SGarrett D'Amore 		for (;;) {
2244297a3b0SGarrett D'Amore 			if (nms > 0 && (signed char)*s > 0)
2254297a3b0SGarrett D'Amore 				/*
2264297a3b0SGarrett D'Amore 				 * Fast path for plain ASCII characters
2274297a3b0SGarrett D'Amore 				 * excluding NUL.
2284297a3b0SGarrett D'Amore 				 */
2294297a3b0SGarrett D'Amore 				nb = 1;
2304297a3b0SGarrett D'Amore 			else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
2314297a3b0SGarrett D'Amore 			    (size_t)-1)
2324297a3b0SGarrett D'Amore 				/* Invalid sequence - mbrtowc() sets errno. */
2334297a3b0SGarrett D'Amore 				return ((size_t)-1);
2344297a3b0SGarrett D'Amore 			else if (nb == 0 || nb == (size_t)-2)
2354297a3b0SGarrett D'Amore 				return (nchr);
2364297a3b0SGarrett D'Amore 			s += nb;
2374297a3b0SGarrett D'Amore 			nms -= nb;
2384297a3b0SGarrett D'Amore 			nchr++;
2394297a3b0SGarrett D'Amore 		}
2404297a3b0SGarrett D'Amore 		/*NOTREACHED*/
2414297a3b0SGarrett D'Amore 	}
2424297a3b0SGarrett D'Amore 
2434297a3b0SGarrett D'Amore 	/*
2444297a3b0SGarrett D'Amore 	 * The fast path in the loop below is not safe if an ASCII
2454297a3b0SGarrett D'Amore 	 * character appears as anything but the first byte of a
2464297a3b0SGarrett D'Amore 	 * multibyte sequence. Check now to avoid doing it in the loop.
2474297a3b0SGarrett D'Amore 	 */
2484297a3b0SGarrett D'Amore 	if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
2494297a3b0SGarrett D'Amore 		errno = EILSEQ;
2504297a3b0SGarrett D'Amore 		return ((size_t)-1);
2514297a3b0SGarrett D'Amore 	}
2524297a3b0SGarrett D'Amore 	while (len-- > 0) {
2534297a3b0SGarrett D'Amore 		if (nms > 0 && (signed char)*s > 0) {
2544297a3b0SGarrett D'Amore 			/*
2554297a3b0SGarrett D'Amore 			 * Fast path for plain ASCII characters
2564297a3b0SGarrett D'Amore 			 * excluding NUL.
2574297a3b0SGarrett D'Amore 			 */
2584297a3b0SGarrett D'Amore 			*dst = (wchar_t)*s;
2594297a3b0SGarrett D'Amore 			nb = 1;
2604297a3b0SGarrett D'Amore 		} else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
2614297a3b0SGarrett D'Amore 		    (size_t)-1) {
2624297a3b0SGarrett D'Amore 			*src = s;
2634297a3b0SGarrett D'Amore 			return ((size_t)-1);
2644297a3b0SGarrett D'Amore 		} else if (nb == (size_t)-2) {
2654297a3b0SGarrett D'Amore 			*src = s + nms;
2664297a3b0SGarrett D'Amore 			return (nchr);
2674297a3b0SGarrett D'Amore 		} else if (nb == 0) {
2684297a3b0SGarrett D'Amore 			*src = NULL;
2694297a3b0SGarrett D'Amore 			return (nchr);
2704297a3b0SGarrett D'Amore 		}
2714297a3b0SGarrett D'Amore 		s += nb;
2724297a3b0SGarrett D'Amore 		nms -= nb;
2734297a3b0SGarrett D'Amore 		nchr++;
2744297a3b0SGarrett D'Amore 		dst++;
2754297a3b0SGarrett D'Amore 	}
2764297a3b0SGarrett D'Amore 	*src = s;
2774297a3b0SGarrett D'Amore 	return (nchr);
2784297a3b0SGarrett D'Amore }
2794297a3b0SGarrett D'Amore 
2804297a3b0SGarrett D'Amore static size_t
_UTF8_wcrtomb(char * _RESTRICT_KYWD s,wchar_t wc,mbstate_t * _RESTRICT_KYWD ps)2814297a3b0SGarrett D'Amore _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps)
2824297a3b0SGarrett D'Amore {
2834297a3b0SGarrett D'Amore 	_UTF8State *us;
2844297a3b0SGarrett D'Amore 	unsigned char lead;
2854297a3b0SGarrett D'Amore 	int i, len;
2864297a3b0SGarrett D'Amore 
2874297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
2884297a3b0SGarrett D'Amore 
2894297a3b0SGarrett D'Amore 	if (us->want != 0) {
2904297a3b0SGarrett D'Amore 		errno = EINVAL;
2914297a3b0SGarrett D'Amore 		return ((size_t)-1);
2924297a3b0SGarrett D'Amore 	}
2934297a3b0SGarrett D'Amore 
2944297a3b0SGarrett D'Amore 	if (s == NULL)
2954297a3b0SGarrett D'Amore 		/* Reset to initial shift state (no-op) */
2964297a3b0SGarrett D'Amore 		return (1);
2974297a3b0SGarrett D'Amore 
2984297a3b0SGarrett D'Amore 	/*
2994297a3b0SGarrett D'Amore 	 * Determine the number of octets needed to represent this character.
3004297a3b0SGarrett D'Amore 	 * We always output the shortest sequence possible. Also specify the
3014297a3b0SGarrett D'Amore 	 * first few bits of the first octet, which contains the information
3024297a3b0SGarrett D'Amore 	 * about the sequence length.
3034297a3b0SGarrett D'Amore 	 */
3044297a3b0SGarrett D'Amore 	if ((wc & ~0x7f) == 0) {
305475b496bSGarrett D'Amore 		/* Fast path for plain ASCII characters. */
306475b496bSGarrett D'Amore 		*s = (char)wc;
307475b496bSGarrett D'Amore 		return (1);
3084297a3b0SGarrett D'Amore 	} else if ((wc & ~0x7ff) == 0) {
3094297a3b0SGarrett D'Amore 		lead = 0xc0;
3104297a3b0SGarrett D'Amore 		len = 2;
3114297a3b0SGarrett D'Amore 	} else if ((wc & ~0xffff) == 0) {
3124297a3b0SGarrett D'Amore 		lead = 0xe0;
3134297a3b0SGarrett D'Amore 		len = 3;
3144297a3b0SGarrett D'Amore 	} else if ((wc & ~0x1fffff) == 0) {
3154297a3b0SGarrett D'Amore 		lead = 0xf0;
3164297a3b0SGarrett D'Amore 		len = 4;
3174297a3b0SGarrett D'Amore #if 0
3184297a3b0SGarrett D'Amore 	/* Again, 5 and 6 byte encodings are simply not permitted */
3194297a3b0SGarrett D'Amore 	} else if ((wc & ~0x3ffffff) == 0) {
3204297a3b0SGarrett D'Amore 		lead = 0xf8;
3214297a3b0SGarrett D'Amore 		len = 5;
3224297a3b0SGarrett D'Amore 	} else if ((wc & ~0x7fffffff) == 0) {
3234297a3b0SGarrett D'Amore 		lead = 0xfc;
3244297a3b0SGarrett D'Amore 		len = 6;
3254297a3b0SGarrett D'Amore #endif
3264297a3b0SGarrett D'Amore 	} else {
3274297a3b0SGarrett D'Amore 		errno = EILSEQ;
3284297a3b0SGarrett D'Amore 		return ((size_t)-1);
3294297a3b0SGarrett D'Amore 	}
3304297a3b0SGarrett D'Amore 
3314297a3b0SGarrett D'Amore 	/*
3324297a3b0SGarrett D'Amore 	 * Output the octets representing the character in chunks
3334297a3b0SGarrett D'Amore 	 * of 6 bits, least significant last. The first octet is
3344297a3b0SGarrett D'Amore 	 * a special case because it contains the sequence length
3354297a3b0SGarrett D'Amore 	 * information.
3364297a3b0SGarrett D'Amore 	 */
3374297a3b0SGarrett D'Amore 	for (i = len - 1; i > 0; i--) {
3384297a3b0SGarrett D'Amore 		s[i] = (wc & 0x3f) | 0x80;
3394297a3b0SGarrett D'Amore 		wc >>= 6;
3404297a3b0SGarrett D'Amore 	}
3414297a3b0SGarrett D'Amore 	*s = (wc & 0xff) | lead;
3424297a3b0SGarrett D'Amore 
3434297a3b0SGarrett D'Amore 	return (len);
3444297a3b0SGarrett D'Amore }
3454297a3b0SGarrett D'Amore 
3464297a3b0SGarrett D'Amore static size_t
_UTF8_wcsnrtombs(char * _RESTRICT_KYWD dst,const wchar_t ** _RESTRICT_KYWD src,size_t nwc,size_t len,mbstate_t * _RESTRICT_KYWD ps)3474297a3b0SGarrett D'Amore _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src,
3484297a3b0SGarrett D'Amore     size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps)
3494297a3b0SGarrett D'Amore {
3504297a3b0SGarrett D'Amore 	_UTF8State *us;
3514297a3b0SGarrett D'Amore 	char buf[MB_LEN_MAX];
3524297a3b0SGarrett D'Amore 	const wchar_t *s;
3534297a3b0SGarrett D'Amore 	size_t nbytes;
3544297a3b0SGarrett D'Amore 	size_t nb;
3554297a3b0SGarrett D'Amore 
3564297a3b0SGarrett D'Amore 	us = (_UTF8State *)ps;
3574297a3b0SGarrett D'Amore 
3584297a3b0SGarrett D'Amore 	if (us->want != 0) {
3594297a3b0SGarrett D'Amore 		errno = EINVAL;
3604297a3b0SGarrett D'Amore 		return ((size_t)-1);
3614297a3b0SGarrett D'Amore 	}
3624297a3b0SGarrett D'Amore 
3634297a3b0SGarrett D'Amore 	s = *src;
3644297a3b0SGarrett D'Amore 	nbytes = 0;
3654297a3b0SGarrett D'Amore 
3664297a3b0SGarrett D'Amore 	if (dst == NULL) {
3674297a3b0SGarrett D'Amore 		while (nwc-- > 0) {
3684297a3b0SGarrett D'Amore 			if (0 <= *s && *s < 0x80)
3694297a3b0SGarrett D'Amore 				/* Fast path for plain ASCII characters. */
3704297a3b0SGarrett D'Amore 				nb = 1;
3714297a3b0SGarrett D'Amore 			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
3724297a3b0SGarrett D'Amore 			    (size_t)-1)
3734297a3b0SGarrett D'Amore 				/* Invalid character - wcrtomb() sets errno. */
3744297a3b0SGarrett D'Amore 				return ((size_t)-1);
3754297a3b0SGarrett D'Amore 			if (*s == L'\0')
3764297a3b0SGarrett D'Amore 				return (nbytes + nb - 1);
3774297a3b0SGarrett D'Amore 			s++;
3784297a3b0SGarrett D'Amore 			nbytes += nb;
3794297a3b0SGarrett D'Amore 		}
3804297a3b0SGarrett D'Amore 		return (nbytes);
3814297a3b0SGarrett D'Amore 	}
3824297a3b0SGarrett D'Amore 
3834297a3b0SGarrett D'Amore 	while (len > 0 && nwc-- > 0) {
3844297a3b0SGarrett D'Amore 		if (0 <= *s && *s < 0x80) {
3854297a3b0SGarrett D'Amore 			/* Fast path for plain ASCII characters. */
3864297a3b0SGarrett D'Amore 			nb = 1;
3874297a3b0SGarrett D'Amore 			*dst = *s;
3884297a3b0SGarrett D'Amore 		} else if (len > (size_t)MB_CUR_MAX) {
3894297a3b0SGarrett D'Amore 			/* Enough space to translate in-place. */
3904297a3b0SGarrett D'Amore 			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
3914297a3b0SGarrett D'Amore 				*src = s;
3924297a3b0SGarrett D'Amore 				return ((size_t)-1);
3934297a3b0SGarrett D'Amore 			}
3944297a3b0SGarrett D'Amore 		} else {
3954297a3b0SGarrett D'Amore 			/*
3964297a3b0SGarrett D'Amore 			 * May not be enough space; use temp. buffer.
3974297a3b0SGarrett D'Amore 			 */
3984297a3b0SGarrett D'Amore 			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
3994297a3b0SGarrett D'Amore 				*src = s;
4004297a3b0SGarrett D'Amore 				return ((size_t)-1);
4014297a3b0SGarrett D'Amore 			}
4024297a3b0SGarrett D'Amore 			if (nb > (int)len)
4034297a3b0SGarrett D'Amore 				/* MB sequence for character won't fit. */
4044297a3b0SGarrett D'Amore 				break;
4054297a3b0SGarrett D'Amore 			(void) memcpy(dst, buf, nb);
4064297a3b0SGarrett D'Amore 		}
4074297a3b0SGarrett D'Amore 		if (*s == L'\0') {
4084297a3b0SGarrett D'Amore 			*src = NULL;
4094297a3b0SGarrett D'Amore 			return (nbytes + nb - 1);
4104297a3b0SGarrett D'Amore 		}
4114297a3b0SGarrett D'Amore 		s++;
4124297a3b0SGarrett D'Amore 		dst += nb;
4134297a3b0SGarrett D'Amore 		len -= nb;
4144297a3b0SGarrett D'Amore 		nbytes += nb;
4154297a3b0SGarrett D'Amore 	}
4164297a3b0SGarrett D'Amore 	*src = s;
4174297a3b0SGarrett D'Amore 	return (nbytes);
4184297a3b0SGarrett D'Amore }
419