1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 5 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 6 * Copyright (c) 2002-2004 Tim J. Robbins 7 * All rights reserved. 8 * 9 * Copyright (c) 2011 The FreeBSD Foundation 10 * All rights reserved. 11 * Portions of this software were developed by David Chisnall 12 * under sponsorship from the FreeBSD Foundation. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include <sys/param.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include <errno.h> 40 #include <limits.h> 41 #include <runetype.h> 42 #include <stdlib.h> 43 #include <string.h> 44 #include <wchar.h> 45 #include "mblocal.h" 46 47 extern int __mb_sb_limit; 48 49 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 50 size_t, mbstate_t * __restrict); 51 static int _UTF8_mbsinit(const mbstate_t *); 52 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 53 const char ** __restrict, size_t, size_t, 54 mbstate_t * __restrict); 55 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 56 mbstate_t * __restrict); 57 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 58 size_t, size_t, mbstate_t * __restrict); 59 60 typedef struct { 61 wchar_t ch; 62 int want; 63 wchar_t lbound; 64 } _UTF8State; 65 66 int 67 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 68 { 69 70 l->__mbrtowc = _UTF8_mbrtowc; 71 l->__wcrtomb = _UTF8_wcrtomb; 72 l->__mbsinit = _UTF8_mbsinit; 73 l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 74 l->__wcsnrtombs = _UTF8_wcsnrtombs; 75 l->runes = rl; 76 l->__mb_cur_max = 4; 77 /* 78 * UCS-4 encoding used as the internal representation, so 79 * slots 0x0080-0x00FF are occuped and must be excluded 80 * from the single byte ctype by setting the limit. 81 */ 82 l->__mb_sb_limit = 128; 83 84 return (0); 85 } 86 87 static int 88 _UTF8_mbsinit(const mbstate_t *ps) 89 { 90 91 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 92 } 93 94 static size_t 95 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 96 mbstate_t * __restrict ps) 97 { 98 _UTF8State *us; 99 int ch, i, mask, want; 100 wchar_t lbound, wch; 101 102 us = (_UTF8State *)ps; 103 104 if (us->want < 0 || us->want > 6) { 105 errno = EINVAL; 106 return ((size_t)-1); 107 } 108 109 if (s == NULL) { 110 s = ""; 111 n = 1; 112 pwc = NULL; 113 } 114 115 if (n == 0) 116 /* Incomplete multibyte sequence */ 117 return ((size_t)-2); 118 119 if (us->want == 0) { 120 /* 121 * Determine the number of octets that make up this character 122 * from the first octet, and a mask that extracts the 123 * interesting bits of the first octet. We already know 124 * the character is at least two bytes long. 125 * 126 * We also specify a lower bound for the character code to 127 * detect redundant, non-"shortest form" encodings. For 128 * example, the sequence C0 80 is _not_ a legal representation 129 * of the null character. This enforces a 1-to-1 mapping 130 * between character codes and their multibyte representations. 131 */ 132 ch = (unsigned char)*s; 133 if ((ch & 0x80) == 0) { 134 /* Fast path for plain ASCII characters. */ 135 if (pwc != NULL) 136 *pwc = ch; 137 return (ch != '\0' ? 1 : 0); 138 } 139 if ((ch & 0xe0) == 0xc0) { 140 mask = 0x1f; 141 want = 2; 142 lbound = 0x80; 143 } else if ((ch & 0xf0) == 0xe0) { 144 mask = 0x0f; 145 want = 3; 146 lbound = 0x800; 147 } else if ((ch & 0xf8) == 0xf0) { 148 mask = 0x07; 149 want = 4; 150 lbound = 0x10000; 151 } else { 152 /* 153 * Malformed input; input is not UTF-8. 154 */ 155 errno = EILSEQ; 156 return ((size_t)-1); 157 } 158 } else { 159 want = us->want; 160 lbound = us->lbound; 161 } 162 163 /* 164 * Decode the octet sequence representing the character in chunks 165 * of 6 bits, most significant first. 166 */ 167 if (us->want == 0) 168 wch = (unsigned char)*s++ & mask; 169 else 170 wch = us->ch; 171 172 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 173 if ((*s & 0xc0) != 0x80) { 174 /* 175 * Malformed input; bad characters in the middle 176 * of a character. 177 */ 178 errno = EILSEQ; 179 return ((size_t)-1); 180 } 181 wch <<= 6; 182 wch |= *s++ & 0x3f; 183 } 184 if (i < want) { 185 /* Incomplete multibyte sequence. */ 186 us->want = want - i; 187 us->lbound = lbound; 188 us->ch = wch; 189 return ((size_t)-2); 190 } 191 if (wch < lbound) { 192 /* 193 * Malformed input; redundant encoding. 194 */ 195 errno = EILSEQ; 196 return ((size_t)-1); 197 } 198 if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) { 199 /* 200 * Malformed input; invalid code points. 201 */ 202 errno = EILSEQ; 203 return ((size_t)-1); 204 } 205 if (pwc != NULL) 206 *pwc = wch; 207 us->want = 0; 208 return (wch == L'\0' ? 0 : want); 209 } 210 211 static size_t 212 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 213 size_t nms, size_t len, mbstate_t * __restrict ps) 214 { 215 _UTF8State *us; 216 const char *s; 217 size_t nchr; 218 wchar_t wc; 219 size_t nb; 220 221 us = (_UTF8State *)ps; 222 223 s = *src; 224 nchr = 0; 225 226 if (dst == NULL) { 227 /* 228 * The fast path in the loop below is not safe if an ASCII 229 * character appears as anything but the first byte of a 230 * multibyte sequence. Check now to avoid doing it in the loop. 231 */ 232 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 233 errno = EILSEQ; 234 return ((size_t)-1); 235 } 236 for (;;) { 237 if (nms > 0 && (signed char)*s > 0) 238 /* 239 * Fast path for plain ASCII characters 240 * excluding NUL. 241 */ 242 nb = 1; 243 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 244 (size_t)-1) 245 /* Invalid sequence - mbrtowc() sets errno. */ 246 return ((size_t)-1); 247 else if (nb == 0 || nb == (size_t)-2) 248 return (nchr); 249 s += nb; 250 nms -= nb; 251 nchr++; 252 } 253 /*NOTREACHED*/ 254 } 255 256 /* 257 * The fast path in the loop below is not safe if an ASCII 258 * character appears as anything but the first byte of a 259 * multibyte sequence. Check now to avoid doing it in the loop. 260 */ 261 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 262 errno = EILSEQ; 263 return ((size_t)-1); 264 } 265 while (len-- > 0) { 266 if (nms > 0 && (signed char)*s > 0) { 267 /* 268 * Fast path for plain ASCII characters 269 * excluding NUL. 270 */ 271 *dst = (wchar_t)*s; 272 nb = 1; 273 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 274 (size_t)-1) { 275 *src = s; 276 return ((size_t)-1); 277 } else if (nb == (size_t)-2) { 278 *src = s + nms; 279 return (nchr); 280 } else if (nb == 0) { 281 *src = NULL; 282 return (nchr); 283 } 284 s += nb; 285 nms -= nb; 286 nchr++; 287 dst++; 288 } 289 *src = s; 290 return (nchr); 291 } 292 293 static size_t 294 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 295 { 296 _UTF8State *us; 297 unsigned char lead; 298 int i, len; 299 300 us = (_UTF8State *)ps; 301 302 if (us->want != 0) { 303 errno = EINVAL; 304 return ((size_t)-1); 305 } 306 307 if (s == NULL) 308 /* Reset to initial shift state (no-op) */ 309 return (1); 310 311 /* 312 * Determine the number of octets needed to represent this character. 313 * We always output the shortest sequence possible. Also specify the 314 * first few bits of the first octet, which contains the information 315 * about the sequence length. 316 */ 317 if ((wc & ~0x7f) == 0) { 318 /* Fast path for plain ASCII characters. */ 319 *s = (char)wc; 320 return (1); 321 } else if ((wc & ~0x7ff) == 0) { 322 lead = 0xc0; 323 len = 2; 324 } else if ((wc & ~0xffff) == 0) { 325 if (wc >= 0xd800 && wc <= 0xdfff) { 326 errno = EILSEQ; 327 return ((size_t)-1); 328 } 329 lead = 0xe0; 330 len = 3; 331 } else if (wc >= 0 && wc <= 0x10ffff) { 332 lead = 0xf0; 333 len = 4; 334 } else { 335 errno = EILSEQ; 336 return ((size_t)-1); 337 } 338 339 /* 340 * Output the octets representing the character in chunks 341 * of 6 bits, least significant last. The first octet is 342 * a special case because it contains the sequence length 343 * information. 344 */ 345 for (i = len - 1; i > 0; i--) { 346 s[i] = (wc & 0x3f) | 0x80; 347 wc >>= 6; 348 } 349 *s = (wc & 0xff) | lead; 350 351 return (len); 352 } 353 354 static size_t 355 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 356 size_t nwc, size_t len, mbstate_t * __restrict ps) 357 { 358 _UTF8State *us; 359 char buf[MB_LEN_MAX]; 360 const wchar_t *s; 361 size_t nbytes; 362 size_t nb; 363 364 us = (_UTF8State *)ps; 365 366 if (us->want != 0) { 367 errno = EINVAL; 368 return ((size_t)-1); 369 } 370 371 s = *src; 372 nbytes = 0; 373 374 if (dst == NULL) { 375 while (nwc-- > 0) { 376 if (0 <= *s && *s < 0x80) 377 /* Fast path for plain ASCII characters. */ 378 nb = 1; 379 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 380 (size_t)-1) 381 /* Invalid character - wcrtomb() sets errno. */ 382 return ((size_t)-1); 383 if (*s == L'\0') 384 return (nbytes + nb - 1); 385 s++; 386 nbytes += nb; 387 } 388 return (nbytes); 389 } 390 391 while (len > 0 && nwc-- > 0) { 392 if (0 <= *s && *s < 0x80) { 393 /* Fast path for plain ASCII characters. */ 394 nb = 1; 395 *dst = *s; 396 } else if (len > (size_t)MB_CUR_MAX) { 397 /* Enough space to translate in-place. */ 398 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 399 *src = s; 400 return ((size_t)-1); 401 } 402 } else { 403 /* 404 * May not be enough space; use temp. buffer. 405 */ 406 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 407 *src = s; 408 return ((size_t)-1); 409 } 410 if (nb > (int)len) 411 /* MB sequence for character won't fit. */ 412 break; 413 memcpy(dst, buf, nb); 414 } 415 if (*s == L'\0') { 416 *src = NULL; 417 return (nbytes + nb - 1); 418 } 419 s++; 420 dst += nb; 421 len -= nb; 422 nbytes += nb; 423 } 424 *src = s; 425 return (nbytes); 426 } 427