1 /*- 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Copyright (c) 2011 The FreeBSD Foundation 6 * All rights reserved. 7 * Portions of this software were developed by David Chisnall 8 * under sponsorship from the FreeBSD Foundation. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/param.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include <errno.h> 36 #include <limits.h> 37 #include <runetype.h> 38 #include <stdlib.h> 39 #include <string.h> 40 #include <wchar.h> 41 #include "mblocal.h" 42 43 extern int __mb_sb_limit; 44 45 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 46 size_t, mbstate_t * __restrict); 47 static int _UTF8_mbsinit(const mbstate_t *); 48 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 49 const char ** __restrict, size_t, size_t, 50 mbstate_t * __restrict); 51 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 52 mbstate_t * __restrict); 53 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 54 size_t, size_t, mbstate_t * __restrict); 55 56 typedef struct { 57 wchar_t ch; 58 int want; 59 wchar_t lbound; 60 } _UTF8State; 61 62 int 63 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 64 { 65 66 l->__mbrtowc = _UTF8_mbrtowc; 67 l->__wcrtomb = _UTF8_wcrtomb; 68 l->__mbsinit = _UTF8_mbsinit; 69 l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 70 l->__wcsnrtombs = _UTF8_wcsnrtombs; 71 l->runes = rl; 72 l->__mb_cur_max = 6; 73 /* 74 * UCS-4 encoding used as the internal representation, so 75 * slots 0x0080-0x00FF are occuped and must be excluded 76 * from the single byte ctype by setting the limit. 77 */ 78 l->__mb_sb_limit = 128; 79 80 return (0); 81 } 82 83 static int 84 _UTF8_mbsinit(const mbstate_t *ps) 85 { 86 87 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 88 } 89 90 static size_t 91 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 92 mbstate_t * __restrict ps) 93 { 94 _UTF8State *us; 95 int ch, i, mask, want; 96 wchar_t lbound, wch; 97 98 us = (_UTF8State *)ps; 99 100 if (us->want < 0 || us->want > 6) { 101 errno = EINVAL; 102 return ((size_t)-1); 103 } 104 105 if (s == NULL) { 106 s = ""; 107 n = 1; 108 pwc = NULL; 109 } 110 111 if (n == 0) 112 /* Incomplete multibyte sequence */ 113 return ((size_t)-2); 114 115 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 116 /* Fast path for plain ASCII characters. */ 117 if (pwc != NULL) 118 *pwc = ch; 119 return (ch != '\0' ? 1 : 0); 120 } 121 122 if (us->want == 0) { 123 /* 124 * Determine the number of octets that make up this character 125 * from the first octet, and a mask that extracts the 126 * interesting bits of the first octet. We already know 127 * the character is at least two bytes long. 128 * 129 * We also specify a lower bound for the character code to 130 * detect redundant, non-"shortest form" encodings. For 131 * example, the sequence C0 80 is _not_ a legal representation 132 * of the null character. This enforces a 1-to-1 mapping 133 * between character codes and their multibyte representations. 134 */ 135 ch = (unsigned char)*s; 136 if ((ch & 0x80) == 0) { 137 mask = 0x7f; 138 want = 1; 139 lbound = 0; 140 } else if ((ch & 0xe0) == 0xc0) { 141 mask = 0x1f; 142 want = 2; 143 lbound = 0x80; 144 } else if ((ch & 0xf0) == 0xe0) { 145 mask = 0x0f; 146 want = 3; 147 lbound = 0x800; 148 } else if ((ch & 0xf8) == 0xf0) { 149 mask = 0x07; 150 want = 4; 151 lbound = 0x10000; 152 } else if ((ch & 0xfc) == 0xf8) { 153 mask = 0x03; 154 want = 5; 155 lbound = 0x200000; 156 } else if ((ch & 0xfe) == 0xfc) { 157 mask = 0x01; 158 want = 6; 159 lbound = 0x4000000; 160 } else { 161 /* 162 * Malformed input; input is not UTF-8. 163 */ 164 errno = EILSEQ; 165 return ((size_t)-1); 166 } 167 } else { 168 want = us->want; 169 lbound = us->lbound; 170 } 171 172 /* 173 * Decode the octet sequence representing the character in chunks 174 * of 6 bits, most significant first. 175 */ 176 if (us->want == 0) 177 wch = (unsigned char)*s++ & mask; 178 else 179 wch = us->ch; 180 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 181 if ((*s & 0xc0) != 0x80) { 182 /* 183 * Malformed input; bad characters in the middle 184 * of a character. 185 */ 186 errno = EILSEQ; 187 return ((size_t)-1); 188 } 189 wch <<= 6; 190 wch |= *s++ & 0x3f; 191 } 192 if (i < want) { 193 /* Incomplete multibyte sequence. */ 194 us->want = want - i; 195 us->lbound = lbound; 196 us->ch = wch; 197 return ((size_t)-2); 198 } 199 if (wch < lbound) { 200 /* 201 * Malformed input; redundant encoding. 202 */ 203 errno = EILSEQ; 204 return ((size_t)-1); 205 } 206 if (wch >= 0xd800 && wch <= 0xdfff) { 207 /* 208 * Malformed input; invalid code points. 209 */ 210 errno = EILSEQ; 211 return ((size_t)-1); 212 } 213 if (pwc != NULL) 214 *pwc = wch; 215 us->want = 0; 216 return (wch == L'\0' ? 0 : want); 217 } 218 219 static size_t 220 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 221 size_t nms, size_t len, mbstate_t * __restrict ps) 222 { 223 _UTF8State *us; 224 const char *s; 225 size_t nchr; 226 wchar_t wc; 227 size_t nb; 228 229 us = (_UTF8State *)ps; 230 231 s = *src; 232 nchr = 0; 233 234 if (dst == NULL) { 235 /* 236 * The fast path in the loop below is not safe if an ASCII 237 * character appears as anything but the first byte of a 238 * multibyte sequence. Check now to avoid doing it in the loop. 239 */ 240 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 241 errno = EILSEQ; 242 return ((size_t)-1); 243 } 244 for (;;) { 245 if (nms > 0 && (signed char)*s > 0) 246 /* 247 * Fast path for plain ASCII characters 248 * excluding NUL. 249 */ 250 nb = 1; 251 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 252 (size_t)-1) 253 /* Invalid sequence - mbrtowc() sets errno. */ 254 return ((size_t)-1); 255 else if (nb == 0 || nb == (size_t)-2) 256 return (nchr); 257 s += nb; 258 nms -= nb; 259 nchr++; 260 } 261 /*NOTREACHED*/ 262 } 263 264 /* 265 * The fast path in the loop below is not safe if an ASCII 266 * character appears as anything but the first byte of a 267 * multibyte sequence. Check now to avoid doing it in the loop. 268 */ 269 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 270 errno = EILSEQ; 271 return ((size_t)-1); 272 } 273 while (len-- > 0) { 274 if (nms > 0 && (signed char)*s > 0) { 275 /* 276 * Fast path for plain ASCII characters 277 * excluding NUL. 278 */ 279 *dst = (wchar_t)*s; 280 nb = 1; 281 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 282 (size_t)-1) { 283 *src = s; 284 return ((size_t)-1); 285 } else if (nb == (size_t)-2) { 286 *src = s + nms; 287 return (nchr); 288 } else if (nb == 0) { 289 *src = NULL; 290 return (nchr); 291 } 292 s += nb; 293 nms -= nb; 294 nchr++; 295 dst++; 296 } 297 *src = s; 298 return (nchr); 299 } 300 301 static size_t 302 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 303 { 304 _UTF8State *us; 305 unsigned char lead; 306 int i, len; 307 308 us = (_UTF8State *)ps; 309 310 if (us->want != 0) { 311 errno = EINVAL; 312 return ((size_t)-1); 313 } 314 315 if (s == NULL) 316 /* Reset to initial shift state (no-op) */ 317 return (1); 318 319 if ((wc & ~0x7f) == 0) { 320 /* Fast path for plain ASCII characters. */ 321 *s = (char)wc; 322 return (1); 323 } 324 325 /* 326 * Determine the number of octets needed to represent this character. 327 * We always output the shortest sequence possible. Also specify the 328 * first few bits of the first octet, which contains the information 329 * about the sequence length. 330 */ 331 if ((wc & ~0x7f) == 0) { 332 lead = 0; 333 len = 1; 334 } else if ((wc & ~0x7ff) == 0) { 335 lead = 0xc0; 336 len = 2; 337 } else if ((wc & ~0xffff) == 0) { 338 lead = 0xe0; 339 len = 3; 340 } else if ((wc & ~0x1fffff) == 0) { 341 lead = 0xf0; 342 len = 4; 343 } else if ((wc & ~0x3ffffff) == 0) { 344 lead = 0xf8; 345 len = 5; 346 } else if ((wc & ~0x7fffffff) == 0) { 347 lead = 0xfc; 348 len = 6; 349 } else { 350 errno = EILSEQ; 351 return ((size_t)-1); 352 } 353 354 /* 355 * Output the octets representing the character in chunks 356 * of 6 bits, least significant last. The first octet is 357 * a special case because it contains the sequence length 358 * information. 359 */ 360 for (i = len - 1; i > 0; i--) { 361 s[i] = (wc & 0x3f) | 0x80; 362 wc >>= 6; 363 } 364 *s = (wc & 0xff) | lead; 365 366 return (len); 367 } 368 369 static size_t 370 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 371 size_t nwc, size_t len, mbstate_t * __restrict ps) 372 { 373 _UTF8State *us; 374 char buf[MB_LEN_MAX]; 375 const wchar_t *s; 376 size_t nbytes; 377 size_t nb; 378 379 us = (_UTF8State *)ps; 380 381 if (us->want != 0) { 382 errno = EINVAL; 383 return ((size_t)-1); 384 } 385 386 s = *src; 387 nbytes = 0; 388 389 if (dst == NULL) { 390 while (nwc-- > 0) { 391 if (0 <= *s && *s < 0x80) 392 /* Fast path for plain ASCII characters. */ 393 nb = 1; 394 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 395 (size_t)-1) 396 /* Invalid character - wcrtomb() sets errno. */ 397 return ((size_t)-1); 398 if (*s == L'\0') 399 return (nbytes + nb - 1); 400 s++; 401 nbytes += nb; 402 } 403 return (nbytes); 404 } 405 406 while (len > 0 && nwc-- > 0) { 407 if (0 <= *s && *s < 0x80) { 408 /* Fast path for plain ASCII characters. */ 409 nb = 1; 410 *dst = *s; 411 } else if (len > (size_t)MB_CUR_MAX) { 412 /* Enough space to translate in-place. */ 413 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 414 *src = s; 415 return ((size_t)-1); 416 } 417 } else { 418 /* 419 * May not be enough space; use temp. buffer. 420 */ 421 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 422 *src = s; 423 return ((size_t)-1); 424 } 425 if (nb > (int)len) 426 /* MB sequence for character won't fit. */ 427 break; 428 memcpy(dst, buf, nb); 429 } 430 if (*s == L'\0') { 431 *src = NULL; 432 return (nbytes + nb - 1); 433 } 434 s++; 435 dst += nb; 436 len -= nb; 437 nbytes += nb; 438 } 439 *src = s; 440 return (nbytes); 441 } 442