1 /*- 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 2002-2004 Tim J. Robbins 5 * All rights reserved. 6 * 7 * Copyright (c) 2011 The FreeBSD Foundation 8 * All rights reserved. 9 * Portions of this software were developed by David Chisnall 10 * under sponsorship from the FreeBSD Foundation. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 */ 33 34 #include <sys/param.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <errno.h> 38 #include <limits.h> 39 #include <runetype.h> 40 #include <stdlib.h> 41 #include <string.h> 42 #include <wchar.h> 43 #include "mblocal.h" 44 45 extern int __mb_sb_limit; 46 47 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 48 size_t, mbstate_t * __restrict); 49 static int _UTF8_mbsinit(const mbstate_t *); 50 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 51 const char ** __restrict, size_t, size_t, 52 mbstate_t * __restrict); 53 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 54 mbstate_t * __restrict); 55 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 56 size_t, size_t, mbstate_t * __restrict); 57 58 typedef struct { 59 wchar_t ch; 60 int want; 61 wchar_t lbound; 62 } _UTF8State; 63 64 int 65 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 66 { 67 68 l->__mbrtowc = _UTF8_mbrtowc; 69 l->__wcrtomb = _UTF8_wcrtomb; 70 l->__mbsinit = _UTF8_mbsinit; 71 l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 72 l->__wcsnrtombs = _UTF8_wcsnrtombs; 73 l->runes = rl; 74 l->__mb_cur_max = 4; 75 /* 76 * UCS-4 encoding used as the internal representation, so 77 * slots 0x0080-0x00FF are occuped and must be excluded 78 * from the single byte ctype by setting the limit. 79 */ 80 l->__mb_sb_limit = 128; 81 82 return (0); 83 } 84 85 static int 86 _UTF8_mbsinit(const mbstate_t *ps) 87 { 88 89 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 90 } 91 92 static size_t 93 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 94 mbstate_t * __restrict ps) 95 { 96 _UTF8State *us; 97 int ch, i, mask, want; 98 wchar_t lbound, wch; 99 100 us = (_UTF8State *)ps; 101 102 if (us->want < 0 || us->want > 6) { 103 errno = EINVAL; 104 return ((size_t)-1); 105 } 106 107 if (s == NULL) { 108 s = ""; 109 n = 1; 110 pwc = NULL; 111 } 112 113 if (n == 0) 114 /* Incomplete multibyte sequence */ 115 return ((size_t)-2); 116 117 if (us->want == 0) { 118 /* 119 * Determine the number of octets that make up this character 120 * from the first octet, and a mask that extracts the 121 * interesting bits of the first octet. We already know 122 * the character is at least two bytes long. 123 * 124 * We also specify a lower bound for the character code to 125 * detect redundant, non-"shortest form" encodings. For 126 * example, the sequence C0 80 is _not_ a legal representation 127 * of the null character. This enforces a 1-to-1 mapping 128 * between character codes and their multibyte representations. 129 */ 130 ch = (unsigned char)*s; 131 if ((ch & 0x80) == 0) { 132 /* Fast path for plain ASCII characters. */ 133 if (pwc != NULL) 134 *pwc = ch; 135 return (ch != '\0' ? 1 : 0); 136 } 137 if ((ch & 0xe0) == 0xc0) { 138 mask = 0x1f; 139 want = 2; 140 lbound = 0x80; 141 } else if ((ch & 0xf0) == 0xe0) { 142 mask = 0x0f; 143 want = 3; 144 lbound = 0x800; 145 } else if ((ch & 0xf8) == 0xf0) { 146 mask = 0x07; 147 want = 4; 148 lbound = 0x10000; 149 } else { 150 /* 151 * Malformed input; input is not UTF-8. 152 */ 153 errno = EILSEQ; 154 return ((size_t)-1); 155 } 156 } else { 157 want = us->want; 158 lbound = us->lbound; 159 } 160 161 /* 162 * Decode the octet sequence representing the character in chunks 163 * of 6 bits, most significant first. 164 */ 165 if (us->want == 0) 166 wch = (unsigned char)*s++ & mask; 167 else 168 wch = us->ch; 169 170 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 171 if ((*s & 0xc0) != 0x80) { 172 /* 173 * Malformed input; bad characters in the middle 174 * of a character. 175 */ 176 errno = EILSEQ; 177 return ((size_t)-1); 178 } 179 wch <<= 6; 180 wch |= *s++ & 0x3f; 181 } 182 if (i < want) { 183 /* Incomplete multibyte sequence. */ 184 us->want = want - i; 185 us->lbound = lbound; 186 us->ch = wch; 187 return ((size_t)-2); 188 } 189 if (wch < lbound) { 190 /* 191 * Malformed input; redundant encoding. 192 */ 193 errno = EILSEQ; 194 return ((size_t)-1); 195 } 196 if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) { 197 /* 198 * Malformed input; invalid code points. 199 */ 200 errno = EILSEQ; 201 return ((size_t)-1); 202 } 203 if (pwc != NULL) 204 *pwc = wch; 205 us->want = 0; 206 return (wch == L'\0' ? 0 : want); 207 } 208 209 static size_t 210 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 211 size_t nms, size_t len, mbstate_t * __restrict ps) 212 { 213 _UTF8State *us; 214 const char *s; 215 size_t nchr; 216 wchar_t wc; 217 size_t nb; 218 219 us = (_UTF8State *)ps; 220 221 s = *src; 222 nchr = 0; 223 224 if (dst == NULL) { 225 /* 226 * The fast path in the loop below is not safe if an ASCII 227 * character appears as anything but the first byte of a 228 * multibyte sequence. Check now to avoid doing it in the loop. 229 */ 230 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 231 errno = EILSEQ; 232 return ((size_t)-1); 233 } 234 for (;;) { 235 if (nms > 0 && (signed char)*s > 0) 236 /* 237 * Fast path for plain ASCII characters 238 * excluding NUL. 239 */ 240 nb = 1; 241 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 242 (size_t)-1) 243 /* Invalid sequence - mbrtowc() sets errno. */ 244 return ((size_t)-1); 245 else if (nb == 0 || nb == (size_t)-2) 246 return (nchr); 247 s += nb; 248 nms -= nb; 249 nchr++; 250 } 251 /*NOTREACHED*/ 252 } 253 254 /* 255 * The fast path in the loop below is not safe if an ASCII 256 * character appears as anything but the first byte of a 257 * multibyte sequence. Check now to avoid doing it in the loop. 258 */ 259 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 260 errno = EILSEQ; 261 return ((size_t)-1); 262 } 263 while (len-- > 0) { 264 if (nms > 0 && (signed char)*s > 0) { 265 /* 266 * Fast path for plain ASCII characters 267 * excluding NUL. 268 */ 269 *dst = (wchar_t)*s; 270 nb = 1; 271 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 272 (size_t)-1) { 273 *src = s; 274 return ((size_t)-1); 275 } else if (nb == (size_t)-2) { 276 *src = s + nms; 277 return (nchr); 278 } else if (nb == 0) { 279 *src = NULL; 280 return (nchr); 281 } 282 s += nb; 283 nms -= nb; 284 nchr++; 285 dst++; 286 } 287 *src = s; 288 return (nchr); 289 } 290 291 static size_t 292 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 293 { 294 _UTF8State *us; 295 unsigned char lead; 296 int i, len; 297 298 us = (_UTF8State *)ps; 299 300 if (us->want != 0) { 301 errno = EINVAL; 302 return ((size_t)-1); 303 } 304 305 if (s == NULL) 306 /* Reset to initial shift state (no-op) */ 307 return (1); 308 309 /* 310 * Determine the number of octets needed to represent this character. 311 * We always output the shortest sequence possible. Also specify the 312 * first few bits of the first octet, which contains the information 313 * about the sequence length. 314 */ 315 if ((wc & ~0x7f) == 0) { 316 /* Fast path for plain ASCII characters. */ 317 *s = (char)wc; 318 return (1); 319 } else if ((wc & ~0x7ff) == 0) { 320 lead = 0xc0; 321 len = 2; 322 } else if ((wc & ~0xffff) == 0) { 323 if (wc >= 0xd800 && wc <= 0xdfff) { 324 errno = EILSEQ; 325 return ((size_t)-1); 326 } 327 lead = 0xe0; 328 len = 3; 329 } else if (wc >= 0 && wc <= 0x10ffff) { 330 lead = 0xf0; 331 len = 4; 332 } else { 333 errno = EILSEQ; 334 return ((size_t)-1); 335 } 336 337 /* 338 * Output the octets representing the character in chunks 339 * of 6 bits, least significant last. The first octet is 340 * a special case because it contains the sequence length 341 * information. 342 */ 343 for (i = len - 1; i > 0; i--) { 344 s[i] = (wc & 0x3f) | 0x80; 345 wc >>= 6; 346 } 347 *s = (wc & 0xff) | lead; 348 349 return (len); 350 } 351 352 static size_t 353 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 354 size_t nwc, size_t len, mbstate_t * __restrict ps) 355 { 356 _UTF8State *us; 357 char buf[MB_LEN_MAX]; 358 const wchar_t *s; 359 size_t nbytes; 360 size_t nb; 361 362 us = (_UTF8State *)ps; 363 364 if (us->want != 0) { 365 errno = EINVAL; 366 return ((size_t)-1); 367 } 368 369 s = *src; 370 nbytes = 0; 371 372 if (dst == NULL) { 373 while (nwc-- > 0) { 374 if (0 <= *s && *s < 0x80) 375 /* Fast path for plain ASCII characters. */ 376 nb = 1; 377 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 378 (size_t)-1) 379 /* Invalid character - wcrtomb() sets errno. */ 380 return ((size_t)-1); 381 if (*s == L'\0') 382 return (nbytes + nb - 1); 383 s++; 384 nbytes += nb; 385 } 386 return (nbytes); 387 } 388 389 while (len > 0 && nwc-- > 0) { 390 if (0 <= *s && *s < 0x80) { 391 /* Fast path for plain ASCII characters. */ 392 nb = 1; 393 *dst = *s; 394 } else if (len > (size_t)MB_CUR_MAX) { 395 /* Enough space to translate in-place. */ 396 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 397 *src = s; 398 return ((size_t)-1); 399 } 400 } else { 401 /* 402 * May not be enough space; use temp. buffer. 403 */ 404 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 405 *src = s; 406 return ((size_t)-1); 407 } 408 if (nb > (int)len) 409 /* MB sequence for character won't fit. */ 410 break; 411 memcpy(dst, buf, nb); 412 } 413 if (*s == L'\0') { 414 *src = NULL; 415 return (nbytes + nb - 1); 416 } 417 s++; 418 dst += nb; 419 len -= nb; 420 nbytes += nb; 421 } 422 *src = s; 423 return (nbytes); 424 } 425