1 /*- 2 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 2002-2004 Tim J. Robbins 4 * All rights reserved. 5 * 6 * Copyright (c) 2011 The FreeBSD Foundation 7 * All rights reserved. 8 * Portions of this software were developed by David Chisnall 9 * under sponsorship from the FreeBSD Foundation. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #include <sys/param.h> 34 __FBSDID("$FreeBSD$"); 35 36 #include <errno.h> 37 #include <limits.h> 38 #include <runetype.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <wchar.h> 42 #include "mblocal.h" 43 44 extern int __mb_sb_limit; 45 46 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 47 size_t, mbstate_t * __restrict); 48 static int _UTF8_mbsinit(const mbstate_t *); 49 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 50 const char ** __restrict, size_t, size_t, 51 mbstate_t * __restrict); 52 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 53 mbstate_t * __restrict); 54 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 55 size_t, size_t, mbstate_t * __restrict); 56 57 typedef struct { 58 wchar_t ch; 59 int want; 60 wchar_t lbound; 61 } _UTF8State; 62 63 int 64 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl) 65 { 66 67 l->__mbrtowc = _UTF8_mbrtowc; 68 l->__wcrtomb = _UTF8_wcrtomb; 69 l->__mbsinit = _UTF8_mbsinit; 70 l->__mbsnrtowcs = _UTF8_mbsnrtowcs; 71 l->__wcsnrtombs = _UTF8_wcsnrtombs; 72 l->runes = rl; 73 l->__mb_cur_max = 6; 74 /* 75 * UCS-4 encoding used as the internal representation, so 76 * slots 0x0080-0x00FF are occuped and must be excluded 77 * from the single byte ctype by setting the limit. 78 */ 79 l->__mb_sb_limit = 128; 80 81 return (0); 82 } 83 84 static int 85 _UTF8_mbsinit(const mbstate_t *ps) 86 { 87 88 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 89 } 90 91 static size_t 92 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 93 mbstate_t * __restrict ps) 94 { 95 _UTF8State *us; 96 int ch, i, mask, want; 97 wchar_t lbound, wch; 98 99 us = (_UTF8State *)ps; 100 101 if (us->want < 0 || us->want > 6) { 102 errno = EINVAL; 103 return ((size_t)-1); 104 } 105 106 if (s == NULL) { 107 s = ""; 108 n = 1; 109 pwc = NULL; 110 } 111 112 if (n == 0) 113 /* Incomplete multibyte sequence */ 114 return ((size_t)-2); 115 116 if (us->want == 0) { 117 /* 118 * Determine the number of octets that make up this character 119 * from the first octet, and a mask that extracts the 120 * interesting bits of the first octet. We already know 121 * the character is at least two bytes long. 122 * 123 * We also specify a lower bound for the character code to 124 * detect redundant, non-"shortest form" encodings. For 125 * example, the sequence C0 80 is _not_ a legal representation 126 * of the null character. This enforces a 1-to-1 mapping 127 * between character codes and their multibyte representations. 128 */ 129 ch = (unsigned char)*s; 130 if ((ch & 0x80) == 0) { 131 /* Fast path for plain ASCII characters. */ 132 if (pwc != NULL) 133 *pwc = ch; 134 return (ch != '\0' ? 1 : 0); 135 } 136 if ((ch & 0xe0) == 0xc0) { 137 mask = 0x1f; 138 want = 2; 139 lbound = 0x80; 140 } else if ((ch & 0xf0) == 0xe0) { 141 mask = 0x0f; 142 want = 3; 143 lbound = 0x800; 144 } else if ((ch & 0xf8) == 0xf0) { 145 mask = 0x07; 146 want = 4; 147 lbound = 0x10000; 148 } else if ((ch & 0xfc) == 0xf8) { 149 mask = 0x03; 150 want = 5; 151 lbound = 0x200000; 152 } else if ((ch & 0xfe) == 0xfc) { 153 mask = 0x01; 154 want = 6; 155 lbound = 0x4000000; 156 } else { 157 /* 158 * Malformed input; input is not UTF-8. 159 */ 160 errno = EILSEQ; 161 return ((size_t)-1); 162 } 163 } else { 164 want = us->want; 165 lbound = us->lbound; 166 } 167 168 /* 169 * Decode the octet sequence representing the character in chunks 170 * of 6 bits, most significant first. 171 */ 172 if (us->want == 0) 173 wch = (unsigned char)*s++ & mask; 174 else 175 wch = us->ch; 176 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 177 if ((*s & 0xc0) != 0x80) { 178 /* 179 * Malformed input; bad characters in the middle 180 * of a character. 181 */ 182 errno = EILSEQ; 183 return ((size_t)-1); 184 } 185 wch <<= 6; 186 wch |= *s++ & 0x3f; 187 } 188 if (i < want) { 189 /* Incomplete multibyte sequence. */ 190 us->want = want - i; 191 us->lbound = lbound; 192 us->ch = wch; 193 return ((size_t)-2); 194 } 195 if (wch < lbound) { 196 /* 197 * Malformed input; redundant encoding. 198 */ 199 errno = EILSEQ; 200 return ((size_t)-1); 201 } 202 if (wch >= 0xd800 && wch <= 0xdfff) { 203 /* 204 * Malformed input; invalid code points. 205 */ 206 errno = EILSEQ; 207 return ((size_t)-1); 208 } 209 if (pwc != NULL) 210 *pwc = wch; 211 us->want = 0; 212 return (wch == L'\0' ? 0 : want); 213 } 214 215 static size_t 216 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 217 size_t nms, size_t len, mbstate_t * __restrict ps) 218 { 219 _UTF8State *us; 220 const char *s; 221 size_t nchr; 222 wchar_t wc; 223 size_t nb; 224 225 us = (_UTF8State *)ps; 226 227 s = *src; 228 nchr = 0; 229 230 if (dst == NULL) { 231 /* 232 * The fast path in the loop below is not safe if an ASCII 233 * character appears as anything but the first byte of a 234 * multibyte sequence. Check now to avoid doing it in the loop. 235 */ 236 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 237 errno = EILSEQ; 238 return ((size_t)-1); 239 } 240 for (;;) { 241 if (nms > 0 && (signed char)*s > 0) 242 /* 243 * Fast path for plain ASCII characters 244 * excluding NUL. 245 */ 246 nb = 1; 247 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 248 (size_t)-1) 249 /* Invalid sequence - mbrtowc() sets errno. */ 250 return ((size_t)-1); 251 else if (nb == 0 || nb == (size_t)-2) 252 return (nchr); 253 s += nb; 254 nms -= nb; 255 nchr++; 256 } 257 /*NOTREACHED*/ 258 } 259 260 /* 261 * The fast path in the loop below is not safe if an ASCII 262 * character appears as anything but the first byte of a 263 * multibyte sequence. Check now to avoid doing it in the loop. 264 */ 265 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 266 errno = EILSEQ; 267 return ((size_t)-1); 268 } 269 while (len-- > 0) { 270 if (nms > 0 && (signed char)*s > 0) { 271 /* 272 * Fast path for plain ASCII characters 273 * excluding NUL. 274 */ 275 *dst = (wchar_t)*s; 276 nb = 1; 277 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 278 (size_t)-1) { 279 *src = s; 280 return ((size_t)-1); 281 } else if (nb == (size_t)-2) { 282 *src = s + nms; 283 return (nchr); 284 } else if (nb == 0) { 285 *src = NULL; 286 return (nchr); 287 } 288 s += nb; 289 nms -= nb; 290 nchr++; 291 dst++; 292 } 293 *src = s; 294 return (nchr); 295 } 296 297 static size_t 298 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 299 { 300 _UTF8State *us; 301 unsigned char lead; 302 int i, len; 303 304 us = (_UTF8State *)ps; 305 306 if (us->want != 0) { 307 errno = EINVAL; 308 return ((size_t)-1); 309 } 310 311 if (s == NULL) 312 /* Reset to initial shift state (no-op) */ 313 return (1); 314 315 /* 316 * Determine the number of octets needed to represent this character. 317 * We always output the shortest sequence possible. Also specify the 318 * first few bits of the first octet, which contains the information 319 * about the sequence length. 320 */ 321 if ((wc & ~0x7f) == 0) { 322 /* Fast path for plain ASCII characters. */ 323 *s = (char)wc; 324 return (1); 325 } else if ((wc & ~0x7ff) == 0) { 326 lead = 0xc0; 327 len = 2; 328 } else if ((wc & ~0xffff) == 0) { 329 lead = 0xe0; 330 len = 3; 331 } else if ((wc & ~0x1fffff) == 0) { 332 lead = 0xf0; 333 len = 4; 334 } else if ((wc & ~0x3ffffff) == 0) { 335 lead = 0xf8; 336 len = 5; 337 } else if ((wc & ~0x7fffffff) == 0) { 338 lead = 0xfc; 339 len = 6; 340 } else { 341 errno = EILSEQ; 342 return ((size_t)-1); 343 } 344 345 /* 346 * Output the octets representing the character in chunks 347 * of 6 bits, least significant last. The first octet is 348 * a special case because it contains the sequence length 349 * information. 350 */ 351 for (i = len - 1; i > 0; i--) { 352 s[i] = (wc & 0x3f) | 0x80; 353 wc >>= 6; 354 } 355 *s = (wc & 0xff) | lead; 356 357 return (len); 358 } 359 360 static size_t 361 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 362 size_t nwc, size_t len, mbstate_t * __restrict ps) 363 { 364 _UTF8State *us; 365 char buf[MB_LEN_MAX]; 366 const wchar_t *s; 367 size_t nbytes; 368 size_t nb; 369 370 us = (_UTF8State *)ps; 371 372 if (us->want != 0) { 373 errno = EINVAL; 374 return ((size_t)-1); 375 } 376 377 s = *src; 378 nbytes = 0; 379 380 if (dst == NULL) { 381 while (nwc-- > 0) { 382 if (0 <= *s && *s < 0x80) 383 /* Fast path for plain ASCII characters. */ 384 nb = 1; 385 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 386 (size_t)-1) 387 /* Invalid character - wcrtomb() sets errno. */ 388 return ((size_t)-1); 389 if (*s == L'\0') 390 return (nbytes + nb - 1); 391 s++; 392 nbytes += nb; 393 } 394 return (nbytes); 395 } 396 397 while (len > 0 && nwc-- > 0) { 398 if (0 <= *s && *s < 0x80) { 399 /* Fast path for plain ASCII characters. */ 400 nb = 1; 401 *dst = *s; 402 } else if (len > (size_t)MB_CUR_MAX) { 403 /* Enough space to translate in-place. */ 404 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 405 *src = s; 406 return ((size_t)-1); 407 } 408 } else { 409 /* 410 * May not be enough space; use temp. buffer. 411 */ 412 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 413 *src = s; 414 return ((size_t)-1); 415 } 416 if (nb > (int)len) 417 /* MB sequence for character won't fit. */ 418 break; 419 memcpy(dst, buf, nb); 420 } 421 if (*s == L'\0') { 422 *src = NULL; 423 return (nbytes + nb - 1); 424 } 425 s++; 426 dst += nb; 427 len -= nb; 428 nbytes += nb; 429 } 430 *src = s; 431 return (nbytes); 432 } 433