1 /*- 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/param.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <errno.h> 31 #include <limits.h> 32 #include <runetype.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <wchar.h> 36 #include "mblocal.h" 37 38 extern int __mb_sb_limit; 39 40 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 41 size_t, mbstate_t * __restrict); 42 static int _UTF8_mbsinit(const mbstate_t *); 43 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 44 const char ** __restrict, size_t, size_t, 45 mbstate_t * __restrict); 46 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 47 mbstate_t * __restrict); 48 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 49 size_t, size_t, mbstate_t * __restrict); 50 51 typedef struct { 52 wchar_t ch; 53 int want; 54 wchar_t lbound; 55 } _UTF8State; 56 57 int 58 _UTF8_init(_RuneLocale *rl) 59 { 60 61 __mbrtowc = _UTF8_mbrtowc; 62 __wcrtomb = _UTF8_wcrtomb; 63 __mbsinit = _UTF8_mbsinit; 64 __mbsnrtowcs = _UTF8_mbsnrtowcs; 65 __wcsnrtombs = _UTF8_wcsnrtombs; 66 _CurrentRuneLocale = rl; 67 __mb_cur_max = 6; 68 /* 69 * UCS-4 encoding used as the internal representation, so 70 * slots 0x0080-0x00FF are occuped and must be excluded 71 * from the single byte ctype by setting the limit. 72 */ 73 __mb_sb_limit = 128; 74 75 return (0); 76 } 77 78 static int 79 _UTF8_mbsinit(const mbstate_t *ps) 80 { 81 82 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 83 } 84 85 static size_t 86 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 87 mbstate_t * __restrict ps) 88 { 89 _UTF8State *us; 90 int ch, i, mask, want; 91 wchar_t lbound, wch; 92 93 us = (_UTF8State *)ps; 94 95 if (us->want < 0 || us->want > 6) { 96 errno = EINVAL; 97 return ((size_t)-1); 98 } 99 100 if (s == NULL) { 101 s = ""; 102 n = 1; 103 pwc = NULL; 104 } 105 106 if (n == 0) 107 /* Incomplete multibyte sequence */ 108 return ((size_t)-2); 109 110 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 111 /* Fast path for plain ASCII characters. */ 112 if (pwc != NULL) 113 *pwc = ch; 114 return (ch != '\0' ? 1 : 0); 115 } 116 117 if (us->want == 0) { 118 /* 119 * Determine the number of octets that make up this character 120 * from the first octet, and a mask that extracts the 121 * interesting bits of the first octet. We already know 122 * the character is at least two bytes long. 123 * 124 * We also specify a lower bound for the character code to 125 * detect redundant, non-"shortest form" encodings. For 126 * example, the sequence C0 80 is _not_ a legal representation 127 * of the null character. This enforces a 1-to-1 mapping 128 * between character codes and their multibyte representations. 129 */ 130 ch = (unsigned char)*s; 131 if ((ch & 0x80) == 0) { 132 mask = 0x7f; 133 want = 1; 134 lbound = 0; 135 } else if ((ch & 0xe0) == 0xc0) { 136 mask = 0x1f; 137 want = 2; 138 lbound = 0x80; 139 } else if ((ch & 0xf0) == 0xe0) { 140 mask = 0x0f; 141 want = 3; 142 lbound = 0x800; 143 } else if ((ch & 0xf8) == 0xf0) { 144 mask = 0x07; 145 want = 4; 146 lbound = 0x10000; 147 } else if ((ch & 0xfc) == 0xf8) { 148 mask = 0x03; 149 want = 5; 150 lbound = 0x200000; 151 } else if ((ch & 0xfe) == 0xfc) { 152 mask = 0x01; 153 want = 6; 154 lbound = 0x4000000; 155 } else { 156 /* 157 * Malformed input; input is not UTF-8. 158 */ 159 errno = EILSEQ; 160 return ((size_t)-1); 161 } 162 } else { 163 want = us->want; 164 lbound = us->lbound; 165 } 166 167 /* 168 * Decode the octet sequence representing the character in chunks 169 * of 6 bits, most significant first. 170 */ 171 if (us->want == 0) 172 wch = (unsigned char)*s++ & mask; 173 else 174 wch = us->ch; 175 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 176 if ((*s & 0xc0) != 0x80) { 177 /* 178 * Malformed input; bad characters in the middle 179 * of a character. 180 */ 181 errno = EILSEQ; 182 return ((size_t)-1); 183 } 184 wch <<= 6; 185 wch |= *s++ & 0x3f; 186 } 187 if (i < want) { 188 /* Incomplete multibyte sequence. */ 189 us->want = want - i; 190 us->lbound = lbound; 191 us->ch = wch; 192 return ((size_t)-2); 193 } 194 if (wch < lbound) { 195 /* 196 * Malformed input; redundant encoding. 197 */ 198 errno = EILSEQ; 199 return ((size_t)-1); 200 } 201 if (pwc != NULL) 202 *pwc = wch; 203 us->want = 0; 204 return (wch == L'\0' ? 0 : want); 205 } 206 207 static size_t 208 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 209 size_t nms, size_t len, mbstate_t * __restrict ps) 210 { 211 _UTF8State *us; 212 const char *s; 213 size_t nchr; 214 wchar_t wc; 215 size_t nb; 216 217 us = (_UTF8State *)ps; 218 219 s = *src; 220 nchr = 0; 221 222 if (dst == NULL) { 223 /* 224 * The fast path in the loop below is not safe if an ASCII 225 * character appears as anything but the first byte of a 226 * multibyte sequence. Check now to avoid doing it in the loop. 227 */ 228 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 229 errno = EILSEQ; 230 return ((size_t)-1); 231 } 232 for (;;) { 233 if (nms > 0 && (signed char)*s > 0) 234 /* 235 * Fast path for plain ASCII characters 236 * excluding NUL. 237 */ 238 nb = 1; 239 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 240 (size_t)-1) 241 /* Invalid sequence - mbrtowc() sets errno. */ 242 return ((size_t)-1); 243 else if (nb == 0 || nb == (size_t)-2) 244 return (nchr); 245 s += nb; 246 nms -= nb; 247 nchr++; 248 } 249 /*NOTREACHED*/ 250 } 251 252 /* 253 * The fast path in the loop below is not safe if an ASCII 254 * character appears as anything but the first byte of a 255 * multibyte sequence. Check now to avoid doing it in the loop. 256 */ 257 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 258 errno = EILSEQ; 259 return ((size_t)-1); 260 } 261 while (len-- > 0) { 262 if (nms > 0 && (signed char)*s > 0) { 263 /* 264 * Fast path for plain ASCII characters 265 * excluding NUL. 266 */ 267 *dst = (wchar_t)*s; 268 nb = 1; 269 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 270 (size_t)-1) { 271 *src = s; 272 return ((size_t)-1); 273 } else if (nb == (size_t)-2) { 274 *src = s + nms; 275 return (nchr); 276 } else if (nb == 0) { 277 *src = NULL; 278 return (nchr); 279 } 280 s += nb; 281 nms -= nb; 282 nchr++; 283 dst++; 284 } 285 *src = s; 286 return (nchr); 287 } 288 289 static size_t 290 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 291 { 292 _UTF8State *us; 293 unsigned char lead; 294 int i, len; 295 296 us = (_UTF8State *)ps; 297 298 if (us->want != 0) { 299 errno = EINVAL; 300 return ((size_t)-1); 301 } 302 303 if (s == NULL) 304 /* Reset to initial shift state (no-op) */ 305 return (1); 306 307 if ((wc & ~0x7f) == 0) { 308 /* Fast path for plain ASCII characters. */ 309 *s = (char)wc; 310 return (1); 311 } 312 313 /* 314 * Determine the number of octets needed to represent this character. 315 * We always output the shortest sequence possible. Also specify the 316 * first few bits of the first octet, which contains the information 317 * about the sequence length. 318 */ 319 if ((wc & ~0x7f) == 0) { 320 lead = 0; 321 len = 1; 322 } else if ((wc & ~0x7ff) == 0) { 323 lead = 0xc0; 324 len = 2; 325 } else if ((wc & ~0xffff) == 0) { 326 lead = 0xe0; 327 len = 3; 328 } else if ((wc & ~0x1fffff) == 0) { 329 lead = 0xf0; 330 len = 4; 331 } else if ((wc & ~0x3ffffff) == 0) { 332 lead = 0xf8; 333 len = 5; 334 } else if ((wc & ~0x7fffffff) == 0) { 335 lead = 0xfc; 336 len = 6; 337 } else { 338 errno = EILSEQ; 339 return ((size_t)-1); 340 } 341 342 /* 343 * Output the octets representing the character in chunks 344 * of 6 bits, least significant last. The first octet is 345 * a special case because it contains the sequence length 346 * information. 347 */ 348 for (i = len - 1; i > 0; i--) { 349 s[i] = (wc & 0x3f) | 0x80; 350 wc >>= 6; 351 } 352 *s = (wc & 0xff) | lead; 353 354 return (len); 355 } 356 357 static size_t 358 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 359 size_t nwc, size_t len, mbstate_t * __restrict ps) 360 { 361 _UTF8State *us; 362 char buf[MB_LEN_MAX]; 363 const wchar_t *s; 364 size_t nbytes; 365 size_t nb; 366 367 us = (_UTF8State *)ps; 368 369 if (us->want != 0) { 370 errno = EINVAL; 371 return ((size_t)-1); 372 } 373 374 s = *src; 375 nbytes = 0; 376 377 if (dst == NULL) { 378 while (nwc-- > 0) { 379 if (0 <= *s && *s < 0x80) 380 /* Fast path for plain ASCII characters. */ 381 nb = 1; 382 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 383 (size_t)-1) 384 /* Invalid character - wcrtomb() sets errno. */ 385 return ((size_t)-1); 386 if (*s == L'\0') 387 return (nbytes + nb - 1); 388 s++; 389 nbytes += nb; 390 } 391 return (nbytes); 392 } 393 394 while (len > 0 && nwc-- > 0) { 395 if (0 <= *s && *s < 0x80) { 396 /* Fast path for plain ASCII characters. */ 397 nb = 1; 398 *dst = *s; 399 } else if (len > (size_t)MB_CUR_MAX) { 400 /* Enough space to translate in-place. */ 401 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 402 *src = s; 403 return ((size_t)-1); 404 } 405 } else { 406 /* 407 * May not be enough space; use temp. buffer. 408 */ 409 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 410 *src = s; 411 return ((size_t)-1); 412 } 413 if (nb > (int)len) 414 /* MB sequence for character won't fit. */ 415 break; 416 memcpy(dst, buf, nb); 417 } 418 if (*s == L'\0') { 419 *src = NULL; 420 return (nbytes + nb - 1); 421 } 422 s++; 423 dst += nb; 424 len -= nb; 425 nbytes += nb; 426 } 427 *src = s; 428 return (nbytes); 429 } 430