1 /*- 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/param.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <errno.h> 31 #include <limits.h> 32 #include <runetype.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <wchar.h> 36 #include "mblocal.h" 37 38 static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, 39 size_t, mbstate_t * __restrict); 40 static int _UTF8_mbsinit(const mbstate_t *); 41 static size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, 42 const char ** __restrict, size_t, size_t, 43 mbstate_t * __restrict); 44 static size_t _UTF8_wcrtomb(char * __restrict, wchar_t, 45 mbstate_t * __restrict); 46 static size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 47 size_t, size_t, mbstate_t * __restrict); 48 49 typedef struct { 50 wchar_t ch; 51 int want; 52 wchar_t lbound; 53 } _UTF8State; 54 55 int 56 _UTF8_init(_RuneLocale *rl) 57 { 58 59 __mbrtowc = _UTF8_mbrtowc; 60 __wcrtomb = _UTF8_wcrtomb; 61 __mbsinit = _UTF8_mbsinit; 62 __mbsnrtowcs = _UTF8_mbsnrtowcs; 63 __wcsnrtombs = _UTF8_wcsnrtombs; 64 _CurrentRuneLocale = rl; 65 __mb_cur_max = 6; 66 67 return (0); 68 } 69 70 static int 71 _UTF8_mbsinit(const mbstate_t *ps) 72 { 73 74 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 75 } 76 77 static size_t 78 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 79 mbstate_t * __restrict ps) 80 { 81 _UTF8State *us; 82 int ch, i, mask, want; 83 wchar_t lbound, wch; 84 85 us = (_UTF8State *)ps; 86 87 if (us->want < 0 || us->want > 6) { 88 errno = EINVAL; 89 return ((size_t)-1); 90 } 91 92 if (s == NULL) { 93 s = ""; 94 n = 1; 95 pwc = NULL; 96 } 97 98 if (n == 0) 99 /* Incomplete multibyte sequence */ 100 return ((size_t)-2); 101 102 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 103 /* Fast path for plain ASCII characters. */ 104 if (pwc != NULL) 105 *pwc = ch; 106 return (ch != '\0' ? 1 : 0); 107 } 108 109 if (us->want == 0) { 110 /* 111 * Determine the number of octets that make up this character 112 * from the first octet, and a mask that extracts the 113 * interesting bits of the first octet. We already know 114 * the character is at least two bytes long. 115 * 116 * We also specify a lower bound for the character code to 117 * detect redundant, non-"shortest form" encodings. For 118 * example, the sequence C0 80 is _not_ a legal representation 119 * of the null character. This enforces a 1-to-1 mapping 120 * between character codes and their multibyte representations. 121 */ 122 ch = (unsigned char)*s; 123 if ((ch & 0x80) == 0) { 124 mask = 0x7f; 125 want = 1; 126 lbound = 0; 127 } else if ((ch & 0xe0) == 0xc0) { 128 mask = 0x1f; 129 want = 2; 130 lbound = 0x80; 131 } else if ((ch & 0xf0) == 0xe0) { 132 mask = 0x0f; 133 want = 3; 134 lbound = 0x800; 135 } else if ((ch & 0xf8) == 0xf0) { 136 mask = 0x07; 137 want = 4; 138 lbound = 0x10000; 139 } else if ((ch & 0xfc) == 0xf8) { 140 mask = 0x03; 141 want = 5; 142 lbound = 0x200000; 143 } else if ((ch & 0xfe) == 0xfc) { 144 mask = 0x01; 145 want = 6; 146 lbound = 0x4000000; 147 } else { 148 /* 149 * Malformed input; input is not UTF-8. 150 */ 151 errno = EILSEQ; 152 return ((size_t)-1); 153 } 154 } else { 155 want = us->want; 156 lbound = us->lbound; 157 } 158 159 /* 160 * Decode the octet sequence representing the character in chunks 161 * of 6 bits, most significant first. 162 */ 163 if (us->want == 0) 164 wch = (unsigned char)*s++ & mask; 165 else 166 wch = us->ch; 167 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 168 if ((*s & 0xc0) != 0x80) { 169 /* 170 * Malformed input; bad characters in the middle 171 * of a character. 172 */ 173 errno = EILSEQ; 174 return ((size_t)-1); 175 } 176 wch <<= 6; 177 wch |= *s++ & 0x3f; 178 } 179 if (i < want) { 180 /* Incomplete multibyte sequence. */ 181 us->want = want - i; 182 us->lbound = lbound; 183 us->ch = wch; 184 return ((size_t)-2); 185 } 186 if (wch < lbound) { 187 /* 188 * Malformed input; redundant encoding. 189 */ 190 errno = EILSEQ; 191 return ((size_t)-1); 192 } 193 if (pwc != NULL) 194 *pwc = wch; 195 us->want = 0; 196 return (wch == L'\0' ? 0 : want); 197 } 198 199 static size_t 200 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 201 size_t nms, size_t len, mbstate_t * __restrict ps) 202 { 203 _UTF8State *us; 204 const char *s; 205 size_t nchr; 206 wchar_t wc; 207 size_t nb; 208 209 us = (_UTF8State *)ps; 210 211 s = *src; 212 nchr = 0; 213 214 if (dst == NULL) { 215 /* 216 * The fast path in the loop below is not safe if an ASCII 217 * character appears as anything but the first byte of a 218 * multibyte sequence. Check now to avoid doing it in the loop. 219 */ 220 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 221 errno = EILSEQ; 222 return ((size_t)-1); 223 } 224 for (;;) { 225 if (nms > 0 && (signed char)*s > 0) 226 /* 227 * Fast path for plain ASCII characters 228 * excluding NUL. 229 */ 230 nb = 1; 231 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 232 (size_t)-1) 233 /* Invalid sequence - mbrtowc() sets errno. */ 234 return ((size_t)-1); 235 else if (nb == 0 || nb == (size_t)-2) 236 return (nchr); 237 s += nb; 238 nms -= nb; 239 nchr++; 240 } 241 /*NOTREACHED*/ 242 } 243 244 /* 245 * The fast path in the loop below is not safe if an ASCII 246 * character appears as anything but the first byte of a 247 * multibyte sequence. Check now to avoid doing it in the loop. 248 */ 249 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 250 errno = EILSEQ; 251 return ((size_t)-1); 252 } 253 while (len-- > 0) { 254 if (nms > 0 && (signed char)*s > 0) { 255 /* 256 * Fast path for plain ASCII characters 257 * excluding NUL. 258 */ 259 *dst = (wchar_t)*s; 260 nb = 1; 261 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 262 (size_t)-1) { 263 *src = s; 264 return ((size_t)-1); 265 } else if (nb == (size_t)-2) { 266 *src = s + nms; 267 return (nchr); 268 } else if (nb == 0) { 269 *src = NULL; 270 return (nchr); 271 } 272 s += nb; 273 nms -= nb; 274 nchr++; 275 dst++; 276 } 277 *src = s; 278 return (nchr); 279 } 280 281 static size_t 282 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 283 { 284 _UTF8State *us; 285 unsigned char lead; 286 int i, len; 287 288 us = (_UTF8State *)ps; 289 290 if (us->want != 0) { 291 errno = EINVAL; 292 return ((size_t)-1); 293 } 294 295 if (s == NULL) 296 /* Reset to initial shift state (no-op) */ 297 return (1); 298 299 if ((wc & ~0x7f) == 0) { 300 /* Fast path for plain ASCII characters. */ 301 *s = (char)wc; 302 return (1); 303 } 304 305 /* 306 * Determine the number of octets needed to represent this character. 307 * We always output the shortest sequence possible. Also specify the 308 * first few bits of the first octet, which contains the information 309 * about the sequence length. 310 */ 311 if ((wc & ~0x7f) == 0) { 312 lead = 0; 313 len = 1; 314 } else if ((wc & ~0x7ff) == 0) { 315 lead = 0xc0; 316 len = 2; 317 } else if ((wc & ~0xffff) == 0) { 318 lead = 0xe0; 319 len = 3; 320 } else if ((wc & ~0x1fffff) == 0) { 321 lead = 0xf0; 322 len = 4; 323 } else if ((wc & ~0x3ffffff) == 0) { 324 lead = 0xf8; 325 len = 5; 326 } else if ((wc & ~0x7fffffff) == 0) { 327 lead = 0xfc; 328 len = 6; 329 } else { 330 errno = EILSEQ; 331 return ((size_t)-1); 332 } 333 334 /* 335 * Output the octets representing the character in chunks 336 * of 6 bits, least significant last. The first octet is 337 * a special case because it contains the sequence length 338 * information. 339 */ 340 for (i = len - 1; i > 0; i--) { 341 s[i] = (wc & 0x3f) | 0x80; 342 wc >>= 6; 343 } 344 *s = (wc & 0xff) | lead; 345 346 return (len); 347 } 348 349 static size_t 350 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 351 size_t nwc, size_t len, mbstate_t * __restrict ps) 352 { 353 _UTF8State *us; 354 char buf[MB_LEN_MAX]; 355 const wchar_t *s; 356 size_t nbytes; 357 size_t nb; 358 359 us = (_UTF8State *)ps; 360 361 if (us->want != 0) { 362 errno = EINVAL; 363 return ((size_t)-1); 364 } 365 366 s = *src; 367 nbytes = 0; 368 369 if (dst == NULL) { 370 while (nwc-- > 0) { 371 if (0 <= *s && *s < 0x80) 372 /* Fast path for plain ASCII characters. */ 373 nb = 1; 374 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 375 (size_t)-1) 376 /* Invalid character - wcrtomb() sets errno. */ 377 return ((size_t)-1); 378 if (*s == L'\0') 379 return (nbytes + nb - 1); 380 s++; 381 nbytes += nb; 382 } 383 return (nbytes); 384 } 385 386 while (len > 0 && nwc-- > 0) { 387 if (0 <= *s && *s < 0x80) { 388 /* Fast path for plain ASCII characters. */ 389 nb = 1; 390 *dst = *s; 391 } else if (len > (size_t)MB_CUR_MAX) { 392 /* Enough space to translate in-place. */ 393 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 394 *src = s; 395 return ((size_t)-1); 396 } 397 } else { 398 /* 399 * May not be enough space; use temp. buffer. 400 */ 401 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 402 *src = s; 403 return ((size_t)-1); 404 } 405 if (nb > (int)len) 406 /* MB sequence for character won't fit. */ 407 break; 408 memcpy(dst, buf, nb); 409 } 410 if (*s == L'\0') { 411 *src = NULL; 412 return (nbytes + nb - 1); 413 } 414 s++; 415 dst += nb; 416 len -= nb; 417 nbytes += nb; 418 } 419 *src = s; 420 return (nbytes); 421 } 422