1 /*- 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/param.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <errno.h> 31 #include <limits.h> 32 #include <runetype.h> 33 #include <stdlib.h> 34 #include <string.h> 35 #include <wchar.h> 36 #include "mblocal.h" 37 38 size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, 39 mbstate_t * __restrict); 40 int _UTF8_mbsinit(const mbstate_t *); 41 size_t _UTF8_mbsnrtowcs(wchar_t * __restrict, const char ** __restrict, 42 size_t, size_t, mbstate_t * __restrict); 43 size_t _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); 44 size_t _UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, 45 size_t, size_t, mbstate_t * __restrict); 46 47 typedef struct { 48 wchar_t ch; 49 int want; 50 wchar_t lbound; 51 } _UTF8State; 52 53 int 54 _UTF8_init(_RuneLocale *rl) 55 { 56 57 __mbrtowc = _UTF8_mbrtowc; 58 __wcrtomb = _UTF8_wcrtomb; 59 __mbsinit = _UTF8_mbsinit; 60 __mbsnrtowcs = _UTF8_mbsnrtowcs; 61 __wcsnrtombs = _UTF8_wcsnrtombs; 62 _CurrentRuneLocale = rl; 63 __mb_cur_max = 6; 64 65 return (0); 66 } 67 68 int 69 _UTF8_mbsinit(const mbstate_t *ps) 70 { 71 72 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 73 } 74 75 size_t 76 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n, 77 mbstate_t * __restrict ps) 78 { 79 _UTF8State *us; 80 int ch, i, mask, want; 81 wchar_t lbound, wch; 82 83 us = (_UTF8State *)ps; 84 85 if (us->want < 0 || us->want > 6) { 86 errno = EINVAL; 87 return ((size_t)-1); 88 } 89 90 if (s == NULL) { 91 s = ""; 92 n = 1; 93 pwc = NULL; 94 } 95 96 if (n == 0) 97 /* Incomplete multibyte sequence */ 98 return ((size_t)-2); 99 100 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 101 /* Fast path for plain ASCII characters. */ 102 if (pwc != NULL) 103 *pwc = ch; 104 return (ch != '\0' ? 1 : 0); 105 } 106 107 if (us->want == 0) { 108 /* 109 * Determine the number of octets that make up this character 110 * from the first octet, and a mask that extracts the 111 * interesting bits of the first octet. We already know 112 * the character is at least two bytes long. 113 * 114 * We also specify a lower bound for the character code to 115 * detect redundant, non-"shortest form" encodings. For 116 * example, the sequence C0 80 is _not_ a legal representation 117 * of the null character. This enforces a 1-to-1 mapping 118 * between character codes and their multibyte representations. 119 */ 120 ch = (unsigned char)*s; 121 if ((ch & 0x80) == 0) { 122 mask = 0x7f; 123 want = 1; 124 lbound = 0; 125 } else if ((ch & 0xe0) == 0xc0) { 126 mask = 0x1f; 127 want = 2; 128 lbound = 0x80; 129 } else if ((ch & 0xf0) == 0xe0) { 130 mask = 0x0f; 131 want = 3; 132 lbound = 0x800; 133 } else if ((ch & 0xf8) == 0xf0) { 134 mask = 0x07; 135 want = 4; 136 lbound = 0x10000; 137 } else if ((ch & 0xfc) == 0xf8) { 138 mask = 0x03; 139 want = 5; 140 lbound = 0x200000; 141 } else if ((ch & 0xfc) == 0xfc) { 142 mask = 0x01; 143 want = 6; 144 lbound = 0x4000000; 145 } else { 146 /* 147 * Malformed input; input is not UTF-8. 148 */ 149 errno = EILSEQ; 150 return ((size_t)-1); 151 } 152 } else { 153 want = us->want; 154 lbound = us->lbound; 155 } 156 157 /* 158 * Decode the octet sequence representing the character in chunks 159 * of 6 bits, most significant first. 160 */ 161 if (us->want == 0) 162 wch = (unsigned char)*s++ & mask; 163 else 164 wch = us->ch; 165 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 166 if ((*s & 0xc0) != 0x80) { 167 /* 168 * Malformed input; bad characters in the middle 169 * of a character. 170 */ 171 errno = EILSEQ; 172 return ((size_t)-1); 173 } 174 wch <<= 6; 175 wch |= *s++ & 0x3f; 176 } 177 if (i < want) { 178 /* Incomplete multibyte sequence. */ 179 us->want = want - i; 180 us->lbound = lbound; 181 us->ch = wch; 182 return ((size_t)-2); 183 } 184 if (wch < lbound) { 185 /* 186 * Malformed input; redundant encoding. 187 */ 188 errno = EILSEQ; 189 return ((size_t)-1); 190 } 191 if (pwc != NULL) 192 *pwc = wch; 193 us->want = 0; 194 return (wch == L'\0' ? 0 : want); 195 } 196 197 size_t 198 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src, 199 size_t nms, size_t len, mbstate_t * __restrict ps) 200 { 201 _UTF8State *us; 202 const char *s; 203 size_t nchr; 204 wchar_t wc; 205 size_t nb; 206 207 us = (_UTF8State *)ps; 208 209 s = *src; 210 nchr = 0; 211 212 if (dst == NULL) { 213 /* 214 * The fast path in the loop below is not safe if an ASCII 215 * character appears as anything but the first byte of a 216 * multibyte sequence. Check now to avoid doing it in the loop. 217 */ 218 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 219 errno = EILSEQ; 220 return ((size_t)-1); 221 } 222 for (;;) { 223 if (nms > 0 && (signed char)*s > 0) 224 /* 225 * Fast path for plain ASCII characters 226 * excluding NUL. 227 */ 228 nb = 1; 229 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 230 (size_t)-1) 231 /* Invalid sequence - mbrtowc() sets errno. */ 232 return ((size_t)-1); 233 else if (nb == 0 || nb == (size_t)-2) 234 return (nchr); 235 s += nb; 236 nms -= nb; 237 nchr++; 238 } 239 /*NOTREACHED*/ 240 } 241 242 /* 243 * The fast path in the loop below is not safe if an ASCII 244 * character appears as anything but the first byte of a 245 * multibyte sequence. Check now to avoid doing it in the loop. 246 */ 247 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 248 errno = EILSEQ; 249 return ((size_t)-1); 250 } 251 while (len-- > 0) { 252 if (nms > 0 && (signed char)*s > 0) { 253 /* 254 * Fast path for plain ASCII characters 255 * excluding NUL. 256 */ 257 *dst = (wchar_t)*s; 258 nb = 1; 259 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 260 (size_t)-1) { 261 *src = s; 262 return ((size_t)-1); 263 } else if (nb == (size_t)-2) { 264 *src = s + nms; 265 return (nchr); 266 } else if (nb == 0) { 267 *src = NULL; 268 return (nchr); 269 } 270 s += nb; 271 nms -= nb; 272 nchr++; 273 dst++; 274 } 275 *src = s; 276 return (nchr); 277 } 278 279 size_t 280 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps) 281 { 282 _UTF8State *us; 283 unsigned char lead; 284 int i, len; 285 286 us = (_UTF8State *)ps; 287 288 if (us->want != 0) { 289 errno = EINVAL; 290 return ((size_t)-1); 291 } 292 293 if (s == NULL) 294 /* Reset to initial shift state (no-op) */ 295 return (1); 296 297 if ((wc & ~0x7f) == 0) { 298 /* Fast path for plain ASCII characters. */ 299 *s = (char)wc; 300 return (1); 301 } 302 303 /* 304 * Determine the number of octets needed to represent this character. 305 * We always output the shortest sequence possible. Also specify the 306 * first few bits of the first octet, which contains the information 307 * about the sequence length. 308 */ 309 if ((wc & ~0x7f) == 0) { 310 lead = 0; 311 len = 1; 312 } else if ((wc & ~0x7ff) == 0) { 313 lead = 0xc0; 314 len = 2; 315 } else if ((wc & ~0xffff) == 0) { 316 lead = 0xe0; 317 len = 3; 318 } else if ((wc & ~0x1fffff) == 0) { 319 lead = 0xf0; 320 len = 4; 321 } else if ((wc & ~0x3ffffff) == 0) { 322 lead = 0xf8; 323 len = 5; 324 } else if ((wc & ~0x7fffffff) == 0) { 325 lead = 0xfc; 326 len = 6; 327 } else { 328 errno = EILSEQ; 329 return ((size_t)-1); 330 } 331 332 /* 333 * Output the octets representing the character in chunks 334 * of 6 bits, least significant last. The first octet is 335 * a special case because it contains the sequence length 336 * information. 337 */ 338 for (i = len - 1; i > 0; i--) { 339 s[i] = (wc & 0x3f) | 0x80; 340 wc >>= 6; 341 } 342 *s = (wc & 0xff) | lead; 343 344 return (len); 345 } 346 347 size_t 348 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, 349 size_t nwc, size_t len, mbstate_t * __restrict ps) 350 { 351 _UTF8State *us; 352 char buf[MB_LEN_MAX]; 353 const wchar_t *s; 354 size_t nbytes; 355 size_t nb; 356 357 us = (_UTF8State *)ps; 358 359 if (us->want != 0) { 360 errno = EINVAL; 361 return ((size_t)-1); 362 } 363 364 s = *src; 365 nbytes = 0; 366 367 if (dst == NULL) { 368 while (nwc-- > 0) { 369 if (0 <= *s && *s < 0x80) 370 /* Fast path for plain ASCII characters. */ 371 nb = 1; 372 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 373 (size_t)-1) 374 /* Invalid character - wcrtomb() sets errno. */ 375 return ((size_t)-1); 376 if (*s == L'\0') 377 return (nbytes + nb - 1); 378 s++; 379 nbytes += nb; 380 } 381 return (nbytes); 382 } 383 384 while (len > 0 && nwc-- > 0) { 385 if (0 <= *s && *s < 0x80) { 386 /* Fast path for plain ASCII characters. */ 387 nb = 1; 388 *dst = *s; 389 } else if (len > (size_t)MB_CUR_MAX) { 390 /* Enough space to translate in-place. */ 391 if ((nb = (int)_UTF8_wcrtomb(dst, *s, ps)) < 0) { 392 *src = s; 393 return ((size_t)-1); 394 } 395 } else { 396 /* 397 * May not be enough space; use temp. buffer. 398 */ 399 if ((nb = (int)_UTF8_wcrtomb(buf, *s, ps)) < 0) { 400 *src = s; 401 return ((size_t)-1); 402 } 403 if (nb > (int)len) 404 /* MB sequence for character won't fit. */ 405 break; 406 memcpy(dst, buf, nb); 407 } 408 if (*s == L'\0') { 409 *src = NULL; 410 return (nbytes + nb - 1); 411 } 412 s++; 413 dst += nb; 414 len -= nb; 415 nbytes += nb; 416 } 417 *src = s; 418 return (nbytes); 419 } 420