1 /* 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 #include "lint.h" 32 #include <errno.h> 33 #include <limits.h> 34 #include "runetype.h" 35 #include <stdlib.h> 36 #include <string.h> 37 #include <wchar.h> 38 #include "mblocal.h" 39 40 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD, 41 const char *_RESTRICT_KYWD, 42 size_t, mbstate_t *_RESTRICT_KYWD); 43 static int _UTF8_mbsinit(const mbstate_t *); 44 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 45 const char **_RESTRICT_KYWD, size_t, size_t, 46 mbstate_t *_RESTRICT_KYWD); 47 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 48 mbstate_t *_RESTRICT_KYWD); 49 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD, 50 const wchar_t **_RESTRICT_KYWD, 51 size_t, size_t, mbstate_t *_RESTRICT_KYWD); 52 53 typedef struct { 54 wchar_t ch; 55 int want; 56 wchar_t lbound; 57 } _UTF8State; 58 59 int 60 _UTF8_init(_RuneLocale *rl) 61 { 62 __mbrtowc = _UTF8_mbrtowc; 63 __wcrtomb = _UTF8_wcrtomb; 64 __mbsinit = _UTF8_mbsinit; 65 __mbsnrtowcs = _UTF8_mbsnrtowcs; 66 __wcsnrtombs = _UTF8_wcsnrtombs; 67 _CurrentRuneLocale = rl; 68 69 /* 70 * In theory up to 6 bytes can be used for the encoding, 71 * but only encodings with more than 4 bytes are illegal. 72 */ 73 __ctype[520] = 4; 74 /* 75 * Note that the other CSWIDTH members are nonsensical for this 76 * this coding. They only are valid with EUC codings. 77 */ 78 79 return (0); 80 } 81 82 static int 83 _UTF8_mbsinit(const mbstate_t *ps) 84 { 85 86 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 87 } 88 89 static size_t 90 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 91 size_t n, mbstate_t *_RESTRICT_KYWD ps) 92 { 93 _UTF8State *us; 94 int ch, i, mask, want; 95 wchar_t lbound, wch; 96 97 us = (_UTF8State *)ps; 98 99 if (us->want < 0 || us->want > 6) { 100 errno = EINVAL; 101 return ((size_t)-1); 102 } 103 104 if (s == NULL) { 105 s = ""; 106 n = 1; 107 pwc = NULL; 108 } 109 110 if (n == 0) 111 /* Incomplete multibyte sequence */ 112 return ((size_t)-2); 113 114 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 115 /* Fast path for plain ASCII characters. */ 116 if (pwc != NULL) 117 *pwc = ch; 118 return (ch != '\0' ? 1 : 0); 119 } 120 121 if (us->want == 0) { 122 /* 123 * Determine the number of octets that make up this character 124 * from the first octet, and a mask that extracts the 125 * interesting bits of the first octet. We already know 126 * the character is at least two bytes long. 127 * 128 * We also specify a lower bound for the character code to 129 * detect redundant, non-"shortest form" encodings. For 130 * example, the sequence C0 80 is _not_ a legal representation 131 * of the null character. This enforces a 1-to-1 mapping 132 * between character codes and their multibyte representations. 133 */ 134 ch = (unsigned char)*s; 135 if ((ch & 0x80) == 0) { 136 mask = 0x7f; 137 want = 1; 138 lbound = 0; 139 } else if ((ch & 0xe0) == 0xc0) { 140 mask = 0x1f; 141 want = 2; 142 lbound = 0x80; 143 } else if ((ch & 0xf0) == 0xe0) { 144 mask = 0x0f; 145 want = 3; 146 lbound = 0x800; 147 } else if ((ch & 0xf8) == 0xf0) { 148 mask = 0x07; 149 want = 4; 150 lbound = 0x10000; 151 #if 0 152 /* These would be illegal in the UTF-8 space */ 153 154 } else if ((ch & 0xfc) == 0xf8) { 155 mask = 0x03; 156 want = 5; 157 lbound = 0x200000; 158 } else if ((ch & 0xfe) == 0xfc) { 159 mask = 0x01; 160 want = 6; 161 lbound = 0x4000000; 162 #endif 163 } else { 164 /* 165 * Malformed input; input is not UTF-8. 166 */ 167 errno = EILSEQ; 168 return ((size_t)-1); 169 } 170 } else { 171 want = us->want; 172 lbound = us->lbound; 173 } 174 175 /* 176 * Decode the octet sequence representing the character in chunks 177 * of 6 bits, most significant first. 178 */ 179 if (us->want == 0) 180 wch = (unsigned char)*s++ & mask; 181 else 182 wch = us->ch; 183 184 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 185 if ((*s & 0xc0) != 0x80) { 186 /* 187 * Malformed input; bad characters in the middle 188 * of a character. 189 */ 190 errno = EILSEQ; 191 return ((size_t)-1); 192 } 193 wch <<= 6; 194 wch |= *s++ & 0x3f; 195 } 196 if (i < want) { 197 /* Incomplete multibyte sequence. */ 198 us->want = want - i; 199 us->lbound = lbound; 200 us->ch = wch; 201 return ((size_t)-2); 202 } 203 if (wch < lbound) { 204 /* 205 * Malformed input; redundant encoding. 206 */ 207 errno = EILSEQ; 208 return ((size_t)-1); 209 } 210 if (pwc != NULL) 211 *pwc = wch; 212 us->want = 0; 213 return (wch == L'\0' ? 0 : want); 214 } 215 216 static size_t 217 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src, 218 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 219 { 220 _UTF8State *us; 221 const char *s; 222 size_t nchr; 223 wchar_t wc; 224 size_t nb; 225 226 us = (_UTF8State *)ps; 227 228 s = *src; 229 nchr = 0; 230 231 if (dst == NULL) { 232 /* 233 * The fast path in the loop below is not safe if an ASCII 234 * character appears as anything but the first byte of a 235 * multibyte sequence. Check now to avoid doing it in the loop. 236 */ 237 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 238 errno = EILSEQ; 239 return ((size_t)-1); 240 } 241 for (;;) { 242 if (nms > 0 && (signed char)*s > 0) 243 /* 244 * Fast path for plain ASCII characters 245 * excluding NUL. 246 */ 247 nb = 1; 248 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 249 (size_t)-1) 250 /* Invalid sequence - mbrtowc() sets errno. */ 251 return ((size_t)-1); 252 else if (nb == 0 || nb == (size_t)-2) 253 return (nchr); 254 s += nb; 255 nms -= nb; 256 nchr++; 257 } 258 /*NOTREACHED*/ 259 } 260 261 /* 262 * The fast path in the loop below is not safe if an ASCII 263 * character appears as anything but the first byte of a 264 * multibyte sequence. Check now to avoid doing it in the loop. 265 */ 266 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 267 errno = EILSEQ; 268 return ((size_t)-1); 269 } 270 while (len-- > 0) { 271 if (nms > 0 && (signed char)*s > 0) { 272 /* 273 * Fast path for plain ASCII characters 274 * excluding NUL. 275 */ 276 *dst = (wchar_t)*s; 277 nb = 1; 278 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 279 (size_t)-1) { 280 *src = s; 281 return ((size_t)-1); 282 } else if (nb == (size_t)-2) { 283 *src = s + nms; 284 return (nchr); 285 } else if (nb == 0) { 286 *src = NULL; 287 return (nchr); 288 } 289 s += nb; 290 nms -= nb; 291 nchr++; 292 dst++; 293 } 294 *src = s; 295 return (nchr); 296 } 297 298 static size_t 299 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps) 300 { 301 _UTF8State *us; 302 unsigned char lead; 303 int i, len; 304 305 us = (_UTF8State *)ps; 306 307 if (us->want != 0) { 308 errno = EINVAL; 309 return ((size_t)-1); 310 } 311 312 if (s == NULL) 313 /* Reset to initial shift state (no-op) */ 314 return (1); 315 316 if ((wc & ~0x7f) == 0) { 317 /* Fast path for plain ASCII characters. */ 318 *s = (char)wc; 319 return (1); 320 } 321 322 /* 323 * Determine the number of octets needed to represent this character. 324 * We always output the shortest sequence possible. Also specify the 325 * first few bits of the first octet, which contains the information 326 * about the sequence length. 327 */ 328 if ((wc & ~0x7f) == 0) { 329 lead = 0; 330 len = 1; 331 } else if ((wc & ~0x7ff) == 0) { 332 lead = 0xc0; 333 len = 2; 334 } else if ((wc & ~0xffff) == 0) { 335 lead = 0xe0; 336 len = 3; 337 } else if ((wc & ~0x1fffff) == 0) { 338 lead = 0xf0; 339 len = 4; 340 #if 0 341 /* Again, 5 and 6 byte encodings are simply not permitted */ 342 } else if ((wc & ~0x3ffffff) == 0) { 343 lead = 0xf8; 344 len = 5; 345 } else if ((wc & ~0x7fffffff) == 0) { 346 lead = 0xfc; 347 len = 6; 348 #endif 349 } else { 350 errno = EILSEQ; 351 return ((size_t)-1); 352 } 353 354 /* 355 * Output the octets representing the character in chunks 356 * of 6 bits, least significant last. The first octet is 357 * a special case because it contains the sequence length 358 * information. 359 */ 360 for (i = len - 1; i > 0; i--) { 361 s[i] = (wc & 0x3f) | 0x80; 362 wc >>= 6; 363 } 364 *s = (wc & 0xff) | lead; 365 366 return (len); 367 } 368 369 static size_t 370 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 371 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 372 { 373 _UTF8State *us; 374 char buf[MB_LEN_MAX]; 375 const wchar_t *s; 376 size_t nbytes; 377 size_t nb; 378 379 us = (_UTF8State *)ps; 380 381 if (us->want != 0) { 382 errno = EINVAL; 383 return ((size_t)-1); 384 } 385 386 s = *src; 387 nbytes = 0; 388 389 if (dst == NULL) { 390 while (nwc-- > 0) { 391 if (0 <= *s && *s < 0x80) 392 /* Fast path for plain ASCII characters. */ 393 nb = 1; 394 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 395 (size_t)-1) 396 /* Invalid character - wcrtomb() sets errno. */ 397 return ((size_t)-1); 398 if (*s == L'\0') 399 return (nbytes + nb - 1); 400 s++; 401 nbytes += nb; 402 } 403 return (nbytes); 404 } 405 406 while (len > 0 && nwc-- > 0) { 407 if (0 <= *s && *s < 0x80) { 408 /* Fast path for plain ASCII characters. */ 409 nb = 1; 410 *dst = *s; 411 } else if (len > (size_t)MB_CUR_MAX) { 412 /* Enough space to translate in-place. */ 413 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 414 *src = s; 415 return ((size_t)-1); 416 } 417 } else { 418 /* 419 * May not be enough space; use temp. buffer. 420 */ 421 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 422 *src = s; 423 return ((size_t)-1); 424 } 425 if (nb > (int)len) 426 /* MB sequence for character won't fit. */ 427 break; 428 (void) memcpy(dst, buf, nb); 429 } 430 if (*s == L'\0') { 431 *src = NULL; 432 return (nbytes + nb - 1); 433 } 434 s++; 435 dst += nb; 436 len -= nb; 437 nbytes += nb; 438 } 439 *src = s; 440 return (nbytes); 441 } 442