1 /* 2 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 3 * Copyright (c) 2002-2004 Tim J. Robbins 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include "lint.h" 29 #include <errno.h> 30 #include <limits.h> 31 #include "runetype.h" 32 #include <stdlib.h> 33 #include <string.h> 34 #include <wchar.h> 35 #include "mblocal.h" 36 37 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD, 38 const char *_RESTRICT_KYWD, 39 size_t, mbstate_t *_RESTRICT_KYWD); 40 static int _UTF8_mbsinit(const mbstate_t *); 41 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 42 const char **_RESTRICT_KYWD, size_t, size_t, 43 mbstate_t *_RESTRICT_KYWD); 44 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 45 mbstate_t *_RESTRICT_KYWD); 46 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD, 47 const wchar_t **_RESTRICT_KYWD, 48 size_t, size_t, mbstate_t *_RESTRICT_KYWD); 49 50 typedef struct { 51 wchar_t ch; 52 int want; 53 wchar_t lbound; 54 } _UTF8State; 55 56 int 57 _UTF8_init(_RuneLocale *rl) 58 { 59 __mbrtowc = _UTF8_mbrtowc; 60 __wcrtomb = _UTF8_wcrtomb; 61 __mbsinit = _UTF8_mbsinit; 62 __mbsnrtowcs = _UTF8_mbsnrtowcs; 63 __wcsnrtombs = _UTF8_wcsnrtombs; 64 _CurrentRuneLocale = rl; 65 66 charset_is_ascii = 0; 67 68 /* 69 * In theory up to 6 bytes can be used for the encoding, 70 * but only encodings with more than 4 bytes are illegal. 71 */ 72 __ctype[520] = 4; 73 /* 74 * Note that the other CSWIDTH members are nonsensical for this 75 * this coding. They only are valid with EUC codings. 76 */ 77 78 return (0); 79 } 80 81 static int 82 _UTF8_mbsinit(const mbstate_t *ps) 83 { 84 85 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 86 } 87 88 static size_t 89 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 90 size_t n, mbstate_t *_RESTRICT_KYWD ps) 91 { 92 _UTF8State *us; 93 int ch, i, mask, want; 94 wchar_t lbound, wch; 95 96 us = (_UTF8State *)ps; 97 98 if (us->want < 0 || us->want > 6) { 99 errno = EINVAL; 100 return ((size_t)-1); 101 } 102 103 if (s == NULL) { 104 s = ""; 105 n = 1; 106 pwc = NULL; 107 } 108 109 if (n == 0) 110 /* Incomplete multibyte sequence */ 111 return ((size_t)-2); 112 113 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 114 /* Fast path for plain ASCII characters. */ 115 if (pwc != NULL) 116 *pwc = ch; 117 return (ch != '\0' ? 1 : 0); 118 } 119 120 if (us->want == 0) { 121 /* 122 * Determine the number of octets that make up this character 123 * from the first octet, and a mask that extracts the 124 * interesting bits of the first octet. We already know 125 * the character is at least two bytes long. 126 * 127 * We also specify a lower bound for the character code to 128 * detect redundant, non-"shortest form" encodings. For 129 * example, the sequence C0 80 is _not_ a legal representation 130 * of the null character. This enforces a 1-to-1 mapping 131 * between character codes and their multibyte representations. 132 */ 133 ch = (unsigned char)*s; 134 if ((ch & 0x80) == 0) { 135 mask = 0x7f; 136 want = 1; 137 lbound = 0; 138 } else if ((ch & 0xe0) == 0xc0) { 139 mask = 0x1f; 140 want = 2; 141 lbound = 0x80; 142 } else if ((ch & 0xf0) == 0xe0) { 143 mask = 0x0f; 144 want = 3; 145 lbound = 0x800; 146 } else if ((ch & 0xf8) == 0xf0) { 147 mask = 0x07; 148 want = 4; 149 lbound = 0x10000; 150 #if 0 151 /* These would be illegal in the UTF-8 space */ 152 153 } else if ((ch & 0xfc) == 0xf8) { 154 mask = 0x03; 155 want = 5; 156 lbound = 0x200000; 157 } else if ((ch & 0xfe) == 0xfc) { 158 mask = 0x01; 159 want = 6; 160 lbound = 0x4000000; 161 #endif 162 } else { 163 /* 164 * Malformed input; input is not UTF-8. 165 */ 166 errno = EILSEQ; 167 return ((size_t)-1); 168 } 169 } else { 170 want = us->want; 171 lbound = us->lbound; 172 } 173 174 /* 175 * Decode the octet sequence representing the character in chunks 176 * of 6 bits, most significant first. 177 */ 178 if (us->want == 0) 179 wch = (unsigned char)*s++ & mask; 180 else 181 wch = us->ch; 182 183 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 184 if ((*s & 0xc0) != 0x80) { 185 /* 186 * Malformed input; bad characters in the middle 187 * of a character. 188 */ 189 errno = EILSEQ; 190 return ((size_t)-1); 191 } 192 wch <<= 6; 193 wch |= *s++ & 0x3f; 194 } 195 if (i < want) { 196 /* Incomplete multibyte sequence. */ 197 us->want = want - i; 198 us->lbound = lbound; 199 us->ch = wch; 200 return ((size_t)-2); 201 } 202 if (wch < lbound) { 203 /* 204 * Malformed input; redundant encoding. 205 */ 206 errno = EILSEQ; 207 return ((size_t)-1); 208 } 209 if (pwc != NULL) 210 *pwc = wch; 211 us->want = 0; 212 return (wch == L'\0' ? 0 : want); 213 } 214 215 static size_t 216 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src, 217 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 218 { 219 _UTF8State *us; 220 const char *s; 221 size_t nchr; 222 wchar_t wc; 223 size_t nb; 224 225 us = (_UTF8State *)ps; 226 227 s = *src; 228 nchr = 0; 229 230 if (dst == NULL) { 231 /* 232 * The fast path in the loop below is not safe if an ASCII 233 * character appears as anything but the first byte of a 234 * multibyte sequence. Check now to avoid doing it in the loop. 235 */ 236 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 237 errno = EILSEQ; 238 return ((size_t)-1); 239 } 240 for (;;) { 241 if (nms > 0 && (signed char)*s > 0) 242 /* 243 * Fast path for plain ASCII characters 244 * excluding NUL. 245 */ 246 nb = 1; 247 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 248 (size_t)-1) 249 /* Invalid sequence - mbrtowc() sets errno. */ 250 return ((size_t)-1); 251 else if (nb == 0 || nb == (size_t)-2) 252 return (nchr); 253 s += nb; 254 nms -= nb; 255 nchr++; 256 } 257 /*NOTREACHED*/ 258 } 259 260 /* 261 * The fast path in the loop below is not safe if an ASCII 262 * character appears as anything but the first byte of a 263 * multibyte sequence. Check now to avoid doing it in the loop. 264 */ 265 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 266 errno = EILSEQ; 267 return ((size_t)-1); 268 } 269 while (len-- > 0) { 270 if (nms > 0 && (signed char)*s > 0) { 271 /* 272 * Fast path for plain ASCII characters 273 * excluding NUL. 274 */ 275 *dst = (wchar_t)*s; 276 nb = 1; 277 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 278 (size_t)-1) { 279 *src = s; 280 return ((size_t)-1); 281 } else if (nb == (size_t)-2) { 282 *src = s + nms; 283 return (nchr); 284 } else if (nb == 0) { 285 *src = NULL; 286 return (nchr); 287 } 288 s += nb; 289 nms -= nb; 290 nchr++; 291 dst++; 292 } 293 *src = s; 294 return (nchr); 295 } 296 297 static size_t 298 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps) 299 { 300 _UTF8State *us; 301 unsigned char lead; 302 int i, len; 303 304 us = (_UTF8State *)ps; 305 306 if (us->want != 0) { 307 errno = EINVAL; 308 return ((size_t)-1); 309 } 310 311 if (s == NULL) 312 /* Reset to initial shift state (no-op) */ 313 return (1); 314 315 if ((wc & ~0x7f) == 0) { 316 /* Fast path for plain ASCII characters. */ 317 *s = (char)wc; 318 return (1); 319 } 320 321 /* 322 * Determine the number of octets needed to represent this character. 323 * We always output the shortest sequence possible. Also specify the 324 * first few bits of the first octet, which contains the information 325 * about the sequence length. 326 */ 327 if ((wc & ~0x7f) == 0) { 328 lead = 0; 329 len = 1; 330 } else if ((wc & ~0x7ff) == 0) { 331 lead = 0xc0; 332 len = 2; 333 } else if ((wc & ~0xffff) == 0) { 334 lead = 0xe0; 335 len = 3; 336 } else if ((wc & ~0x1fffff) == 0) { 337 lead = 0xf0; 338 len = 4; 339 #if 0 340 /* Again, 5 and 6 byte encodings are simply not permitted */ 341 } else if ((wc & ~0x3ffffff) == 0) { 342 lead = 0xf8; 343 len = 5; 344 } else if ((wc & ~0x7fffffff) == 0) { 345 lead = 0xfc; 346 len = 6; 347 #endif 348 } else { 349 errno = EILSEQ; 350 return ((size_t)-1); 351 } 352 353 /* 354 * Output the octets representing the character in chunks 355 * of 6 bits, least significant last. The first octet is 356 * a special case because it contains the sequence length 357 * information. 358 */ 359 for (i = len - 1; i > 0; i--) { 360 s[i] = (wc & 0x3f) | 0x80; 361 wc >>= 6; 362 } 363 *s = (wc & 0xff) | lead; 364 365 return (len); 366 } 367 368 static size_t 369 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 370 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 371 { 372 _UTF8State *us; 373 char buf[MB_LEN_MAX]; 374 const wchar_t *s; 375 size_t nbytes; 376 size_t nb; 377 378 us = (_UTF8State *)ps; 379 380 if (us->want != 0) { 381 errno = EINVAL; 382 return ((size_t)-1); 383 } 384 385 s = *src; 386 nbytes = 0; 387 388 if (dst == NULL) { 389 while (nwc-- > 0) { 390 if (0 <= *s && *s < 0x80) 391 /* Fast path for plain ASCII characters. */ 392 nb = 1; 393 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 394 (size_t)-1) 395 /* Invalid character - wcrtomb() sets errno. */ 396 return ((size_t)-1); 397 if (*s == L'\0') 398 return (nbytes + nb - 1); 399 s++; 400 nbytes += nb; 401 } 402 return (nbytes); 403 } 404 405 while (len > 0 && nwc-- > 0) { 406 if (0 <= *s && *s < 0x80) { 407 /* Fast path for plain ASCII characters. */ 408 nb = 1; 409 *dst = *s; 410 } else if (len > (size_t)MB_CUR_MAX) { 411 /* Enough space to translate in-place. */ 412 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 413 *src = s; 414 return ((size_t)-1); 415 } 416 } else { 417 /* 418 * May not be enough space; use temp. buffer. 419 */ 420 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 421 *src = s; 422 return ((size_t)-1); 423 } 424 if (nb > (int)len) 425 /* MB sequence for character won't fit. */ 426 break; 427 (void) memcpy(dst, buf, nb); 428 } 429 if (*s == L'\0') { 430 *src = NULL; 431 return (nbytes + nb - 1); 432 } 433 s++; 434 dst += nb; 435 len -= nb; 436 nbytes += nb; 437 } 438 *src = s; 439 return (nbytes); 440 } 441