1 /* 2 * Copyright (c) 2002-2004 Tim J. Robbins 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * Copyright 2010 Nexenta Systems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 #include "lint.h" 32 #include <errno.h> 33 #include <limits.h> 34 #include "runetype.h" 35 #include <stdlib.h> 36 #include <string.h> 37 #include <wchar.h> 38 #include "mblocal.h" 39 40 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD, 41 const char *_RESTRICT_KYWD, 42 size_t, mbstate_t *_RESTRICT_KYWD); 43 static int _UTF8_mbsinit(const mbstate_t *); 44 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 45 const char **_RESTRICT_KYWD, size_t, size_t, 46 mbstate_t *_RESTRICT_KYWD); 47 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 48 mbstate_t *_RESTRICT_KYWD); 49 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD, 50 const wchar_t **_RESTRICT_KYWD, 51 size_t, size_t, mbstate_t *_RESTRICT_KYWD); 52 53 typedef struct { 54 wchar_t ch; 55 int want; 56 wchar_t lbound; 57 } _UTF8State; 58 59 int 60 _UTF8_init(_RuneLocale *rl) 61 { 62 __mbrtowc = _UTF8_mbrtowc; 63 __wcrtomb = _UTF8_wcrtomb; 64 __mbsinit = _UTF8_mbsinit; 65 __mbsnrtowcs = _UTF8_mbsnrtowcs; 66 __wcsnrtombs = _UTF8_wcsnrtombs; 67 _CurrentRuneLocale = rl; 68 69 charset_is_ascii = 0; 70 71 /* 72 * In theory up to 6 bytes can be used for the encoding, 73 * but only encodings with more than 4 bytes are illegal. 74 */ 75 __ctype[520] = 4; 76 /* 77 * Note that the other CSWIDTH members are nonsensical for this 78 * this coding. They only are valid with EUC codings. 79 */ 80 81 return (0); 82 } 83 84 static int 85 _UTF8_mbsinit(const mbstate_t *ps) 86 { 87 88 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 89 } 90 91 static size_t 92 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 93 size_t n, mbstate_t *_RESTRICT_KYWD ps) 94 { 95 _UTF8State *us; 96 int ch, i, mask, want; 97 wchar_t lbound, wch; 98 99 us = (_UTF8State *)ps; 100 101 if (us->want < 0 || us->want > 6) { 102 errno = EINVAL; 103 return ((size_t)-1); 104 } 105 106 if (s == NULL) { 107 s = ""; 108 n = 1; 109 pwc = NULL; 110 } 111 112 if (n == 0) 113 /* Incomplete multibyte sequence */ 114 return ((size_t)-2); 115 116 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) { 117 /* Fast path for plain ASCII characters. */ 118 if (pwc != NULL) 119 *pwc = ch; 120 return (ch != '\0' ? 1 : 0); 121 } 122 123 if (us->want == 0) { 124 /* 125 * Determine the number of octets that make up this character 126 * from the first octet, and a mask that extracts the 127 * interesting bits of the first octet. We already know 128 * the character is at least two bytes long. 129 * 130 * We also specify a lower bound for the character code to 131 * detect redundant, non-"shortest form" encodings. For 132 * example, the sequence C0 80 is _not_ a legal representation 133 * of the null character. This enforces a 1-to-1 mapping 134 * between character codes and their multibyte representations. 135 */ 136 ch = (unsigned char)*s; 137 if ((ch & 0x80) == 0) { 138 mask = 0x7f; 139 want = 1; 140 lbound = 0; 141 } else if ((ch & 0xe0) == 0xc0) { 142 mask = 0x1f; 143 want = 2; 144 lbound = 0x80; 145 } else if ((ch & 0xf0) == 0xe0) { 146 mask = 0x0f; 147 want = 3; 148 lbound = 0x800; 149 } else if ((ch & 0xf8) == 0xf0) { 150 mask = 0x07; 151 want = 4; 152 lbound = 0x10000; 153 #if 0 154 /* These would be illegal in the UTF-8 space */ 155 156 } else if ((ch & 0xfc) == 0xf8) { 157 mask = 0x03; 158 want = 5; 159 lbound = 0x200000; 160 } else if ((ch & 0xfe) == 0xfc) { 161 mask = 0x01; 162 want = 6; 163 lbound = 0x4000000; 164 #endif 165 } else { 166 /* 167 * Malformed input; input is not UTF-8. 168 */ 169 errno = EILSEQ; 170 return ((size_t)-1); 171 } 172 } else { 173 want = us->want; 174 lbound = us->lbound; 175 } 176 177 /* 178 * Decode the octet sequence representing the character in chunks 179 * of 6 bits, most significant first. 180 */ 181 if (us->want == 0) 182 wch = (unsigned char)*s++ & mask; 183 else 184 wch = us->ch; 185 186 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 187 if ((*s & 0xc0) != 0x80) { 188 /* 189 * Malformed input; bad characters in the middle 190 * of a character. 191 */ 192 errno = EILSEQ; 193 return ((size_t)-1); 194 } 195 wch <<= 6; 196 wch |= *s++ & 0x3f; 197 } 198 if (i < want) { 199 /* Incomplete multibyte sequence. */ 200 us->want = want - i; 201 us->lbound = lbound; 202 us->ch = wch; 203 return ((size_t)-2); 204 } 205 if (wch < lbound) { 206 /* 207 * Malformed input; redundant encoding. 208 */ 209 errno = EILSEQ; 210 return ((size_t)-1); 211 } 212 if (pwc != NULL) 213 *pwc = wch; 214 us->want = 0; 215 return (wch == L'\0' ? 0 : want); 216 } 217 218 static size_t 219 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src, 220 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 221 { 222 _UTF8State *us; 223 const char *s; 224 size_t nchr; 225 wchar_t wc; 226 size_t nb; 227 228 us = (_UTF8State *)ps; 229 230 s = *src; 231 nchr = 0; 232 233 if (dst == NULL) { 234 /* 235 * The fast path in the loop below is not safe if an ASCII 236 * character appears as anything but the first byte of a 237 * multibyte sequence. Check now to avoid doing it in the loop. 238 */ 239 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 240 errno = EILSEQ; 241 return ((size_t)-1); 242 } 243 for (;;) { 244 if (nms > 0 && (signed char)*s > 0) 245 /* 246 * Fast path for plain ASCII characters 247 * excluding NUL. 248 */ 249 nb = 1; 250 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) == 251 (size_t)-1) 252 /* Invalid sequence - mbrtowc() sets errno. */ 253 return ((size_t)-1); 254 else if (nb == 0 || nb == (size_t)-2) 255 return (nchr); 256 s += nb; 257 nms -= nb; 258 nchr++; 259 } 260 /*NOTREACHED*/ 261 } 262 263 /* 264 * The fast path in the loop below is not safe if an ASCII 265 * character appears as anything but the first byte of a 266 * multibyte sequence. Check now to avoid doing it in the loop. 267 */ 268 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 269 errno = EILSEQ; 270 return ((size_t)-1); 271 } 272 while (len-- > 0) { 273 if (nms > 0 && (signed char)*s > 0) { 274 /* 275 * Fast path for plain ASCII characters 276 * excluding NUL. 277 */ 278 *dst = (wchar_t)*s; 279 nb = 1; 280 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) == 281 (size_t)-1) { 282 *src = s; 283 return ((size_t)-1); 284 } else if (nb == (size_t)-2) { 285 *src = s + nms; 286 return (nchr); 287 } else if (nb == 0) { 288 *src = NULL; 289 return (nchr); 290 } 291 s += nb; 292 nms -= nb; 293 nchr++; 294 dst++; 295 } 296 *src = s; 297 return (nchr); 298 } 299 300 static size_t 301 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps) 302 { 303 _UTF8State *us; 304 unsigned char lead; 305 int i, len; 306 307 us = (_UTF8State *)ps; 308 309 if (us->want != 0) { 310 errno = EINVAL; 311 return ((size_t)-1); 312 } 313 314 if (s == NULL) 315 /* Reset to initial shift state (no-op) */ 316 return (1); 317 318 if ((wc & ~0x7f) == 0) { 319 /* Fast path for plain ASCII characters. */ 320 *s = (char)wc; 321 return (1); 322 } 323 324 /* 325 * Determine the number of octets needed to represent this character. 326 * We always output the shortest sequence possible. Also specify the 327 * first few bits of the first octet, which contains the information 328 * about the sequence length. 329 */ 330 if ((wc & ~0x7f) == 0) { 331 lead = 0; 332 len = 1; 333 } else if ((wc & ~0x7ff) == 0) { 334 lead = 0xc0; 335 len = 2; 336 } else if ((wc & ~0xffff) == 0) { 337 lead = 0xe0; 338 len = 3; 339 } else if ((wc & ~0x1fffff) == 0) { 340 lead = 0xf0; 341 len = 4; 342 #if 0 343 /* Again, 5 and 6 byte encodings are simply not permitted */ 344 } else if ((wc & ~0x3ffffff) == 0) { 345 lead = 0xf8; 346 len = 5; 347 } else if ((wc & ~0x7fffffff) == 0) { 348 lead = 0xfc; 349 len = 6; 350 #endif 351 } else { 352 errno = EILSEQ; 353 return ((size_t)-1); 354 } 355 356 /* 357 * Output the octets representing the character in chunks 358 * of 6 bits, least significant last. The first octet is 359 * a special case because it contains the sequence length 360 * information. 361 */ 362 for (i = len - 1; i > 0; i--) { 363 s[i] = (wc & 0x3f) | 0x80; 364 wc >>= 6; 365 } 366 *s = (wc & 0xff) | lead; 367 368 return (len); 369 } 370 371 static size_t 372 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 373 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 374 { 375 _UTF8State *us; 376 char buf[MB_LEN_MAX]; 377 const wchar_t *s; 378 size_t nbytes; 379 size_t nb; 380 381 us = (_UTF8State *)ps; 382 383 if (us->want != 0) { 384 errno = EINVAL; 385 return ((size_t)-1); 386 } 387 388 s = *src; 389 nbytes = 0; 390 391 if (dst == NULL) { 392 while (nwc-- > 0) { 393 if (0 <= *s && *s < 0x80) 394 /* Fast path for plain ASCII characters. */ 395 nb = 1; 396 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 397 (size_t)-1) 398 /* Invalid character - wcrtomb() sets errno. */ 399 return ((size_t)-1); 400 if (*s == L'\0') 401 return (nbytes + nb - 1); 402 s++; 403 nbytes += nb; 404 } 405 return (nbytes); 406 } 407 408 while (len > 0 && nwc-- > 0) { 409 if (0 <= *s && *s < 0x80) { 410 /* Fast path for plain ASCII characters. */ 411 nb = 1; 412 *dst = *s; 413 } else if (len > (size_t)MB_CUR_MAX) { 414 /* Enough space to translate in-place. */ 415 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 416 *src = s; 417 return ((size_t)-1); 418 } 419 } else { 420 /* 421 * May not be enough space; use temp. buffer. 422 */ 423 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 424 *src = s; 425 return ((size_t)-1); 426 } 427 if (nb > (int)len) 428 /* MB sequence for character won't fit. */ 429 break; 430 (void) memcpy(dst, buf, nb); 431 } 432 if (*s == L'\0') { 433 *src = NULL; 434 return (nbytes + nb - 1); 435 } 436 s++; 437 dst += nb; 438 len -= nb; 439 nbytes += nb; 440 } 441 *src = s; 442 return (nbytes); 443 } 444