1 /* 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 2002-2004 Tim J. Robbins 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include "lint.h" 30 #include <errno.h> 31 #include <limits.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <wchar.h> 35 #include "mblocal.h" 36 #include "lctype.h" 37 38 static size_t _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD, 39 const char *_RESTRICT_KYWD, 40 size_t, mbstate_t *_RESTRICT_KYWD, boolean_t); 41 static int _UTF8_mbsinit(const mbstate_t *); 42 static size_t _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 43 const char **_RESTRICT_KYWD, size_t, size_t, 44 mbstate_t *_RESTRICT_KYWD); 45 static size_t _UTF8_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 46 mbstate_t *_RESTRICT_KYWD); 47 static size_t _UTF8_wcsnrtombs(char *_RESTRICT_KYWD, 48 const wchar_t **_RESTRICT_KYWD, 49 size_t, size_t, mbstate_t *_RESTRICT_KYWD); 50 51 void 52 _UTF8_init(struct lc_ctype *lct) 53 { 54 lct->lc_mbrtowc = _UTF8_mbrtowc; 55 lct->lc_wcrtomb = _UTF8_wcrtomb; 56 lct->lc_mbsinit = _UTF8_mbsinit; 57 lct->lc_mbsnrtowcs = _UTF8_mbsnrtowcs; 58 lct->lc_wcsnrtombs = _UTF8_wcsnrtombs; 59 lct->lc_is_ascii = 0; 60 lct->lc_max_mblen = 4; 61 } 62 63 static int 64 _UTF8_mbsinit(const mbstate_t *ps) 65 { 66 67 return (ps == NULL || ((const _UTF8State *)ps)->want == 0); 68 } 69 70 static size_t 71 _UTF8_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 72 size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero) 73 { 74 _UTF8State *us; 75 int ch, i, mask, want; 76 wchar_t lbound, wch; 77 78 us = (_UTF8State *)ps; 79 80 if (us->want < 0 || us->want > 6) { 81 errno = EINVAL; 82 return ((size_t)-1); 83 } 84 85 if (s == NULL) { 86 s = ""; 87 n = 1; 88 pwc = NULL; 89 } 90 91 if (n == 0) 92 /* Incomplete multibyte sequence */ 93 return ((size_t)-2); 94 95 if (us->want == 0) { 96 /* 97 * Determine the number of octets that make up this character 98 * from the first octet, and a mask that extracts the 99 * interesting bits of the first octet. We already know 100 * the character is at least two bytes long. 101 * 102 * We also specify a lower bound for the character code to 103 * detect redundant, non-"shortest form" encodings. For 104 * example, the sequence C0 80 is _not_ a legal representation 105 * of the null character. This enforces a 1-to-1 mapping 106 * between character codes and their multibyte representations. 107 */ 108 ch = (unsigned char)*s; 109 if ((ch & 0x80) == 0) { 110 /* Fast path for plain ASCII characters. */ 111 if (pwc != NULL) 112 *pwc = ch; 113 if (zero || ch != '\0') { 114 return (1); 115 } else { 116 return (0); 117 } 118 } 119 if ((ch & 0xe0) == 0xc0) { 120 mask = 0x1f; 121 want = 2; 122 lbound = 0x80; 123 } else if ((ch & 0xf0) == 0xe0) { 124 mask = 0x0f; 125 want = 3; 126 lbound = 0x800; 127 } else if ((ch & 0xf8) == 0xf0) { 128 mask = 0x07; 129 want = 4; 130 lbound = 0x10000; 131 #if 0 132 /* These would be illegal in the UTF-8 space */ 133 134 } else if ((ch & 0xfc) == 0xf8) { 135 mask = 0x03; 136 want = 5; 137 lbound = 0x200000; 138 } else if ((ch & 0xfe) == 0xfc) { 139 mask = 0x01; 140 want = 6; 141 lbound = 0x4000000; 142 #endif 143 } else { 144 /* 145 * Malformed input; input is not UTF-8. 146 */ 147 errno = EILSEQ; 148 return ((size_t)-1); 149 } 150 } else { 151 want = us->want; 152 lbound = us->lbound; 153 } 154 155 /* 156 * Decode the octet sequence representing the character in chunks 157 * of 6 bits, most significant first. 158 */ 159 if (us->want == 0) 160 wch = (unsigned char)*s++ & mask; 161 else 162 wch = us->ch; 163 164 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) { 165 if ((*s & 0xc0) != 0x80) { 166 /* 167 * Malformed input; bad characters in the middle 168 * of a character. 169 */ 170 errno = EILSEQ; 171 return ((size_t)-1); 172 } 173 wch <<= 6; 174 wch |= *s++ & 0x3f; 175 } 176 if (i < want) { 177 /* Incomplete multibyte sequence. */ 178 us->want = want - i; 179 us->lbound = lbound; 180 us->ch = wch; 181 return ((size_t)-2); 182 } 183 if (wch < lbound) { 184 /* 185 * Malformed input; redundant encoding. 186 */ 187 errno = EILSEQ; 188 return ((size_t)-1); 189 } 190 if (pwc != NULL) 191 *pwc = wch; 192 us->want = 0; 193 if (zero || wch != L'\0') { 194 return (want); 195 } else { 196 return (0); 197 } 198 } 199 200 static size_t 201 _UTF8_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, const char **_RESTRICT_KYWD src, 202 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 203 { 204 _UTF8State *us; 205 const char *s; 206 size_t nchr; 207 wchar_t wc; 208 size_t nb; 209 210 us = (_UTF8State *)ps; 211 212 s = *src; 213 nchr = 0; 214 215 if (dst == NULL) { 216 /* 217 * The fast path in the loop below is not safe if an ASCII 218 * character appears as anything but the first byte of a 219 * multibyte sequence. Check now to avoid doing it in the loop. 220 */ 221 if (nms > 0 && us->want > 0 && (signed char)*s > 0) { 222 errno = EILSEQ; 223 return ((size_t)-1); 224 } 225 for (;;) { 226 if (nms > 0 && (signed char)*s > 0) { 227 /* 228 * Fast path for plain ASCII characters 229 * excluding NUL. 230 */ 231 nb = 1; 232 } else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps, 233 B_FALSE)) == (size_t)-1) { 234 /* Invalid sequence - mbrtowc() sets errno. */ 235 return ((size_t)-1); 236 } else if (nb == 0 || nb == (size_t)-2) { 237 return (nchr); 238 } 239 s += nb; 240 nms -= nb; 241 nchr++; 242 } 243 /*NOTREACHED*/ 244 } 245 246 /* 247 * The fast path in the loop below is not safe if an ASCII 248 * character appears as anything but the first byte of a 249 * multibyte sequence. Check now to avoid doing it in the loop. 250 */ 251 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) { 252 errno = EILSEQ; 253 return ((size_t)-1); 254 } 255 while (len-- > 0) { 256 if (nms > 0 && (signed char)*s > 0) { 257 /* 258 * Fast path for plain ASCII characters 259 * excluding NUL. 260 */ 261 *dst = (wchar_t)*s; 262 nb = 1; 263 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps, B_FALSE)) == 264 (size_t)-1) { 265 *src = s; 266 return ((size_t)-1); 267 } else if (nb == (size_t)-2) { 268 *src = s + nms; 269 return (nchr); 270 } else if (nb == 0) { 271 *src = NULL; 272 return (nchr); 273 } 274 s += nb; 275 nms -= nb; 276 nchr++; 277 dst++; 278 } 279 *src = s; 280 return (nchr); 281 } 282 283 static size_t 284 _UTF8_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, mbstate_t *_RESTRICT_KYWD ps) 285 { 286 _UTF8State *us; 287 unsigned char lead; 288 int i, len; 289 290 us = (_UTF8State *)ps; 291 292 if (us->want != 0) { 293 errno = EINVAL; 294 return ((size_t)-1); 295 } 296 297 if (s == NULL) 298 /* Reset to initial shift state (no-op) */ 299 return (1); 300 301 /* 302 * Determine the number of octets needed to represent this character. 303 * We always output the shortest sequence possible. Also specify the 304 * first few bits of the first octet, which contains the information 305 * about the sequence length. 306 */ 307 if ((wc & ~0x7f) == 0) { 308 /* Fast path for plain ASCII characters. */ 309 *s = (char)wc; 310 return (1); 311 } else if ((wc & ~0x7ff) == 0) { 312 lead = 0xc0; 313 len = 2; 314 } else if ((wc & ~0xffff) == 0) { 315 lead = 0xe0; 316 len = 3; 317 } else if ((wc & ~0x1fffff) == 0) { 318 lead = 0xf0; 319 len = 4; 320 #if 0 321 /* Again, 5 and 6 byte encodings are simply not permitted */ 322 } else if ((wc & ~0x3ffffff) == 0) { 323 lead = 0xf8; 324 len = 5; 325 } else if ((wc & ~0x7fffffff) == 0) { 326 lead = 0xfc; 327 len = 6; 328 #endif 329 } else { 330 errno = EILSEQ; 331 return ((size_t)-1); 332 } 333 334 /* 335 * Output the octets representing the character in chunks 336 * of 6 bits, least significant last. The first octet is 337 * a special case because it contains the sequence length 338 * information. 339 */ 340 for (i = len - 1; i > 0; i--) { 341 s[i] = (wc & 0x3f) | 0x80; 342 wc >>= 6; 343 } 344 *s = (wc & 0xff) | lead; 345 346 return (len); 347 } 348 349 static size_t 350 _UTF8_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 351 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 352 { 353 _UTF8State *us; 354 char buf[MB_LEN_MAX]; 355 const wchar_t *s; 356 size_t nbytes; 357 size_t nb; 358 359 us = (_UTF8State *)ps; 360 361 if (us->want != 0) { 362 errno = EINVAL; 363 return ((size_t)-1); 364 } 365 366 s = *src; 367 nbytes = 0; 368 369 if (dst == NULL) { 370 while (nwc-- > 0) { 371 if (0 <= *s && *s < 0x80) 372 /* Fast path for plain ASCII characters. */ 373 nb = 1; 374 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == 375 (size_t)-1) 376 /* Invalid character - wcrtomb() sets errno. */ 377 return ((size_t)-1); 378 if (*s == L'\0') 379 return (nbytes + nb - 1); 380 s++; 381 nbytes += nb; 382 } 383 return (nbytes); 384 } 385 386 while (len > 0 && nwc-- > 0) { 387 if (0 <= *s && *s < 0x80) { 388 /* Fast path for plain ASCII characters. */ 389 nb = 1; 390 *dst = *s; 391 } else if (len > (size_t)MB_CUR_MAX) { 392 /* Enough space to translate in-place. */ 393 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) { 394 *src = s; 395 return ((size_t)-1); 396 } 397 } else { 398 /* 399 * May not be enough space; use temp. buffer. 400 */ 401 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) { 402 *src = s; 403 return ((size_t)-1); 404 } 405 if (nb > (int)len) 406 /* MB sequence for character won't fit. */ 407 break; 408 (void) memcpy(dst, buf, nb); 409 } 410 if (*s == L'\0') { 411 *src = NULL; 412 return (nbytes + nb - 1); 413 } 414 s++; 415 dst += nb; 416 len -= nb; 417 nbytes += nb; 418 } 419 *src = s; 420 return (nbytes); 421 } 422