1 /* 2 * Copyright 2013 Garrett D'Amore <garrett@damore.org> 3 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 4 * Copyright (c) 2002-2004 Tim J. Robbins. All rights reserved. 5 * Copyright (c) 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * This code is derived from software contributed to Berkeley by 9 * Paul Borman at Krystal Technologies. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 4. Neither the name of the University nor the names of its contributors 20 * may be used to endorse or promote products derived from this software 21 * without specific prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 26 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 33 * SUCH DAMAGE. 34 */ 35 36 #include "lint.h" 37 #include <errno.h> 38 #include <limits.h> 39 #include <stdlib.h> 40 #include <string.h> 41 #include <wchar.h> 42 #include <sys/types.h> 43 #include <sys/euc.h> 44 #include "mblocal.h" 45 #include "lctype.h" 46 47 static size_t _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD, 48 const char *_RESTRICT_KYWD, 49 size_t, mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t, 50 boolean_t); 51 static size_t _EUC_wcrtomb_impl(char *_RESTRICT_KYWD, wchar_t, 52 mbstate_t *_RESTRICT_KYWD, uint8_t, uint8_t, uint8_t, uint8_t); 53 54 static size_t _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD, 55 const char *_RESTRICT_KYWD, 56 size_t, mbstate_t *_RESTRICT_KYWD, boolean_t); 57 static size_t _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD, 58 const char *_RESTRICT_KYWD, 59 size_t, mbstate_t *_RESTRICT_KYWD, boolean_t); 60 static size_t _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD, 61 const char *_RESTRICT_KYWD, 62 size_t, mbstate_t *_RESTRICT_KYWD, boolean_t); 63 static size_t _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD, 64 const char *_RESTRICT_KYWD, 65 size_t, mbstate_t *_RESTRICT_KYWD, boolean_t); 66 67 static size_t _EUC_CN_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 68 mbstate_t *_RESTRICT_KYWD); 69 static size_t _EUC_JP_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 70 mbstate_t *_RESTRICT_KYWD); 71 static size_t _EUC_KR_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 72 mbstate_t *_RESTRICT_KYWD); 73 static size_t _EUC_TW_wcrtomb(char *_RESTRICT_KYWD, wchar_t, 74 mbstate_t *_RESTRICT_KYWD); 75 76 static size_t _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 77 const char **_RESTRICT_KYWD, size_t, size_t, 78 mbstate_t *_RESTRICT_KYWD); 79 static size_t _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 80 const char **_RESTRICT_KYWD, size_t, size_t, 81 mbstate_t *_RESTRICT_KYWD); 82 static size_t _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 83 const char **_RESTRICT_KYWD, size_t, size_t, 84 mbstate_t *_RESTRICT_KYWD); 85 static size_t _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD, 86 const char **_RESTRICT_KYWD, size_t, size_t, 87 mbstate_t *_RESTRICT_KYWD); 88 89 static size_t _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD, 90 const wchar_t **_RESTRICT_KYWD, size_t, size_t, 91 mbstate_t *_RESTRICT_KYWD); 92 static size_t _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD, 93 const wchar_t **_RESTRICT_KYWD, size_t, size_t, 94 mbstate_t *_RESTRICT_KYWD); 95 static size_t _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD, 96 const wchar_t **_RESTRICT_KYWD, size_t, size_t, 97 mbstate_t *_RESTRICT_KYWD); 98 static size_t _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD, 99 const wchar_t **_RESTRICT_KYWD, size_t, size_t, 100 mbstate_t *_RESTRICT_KYWD); 101 102 static int _EUC_mbsinit(const mbstate_t *); 103 104 typedef struct { 105 wchar_t ch; 106 int set; 107 int want; 108 } _EucState; 109 110 int 111 _EUC_mbsinit(const mbstate_t *ps) 112 { 113 114 return (ps == NULL || ((const _EucState *)ps)->want == 0); 115 } 116 117 /* 118 * EUC-CN uses CS0, CS1 and CS2 (4 bytes). 119 */ 120 void 121 _EUC_CN_init(struct lc_ctype *lct) 122 { 123 lct->lc_mbrtowc = _EUC_CN_mbrtowc; 124 lct->lc_wcrtomb = _EUC_CN_wcrtomb; 125 lct->lc_mbsnrtowcs = _EUC_CN_mbsnrtowcs; 126 lct->lc_wcsnrtombs = _EUC_CN_wcsnrtombs; 127 lct->lc_mbsinit = _EUC_mbsinit; 128 129 lct->lc_max_mblen = 4; 130 lct->lc_is_ascii = 0; 131 } 132 133 static size_t 134 _EUC_CN_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 135 size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero) 136 { 137 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0, zero)); 138 } 139 140 static size_t 141 _EUC_CN_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, 142 const char **_RESTRICT_KYWD src, 143 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 144 { 145 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_CN_mbrtowc)); 146 } 147 148 static size_t 149 _EUC_CN_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 150 mbstate_t *_RESTRICT_KYWD ps) 151 { 152 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 153 } 154 155 static size_t 156 _EUC_CN_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 157 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 158 { 159 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_CN_wcrtomb)); 160 } 161 162 /* 163 * EUC-KR uses only CS0 and CS1. 164 */ 165 void 166 _EUC_KR_init(struct lc_ctype *lct) 167 { 168 lct->lc_mbrtowc = _EUC_KR_mbrtowc; 169 lct->lc_wcrtomb = _EUC_KR_wcrtomb; 170 lct->lc_mbsnrtowcs = _EUC_KR_mbsnrtowcs; 171 lct->lc_wcsnrtombs = _EUC_KR_wcsnrtombs; 172 lct->lc_mbsinit = _EUC_mbsinit; 173 174 lct->lc_max_mblen = 2; 175 lct->lc_is_ascii = 0; 176 } 177 178 static size_t 179 _EUC_KR_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 180 size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero) 181 { 182 return (_EUC_mbrtowc_impl(pwc, s, n, ps, 0, 0, 0, 0, zero)); 183 } 184 185 static size_t 186 _EUC_KR_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, 187 const char **_RESTRICT_KYWD src, 188 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 189 { 190 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_KR_mbrtowc)); 191 } 192 193 static size_t 194 _EUC_KR_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 195 mbstate_t *_RESTRICT_KYWD ps) 196 { 197 return (_EUC_wcrtomb_impl(s, wc, ps, 0, 0, 0, 0)); 198 } 199 200 static size_t 201 _EUC_KR_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 202 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 203 { 204 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_KR_wcrtomb)); 205 } 206 207 /* 208 * EUC-JP uses CS0, CS1, CS2, and CS3. 209 */ 210 void 211 _EUC_JP_init(struct lc_ctype *lct) 212 { 213 lct->lc_mbrtowc = _EUC_JP_mbrtowc; 214 lct->lc_wcrtomb = _EUC_JP_wcrtomb; 215 lct->lc_mbsnrtowcs = _EUC_JP_mbsnrtowcs; 216 lct->lc_wcsnrtombs = _EUC_JP_wcsnrtombs; 217 lct->lc_mbsinit = _EUC_mbsinit; 218 219 lct->lc_max_mblen = 3; 220 lct->lc_is_ascii = 0; 221 } 222 223 static size_t 224 _EUC_JP_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 225 size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero) 226 { 227 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 2, SS3, 3, zero)); 228 } 229 230 static size_t 231 _EUC_JP_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, 232 const char **_RESTRICT_KYWD src, 233 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 234 { 235 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_JP_mbrtowc)); 236 } 237 238 static size_t 239 _EUC_JP_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 240 mbstate_t *_RESTRICT_KYWD ps) 241 { 242 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 2, SS3, 3)); 243 } 244 245 static size_t 246 _EUC_JP_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 247 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 248 { 249 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_JP_wcrtomb)); 250 } 251 252 /* 253 * EUC-TW uses CS0, CS1, and CS2. 254 */ 255 void 256 _EUC_TW_init(struct lc_ctype *lct) 257 { 258 lct->lc_mbrtowc = _EUC_TW_mbrtowc; 259 lct->lc_wcrtomb = _EUC_TW_wcrtomb; 260 lct->lc_mbsnrtowcs = _EUC_TW_mbsnrtowcs; 261 lct->lc_wcsnrtombs = _EUC_TW_wcsnrtombs; 262 lct->lc_mbsinit = _EUC_mbsinit; 263 264 lct->lc_max_mblen = 4; 265 lct->lc_is_ascii = 0; 266 } 267 268 static size_t 269 _EUC_TW_mbrtowc(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 270 size_t n, mbstate_t *_RESTRICT_KYWD ps, boolean_t zero) 271 { 272 return (_EUC_mbrtowc_impl(pwc, s, n, ps, SS2, 4, 0, 0, zero)); 273 } 274 275 static size_t 276 _EUC_TW_mbsnrtowcs(wchar_t *_RESTRICT_KYWD dst, 277 const char **_RESTRICT_KYWD src, 278 size_t nms, size_t len, mbstate_t *_RESTRICT_KYWD ps) 279 { 280 return (__mbsnrtowcs_std(dst, src, nms, len, ps, _EUC_TW_mbrtowc)); 281 } 282 283 static size_t 284 _EUC_TW_wcrtomb(char *_RESTRICT_KYWD s, wchar_t wc, 285 mbstate_t *_RESTRICT_KYWD ps) 286 { 287 return (_EUC_wcrtomb_impl(s, wc, ps, SS2, 4, 0, 0)); 288 } 289 290 static size_t 291 _EUC_TW_wcsnrtombs(char *_RESTRICT_KYWD dst, const wchar_t **_RESTRICT_KYWD src, 292 size_t nwc, size_t len, mbstate_t *_RESTRICT_KYWD ps) 293 { 294 return (__wcsnrtombs_std(dst, src, nwc, len, ps, _EUC_TW_wcrtomb)); 295 } 296 297 /* 298 * Common EUC code. 299 */ 300 301 static size_t 302 _EUC_mbrtowc_impl(wchar_t *_RESTRICT_KYWD pwc, const char *_RESTRICT_KYWD s, 303 size_t n, mbstate_t *_RESTRICT_KYWD ps, 304 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width, 305 boolean_t zero) 306 { 307 _EucState *es; 308 int i, want; 309 wchar_t wc = 0; 310 unsigned char ch, chs; 311 312 es = (_EucState *)ps; 313 314 if (es->want < 0 || es->want > MB_CUR_MAX) { 315 errno = EINVAL; 316 return ((size_t)-1); 317 } 318 319 if (s == NULL) { 320 s = ""; 321 n = 1; 322 pwc = NULL; 323 } 324 325 if (n == 0) 326 /* Incomplete multibyte sequence */ 327 return ((size_t)-2); 328 329 if (es->want == 0) { 330 /* Fast path for plain ASCII (CS0) */ 331 if (((ch = (unsigned char)*s) & 0x80) == 0) { 332 if (pwc != NULL) 333 *pwc = ch; 334 if (zero || ch != '\0') { 335 return (1); 336 } else { 337 return (0); 338 } 339 } 340 341 if (ch >= 0xa1) { 342 /* CS1 */ 343 want = 2; 344 } else if (ch == cs2) { 345 want = cs2width; 346 } else if (ch == cs3) { 347 want = cs3width; 348 } else { 349 errno = EILSEQ; 350 return ((size_t)-1); 351 } 352 353 354 es->want = want; 355 es->ch = 0; 356 } else { 357 want = es->want; 358 wc = es->ch; 359 } 360 361 for (i = 0; i < MIN(want, n); i++) { 362 wc <<= 8; 363 chs = *s; 364 wc |= chs; 365 s++; 366 } 367 if (i < want) { 368 /* Incomplete multibyte sequence */ 369 es->want = want - i; 370 es->ch = wc; 371 return ((size_t)-2); 372 } 373 if (pwc != NULL) 374 *pwc = wc; 375 es->want = 0; 376 if (zero || wc != L'\0') { 377 return (want); 378 } else { 379 return (0); 380 } 381 } 382 383 static size_t 384 _EUC_wcrtomb_impl(char *_RESTRICT_KYWD s, wchar_t wc, 385 mbstate_t *_RESTRICT_KYWD ps, 386 uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 387 { 388 _EucState *es; 389 int i, len; 390 wchar_t nm; 391 392 es = (_EucState *)ps; 393 394 if (es->want != 0) { 395 errno = EINVAL; 396 return ((size_t)-1); 397 } 398 399 if (s == NULL) 400 /* Reset to initial shift state (no-op) */ 401 return (1); 402 403 if ((wc & ~0x7f) == 0) { 404 /* Fast path for plain ASCII (CS0) */ 405 *s = (char)wc; 406 return (1); 407 } 408 409 /* Determine the "length" */ 410 if ((unsigned)wc > 0xffffff) { 411 len = 4; 412 } else if ((unsigned)wc > 0xffff) { 413 len = 3; 414 } else if ((unsigned)wc > 0xff) { 415 len = 2; 416 } else { 417 len = 1; 418 } 419 420 if (len > MB_CUR_MAX) { 421 errno = EILSEQ; 422 return ((size_t)-1); 423 } 424 425 /* This first check excludes CS1, which is implicitly valid. */ 426 if ((wc < 0xa100) || (wc > 0xffff)) { 427 /* Check for valid CS2 or CS3 */ 428 nm = (wc >> ((len - 1) * 8)); 429 if (nm == cs2) { 430 if (len != cs2width) { 431 errno = EILSEQ; 432 return ((size_t)-1); 433 } 434 } else if (nm == cs3) { 435 if (len != cs3width) { 436 errno = EILSEQ; 437 return ((size_t)-1); 438 } 439 } else { 440 errno = EILSEQ; 441 return ((size_t)-1); 442 } 443 } 444 445 /* Stash the bytes, least significant last */ 446 for (i = len - 1; i >= 0; i--) { 447 s[i] = (wc & 0xff); 448 wc >>= 8; 449 } 450 return (len); 451 } 452