1 /*- 2 * Copyright (c) 2003-2011 Tim Kientzle 3 * Copyright (c) 2011-2012 Michihiro NAKAJIMA 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "archive_platform.h" 28 __FBSDID("$FreeBSD$"); 29 30 /* 31 * Basic resizable string support, to simplify manipulating arbitrary-sized 32 * strings while minimizing heap activity. 33 * 34 * In particular, the buffer used by a string object is only grown, it 35 * never shrinks, so you can clear and reuse the same string object 36 * without incurring additional memory allocations. 
 */

#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#ifdef HAVE_ICONV_H
#include <iconv.h>
#endif
#ifdef HAVE_LANGINFO_H
#include <langinfo.h>
#endif
#ifdef HAVE_LOCALCHARSET_H
#include <localcharset.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#ifdef HAVE_WCHAR_H
#include <wchar.h>
#endif
#if defined(_WIN32) && !defined(__CYGWIN__)
#include <windows.h>
#include <locale.h>
#endif

#include "archive_endian.h"
#include "archive_private.h"
#include "archive_string.h"
#include "archive_string_composition.h"

/* Fallbacks for platforms without wmemcpy()/wmemmove(): copy as raw bytes. */
#if !defined(HAVE_WMEMCPY) && !defined(wmemcpy)
#define wmemcpy(a,b,i)  (wchar_t *)memcpy((a), (b), (i) * sizeof(wchar_t))
#endif

#if !defined(HAVE_WMEMMOVE) && !defined(wmemmove)
#define wmemmove(a,b,i)  (wchar_t *)memmove((a), (b), (i) * sizeof(wchar_t))
#endif

/*
 * A string conversion object.  It records one "from charset" ==> "to
 * charset" pair together with the converter functions selected for it
 * by setup_converter().
 */
struct archive_string_conv {
	/* Next object in the list kept by the archive object. */
	struct archive_string_conv	*next;
	/* Names of the source and destination charsets. */
	char				*from_charset;
	char				*to_charset;
	/* Code pages of the source and destination charsets. */
	unsigned			 from_cp;
	unsigned			 to_cp;
	/* Set 1 if from_charset and to_charset are the same. */
	int				 same;
	/* Bitwise OR of the SCONV_* values below. */
	int				 flag;
#define SCONV_TO_CHARSET	1	/* MBS is being converted to specified
					 * charset. */
#define SCONV_FROM_CHARSET	(1<<1)	/* MBS is being converted from
					 * specified charset. */
#define SCONV_BEST_EFFORT 	(1<<2)	/* Copy at least ASCII code. */
#define SCONV_WIN_CP	 	(1<<3)	/* Use Windows API for converting
					 * MBS. */
#define SCONV_UTF8_LIBARCHIVE_2 (1<<4)	/* Incorrect UTF-8 made by libarchive
					 * 2.x in the wrong assumption. */
#define SCONV_NORMALIZATION_C	(1<<6)	/* Need normalization to be Form C.
					 * Before UTF-8 characters are actually
					 * processed. */
#define SCONV_NORMALIZATION_D	(1<<7)	/* Need normalization to be Form D.
					 * Before UTF-8 characters are actually
					 * processed.
					 * Currently this is only for Mac OS X. */
#define SCONV_TO_UTF8		(1<<8)	/* "to charset" side is UTF-8. */
#define SCONV_FROM_UTF8		(1<<9)	/* "from charset" side is UTF-8. */
#define SCONV_TO_UTF16BE 	(1<<10)	/* "to charset" side is UTF-16BE. */
#define SCONV_FROM_UTF16BE 	(1<<11)	/* "from charset" side is UTF-16BE. */
#define SCONV_TO_UTF16LE 	(1<<12)	/* "to charset" side is UTF-16LE. */
#define SCONV_FROM_UTF16LE 	(1<<13)	/* "from charset" side is UTF-16LE. */
#define SCONV_TO_UTF16		(SCONV_TO_UTF16BE | SCONV_TO_UTF16LE)
#define SCONV_FROM_UTF16	(SCONV_FROM_UTF16BE | SCONV_FROM_UTF16LE)

#if HAVE_ICONV
	/* iconv descriptor; compared against (iconv_t)-1 before use. */
	iconv_t				 cd;
	iconv_t				 cd_w;/* Use at archive_mstring on
					       * Windows. */
#endif
	/* A temporary buffer for normalization. */
	struct archive_string		 utftmp;
	/*
	 * Converter functions registered through add_converter();
	 * nconverter counts how many of the two slots are in use.
	 */
	int (*converter[2])(struct archive_string *, const void *, size_t,
	    struct archive_string_conv *);
	int				 nconverter;
};

#define CP_C_LOCALE	0	/* "C" locale only for this file. */
#define CP_UTF16LE	1200
#define CP_UTF16BE	1201

/* Range tests for UTF-16 surrogate code units. */
#define IS_HIGH_SURROGATE_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDBFF)
#define IS_LOW_SURROGATE_LA(uc)	 ((uc) >= 0xDC00 && (uc) <= 0xDFFF)
#define IS_SURROGATE_PAIR_LA(uc) ((uc) >= 0xD800 && (uc) <= 0xDFFF)
#define UNICODE_MAX		0x10FFFF
#define UNICODE_R_CHAR		0xFFFD	/* Replacement character. */
/* Set U+FFFD(Replacement character) in UTF-8.
*/ 134 static const char utf8_replacement_char[] = {0xef, 0xbf, 0xbd}; 135 136 static struct archive_string_conv *find_sconv_object(struct archive *, 137 const char *, const char *); 138 static void add_sconv_object(struct archive *, struct archive_string_conv *); 139 static struct archive_string_conv *create_sconv_object(const char *, 140 const char *, unsigned, int); 141 static void free_sconv_object(struct archive_string_conv *); 142 static struct archive_string_conv *get_sconv_object(struct archive *, 143 const char *, const char *, int); 144 static unsigned make_codepage_from_charset(const char *); 145 static unsigned get_current_codepage(void); 146 static unsigned get_current_oemcp(void); 147 static size_t mbsnbytes(const void *, size_t); 148 static size_t utf16nbytes(const void *, size_t); 149 #if defined(_WIN32) && !defined(__CYGWIN__) 150 static int archive_wstring_append_from_mbs_in_codepage( 151 struct archive_wstring *, const char *, size_t, 152 struct archive_string_conv *); 153 static int archive_string_append_from_wcs_in_codepage(struct archive_string *, 154 const wchar_t *, size_t, struct archive_string_conv *); 155 static int is_big_endian(void); 156 static int strncat_in_codepage(struct archive_string *, const void *, 157 size_t, struct archive_string_conv *); 158 static int win_strncat_from_utf16be(struct archive_string *, const void *, 159 size_t, struct archive_string_conv *); 160 static int win_strncat_from_utf16le(struct archive_string *, const void *, 161 size_t, struct archive_string_conv *); 162 static int win_strncat_to_utf16be(struct archive_string *, const void *, 163 size_t, struct archive_string_conv *); 164 static int win_strncat_to_utf16le(struct archive_string *, const void *, 165 size_t, struct archive_string_conv *); 166 #endif 167 static int best_effort_strncat_from_utf16be(struct archive_string *, 168 const void *, size_t, struct archive_string_conv *); 169 static int best_effort_strncat_from_utf16le(struct archive_string *, 
170 const void *, size_t, struct archive_string_conv *); 171 static int best_effort_strncat_to_utf16be(struct archive_string *, 172 const void *, size_t, struct archive_string_conv *); 173 static int best_effort_strncat_to_utf16le(struct archive_string *, 174 const void *, size_t, struct archive_string_conv *); 175 #if defined(HAVE_ICONV) 176 static int iconv_strncat_in_locale(struct archive_string *, const void *, 177 size_t, struct archive_string_conv *); 178 #endif 179 static int best_effort_strncat_in_locale(struct archive_string *, 180 const void *, size_t, struct archive_string_conv *); 181 static int _utf8_to_unicode(uint32_t *, const char *, size_t); 182 static int utf8_to_unicode(uint32_t *, const char *, size_t); 183 static inline uint32_t combine_surrogate_pair(uint32_t, uint32_t); 184 static int cesu8_to_unicode(uint32_t *, const char *, size_t); 185 static size_t unicode_to_utf8(char *, size_t, uint32_t); 186 static int utf16_to_unicode(uint32_t *, const char *, size_t, int); 187 static size_t unicode_to_utf16be(char *, size_t, uint32_t); 188 static size_t unicode_to_utf16le(char *, size_t, uint32_t); 189 static int strncat_from_utf8_libarchive2(struct archive_string *, 190 const void *, size_t, struct archive_string_conv *); 191 static int strncat_from_utf8_to_utf8(struct archive_string *, const void *, 192 size_t, struct archive_string_conv *); 193 static int archive_string_normalize_C(struct archive_string *, const void *, 194 size_t, struct archive_string_conv *); 195 static int archive_string_normalize_D(struct archive_string *, const void *, 196 size_t, struct archive_string_conv *); 197 static int archive_string_append_unicode(struct archive_string *, 198 const void *, size_t, struct archive_string_conv *); 199 200 static struct archive_string * 201 archive_string_append(struct archive_string *as, const char *p, size_t s) 202 { 203 if (archive_string_ensure(as, as->length + s + 1) == NULL) 204 return (NULL); 205 memmove(as->s + as->length, p, 
s); 206 as->length += s; 207 as->s[as->length] = 0; 208 return (as); 209 } 210 211 static struct archive_wstring * 212 archive_wstring_append(struct archive_wstring *as, const wchar_t *p, size_t s) 213 { 214 if (archive_wstring_ensure(as, as->length + s + 1) == NULL) 215 return (NULL); 216 wmemmove(as->s + as->length, p, s); 217 as->length += s; 218 as->s[as->length] = 0; 219 return (as); 220 } 221 222 struct archive_string * 223 archive_array_append(struct archive_string *as, const char *p, size_t s) 224 { 225 return archive_string_append(as, p, s); 226 } 227 228 void 229 archive_string_concat(struct archive_string *dest, struct archive_string *src) 230 { 231 if (archive_string_append(dest, src->s, src->length) == NULL) 232 __archive_errx(1, "Out of memory"); 233 } 234 235 void 236 archive_wstring_concat(struct archive_wstring *dest, 237 struct archive_wstring *src) 238 { 239 if (archive_wstring_append(dest, src->s, src->length) == NULL) 240 __archive_errx(1, "Out of memory"); 241 } 242 243 void 244 archive_string_free(struct archive_string *as) 245 { 246 as->length = 0; 247 as->buffer_length = 0; 248 free(as->s); 249 as->s = NULL; 250 } 251 252 void 253 archive_wstring_free(struct archive_wstring *as) 254 { 255 as->length = 0; 256 as->buffer_length = 0; 257 free(as->s); 258 as->s = NULL; 259 } 260 261 struct archive_wstring * 262 archive_wstring_ensure(struct archive_wstring *as, size_t s) 263 { 264 return (struct archive_wstring *) 265 archive_string_ensure((struct archive_string *)as, 266 s * sizeof(wchar_t)); 267 } 268 269 /* Returns NULL on any allocation failure. */ 270 struct archive_string * 271 archive_string_ensure(struct archive_string *as, size_t s) 272 { 273 char *p; 274 size_t new_length; 275 276 /* If buffer is already big enough, don't reallocate. 
*/ 277 if (as->s && (s <= as->buffer_length)) 278 return (as); 279 280 /* 281 * Growing the buffer at least exponentially ensures that 282 * append operations are always linear in the number of 283 * characters appended. Using a smaller growth rate for 284 * larger buffers reduces memory waste somewhat at the cost of 285 * a larger constant factor. 286 */ 287 if (as->buffer_length < 32) 288 /* Start with a minimum 32-character buffer. */ 289 new_length = 32; 290 else if (as->buffer_length < 8192) 291 /* Buffers under 8k are doubled for speed. */ 292 new_length = as->buffer_length + as->buffer_length; 293 else { 294 /* Buffers 8k and over grow by at least 25% each time. */ 295 new_length = as->buffer_length + as->buffer_length / 4; 296 /* Be safe: If size wraps, fail. */ 297 if (new_length < as->buffer_length) { 298 /* On failure, wipe the string and return NULL. */ 299 archive_string_free(as); 300 errno = ENOMEM;/* Make sure errno has ENOMEM. */ 301 return (NULL); 302 } 303 } 304 /* 305 * The computation above is a lower limit to how much we'll 306 * grow the buffer. In any case, we have to grow it enough to 307 * hold the request. 308 */ 309 if (new_length < s) 310 new_length = s; 311 /* Now we can reallocate the buffer. */ 312 p = (char *)realloc(as->s, new_length); 313 if (p == NULL) { 314 /* On failure, wipe the string and return NULL. */ 315 archive_string_free(as); 316 errno = ENOMEM;/* Make sure errno has ENOMEM. */ 317 return (NULL); 318 } 319 320 as->s = p; 321 as->buffer_length = new_length; 322 return (as); 323 } 324 325 /* 326 * TODO: See if there's a way to avoid scanning 327 * the source string twice. Then test to see 328 * if it actually helps (remember that we're almost 329 * always called with pretty short arguments, so 330 * such an optimization might not help). 
331 */ 332 struct archive_string * 333 archive_strncat(struct archive_string *as, const void *_p, size_t n) 334 { 335 size_t s; 336 const char *p, *pp; 337 338 p = (const char *)_p; 339 340 /* Like strlen(p), except won't examine positions beyond p[n]. */ 341 s = 0; 342 pp = p; 343 while (s < n && *pp) { 344 pp++; 345 s++; 346 } 347 if ((as = archive_string_append(as, p, s)) == NULL) 348 __archive_errx(1, "Out of memory"); 349 return (as); 350 } 351 352 struct archive_wstring * 353 archive_wstrncat(struct archive_wstring *as, const wchar_t *p, size_t n) 354 { 355 size_t s; 356 const wchar_t *pp; 357 358 /* Like strlen(p), except won't examine positions beyond p[n]. */ 359 s = 0; 360 pp = p; 361 while (s < n && *pp) { 362 pp++; 363 s++; 364 } 365 if ((as = archive_wstring_append(as, p, s)) == NULL) 366 __archive_errx(1, "Out of memory"); 367 return (as); 368 } 369 370 struct archive_string * 371 archive_strcat(struct archive_string *as, const void *p) 372 { 373 /* strcat is just strncat without an effective limit. 374 * Assert that we'll never get called with a source 375 * string over 16MB. 376 * TODO: Review all uses of strcat in the source 377 * and try to replace them with strncat(). 378 */ 379 return archive_strncat(as, p, 0x1000000); 380 } 381 382 struct archive_wstring * 383 archive_wstrcat(struct archive_wstring *as, const wchar_t *p) 384 { 385 /* Ditto. */ 386 return archive_wstrncat(as, p, 0x1000000); 387 } 388 389 struct archive_string * 390 archive_strappend_char(struct archive_string *as, char c) 391 { 392 if ((as = archive_string_append(as, &c, 1)) == NULL) 393 __archive_errx(1, "Out of memory"); 394 return (as); 395 } 396 397 struct archive_wstring * 398 archive_wstrappend_wchar(struct archive_wstring *as, wchar_t c) 399 { 400 if ((as = archive_wstring_append(as, &c, 1)) == NULL) 401 __archive_errx(1, "Out of memory"); 402 return (as); 403 } 404 405 /* 406 * Get the "current character set" name to use with iconv. 
407 * On FreeBSD, the empty character set name "" chooses 408 * the correct character encoding for the current locale, 409 * so this isn't necessary. 410 * But iconv on Mac OS 10.6 doesn't seem to handle this correctly; 411 * on that system, we have to explicitly call nl_langinfo() 412 * to get the right name. Not sure about other platforms. 413 * 414 * NOTE: GNU libiconv does not recognize the character-set name 415 * which some platform nl_langinfo(CODESET) returns, so we should 416 * use locale_charset() instead of nl_langinfo(CODESET) for GNU libiconv. 417 */ 418 static const char * 419 default_iconv_charset(const char *charset) { 420 if (charset != NULL && charset[0] != '\0') 421 return charset; 422 #if HAVE_LOCALE_CHARSET && !defined(__APPLE__) 423 /* locale_charset() is broken on Mac OS */ 424 return locale_charset(); 425 #elif HAVE_NL_LANGINFO 426 return nl_langinfo(CODESET); 427 #else 428 return ""; 429 #endif 430 } 431 432 #if defined(_WIN32) && !defined(__CYGWIN__) 433 434 /* 435 * Convert MBS to WCS. 436 * Note: returns -1 if conversion fails. 437 */ 438 int 439 archive_wstring_append_from_mbs(struct archive_wstring *dest, 440 const char *p, size_t len) 441 { 442 return archive_wstring_append_from_mbs_in_codepage(dest, p, len, NULL); 443 } 444 445 static int 446 archive_wstring_append_from_mbs_in_codepage(struct archive_wstring *dest, 447 const char *s, size_t length, struct archive_string_conv *sc) 448 { 449 int count, ret = 0; 450 UINT from_cp; 451 452 if (sc != NULL) 453 from_cp = sc->from_cp; 454 else 455 from_cp = get_current_codepage(); 456 457 if (from_cp == CP_C_LOCALE) { 458 /* 459 * "C" locale special process. 
460 */ 461 wchar_t *ws; 462 const unsigned char *mp; 463 464 if (NULL == archive_wstring_ensure(dest, 465 dest->length + length + 1)) 466 return (-1); 467 468 ws = dest->s + dest->length; 469 mp = (const unsigned char *)s; 470 count = 0; 471 while (count < (int)length && *mp) { 472 *ws++ = (wchar_t)*mp++; 473 count++; 474 } 475 } else if (sc != NULL && 476 (sc->flag & (SCONV_NORMALIZATION_C | SCONV_NORMALIZATION_D))) { 477 /* 478 * Normalize UTF-8 and UTF-16BE and convert it directly 479 * to UTF-16 as wchar_t. 480 */ 481 struct archive_string u16; 482 int saved_flag = sc->flag;/* save current flag. */ 483 484 if (is_big_endian()) 485 sc->flag |= SCONV_TO_UTF16BE; 486 else 487 sc->flag |= SCONV_TO_UTF16LE; 488 489 if (sc->flag & SCONV_FROM_UTF16) { 490 /* 491 * UTF-16BE/LE NFD ===> UTF-16 NFC 492 * UTF-16BE/LE NFC ===> UTF-16 NFD 493 */ 494 count = (int)utf16nbytes(s, length); 495 } else { 496 /* 497 * UTF-8 NFD ===> UTF-16 NFC 498 * UTF-8 NFC ===> UTF-16 NFD 499 */ 500 count = (int)mbsnbytes(s, length); 501 } 502 u16.s = (char *)dest->s; 503 u16.length = dest->length << 1;; 504 u16.buffer_length = dest->buffer_length; 505 if (sc->flag & SCONV_NORMALIZATION_C) 506 ret = archive_string_normalize_C(&u16, s, count, sc); 507 else 508 ret = archive_string_normalize_D(&u16, s, count, sc); 509 dest->s = (wchar_t *)u16.s; 510 dest->length = u16.length >> 1; 511 dest->buffer_length = u16.buffer_length; 512 sc->flag = saved_flag;/* restore the saved flag. */ 513 return (ret); 514 } else if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) { 515 count = (int)utf16nbytes(s, length); 516 count >>= 1; /* to be WCS length */ 517 /* Allocate memory for WCS. 
*/ 518 if (NULL == archive_wstring_ensure(dest, 519 dest->length + count + 1)) 520 return (-1); 521 wmemcpy(dest->s + dest->length, (const wchar_t *)s, count); 522 if ((sc->flag & SCONV_FROM_UTF16BE) && !is_big_endian()) { 523 uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 524 int b; 525 for (b = 0; b < count; b++) { 526 uint16_t val = archive_le16dec(u16+b); 527 archive_be16enc(u16+b, val); 528 } 529 } else if ((sc->flag & SCONV_FROM_UTF16LE) && is_big_endian()) { 530 uint16_t *u16 = (uint16_t *)(dest->s + dest->length); 531 int b; 532 for (b = 0; b < count; b++) { 533 uint16_t val = archive_be16dec(u16+b); 534 archive_le16enc(u16+b, val); 535 } 536 } 537 } else { 538 DWORD mbflag; 539 size_t buffsize; 540 541 if (sc == NULL) 542 mbflag = 0; 543 else if (sc->flag & SCONV_FROM_CHARSET) { 544 /* Do not trust the length which comes from 545 * an archive file. */ 546 length = mbsnbytes(s, length); 547 mbflag = 0; 548 } else 549 mbflag = MB_PRECOMPOSED; 550 551 buffsize = dest->length + length + 1; 552 do { 553 /* Allocate memory for WCS. */ 554 if (NULL == archive_wstring_ensure(dest, buffsize)) 555 return (-1); 556 /* Convert MBS to WCS. */ 557 count = MultiByteToWideChar(from_cp, 558 mbflag, s, (int)length, dest->s + dest->length, 559 (int)(dest->buffer_length >> 1) -1); 560 if (count == 0 && 561 GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 562 /* Expand the WCS buffer. */ 563 buffsize = dest->buffer_length << 1; 564 continue; 565 } 566 if (count == 0 && length != 0) 567 ret = -1; 568 break; 569 } while (1); 570 } 571 dest->length += count; 572 dest->s[dest->length] = L'\0'; 573 return (ret); 574 } 575 576 #else 577 578 /* 579 * Convert MBS to WCS. 580 * Note: returns -1 if conversion fails. 
581 */ 582 int 583 archive_wstring_append_from_mbs(struct archive_wstring *dest, 584 const char *p, size_t len) 585 { 586 size_t r; 587 int ret_val = 0; 588 /* 589 * No single byte will be more than one wide character, 590 * so this length estimate will always be big enough. 591 */ 592 size_t wcs_length = len; 593 size_t mbs_length = len; 594 const char *mbs = p; 595 wchar_t *wcs; 596 #if HAVE_MBRTOWC 597 mbstate_t shift_state; 598 599 memset(&shift_state, 0, sizeof(shift_state)); 600 #endif 601 if (NULL == archive_wstring_ensure(dest, dest->length + wcs_length + 1)) 602 return (-1); 603 wcs = dest->s + dest->length; 604 /* 605 * We cannot use mbsrtowcs/mbstowcs here because those may convert 606 * extra MBS when strlen(p) > len and one wide character consists of 607 * multi bytes. 608 */ 609 while (*mbs && mbs_length > 0) { 610 if (wcs_length == 0) { 611 dest->length = wcs - dest->s; 612 dest->s[dest->length] = L'\0'; 613 wcs_length = mbs_length; 614 if (NULL == archive_wstring_ensure(dest, 615 dest->length + wcs_length + 1)) 616 return (-1); 617 wcs = dest->s + dest->length; 618 } 619 #if HAVE_MBRTOWC 620 r = mbrtowc(wcs, mbs, wcs_length, &shift_state); 621 #else 622 r = mbtowc(wcs, mbs, wcs_length); 623 #endif 624 if (r == (size_t)-1 || r == (size_t)-2) { 625 ret_val = -1; 626 if (errno == EILSEQ) { 627 ++mbs; 628 --mbs_length; 629 continue; 630 } else 631 break; 632 } 633 if (r == 0 || r > mbs_length) 634 break; 635 wcs++; 636 wcs_length--; 637 mbs += r; 638 mbs_length -= r; 639 } 640 dest->length = wcs - dest->s; 641 dest->s[dest->length] = L'\0'; 642 return (ret_val); 643 } 644 645 #endif 646 647 #if defined(_WIN32) && !defined(__CYGWIN__) 648 649 /* 650 * WCS ==> MBS. 651 * Note: returns -1 if conversion fails. 652 * 653 * Win32 builds use WideCharToMultiByte from the Windows API. 654 * (Maybe Cygwin should too? WideCharToMultiByte will know a 655 * lot more about local character encodings than the wcrtomb() 656 * wrapper is going to know.) 
657 */ 658 int 659 archive_string_append_from_wcs(struct archive_string *as, 660 const wchar_t *w, size_t len) 661 { 662 return archive_string_append_from_wcs_in_codepage(as, w, len, NULL); 663 } 664 665 static int 666 archive_string_append_from_wcs_in_codepage(struct archive_string *as, 667 const wchar_t *ws, size_t len, struct archive_string_conv *sc) 668 { 669 BOOL defchar_used, *dp; 670 int count, ret = 0; 671 UINT to_cp; 672 int wslen = (int)len; 673 674 if (sc != NULL) 675 to_cp = sc->to_cp; 676 else 677 to_cp = get_current_codepage(); 678 679 if (to_cp == CP_C_LOCALE) { 680 /* 681 * "C" locale special process. 682 */ 683 const wchar_t *wp = ws; 684 char *p; 685 686 if (NULL == archive_string_ensure(as, 687 as->length + wslen +1)) 688 return (-1); 689 p = as->s + as->length; 690 count = 0; 691 defchar_used = 0; 692 while (count < wslen && *wp) { 693 if (*wp > 255) { 694 *p++ = '?'; 695 wp++; 696 defchar_used = 1; 697 } else 698 *p++ = (char)*wp++; 699 count++; 700 } 701 } else if (sc != NULL && (sc->flag & SCONV_TO_UTF16)) { 702 uint16_t *u16; 703 704 if (NULL == 705 archive_string_ensure(as, as->length + len * 2 + 2)) 706 return (-1); 707 u16 = (uint16_t *)(as->s + as->length); 708 count = 0; 709 defchar_used = 0; 710 if (sc->flag & SCONV_TO_UTF16BE) { 711 while (count < (int)len && *ws) { 712 archive_be16enc(u16+count, *ws); 713 ws++; 714 count++; 715 } 716 } else { 717 while (count < (int)len && *ws) { 718 archive_le16enc(u16+count, *ws); 719 ws++; 720 count++; 721 } 722 } 723 count <<= 1; /* to be byte size */ 724 } else { 725 /* Make sure the MBS buffer has plenty to set. 
*/ 726 if (NULL == 727 archive_string_ensure(as, as->length + len * 2 + 1)) 728 return (-1); 729 do { 730 defchar_used = 0; 731 if (to_cp == CP_UTF8 || sc == NULL) 732 dp = NULL; 733 else 734 dp = &defchar_used; 735 count = WideCharToMultiByte(to_cp, 0, ws, wslen, 736 as->s + as->length, (int)as->buffer_length-1, NULL, dp); 737 if (count == 0 && 738 GetLastError() == ERROR_INSUFFICIENT_BUFFER) { 739 /* Expand the MBS buffer and retry. */ 740 if (NULL == archive_string_ensure(as, 741 as->buffer_length + len)) 742 return (-1); 743 continue; 744 } 745 if (count == 0) 746 ret = -1; 747 break; 748 } while (1); 749 } 750 as->length += count; 751 as->s[as->length] = '\0'; 752 return (defchar_used?-1:ret); 753 } 754 755 #elif defined(HAVE_WCTOMB) || defined(HAVE_WCRTOMB) 756 757 /* 758 * Translates a wide character string into current locale character set 759 * and appends to the archive_string. Note: returns -1 if conversion 760 * fails. 761 */ 762 int 763 archive_string_append_from_wcs(struct archive_string *as, 764 const wchar_t *w, size_t len) 765 { 766 /* We cannot use the standard wcstombs() here because it 767 * cannot tell us how big the output buffer should be. So 768 * I've built a loop around wcrtomb() or wctomb() that 769 * converts a character at a time and resizes the string as 770 * needed. We prefer wcrtomb() when it's available because 771 * it's thread-safe. */ 772 int n, ret_val = 0; 773 char *p; 774 char *end; 775 #if HAVE_WCRTOMB 776 mbstate_t shift_state; 777 778 memset(&shift_state, 0, sizeof(shift_state)); 779 #else 780 /* Clear the shift state before starting. */ 781 wctomb(NULL, L'\0'); 782 #endif 783 /* 784 * Allocate buffer for MBS. 785 * We need this allocation here since it is possible that 786 * as->s is still NULL. 
787 */ 788 if (archive_string_ensure(as, as->length + len + 1) == NULL) 789 return (-1); 790 791 p = as->s + as->length; 792 end = as->s + as->buffer_length - MB_CUR_MAX -1; 793 while (*w != L'\0' && len > 0) { 794 if (p >= end) { 795 as->length = p - as->s; 796 as->s[as->length] = '\0'; 797 /* Re-allocate buffer for MBS. */ 798 if (archive_string_ensure(as, 799 as->length + len * 2 + 1) == NULL) 800 return (-1); 801 p = as->s + as->length; 802 end = as->s + as->buffer_length - MB_CUR_MAX -1; 803 } 804 #if HAVE_WCRTOMB 805 n = wcrtomb(p, *w++, &shift_state); 806 #else 807 n = wctomb(p, *w++); 808 #endif 809 if (n == -1) { 810 if (errno == EILSEQ) { 811 /* Skip an illegal wide char. */ 812 *p++ = '?'; 813 ret_val = -1; 814 } else { 815 ret_val = -1; 816 break; 817 } 818 } else 819 p += n; 820 len--; 821 } 822 as->length = p - as->s; 823 as->s[as->length] = '\0'; 824 return (ret_val); 825 } 826 827 #else /* HAVE_WCTOMB || HAVE_WCRTOMB */ 828 829 /* 830 * TODO: Test if __STDC_ISO_10646__ is defined. 831 * Non-Windows uses ISO C wcrtomb() or wctomb() to perform the conversion 832 * one character at a time. If a non-Windows platform doesn't have 833 * either of these, fall back to the built-in UTF8 conversion. 834 */ 835 int 836 archive_string_append_from_wcs(struct archive_string *as, 837 const wchar_t *w, size_t len) 838 { 839 (void)as;/* UNUSED */ 840 (void)w;/* UNUSED */ 841 (void)len;/* UNUSED */ 842 errno = ENOSYS; 843 return (-1); 844 } 845 846 #endif /* HAVE_WCTOMB || HAVE_WCRTOMB */ 847 848 /* 849 * Find a string conversion object by a pair of 'from' charset name 850 * and 'to' charset name from an archive object. 851 * Return NULL if not found. 
852 */ 853 static struct archive_string_conv * 854 find_sconv_object(struct archive *a, const char *fc, const char *tc) 855 { 856 struct archive_string_conv *sc; 857 858 if (a == NULL) 859 return (NULL); 860 861 for (sc = a->sconv; sc != NULL; sc = sc->next) { 862 if (strcmp(sc->from_charset, fc) == 0 && 863 strcmp(sc->to_charset, tc) == 0) 864 break; 865 } 866 return (sc); 867 } 868 869 /* 870 * Register a string object to an archive object. 871 */ 872 static void 873 add_sconv_object(struct archive *a, struct archive_string_conv *sc) 874 { 875 struct archive_string_conv **psc; 876 877 /* Add a new sconv to sconv list. */ 878 psc = &(a->sconv); 879 while (*psc != NULL) 880 psc = &((*psc)->next); 881 *psc = sc; 882 } 883 884 static void 885 add_converter(struct archive_string_conv *sc, int (*converter) 886 (struct archive_string *, const void *, size_t, 887 struct archive_string_conv *)) 888 { 889 if (sc == NULL || sc->nconverter >= 2) 890 __archive_errx(1, "Programing error"); 891 sc->converter[sc->nconverter++] = converter; 892 } 893 894 static void 895 setup_converter(struct archive_string_conv *sc) 896 { 897 898 /* Reset. */ 899 sc->nconverter = 0; 900 901 /* 902 * Perform special sequence for the incorrect UTF-8 filenames 903 * made by libarchive2.x. 904 */ 905 if (sc->flag & SCONV_UTF8_LIBARCHIVE_2) { 906 add_converter(sc, strncat_from_utf8_libarchive2); 907 return; 908 } 909 910 /* 911 * Convert a string to UTF-16BE/LE. 912 */ 913 if (sc->flag & SCONV_TO_UTF16) { 914 /* 915 * If the current locale is UTF-8, we can translate 916 * a UTF-8 string into a UTF-16BE string. 
917 */ 918 if (sc->flag & SCONV_FROM_UTF8) { 919 add_converter(sc, archive_string_append_unicode); 920 return; 921 } 922 923 #if defined(_WIN32) && !defined(__CYGWIN__) 924 if (sc->flag & SCONV_WIN_CP) { 925 if (sc->flag & SCONV_TO_UTF16BE) 926 add_converter(sc, win_strncat_to_utf16be); 927 else 928 add_converter(sc, win_strncat_to_utf16le); 929 return; 930 } 931 #endif 932 933 #if defined(HAVE_ICONV) 934 if (sc->cd != (iconv_t)-1) { 935 add_converter(sc, iconv_strncat_in_locale); 936 return; 937 } 938 #endif 939 940 if (sc->flag & SCONV_BEST_EFFORT) { 941 if (sc->flag & SCONV_TO_UTF16BE) 942 add_converter(sc, 943 best_effort_strncat_to_utf16be); 944 else 945 add_converter(sc, 946 best_effort_strncat_to_utf16le); 947 } else 948 /* Make sure we have no converter. */ 949 sc->nconverter = 0; 950 return; 951 } 952 953 /* 954 * Convert a string from UTF-16BE/LE. 955 */ 956 if (sc->flag & SCONV_FROM_UTF16) { 957 /* 958 * At least we should normalize a UTF-16BE string. 959 */ 960 if (sc->flag & SCONV_NORMALIZATION_D) 961 add_converter(sc,archive_string_normalize_D); 962 else if (sc->flag & SCONV_NORMALIZATION_C) 963 add_converter(sc, archive_string_normalize_C); 964 965 if (sc->flag & SCONV_TO_UTF8) { 966 /* 967 * If the current locale is UTF-8, we can translate 968 * a UTF-16BE/LE string into a UTF-8 string directly. 
969 */ 970 if (!(sc->flag & 971 (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 972 add_converter(sc, 973 archive_string_append_unicode); 974 return; 975 } 976 977 #if defined(_WIN32) && !defined(__CYGWIN__) 978 if (sc->flag & SCONV_WIN_CP) { 979 if (sc->flag & SCONV_FROM_UTF16BE) 980 add_converter(sc, win_strncat_from_utf16be); 981 else 982 add_converter(sc, win_strncat_from_utf16le); 983 return; 984 } 985 #endif 986 987 #if defined(HAVE_ICONV) 988 if (sc->cd != (iconv_t)-1) { 989 add_converter(sc, iconv_strncat_in_locale); 990 return; 991 } 992 #endif 993 994 if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 995 == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16BE)) 996 add_converter(sc, best_effort_strncat_from_utf16be); 997 else if ((sc->flag & (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 998 == (SCONV_BEST_EFFORT | SCONV_FROM_UTF16LE)) 999 add_converter(sc, best_effort_strncat_from_utf16le); 1000 else 1001 /* Make sure we have no converter. */ 1002 sc->nconverter = 0; 1003 return; 1004 } 1005 1006 if (sc->flag & SCONV_FROM_UTF8) { 1007 /* 1008 * At least we should normalize a UTF-8 string. 1009 */ 1010 if (sc->flag & SCONV_NORMALIZATION_D) 1011 add_converter(sc,archive_string_normalize_D); 1012 else if (sc->flag & SCONV_NORMALIZATION_C) 1013 add_converter(sc, archive_string_normalize_C); 1014 1015 /* 1016 * Copy UTF-8 string with a check of CESU-8. 1017 * Apparently, iconv does not check surrogate pairs in UTF-8 1018 * when both from-charset and to-charset are UTF-8, and then 1019 * we use our UTF-8 copy code. 1020 */ 1021 if (sc->flag & SCONV_TO_UTF8) { 1022 /* 1023 * If the current locale is UTF-8, we can translate 1024 * a UTF-16BE string into a UTF-8 string directly. 1025 */ 1026 if (!(sc->flag & 1027 (SCONV_NORMALIZATION_D |SCONV_NORMALIZATION_C))) 1028 add_converter(sc, strncat_from_utf8_to_utf8); 1029 return; 1030 } 1031 } 1032 1033 #if defined(_WIN32) && !defined(__CYGWIN__) 1034 /* 1035 * On Windows we can use Windows API for a string conversion. 
1036 */ 1037 if (sc->flag & SCONV_WIN_CP) { 1038 add_converter(sc, strncat_in_codepage); 1039 return; 1040 } 1041 #endif 1042 1043 #if HAVE_ICONV 1044 if (sc->cd != (iconv_t)-1) { 1045 add_converter(sc, iconv_strncat_in_locale); 1046 /* 1047 * iconv generally does not support UTF-8-MAC and so 1048 * we have to the output of iconv from NFC to NFD if 1049 * need. 1050 */ 1051 if ((sc->flag & SCONV_FROM_CHARSET) && 1052 (sc->flag & SCONV_TO_UTF8)) { 1053 if (sc->flag & SCONV_NORMALIZATION_D) 1054 add_converter(sc, archive_string_normalize_D); 1055 } 1056 return; 1057 } 1058 #endif 1059 1060 /* 1061 * Try conversion in the best effort or no conversion. 1062 */ 1063 if ((sc->flag & SCONV_BEST_EFFORT) || sc->same) 1064 add_converter(sc, best_effort_strncat_in_locale); 1065 else 1066 /* Make sure we have no converter. */ 1067 sc->nconverter = 0; 1068 } 1069 1070 /* 1071 * Return canonicalized charset-name but this supports just UTF-8, UTF-16BE 1072 * and CP932 which are referenced in create_sconv_object(). 1073 */ 1074 static const char * 1075 canonical_charset_name(const char *charset) 1076 { 1077 char cs[16]; 1078 char *p; 1079 const char *s; 1080 1081 if (charset == NULL || charset[0] == '\0' 1082 || strlen(charset) > 15) 1083 return (charset); 1084 1085 /* Copy name to uppercase. */ 1086 p = cs; 1087 s = charset; 1088 while (*s) { 1089 char c = *s++; 1090 if (c >= 'a' && c <= 'z') 1091 c -= 'a' - 'A'; 1092 *p++ = c; 1093 } 1094 *p++ = '\0'; 1095 1096 if (strcmp(cs, "UTF-8") == 0 || 1097 strcmp(cs, "UTF8") == 0) 1098 return ("UTF-8"); 1099 if (strcmp(cs, "UTF-16BE") == 0 || 1100 strcmp(cs, "UTF16BE") == 0) 1101 return ("UTF-16BE"); 1102 if (strcmp(cs, "UTF-16LE") == 0 || 1103 strcmp(cs, "UTF16LE") == 0) 1104 return ("UTF-16LE"); 1105 if (strcmp(cs, "CP932") == 0) 1106 return ("CP932"); 1107 return (charset); 1108 } 1109 1110 /* 1111 * Create a string conversion object. 
1112 */ 1113 static struct archive_string_conv * 1114 create_sconv_object(const char *fc, const char *tc, 1115 unsigned current_codepage, int flag) 1116 { 1117 struct archive_string_conv *sc; 1118 1119 sc = calloc(1, sizeof(*sc)); 1120 if (sc == NULL) 1121 return (NULL); 1122 sc->next = NULL; 1123 sc->from_charset = strdup(fc); 1124 if (sc->from_charset == NULL) { 1125 free(sc); 1126 return (NULL); 1127 } 1128 sc->to_charset = strdup(tc); 1129 if (sc->to_charset == NULL) { 1130 free(sc->from_charset); 1131 free(sc); 1132 return (NULL); 1133 } 1134 archive_string_init(&sc->utftmp); 1135 1136 if (flag & SCONV_TO_CHARSET) { 1137 /* 1138 * Convert characters from the current locale charset to 1139 * a specified charset. 1140 */ 1141 sc->from_cp = current_codepage; 1142 sc->to_cp = make_codepage_from_charset(tc); 1143 #if defined(_WIN32) && !defined(__CYGWIN__) 1144 if (IsValidCodePage(sc->to_cp)) 1145 flag |= SCONV_WIN_CP; 1146 #endif 1147 } else if (flag & SCONV_FROM_CHARSET) { 1148 /* 1149 * Convert characters from a specified charset to 1150 * the current locale charset. 1151 */ 1152 sc->to_cp = current_codepage; 1153 sc->from_cp = make_codepage_from_charset(fc); 1154 #if defined(_WIN32) && !defined(__CYGWIN__) 1155 if (IsValidCodePage(sc->from_cp)) 1156 flag |= SCONV_WIN_CP; 1157 #endif 1158 } 1159 1160 /* 1161 * Check if "from charset" and "to charset" are the same. 1162 */ 1163 if (strcmp(fc, tc) == 0 || 1164 (sc->from_cp != (unsigned)-1 && sc->from_cp == sc->to_cp)) 1165 sc->same = 1; 1166 else 1167 sc->same = 0; 1168 1169 /* 1170 * Mark if "from charset" or "to charset" are UTF-8 or UTF-16BE/LE. 
1171 */ 1172 if (strcmp(tc, "UTF-8") == 0) 1173 flag |= SCONV_TO_UTF8; 1174 else if (strcmp(tc, "UTF-16BE") == 0) 1175 flag |= SCONV_TO_UTF16BE; 1176 else if (strcmp(tc, "UTF-16LE") == 0) 1177 flag |= SCONV_TO_UTF16LE; 1178 if (strcmp(fc, "UTF-8") == 0) 1179 flag |= SCONV_FROM_UTF8; 1180 else if (strcmp(fc, "UTF-16BE") == 0) 1181 flag |= SCONV_FROM_UTF16BE; 1182 else if (strcmp(fc, "UTF-16LE") == 0) 1183 flag |= SCONV_FROM_UTF16LE; 1184 #if defined(_WIN32) && !defined(__CYGWIN__) 1185 if (sc->to_cp == CP_UTF8) 1186 flag |= SCONV_TO_UTF8; 1187 else if (sc->to_cp == CP_UTF16BE) 1188 flag |= SCONV_TO_UTF16BE | SCONV_WIN_CP; 1189 else if (sc->to_cp == CP_UTF16LE) 1190 flag |= SCONV_TO_UTF16LE | SCONV_WIN_CP; 1191 if (sc->from_cp == CP_UTF8) 1192 flag |= SCONV_FROM_UTF8; 1193 else if (sc->from_cp == CP_UTF16BE) 1194 flag |= SCONV_FROM_UTF16BE | SCONV_WIN_CP; 1195 else if (sc->from_cp == CP_UTF16LE) 1196 flag |= SCONV_FROM_UTF16LE | SCONV_WIN_CP; 1197 #endif 1198 1199 /* 1200 * Set a flag for Unicode NFD. Usually iconv cannot correctly 1201 * handle it. So we have to translate NFD characters to NFC ones 1202 * ourselves before iconv handles. Another reason is to prevent 1203 * that the same sight of two filenames, one is NFC and other 1204 * is NFD, would be in its directory. 1205 * On Mac OS X, although its filesystem layer automatically 1206 * convert filenames to NFD, it would be useful for filename 1207 * comparing to find out the same filenames that we normalize 1208 * that to be NFD ourselves. 
1209 */ 1210 if ((flag & SCONV_FROM_CHARSET) && 1211 (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8))) { 1212 #if defined(__APPLE__) 1213 if (flag & SCONV_TO_UTF8) 1214 flag |= SCONV_NORMALIZATION_D; 1215 else 1216 #endif 1217 flag |= SCONV_NORMALIZATION_C; 1218 } 1219 #if defined(__APPLE__) 1220 /* 1221 * In case writing an archive file, make sure that a filename 1222 * going to be passed to iconv is a Unicode NFC string since 1223 * a filename in HFS Plus filesystem is a Unicode NFD one and 1224 * iconv cannot handle it with "UTF-8" charset. It is simpler 1225 * than a use of "UTF-8-MAC" charset. 1226 */ 1227 if ((flag & SCONV_TO_CHARSET) && 1228 (flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1229 !(flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1230 flag |= SCONV_NORMALIZATION_C; 1231 /* 1232 * In case reading an archive file. make sure that a filename 1233 * will be passed to users is a Unicode NFD string in order to 1234 * correctly compare the filename with other one which comes 1235 * from HFS Plus filesystem. 1236 */ 1237 if ((flag & SCONV_FROM_CHARSET) && 1238 !(flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1239 (flag & SCONV_TO_UTF8)) 1240 flag |= SCONV_NORMALIZATION_D; 1241 #endif 1242 1243 #if defined(HAVE_ICONV) 1244 sc->cd_w = (iconv_t)-1; 1245 /* 1246 * Create an iconv object. 1247 */ 1248 if (((flag & (SCONV_TO_UTF8 | SCONV_TO_UTF16)) && 1249 (flag & (SCONV_FROM_UTF8 | SCONV_FROM_UTF16))) || 1250 (flag & SCONV_WIN_CP)) { 1251 /* This case we won't use iconv. */ 1252 sc->cd = (iconv_t)-1; 1253 } else { 1254 sc->cd = iconv_open(tc, fc); 1255 if (sc->cd == (iconv_t)-1 && (sc->flag & SCONV_BEST_EFFORT)) { 1256 /* 1257 * Unfortunately, all of iconv implements do support 1258 * "CP932" character-set, so we should use "SJIS" 1259 * instead if iconv_open failed. 
1260 */ 1261 if (strcmp(tc, "CP932") == 0) 1262 sc->cd = iconv_open("SJIS", fc); 1263 else if (strcmp(fc, "CP932") == 0) 1264 sc->cd = iconv_open(tc, "SJIS"); 1265 } 1266 #if defined(_WIN32) && !defined(__CYGWIN__) 1267 /* 1268 * archive_mstring on Windows directly convert multi-bytes 1269 * into archive_wstring in order not to depend on locale 1270 * so that you can do a I18N programming. This will be 1271 * used only in archive_mstring_copy_mbs_len_l so far. 1272 */ 1273 if (flag & SCONV_FROM_CHARSET) { 1274 sc->cd_w = iconv_open("UTF-8", fc); 1275 if (sc->cd_w == (iconv_t)-1 && 1276 (sc->flag & SCONV_BEST_EFFORT)) { 1277 if (strcmp(fc, "CP932") == 0) 1278 sc->cd_w = iconv_open("UTF-8", "SJIS"); 1279 } 1280 } 1281 #endif /* _WIN32 && !__CYGWIN__ */ 1282 } 1283 #endif /* HAVE_ICONV */ 1284 1285 sc->flag = flag; 1286 1287 /* 1288 * Set up converters. 1289 */ 1290 setup_converter(sc); 1291 1292 return (sc); 1293 } 1294 1295 /* 1296 * Free a string conversion object. 1297 */ 1298 static void 1299 free_sconv_object(struct archive_string_conv *sc) 1300 { 1301 free(sc->from_charset); 1302 free(sc->to_charset); 1303 archive_string_free(&sc->utftmp); 1304 #if HAVE_ICONV 1305 if (sc->cd != (iconv_t)-1) 1306 iconv_close(sc->cd); 1307 if (sc->cd_w != (iconv_t)-1) 1308 iconv_close(sc->cd_w); 1309 #endif 1310 free(sc); 1311 } 1312 1313 #if defined(_WIN32) && !defined(__CYGWIN__) 1314 static unsigned 1315 my_atoi(const char *p) 1316 { 1317 unsigned cp; 1318 1319 cp = 0; 1320 while (*p) { 1321 if (*p >= '0' && *p <= '9') 1322 cp = cp * 10 + (*p - '0'); 1323 else 1324 return (-1); 1325 p++; 1326 } 1327 return (cp); 1328 } 1329 1330 /* 1331 * Translate Charset name (as used by iconv) into CodePage (as used by Windows) 1332 * Return -1 if failed. 1333 * 1334 * Note: This translation code may be insufficient. 1335 */ 1336 static struct charset { 1337 const char *name; 1338 unsigned cp; 1339 } charsets[] = { 1340 /* MUST BE SORTED! 
*/ 1341 {"ASCII", 1252}, 1342 {"ASMO-708", 708}, 1343 {"BIG5", 950}, 1344 {"CHINESE", 936}, 1345 {"CP367", 1252}, 1346 {"CP819", 1252}, 1347 {"CP1025", 21025}, 1348 {"DOS-720", 720}, 1349 {"DOS-862", 862}, 1350 {"EUC-CN", 51936}, 1351 {"EUC-JP", 51932}, 1352 {"EUC-KR", 949}, 1353 {"EUCCN", 51936}, 1354 {"EUCJP", 51932}, 1355 {"EUCKR", 949}, 1356 {"GB18030", 54936}, 1357 {"GB2312", 936}, 1358 {"HEBREW", 1255}, 1359 {"HZ-GB-2312", 52936}, 1360 {"IBM273", 20273}, 1361 {"IBM277", 20277}, 1362 {"IBM278", 20278}, 1363 {"IBM280", 20280}, 1364 {"IBM284", 20284}, 1365 {"IBM285", 20285}, 1366 {"IBM290", 20290}, 1367 {"IBM297", 20297}, 1368 {"IBM367", 1252}, 1369 {"IBM420", 20420}, 1370 {"IBM423", 20423}, 1371 {"IBM424", 20424}, 1372 {"IBM819", 1252}, 1373 {"IBM871", 20871}, 1374 {"IBM880", 20880}, 1375 {"IBM905", 20905}, 1376 {"IBM924", 20924}, 1377 {"ISO-8859-1", 28591}, 1378 {"ISO-8859-13", 28603}, 1379 {"ISO-8859-15", 28605}, 1380 {"ISO-8859-2", 28592}, 1381 {"ISO-8859-3", 28593}, 1382 {"ISO-8859-4", 28594}, 1383 {"ISO-8859-5", 28595}, 1384 {"ISO-8859-6", 28596}, 1385 {"ISO-8859-7", 28597}, 1386 {"ISO-8859-8", 28598}, 1387 {"ISO-8859-9", 28599}, 1388 {"ISO8859-1", 28591}, 1389 {"ISO8859-13", 28603}, 1390 {"ISO8859-15", 28605}, 1391 {"ISO8859-2", 28592}, 1392 {"ISO8859-3", 28593}, 1393 {"ISO8859-4", 28594}, 1394 {"ISO8859-5", 28595}, 1395 {"ISO8859-6", 28596}, 1396 {"ISO8859-7", 28597}, 1397 {"ISO8859-8", 28598}, 1398 {"ISO8859-9", 28599}, 1399 {"JOHAB", 1361}, 1400 {"KOI8-R", 20866}, 1401 {"KOI8-U", 21866}, 1402 {"KS_C_5601-1987", 949}, 1403 {"LATIN1", 1252}, 1404 {"LATIN2", 28592}, 1405 {"MACINTOSH", 10000}, 1406 {"SHIFT-JIS", 932}, 1407 {"SHIFT_JIS", 932}, 1408 {"SJIS", 932}, 1409 {"US", 1252}, 1410 {"US-ASCII", 1252}, 1411 {"UTF-16", 1200}, 1412 {"UTF-16BE", 1201}, 1413 {"UTF-16LE", 1200}, 1414 {"UTF-8", CP_UTF8}, 1415 {"X-EUROPA", 29001}, 1416 {"X-MAC-ARABIC", 10004}, 1417 {"X-MAC-CE", 10029}, 1418 {"X-MAC-CHINESEIMP", 10008}, 1419 {"X-MAC-CHINESETRAD", 10002}, 1420 
{"X-MAC-CROATIAN", 10082}, 1421 {"X-MAC-CYRILLIC", 10007}, 1422 {"X-MAC-GREEK", 10006}, 1423 {"X-MAC-HEBREW", 10005}, 1424 {"X-MAC-ICELANDIC", 10079}, 1425 {"X-MAC-JAPANESE", 10001}, 1426 {"X-MAC-KOREAN", 10003}, 1427 {"X-MAC-ROMANIAN", 10010}, 1428 {"X-MAC-THAI", 10021}, 1429 {"X-MAC-TURKISH", 10081}, 1430 {"X-MAC-UKRAINIAN", 10017}, 1431 }; 1432 static unsigned 1433 make_codepage_from_charset(const char *charset) 1434 { 1435 char cs[16]; 1436 char *p; 1437 unsigned cp; 1438 int a, b; 1439 1440 if (charset == NULL || strlen(charset) > 15) 1441 return -1; 1442 1443 /* Copy name to uppercase. */ 1444 p = cs; 1445 while (*charset) { 1446 char c = *charset++; 1447 if (c >= 'a' && c <= 'z') 1448 c -= 'a' - 'A'; 1449 *p++ = c; 1450 } 1451 *p++ = '\0'; 1452 cp = -1; 1453 1454 /* Look it up in the table first, so that we can easily 1455 * override CP367, which we map to 1252 instead of 367. */ 1456 a = 0; 1457 b = sizeof(charsets)/sizeof(charsets[0]); 1458 while (b > a) { 1459 int c = (b + a) / 2; 1460 int r = strcmp(charsets[c].name, cs); 1461 if (r < 0) 1462 a = c + 1; 1463 else if (r > 0) 1464 b = c; 1465 else 1466 return charsets[c].cp; 1467 } 1468 1469 /* If it's not in the table, try to parse it. */ 1470 switch (*cs) { 1471 case 'C': 1472 if (cs[1] == 'P' && cs[2] >= '0' && cs[2] <= '9') { 1473 cp = my_atoi(cs + 2); 1474 } else if (strcmp(cs, "CP_ACP") == 0) 1475 cp = get_current_codepage(); 1476 else if (strcmp(cs, "CP_OEMCP") == 0) 1477 cp = get_current_oemcp(); 1478 break; 1479 case 'I': 1480 if (cs[1] == 'B' && cs[2] == 'M' && 1481 cs[3] >= '0' && cs[3] <= '9') { 1482 cp = my_atoi(cs + 3); 1483 } 1484 break; 1485 case 'W': 1486 if (strncmp(cs, "WINDOWS-", 8) == 0) { 1487 cp = my_atoi(cs + 8); 1488 if (cp != 874 && (cp < 1250 || cp > 1258)) 1489 cp = -1;/* This may invalid code. */ 1490 } 1491 break; 1492 } 1493 return (cp); 1494 } 1495 1496 /* 1497 * Return ANSI Code Page of current locale set by setlocale(). 
1498 */ 1499 static unsigned 1500 get_current_codepage(void) 1501 { 1502 char *locale, *p; 1503 unsigned cp; 1504 1505 locale = setlocale(LC_CTYPE, NULL); 1506 if (locale == NULL) 1507 return (GetACP()); 1508 if (locale[0] == 'C' && locale[1] == '\0') 1509 return (CP_C_LOCALE); 1510 p = strrchr(locale, '.'); 1511 if (p == NULL) 1512 return (GetACP()); 1513 cp = my_atoi(p+1); 1514 if (cp <= 0) 1515 return (GetACP()); 1516 return (cp); 1517 } 1518 1519 /* 1520 * Translation table between Locale Name and ACP/OEMCP. 1521 */ 1522 static struct { 1523 unsigned acp; 1524 unsigned ocp; 1525 const char *locale; 1526 } acp_ocp_map[] = { 1527 { 950, 950, "Chinese_Taiwan" }, 1528 { 936, 936, "Chinese_People's Republic of China" }, 1529 { 950, 950, "Chinese_Taiwan" }, 1530 { 1250, 852, "Czech_Czech Republic" }, 1531 { 1252, 850, "Danish_Denmark" }, 1532 { 1252, 850, "Dutch_Netherlands" }, 1533 { 1252, 850, "Dutch_Belgium" }, 1534 { 1252, 437, "English_United States" }, 1535 { 1252, 850, "English_Australia" }, 1536 { 1252, 850, "English_Canada" }, 1537 { 1252, 850, "English_New Zealand" }, 1538 { 1252, 850, "English_United Kingdom" }, 1539 { 1252, 437, "English_United States" }, 1540 { 1252, 850, "Finnish_Finland" }, 1541 { 1252, 850, "French_France" }, 1542 { 1252, 850, "French_Belgium" }, 1543 { 1252, 850, "French_Canada" }, 1544 { 1252, 850, "French_Switzerland" }, 1545 { 1252, 850, "German_Germany" }, 1546 { 1252, 850, "German_Austria" }, 1547 { 1252, 850, "German_Switzerland" }, 1548 { 1253, 737, "Greek_Greece" }, 1549 { 1250, 852, "Hungarian_Hungary" }, 1550 { 1252, 850, "Icelandic_Iceland" }, 1551 { 1252, 850, "Italian_Italy" }, 1552 { 1252, 850, "Italian_Switzerland" }, 1553 { 932, 932, "Japanese_Japan" }, 1554 { 949, 949, "Korean_Korea" }, 1555 { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1556 { 1252, 850, "Norwegian (BokmOl)_Norway" }, 1557 { 1252, 850, "Norwegian-Nynorsk_Norway" }, 1558 { 1250, 852, "Polish_Poland" }, 1559 { 1252, 850, "Portuguese_Portugal" }, 1560 { 
1252, 850, "Portuguese_Brazil" }, 1561 { 1251, 866, "Russian_Russia" }, 1562 { 1250, 852, "Slovak_Slovakia" }, 1563 { 1252, 850, "Spanish_Spain" }, 1564 { 1252, 850, "Spanish_Mexico" }, 1565 { 1252, 850, "Spanish_Spain" }, 1566 { 1252, 850, "Swedish_Sweden" }, 1567 { 1254, 857, "Turkish_Turkey" }, 1568 { 0, 0, NULL} 1569 }; 1570 1571 /* 1572 * Return OEM Code Page of current locale set by setlocale(). 1573 */ 1574 static unsigned 1575 get_current_oemcp(void) 1576 { 1577 int i; 1578 char *locale, *p; 1579 size_t len; 1580 1581 locale = setlocale(LC_CTYPE, NULL); 1582 if (locale == NULL) 1583 return (GetOEMCP()); 1584 if (locale[0] == 'C' && locale[1] == '\0') 1585 return (CP_C_LOCALE); 1586 1587 p = strrchr(locale, '.'); 1588 if (p == NULL) 1589 return (GetOEMCP()); 1590 len = p - locale; 1591 for (i = 0; acp_ocp_map[i].acp; i++) { 1592 if (strncmp(acp_ocp_map[i].locale, locale, len) == 0) 1593 return (acp_ocp_map[i].ocp); 1594 } 1595 return (GetOEMCP()); 1596 } 1597 #else 1598 1599 /* 1600 * POSIX platform does not use CodePage. 1601 */ 1602 1603 static unsigned 1604 get_current_codepage(void) 1605 { 1606 return (-1);/* Unknown */ 1607 } 1608 static unsigned 1609 make_codepage_from_charset(const char *charset) 1610 { 1611 (void)charset; /* UNUSED */ 1612 return (-1);/* Unknown */ 1613 } 1614 static unsigned 1615 get_current_oemcp(void) 1616 { 1617 return (-1);/* Unknown */ 1618 } 1619 1620 #endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 1621 1622 /* 1623 * Return a string conversion object. 1624 */ 1625 static struct archive_string_conv * 1626 get_sconv_object(struct archive *a, const char *fc, const char *tc, int flag) 1627 { 1628 struct archive_string_conv *sc; 1629 unsigned current_codepage; 1630 1631 /* Check if we have made the sconv object. 
*/ 1632 sc = find_sconv_object(a, fc, tc); 1633 if (sc != NULL) 1634 return (sc); 1635 1636 if (a == NULL) 1637 current_codepage = get_current_codepage(); 1638 else 1639 current_codepage = a->current_codepage; 1640 1641 sc = create_sconv_object(canonical_charset_name(fc), 1642 canonical_charset_name(tc), current_codepage, flag); 1643 if (sc == NULL) { 1644 if (a != NULL) 1645 archive_set_error(a, ENOMEM, 1646 "Could not allocate memory for " 1647 "a string conversion object"); 1648 return (NULL); 1649 } 1650 1651 /* 1652 * If there is no converter for current string conversion object, 1653 * we cannot handle this conversion. 1654 */ 1655 if (sc->nconverter == 0) { 1656 if (a != NULL) { 1657 #if HAVE_ICONV 1658 archive_set_error(a, ARCHIVE_ERRNO_MISC, 1659 "iconv_open failed : Cannot handle ``%s''", 1660 (flag & SCONV_TO_CHARSET)?tc:fc); 1661 #else 1662 archive_set_error(a, ARCHIVE_ERRNO_MISC, 1663 "A character-set conversion not fully supported " 1664 "on this platform"); 1665 #endif 1666 } 1667 /* Failed; free a sconv object. */ 1668 free_sconv_object(sc); 1669 return (NULL); 1670 } 1671 1672 /* 1673 * Success! 1674 */ 1675 if (a != NULL) 1676 add_sconv_object(a, sc); 1677 return (sc); 1678 } 1679 1680 static const char * 1681 get_current_charset(struct archive *a) 1682 { 1683 const char *cur_charset; 1684 1685 if (a == NULL) 1686 cur_charset = default_iconv_charset(""); 1687 else { 1688 cur_charset = default_iconv_charset(a->current_code); 1689 if (a->current_code == NULL) { 1690 a->current_code = strdup(cur_charset); 1691 a->current_codepage = get_current_codepage(); 1692 a->current_oemcp = get_current_oemcp(); 1693 } 1694 } 1695 return (cur_charset); 1696 } 1697 1698 /* 1699 * Make and Return a string conversion object. 1700 * Return NULL if the platform does not support the specified conversion 1701 * and best_effort is 0. 
/*
 * archive_string_default_conversion_*() are provided for the Windows
 * platform because other archiver applications use CP_OEMCP with
 * MultiByteToWideChar() and WideCharToMultiByte() for the filenames
 * in tar or zip files, while mbstowcs/wcstombs (CRT) usually use CP_ACP
 * unless you call setlocale(LC_ALL, ".OCP") (which selects CP_OEMCP).
 * So we should make a string conversion between CP_ACP and CP_OEMCP
 * for compatibility.
 */
#if defined(_WIN32) && !defined(__CYGWIN__)
/*
 * Shared implementation of the two default conversions.
 *
 * flag is SCONV_FROM_CHARSET (OEM code page -> current charset, reading)
 * or SCONV_TO_CHARSET (current charset -> OEM code page, writing).
 * Returns NULL when no conversion is needed (the "C" locale, or ANSI and
 * OEM code pages are identical) or when get_sconv_object() fails.
 */
static struct archive_string_conv *
default_oemcp_conversion(struct archive *a, int flag)
{
	const char *cur_charset;
	unsigned codepage, oemcp;
	char oemcp_name[16];

	/*
	 * get_current_charset() must run first: on first use it fills in
	 * a->current_codepage and a->current_oemcp.
	 */
	cur_charset = get_current_charset(a);
	if (a != NULL) {
		codepage = a->current_codepage;
		oemcp = a->current_oemcp;
	} else {
		/* No archive object to cache in; ask the locale directly. */
		codepage = get_current_codepage();
		oemcp = get_current_oemcp();
	}

	if (cur_charset == NULL ||
	    codepage == CP_C_LOCALE || codepage == oemcp)
		return (NULL);/* no conversion. */

	_snprintf(oemcp_name, sizeof(oemcp_name)-1, "CP%u", oemcp);
	/* Make sure a null termination must be set. */
	oemcp_name[sizeof(oemcp_name)-1] = '\0';

	if (flag & SCONV_FROM_CHARSET)
		return (get_sconv_object(a, oemcp_name, cur_charset, flag));
	return (get_sconv_object(a, cur_charset, oemcp_name, flag));
}

/* Default conversion for reading: OEM code page -> current charset. */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	return (default_oemcp_conversion(a, SCONV_FROM_CHARSET));
}

/* Default conversion for writing: current charset -> OEM code page. */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	return (default_oemcp_conversion(a, SCONV_TO_CHARSET));
}
#else
/* Platforms other than Windows have no default conversion. */
struct archive_string_conv *
archive_string_default_conversion_for_read(struct archive *a)
{
	(void)a; /* UNUSED */
	return (NULL);
}

/* Platforms other than Windows have no default conversion. */
struct archive_string_conv *
archive_string_default_conversion_for_write(struct archive *a)
{
	(void)a; /* UNUSED */
	return (NULL);
}
#endif
1815 */ 1816 const char * 1817 archive_string_conversion_charset_name(struct archive_string_conv *sc) 1818 { 1819 if (sc->flag & SCONV_TO_CHARSET) 1820 return (sc->to_charset); 1821 else 1822 return (sc->from_charset); 1823 } 1824 1825 /* 1826 * Change the behavior of a string conversion. 1827 */ 1828 void 1829 archive_string_conversion_set_opt(struct archive_string_conv *sc, int opt) 1830 { 1831 switch (opt) { 1832 /* 1833 * A filename in UTF-8 was made with libarchive 2.x in a wrong 1834 * assumption that wchar_t was Unicode. 1835 * This option enables simulating the assumption in order to read 1836 * that filename correctly. 1837 */ 1838 case SCONV_SET_OPT_UTF8_LIBARCHIVE2X: 1839 #if (defined(_WIN32) && !defined(__CYGWIN__)) \ 1840 || defined(__STDC_ISO_10646__) || defined(__APPLE__) 1841 /* 1842 * Nothing to do for it since wchar_t on these platforms 1843 * is really Unicode. 1844 */ 1845 (void)sc; /* UNUSED */ 1846 #else 1847 if ((sc->flag & SCONV_UTF8_LIBARCHIVE_2) == 0) { 1848 sc->flag |= SCONV_UTF8_LIBARCHIVE_2; 1849 /* Set up string converters. */ 1850 setup_converter(sc); 1851 } 1852 #endif 1853 break; 1854 case SCONV_SET_OPT_NORMALIZATION_C: 1855 if ((sc->flag & SCONV_NORMALIZATION_C) == 0) { 1856 sc->flag |= SCONV_NORMALIZATION_C; 1857 sc->flag &= ~SCONV_NORMALIZATION_D; 1858 /* Set up string converters. */ 1859 setup_converter(sc); 1860 } 1861 break; 1862 case SCONV_SET_OPT_NORMALIZATION_D: 1863 #if defined(HAVE_ICONV) 1864 /* 1865 * If iconv will take the string, do not change the 1866 * setting of the normalization. 1867 */ 1868 if (!(sc->flag & SCONV_WIN_CP) && 1869 (sc->flag & (SCONV_FROM_UTF16 | SCONV_FROM_UTF8)) && 1870 !(sc->flag & (SCONV_TO_UTF16 | SCONV_TO_UTF8))) 1871 break; 1872 #endif 1873 if ((sc->flag & SCONV_NORMALIZATION_D) == 0) { 1874 sc->flag |= SCONV_NORMALIZATION_D; 1875 sc->flag &= ~SCONV_NORMALIZATION_C; 1876 /* Set up string converters. 
*/ 1877 setup_converter(sc); 1878 } 1879 break; 1880 default: 1881 break; 1882 } 1883 } 1884 1885 /* 1886 * 1887 * Copy one archive_string to another in locale conversion. 1888 * 1889 * archive_strncat_l(); 1890 * archive_strncpy_l(); 1891 * 1892 */ 1893 1894 static size_t 1895 mbsnbytes(const void *_p, size_t n) 1896 { 1897 size_t s; 1898 const char *p, *pp; 1899 1900 if (_p == NULL) 1901 return (0); 1902 p = (const char *)_p; 1903 1904 /* Like strlen(p), except won't examine positions beyond p[n]. */ 1905 s = 0; 1906 pp = p; 1907 while (s < n && *pp) { 1908 pp++; 1909 s++; 1910 } 1911 return (s); 1912 } 1913 1914 static size_t 1915 utf16nbytes(const void *_p, size_t n) 1916 { 1917 size_t s; 1918 const char *p, *pp; 1919 1920 if (_p == NULL) 1921 return (0); 1922 p = (const char *)_p; 1923 1924 /* Like strlen(p), except won't examine positions beyond p[n]. */ 1925 s = 0; 1926 pp = p; 1927 n >>= 1; 1928 while (s < n && (pp[0] || pp[1])) { 1929 pp += 2; 1930 s++; 1931 } 1932 return (s<<1); 1933 } 1934 1935 int 1936 archive_strncpy_l(struct archive_string *as, const void *_p, size_t n, 1937 struct archive_string_conv *sc) 1938 { 1939 as->length = 0; 1940 return (archive_strncat_l(as, _p, n, sc)); 1941 } 1942 1943 int 1944 archive_strncat_l(struct archive_string *as, const void *_p, size_t n, 1945 struct archive_string_conv *sc) 1946 { 1947 const void *s; 1948 size_t length = 0; 1949 int i, r = 0, r2; 1950 1951 if (_p != NULL && n > 0) { 1952 if (sc != NULL && (sc->flag & SCONV_FROM_UTF16)) 1953 length = utf16nbytes(_p, n); 1954 else 1955 length = mbsnbytes(_p, n); 1956 } 1957 1958 /* We must allocate memory even if there is no data for conversion 1959 * or copy. This simulates archive_string_append behavior. 
#if HAVE_ICONV

/*
 * Convert `length' bytes at _p with the iconv descriptor sc->cd and
 * append the result to `as'.
 *
 * Input that iconv rejects (EILSEQ/EINVAL) is skipped one unit at a time
 * (two bytes for UTF-16 input, one byte otherwise) and replaced with
 * U+FFFD when the output is UTF-8 or UTF-16, or with '?' otherwise.
 * Any other iconv failure is treated as "output buffer full" and the
 * buffer is enlarged.
 *
 * Return 0 on success; return -1 if at least one character had to be
 * replaced or if memory allocation fails.
 */
static int
iconv_strncat_in_locale(struct archive_string *as, const void *_p,
    size_t length, struct archive_string_conv *sc)
{
	ICONV_CONST char *itp;
	size_t remaining;
	iconv_t cd;
	char *outp;
	size_t avail, bs;
	int return_value = 0; /* success */
	int to_size, from_size;

	/* Size of the terminator to reserve and of one input unit. */
	if (sc->flag & SCONV_TO_UTF16)
		to_size = 2;
	else
		to_size = 1;
	if (sc->flag & SCONV_FROM_UTF16)
		from_size = 2;
	else
		from_size = 1;

	if (archive_string_ensure(as, as->length + length*2+to_size) == NULL)
		return (-1);

	cd = sc->cd;
	itp = (char *)(uintptr_t)_p;
	remaining = length;
	outp = as->s + as->length;
	avail = as->buffer_length - as->length - to_size;
	while (remaining >= (size_t)from_size) {
		size_t result = iconv(cd, &itp, &remaining, &outp, &avail);

		if (result != (size_t)-1)
			break; /* Conversion completed. */

		if (errno == EILSEQ || errno == EINVAL) {
			size_t rbytes;

			/*
			 * If an output charset is UTF-8 or UTF-16BE/LE,
			 * unknown character should be U+FFFD
			 * (replacement character); otherwise it is '?'.
			 */
			if (sc->flag & SCONV_TO_UTF8)
				rbytes = sizeof(utf8_replacement_char);
			else if (sc->flag & SCONV_TO_UTF16)
				rbytes = 2;
			else
				rbytes = 1;

			/*
			 * Make room for the replacement in every case.
			 * Writing it with avail == 0 would use the bytes
			 * reserved for the terminator and wrap `avail'
			 * around, letting the next iconv() call overrun
			 * the buffer.
			 */
			if (avail < rbytes) {
				as->length = outp - as->s;
				bs = as->buffer_length +
				    (remaining * to_size) + rbytes;
				if (NULL ==
				    archive_string_ensure(as, bs))
					return (-1);
				outp = as->s + as->length;
				avail = as->buffer_length
				    - as->length - to_size;
			}
			if (sc->flag & SCONV_TO_UTF8)
				memcpy(outp, utf8_replacement_char,
				    sizeof(utf8_replacement_char));
			else if (sc->flag & SCONV_TO_UTF16BE)
				archive_be16enc(outp, UNICODE_R_CHAR);
			else if (sc->flag & SCONV_TO_UTF16)
				archive_le16enc(outp, UNICODE_R_CHAR);
			else
				*outp = '?';
			outp += rbytes;
			avail -= rbytes;

			/* Skip the illegal input bytes. */
			itp += from_size;
			remaining -= from_size;
			return_value = -1; /* failure */
		} else {
			/* E2BIG no output buffer,
			 * Increase an output buffer.  */
			as->length = outp - as->s;
			bs = as->buffer_length + remaining * 2;
			if (NULL == archive_string_ensure(as, bs))
				return (-1);
			outp = as->s + as->length;
			avail = as->buffer_length - as->length - to_size;
		}
	}
	as->length = outp - as->s;
	as->s[as->length] = 0;
	if (to_size == 2)
		as->s[as->length+1] = 0;
	return (return_value);
}

#endif /* HAVE_ICONV */
2106 */ 2107 static int 2108 strncat_in_codepage(struct archive_string *as, 2109 const void *_p, size_t length, struct archive_string_conv *sc) 2110 { 2111 const char *s = (const char *)_p; 2112 struct archive_wstring aws; 2113 size_t l; 2114 int r, saved_flag; 2115 2116 archive_string_init(&aws); 2117 saved_flag = sc->flag; 2118 sc->flag &= ~(SCONV_NORMALIZATION_D | SCONV_NORMALIZATION_C); 2119 r = archive_wstring_append_from_mbs_in_codepage(&aws, s, length, sc); 2120 sc->flag = saved_flag; 2121 if (r != 0) { 2122 archive_wstring_free(&aws); 2123 if (errno != ENOMEM) 2124 archive_string_append(as, s, length); 2125 return (-1); 2126 } 2127 2128 l = as->length; 2129 r = archive_string_append_from_wcs_in_codepage( 2130 as, aws.s, aws.length, sc); 2131 if (r != 0 && errno != ENOMEM && l == as->length) 2132 archive_string_append(as, s, length); 2133 archive_wstring_free(&aws); 2134 return (r); 2135 } 2136 2137 /* 2138 * Test whether MBS ==> WCS is okay. 2139 */ 2140 static int 2141 invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2142 { 2143 const char *p = (const char *)_p; 2144 unsigned codepage; 2145 DWORD mbflag = MB_ERR_INVALID_CHARS; 2146 2147 if (sc->flag & SCONV_FROM_CHARSET) 2148 codepage = sc->to_cp; 2149 else 2150 codepage = sc->from_cp; 2151 2152 if (codepage == CP_C_LOCALE) 2153 return (0); 2154 if (codepage != CP_UTF8) 2155 mbflag |= MB_PRECOMPOSED; 2156 2157 if (MultiByteToWideChar(codepage, mbflag, p, (int)n, NULL, 0) == 0) 2158 return (-1); /* Invalid */ 2159 return (0); /* Okay */ 2160 } 2161 2162 #else 2163 2164 /* 2165 * Test whether MBS ==> WCS is okay. 2166 */ 2167 static int 2168 invalid_mbs(const void *_p, size_t n, struct archive_string_conv *sc) 2169 { 2170 const char *p = (const char *)_p; 2171 size_t r; 2172 2173 #if HAVE_MBRTOWC 2174 mbstate_t shift_state; 2175 2176 memset(&shift_state, 0, sizeof(shift_state)); 2177 #else 2178 /* Clear the shift state before starting. 
*/ 2179 mbtowc(NULL, NULL, 0); 2180 #endif 2181 while (n) { 2182 wchar_t wc; 2183 2184 #if HAVE_MBRTOWC 2185 r = mbrtowc(&wc, p, n, &shift_state); 2186 #else 2187 r = mbtowc(&wc, p, n); 2188 #endif 2189 if (r == (size_t)-1 || r == (size_t)-2) 2190 return (-1);/* Invalid. */ 2191 if (r == 0) 2192 break; 2193 p += r; 2194 n -= r; 2195 } 2196 (void)sc; /* UNUSED */ 2197 return (0); /* All Okey. */ 2198 } 2199 2200 #endif /* defined(_WIN32) && !defined(__CYGWIN__) */ 2201 2202 /* 2203 * Basically returns -1 because we cannot make a conversion of charset 2204 * without iconv but in some cases this would return 0. 2205 * Returns 0 if all copied characters are ASCII. 2206 * Returns 0 if both from-locale and to-locale are the same and those 2207 * can be WCS with no error. 2208 */ 2209 static int 2210 best_effort_strncat_in_locale(struct archive_string *as, const void *_p, 2211 size_t length, struct archive_string_conv *sc) 2212 { 2213 size_t remaining; 2214 const uint8_t *itp; 2215 int return_value = 0; /* success */ 2216 2217 /* 2218 * If both from-locale and to-locale is the same, this makes a copy. 2219 * And then this checks all copied MBS can be WCS if so returns 0. 2220 */ 2221 if (sc->same) { 2222 if (archive_string_append(as, _p, length) == NULL) 2223 return (-1);/* No memory */ 2224 return (invalid_mbs(_p, length, sc)); 2225 } 2226 2227 /* 2228 * If a character is ASCII, this just copies it. If not, this 2229 * assigns '?' character instead but in UTF-8 locale this assigns 2230 * byte sequence 0xEF 0xBD 0xBD, which are code point U+FFFD, 2231 * a Replacement Character in Unicode. 
2232 */ 2233 2234 remaining = length; 2235 itp = (const uint8_t *)_p; 2236 while (*itp && remaining > 0) { 2237 if (*itp > 127) { 2238 // Non-ASCII: Substitute with suitable replacement 2239 if (sc->flag & SCONV_TO_UTF8) { 2240 if (archive_string_append(as, utf8_replacement_char, sizeof(utf8_replacement_char)) == NULL) { 2241 __archive_errx(1, "Out of memory"); 2242 } 2243 } else { 2244 archive_strappend_char(as, '?'); 2245 } 2246 return_value = -1; 2247 } else { 2248 archive_strappend_char(as, *itp); 2249 } 2250 ++itp; 2251 } 2252 return (return_value); 2253 } 2254 2255 2256 /* 2257 * Unicode conversion functions. 2258 * - UTF-8 <===> UTF-8 in removing surrogate pairs. 2259 * - UTF-8 NFD ===> UTF-8 NFC in removing surrogate pairs. 2260 * - UTF-8 made by libarchive 2.x ===> UTF-8. 2261 * - UTF-16BE <===> UTF-8. 2262 * 2263 */ 2264 2265 /* 2266 * Utility to convert a single UTF-8 sequence. 2267 * 2268 * Usually return used bytes, return used byte in negative value when 2269 * a unicode character is replaced with U+FFFD. 2270 * See also http://unicode.org/review/pr-121.html Public Review Issue #121 2271 * Recommended Practice for Replacement Characters. 
2272 */ 2273 static int 2274 _utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2275 { 2276 static const char utf8_count[256] = { 2277 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 00 - 0F */ 2278 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 10 - 1F */ 2279 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 20 - 2F */ 2280 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 30 - 3F */ 2281 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 40 - 4F */ 2282 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 50 - 5F */ 2283 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 60 - 6F */ 2284 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,/* 70 - 7F */ 2285 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 80 - 8F */ 2286 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* 90 - 9F */ 2287 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* A0 - AF */ 2288 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,/* B0 - BF */ 2289 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* C0 - CF */ 2290 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,/* D0 - DF */ 2291 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,/* E0 - EF */ 2292 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0 - FF */ 2293 }; 2294 int ch, i; 2295 int cnt; 2296 uint32_t wc; 2297 2298 /* Sanity check. */ 2299 if (n == 0) 2300 return (0); 2301 /* 2302 * Decode 1-4 bytes depending on the value of the first byte. 2303 */ 2304 ch = (unsigned char)*s; 2305 if (ch == 0) 2306 return (0); /* Standard: return 0 for end-of-string. */ 2307 cnt = utf8_count[ch]; 2308 2309 /* Invalid sequence or there are not plenty bytes. */ 2310 if ((int)n < cnt) { 2311 cnt = (int)n; 2312 for (i = 1; i < cnt; i++) { 2313 if ((s[i] & 0xc0) != 0x80) { 2314 cnt = i; 2315 break; 2316 } 2317 } 2318 goto invalid_sequence; 2319 } 2320 2321 /* Make a Unicode code point from a single UTF-8 sequence. */ 2322 switch (cnt) { 2323 case 1: /* 1 byte sequence. */ 2324 *pwc = ch & 0x7f; 2325 return (cnt); 2326 case 2: /* 2 bytes sequence. 
*/ 2327 if ((s[1] & 0xc0) != 0x80) { 2328 cnt = 1; 2329 goto invalid_sequence; 2330 } 2331 *pwc = ((ch & 0x1f) << 6) | (s[1] & 0x3f); 2332 return (cnt); 2333 case 3: /* 3 bytes sequence. */ 2334 if ((s[1] & 0xc0) != 0x80) { 2335 cnt = 1; 2336 goto invalid_sequence; 2337 } 2338 if ((s[2] & 0xc0) != 0x80) { 2339 cnt = 2; 2340 goto invalid_sequence; 2341 } 2342 wc = ((ch & 0x0f) << 12) 2343 | ((s[1] & 0x3f) << 6) 2344 | (s[2] & 0x3f); 2345 if (wc < 0x800) 2346 goto invalid_sequence;/* Overlong sequence. */ 2347 break; 2348 case 4: /* 4 bytes sequence. */ 2349 if ((s[1] & 0xc0) != 0x80) { 2350 cnt = 1; 2351 goto invalid_sequence; 2352 } 2353 if ((s[2] & 0xc0) != 0x80) { 2354 cnt = 2; 2355 goto invalid_sequence; 2356 } 2357 if ((s[3] & 0xc0) != 0x80) { 2358 cnt = 3; 2359 goto invalid_sequence; 2360 } 2361 wc = ((ch & 0x07) << 18) 2362 | ((s[1] & 0x3f) << 12) 2363 | ((s[2] & 0x3f) << 6) 2364 | (s[3] & 0x3f); 2365 if (wc < 0x10000) 2366 goto invalid_sequence;/* Overlong sequence. */ 2367 break; 2368 default: /* Others are all invalid sequence. */ 2369 if (ch == 0xc0 || ch == 0xc1) 2370 cnt = 2; 2371 else if (ch >= 0xf5 && ch <= 0xf7) 2372 cnt = 4; 2373 else if (ch >= 0xf8 && ch <= 0xfb) 2374 cnt = 5; 2375 else if (ch == 0xfc || ch == 0xfd) 2376 cnt = 6; 2377 else 2378 cnt = 1; 2379 if ((int)n < cnt) 2380 cnt = (int)n; 2381 for (i = 1; i < cnt; i++) { 2382 if ((s[i] & 0xc0) != 0x80) { 2383 cnt = i; 2384 break; 2385 } 2386 } 2387 goto invalid_sequence; 2388 } 2389 2390 /* The code point larger than 0x10FFFF is not legal 2391 * Unicode values. */ 2392 if (wc > UNICODE_MAX) 2393 goto invalid_sequence; 2394 /* Correctly gets a Unicode, returns used bytes. */ 2395 *pwc = wc; 2396 return (cnt); 2397 invalid_sequence: 2398 *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. 
*/ 2399 return (cnt * -1); 2400 } 2401 2402 static int 2403 utf8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2404 { 2405 int cnt; 2406 2407 cnt = _utf8_to_unicode(pwc, s, n); 2408 /* Any of Surrogate pair is not legal Unicode values. */ 2409 if (cnt == 3 && IS_SURROGATE_PAIR_LA(*pwc)) 2410 return (-3); 2411 return (cnt); 2412 } 2413 2414 static inline uint32_t 2415 combine_surrogate_pair(uint32_t uc, uint32_t uc2) 2416 { 2417 uc -= 0xD800; 2418 uc *= 0x400; 2419 uc += uc2 - 0xDC00; 2420 uc += 0x10000; 2421 return (uc); 2422 } 2423 2424 /* 2425 * Convert a single UTF-8/CESU-8 sequence to a Unicode code point in 2426 * removing surrogate pairs. 2427 * 2428 * CESU-8: The Compatibility Encoding Scheme for UTF-16. 2429 * 2430 * Usually return used bytes, return used byte in negative value when 2431 * a unicode character is replaced with U+FFFD. 2432 */ 2433 static int 2434 cesu8_to_unicode(uint32_t *pwc, const char *s, size_t n) 2435 { 2436 uint32_t wc = 0; 2437 int cnt; 2438 2439 cnt = _utf8_to_unicode(&wc, s, n); 2440 if (cnt == 3 && IS_HIGH_SURROGATE_LA(wc)) { 2441 uint32_t wc2 = 0; 2442 if (n - 3 < 3) { 2443 /* Invalid byte sequence. */ 2444 goto invalid_sequence; 2445 } 2446 cnt = _utf8_to_unicode(&wc2, s+3, n-3); 2447 if (cnt != 3 || !IS_LOW_SURROGATE_LA(wc2)) { 2448 /* Invalid byte sequence. */ 2449 goto invalid_sequence; 2450 } 2451 wc = combine_surrogate_pair(wc, wc2); 2452 cnt = 6; 2453 } else if (cnt == 3 && IS_LOW_SURROGATE_LA(wc)) { 2454 /* Invalid byte sequence. */ 2455 goto invalid_sequence; 2456 } 2457 *pwc = wc; 2458 return (cnt); 2459 invalid_sequence: 2460 *pwc = UNICODE_R_CHAR;/* set the Replacement Character instead. */ 2461 if (cnt > 0) 2462 cnt *= -1; 2463 return (cnt); 2464 } 2465 2466 /* 2467 * Convert a Unicode code point to a single UTF-8 sequence. 2468 * 2469 * NOTE:This function does not check if the Unicode is legal or not. 2470 * Please you definitely check it before calling this. 
2471 */ 2472 static size_t 2473 unicode_to_utf8(char *p, size_t remaining, uint32_t uc) 2474 { 2475 char *_p = p; 2476 2477 /* Invalid Unicode char maps to Replacement character */ 2478 if (uc > UNICODE_MAX) 2479 uc = UNICODE_R_CHAR; 2480 /* Translate code point to UTF8 */ 2481 if (uc <= 0x7f) { 2482 if (remaining == 0) 2483 return (0); 2484 *p++ = (char)uc; 2485 } else if (uc <= 0x7ff) { 2486 if (remaining < 2) 2487 return (0); 2488 *p++ = 0xc0 | ((uc >> 6) & 0x1f); 2489 *p++ = 0x80 | (uc & 0x3f); 2490 } else if (uc <= 0xffff) { 2491 if (remaining < 3) 2492 return (0); 2493 *p++ = 0xe0 | ((uc >> 12) & 0x0f); 2494 *p++ = 0x80 | ((uc >> 6) & 0x3f); 2495 *p++ = 0x80 | (uc & 0x3f); 2496 } else { 2497 if (remaining < 4) 2498 return (0); 2499 *p++ = 0xf0 | ((uc >> 18) & 0x07); 2500 *p++ = 0x80 | ((uc >> 12) & 0x3f); 2501 *p++ = 0x80 | ((uc >> 6) & 0x3f); 2502 *p++ = 0x80 | (uc & 0x3f); 2503 } 2504 return (p - _p); 2505 } 2506 2507 static int 2508 utf16be_to_unicode(uint32_t *pwc, const char *s, size_t n) 2509 { 2510 return (utf16_to_unicode(pwc, s, n, 1)); 2511 } 2512 2513 static int 2514 utf16le_to_unicode(uint32_t *pwc, const char *s, size_t n) 2515 { 2516 return (utf16_to_unicode(pwc, s, n, 0)); 2517 } 2518 2519 static int 2520 utf16_to_unicode(uint32_t *pwc, const char *s, size_t n, int be) 2521 { 2522 const char *utf16 = s; 2523 unsigned uc; 2524 2525 if (n == 0) 2526 return (0); 2527 if (n == 1) { 2528 /* set the Replacement Character instead. 
*/ 2529 *pwc = UNICODE_R_CHAR; 2530 return (-1); 2531 } 2532 2533 if (be) 2534 uc = archive_be16dec(utf16); 2535 else 2536 uc = archive_le16dec(utf16); 2537 utf16 += 2; 2538 2539 /* If this is a surrogate pair, assemble the full code point.*/ 2540 if (IS_HIGH_SURROGATE_LA(uc)) { 2541 unsigned uc2; 2542 2543 if (n >= 4) { 2544 if (be) 2545 uc2 = archive_be16dec(utf16); 2546 else 2547 uc2 = archive_le16dec(utf16); 2548 } else 2549 uc2 = 0; 2550 if (IS_LOW_SURROGATE_LA(uc2)) { 2551 uc = combine_surrogate_pair(uc, uc2); 2552 utf16 += 2; 2553 } else { 2554 /* Undescribed code point should be U+FFFD 2555 * (replacement character). */ 2556 *pwc = UNICODE_R_CHAR; 2557 return (-2); 2558 } 2559 } 2560 2561 /* 2562 * Surrogate pair values(0xd800 through 0xdfff) are only 2563 * used by UTF-16, so, after above calculation, the code 2564 * must not be surrogate values, and Unicode has no codes 2565 * larger than 0x10ffff. Thus, those are not legal Unicode 2566 * values. 2567 */ 2568 if (IS_SURROGATE_PAIR_LA(uc) || uc > UNICODE_MAX) { 2569 /* Undescribed code point should be U+FFFD 2570 * (replacement character). */ 2571 *pwc = UNICODE_R_CHAR; 2572 return (((int)(utf16 - s)) * -1); 2573 } 2574 *pwc = uc; 2575 return ((int)(utf16 - s)); 2576 } 2577 2578 static size_t 2579 unicode_to_utf16be(char *p, size_t remaining, uint32_t uc) 2580 { 2581 char *utf16 = p; 2582 2583 if (uc > 0xffff) { 2584 /* We have a code point that won't fit into a 2585 * wchar_t; convert it to a surrogate pair. 
*/ 2586 if (remaining < 4) 2587 return (0); 2588 uc -= 0x10000; 2589 archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2590 archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2591 return (4); 2592 } else { 2593 if (remaining < 2) 2594 return (0); 2595 archive_be16enc(utf16, uc); 2596 return (2); 2597 } 2598 } 2599 2600 static size_t 2601 unicode_to_utf16le(char *p, size_t remaining, uint32_t uc) 2602 { 2603 char *utf16 = p; 2604 2605 if (uc > 0xffff) { 2606 /* We have a code point that won't fit into a 2607 * wchar_t; convert it to a surrogate pair. */ 2608 if (remaining < 4) 2609 return (0); 2610 uc -= 0x10000; 2611 archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 2612 archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 2613 return (4); 2614 } else { 2615 if (remaining < 2) 2616 return (0); 2617 archive_le16enc(utf16, uc); 2618 return (2); 2619 } 2620 } 2621 2622 /* 2623 * Copy UTF-8 string in checking surrogate pair. 2624 * If any surrogate pair are found, it would be canonicalized. 2625 */ 2626 static int 2627 strncat_from_utf8_to_utf8(struct archive_string *as, const void *_p, 2628 size_t len, struct archive_string_conv *sc) 2629 { 2630 const char *s; 2631 char *p, *endp; 2632 int n, ret = 0; 2633 2634 (void)sc; /* UNUSED */ 2635 2636 if (archive_string_ensure(as, as->length + len + 1) == NULL) 2637 return (-1); 2638 2639 s = (const char *)_p; 2640 p = as->s + as->length; 2641 endp = as->s + as->buffer_length -1; 2642 do { 2643 uint32_t uc; 2644 const char *ss = s; 2645 size_t w; 2646 2647 /* 2648 * Forward byte sequence until a conversion of that is needed. 
2649 */ 2650 while ((n = utf8_to_unicode(&uc, s, len)) > 0) { 2651 s += n; 2652 len -= n; 2653 } 2654 if (ss < s) { 2655 if (p + (s - ss) > endp) { 2656 as->length = p - as->s; 2657 if (archive_string_ensure(as, 2658 as->buffer_length + len + 1) == NULL) 2659 return (-1); 2660 p = as->s + as->length; 2661 endp = as->s + as->buffer_length -1; 2662 } 2663 2664 memcpy(p, ss, s - ss); 2665 p += s - ss; 2666 } 2667 2668 /* 2669 * If n is negative, current byte sequence needs a replacement. 2670 */ 2671 if (n < 0) { 2672 if (n == -3 && IS_SURROGATE_PAIR_LA(uc)) { 2673 /* Current byte sequence may be CESU-8. */ 2674 n = cesu8_to_unicode(&uc, s, len); 2675 } 2676 if (n < 0) { 2677 ret = -1; 2678 n *= -1;/* Use a replaced unicode character. */ 2679 } 2680 2681 /* Rebuild UTF-8 byte sequence. */ 2682 while ((w = unicode_to_utf8(p, endp - p, uc)) == 0) { 2683 as->length = p - as->s; 2684 if (archive_string_ensure(as, 2685 as->buffer_length + len + 1) == NULL) 2686 return (-1); 2687 p = as->s + as->length; 2688 endp = as->s + as->buffer_length -1; 2689 } 2690 p += w; 2691 s += n; 2692 len -= n; 2693 } 2694 } while (n > 0); 2695 as->length = p - as->s; 2696 as->s[as->length] = '\0'; 2697 return (ret); 2698 } 2699 2700 static int 2701 archive_string_append_unicode(struct archive_string *as, const void *_p, 2702 size_t len, struct archive_string_conv *sc) 2703 { 2704 const char *s; 2705 char *p, *endp; 2706 uint32_t uc; 2707 size_t w; 2708 int n, ret = 0, ts, tm; 2709 int (*parse)(uint32_t *, const char *, size_t); 2710 size_t (*unparse)(char *, size_t, uint32_t); 2711 2712 if (sc->flag & SCONV_TO_UTF16BE) { 2713 unparse = unicode_to_utf16be; 2714 ts = 2; 2715 } else if (sc->flag & SCONV_TO_UTF16LE) { 2716 unparse = unicode_to_utf16le; 2717 ts = 2; 2718 } else if (sc->flag & SCONV_TO_UTF8) { 2719 unparse = unicode_to_utf8; 2720 ts = 1; 2721 } else { 2722 /* 2723 * This case is going to be converted to another 2724 * character-set through iconv. 
2725 */ 2726 if (sc->flag & SCONV_FROM_UTF16BE) { 2727 unparse = unicode_to_utf16be; 2728 ts = 2; 2729 } else if (sc->flag & SCONV_FROM_UTF16LE) { 2730 unparse = unicode_to_utf16le; 2731 ts = 2; 2732 } else { 2733 unparse = unicode_to_utf8; 2734 ts = 1; 2735 } 2736 } 2737 2738 if (sc->flag & SCONV_FROM_UTF16BE) { 2739 parse = utf16be_to_unicode; 2740 tm = 1; 2741 } else if (sc->flag & SCONV_FROM_UTF16LE) { 2742 parse = utf16le_to_unicode; 2743 tm = 1; 2744 } else { 2745 parse = cesu8_to_unicode; 2746 tm = ts; 2747 } 2748 2749 if (archive_string_ensure(as, as->length + len * tm + ts) == NULL) 2750 return (-1); 2751 2752 s = (const char *)_p; 2753 p = as->s + as->length; 2754 endp = as->s + as->buffer_length - ts; 2755 while ((n = parse(&uc, s, len)) != 0) { 2756 if (n < 0) { 2757 /* Use a replaced unicode character. */ 2758 n *= -1; 2759 ret = -1; 2760 } 2761 s += n; 2762 len -= n; 2763 while ((w = unparse(p, endp - p, uc)) == 0) { 2764 /* There is not enough output buffer so 2765 * we have to expand it. 
*/ 2766 as->length = p - as->s; 2767 if (archive_string_ensure(as, 2768 as->buffer_length + len * tm + ts) == NULL) 2769 return (-1); 2770 p = as->s + as->length; 2771 endp = as->s + as->buffer_length - ts; 2772 } 2773 p += w; 2774 } 2775 as->length = p - as->s; 2776 as->s[as->length] = '\0'; 2777 if (ts == 2) 2778 as->s[as->length+1] = '\0'; 2779 return (ret); 2780 } 2781 2782 /* 2783 * Following Constants for Hangul compositions this information comes from 2784 * Unicode Standard Annex #15 http://unicode.org/reports/tr15/ 2785 */ 2786 #define HC_SBASE 0xAC00 2787 #define HC_LBASE 0x1100 2788 #define HC_VBASE 0x1161 2789 #define HC_TBASE 0x11A7 2790 #define HC_LCOUNT 19 2791 #define HC_VCOUNT 21 2792 #define HC_TCOUNT 28 2793 #define HC_NCOUNT (HC_VCOUNT * HC_TCOUNT) 2794 #define HC_SCOUNT (HC_LCOUNT * HC_NCOUNT) 2795 2796 static uint32_t 2797 get_nfc(uint32_t uc, uint32_t uc2) 2798 { 2799 int t, b; 2800 2801 t = 0; 2802 b = sizeof(u_composition_table)/sizeof(u_composition_table[0]) -1; 2803 while (b >= t) { 2804 int m = (t + b) / 2; 2805 if (u_composition_table[m].cp1 < uc) 2806 t = m + 1; 2807 else if (u_composition_table[m].cp1 > uc) 2808 b = m - 1; 2809 else if (u_composition_table[m].cp2 < uc2) 2810 t = m + 1; 2811 else if (u_composition_table[m].cp2 > uc2) 2812 b = m - 1; 2813 else 2814 return (u_composition_table[m].nfc); 2815 } 2816 return (0); 2817 } 2818 2819 #define FDC_MAX 10 /* The maximum number of Following Decomposable 2820 * Characters. */ 2821 2822 /* 2823 * Update first code point. 2824 */ 2825 #define UPDATE_UC(new_uc) do { \ 2826 uc = new_uc; \ 2827 ucptr = NULL; \ 2828 } while (0) 2829 2830 /* 2831 * Replace first code point with second code point. 
*/
#define REPLACE_UC_WITH_UC2() do {		\
	uc = uc2;				\
	ucptr = uc2ptr;				\
	n = n2;					\
} while (0)

/*
 * Grow the output buffer of `as` and recompute `p` and `endp`.
 * Executes `return (-1)` in the enclosing function if allocation fails.
 * Relies on the caller's locals as, p, endp, len, tm and ts.
 */
#define EXPAND_BUFFER() do {			\
	as->length = p - as->s;			\
	if (archive_string_ensure(as,		\
	    as->buffer_length + len * tm + ts) == NULL)\
		return (-1);			\
	p = as->s + as->length;			\
	endp = as->s + as->buffer_length - ts;	\
} while (0)

/*
 * Encode `uc` at `p` with the caller's unparse() function, expanding the
 * buffer until it fits, then advance `p`.  Uses the caller's local `w`.
 */
#define UNPARSE(p, endp, uc)	do {		\
	while ((w = unparse(p, (endp) - (p), uc)) == 0) {\
		EXPAND_BUFFER();		\
	}					\
	p += w;					\
} while (0)

/*
 * Write first code point.
 * If the code point has not be changed from its original code,
 * this just copies its `n` bytes from the original buffer pointer
 * `ucptr`.  If not (ucptr is NULL), this re-encodes `uc` with unparse()
 * and copies the result.
 */
#define WRITE_UC()	do {			\
	if (ucptr) {				\
		if (p + n > endp)		\
			EXPAND_BUFFER();	\
		switch (n) {			\
		case 4:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 3:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 2:				\
			*p++ = *ucptr++;	\
			/* FALL THROUGH */	\
		case 1:				\
			*p++ = *ucptr;		\
			break;			\
		}				\
		ucptr = NULL;			\
	} else {				\
		UNPARSE(p, endp, uc);		\
	}					\
} while (0)

/*
 * Collect following decomposable code points.
 *
 * Parses code points into ucx[start..] and their CCC() values into
 * ccx[start..] until parsing fails, the combining-class test below stops
 * the scan, or FDC_MAX entries are held.  Sets ucx_size to the number of
 * entries; when the FDC_MAX limit is hit it also sets ret to -1.
 */
#define COLLECT_CPS(start)	do {		\
	int _i;					\
	for (_i = start; _i < FDC_MAX ; _i++) {	\
		nx = parse(&ucx[_i], s, len);	\
		if (nx <= 0)			\
			break;			\
		cx = CCC(ucx[_i]);		\
		if (cl >= cx && cl != 228 && cx != 228)\
			break;			\
		s += nx;			\
		len -= nx;			\
		cl = cx;			\
		ccx[_i] = cx;			\
	}					\
	if (_i >= FDC_MAX) {			\
		ret = -1;			\
		ucx_size = FDC_MAX;		\
	} else					\
		ucx_size = _i;			\
} while (0)

/*
 * Normalize UTF-8/UTF-16BE characters to Form C and copy the result.
*
 * TODO: Convert composition exclusions, which are never converted
 * from NFC,NFD,NFKC and NFKD, to Form C.
 *
 * The input parser and output encoder are chosen from sc->flag.
 * Returns 0 on success, or -1 if a sequence was replaced by the
 * replacement character, the FDC_MAX collection limit was hit, or memory
 * could not be allocated.
 */
static int
archive_string_normalize_C(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	/*
	 * always_replace stays 1 when the output encoding differs from the
	 * input encoding, so input bytes can never be copied verbatim.
	 */
	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	p = as->s + as->length;
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		const char *ucptr, *uc2ptr;

		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Read second code point. */
		while ((n2 = parse(&uc2, s, len)) > 0) {
			uint32_t ucx[FDC_MAX];
			int ccx[FDC_MAX];
			int cl, cx, i, nx, ucx_size;
			int LIndex,SIndex;
			uint32_t nfc;

			if (n2 == spair || always_replace)
				/* uc2 is converted from a surrogate pair.
			 	 * this should be treated as a changed code. */
				uc2ptr = NULL;
			else
				uc2ptr = s;
			s += n2;
			len -= n2;

			/*
			 * If current second code point is out of decomposable
			 * code points, finding compositions is unneeded.
			 */
			if (!IS_DECOMPOSABLE_BLOCK(uc2)) {
				WRITE_UC();
				REPLACE_UC_WITH_UC2();
				continue;
			}

			/*
			 * Try to combine current code points.
			 */
			/*
			 * We have to combine Hangul characters according to
			 * http://unicode.org/reports/tr15/#Hangul
			 */
			if (0 <= (LIndex = uc - HC_LBASE) &&
			    LIndex < HC_LCOUNT) {
				/*
				 * Hangul Composition.
				 * 1. Two current code points are L and V.
				 */
				int VIndex = uc2 - HC_VBASE;
				if (0 <= VIndex && VIndex < HC_VCOUNT) {
					/* Make syllable of form LV. */
					UPDATE_UC(HC_SBASE +
					    (LIndex * HC_VCOUNT + VIndex) *
					     HC_TCOUNT);
				} else {
					WRITE_UC();
					REPLACE_UC_WITH_UC2();
				}
				continue;
			} else if (0 <= (SIndex = uc - HC_SBASE) &&
			    SIndex < HC_SCOUNT && (SIndex % HC_TCOUNT) == 0) {
				/*
				 * Hangul Composition.
				 * 2. Two current code points are LV and T.
				 */
				int TIndex = uc2 - HC_TBASE;
				if (0 < TIndex && TIndex < HC_TCOUNT) {
					/* Make syllable of form LVT. */
					UPDATE_UC(uc + TIndex);
				} else {
					WRITE_UC();
					REPLACE_UC_WITH_UC2();
				}
				continue;
			} else if ((nfc = get_nfc(uc, uc2)) != 0) {
				/* A composition to current code points
				 * is found. */
				UPDATE_UC(nfc);
				continue;
			} else if ((cl = CCC(uc2)) == 0) {
				/* Clearly 'uc2' the second code point is not
				 * a decomposable code. */
				WRITE_UC();
				REPLACE_UC_WITH_UC2();
				continue;
			}

			/*
			 * Collect following decomposable code points.
			 */
			cx = 0;
			ucx[0] = uc2;
			ccx[0] = cl;
			COLLECT_CPS(1);

			/*
			 * Find a composed code in the collected code points.
			 * The scan starts at index 1 because the pair
			 * (uc, ucx[0]) was already rejected by get_nfc()
			 * above.
			 */
			i = 1;
			while (i < ucx_size) {
				int j;

				if ((nfc = get_nfc(uc, ucx[i])) == 0) {
					i++;
					continue;
				}

				/*
				 * nfc is composed of uc and ucx[i].
				 */
				UPDATE_UC(nfc);

				/*
				 * Remove ucx[i] by shifting
				 * following code points.
				 */
				for (j = i; j+1 < ucx_size; j++) {
					ucx[j] = ucx[j+1];
					ccx[j] = ccx[j+1];
				}
				ucx_size --;

				/*
				 * Collect following code points blocked
				 * by ucx[i] the removed code point.
				 */
				if (ucx_size > 0 && i == ucx_size &&
				    nx > 0 && cx == cl) {
					cl =  ccx[ucx_size-1];
					COLLECT_CPS(ucx_size);
				}
				/*
				 * Restart finding a composed code with
				 * the updated uc from the top of the
				 * collected code points.
				 */
				i = 0;
			}

			/*
			 * Apparently the current code points are not
			 * decomposed characters or already composed.
			 */
			WRITE_UC();
			for (i = 0; i < ucx_size; i++)
				UNPARSE(p, endp, ucx[i]);

			/*
			 * Flush out remaining canonical combining characters.
			 */
			if (nx > 0 && cx == cl && len > 0) {
				while ((nx = parse(&ucx[0], s, len))
				    > 0) {
					cx = CCC(ucx[0]);
					if (cl > cx)
						break;
					s += nx;
					len -= nx;
					cl = cx;
					UNPARSE(p, endp, ucx[0]);
				}
			}
			break;
		}
		if (n2 < 0) {
			WRITE_UC();
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc2);
			s += n2*-1;
			len -= n2*-1;
			ret = -1;
			continue;
		} else if (n2 == 0) {
			/* End of input: flush the pending first code point. */
			WRITE_UC();
			break;
		}
	}
	as->length = p - as->s;
	as->s[as->length] = '\0';
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}

/*
 * Look up the decomposition of `uc` in u_decomposition_table by binary
 * search on its nfc field.  On a hit, stores the two resulting code points
 * in *cp1 and *cp2 and returns 1; otherwise returns 0 and leaves them
 * untouched.  Several ranges and code points are deliberately excluded
 * (see below).
 */
static int
get_nfd(uint32_t *cp1, uint32_t *cp2, uint32_t uc)
{
	int t, b;

	/*
	 * These are not converted to NFD on Mac OS.
	 */
	if ((uc >= 0x2000 && uc <= 0x2FFF) ||
	    (uc >= 0xF900 && uc <= 0xFAFF) ||
	    (uc >= 0x2F800 && uc <= 0x2FAFF))
		return (0);
	/*
	 * Those code points are not converted to NFD on Mac OS.
	 * I do not know the reason because it is undocumented.
	 *   NFC        NFD
	 *   1109A  ==> 11099 110BA
	 *   1109C  ==> 1109B 110BA
	 *   110AB  ==> 110A5 110BA
	 */
	if (uc == 0x1109A || uc == 0x1109C || uc == 0x110AB)
		return (0);

	t = 0;
	b = sizeof(u_decomposition_table)/sizeof(u_decomposition_table[0]) -1;
	while (b >= t) {
		int m = (t + b) / 2;
		if (u_decomposition_table[m].nfc < uc)
			t = m + 1;
		else if (u_decomposition_table[m].nfc > uc)
			b = m - 1;
		else {
			*cp1 = u_decomposition_table[m].cp1;
			*cp2 = u_decomposition_table[m].cp2;
			return (1);
		}
	}
	return (0);
}

/*
 * Replace the first code point with `cp` and mark it as changed, so that
 * WRITE_UC() re-encodes it instead of copying the original bytes.
 */
#define REPLACE_UC_WITH(cp) do {		\
	uc = cp;				\
	ucptr = NULL;				\
} while (0)

/*
 * Normalize UTF-8 characters to Form D and copy the result.
 *
 * The input parser and output encoder are chosen from sc->flag.
 * Returns 0 on success, or -1 if a sequence was replaced by the
 * replacement character or memory could not be allocated.
 */
static int
archive_string_normalize_D(struct archive_string *as, const void *_p,
    size_t len, struct archive_string_conv *sc)
{
	const char *s = (const char *)_p;
	char *p, *endp;
	uint32_t uc, uc2;
	size_t w;
	int always_replace, n, n2, ret = 0, spair, ts, tm;
	int (*parse)(uint32_t *, const char *, size_t);
	size_t (*unparse)(char *, size_t, uint32_t);

	always_replace = 1;
	ts = 1;/* text size. */
	if (sc->flag & SCONV_TO_UTF16BE) {
		unparse = unicode_to_utf16be;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16BE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF16LE) {
		unparse = unicode_to_utf16le;
		ts = 2;
		if (sc->flag & SCONV_FROM_UTF16LE)
			always_replace = 0;
	} else if (sc->flag & SCONV_TO_UTF8) {
		unparse = unicode_to_utf8;
		if (sc->flag & SCONV_FROM_UTF8)
			always_replace = 0;
	} else {
		/*
		 * This case is going to be converted to another
		 * character-set through iconv.
		 */
		always_replace = 0;
		if (sc->flag & SCONV_FROM_UTF16BE) {
			unparse = unicode_to_utf16be;
			ts = 2;
		} else if (sc->flag & SCONV_FROM_UTF16LE) {
			unparse = unicode_to_utf16le;
			ts = 2;
		} else {
			unparse = unicode_to_utf8;
		}
	}

	if (sc->flag & SCONV_FROM_UTF16BE) {
		parse = utf16be_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else if (sc->flag & SCONV_FROM_UTF16LE) {
		parse = utf16le_to_unicode;
		tm = 1;
		spair = 4;/* surrogate pair size in UTF-16. */
	} else {
		parse = cesu8_to_unicode;
		tm = ts;
		spair = 6;/* surrogate pair size in UTF-8. */
	}

	if (archive_string_ensure(as, as->length + len * tm + ts) == NULL)
		return (-1);

	p = as->s + as->length;
	endp = as->s + as->buffer_length - ts;
	while ((n = parse(&uc, s, len)) != 0) {
		const char *ucptr;
		uint32_t cp1, cp2;
		int SIndex;
		/* Pending code points that follow `uc`, each stored with
		 * its CCC() value. */
		struct {
			uint32_t uc;
			int ccc;
		} fdc[FDC_MAX];
		int fdi, fdj;
		int ccc;

		/*
		 * Re-entered via goto with (uc, n) set to the code point
		 * that ended the previous collection, which has been parsed
		 * but not yet consumed from `s`.
		 */
check_first_code:
		if (n < 0) {
			/* Use a replaced unicode character. */
			UNPARSE(p, endp, uc);
			s += n*-1;
			len -= n*-1;
			ret = -1;
			continue;
		} else if (n == spair || always_replace)
			/* uc is converted from a surrogate pair.
			 * this should be treated as a changed code. */
			ucptr = NULL;
		else
			ucptr = s;
		s += n;
		len -= n;

		/* Hangul Decomposition. */
		if ((SIndex = uc - HC_SBASE) >= 0 && SIndex < HC_SCOUNT) {
			int L = HC_LBASE + SIndex / HC_NCOUNT;
			int V = HC_VBASE + (SIndex % HC_NCOUNT) / HC_TCOUNT;
			int T = HC_TBASE + SIndex % HC_TCOUNT;

			REPLACE_UC_WITH(L);
			WRITE_UC();
			REPLACE_UC_WITH(V);
			WRITE_UC();
			/* T == HC_TBASE means the syllable has no trailing
			 * consonant. */
			if (T != HC_TBASE) {
				REPLACE_UC_WITH(T);
				WRITE_UC();
			}
			continue;
		}
		if (IS_DECOMPOSABLE_BLOCK(uc) && CCC(uc) != 0) {
			WRITE_UC();
			continue;
		}

		/*
		 * Decompose uc repeatedly: each step keeps cp1 as the new
		 * uc and pushes cp2 onto the front of fdc[].
		 */
		fdi = 0;
		while (get_nfd(&cp1, &cp2, uc) && fdi < FDC_MAX) {
			int k;

			for (k = fdi; k > 0; k--)
				fdc[k] = fdc[k-1];
			fdc[0].ccc = CCC(cp2);
			fdc[0].uc = cp2;
			fdi++;
			REPLACE_UC_WITH(cp1);
		}

		/* Read following code points. */
		while ((n2 = parse(&uc2, s, len)) > 0 &&
		    (ccc = CCC(uc2)) != 0 && fdi < FDC_MAX) {
			int j, k;

			s += n2;
			len -= n2;
			/* Insert before the first entry with a larger ccc. */
			for (j = 0; j < fdi; j++) {
				if (fdc[j].ccc > ccc)
					break;
			}
			if (j < fdi) {
				for (k = fdi; k > j; k--)
					fdc[k] = fdc[k-1];
				fdc[j].ccc = ccc;
				fdc[j].uc = uc2;
			} else {
				fdc[fdi].ccc = ccc;
				fdc[fdi].uc = uc2;
			}
			fdi++;
		}

		WRITE_UC();
		for (fdj = 0; fdj < fdi; fdj++) {
			REPLACE_UC_WITH(fdc[fdj].uc);
			WRITE_UC();
		}

		if (n2 == 0)
			break;
		REPLACE_UC_WITH(uc2);
		n = n2;
		goto check_first_code;
	}
	as->length = p - as->s;
	as->s[as->length] = '\0';
	if (ts == 2)
		as->s[as->length+1] = '\0';
	return (ret);
}

/*
 * libarchive 2.x made incorrect UTF-8 strings in the wrong assumption
 * that WCS is Unicode. It is true for several platforms but some are false.
 * And then people who did not use UTF-8 locale on the non Unicode WCS
 * platform and made a tar file with libarchive(mostly bsdtar) 2.x.
Those 3394 * now cannot get right filename from libarchive 3.x and later since we 3395 * fixed the wrong assumption and it is incompatible to older its versions. 3396 * So we provide special option, "compat-2x.x", for resolving it. 3397 * That option enable the string conversion of libarchive 2.x. 3398 * 3399 * Translates the wrong UTF-8 string made by libarchive 2.x into current 3400 * locale character set and appends to the archive_string. 3401 * Note: returns -1 if conversion fails. 3402 */ 3403 static int 3404 strncat_from_utf8_libarchive2(struct archive_string *as, 3405 const void *_p, size_t len, struct archive_string_conv *sc) 3406 { 3407 const char *s; 3408 int n; 3409 char *p; 3410 char *end; 3411 uint32_t unicode; 3412 #if HAVE_WCRTOMB 3413 mbstate_t shift_state; 3414 3415 memset(&shift_state, 0, sizeof(shift_state)); 3416 #else 3417 /* Clear the shift state before starting. */ 3418 wctomb(NULL, L'\0'); 3419 #endif 3420 (void)sc; /* UNUSED */ 3421 /* 3422 * Allocate buffer for MBS. 3423 * We need this allocation here since it is possible that 3424 * as->s is still NULL. 3425 */ 3426 if (archive_string_ensure(as, as->length + len + 1) == NULL) 3427 return (-1); 3428 3429 s = (const char *)_p; 3430 p = as->s + as->length; 3431 end = as->s + as->buffer_length - MB_CUR_MAX -1; 3432 while ((n = _utf8_to_unicode(&unicode, s, len)) != 0) { 3433 wchar_t wc; 3434 3435 if (p >= end) { 3436 as->length = p - as->s; 3437 /* Re-allocate buffer for MBS. */ 3438 if (archive_string_ensure(as, 3439 as->length + len * 2 + 1) == NULL) 3440 return (-1); 3441 p = as->s + as->length; 3442 end = as->s + as->buffer_length - MB_CUR_MAX -1; 3443 } 3444 3445 /* 3446 * As libarchive 2.x, translates the UTF-8 characters into 3447 * wide-characters in the assumption that WCS is Unicode. 3448 */ 3449 if (n < 0) { 3450 n *= -1; 3451 wc = L'?'; 3452 } else 3453 wc = (wchar_t)unicode; 3454 3455 s += n; 3456 len -= n; 3457 /* 3458 * Translates the wide-character into the current locale MBS. 
3459 */ 3460 #if HAVE_WCRTOMB 3461 n = (int)wcrtomb(p, wc, &shift_state); 3462 #else 3463 n = (int)wctomb(p, wc); 3464 #endif 3465 if (n == -1) 3466 return (-1); 3467 p += n; 3468 } 3469 as->length = p - as->s; 3470 as->s[as->length] = '\0'; 3471 return (0); 3472 } 3473 3474 3475 /* 3476 * Conversion functions between current locale dependent MBS and UTF-16BE. 3477 * strncat_from_utf16be() : UTF-16BE --> MBS 3478 * strncat_to_utf16be() : MBS --> UTF16BE 3479 */ 3480 3481 #if defined(_WIN32) && !defined(__CYGWIN__) 3482 3483 /* 3484 * Convert a UTF-16BE/LE string to current locale and copy the result. 3485 * Return -1 if conversion fails. 3486 */ 3487 static int 3488 win_strncat_from_utf16(struct archive_string *as, const void *_p, size_t bytes, 3489 struct archive_string_conv *sc, int be) 3490 { 3491 struct archive_string tmp; 3492 const char *u16; 3493 int ll; 3494 BOOL defchar; 3495 char *mbs; 3496 size_t mbs_size, b; 3497 int ret = 0; 3498 3499 bytes &= ~1; 3500 if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3501 return (-1); 3502 3503 mbs = as->s + as->length; 3504 mbs_size = as->buffer_length - as->length -1; 3505 3506 if (sc->to_cp == CP_C_LOCALE) { 3507 /* 3508 * "C" locale special process. 
3509 */ 3510 u16 = _p; 3511 ll = 0; 3512 for (b = 0; b < bytes; b += 2) { 3513 uint16_t val; 3514 if (be) 3515 val = archive_be16dec(u16+b); 3516 else 3517 val = archive_le16dec(u16+b); 3518 if (val > 255) { 3519 *mbs++ = '?'; 3520 ret = -1; 3521 } else 3522 *mbs++ = (char)(val&0xff); 3523 ll++; 3524 } 3525 as->length += ll; 3526 as->s[as->length] = '\0'; 3527 return (ret); 3528 } 3529 3530 archive_string_init(&tmp); 3531 if (be) { 3532 if (is_big_endian()) { 3533 u16 = _p; 3534 } else { 3535 if (archive_string_ensure(&tmp, bytes+2) == NULL) 3536 return (-1); 3537 memcpy(tmp.s, _p, bytes); 3538 for (b = 0; b < bytes; b += 2) { 3539 uint16_t val = archive_be16dec(tmp.s+b); 3540 archive_le16enc(tmp.s+b, val); 3541 } 3542 u16 = tmp.s; 3543 } 3544 } else { 3545 if (!is_big_endian()) { 3546 u16 = _p; 3547 } else { 3548 if (archive_string_ensure(&tmp, bytes+2) == NULL) 3549 return (-1); 3550 memcpy(tmp.s, _p, bytes); 3551 for (b = 0; b < bytes; b += 2) { 3552 uint16_t val = archive_le16dec(tmp.s+b); 3553 archive_be16enc(tmp.s+b, val); 3554 } 3555 u16 = tmp.s; 3556 } 3557 } 3558 3559 do { 3560 defchar = 0; 3561 ll = WideCharToMultiByte(sc->to_cp, 0, 3562 (LPCWSTR)u16, (int)bytes>>1, mbs, (int)mbs_size, 3563 NULL, &defchar); 3564 /* Exit loop if we succeeded */ 3565 if (ll != 0 || 3566 GetLastError() != ERROR_INSUFFICIENT_BUFFER) { 3567 break; 3568 } 3569 /* Else expand buffer and loop to try again. 
*/ 3570 ll = WideCharToMultiByte(sc->to_cp, 0, 3571 (LPCWSTR)u16, (int)bytes, NULL, 0, NULL, NULL); 3572 if (archive_string_ensure(as, ll +1) == NULL) 3573 return (-1); 3574 mbs = as->s + as->length; 3575 mbs_size = as->buffer_length - as->length -1; 3576 } while (1); 3577 archive_string_free(&tmp); 3578 as->length += ll; 3579 as->s[as->length] = '\0'; 3580 if (ll == 0 || defchar) 3581 ret = -1; 3582 return (ret); 3583 } 3584 3585 static int 3586 win_strncat_from_utf16be(struct archive_string *as, const void *_p, 3587 size_t bytes, struct archive_string_conv *sc) 3588 { 3589 return (win_strncat_from_utf16(as, _p, bytes, sc, 1)); 3590 } 3591 3592 static int 3593 win_strncat_from_utf16le(struct archive_string *as, const void *_p, 3594 size_t bytes, struct archive_string_conv *sc) 3595 { 3596 return (win_strncat_from_utf16(as, _p, bytes, sc, 0)); 3597 } 3598 3599 static int 3600 is_big_endian(void) 3601 { 3602 uint16_t d = 1; 3603 3604 return (archive_be16dec(&d) == 1); 3605 } 3606 3607 /* 3608 * Convert a current locale string to UTF-16BE/LE and copy the result. 3609 * Return -1 if conversion fails. 3610 */ 3611 static int 3612 win_strncat_to_utf16(struct archive_string *as16, const void *_p, 3613 size_t length, struct archive_string_conv *sc, int bigendian) 3614 { 3615 const char *s = (const char *)_p; 3616 char *u16; 3617 size_t count, avail; 3618 3619 if (archive_string_ensure(as16, 3620 as16->length + (length + 1) * 2) == NULL) 3621 return (-1); 3622 3623 u16 = as16->s + as16->length; 3624 avail = as16->buffer_length - 2; 3625 if (sc->from_cp == CP_C_LOCALE) { 3626 /* 3627 * "C" locale special process. 
3628 */ 3629 count = 0; 3630 while (count < length && *s) { 3631 if (bigendian) 3632 archive_be16enc(u16, *s); 3633 else 3634 archive_le16enc(u16, *s); 3635 u16 += 2; 3636 s++; 3637 count++; 3638 } 3639 as16->length += count << 1; 3640 as16->s[as16->length] = 0; 3641 as16->s[as16->length+1] = 0; 3642 return (0); 3643 } 3644 do { 3645 count = MultiByteToWideChar(sc->from_cp, 3646 MB_PRECOMPOSED, s, (int)length, (LPWSTR)u16, (int)avail>>1); 3647 /* Exit loop if we succeeded */ 3648 if (count != 0 || 3649 GetLastError() != ERROR_INSUFFICIENT_BUFFER) { 3650 break; 3651 } 3652 /* Expand buffer and try again */ 3653 count = MultiByteToWideChar(sc->from_cp, 3654 MB_PRECOMPOSED, s, (int)length, NULL, 0); 3655 if (archive_string_ensure(as16, (count +1) * 2) 3656 == NULL) 3657 return (-1); 3658 u16 = as16->s + as16->length; 3659 avail = as16->buffer_length - 2; 3660 } while (1); 3661 as16->length += count * 2; 3662 as16->s[as16->length] = 0; 3663 as16->s[as16->length+1] = 0; 3664 if (count == 0) 3665 return (-1); 3666 3667 if (is_big_endian()) { 3668 if (!bigendian) { 3669 while (count > 0) { 3670 uint16_t v = archive_be16dec(u16); 3671 archive_le16enc(u16, v); 3672 u16 += 2; 3673 count--; 3674 } 3675 } 3676 } else { 3677 if (bigendian) { 3678 while (count > 0) { 3679 uint16_t v = archive_le16dec(u16); 3680 archive_be16enc(u16, v); 3681 u16 += 2; 3682 count--; 3683 } 3684 } 3685 } 3686 return (0); 3687 } 3688 3689 static int 3690 win_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3691 size_t length, struct archive_string_conv *sc) 3692 { 3693 return (win_strncat_to_utf16(as16, _p, length, sc, 1)); 3694 } 3695 3696 static int 3697 win_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3698 size_t length, struct archive_string_conv *sc) 3699 { 3700 return (win_strncat_to_utf16(as16, _p, length, sc, 0)); 3701 } 3702 3703 #endif /* _WIN32 && !__CYGWIN__ */ 3704 3705 /* 3706 * Do the best effort for conversions. 
3707 * We cannot handle UTF-16BE character-set without such iconv, 3708 * but there is a chance if a string consists just ASCII code or 3709 * a current locale is UTF-8. 3710 */ 3711 3712 /* 3713 * Convert a UTF-16BE string to current locale and copy the result. 3714 * Return -1 if conversion fails. 3715 */ 3716 static int 3717 best_effort_strncat_from_utf16(struct archive_string *as, const void *_p, 3718 size_t bytes, struct archive_string_conv *sc, int be) 3719 { 3720 const char *utf16 = (const char *)_p; 3721 char *mbs; 3722 uint32_t uc; 3723 int n, ret; 3724 3725 (void)sc; /* UNUSED */ 3726 /* 3727 * Other case, we should do the best effort. 3728 * If all character are ASCII(<0x7f), we can convert it. 3729 * if not , we set a alternative character and return -1. 3730 */ 3731 ret = 0; 3732 if (archive_string_ensure(as, as->length + bytes +1) == NULL) 3733 return (-1); 3734 mbs = as->s + as->length; 3735 3736 while ((n = utf16_to_unicode(&uc, utf16, bytes, be)) != 0) { 3737 if (n < 0) { 3738 n *= -1; 3739 ret = -1; 3740 } 3741 bytes -= n; 3742 utf16 += n; 3743 3744 if (uc > 127) { 3745 /* We cannot handle it. */ 3746 *mbs++ = '?'; 3747 ret = -1; 3748 } else 3749 *mbs++ = (char)uc; 3750 } 3751 as->length = mbs - as->s; 3752 as->s[as->length] = '\0'; 3753 return (ret); 3754 } 3755 3756 static int 3757 best_effort_strncat_from_utf16be(struct archive_string *as, const void *_p, 3758 size_t bytes, struct archive_string_conv *sc) 3759 { 3760 return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 1)); 3761 } 3762 3763 static int 3764 best_effort_strncat_from_utf16le(struct archive_string *as, const void *_p, 3765 size_t bytes, struct archive_string_conv *sc) 3766 { 3767 return (best_effort_strncat_from_utf16(as, _p, bytes, sc, 0)); 3768 } 3769 3770 /* 3771 * Convert a current locale string to UTF-16BE/LE and copy the result. 3772 * Return -1 if conversion fails. 
3773 */ 3774 static int 3775 best_effort_strncat_to_utf16(struct archive_string *as16, const void *_p, 3776 size_t length, struct archive_string_conv *sc, int bigendian) 3777 { 3778 const char *s = (const char *)_p; 3779 char *utf16; 3780 size_t remaining; 3781 int ret; 3782 3783 (void)sc; /* UNUSED */ 3784 /* 3785 * Other case, we should do the best effort. 3786 * If all character are ASCII(<0x7f), we can convert it. 3787 * if not , we set a alternative character and return -1. 3788 */ 3789 ret = 0; 3790 remaining = length; 3791 3792 if (archive_string_ensure(as16, 3793 as16->length + (length + 1) * 2) == NULL) 3794 return (-1); 3795 3796 utf16 = as16->s + as16->length; 3797 while (remaining--) { 3798 unsigned c = *s++; 3799 if (c > 127) { 3800 /* We cannot handle it. */ 3801 c = UNICODE_R_CHAR; 3802 ret = -1; 3803 } 3804 if (bigendian) 3805 archive_be16enc(utf16, c); 3806 else 3807 archive_le16enc(utf16, c); 3808 utf16 += 2; 3809 } 3810 as16->length = utf16 - as16->s; 3811 as16->s[as16->length] = 0; 3812 as16->s[as16->length+1] = 0; 3813 return (ret); 3814 } 3815 3816 static int 3817 best_effort_strncat_to_utf16be(struct archive_string *as16, const void *_p, 3818 size_t length, struct archive_string_conv *sc) 3819 { 3820 return (best_effort_strncat_to_utf16(as16, _p, length, sc, 1)); 3821 } 3822 3823 static int 3824 best_effort_strncat_to_utf16le(struct archive_string *as16, const void *_p, 3825 size_t length, struct archive_string_conv *sc) 3826 { 3827 return (best_effort_strncat_to_utf16(as16, _p, length, sc, 0)); 3828 } 3829 3830 3831 /* 3832 * Multistring operations. 
3833 */ 3834 3835 void 3836 archive_mstring_clean(struct archive_mstring *aes) 3837 { 3838 archive_wstring_free(&(aes->aes_wcs)); 3839 archive_string_free(&(aes->aes_mbs)); 3840 archive_string_free(&(aes->aes_utf8)); 3841 archive_string_free(&(aes->aes_mbs_in_locale)); 3842 aes->aes_set = 0; 3843 } 3844 3845 void 3846 archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring *src) 3847 { 3848 dest->aes_set = src->aes_set; 3849 archive_string_copy(&(dest->aes_mbs), &(src->aes_mbs)); 3850 archive_string_copy(&(dest->aes_utf8), &(src->aes_utf8)); 3851 archive_wstring_copy(&(dest->aes_wcs), &(src->aes_wcs)); 3852 } 3853 3854 int 3855 archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, 3856 const char **p) 3857 { 3858 struct archive_string_conv *sc; 3859 int r; 3860 3861 /* If we already have a UTF8 form, return that immediately. */ 3862 if (aes->aes_set & AES_SET_UTF8) { 3863 *p = aes->aes_utf8.s; 3864 return (0); 3865 } 3866 3867 *p = NULL; 3868 if (aes->aes_set & AES_SET_MBS) { 3869 sc = archive_string_conversion_to_charset(a, "UTF-8", 1); 3870 if (sc == NULL) 3871 return (-1);/* Couldn't allocate memory for sc. */ 3872 r = archive_strncpy_l(&(aes->aes_utf8), aes->aes_mbs.s, 3873 aes->aes_mbs.length, sc); 3874 if (a == NULL) 3875 free_sconv_object(sc); 3876 if (r == 0) { 3877 aes->aes_set |= AES_SET_UTF8; 3878 *p = aes->aes_utf8.s; 3879 return (0);/* success. */ 3880 } else 3881 return (-1);/* failure. */ 3882 } 3883 return (0);/* success. */ 3884 } 3885 3886 int 3887 archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, 3888 const char **p) 3889 { 3890 int r, ret = 0; 3891 3892 (void)a; /* UNUSED */ 3893 /* If we already have an MBS form, return that immediately. */ 3894 if (aes->aes_set & AES_SET_MBS) { 3895 *p = aes->aes_mbs.s; 3896 return (ret); 3897 } 3898 3899 *p = NULL; 3900 /* If there's a WCS form, try converting with the native locale. 
*/ 3901 if (aes->aes_set & AES_SET_WCS) { 3902 archive_string_empty(&(aes->aes_mbs)); 3903 r = archive_string_append_from_wcs(&(aes->aes_mbs), 3904 aes->aes_wcs.s, aes->aes_wcs.length); 3905 *p = aes->aes_mbs.s; 3906 if (r == 0) { 3907 aes->aes_set |= AES_SET_MBS; 3908 return (ret); 3909 } else 3910 ret = -1; 3911 } 3912 3913 /* 3914 * Only a UTF-8 form cannot avail because its conversion already 3915 * failed at archive_mstring_update_utf8(). 3916 */ 3917 return (ret); 3918 } 3919 3920 int 3921 archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, 3922 const wchar_t **wp) 3923 { 3924 int r, ret = 0; 3925 3926 (void)a;/* UNUSED */ 3927 /* Return WCS form if we already have it. */ 3928 if (aes->aes_set & AES_SET_WCS) { 3929 *wp = aes->aes_wcs.s; 3930 return (ret); 3931 } 3932 3933 *wp = NULL; 3934 /* Try converting MBS to WCS using native locale. */ 3935 if (aes->aes_set & AES_SET_MBS) { 3936 archive_wstring_empty(&(aes->aes_wcs)); 3937 r = archive_wstring_append_from_mbs(&(aes->aes_wcs), 3938 aes->aes_mbs.s, aes->aes_mbs.length); 3939 if (r == 0) { 3940 aes->aes_set |= AES_SET_WCS; 3941 *wp = aes->aes_wcs.s; 3942 } else 3943 ret = -1;/* failure. */ 3944 } 3945 return (ret); 3946 } 3947 3948 int 3949 archive_mstring_get_mbs_l(struct archive_mstring *aes, 3950 const char **p, size_t *length, struct archive_string_conv *sc) 3951 { 3952 int r, ret = 0; 3953 3954 #if defined(_WIN32) && !defined(__CYGWIN__) 3955 /* 3956 * Internationalization programming on Windows must use Wide 3957 * characters because Windows platform cannot make locale UTF-8. 
3958 */ 3959 if (sc != NULL && (aes->aes_set & AES_SET_WCS) != 0) { 3960 archive_string_empty(&(aes->aes_mbs_in_locale)); 3961 r = archive_string_append_from_wcs_in_codepage( 3962 &(aes->aes_mbs_in_locale), aes->aes_wcs.s, 3963 aes->aes_wcs.length, sc); 3964 if (r == 0) { 3965 *p = aes->aes_mbs_in_locale.s; 3966 if (length != NULL) 3967 *length = aes->aes_mbs_in_locale.length; 3968 return (0); 3969 } else if (errno == ENOMEM) 3970 return (-1); 3971 else 3972 ret = -1; 3973 } 3974 #endif 3975 3976 /* If there is not an MBS form but is a WCS form, try converting 3977 * with the native locale to be used for translating it to specified 3978 * character-set. */ 3979 if ((aes->aes_set & AES_SET_MBS) == 0 && 3980 (aes->aes_set & AES_SET_WCS) != 0) { 3981 archive_string_empty(&(aes->aes_mbs)); 3982 r = archive_string_append_from_wcs(&(aes->aes_mbs), 3983 aes->aes_wcs.s, aes->aes_wcs.length); 3984 if (r == 0) 3985 aes->aes_set |= AES_SET_MBS; 3986 else if (errno == ENOMEM) 3987 return (-1); 3988 else 3989 ret = -1; 3990 } 3991 /* If we already have an MBS form, use it to be translated to 3992 * specified character-set. */ 3993 if (aes->aes_set & AES_SET_MBS) { 3994 if (sc == NULL) { 3995 /* Conversion is unneeded. 
*/ 3996 *p = aes->aes_mbs.s; 3997 if (length != NULL) 3998 *length = aes->aes_mbs.length; 3999 return (0); 4000 } 4001 ret = archive_strncpy_l(&(aes->aes_mbs_in_locale), 4002 aes->aes_mbs.s, aes->aes_mbs.length, sc); 4003 *p = aes->aes_mbs_in_locale.s; 4004 if (length != NULL) 4005 *length = aes->aes_mbs_in_locale.length; 4006 } else { 4007 *p = NULL; 4008 if (length != NULL) 4009 *length = 0; 4010 } 4011 return (ret); 4012 } 4013 4014 int 4015 archive_mstring_copy_mbs(struct archive_mstring *aes, const char *mbs) 4016 { 4017 if (mbs == NULL) { 4018 aes->aes_set = 0; 4019 return (0); 4020 } 4021 return (archive_mstring_copy_mbs_len(aes, mbs, strlen(mbs))); 4022 } 4023 4024 int 4025 archive_mstring_copy_mbs_len(struct archive_mstring *aes, const char *mbs, 4026 size_t len) 4027 { 4028 if (mbs == NULL) { 4029 aes->aes_set = 0; 4030 return (0); 4031 } 4032 aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ 4033 archive_strncpy(&(aes->aes_mbs), mbs, len); 4034 archive_string_empty(&(aes->aes_utf8)); 4035 archive_wstring_empty(&(aes->aes_wcs)); 4036 return (0); 4037 } 4038 4039 int 4040 archive_mstring_copy_wcs(struct archive_mstring *aes, const wchar_t *wcs) 4041 { 4042 return archive_mstring_copy_wcs_len(aes, wcs, 4043 wcs == NULL ? 0 : wcslen(wcs)); 4044 } 4045 4046 int 4047 archive_mstring_copy_utf8(struct archive_mstring *aes, const char *utf8) 4048 { 4049 if (utf8 == NULL) { 4050 aes->aes_set = 0; 4051 } 4052 aes->aes_set = AES_SET_UTF8; 4053 archive_string_empty(&(aes->aes_mbs)); 4054 archive_string_empty(&(aes->aes_wcs)); 4055 archive_strncpy(&(aes->aes_utf8), utf8, strlen(utf8)); 4056 return (int)strlen(utf8); 4057 } 4058 4059 int 4060 archive_mstring_copy_wcs_len(struct archive_mstring *aes, const wchar_t *wcs, 4061 size_t len) 4062 { 4063 if (wcs == NULL) { 4064 aes->aes_set = 0; 4065 } 4066 aes->aes_set = AES_SET_WCS; /* Only WCS form set. 
*/ 4067 archive_string_empty(&(aes->aes_mbs)); 4068 archive_string_empty(&(aes->aes_utf8)); 4069 archive_wstrncpy(&(aes->aes_wcs), wcs, len); 4070 return (0); 4071 } 4072 4073 int 4074 archive_mstring_copy_mbs_len_l(struct archive_mstring *aes, 4075 const char *mbs, size_t len, struct archive_string_conv *sc) 4076 { 4077 int r; 4078 4079 if (mbs == NULL) { 4080 aes->aes_set = 0; 4081 return (0); 4082 } 4083 archive_string_empty(&(aes->aes_mbs)); 4084 archive_wstring_empty(&(aes->aes_wcs)); 4085 archive_string_empty(&(aes->aes_utf8)); 4086 #if defined(_WIN32) && !defined(__CYGWIN__) 4087 /* 4088 * Internationalization programming on Windows must use Wide 4089 * characters because Windows platform cannot make locale UTF-8. 4090 */ 4091 if (sc == NULL) { 4092 if (archive_string_append(&(aes->aes_mbs), 4093 mbs, mbsnbytes(mbs, len)) == NULL) { 4094 aes->aes_set = 0; 4095 r = -1; 4096 } else { 4097 aes->aes_set = AES_SET_MBS; 4098 r = 0; 4099 } 4100 #if defined(HAVE_ICONV) 4101 } else if (sc != NULL && sc->cd_w != (iconv_t)-1) { 4102 /* 4103 * This case happens only when MultiByteToWideChar() cannot 4104 * handle sc->from_cp, and we have to iconv in order to 4105 * translate character-set to wchar_t,UTF-16. 4106 */ 4107 iconv_t cd = sc->cd; 4108 unsigned from_cp; 4109 int flag; 4110 4111 /* 4112 * Translate multi-bytes from some character-set to UTF-8. 4113 */ 4114 sc->cd = sc->cd_w; 4115 r = archive_strncpy_l(&(aes->aes_utf8), mbs, len, sc); 4116 sc->cd = cd; 4117 if (r != 0) { 4118 aes->aes_set = 0; 4119 return (r); 4120 } 4121 aes->aes_set = AES_SET_UTF8; 4122 4123 /* 4124 * Append the UTF-8 string into wstring. 
4125 */ 4126 flag = sc->flag; 4127 sc->flag &= ~(SCONV_NORMALIZATION_C 4128 | SCONV_TO_UTF16| SCONV_FROM_UTF16); 4129 from_cp = sc->from_cp; 4130 sc->from_cp = CP_UTF8; 4131 r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), 4132 aes->aes_utf8.s, aes->aes_utf8.length, sc); 4133 sc->flag = flag; 4134 sc->from_cp = from_cp; 4135 if (r == 0) 4136 aes->aes_set |= AES_SET_WCS; 4137 #endif 4138 } else { 4139 r = archive_wstring_append_from_mbs_in_codepage( 4140 &(aes->aes_wcs), mbs, len, sc); 4141 if (r == 0) 4142 aes->aes_set = AES_SET_WCS; 4143 else 4144 aes->aes_set = 0; 4145 } 4146 #else 4147 r = archive_strncpy_l(&(aes->aes_mbs), mbs, len, sc); 4148 if (r == 0) 4149 aes->aes_set = AES_SET_MBS; /* Only MBS form is set now. */ 4150 else 4151 aes->aes_set = 0; 4152 #endif 4153 return (r); 4154 } 4155 4156 /* 4157 * The 'update' form tries to proactively update all forms of 4158 * this string (WCS and MBS) and returns an error if any of 4159 * them fail. This is used by the 'pax' handler, for instance, 4160 * to detect and report character-conversion failures early while 4161 * still allowing clients to get potentially useful values from 4162 * the more tolerant lazy conversions. (get_mbs and get_wcs will 4163 * strive to give the user something useful, so you can get hopefully 4164 * usable values even if some of the character conversions are failing.) 4165 */ 4166 int 4167 archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, 4168 const char *utf8) 4169 { 4170 struct archive_string_conv *sc; 4171 int r; 4172 4173 if (utf8 == NULL) { 4174 aes->aes_set = 0; 4175 return (0); /* Succeeded in clearing everything. */ 4176 } 4177 4178 /* Save the UTF8 string. */ 4179 archive_strcpy(&(aes->aes_utf8), utf8); 4180 4181 /* Empty the mbs and wcs strings. */ 4182 archive_string_empty(&(aes->aes_mbs)); 4183 archive_wstring_empty(&(aes->aes_wcs)); 4184 4185 aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. 
*/ 4186 4187 /* Try converting UTF-8 to MBS, return false on failure. */ 4188 sc = archive_string_conversion_from_charset(a, "UTF-8", 1); 4189 if (sc == NULL) 4190 return (-1);/* Couldn't allocate memory for sc. */ 4191 r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); 4192 if (a == NULL) 4193 free_sconv_object(sc); 4194 if (r != 0) 4195 return (-1); 4196 aes->aes_set = AES_SET_UTF8 | AES_SET_MBS; /* Both UTF8 and MBS set. */ 4197 4198 /* Try converting MBS to WCS, return false on failure. */ 4199 if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s, 4200 aes->aes_mbs.length)) 4201 return (-1); 4202 aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS; 4203 4204 /* All conversions succeeded. */ 4205 return (0); 4206 } 4207