1 /*- 2 * Copyright (c) 2011-2012 Michihiro NAKAJIMA 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 #include "test.h" 26 27 #include <locale.h> 28 29 #define __LIBARCHIVE_TEST 30 #include "archive_string.h" 31 32 /* 33 Execute the following to rebuild the data for this program: 34 tail -n +36 test_archive_string_conversion.c | /bin/sh 35 # 36 # This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt 37 # 38 if="NormalizationTest.txt" 39 if [ ! -f ${if} ]; then 40 echo "Not found: \"${if}\"" 41 exit 0 42 fi 43 of=test_archive_string_conversion.txt.Z 44 awk -F ';' '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} > ${of}.uu 45 exit 1 46 */ 47 48 static int 49 unicode_to_utf8(char *p, uint32_t uc) 50 { 51 char *_p = p; 52 53 /* Translate code point to UTF8 */ 54 if (uc <= 0x7f) { 55 *p++ = (char)uc; 56 } else if (uc <= 0x7ff) { 57 *p++ = 0xc0 | ((uc >> 6) & 0x1f); 58 *p++ = 0x80 | (uc & 0x3f); 59 } else if (uc <= 0xffff) { 60 *p++ = 0xe0 | ((uc >> 12) & 0x0f); 61 *p++ = 0x80 | ((uc >> 6) & 0x3f); 62 *p++ = 0x80 | (uc & 0x3f); 63 } else { 64 *p++ = 0xf0 | ((uc >> 18) & 0x07); 65 *p++ = 0x80 | ((uc >> 12) & 0x3f); 66 *p++ = 0x80 | ((uc >> 6) & 0x3f); 67 *p++ = 0x80 | (uc & 0x3f); 68 } 69 return ((int)(p - _p)); 70 } 71 72 static void 73 archive_be16enc(void *pp, uint16_t u) 74 { 75 unsigned char *p = (unsigned char *)pp; 76 77 p[0] = (u >> 8) & 0xff; 78 p[1] = u & 0xff; 79 } 80 81 static int 82 unicode_to_utf16be(char *p, uint32_t uc) 83 { 84 char *utf16 = p; 85 86 if (uc > 0xffff) { 87 /* We have a code point that won't fit into a 88 * wchar_t; convert it to a surrogate pair. */ 89 uc -= 0x10000; 90 archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 91 archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 92 return (4); 93 } else { 94 archive_be16enc(utf16, uc); 95 return (2); 96 } 97 } 98 99 static void 100 archive_le16enc(void *pp, uint16_t u) 101 { 102 unsigned char *p = (unsigned char *)pp; 103 104 p[0] = u & 0xff; 105 p[1] = (u >> 8) & 0xff; 106 } 107 108 static size_t 109 unicode_to_utf16le(char *p, uint32_t uc) 110 { 111 char *utf16 = p; 112 113 if (uc > 0xffff) { 114 /* We have a code point that won't fit into a 115 * wchar_t; convert it to a surrogate pair. */ 116 uc -= 0x10000; 117 archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 118 archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 119 return (4); 120 } else { 121 archive_le16enc(utf16, uc); 122 return (2); 123 } 124 } 125 126 static int 127 wc_size(void) 128 { 129 return (sizeof(wchar_t)); 130 } 131 132 static int 133 unicode_to_wc(wchar_t *wp, uint32_t uc) 134 { 135 if (wc_size() == 4) { 136 *wp = (wchar_t)uc; 137 return (1); 138 } 139 if (uc > 0xffff) { 140 /* We have a code point that won't fit into a 141 * wchar_t; convert it to a surrogate pair. */ 142 uc -= 0x10000; 143 *wp++ = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800); 144 *wp = (wchar_t)((uc & 0x3ff) + 0xDC00); 145 return (2); 146 } else { 147 *wp = (wchar_t)uc; 148 return (1); 149 } 150 } 151 152 /* 153 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not 154 * converted to NFD on Mac OS. 155 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html 156 */ 157 static int 158 scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le, 159 const char *pattern, int mac_nfd) 160 { 161 unsigned uc = 0; 162 const char *p = pattern; 163 char *op = out; 164 wchar_t *owp = wout; 165 char *op16be = u16be; 166 char *op16le = u16le; 167 int ret = 0; 168 169 for (;;) { 170 if (*p >= '0' && *p <= '9') 171 uc = (uc << 4) + (*p - '0'); 172 else if (*p >= 'A' && *p <= 'F') 173 uc = (uc << 4) + (*p - 'A' + 0x0a); 174 else { 175 if (mac_nfd && op == out) { 176 /* 177 * These are not converted to NFD on Mac OS. 178 * U+2000 - U+2FFF 179 * U+F900 - U+FAFF 180 * U+2F800 - U+2FAFF 181 */ 182 switch (uc) { 183 case 0x2194: case 0x219A: case 0x219B: 184 case 0x21AE: case 0x21CD: case 0x21CE: 185 case 0x21CF: case 0x2204: case 0x2209: 186 case 0x220C: case 0x2224: case 0x2226: 187 case 0x2241: case 0x2244: case 0x2247: 188 case 0x2249: case 0x2260: case 0x2262: 189 case 0x226D: case 0x226E: case 0x226F: 190 case 0x2270: case 0x2271: case 0x2274: 191 case 0x2275: case 0x2276: case 0x2278: 192 case 0x2279: case 0x227A: case 0x227B: 193 case 0x2280: case 0x2281: case 0x2284: 194 case 0x2285: case 0x2288: case 0x2289: 195 case 0x22AC: case 0x22AD: case 0x22AE: 196 case 0x22AF: case 0x22E0: case 0x22E1: 197 case 0x22E2: case 0x22E3: case 0x22EA: 198 case 0x22EB: case 0x22EC: case 0x22ED: 199 200 /* 201 * Those code points are not converted to 202 * NFD on Mac OS. I do not know the reason 203 * because it is undocumented. 204 * NFC NFD 205 * 1109A ==> 11099 110BA 206 * 1109C ==> 1109B 110BA 207 * 110AB ==> 110A5 110BA 208 */ 209 case 0x1109A: case 0x1109C: case 0x110AB: 210 ret = 1; 211 break; 212 } 213 } 214 op16be += unicode_to_utf16be(op16be, uc); 215 op16le += unicode_to_utf16le(op16le, uc); 216 owp += unicode_to_wc(owp, uc); 217 op += unicode_to_utf8(op, uc); 218 if (!*p) { 219 *op16be++ = 0; 220 *op16be = 0; 221 *op16le++ = 0; 222 *op16le = 0; 223 *owp = L'\0'; 224 *op = '\0'; 225 break; 226 } 227 uc = 0; 228 } 229 p++; 230 } 231 return (ret); 232 } 233 234 static int 235 is_wc_unicode(void) 236 { 237 #if defined(_WIN32) && !defined(__CYGWIN__) 238 return (1); 239 #else 240 return (0); 241 #endif 242 } 243 244 /* 245 * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters. 246 * On Mac OS, the characters to be Form D. 247 * On other platforms, the characters to be Form C. 248 */ 249 static void 250 test_archive_string_normalization_nfc(const char *testdata) 251 { 252 struct archive *a, *a2; 253 struct archive_string utf8; 254 struct archive_mstring mstr; 255 struct archive_string_conv *f_sconv8, *t_sconv8; 256 struct archive_string_conv *f_sconv16be, *f_sconv16le; 257 FILE *fp; 258 char buff[512]; 259 int line = 0; 260 int locale_is_utf8, wc_is_unicode; 261 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C; 262 263 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); 264 wc_is_unicode = is_wc_unicode(); 265 /* If it doesn't exist, just warn and return. */ 266 if (!locale_is_utf8 && !wc_is_unicode) { 267 skipping("A test of string normalization for NFC requires " 268 "a suitable locale; en_US.UTF-8 not available on this " 269 "system"); 270 return; 271 } 272 273 archive_string_init(&utf8); 274 memset(&mstr, 0, sizeof(mstr)); 275 276 /* 277 * Create string conversion objects. 278 */ 279 assert((a = archive_read_new()) != NULL); 280 assertA(NULL != (f_sconv8 = 281 archive_string_conversion_from_charset(a, "UTF-8", 0))); 282 assertA(NULL != (f_sconv16be = 283 archive_string_conversion_from_charset(a, "UTF-16BE", 0))); 284 assertA(NULL != (f_sconv16le = 285 archive_string_conversion_from_charset(a, "UTF-16LE", 0))); 286 assert((a2 = archive_write_new()) != NULL); 287 assertA(NULL != (t_sconv8 = 288 archive_string_conversion_to_charset(a2, "UTF-8", 0))); 289 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || 290 t_sconv8 == NULL) { 291 /* We cannot continue this test. */ 292 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 293 return; 294 } 295 archive_string_conversion_set_opt(f_sconv8, sconv_opt); 296 archive_string_conversion_set_opt(f_sconv16be, sconv_opt); 297 archive_string_conversion_set_opt(f_sconv16le, sconv_opt); 298 archive_string_conversion_set_opt(t_sconv8, sconv_opt); 299 300 /* Open a test pattern file. */ 301 assert((fp = fopen(testdata, "r")) != NULL); 302 303 /* 304 * Read test data. 305 * Test data format: 306 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' 307 * Unicode pattern format: 308 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} 309 */ 310 while (fgets(buff, sizeof(buff), fp) != NULL) { 311 char nfc[80], nfd[80]; 312 char utf8_nfc[80], utf8_nfd[80]; 313 char utf16be_nfc[80], utf16be_nfd[80]; 314 char utf16le_nfc[80], utf16le_nfd[80]; 315 wchar_t wc_nfc[40], wc_nfd[40]; 316 char *e, *p; 317 const wchar_t *wp; 318 const char *mp; 319 size_t mplen; 320 321 line++; 322 if (buff[0] == '#') 323 continue; 324 p = strchr(buff, ';'); 325 if (p == NULL) 326 continue; 327 *p++ = '\0'; 328 /* Copy an NFC pattern */ 329 strncpy(nfc, buff, sizeof(nfc)-1); 330 nfc[sizeof(nfc)-1] = '\0'; 331 e = p; 332 p = strchr(p, '\n'); 333 if (p == NULL) 334 continue; 335 *p = '\0'; 336 /* Copy an NFD pattern */ 337 strncpy(nfd, e, sizeof(nfd)-1); 338 nfd[sizeof(nfd)-1] = '\0'; 339 340 /* 341 * Get an NFC patterns. 342 */ 343 scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc, 344 nfc, 0); 345 346 /* 347 * Get an NFD patterns. 348 */ 349 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, 350 nfd, 0); 351 352 if (locale_is_utf8) { 353 /* 354 * Normalize an NFD string for import. 355 */ 356 assertEqualInt(0, archive_strcpy_l( 357 &utf8, utf8_nfd, f_sconv8)); 358 failure("NFD(%s) should be converted to NFC(%s):%d", 359 nfd, nfc, line); 360 assertEqualUTF8String(utf8_nfc, utf8.s); 361 362 /* 363 * Normalize an NFC string for import. 364 */ 365 assertEqualInt(0, archive_strcpy_l( 366 &utf8, utf8_nfc, f_sconv8)); 367 failure("NFC(%s) should not be any changed:%d", 368 nfc, line); 369 assertEqualUTF8String(utf8_nfc, utf8.s); 370 371 /* 372 * Copy an NFC string for export. 373 */ 374 assertEqualInt(0, archive_strcpy_l( 375 &utf8, utf8_nfc, t_sconv8)); 376 failure("NFC(%s) should not be any changed:%d", 377 nfc, line); 378 assertEqualUTF8String(utf8_nfc, utf8.s); 379 380 /* 381 * Normalize an NFD string in UTF-16BE for import. 382 */ 383 assertEqualInt(0, archive_strncpy_l( 384 &utf8, utf16be_nfd, 100000, f_sconv16be)); 385 failure("NFD(%s) should be converted to NFC(%s):%d", 386 nfd, nfc, line); 387 assertEqualUTF8String(utf8_nfc, utf8.s); 388 389 /* 390 * Normalize an NFD string in UTF-16LE for import. 391 */ 392 assertEqualInt(0, archive_strncpy_l( 393 &utf8, utf16le_nfd, 100000, f_sconv16le)); 394 failure("NFD(%s) should be converted to NFC(%s):%d", 395 nfd, nfc, line); 396 assertEqualUTF8String(utf8_nfc, utf8.s); 397 } 398 399 /* 400 * Test for archive_mstring interface. 401 * In specific, Windows platform UTF-16BE is directly 402 * converted to/from wide-character to avoid the effect of 403 * current locale since windows platform cannot make 404 * locale UTF-8. 405 */ 406 if (locale_is_utf8 || wc_is_unicode) { 407 /* 408 * Normalize an NFD string in UTF-8 for import. 409 */ 410 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 411 &mstr, utf8_nfd, 100000, f_sconv8)); 412 assertEqualInt(0, 413 archive_mstring_get_wcs(a, &mstr, &wp)); 414 failure("UTF-8 NFD(%s) should be converted " 415 "to WCS NFC(%s):%d", nfd, nfc, line); 416 assertEqualWString(wc_nfc, wp); 417 418 /* 419 * Normalize an NFD string in UTF-16BE for import. 420 */ 421 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 422 &mstr, utf16be_nfd, 100000, f_sconv16be)); 423 assertEqualInt(0, 424 archive_mstring_get_wcs(a, &mstr, &wp)); 425 failure("UTF-8 NFD(%s) should be converted " 426 "to WCS NFC(%s):%d", nfd, nfc, line); 427 assertEqualWString(wc_nfc, wp); 428 429 /* 430 * Normalize an NFD string in UTF-16LE for import. 431 */ 432 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 433 &mstr, utf16le_nfd, 100000, f_sconv16le)); 434 assertEqualInt(0, 435 archive_mstring_get_wcs(a, &mstr, &wp)); 436 failure("UTF-8 NFD(%s) should be converted " 437 "to WCS NFC(%s):%d", nfd, nfc, line); 438 assertEqualWString(wc_nfc, wp); 439 440 /* 441 * Copy an NFC wide-string for export. 442 */ 443 assertEqualInt(0, 444 archive_mstring_copy_wcs(&mstr, wc_nfc)); 445 assertEqualInt(0, archive_mstring_get_mbs_l( 446 a, &mstr, &mp, &mplen, t_sconv8)); 447 failure("WCS NFC(%s) should be UTF-8 NFC:%d" 448 ,nfc, line); 449 assertEqualUTF8String(utf8_nfc, mp); 450 } 451 } 452 453 archive_string_free(&utf8); 454 archive_mstring_clean(&mstr); 455 fclose(fp); 456 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 457 assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); 458 } 459 460 static void 461 test_archive_string_normalization_mac_nfd(const char *testdata) 462 { 463 struct archive *a, *a2; 464 struct archive_string utf8; 465 struct archive_mstring mstr; 466 struct archive_string_conv *f_sconv8, *t_sconv8; 467 struct archive_string_conv *f_sconv16be, *f_sconv16le; 468 FILE *fp; 469 char buff[512]; 470 int line = 0; 471 int locale_is_utf8, wc_is_unicode; 472 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D; 473 474 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); 475 wc_is_unicode = is_wc_unicode(); 476 /* If it doesn't exist, just warn and return. */ 477 if (!locale_is_utf8 && !wc_is_unicode) { 478 skipping("A test of string normalization for NFD requires " 479 "a suitable locale; en_US.UTF-8 not available on this " 480 "system"); 481 return; 482 } 483 484 archive_string_init(&utf8); 485 memset(&mstr, 0, sizeof(mstr)); 486 487 /* 488 * Create string conversion objects. 489 */ 490 assert((a = archive_read_new()) != NULL); 491 assertA(NULL != (f_sconv8 = 492 archive_string_conversion_from_charset(a, "UTF-8", 0))); 493 assertA(NULL != (f_sconv16be = 494 archive_string_conversion_from_charset(a, "UTF-16BE", 0))); 495 assertA(NULL != (f_sconv16le = 496 archive_string_conversion_from_charset(a, "UTF-16LE", 0))); 497 assert((a2 = archive_write_new()) != NULL); 498 assertA(NULL != (t_sconv8 = 499 archive_string_conversion_to_charset(a2, "UTF-8", 0))); 500 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || 501 t_sconv8 == NULL) { 502 /* We cannot continue this test. */ 503 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 504 return; 505 } 506 archive_string_conversion_set_opt(f_sconv8, sconv_opt); 507 archive_string_conversion_set_opt(f_sconv16be, sconv_opt); 508 archive_string_conversion_set_opt(f_sconv16le, sconv_opt); 509 archive_string_conversion_set_opt(t_sconv8, sconv_opt); 510 511 /* Open a test pattern file. */ 512 assert((fp = fopen(testdata, "r")) != NULL); 513 514 /* 515 * Read test data. 516 * Test data format: 517 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' 518 * Unicode pattern format: 519 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} 520 */ 521 while (fgets(buff, sizeof(buff), fp) != NULL) { 522 char nfc[80], nfd[80]; 523 char utf8_nfc[80], utf8_nfd[80]; 524 char utf16be_nfc[80], utf16be_nfd[80]; 525 char utf16le_nfc[80], utf16le_nfd[80]; 526 wchar_t wc_nfc[40], wc_nfd[40]; 527 char *e, *p; 528 const wchar_t *wp; 529 const char *mp; 530 size_t mplen; 531 int should_be_nfc; 532 533 line++; 534 if (buff[0] == '#') 535 continue; 536 p = strchr(buff, ';'); 537 if (p == NULL) 538 continue; 539 *p++ = '\0'; 540 /* Copy an NFC pattern */ 541 strncpy(nfc, buff, sizeof(nfc)-1); 542 nfc[sizeof(nfc)-1] = '\0'; 543 e = p; 544 p = strchr(p, '\n'); 545 if (p == NULL) 546 continue; 547 *p = '\0'; 548 /* Copy an NFD pattern */ 549 strncpy(nfd, e, sizeof(nfd)-1); 550 nfd[sizeof(nfd)-1] = '\0'; 551 552 /* 553 * Get an NFC patterns. 554 */ 555 should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc, 556 utf16be_nfc, utf16le_nfc, nfc, 1); 557 558 /* 559 * Get an NFD patterns. 560 */ 561 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, 562 nfd, 0); 563 564 if (locale_is_utf8) { 565 /* 566 * Normalize an NFC string for import. 567 */ 568 assertEqualInt(0, archive_strcpy_l( 569 &utf8, utf8_nfc, f_sconv8)); 570 if (should_be_nfc) { 571 failure("NFC(%s) should not be converted to" 572 " NFD(%s):%d", nfc, nfd, line); 573 assertEqualUTF8String(utf8_nfc, utf8.s); 574 } else { 575 failure("NFC(%s) should be converted to" 576 " NFD(%s):%d", nfc, nfd, line); 577 assertEqualUTF8String(utf8_nfd, utf8.s); 578 } 579 580 /* 581 * Normalize an NFD string for import. 582 */ 583 assertEqualInt(0, archive_strcpy_l( 584 &utf8, utf8_nfd, f_sconv8)); 585 failure("NFD(%s) should not be any changed:%d", 586 nfd, line); 587 assertEqualUTF8String(utf8_nfd, utf8.s); 588 589 /* 590 * Copy an NFD string for export. 591 */ 592 assertEqualInt(0, archive_strcpy_l( 593 &utf8, utf8_nfd, t_sconv8)); 594 failure("NFD(%s) should not be any changed:%d", 595 nfd, line); 596 assertEqualUTF8String(utf8_nfd, utf8.s); 597 598 /* 599 * Normalize an NFC string in UTF-16BE for import. 600 */ 601 assertEqualInt(0, archive_strncpy_l( 602 &utf8, utf16be_nfc, 100000, f_sconv16be)); 603 if (should_be_nfc) { 604 failure("NFC(%s) should not be converted to" 605 " NFD(%s):%d", nfc, nfd, line); 606 assertEqualUTF8String(utf8_nfc, utf8.s); 607 } else { 608 failure("NFC(%s) should be converted to" 609 " NFD(%s):%d", nfc, nfd, line); 610 assertEqualUTF8String(utf8_nfd, utf8.s); 611 } 612 613 /* 614 * Normalize an NFC string in UTF-16LE for import. 615 */ 616 assertEqualInt(0, archive_strncpy_l( 617 &utf8, utf16le_nfc, 100000, f_sconv16le)); 618 if (should_be_nfc) { 619 failure("NFC(%s) should not be converted to" 620 " NFD(%s):%d", nfc, nfd, line); 621 assertEqualUTF8String(utf8_nfc, utf8.s); 622 } else { 623 failure("NFC(%s) should be converted to" 624 " NFD(%s):%d", nfc, nfd, line); 625 assertEqualUTF8String(utf8_nfd, utf8.s); 626 } 627 } 628 629 /* 630 * Test for archive_mstring interface. 631 * In specific, Windows platform UTF-16BE is directly 632 * converted to/from wide-character to avoid the effect of 633 * current locale since windows platform cannot make 634 * locale UTF-8. 635 */ 636 if (locale_is_utf8 || wc_is_unicode) { 637 /* 638 * Normalize an NFD string in UTF-8 for import. 639 */ 640 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 641 &mstr, utf8_nfc, 100000, f_sconv8)); 642 assertEqualInt(0, 643 archive_mstring_get_wcs(a, &mstr, &wp)); 644 if (should_be_nfc) { 645 failure("UTF-8 NFC(%s) should not be converted " 646 "to WCS NFD(%s):%d", nfc, nfd, line); 647 assertEqualWString(wc_nfc, wp); 648 } else { 649 failure("UTF-8 NFC(%s) should be converted " 650 "to WCS NFD(%s):%d", nfc, nfd, line); 651 assertEqualWString(wc_nfd, wp); 652 } 653 654 /* 655 * Normalize an NFD string in UTF-16BE for import. 656 */ 657 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 658 &mstr, utf16be_nfc, 100000, f_sconv16be)); 659 assertEqualInt(0, 660 archive_mstring_get_wcs(a, &mstr, &wp)); 661 if (should_be_nfc) { 662 failure("UTF-16BE NFC(%s) should not be " 663 "converted to WCS NFD(%s):%d", 664 nfc, nfd, line); 665 assertEqualWString(wc_nfc, wp); 666 } else { 667 failure("UTF-16BE NFC(%s) should be converted " 668 "to WCS NFD(%s):%d", nfc, nfd, line); 669 assertEqualWString(wc_nfd, wp); 670 } 671 672 /* 673 * Normalize an NFD string in UTF-16LE for import. 674 */ 675 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 676 &mstr, utf16le_nfc, 100000, f_sconv16le)); 677 assertEqualInt(0, 678 archive_mstring_get_wcs(a, &mstr, &wp)); 679 if (should_be_nfc) { 680 failure("UTF-16LE NFC(%s) should not be " 681 "converted to WCS NFD(%s):%d", 682 nfc, nfd, line); 683 assertEqualWString(wc_nfc, wp); 684 } else { 685 failure("UTF-16LE NFC(%s) should be converted " 686 "to WCS NFD(%s):%d", nfc, nfd, line); 687 assertEqualWString(wc_nfd, wp); 688 } 689 690 /* 691 * Copy an NFD wide-string for export. 692 */ 693 assertEqualInt(0, archive_mstring_copy_wcs( 694 &mstr, wc_nfd)); 695 assertEqualInt(0, archive_mstring_get_mbs_l( 696 a, &mstr, &mp, &mplen, t_sconv8)); 697 failure("WCS NFD(%s) should be UTF-8 NFD:%d" 698 ,nfd, line); 699 assertEqualUTF8String(utf8_nfd, mp); 700 } 701 } 702 703 archive_string_free(&utf8); 704 archive_mstring_clean(&mstr); 705 fclose(fp); 706 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 707 assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); 708 } 709 710 static void 711 test_archive_string_canonicalization(void) 712 { 713 struct archive *a; 714 struct archive_string_conv *sconv; 715 716 setlocale(LC_ALL, "en_US.UTF-8"); 717 718 assert((a = archive_read_new()) != NULL); 719 720 assertA(NULL != (sconv = 721 archive_string_conversion_to_charset(a, "UTF-8", 1))); 722 failure("Charset name should be UTF-8"); 723 assertEqualString("UTF-8", 724 archive_string_conversion_charset_name(sconv)); 725 726 assertA(NULL != (sconv = 727 archive_string_conversion_to_charset(a, "UTF8", 1))); 728 failure("Charset name should be UTF-8"); 729 assertEqualString("UTF-8", 730 archive_string_conversion_charset_name(sconv)); 731 732 assertA(NULL != (sconv = 733 archive_string_conversion_to_charset(a, "utf8", 1))); 734 failure("Charset name should be UTF-8"); 735 assertEqualString("UTF-8", 736 archive_string_conversion_charset_name(sconv)); 737 738 assertA(NULL != (sconv = 739 archive_string_conversion_to_charset(a, "UTF-16BE", 1))); 740 failure("Charset name should be UTF-16BE"); 741 assertEqualString("UTF-16BE", 742 archive_string_conversion_charset_name(sconv)); 743 744 assertA(NULL != (sconv = 745 archive_string_conversion_to_charset(a, "UTF16BE", 1))); 746 failure("Charset name should be UTF-16BE"); 747 assertEqualString("UTF-16BE", 748 archive_string_conversion_charset_name(sconv)); 749 750 assertA(NULL != (sconv = 751 archive_string_conversion_to_charset(a, "utf16be", 1))); 752 failure("Charset name should be UTF-16BE"); 753 assertEqualString("UTF-16BE", 754 archive_string_conversion_charset_name(sconv)); 755 756 assertA(NULL != (sconv = 757 archive_string_conversion_to_charset(a, "UTF-16LE", 1))); 758 failure("Charset name should be UTF-16LE"); 759 assertEqualString("UTF-16LE", 760 archive_string_conversion_charset_name(sconv)); 761 762 assertA(NULL != (sconv = 763 archive_string_conversion_to_charset(a, "UTF16LE", 1))); 764 failure("Charset name should be UTF-16LE"); 765 assertEqualString("UTF-16LE", 766 archive_string_conversion_charset_name(sconv)); 767 768 assertA(NULL != (sconv = 769 archive_string_conversion_to_charset(a, "utf16le", 1))); 770 failure("Charset name should be UTF-16LE"); 771 assertEqualString("UTF-16LE", 772 archive_string_conversion_charset_name(sconv)); 773 774 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 775 776 } 777 778 static void 779 check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc, 780 const char *exp, const wchar_t *wexp) 781 { 782 /* Do all the tests on a copy so that we can have a clear initial state every time */ 783 struct archive_mstring mstr2; 784 const char *p = NULL; 785 const wchar_t *wp = NULL; 786 size_t len = 0; 787 788 memset(&mstr2, 0, sizeof(mstr2)); 789 790 archive_mstring_copy(&mstr2, mstr); 791 assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p)); 792 assertEqualString(exp, p); 793 p = NULL; 794 795 archive_mstring_copy(&mstr2, mstr); 796 assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p)); 797 assertEqualString(exp, p); 798 p = NULL; 799 800 archive_mstring_copy(&mstr2, mstr); 801 assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp)); 802 assertEqualWString(wexp, wp); 803 wp = NULL; 804 805 archive_mstring_copy(&mstr2, mstr); 806 assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc)); 807 assertEqualString(exp, p); 808 assertEqualInt(len, strlen(exp)); 809 p = NULL; 810 len = 0; 811 812 archive_mstring_clean(&mstr2); 813 } 814 815 /* 816 * Make sure no matter what the input encoding is, the string can be 817 * converted too all the output encodings. 818 */ 819 static void 820 test_archive_string_set_get(void) 821 { 822 struct archive *a; 823 struct archive_mstring mstr; 824 struct archive_string_conv *sc; 825 826 setlocale(LC_ALL, "en_US.UTF-8"); 827 828 assert((a = archive_read_new()) != NULL); 829 memset(&mstr, 0, sizeof(mstr)); 830 831 assertA(NULL != (sc = 832 archive_string_conversion_to_charset(a, "UTF-8", 1))); 833 failure("Charset name should be UTF-8"); 834 assertEqualString("UTF-8", 835 archive_string_conversion_charset_name(sc)); 836 837 assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA")); 838 check_string(a, &mstr, sc, "AAA", L"AAA"); 839 assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB")); 840 check_string(a, &mstr, sc, "BBBB", L"BBBB"); 841 assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12")); 842 check_string(a, &mstr, sc, "CCC12", L"CCC12"); 843 assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc)); 844 check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l"); 845 assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H")); 846 check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H"); 847 848 archive_mstring_clean(&mstr); 849 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 850 851 } 852 853 DEFINE_TEST(test_archive_string_conversion) 854 { 855 static const char reffile[] = "test_archive_string_conversion.txt.Z"; 856 static const char testdata[] = "testdata.txt"; 857 struct archive *a; 858 struct archive_entry *ae; 859 char buff[512]; 860 ssize_t size; 861 FILE *fp; 862 863 /* 864 * Extract a test pattern file. 865 */ 866 extract_reference_file(reffile); 867 assert((a = archive_read_new()) != NULL); 868 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); 869 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a)); 870 assertEqualIntA(a, ARCHIVE_OK, 871 archive_read_open_filename(a, reffile, 512)); 872 873 assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); 874 assert((fp = fopen(testdata, "w")) != NULL); 875 while ((size = archive_read_data(a, buff, 512)) > 0) 876 assertEqualInt(size, fwrite(buff, 1, size, fp)); 877 assertEqualInt(0, fclose(fp)); 878 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 879 880 test_archive_string_normalization_nfc(testdata); 881 test_archive_string_normalization_mac_nfd(testdata); 882 test_archive_string_canonicalization(); 883 test_archive_string_set_get(); 884 } 885 886 DEFINE_TEST(test_archive_string_conversion_utf16_utf8) 887 { 888 #if !defined(_WIN32) || defined(__CYGWIN__) 889 skipping("This test is meant to verify unicode string handling on Windows"); 890 #else 891 struct archive_mstring mstr; 892 const char* utf8_string; 893 894 memset(&mstr, 0, sizeof(mstr)); 895 896 assertEqualInt(ARCHIVE_OK, 897 archive_mstring_copy_wcs(&mstr, L"\U0000043f\U00000440\U00000438")); 898 899 /* Conversion from WCS to UTF-8 should always succeed */ 900 assertEqualInt(ARCHIVE_OK, 901 archive_mstring_get_utf8(NULL, &mstr, &utf8_string)); 902 assertEqualString("\xD0\xBF\xD1\x80\xD0\xB8", utf8_string); 903 904 archive_mstring_clean(&mstr); 905 #endif 906 } 907 908 DEFINE_TEST(test_archive_string_conversion_utf8_utf16) 909 { 910 #if !defined(_WIN32) || defined(__CYGWIN__) 911 skipping("This test is meant to verify unicode string handling on Windows"); 912 #else 913 struct archive_mstring mstr; 914 const wchar_t* wcs_string; 915 916 memset(&mstr, 0, sizeof(mstr)); 917 918 assertEqualInt(6, 919 archive_mstring_copy_utf8(&mstr, "\xD0\xBF\xD1\x80\xD0\xB8")); 920 921 /* Conversion from UTF-8 to WCS should always succeed */ 922 assertEqualInt(ARCHIVE_OK, 923 archive_mstring_get_wcs(NULL, &mstr, &wcs_string)); 924 assertEqualWString(L"\U0000043f\U00000440\U00000438", wcs_string); 925 926 archive_mstring_clean(&mstr); 927 #endif 928 } 929 930 DEFINE_TEST(test_archive_string_update_utf8_win) 931 { 932 #if !defined(_WIN32) || defined(__CYGWIN__) 933 skipping("This test is meant to verify unicode string handling on Windows" 934 " with the C locale"); 935 #else 936 static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8"; 937 static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438"; 938 struct archive_mstring mstr; 939 int r; 940 941 memset(&mstr, 0, sizeof(mstr)); 942 943 r = archive_mstring_update_utf8(NULL, &mstr, utf8_string); 944 945 /* On Windows, this should reliably fail with the C locale */ 946 assertEqualInt(-1, r); 947 assertEqualInt(0, mstr.aes_set & AES_SET_MBS); 948 949 /* NOTE: We access the internals to validate that they were set by the 950 * 'archive_mstring_update_utf8' function */ 951 /* UTF-8 should always be set */ 952 assertEqualInt(AES_SET_UTF8, mstr.aes_set & AES_SET_UTF8); 953 assertEqualString(utf8_string, mstr.aes_utf8.s); 954 /* WCS should always be set as well */ 955 assertEqualInt(AES_SET_WCS, mstr.aes_set & AES_SET_WCS); 956 assertEqualWString(wcs_string, mstr.aes_wcs.s); 957 958 archive_mstring_clean(&mstr); 959 #endif 960 } 961 962 DEFINE_TEST(test_archive_string_update_utf8_utf8) 963 { 964 static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8"; 965 static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438"; 966 struct archive_mstring mstr; 967 int r; 968 969 memset(&mstr, 0, sizeof(mstr)); 970 971 if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) { 972 skipping("UTF-8 not supported on this system."); 973 return; 974 } 975 976 r = archive_mstring_update_utf8(NULL, &mstr, utf8_string); 977 978 /* All conversions should have succeeded */ 979 assertEqualInt(0, r); 980 assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set); 981 assertEqualString(utf8_string, mstr.aes_utf8.s); 982 assertEqualString(utf8_string, mstr.aes_mbs.s); 983 assertEqualWString(wcs_string, mstr.aes_wcs.s); 984 985 archive_mstring_clean(&mstr); 986 } 987 988 DEFINE_TEST(test_archive_string_update_utf8_koi8) 989 { 990 static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8"; 991 static const char koi8_string[] = "\xD0\xD2\xC9"; 992 static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438"; 993 struct archive_mstring mstr; 994 int r; 995 996 memset(&mstr, 0, sizeof(mstr)); 997 998 if (setlocale(LC_ALL, "ru_RU.KOI8-R") == NULL) { 999 skipping("KOI8-R locale not available on this system."); 1000 return; 1001 } 1002 1003 r = archive_mstring_update_utf8(NULL, &mstr, utf8_string); 1004 1005 /* All conversions should have succeeded */ 1006 assertEqualInt(0, r); 1007 assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set); 1008 assertEqualString(utf8_string, mstr.aes_utf8.s); 1009 assertEqualString(koi8_string, mstr.aes_mbs.s); 1010 #if defined(_WIN32) && !defined(__CYGWIN__) 1011 assertEqualWString(wcs_string, mstr.aes_wcs.s); 1012 #else 1013 /* No guarantee of how WCS strings behave, however this test test is 1014 * primarily meant for Windows */ 1015 (void)wcs_string; 1016 #endif 1017 1018 archive_mstring_clean(&mstr); 1019 } 1020