1 /*- 2 * Copyright (c) 2011-2012 Michihiro NAKAJIMA 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 #include "test.h" 26 __FBSDID("$FreeBSD$"); 27 28 #include <locale.h> 29 30 #define __LIBARCHIVE_TEST 31 #include "archive_string.h" 32 33 /* 34 Execute the following to rebuild the data for this program: 35 tail -n +36 test_archive_string_conversion.c | /bin/sh 36 # 37 # This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt 38 # 39 if="NormalizationTest.txt" 40 if [ ! -f ${if} ]; then 41 echo "Not found: \"${if}\"" 42 exit 0 43 fi 44 of=test_archive_string_conversion.txt.Z 45 echo "\$FreeBSD\$" > ${of}.uu 46 awk -F ';' '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu 47 exit 1 48 */ 49 50 static int 51 unicode_to_utf8(char *p, uint32_t uc) 52 { 53 char *_p = p; 54 55 /* Translate code point to UTF8 */ 56 if (uc <= 0x7f) { 57 *p++ = (char)uc; 58 } else if (uc <= 0x7ff) { 59 *p++ = 0xc0 | ((uc >> 6) & 0x1f); 60 *p++ = 0x80 | (uc & 0x3f); 61 } else if (uc <= 0xffff) { 62 *p++ = 0xe0 | ((uc >> 12) & 0x0f); 63 *p++ = 0x80 | ((uc >> 6) & 0x3f); 64 *p++ = 0x80 | (uc & 0x3f); 65 } else { 66 *p++ = 0xf0 | ((uc >> 18) & 0x07); 67 *p++ = 0x80 | ((uc >> 12) & 0x3f); 68 *p++ = 0x80 | ((uc >> 6) & 0x3f); 69 *p++ = 0x80 | (uc & 0x3f); 70 } 71 return ((int)(p - _p)); 72 } 73 74 static void 75 archive_be16enc(void *pp, uint16_t u) 76 { 77 unsigned char *p = (unsigned char *)pp; 78 79 p[0] = (u >> 8) & 0xff; 80 p[1] = u & 0xff; 81 } 82 83 static int 84 unicode_to_utf16be(char *p, uint32_t uc) 85 { 86 char *utf16 = p; 87 88 if (uc > 0xffff) { 89 /* We have a code point that won't fit into a 90 * wchar_t; convert it to a surrogate pair. */ 91 uc -= 0x10000; 92 archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 93 archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 94 return (4); 95 } else { 96 archive_be16enc(utf16, uc); 97 return (2); 98 } 99 } 100 101 static void 102 archive_le16enc(void *pp, uint16_t u) 103 { 104 unsigned char *p = (unsigned char *)pp; 105 106 p[0] = u & 0xff; 107 p[1] = (u >> 8) & 0xff; 108 } 109 110 static size_t 111 unicode_to_utf16le(char *p, uint32_t uc) 112 { 113 char *utf16 = p; 114 115 if (uc > 0xffff) { 116 /* We have a code point that won't fit into a 117 * wchar_t; convert it to a surrogate pair. */ 118 uc -= 0x10000; 119 archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); 120 archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); 121 return (4); 122 } else { 123 archive_le16enc(utf16, uc); 124 return (2); 125 } 126 } 127 128 static int 129 wc_size(void) 130 { 131 return (sizeof(wchar_t)); 132 } 133 134 static int 135 unicode_to_wc(wchar_t *wp, uint32_t uc) 136 { 137 if (wc_size() == 4) { 138 *wp = (wchar_t)uc; 139 return (1); 140 } 141 if (uc > 0xffff) { 142 /* We have a code point that won't fit into a 143 * wchar_t; convert it to a surrogate pair. */ 144 uc -= 0x10000; 145 *wp++ = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800); 146 *wp = (wchar_t)((uc & 0x3ff) + 0xDC00); 147 return (2); 148 } else { 149 *wp = (wchar_t)uc; 150 return (1); 151 } 152 } 153 154 /* 155 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not 156 * converted to NFD on Mac OS. 157 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html 158 */ 159 static int 160 scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le, 161 const char *pattern, int mac_nfd) 162 { 163 unsigned uc = 0; 164 const char *p = pattern; 165 char *op = out; 166 wchar_t *owp = wout; 167 char *op16be = u16be; 168 char *op16le = u16le; 169 int ret = 0; 170 171 for (;;) { 172 if (*p >= '0' && *p <= '9') 173 uc = (uc << 4) + (*p - '0'); 174 else if (*p >= 'A' && *p <= 'F') 175 uc = (uc << 4) + (*p - 'A' + 0x0a); 176 else { 177 if (mac_nfd && op == out) { 178 /* 179 * These are not converted to NFD on Mac OS. 180 * U+2000 - U+2FFF 181 * U+F900 - U+FAFF 182 * U+2F800 - U+2FAFF 183 */ 184 switch (uc) { 185 case 0x2194: case 0x219A: case 0x219B: 186 case 0x21AE: case 0x21CD: case 0x21CE: 187 case 0x21CF: case 0x2204: case 0x2209: 188 case 0x220C: case 0x2224: case 0x2226: 189 case 0x2241: case 0x2244: case 0x2247: 190 case 0x2249: case 0x2260: case 0x2262: 191 case 0x226D: case 0x226E: case 0x226F: 192 case 0x2270: case 0x2271: case 0x2274: 193 case 0x2275: case 0x2276: case 0x2278: 194 case 0x2279: case 0x227A: case 0x227B: 195 case 0x2280: case 0x2281: case 0x2284: 196 case 0x2285: case 0x2288: case 0x2289: 197 case 0x22AC: case 0x22AD: case 0x22AE: 198 case 0x22AF: case 0x22E0: case 0x22E1: 199 case 0x22E2: case 0x22E3: case 0x22EA: 200 case 0x22EB: case 0x22EC: case 0x22ED: 201 202 /* 203 * Those code points are not converted to 204 * NFD on Mac OS. I do not know the reason 205 * because it is undocumented. 206 * NFC NFD 207 * 1109A ==> 11099 110BA 208 * 1109C ==> 1109B 110BA 209 * 110AB ==> 110A5 110BA 210 */ 211 case 0x1109A: case 0x1109C: case 0x110AB: 212 ret = 1; 213 break; 214 } 215 } 216 op16be += unicode_to_utf16be(op16be, uc); 217 op16le += unicode_to_utf16le(op16le, uc); 218 owp += unicode_to_wc(owp, uc); 219 op += unicode_to_utf8(op, uc); 220 if (!*p) { 221 *op16be++ = 0; 222 *op16be = 0; 223 *op16le++ = 0; 224 *op16le = 0; 225 *owp = L'\0'; 226 *op = '\0'; 227 break; 228 } 229 uc = 0; 230 } 231 p++; 232 } 233 return (ret); 234 } 235 236 static int 237 is_wc_unicode(void) 238 { 239 #if defined(_WIN32) && !defined(__CYGWIN__) 240 return (1); 241 #else 242 return (0); 243 #endif 244 } 245 246 /* 247 * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters. 248 * On Mac OS, the characters to be Form D. 249 * On other platforms, the characters to be Form C. 250 */ 251 static void 252 test_archive_string_normalization_nfc(const char *testdata) 253 { 254 struct archive *a, *a2; 255 struct archive_string utf8; 256 struct archive_mstring mstr; 257 struct archive_string_conv *f_sconv8, *t_sconv8; 258 struct archive_string_conv *f_sconv16be, *f_sconv16le; 259 FILE *fp; 260 char buff[512]; 261 int line = 0; 262 int locale_is_utf8, wc_is_unicode; 263 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C; 264 265 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); 266 wc_is_unicode = is_wc_unicode(); 267 /* If it doesn't exist, just warn and return. */ 268 if (!locale_is_utf8 && !wc_is_unicode) { 269 skipping("A test of string normalization for NFC requires " 270 "a suitable locale; en_US.UTF-8 not available on this " 271 "system"); 272 return; 273 } 274 275 archive_string_init(&utf8); 276 memset(&mstr, 0, sizeof(mstr)); 277 278 /* 279 * Create string conversion objects. 280 */ 281 assert((a = archive_read_new()) != NULL); 282 assertA(NULL != (f_sconv8 = 283 archive_string_conversion_from_charset(a, "UTF-8", 0))); 284 assertA(NULL != (f_sconv16be = 285 archive_string_conversion_from_charset(a, "UTF-16BE", 0))); 286 assertA(NULL != (f_sconv16le = 287 archive_string_conversion_from_charset(a, "UTF-16LE", 0))); 288 assert((a2 = archive_write_new()) != NULL); 289 assertA(NULL != (t_sconv8 = 290 archive_string_conversion_to_charset(a2, "UTF-8", 0))); 291 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || 292 t_sconv8 == NULL) { 293 /* We cannot continue this test. */ 294 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 295 return; 296 } 297 archive_string_conversion_set_opt(f_sconv8, sconv_opt); 298 archive_string_conversion_set_opt(f_sconv16be, sconv_opt); 299 archive_string_conversion_set_opt(f_sconv16le, sconv_opt); 300 archive_string_conversion_set_opt(t_sconv8, sconv_opt); 301 302 /* Open a test pattern file. */ 303 assert((fp = fopen(testdata, "r")) != NULL); 304 305 /* 306 * Read test data. 307 * Test data format: 308 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' 309 * Unicode pattern format: 310 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} 311 */ 312 while (fgets(buff, sizeof(buff), fp) != NULL) { 313 char nfc[80], nfd[80]; 314 char utf8_nfc[80], utf8_nfd[80]; 315 char utf16be_nfc[80], utf16be_nfd[80]; 316 char utf16le_nfc[80], utf16le_nfd[80]; 317 wchar_t wc_nfc[40], wc_nfd[40]; 318 char *e, *p; 319 const wchar_t *wp; 320 const char *mp; 321 size_t mplen; 322 323 line++; 324 if (buff[0] == '#') 325 continue; 326 p = strchr(buff, ';'); 327 if (p == NULL) 328 continue; 329 *p++ = '\0'; 330 /* Copy an NFC pattern */ 331 strncpy(nfc, buff, sizeof(nfc)-1); 332 nfc[sizeof(nfc)-1] = '\0'; 333 e = p; 334 p = strchr(p, '\n'); 335 if (p == NULL) 336 continue; 337 *p = '\0'; 338 /* Copy an NFD pattern */ 339 strncpy(nfd, e, sizeof(nfd)-1); 340 nfd[sizeof(nfd)-1] = '\0'; 341 342 /* 343 * Get an NFC patterns. 344 */ 345 scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc, 346 nfc, 0); 347 348 /* 349 * Get an NFD patterns. 350 */ 351 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, 352 nfd, 0); 353 354 if (locale_is_utf8) { 355 /* 356 * Normalize an NFD string for import. 357 */ 358 assertEqualInt(0, archive_strcpy_l( 359 &utf8, utf8_nfd, f_sconv8)); 360 failure("NFD(%s) should be converted to NFC(%s):%d", 361 nfd, nfc, line); 362 assertEqualUTF8String(utf8_nfc, utf8.s); 363 364 /* 365 * Normalize an NFC string for import. 366 */ 367 assertEqualInt(0, archive_strcpy_l( 368 &utf8, utf8_nfc, f_sconv8)); 369 failure("NFC(%s) should not be any changed:%d", 370 nfc, line); 371 assertEqualUTF8String(utf8_nfc, utf8.s); 372 373 /* 374 * Copy an NFC string for export. 375 */ 376 assertEqualInt(0, archive_strcpy_l( 377 &utf8, utf8_nfc, t_sconv8)); 378 failure("NFC(%s) should not be any changed:%d", 379 nfc, line); 380 assertEqualUTF8String(utf8_nfc, utf8.s); 381 382 /* 383 * Normalize an NFD string in UTF-16BE for import. 384 */ 385 assertEqualInt(0, archive_strncpy_l( 386 &utf8, utf16be_nfd, 100000, f_sconv16be)); 387 failure("NFD(%s) should be converted to NFC(%s):%d", 388 nfd, nfc, line); 389 assertEqualUTF8String(utf8_nfc, utf8.s); 390 391 /* 392 * Normalize an NFD string in UTF-16LE for import. 393 */ 394 assertEqualInt(0, archive_strncpy_l( 395 &utf8, utf16le_nfd, 100000, f_sconv16le)); 396 failure("NFD(%s) should be converted to NFC(%s):%d", 397 nfd, nfc, line); 398 assertEqualUTF8String(utf8_nfc, utf8.s); 399 } 400 401 /* 402 * Test for archive_mstring interface. 403 * In specific, Windows platform UTF-16BE is directly 404 * converted to/from wide-character to avoid the effect of 405 * current locale since windows platform cannot make 406 * locale UTF-8. 407 */ 408 if (locale_is_utf8 || wc_is_unicode) { 409 /* 410 * Normalize an NFD string in UTF-8 for import. 411 */ 412 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 413 &mstr, utf8_nfd, 100000, f_sconv8)); 414 assertEqualInt(0, 415 archive_mstring_get_wcs(a, &mstr, &wp)); 416 failure("UTF-8 NFD(%s) should be converted " 417 "to WCS NFC(%s):%d", nfd, nfc, line); 418 assertEqualWString(wc_nfc, wp); 419 420 /* 421 * Normalize an NFD string in UTF-16BE for import. 422 */ 423 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 424 &mstr, utf16be_nfd, 100000, f_sconv16be)); 425 assertEqualInt(0, 426 archive_mstring_get_wcs(a, &mstr, &wp)); 427 failure("UTF-8 NFD(%s) should be converted " 428 "to WCS NFC(%s):%d", nfd, nfc, line); 429 assertEqualWString(wc_nfc, wp); 430 431 /* 432 * Normalize an NFD string in UTF-16LE for import. 433 */ 434 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 435 &mstr, utf16le_nfd, 100000, f_sconv16le)); 436 assertEqualInt(0, 437 archive_mstring_get_wcs(a, &mstr, &wp)); 438 failure("UTF-8 NFD(%s) should be converted " 439 "to WCS NFC(%s):%d", nfd, nfc, line); 440 assertEqualWString(wc_nfc, wp); 441 442 /* 443 * Copy an NFC wide-string for export. 444 */ 445 assertEqualInt(0, 446 archive_mstring_copy_wcs(&mstr, wc_nfc)); 447 assertEqualInt(0, archive_mstring_get_mbs_l( 448 a, &mstr, &mp, &mplen, t_sconv8)); 449 failure("WCS NFC(%s) should be UTF-8 NFC:%d" 450 ,nfc, line); 451 assertEqualUTF8String(utf8_nfc, mp); 452 } 453 } 454 455 archive_string_free(&utf8); 456 archive_mstring_clean(&mstr); 457 fclose(fp); 458 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 459 assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); 460 } 461 462 static void 463 test_archive_string_normalization_mac_nfd(const char *testdata) 464 { 465 struct archive *a, *a2; 466 struct archive_string utf8; 467 struct archive_mstring mstr; 468 struct archive_string_conv *f_sconv8, *t_sconv8; 469 struct archive_string_conv *f_sconv16be, *f_sconv16le; 470 FILE *fp; 471 char buff[512]; 472 int line = 0; 473 int locale_is_utf8, wc_is_unicode; 474 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D; 475 476 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); 477 wc_is_unicode = is_wc_unicode(); 478 /* If it doesn't exist, just warn and return. */ 479 if (!locale_is_utf8 && !wc_is_unicode) { 480 skipping("A test of string normalization for NFD requires " 481 "a suitable locale; en_US.UTF-8 not available on this " 482 "system"); 483 return; 484 } 485 486 archive_string_init(&utf8); 487 memset(&mstr, 0, sizeof(mstr)); 488 489 /* 490 * Create string conversion objects. 491 */ 492 assert((a = archive_read_new()) != NULL); 493 assertA(NULL != (f_sconv8 = 494 archive_string_conversion_from_charset(a, "UTF-8", 0))); 495 assertA(NULL != (f_sconv16be = 496 archive_string_conversion_from_charset(a, "UTF-16BE", 0))); 497 assertA(NULL != (f_sconv16le = 498 archive_string_conversion_from_charset(a, "UTF-16LE", 0))); 499 assert((a2 = archive_write_new()) != NULL); 500 assertA(NULL != (t_sconv8 = 501 archive_string_conversion_to_charset(a2, "UTF-8", 0))); 502 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || 503 t_sconv8 == NULL) { 504 /* We cannot continue this test. */ 505 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 506 return; 507 } 508 archive_string_conversion_set_opt(f_sconv8, sconv_opt); 509 archive_string_conversion_set_opt(f_sconv16be, sconv_opt); 510 archive_string_conversion_set_opt(f_sconv16le, sconv_opt); 511 archive_string_conversion_set_opt(t_sconv8, sconv_opt); 512 513 /* Open a test pattern file. */ 514 assert((fp = fopen(testdata, "r")) != NULL); 515 516 /* 517 * Read test data. 518 * Test data format: 519 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' 520 * Unicode pattern format: 521 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} 522 */ 523 while (fgets(buff, sizeof(buff), fp) != NULL) { 524 char nfc[80], nfd[80]; 525 char utf8_nfc[80], utf8_nfd[80]; 526 char utf16be_nfc[80], utf16be_nfd[80]; 527 char utf16le_nfc[80], utf16le_nfd[80]; 528 wchar_t wc_nfc[40], wc_nfd[40]; 529 char *e, *p; 530 const wchar_t *wp; 531 const char *mp; 532 size_t mplen; 533 int should_be_nfc; 534 535 line++; 536 if (buff[0] == '#') 537 continue; 538 p = strchr(buff, ';'); 539 if (p == NULL) 540 continue; 541 *p++ = '\0'; 542 /* Copy an NFC pattern */ 543 strncpy(nfc, buff, sizeof(nfc)-1); 544 nfc[sizeof(nfc)-1] = '\0'; 545 e = p; 546 p = strchr(p, '\n'); 547 if (p == NULL) 548 continue; 549 *p = '\0'; 550 /* Copy an NFD pattern */ 551 strncpy(nfd, e, sizeof(nfd)-1); 552 nfd[sizeof(nfd)-1] = '\0'; 553 554 /* 555 * Get an NFC patterns. 556 */ 557 should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc, 558 utf16be_nfc, utf16le_nfc, nfc, 1); 559 560 /* 561 * Get an NFD patterns. 562 */ 563 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, 564 nfd, 0); 565 566 if (locale_is_utf8) { 567 /* 568 * Normalize an NFC string for import. 569 */ 570 assertEqualInt(0, archive_strcpy_l( 571 &utf8, utf8_nfc, f_sconv8)); 572 if (should_be_nfc) { 573 failure("NFC(%s) should not be converted to" 574 " NFD(%s):%d", nfc, nfd, line); 575 assertEqualUTF8String(utf8_nfc, utf8.s); 576 } else { 577 failure("NFC(%s) should be converted to" 578 " NFD(%s):%d", nfc, nfd, line); 579 assertEqualUTF8String(utf8_nfd, utf8.s); 580 } 581 582 /* 583 * Normalize an NFD string for import. 584 */ 585 assertEqualInt(0, archive_strcpy_l( 586 &utf8, utf8_nfd, f_sconv8)); 587 failure("NFD(%s) should not be any changed:%d", 588 nfd, line); 589 assertEqualUTF8String(utf8_nfd, utf8.s); 590 591 /* 592 * Copy an NFD string for export. 593 */ 594 assertEqualInt(0, archive_strcpy_l( 595 &utf8, utf8_nfd, t_sconv8)); 596 failure("NFD(%s) should not be any changed:%d", 597 nfd, line); 598 assertEqualUTF8String(utf8_nfd, utf8.s); 599 600 /* 601 * Normalize an NFC string in UTF-16BE for import. 602 */ 603 assertEqualInt(0, archive_strncpy_l( 604 &utf8, utf16be_nfc, 100000, f_sconv16be)); 605 if (should_be_nfc) { 606 failure("NFC(%s) should not be converted to" 607 " NFD(%s):%d", nfc, nfd, line); 608 assertEqualUTF8String(utf8_nfc, utf8.s); 609 } else { 610 failure("NFC(%s) should be converted to" 611 " NFD(%s):%d", nfc, nfd, line); 612 assertEqualUTF8String(utf8_nfd, utf8.s); 613 } 614 615 /* 616 * Normalize an NFC string in UTF-16LE for import. 617 */ 618 assertEqualInt(0, archive_strncpy_l( 619 &utf8, utf16le_nfc, 100000, f_sconv16le)); 620 if (should_be_nfc) { 621 failure("NFC(%s) should not be converted to" 622 " NFD(%s):%d", nfc, nfd, line); 623 assertEqualUTF8String(utf8_nfc, utf8.s); 624 } else { 625 failure("NFC(%s) should be converted to" 626 " NFD(%s):%d", nfc, nfd, line); 627 assertEqualUTF8String(utf8_nfd, utf8.s); 628 } 629 } 630 631 /* 632 * Test for archive_mstring interface. 633 * In specific, Windows platform UTF-16BE is directly 634 * converted to/from wide-character to avoid the effect of 635 * current locale since windows platform cannot make 636 * locale UTF-8. 637 */ 638 if (locale_is_utf8 || wc_is_unicode) { 639 /* 640 * Normalize an NFD string in UTF-8 for import. 641 */ 642 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 643 &mstr, utf8_nfc, 100000, f_sconv8)); 644 assertEqualInt(0, 645 archive_mstring_get_wcs(a, &mstr, &wp)); 646 if (should_be_nfc) { 647 failure("UTF-8 NFC(%s) should not be converted " 648 "to WCS NFD(%s):%d", nfc, nfd, line); 649 assertEqualWString(wc_nfc, wp); 650 } else { 651 failure("UTF-8 NFC(%s) should be converted " 652 "to WCS NFD(%s):%d", nfc, nfd, line); 653 assertEqualWString(wc_nfd, wp); 654 } 655 656 /* 657 * Normalize an NFD string in UTF-16BE for import. 658 */ 659 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 660 &mstr, utf16be_nfc, 100000, f_sconv16be)); 661 assertEqualInt(0, 662 archive_mstring_get_wcs(a, &mstr, &wp)); 663 if (should_be_nfc) { 664 failure("UTF-16BE NFC(%s) should not be " 665 "converted to WCS NFD(%s):%d", 666 nfc, nfd, line); 667 assertEqualWString(wc_nfc, wp); 668 } else { 669 failure("UTF-16BE NFC(%s) should be converted " 670 "to WCS NFD(%s):%d", nfc, nfd, line); 671 assertEqualWString(wc_nfd, wp); 672 } 673 674 /* 675 * Normalize an NFD string in UTF-16LE for import. 676 */ 677 assertEqualInt(0, archive_mstring_copy_mbs_len_l( 678 &mstr, utf16le_nfc, 100000, f_sconv16le)); 679 assertEqualInt(0, 680 archive_mstring_get_wcs(a, &mstr, &wp)); 681 if (should_be_nfc) { 682 failure("UTF-16LE NFC(%s) should not be " 683 "converted to WCS NFD(%s):%d", 684 nfc, nfd, line); 685 assertEqualWString(wc_nfc, wp); 686 } else { 687 failure("UTF-16LE NFC(%s) should be converted " 688 "to WCS NFD(%s):%d", nfc, nfd, line); 689 assertEqualWString(wc_nfd, wp); 690 } 691 692 /* 693 * Copy an NFD wide-string for export. 694 */ 695 assertEqualInt(0, archive_mstring_copy_wcs( 696 &mstr, wc_nfd)); 697 assertEqualInt(0, archive_mstring_get_mbs_l( 698 a, &mstr, &mp, &mplen, t_sconv8)); 699 failure("WCS NFD(%s) should be UTF-8 NFD:%d" 700 ,nfd, line); 701 assertEqualUTF8String(utf8_nfd, mp); 702 } 703 } 704 705 archive_string_free(&utf8); 706 archive_mstring_clean(&mstr); 707 fclose(fp); 708 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 709 assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); 710 } 711 712 static void 713 test_archive_string_canonicalization(void) 714 { 715 struct archive *a; 716 struct archive_string_conv *sconv; 717 718 setlocale(LC_ALL, "en_US.UTF-8"); 719 720 assert((a = archive_read_new()) != NULL); 721 722 assertA(NULL != (sconv = 723 archive_string_conversion_to_charset(a, "UTF-8", 1))); 724 failure("Charset name should be UTF-8"); 725 assertEqualString("UTF-8", 726 archive_string_conversion_charset_name(sconv)); 727 728 assertA(NULL != (sconv = 729 archive_string_conversion_to_charset(a, "UTF8", 1))); 730 failure("Charset name should be UTF-8"); 731 assertEqualString("UTF-8", 732 archive_string_conversion_charset_name(sconv)); 733 734 assertA(NULL != (sconv = 735 archive_string_conversion_to_charset(a, "utf8", 1))); 736 failure("Charset name should be UTF-8"); 737 assertEqualString("UTF-8", 738 archive_string_conversion_charset_name(sconv)); 739 740 assertA(NULL != (sconv = 741 archive_string_conversion_to_charset(a, "UTF-16BE", 1))); 742 failure("Charset name should be UTF-16BE"); 743 assertEqualString("UTF-16BE", 744 archive_string_conversion_charset_name(sconv)); 745 746 assertA(NULL != (sconv = 747 archive_string_conversion_to_charset(a, "UTF16BE", 1))); 748 failure("Charset name should be UTF-16BE"); 749 assertEqualString("UTF-16BE", 750 archive_string_conversion_charset_name(sconv)); 751 752 assertA(NULL != (sconv = 753 archive_string_conversion_to_charset(a, "utf16be", 1))); 754 failure("Charset name should be UTF-16BE"); 755 assertEqualString("UTF-16BE", 756 archive_string_conversion_charset_name(sconv)); 757 758 assertA(NULL != (sconv = 759 archive_string_conversion_to_charset(a, "UTF-16LE", 1))); 760 failure("Charset name should be UTF-16LE"); 761 assertEqualString("UTF-16LE", 762 archive_string_conversion_charset_name(sconv)); 763 764 assertA(NULL != (sconv = 765 archive_string_conversion_to_charset(a, "UTF16LE", 1))); 766 failure("Charset name should be UTF-16LE"); 767 assertEqualString("UTF-16LE", 768 archive_string_conversion_charset_name(sconv)); 769 770 assertA(NULL != (sconv = 771 archive_string_conversion_to_charset(a, "utf16le", 1))); 772 failure("Charset name should be UTF-16LE"); 773 assertEqualString("UTF-16LE", 774 archive_string_conversion_charset_name(sconv)); 775 776 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 777 778 } 779 780 static void 781 check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc, 782 const char *exp, const wchar_t *wexp) 783 { 784 /* Do all the tests on a copy so that we can have a clear initial state every time */ 785 struct archive_mstring mstr2; 786 const char *p = NULL; 787 const wchar_t *wp = NULL; 788 size_t len = 0; 789 790 memset(&mstr2, 0, sizeof(mstr2)); 791 792 archive_mstring_copy(&mstr2, mstr); 793 assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p)); 794 assertEqualString(exp, p); 795 p = NULL; 796 797 archive_mstring_copy(&mstr2, mstr); 798 assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p)); 799 assertEqualString(exp, p); 800 p = NULL; 801 802 archive_mstring_copy(&mstr2, mstr); 803 assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp)); 804 assertEqualWString(wexp, wp); 805 wp = NULL; 806 807 archive_mstring_copy(&mstr2, mstr); 808 assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc)); 809 assertEqualString(exp, p); 810 assertEqualInt(len, strlen(exp)); 811 p = NULL; 812 len = 0; 813 814 archive_mstring_clean(&mstr2); 815 } 816 817 /* 818 * Make sure no matter what the input encoding is, the string can be 819 * converted too all the output encodings. 820 */ 821 static void 822 test_archive_string_set_get(void) 823 { 824 struct archive *a; 825 struct archive_mstring mstr; 826 struct archive_string_conv *sc; 827 828 setlocale(LC_ALL, "en_US.UTF-8"); 829 830 assert((a = archive_read_new()) != NULL); 831 memset(&mstr, 0, sizeof(mstr)); 832 833 assertA(NULL != (sc = 834 archive_string_conversion_to_charset(a, "UTF-8", 1))); 835 failure("Charset name should be UTF-8"); 836 assertEqualString("UTF-8", 837 archive_string_conversion_charset_name(sc)); 838 839 assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA")); 840 check_string(a, &mstr, sc, "AAA", L"AAA"); 841 assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB")); 842 check_string(a, &mstr, sc, "BBBB", L"BBBB"); 843 assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12")); 844 check_string(a, &mstr, sc, "CCC12", L"CCC12"); 845 assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc)); 846 check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l"); 847 assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H")); 848 check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H"); 849 850 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 851 852 } 853 854 DEFINE_TEST(test_archive_string_conversion) 855 { 856 static const char reffile[] = "test_archive_string_conversion.txt.Z"; 857 static const char testdata[] = "testdata.txt"; 858 struct archive *a; 859 struct archive_entry *ae; 860 char buff[512]; 861 ssize_t size; 862 FILE *fp; 863 864 /* 865 * Extract a test pattern file. 866 */ 867 extract_reference_file(reffile); 868 assert((a = archive_read_new()) != NULL); 869 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); 870 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a)); 871 assertEqualIntA(a, ARCHIVE_OK, 872 archive_read_open_filename(a, reffile, 512)); 873 874 assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); 875 assert((fp = fopen(testdata, "w")) != NULL); 876 while ((size = archive_read_data(a, buff, 512)) > 0) 877 assertEqualInt(size, fwrite(buff, 1, size, fp)); 878 assertEqualInt(0, fclose(fp)); 879 assertEqualInt(ARCHIVE_OK, archive_read_free(a)); 880 881 test_archive_string_normalization_nfc(testdata); 882 test_archive_string_normalization_mac_nfd(testdata); 883 test_archive_string_canonicalization(); 884 test_archive_string_set_get(); 885 } 886