1 /*-
2 * Copyright (c) 2011-2012 Michihiro NAKAJIMA
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25 #include "test.h"
26
27 #include <locale.h>
28
29 #define __LIBARCHIVE_TEST
30 #include "archive_string.h"
31
32 /*
33 Execute the following to rebuild the data for this program:
34 tail -n +36 test_archive_string_conversion.c | /bin/sh
35 #
36 # This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
37 #
38 if="NormalizationTest.txt"
39 if [ ! -f ${if} ]; then
40 echo "Not found: \"${if}\""
41 exit 0
42 fi
43 of=test_archive_string_conversion.txt.Z
44 awk -F ';' '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} > ${of}.uu
45 exit 1
46 */
47
/*
 * Encode the code point `uc` as UTF-8 into the buffer at `p`.
 *
 * One to four bytes are stored; no terminating NUL is written and the
 * caller must supply room for four bytes.  Values above 0xffff always
 * take the four-byte form, using only the low 21 bits of `uc`.
 *
 * Returns the number of bytes stored.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	int n;

	if (uc <= 0x7f) {
		/* Plain ASCII: one byte, stored as-is. */
		p[0] = (char)uc;
		n = 1;
	} else if (uc <= 0x7ff) {
		/* 110xxxxx 10xxxxxx */
		p[0] = 0xc0 | ((uc >> 6) & 0x1f);
		p[1] = 0x80 | (uc & 0x3f);
		n = 2;
	} else if (uc <= 0xffff) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		p[0] = 0xe0 | ((uc >> 12) & 0x0f);
		p[1] = 0x80 | ((uc >> 6) & 0x3f);
		p[2] = 0x80 | (uc & 0x3f);
		n = 3;
	} else {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		p[0] = 0xf0 | ((uc >> 18) & 0x07);
		p[1] = 0x80 | ((uc >> 12) & 0x3f);
		p[2] = 0x80 | ((uc >> 6) & 0x3f);
		p[3] = 0x80 | (uc & 0x3f);
		n = 4;
	}
	return (n);
}
71
/*
 * Store the 16-bit value `u` at `pp` in big-endian byte order
 * (most significant byte first).  Exactly two bytes are written.
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *bytes = pp;
	const unsigned char hi = (unsigned char)((u >> 8) & 0xff);
	const unsigned char lo = (unsigned char)(u & 0xff);

	bytes[0] = hi;
	bytes[1] = lo;
}
80
/*
 * Encode the code point `uc` as UTF-16BE into the buffer at `p`.
 *
 * Code points up to 0xffff are stored as a single 16-bit unit; larger
 * ones are split into a high/low surrogate pair.  No terminator is
 * written.
 *
 * Returns the number of bytes stored (2 or 4).
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits in one 16-bit unit. */
		archive_be16enc(p, uc);
		return (2);
	}

	/* Too large for one unit: emit a surrogate pair. */
	offset = uc - 0x10000;
	archive_be16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
98
/*
 * Store the 16-bit value `u` at `pp` in little-endian byte order
 * (least significant byte first).  Exactly two bytes are written.
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *bytes = pp;
	const unsigned char lo = (unsigned char)(u & 0xff);
	const unsigned char hi = (unsigned char)((u >> 8) & 0xff);

	bytes[0] = lo;
	bytes[1] = hi;
}
107
/*
 * Encode the code point `uc` as UTF-16LE into the buffer at `p`.
 *
 * Code points up to 0xffff are stored as a single 16-bit unit; larger
 * ones are split into a high/low surrogate pair.  No terminator is
 * written.
 *
 * Returns the number of bytes stored (2 or 4).
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits in one 16-bit unit. */
		archive_le16enc(p, uc);
		return (2);
	}

	/* Too large for one unit: emit a surrogate pair. */
	offset = uc - 0x10000;
	archive_le16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
125
/*
 * Return the size of wchar_t in bytes on this platform.
 */
static int
wc_size(void)
{
	const int width = (int)sizeof(wchar_t);

	return (width);
}
131
/*
 * Store the code point `uc` at `wp` as wide character(s).
 *
 * When wchar_t is four bytes wide, or the code point is at most
 * 0xffff, a single wchar_t is written.  Otherwise the code point is
 * split into a surrogate pair.  No terminator is written.
 *
 * Returns the number of wchar_t elements stored (1 or 2).
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	if (wc_size() == 4 || uc <= 0xffff) {
		/* The value fits into one wchar_t. */
		*wp = (wchar_t)uc;
		return (1);
	}

	/* We have a code point that won't fit into a
	 * wchar_t; convert it to a surrogate pair. */
	uc -= 0x10000;
	wp[0] = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800);
	wp[1] = (wchar_t)((uc & 0x3ff) + 0xDC00);
	return (2);
}
151
/*
 * Parse a textual code point pattern and emit it in four encodings.
 *
 * `pattern` is a sequence of upper-case hexadecimal code points; every
 * character that is not [0-9A-F] (including the terminating NUL) ends
 * the current code point.  Each code point is appended to `out`
 * (UTF-8), `wout` (wide characters), `u16be` (UTF-16BE) and `u16le`
 * (UTF-16LE), and every output is terminated.  The caller must provide
 * large enough buffers; no bounds are checked here.
 *
 * When `mac_nfd` is non-zero, the first code point of the pattern is
 * compared against a list of code points that are not converted to NFD
 * on Mac OS.
 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not
 * converted to NFD on Mac OS.
 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
 *
 * Returns 1 if `mac_nfd` is set and the first code point is in that
 * list, 0 otherwise.
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	const char *src;
	char *u8 = out;
	wchar_t *wc = wout;
	char *be = u16be;
	char *le = u16le;
	unsigned cp = 0;
	int keep_nfc = 0;

	for (src = pattern; ; src++) {
		const char c = *src;

		/* Accumulate hexadecimal digits into the code point. */
		if (c >= '0' && c <= '9') {
			cp = (cp << 4) + (c - '0');
			continue;
		}
		if (c >= 'A' && c <= 'F') {
			cp = (cp << 4) + (c - 'A' + 0x0a);
			continue;
		}

		/*
		 * A separator or the end of the pattern: the code point
		 * is complete.  Nothing has been written to the UTF-8
		 * output yet only for the very first code point.
		 */
		if (mac_nfd && u8 == out) {
			/*
			 * These are not converted to NFD on Mac OS.
			 * U+2000 - U+2FFF
			 * U+F900 - U+FAFF
			 * U+2F800 - U+2FAFF
			 */
			switch (cp) {
			case 0x2194: case 0x219A: case 0x219B:
			case 0x21AE: case 0x21CD: case 0x21CE:
			case 0x21CF: case 0x2204: case 0x2209:
			case 0x220C: case 0x2224: case 0x2226:
			case 0x2241: case 0x2244: case 0x2247:
			case 0x2249: case 0x2260: case 0x2262:
			case 0x226D: case 0x226E: case 0x226F:
			case 0x2270: case 0x2271: case 0x2274:
			case 0x2275: case 0x2276: case 0x2278:
			case 0x2279: case 0x227A: case 0x227B:
			case 0x2280: case 0x2281: case 0x2284:
			case 0x2285: case 0x2288: case 0x2289:
			case 0x22AC: case 0x22AD: case 0x22AE:
			case 0x22AF: case 0x22E0: case 0x22E1:
			case 0x22E2: case 0x22E3: case 0x22EA:
			case 0x22EB: case 0x22EC: case 0x22ED:

			/*
			 * Those code points are not converted to
			 * NFD on Mac OS. I do not know the reason
			 * because it is undocumented.
			 *   NFC        NFD
			 *   1109A  ==> 11099 110BA
			 *   1109C  ==> 1109B 110BA
			 *   110AB  ==> 110A5 110BA
			 */
			case 0x1109A: case 0x1109C: case 0x110AB:
				keep_nfc = 1;
				break;
			default:
				break;
			}
		}

		/* Append the code point to every output encoding. */
		be += unicode_to_utf16be(be, cp);
		le += unicode_to_utf16le(le, cp);
		wc += unicode_to_wc(wc, cp);
		u8 += unicode_to_utf8(u8, cp);

		if (c == '\0')
			break;
		cp = 0;
	}

	/* Terminate all outputs; the UTF-16 ones need two zero bytes. */
	be[0] = 0;
	be[1] = 0;
	le[0] = 0;
	le[1] = 0;
	*wc = L'\0';
	*u8 = '\0';

	return (keep_nfc);
}
233
/*
 * Return 1 when built for native Windows (_WIN32 without __CYGWIN__),
 * 0 on every other platform.  The callers use this to decide whether
 * the wide-character tests can run without a UTF-8 locale.
 */
static int
is_wc_unicode(void)
{
	int native_unicode = 0;

#if defined(_WIN32) && !defined(__CYGWIN__)
	native_unicode = 1;
#endif
	return (native_unicode);
}
243
244 /*
245 * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
246 * On Mac OS, the characters to be Form D.
247 * On other platforms, the characters to be Form C.
248 */
249 static void
test_archive_string_normalization_nfc(const char * testdata)250 test_archive_string_normalization_nfc(const char *testdata)
251 {
252 struct archive *a, *a2;
253 struct archive_string utf8;
254 struct archive_mstring mstr;
255 struct archive_string_conv *f_sconv8, *t_sconv8;
256 struct archive_string_conv *f_sconv16be, *f_sconv16le;
257 FILE *fp;
258 char buff[512];
259 int line = 0;
260 int locale_is_utf8, wc_is_unicode;
261 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
262
263 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
264 wc_is_unicode = is_wc_unicode();
265 /* If it doesn't exist, just warn and return. */
266 if (!locale_is_utf8 && !wc_is_unicode) {
267 skipping("A test of string normalization for NFC requires "
268 "a suitable locale; en_US.UTF-8 not available on this "
269 "system");
270 return;
271 }
272
273 archive_string_init(&utf8);
274 memset(&mstr, 0, sizeof(mstr));
275
276 /*
277 * Create string conversion objects.
278 */
279 assert((a = archive_read_new()) != NULL);
280 assertA(NULL != (f_sconv8 =
281 archive_string_conversion_from_charset(a, "UTF-8", 0)));
282 assertA(NULL != (f_sconv16be =
283 archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
284 assertA(NULL != (f_sconv16le =
285 archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
286 assert((a2 = archive_write_new()) != NULL);
287 assertA(NULL != (t_sconv8 =
288 archive_string_conversion_to_charset(a2, "UTF-8", 0)));
289 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
290 t_sconv8 == NULL) {
291 /* We cannot continue this test. */
292 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
293 return;
294 }
295 archive_string_conversion_set_opt(f_sconv8, sconv_opt);
296 archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
297 archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
298 archive_string_conversion_set_opt(t_sconv8, sconv_opt);
299
300 /* Open a test pattern file. */
301 assert((fp = fopen(testdata, "r")) != NULL);
302
303 /*
304 * Read test data.
305 * Test data format:
306 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
307 * Unicode pattern format:
308 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
309 */
310 while (fgets(buff, sizeof(buff), fp) != NULL) {
311 char nfc[80], nfd[80];
312 char utf8_nfc[80], utf8_nfd[80];
313 char utf16be_nfc[80], utf16be_nfd[80];
314 char utf16le_nfc[80], utf16le_nfd[80];
315 wchar_t wc_nfc[40], wc_nfd[40];
316 char *e, *p;
317 const wchar_t *wp;
318 const char *mp;
319 size_t mplen;
320
321 line++;
322 if (buff[0] == '#')
323 continue;
324 p = strchr(buff, ';');
325 if (p == NULL)
326 continue;
327 *p++ = '\0';
328 /* Copy an NFC pattern */
329 strncpy(nfc, buff, sizeof(nfc)-1);
330 nfc[sizeof(nfc)-1] = '\0';
331 e = p;
332 p = strchr(p, '\n');
333 if (p == NULL)
334 continue;
335 *p = '\0';
336 /* Copy an NFD pattern */
337 strncpy(nfd, e, sizeof(nfd)-1);
338 nfd[sizeof(nfd)-1] = '\0';
339
340 /*
341 * Get an NFC patterns.
342 */
343 scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
344 nfc, 0);
345
346 /*
347 * Get an NFD patterns.
348 */
349 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
350 nfd, 0);
351
352 if (locale_is_utf8) {
353 /*
354 * Normalize an NFD string for import.
355 */
356 assertEqualInt(0, archive_strcpy_l(
357 &utf8, utf8_nfd, f_sconv8));
358 failure("NFD(%s) should be converted to NFC(%s):%d",
359 nfd, nfc, line);
360 assertEqualUTF8String(utf8_nfc, utf8.s);
361
362 /*
363 * Normalize an NFC string for import.
364 */
365 assertEqualInt(0, archive_strcpy_l(
366 &utf8, utf8_nfc, f_sconv8));
367 failure("NFC(%s) should not be any changed:%d",
368 nfc, line);
369 assertEqualUTF8String(utf8_nfc, utf8.s);
370
371 /*
372 * Copy an NFC string for export.
373 */
374 assertEqualInt(0, archive_strcpy_l(
375 &utf8, utf8_nfc, t_sconv8));
376 failure("NFC(%s) should not be any changed:%d",
377 nfc, line);
378 assertEqualUTF8String(utf8_nfc, utf8.s);
379
380 /*
381 * Normalize an NFD string in UTF-16BE for import.
382 */
383 assertEqualInt(0, archive_strncpy_l(
384 &utf8, utf16be_nfd, 100000, f_sconv16be));
385 failure("NFD(%s) should be converted to NFC(%s):%d",
386 nfd, nfc, line);
387 assertEqualUTF8String(utf8_nfc, utf8.s);
388
389 /*
390 * Normalize an NFD string in UTF-16LE for import.
391 */
392 assertEqualInt(0, archive_strncpy_l(
393 &utf8, utf16le_nfd, 100000, f_sconv16le));
394 failure("NFD(%s) should be converted to NFC(%s):%d",
395 nfd, nfc, line);
396 assertEqualUTF8String(utf8_nfc, utf8.s);
397 }
398
399 /*
400 * Test for archive_mstring interface.
401 * In specific, Windows platform UTF-16BE is directly
402 * converted to/from wide-character to avoid the effect of
403 * current locale since windows platform cannot make
404 * locale UTF-8.
405 */
406 if (locale_is_utf8 || wc_is_unicode) {
407 /*
408 * Normalize an NFD string in UTF-8 for import.
409 */
410 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
411 &mstr, utf8_nfd, 100000, f_sconv8));
412 assertEqualInt(0,
413 archive_mstring_get_wcs(a, &mstr, &wp));
414 failure("UTF-8 NFD(%s) should be converted "
415 "to WCS NFC(%s):%d", nfd, nfc, line);
416 assertEqualWString(wc_nfc, wp);
417
418 /*
419 * Normalize an NFD string in UTF-16BE for import.
420 */
421 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
422 &mstr, utf16be_nfd, 100000, f_sconv16be));
423 assertEqualInt(0,
424 archive_mstring_get_wcs(a, &mstr, &wp));
425 failure("UTF-8 NFD(%s) should be converted "
426 "to WCS NFC(%s):%d", nfd, nfc, line);
427 assertEqualWString(wc_nfc, wp);
428
429 /*
430 * Normalize an NFD string in UTF-16LE for import.
431 */
432 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
433 &mstr, utf16le_nfd, 100000, f_sconv16le));
434 assertEqualInt(0,
435 archive_mstring_get_wcs(a, &mstr, &wp));
436 failure("UTF-8 NFD(%s) should be converted "
437 "to WCS NFC(%s):%d", nfd, nfc, line);
438 assertEqualWString(wc_nfc, wp);
439
440 /*
441 * Copy an NFC wide-string for export.
442 */
443 assertEqualInt(0,
444 archive_mstring_copy_wcs(&mstr, wc_nfc));
445 assertEqualInt(0, archive_mstring_get_mbs_l(
446 a, &mstr, &mp, &mplen, t_sconv8));
447 failure("WCS NFC(%s) should be UTF-8 NFC:%d"
448 ,nfc, line);
449 assertEqualUTF8String(utf8_nfc, mp);
450 }
451 }
452
453 archive_string_free(&utf8);
454 archive_mstring_clean(&mstr);
455 fclose(fp);
456 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
457 assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
458 }
459
460 static void
test_archive_string_normalization_mac_nfd(const char * testdata)461 test_archive_string_normalization_mac_nfd(const char *testdata)
462 {
463 struct archive *a, *a2;
464 struct archive_string utf8;
465 struct archive_mstring mstr;
466 struct archive_string_conv *f_sconv8, *t_sconv8;
467 struct archive_string_conv *f_sconv16be, *f_sconv16le;
468 FILE *fp;
469 char buff[512];
470 int line = 0;
471 int locale_is_utf8, wc_is_unicode;
472 int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
473
474 locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
475 wc_is_unicode = is_wc_unicode();
476 /* If it doesn't exist, just warn and return. */
477 if (!locale_is_utf8 && !wc_is_unicode) {
478 skipping("A test of string normalization for NFD requires "
479 "a suitable locale; en_US.UTF-8 not available on this "
480 "system");
481 return;
482 }
483
484 archive_string_init(&utf8);
485 memset(&mstr, 0, sizeof(mstr));
486
487 /*
488 * Create string conversion objects.
489 */
490 assert((a = archive_read_new()) != NULL);
491 assertA(NULL != (f_sconv8 =
492 archive_string_conversion_from_charset(a, "UTF-8", 0)));
493 assertA(NULL != (f_sconv16be =
494 archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
495 assertA(NULL != (f_sconv16le =
496 archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
497 assert((a2 = archive_write_new()) != NULL);
498 assertA(NULL != (t_sconv8 =
499 archive_string_conversion_to_charset(a2, "UTF-8", 0)));
500 if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
501 t_sconv8 == NULL) {
502 /* We cannot continue this test. */
503 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
504 return;
505 }
506 archive_string_conversion_set_opt(f_sconv8, sconv_opt);
507 archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
508 archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
509 archive_string_conversion_set_opt(t_sconv8, sconv_opt);
510
511 /* Open a test pattern file. */
512 assert((fp = fopen(testdata, "r")) != NULL);
513
514 /*
515 * Read test data.
516 * Test data format:
517 * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
518 * Unicode pattern format:
519 * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
520 */
521 while (fgets(buff, sizeof(buff), fp) != NULL) {
522 char nfc[80], nfd[80];
523 char utf8_nfc[80], utf8_nfd[80];
524 char utf16be_nfc[80], utf16be_nfd[80];
525 char utf16le_nfc[80], utf16le_nfd[80];
526 wchar_t wc_nfc[40], wc_nfd[40];
527 char *e, *p;
528 const wchar_t *wp;
529 const char *mp;
530 size_t mplen;
531 int should_be_nfc;
532
533 line++;
534 if (buff[0] == '#')
535 continue;
536 p = strchr(buff, ';');
537 if (p == NULL)
538 continue;
539 *p++ = '\0';
540 /* Copy an NFC pattern */
541 strncpy(nfc, buff, sizeof(nfc)-1);
542 nfc[sizeof(nfc)-1] = '\0';
543 e = p;
544 p = strchr(p, '\n');
545 if (p == NULL)
546 continue;
547 *p = '\0';
548 /* Copy an NFD pattern */
549 strncpy(nfd, e, sizeof(nfd)-1);
550 nfd[sizeof(nfd)-1] = '\0';
551
552 /*
553 * Get an NFC patterns.
554 */
555 should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
556 utf16be_nfc, utf16le_nfc, nfc, 1);
557
558 /*
559 * Get an NFD patterns.
560 */
561 scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
562 nfd, 0);
563
564 if (locale_is_utf8) {
565 /*
566 * Normalize an NFC string for import.
567 */
568 assertEqualInt(0, archive_strcpy_l(
569 &utf8, utf8_nfc, f_sconv8));
570 if (should_be_nfc) {
571 failure("NFC(%s) should not be converted to"
572 " NFD(%s):%d", nfc, nfd, line);
573 assertEqualUTF8String(utf8_nfc, utf8.s);
574 } else {
575 failure("NFC(%s) should be converted to"
576 " NFD(%s):%d", nfc, nfd, line);
577 assertEqualUTF8String(utf8_nfd, utf8.s);
578 }
579
580 /*
581 * Normalize an NFD string for import.
582 */
583 assertEqualInt(0, archive_strcpy_l(
584 &utf8, utf8_nfd, f_sconv8));
585 failure("NFD(%s) should not be any changed:%d",
586 nfd, line);
587 assertEqualUTF8String(utf8_nfd, utf8.s);
588
589 /*
590 * Copy an NFD string for export.
591 */
592 assertEqualInt(0, archive_strcpy_l(
593 &utf8, utf8_nfd, t_sconv8));
594 failure("NFD(%s) should not be any changed:%d",
595 nfd, line);
596 assertEqualUTF8String(utf8_nfd, utf8.s);
597
598 /*
599 * Normalize an NFC string in UTF-16BE for import.
600 */
601 assertEqualInt(0, archive_strncpy_l(
602 &utf8, utf16be_nfc, 100000, f_sconv16be));
603 if (should_be_nfc) {
604 failure("NFC(%s) should not be converted to"
605 " NFD(%s):%d", nfc, nfd, line);
606 assertEqualUTF8String(utf8_nfc, utf8.s);
607 } else {
608 failure("NFC(%s) should be converted to"
609 " NFD(%s):%d", nfc, nfd, line);
610 assertEqualUTF8String(utf8_nfd, utf8.s);
611 }
612
613 /*
614 * Normalize an NFC string in UTF-16LE for import.
615 */
616 assertEqualInt(0, archive_strncpy_l(
617 &utf8, utf16le_nfc, 100000, f_sconv16le));
618 if (should_be_nfc) {
619 failure("NFC(%s) should not be converted to"
620 " NFD(%s):%d", nfc, nfd, line);
621 assertEqualUTF8String(utf8_nfc, utf8.s);
622 } else {
623 failure("NFC(%s) should be converted to"
624 " NFD(%s):%d", nfc, nfd, line);
625 assertEqualUTF8String(utf8_nfd, utf8.s);
626 }
627 }
628
629 /*
630 * Test for archive_mstring interface.
631 * In specific, Windows platform UTF-16BE is directly
632 * converted to/from wide-character to avoid the effect of
633 * current locale since windows platform cannot make
634 * locale UTF-8.
635 */
636 if (locale_is_utf8 || wc_is_unicode) {
637 /*
638 * Normalize an NFD string in UTF-8 for import.
639 */
640 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
641 &mstr, utf8_nfc, 100000, f_sconv8));
642 assertEqualInt(0,
643 archive_mstring_get_wcs(a, &mstr, &wp));
644 if (should_be_nfc) {
645 failure("UTF-8 NFC(%s) should not be converted "
646 "to WCS NFD(%s):%d", nfc, nfd, line);
647 assertEqualWString(wc_nfc, wp);
648 } else {
649 failure("UTF-8 NFC(%s) should be converted "
650 "to WCS NFD(%s):%d", nfc, nfd, line);
651 assertEqualWString(wc_nfd, wp);
652 }
653
654 /*
655 * Normalize an NFD string in UTF-16BE for import.
656 */
657 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
658 &mstr, utf16be_nfc, 100000, f_sconv16be));
659 assertEqualInt(0,
660 archive_mstring_get_wcs(a, &mstr, &wp));
661 if (should_be_nfc) {
662 failure("UTF-16BE NFC(%s) should not be "
663 "converted to WCS NFD(%s):%d",
664 nfc, nfd, line);
665 assertEqualWString(wc_nfc, wp);
666 } else {
667 failure("UTF-16BE NFC(%s) should be converted "
668 "to WCS NFD(%s):%d", nfc, nfd, line);
669 assertEqualWString(wc_nfd, wp);
670 }
671
672 /*
673 * Normalize an NFD string in UTF-16LE for import.
674 */
675 assertEqualInt(0, archive_mstring_copy_mbs_len_l(
676 &mstr, utf16le_nfc, 100000, f_sconv16le));
677 assertEqualInt(0,
678 archive_mstring_get_wcs(a, &mstr, &wp));
679 if (should_be_nfc) {
680 failure("UTF-16LE NFC(%s) should not be "
681 "converted to WCS NFD(%s):%d",
682 nfc, nfd, line);
683 assertEqualWString(wc_nfc, wp);
684 } else {
685 failure("UTF-16LE NFC(%s) should be converted "
686 "to WCS NFD(%s):%d", nfc, nfd, line);
687 assertEqualWString(wc_nfd, wp);
688 }
689
690 /*
691 * Copy an NFD wide-string for export.
692 */
693 assertEqualInt(0, archive_mstring_copy_wcs(
694 &mstr, wc_nfd));
695 assertEqualInt(0, archive_mstring_get_mbs_l(
696 a, &mstr, &mp, &mplen, t_sconv8));
697 failure("WCS NFD(%s) should be UTF-8 NFD:%d"
698 ,nfd, line);
699 assertEqualUTF8String(utf8_nfd, mp);
700 }
701 }
702
703 archive_string_free(&utf8);
704 archive_mstring_clean(&mstr);
705 fclose(fp);
706 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
707 assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
708 }
709
710 static void
test_archive_string_canonicalization(void)711 test_archive_string_canonicalization(void)
712 {
713 struct archive *a;
714 struct archive_string_conv *sconv;
715
716 setlocale(LC_ALL, "en_US.UTF-8");
717
718 assert((a = archive_read_new()) != NULL);
719
720 assertA(NULL != (sconv =
721 archive_string_conversion_to_charset(a, "UTF-8", 1)));
722 failure("Charset name should be UTF-8");
723 assertEqualString("UTF-8",
724 archive_string_conversion_charset_name(sconv));
725
726 assertA(NULL != (sconv =
727 archive_string_conversion_to_charset(a, "UTF8", 1)));
728 failure("Charset name should be UTF-8");
729 assertEqualString("UTF-8",
730 archive_string_conversion_charset_name(sconv));
731
732 assertA(NULL != (sconv =
733 archive_string_conversion_to_charset(a, "utf8", 1)));
734 failure("Charset name should be UTF-8");
735 assertEqualString("UTF-8",
736 archive_string_conversion_charset_name(sconv));
737
738 assertA(NULL != (sconv =
739 archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
740 failure("Charset name should be UTF-16BE");
741 assertEqualString("UTF-16BE",
742 archive_string_conversion_charset_name(sconv));
743
744 assertA(NULL != (sconv =
745 archive_string_conversion_to_charset(a, "UTF16BE", 1)));
746 failure("Charset name should be UTF-16BE");
747 assertEqualString("UTF-16BE",
748 archive_string_conversion_charset_name(sconv));
749
750 assertA(NULL != (sconv =
751 archive_string_conversion_to_charset(a, "utf16be", 1)));
752 failure("Charset name should be UTF-16BE");
753 assertEqualString("UTF-16BE",
754 archive_string_conversion_charset_name(sconv));
755
756 assertA(NULL != (sconv =
757 archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
758 failure("Charset name should be UTF-16LE");
759 assertEqualString("UTF-16LE",
760 archive_string_conversion_charset_name(sconv));
761
762 assertA(NULL != (sconv =
763 archive_string_conversion_to_charset(a, "UTF16LE", 1)));
764 failure("Charset name should be UTF-16LE");
765 assertEqualString("UTF-16LE",
766 archive_string_conversion_charset_name(sconv));
767
768 assertA(NULL != (sconv =
769 archive_string_conversion_to_charset(a, "utf16le", 1)));
770 failure("Charset name should be UTF-16LE");
771 assertEqualString("UTF-16LE",
772 archive_string_conversion_charset_name(sconv));
773
774 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
775
776 }
777
778 static void
check_string(struct archive * a,struct archive_mstring * mstr,struct archive_string_conv * sc,const char * exp,const wchar_t * wexp)779 check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
780 const char *exp, const wchar_t *wexp)
781 {
782 /* Do all the tests on a copy so that we can have a clear initial state every time */
783 struct archive_mstring mstr2;
784 const char *p = NULL;
785 const wchar_t *wp = NULL;
786 size_t len = 0;
787
788 memset(&mstr2, 0, sizeof(mstr2));
789
790 archive_mstring_copy(&mstr2, mstr);
791 assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
792 assertEqualString(exp, p);
793 p = NULL;
794
795 archive_mstring_copy(&mstr2, mstr);
796 assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
797 assertEqualString(exp, p);
798 p = NULL;
799
800 archive_mstring_copy(&mstr2, mstr);
801 assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
802 assertEqualWString(wexp, wp);
803 wp = NULL;
804
805 archive_mstring_copy(&mstr2, mstr);
806 assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
807 assertEqualString(exp, p);
808 assertEqualInt(len, strlen(exp));
809 p = NULL;
810 len = 0;
811
812 archive_mstring_clean(&mstr2);
813 }
814
815 /*
816 * Make sure no matter what the input encoding is, the string can be
817 * converted too all the output encodings.
818 */
819 static void
test_archive_string_set_get(void)820 test_archive_string_set_get(void)
821 {
822 struct archive *a;
823 struct archive_mstring mstr;
824 struct archive_string_conv *sc;
825
826 setlocale(LC_ALL, "en_US.UTF-8");
827
828 assert((a = archive_read_new()) != NULL);
829 memset(&mstr, 0, sizeof(mstr));
830
831 assertA(NULL != (sc =
832 archive_string_conversion_to_charset(a, "UTF-8", 1)));
833 failure("Charset name should be UTF-8");
834 assertEqualString("UTF-8",
835 archive_string_conversion_charset_name(sc));
836
837 assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
838 check_string(a, &mstr, sc, "AAA", L"AAA");
839 assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
840 check_string(a, &mstr, sc, "BBBB", L"BBBB");
841 assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
842 check_string(a, &mstr, sc, "CCC12", L"CCC12");
843 assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
844 check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
845 assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
846 check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
847
848 archive_mstring_clean(&mstr);
849 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
850
851 }
852
DEFINE_TEST(test_archive_string_conversion)853 DEFINE_TEST(test_archive_string_conversion)
854 {
855 static const char reffile[] = "test_archive_string_conversion.txt.Z";
856 static const char testdata[] = "testdata.txt";
857 struct archive *a;
858 struct archive_entry *ae;
859 char buff[512];
860 ssize_t size;
861 FILE *fp;
862
863 /*
864 * Extract a test pattern file.
865 */
866 extract_reference_file(reffile);
867 assert((a = archive_read_new()) != NULL);
868 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
869 assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
870 assertEqualIntA(a, ARCHIVE_OK,
871 archive_read_open_filename(a, reffile, 512));
872
873 assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
874 assert((fp = fopen(testdata, "w")) != NULL);
875 while ((size = archive_read_data(a, buff, 512)) > 0)
876 assertEqualInt(size, fwrite(buff, 1, size, fp));
877 assertEqualInt(0, fclose(fp));
878 assertEqualInt(ARCHIVE_OK, archive_read_free(a));
879
880 test_archive_string_normalization_nfc(testdata);
881 test_archive_string_normalization_mac_nfd(testdata);
882 test_archive_string_canonicalization();
883 test_archive_string_set_get();
884 }
885
/*
 * Native Windows only: a wide-character string stored in an
 * archive_mstring must be retrievable as UTF-8.
 */
DEFINE_TEST(test_archive_string_conversion_utf16_utf8)
{
#if defined(_WIN32) && !defined(__CYGWIN__)
	struct archive_mstring ms;
	const char *as_utf8;

	memset(&ms, 0, sizeof(ms));

	/* Store three Cyrillic letters as a wide string. */
	assertEqualInt(ARCHIVE_OK,
	    archive_mstring_copy_wcs(&ms, L"\U0000043f\U00000440\U00000438"));

	/* Conversion from WCS to UTF-8 should always succeed */
	assertEqualInt(ARCHIVE_OK,
	    archive_mstring_get_utf8(NULL, &ms, &as_utf8));
	assertEqualString("\xD0\xBF\xD1\x80\xD0\xB8", as_utf8);

	archive_mstring_clean(&ms);
#else
	skipping("This test is meant to verify unicode string handling on Windows");
#endif
}
907
/*
 * Native Windows only: a UTF-8 string stored in an archive_mstring
 * must be retrievable as a wide-character string.
 */
DEFINE_TEST(test_archive_string_conversion_utf8_utf16)
{
#if defined(_WIN32) && !defined(__CYGWIN__)
	struct archive_mstring ms;
	const wchar_t *as_wcs;

	memset(&ms, 0, sizeof(ms));

	/* Store three Cyrillic letters as UTF-8 (six bytes). */
	assertEqualInt(6,
	    archive_mstring_copy_utf8(&ms, "\xD0\xBF\xD1\x80\xD0\xB8"));

	/* Conversion from UTF-8 to WCS should always succeed */
	assertEqualInt(ARCHIVE_OK,
	    archive_mstring_get_wcs(NULL, &ms, &as_wcs));
	assertEqualWString(L"\U0000043f\U00000440\U00000438", as_wcs);

	archive_mstring_clean(&ms);
#else
	skipping("This test is meant to verify unicode string handling on Windows");
#endif
}
929
/*
 * Native Windows only, with the C locale: updating an archive_mstring
 * from UTF-8 is expected to fail for the multibyte form while still
 * setting the UTF-8 and wide-character forms.
 */
DEFINE_TEST(test_archive_string_update_utf8_win)
{
#if defined(_WIN32) && !defined(__CYGWIN__)
	static const wchar_t expect_wcs[] = L"\U0000043f\U00000440\U00000438";
	static const char input_utf8[] = "\xD0\xBF\xD1\x80\xD0\xB8";
	struct archive_mstring ms;
	int status;

	memset(&ms, 0, sizeof(ms));

	status = archive_mstring_update_utf8(NULL, &ms, input_utf8);

	/* On Windows, this should reliably fail with the C locale */
	assertEqualInt(-1, status);
	assertEqualInt(0, ms.aes_set & AES_SET_MBS);

	/* NOTE: We access the internals to validate that they were set by the
	 * 'archive_mstring_update_utf8' function */
	/* UTF-8 should always be set */
	assertEqualInt(AES_SET_UTF8, ms.aes_set & AES_SET_UTF8);
	assertEqualString(input_utf8, ms.aes_utf8.s);
	/* WCS should always be set as well */
	assertEqualInt(AES_SET_WCS, ms.aes_set & AES_SET_WCS);
	assertEqualWString(expect_wcs, ms.aes_wcs.s);

	archive_mstring_clean(&ms);
#else
	skipping("This test is meant to verify unicode string handling on Windows"
	    " with the C locale");
#endif
}
961
DEFINE_TEST(test_archive_string_update_utf8_utf8)962 DEFINE_TEST(test_archive_string_update_utf8_utf8)
963 {
964 static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
965 static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
966 struct archive_mstring mstr;
967 int r;
968
969 memset(&mstr, 0, sizeof(mstr));
970
971 if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
972 skipping("UTF-8 not supported on this system.");
973 return;
974 }
975
976 r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
977
978 /* All conversions should have succeeded */
979 assertEqualInt(0, r);
980 assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
981 assertEqualString(utf8_string, mstr.aes_utf8.s);
982 assertEqualString(utf8_string, mstr.aes_mbs.s);
983 assertEqualWString(wcs_string, mstr.aes_wcs.s);
984
985 archive_mstring_clean(&mstr);
986 }
987
DEFINE_TEST(test_archive_string_update_utf8_koi8)988 DEFINE_TEST(test_archive_string_update_utf8_koi8)
989 {
990 static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
991 static const char koi8_string[] = "\xD0\xD2\xC9";
992 static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
993 struct archive_mstring mstr;
994 int r;
995
996 memset(&mstr, 0, sizeof(mstr));
997
998 if (setlocale(LC_ALL, "ru_RU.KOI8-R") == NULL) {
999 skipping("KOI8-R locale not available on this system.");
1000 return;
1001 }
1002
1003 r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
1004
1005 /* All conversions should have succeeded */
1006 assertEqualInt(0, r);
1007 assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
1008 assertEqualString(utf8_string, mstr.aes_utf8.s);
1009 assertEqualString(koi8_string, mstr.aes_mbs.s);
1010 #if defined(_WIN32) && !defined(__CYGWIN__)
1011 assertEqualWString(wcs_string, mstr.aes_wcs.s);
1012 #else
1013 /* No guarantee of how WCS strings behave, however this test test is
1014 * primarily meant for Windows */
1015 (void)wcs_string;
1016 #endif
1017
1018 archive_mstring_clean(&mstr);
1019 }
1020