xref: /freebsd/contrib/libarchive/libarchive/test/test_archive_string_conversion.c (revision 058ac3e8063366dafa634d9107642e12b038bf09)
1 /*-
2  * Copyright (c) 2011-2012 Michihiro NAKAJIMA
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "test.h"
26 __FBSDID("$FreeBSD$");
27 
28 #include <locale.h>
29 
30 #define __LIBARCHIVE_TEST
31 #include "archive_string.h"
32 
33 /*
34 Execute the following to rebuild the data for this program:
35    tail -n +36 test_archive_string_conversion.c | /bin/sh
36 #
37 # This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
38 #
39 if="NormalizationTest.txt"
40 if [ ! -f ${if} ]; then
41   echo "Not found: \"${if}\""
42   exit 0
43 fi
44 of=test_archive_string_conversion.txt.Z
45 echo "\$FreeBSD\$" > ${of}.uu
46 awk -F ';'  '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu
47 exit 1
48 */
49 
/*
 * Encode the code point `uc` as UTF-8 into the buffer at `p`.
 *
 * Writes one to four bytes (no terminator) and returns how many bytes
 * were written.  Values above 0xffff always take the four-byte form;
 * no range validation is performed.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	int n;

	if (uc <= 0x7f) {
		/* Plain ASCII: a single byte. */
		p[0] = (char)uc;
		n = 1;
	} else if (uc <= 0x7ff) {
		/* Two bytes: 110xxxxx 10xxxxxx */
		p[0] = 0xc0 | ((uc >> 6) & 0x1f);
		p[1] = 0x80 | (uc & 0x3f);
		n = 2;
	} else if (uc <= 0xffff) {
		/* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
		p[0] = 0xe0 | ((uc >> 12) & 0x0f);
		p[1] = 0x80 | ((uc >> 6) & 0x3f);
		p[2] = 0x80 | (uc & 0x3f);
		n = 3;
	} else {
		/* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		p[0] = 0xf0 | ((uc >> 18) & 0x07);
		p[1] = 0x80 | ((uc >> 12) & 0x3f);
		p[2] = 0x80 | ((uc >> 6) & 0x3f);
		p[3] = 0x80 | (uc & 0x3f);
		n = 4;
	}
	return (n);
}
73 
/*
 * Store the 16-bit value `u` at `pp` in big-endian byte order
 * (most significant byte first).
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *bytes = pp;
	const unsigned hi = (u >> 8) & 0xff;
	const unsigned lo = u & 0xff;

	bytes[0] = (unsigned char)hi;
	bytes[1] = (unsigned char)lo;
}
82 
/*
 * Encode the code point `uc` as big-endian UTF-16 into the buffer at `p`.
 *
 * Returns the number of bytes written: 2 for a code point that fits in
 * one 16-bit unit, 4 for one that needs a surrogate pair.  No terminator
 * is written.
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		archive_be16enc(p, uc);
		return (2);
	}

	/* Too large for one 16-bit unit: split into a high/low
	 * surrogate pair carrying ten bits each. */
	offset = uc - 0x10000;
	archive_be16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_be16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
100 
/*
 * Store the 16-bit value `u` at `pp` in little-endian byte order
 * (least significant byte first).
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *bytes = pp;
	const unsigned lo = u & 0xff;
	const unsigned hi = (u >> 8) & 0xff;

	bytes[0] = (unsigned char)lo;
	bytes[1] = (unsigned char)hi;
}
109 
/*
 * Encode the code point `uc` as little-endian UTF-16 into the buffer
 * at `p`.
 *
 * Returns the number of bytes written: 2 for a code point that fits in
 * one 16-bit unit, 4 for one that needs a surrogate pair.  No terminator
 * is written.
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	uint32_t offset;

	if (uc <= 0xffff) {
		/* Fits in a single 16-bit unit. */
		archive_le16enc(p, uc);
		return (2);
	}

	/* Too large for one 16-bit unit: split into a high/low
	 * surrogate pair carrying ten bits each. */
	offset = uc - 0x10000;
	archive_le16enc(p, ((offset >> 10) & 0x3ff) + 0xD800);
	archive_le16enc(p + 2, (offset & 0x3ff) + 0xDC00);
	return (4);
}
127 
/*
 * Return the size of wchar_t in bytes on this platform.
 */
static int
wc_size(void)
{
	const int bytes = (int)sizeof(wchar_t);

	return (bytes);
}
133 
/*
 * Store the code point `uc` at `wp` as wide character(s).
 *
 * When wchar_t is four bytes wide, or the code point fits in 16 bits,
 * a single wchar_t is written.  Otherwise the code point is split into
 * a surrogate pair occupying two wchar_t units.  Returns the number of
 * wchar_t units written; no terminator is written.
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	uint32_t offset;

	if (wc_size() == 4 || uc <= 0xffff) {
		/* One unit is enough. */
		wp[0] = (wchar_t)uc;
		return (1);
	}

	/* We have a code point that won't fit into a
	 * wchar_t; convert it to a surrogate pair. */
	offset = uc - 0x10000;
	wp[0] = (wchar_t)(((offset >> 10) & 0x3ff) + 0xD800);
	wp[1] = (wchar_t)((offset & 0x3ff) + 0xDC00);
	return (2);
}
153 
/*
 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not
 * converted to NFD on Mac OS.
 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
 *
 * Return 1 when `uc` is one of the listed code points that are not
 * converted to NFD on Mac OS, 0 otherwise.
 */
static int
is_mac_nfd_exception(unsigned uc)
{
	switch (uc) {
	/*
	 * These are not converted to NFD on Mac OS.
	 * U+2000 - U+2FFF
	 * U+F900 - U+FAFF
	 * U+2F800 - U+2FAFF
	 */
	case 0x2194: case 0x219A: case 0x219B:
	case 0x21AE: case 0x21CD: case 0x21CE:
	case 0x21CF: case 0x2204: case 0x2209:
	case 0x220C: case 0x2224: case 0x2226:
	case 0x2241: case 0x2244: case 0x2247:
	case 0x2249: case 0x2260: case 0x2262:
	case 0x226D: case 0x226E: case 0x226F:
	case 0x2270: case 0x2271: case 0x2274:
	case 0x2275: case 0x2276: case 0x2278:
	case 0x2279: case 0x227A: case 0x227B:
	case 0x2280: case 0x2281: case 0x2284:
	case 0x2285: case 0x2288: case 0x2289:
	case 0x22AC: case 0x22AD: case 0x22AE:
	case 0x22AF: case 0x22E0: case 0x22E1:
	case 0x22E2: case 0x22E3: case 0x22EA:
	case 0x22EB: case 0x22EC: case 0x22ED:

	/*
	 * Those code points are not converted to
	 * NFD on Mac OS. I do not know the reason
	 * because it is undocumented.
	 *   NFC        NFD
	 *   1109A  ==> 11099 110BA
	 *   1109C  ==> 1109B 110BA
	 *   110AB  ==> 110A5 110BA
	 */
	case 0x1109A: case 0x1109C: case 0x110AB:
		return (1);
	default:
		return (0);
	}
}

/*
 * Parse `pattern`, a list of upper-case hexadecimal code points, and
 * emit the sequence in four encodings at once: UTF-8 into `out`, wide
 * characters into `wout`, UTF-16BE into `u16be` and UTF-16LE into
 * `u16le`.  Every output is terminated (two zero bytes for the UTF-16
 * buffers).  Any character that is not 0-9 or A-F ends the code point
 * accumulated so far; the end of the string ends the whole scan.
 *
 * No output sizes are passed in, so the caller must supply buffers
 * large enough for the pattern.
 *
 * When `mac_nfd` is non-zero, returns 1 if the first code point of the
 * pattern is one that Mac OS does not convert to NFD; otherwise
 * returns 0.
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	const char *cur;
	unsigned cp = 0;
	char *u8 = out;
	wchar_t *wc = wout;
	char *be = u16be;
	char *le = u16le;
	int keeps_nfc = 0;

	for (cur = pattern; ; cur++) {
		const char c = *cur;

		/* Accumulate hexadecimal digits into the code point. */
		if (c >= '0' && c <= '9') {
			cp = (cp << 4) + (c - '0');
			continue;
		}
		if (c >= 'A' && c <= 'F') {
			cp = (cp << 4) + (c - 'A' + 0x0a);
			continue;
		}

		/* A delimiter: only the first code point (nothing has
		 * been emitted yet) decides the return value. */
		if (mac_nfd && u8 == out && is_mac_nfd_exception(cp))
			keeps_nfc = 1;

		be += unicode_to_utf16be(be, cp);
		le += unicode_to_utf16le(le, cp);
		wc += unicode_to_wc(wc, cp);
		u8 += unicode_to_utf8(u8, cp);

		if (c == '\0')
			break;
		cp = 0;
	}

	/* Terminate every output; UTF-16 needs a two-byte terminator. */
	be[0] = 0;
	be[1] = 0;
	le[0] = 0;
	le[1] = 0;
	*wc = L'\0';
	*u8 = '\0';

	return (keeps_nfc);
}
235 
/*
 * Report whether this build treats wchar_t as Unicode: 1 on native
 * Windows (_WIN32 defined and not Cygwin), 0 everywhere else.
 */
static int
is_wc_unicode(void)
{
	int unicode_wchar = 0;

#if defined(_WIN32) && !defined(__CYGWIN__)
	unicode_wchar = 1;
#endif
	return (unicode_wchar);
}
245 
246 /*
247  * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
248  * On Mac OS, the characters to be Form D.
249  * On other platforms, the characters to be Form C.
250  */
251 static void
252 test_archive_string_normalization_nfc(const char *testdata)
253 {
254 	struct archive *a, *a2;
255 	struct archive_string utf8;
256 	struct archive_mstring mstr;
257 	struct archive_string_conv *f_sconv8, *t_sconv8;
258 	struct archive_string_conv *f_sconv16be, *f_sconv16le;
259 	FILE *fp;
260 	char buff[512];
261 	int line = 0;
262 	int locale_is_utf8, wc_is_unicode;
263 	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
264 
265 	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
266 	wc_is_unicode = is_wc_unicode();
267 	/* If it doesn't exist, just warn and return. */
268 	if (!locale_is_utf8 && !wc_is_unicode) {
269 		skipping("A test of string normalization for NFC requires "
270 		    "a suitable locale; en_US.UTF-8 not available on this "
271 		    "system");
272 		return;
273 	}
274 
275 	archive_string_init(&utf8);
276 	memset(&mstr, 0, sizeof(mstr));
277 
278 	/*
279 	 * Create string conversion objects.
280 	 */
281 	assert((a = archive_read_new()) != NULL);
282 	assertA(NULL != (f_sconv8 =
283 	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
284 	assertA(NULL != (f_sconv16be =
285 	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
286 	assertA(NULL != (f_sconv16le =
287 	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
288 	assert((a2 = archive_write_new()) != NULL);
289 	assertA(NULL != (t_sconv8 =
290 	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
291 	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
292 	    t_sconv8 == NULL) {
293 		/* We cannot continue this test. */
294 		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
295 		return;
296 	}
297 	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
298 	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
299 	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
300 	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
301 
302 	/* Open a test pattern file. */
303 	assert((fp = fopen(testdata, "r")) != NULL);
304 
305 	/*
306 	 * Read test data.
307 	 *  Test data format:
308 	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
309 	 *  Unicode pattern format:
310 	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
311 	 */
312 	while (fgets(buff, sizeof(buff), fp) != NULL) {
313 		char nfc[80], nfd[80];
314 		char utf8_nfc[80], utf8_nfd[80];
315 		char utf16be_nfc[80], utf16be_nfd[80];
316 		char utf16le_nfc[80], utf16le_nfd[80];
317 		wchar_t wc_nfc[40], wc_nfd[40];
318 		char *e, *p;
319 		const wchar_t *wp;
320 		const char *mp;
321 		size_t mplen;
322 
323 		line++;
324 		if (buff[0] == '#')
325 			continue;
326 		p = strchr(buff, ';');
327 		if (p == NULL)
328 			continue;
329 		*p++ = '\0';
330 		/* Copy an NFC pattern */
331 		strncpy(nfc, buff, sizeof(nfc)-1);
332 		nfc[sizeof(nfc)-1] = '\0';
333 		e = p;
334 		p = strchr(p, '\n');
335 		if (p == NULL)
336 			continue;
337 		*p = '\0';
338 		/* Copy an NFD pattern */
339 		strncpy(nfd, e, sizeof(nfd)-1);
340 		nfd[sizeof(nfd)-1] = '\0';
341 
342 		/*
343 		 * Get an NFC patterns.
344 		 */
345 		scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
346 		    nfc, 0);
347 
348 		/*
349 		 * Get an NFD patterns.
350 		 */
351 		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
352 		    nfd, 0);
353 
354 		if (locale_is_utf8) {
355 			/*
356 			 * Normalize an NFD string for import.
357 			 */
358 			assertEqualInt(0, archive_strcpy_l(
359 			    &utf8, utf8_nfd, f_sconv8));
360 			failure("NFD(%s) should be converted to NFC(%s):%d",
361 			    nfd, nfc, line);
362 			assertEqualUTF8String(utf8_nfc, utf8.s);
363 
364 			/*
365 			 * Normalize an NFC string for import.
366 			 */
367 			assertEqualInt(0, archive_strcpy_l(
368 			    &utf8, utf8_nfc, f_sconv8));
369 			failure("NFC(%s) should not be any changed:%d",
370 			    nfc, line);
371 			assertEqualUTF8String(utf8_nfc, utf8.s);
372 
373 			/*
374 			 * Copy an NFC string for export.
375 			 */
376 			assertEqualInt(0, archive_strcpy_l(
377 			    &utf8, utf8_nfc, t_sconv8));
378 			failure("NFC(%s) should not be any changed:%d",
379 			    nfc, line);
380 			assertEqualUTF8String(utf8_nfc, utf8.s);
381 
382 			/*
383 			 * Normalize an NFD string in UTF-16BE for import.
384 			 */
385 			assertEqualInt(0, archive_strncpy_l(
386 			    &utf8, utf16be_nfd, 100000, f_sconv16be));
387 			failure("NFD(%s) should be converted to NFC(%s):%d",
388 			    nfd, nfc, line);
389 			assertEqualUTF8String(utf8_nfc, utf8.s);
390 
391 			/*
392 			 * Normalize an NFD string in UTF-16LE for import.
393 			 */
394 			assertEqualInt(0, archive_strncpy_l(
395 			    &utf8, utf16le_nfd, 100000, f_sconv16le));
396 			failure("NFD(%s) should be converted to NFC(%s):%d",
397 			    nfd, nfc, line);
398 			assertEqualUTF8String(utf8_nfc, utf8.s);
399 		}
400 
401 		/*
402 		 * Test for archive_mstring interface.
403 		 * In specific, Windows platform UTF-16BE is directly
404 		 * converted to/from wide-character to avoid the effect of
405 		 * current locale since windows platform cannot make
406 		 * locale UTF-8.
407 		 */
408 		if (locale_is_utf8 || wc_is_unicode) {
409 			/*
410 			 * Normalize an NFD string in UTF-8 for import.
411 			 */
412 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
413 			    &mstr, utf8_nfd, 100000, f_sconv8));
414 			assertEqualInt(0,
415 			    archive_mstring_get_wcs(a, &mstr, &wp));
416 			failure("UTF-8 NFD(%s) should be converted "
417 			    "to WCS NFC(%s):%d", nfd, nfc, line);
418 			assertEqualWString(wc_nfc, wp);
419 
420 			/*
421 			 * Normalize an NFD string in UTF-16BE for import.
422 			 */
423 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
424 			    &mstr, utf16be_nfd, 100000, f_sconv16be));
425 			assertEqualInt(0,
426 			    archive_mstring_get_wcs(a, &mstr, &wp));
427 			failure("UTF-8 NFD(%s) should be converted "
428 			    "to WCS NFC(%s):%d", nfd, nfc, line);
429 			assertEqualWString(wc_nfc, wp);
430 
431 			/*
432 			 * Normalize an NFD string in UTF-16LE for import.
433 			 */
434 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
435 			    &mstr, utf16le_nfd, 100000, f_sconv16le));
436 			assertEqualInt(0,
437 			    archive_mstring_get_wcs(a, &mstr, &wp));
438 			failure("UTF-8 NFD(%s) should be converted "
439 			    "to WCS NFC(%s):%d", nfd, nfc, line);
440 			assertEqualWString(wc_nfc, wp);
441 
442 			/*
443 			 * Copy an NFC wide-string for export.
444 			 */
445 			assertEqualInt(0,
446 			    archive_mstring_copy_wcs(&mstr, wc_nfc));
447 			assertEqualInt(0, archive_mstring_get_mbs_l(
448 			    a, &mstr, &mp, &mplen, t_sconv8));
449 			failure("WCS NFC(%s) should be UTF-8 NFC:%d"
450 			    ,nfc, line);
451 			assertEqualUTF8String(utf8_nfc, mp);
452 		}
453 	}
454 
455 	archive_string_free(&utf8);
456 	archive_mstring_clean(&mstr);
457 	fclose(fp);
458 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
459 	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
460 }
461 
462 static void
463 test_archive_string_normalization_mac_nfd(const char *testdata)
464 {
465 	struct archive *a, *a2;
466 	struct archive_string utf8;
467 	struct archive_mstring mstr;
468 	struct archive_string_conv *f_sconv8, *t_sconv8;
469 	struct archive_string_conv *f_sconv16be, *f_sconv16le;
470 	FILE *fp;
471 	char buff[512];
472 	int line = 0;
473 	int locale_is_utf8, wc_is_unicode;
474 	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
475 
476 	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
477 	wc_is_unicode = is_wc_unicode();
478 	/* If it doesn't exist, just warn and return. */
479 	if (!locale_is_utf8 && !wc_is_unicode) {
480 		skipping("A test of string normalization for NFD requires "
481 		    "a suitable locale; en_US.UTF-8 not available on this "
482 		    "system");
483 		return;
484 	}
485 
486 	archive_string_init(&utf8);
487 	memset(&mstr, 0, sizeof(mstr));
488 
489 	/*
490 	 * Create string conversion objects.
491 	 */
492 	assert((a = archive_read_new()) != NULL);
493 	assertA(NULL != (f_sconv8 =
494 	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
495 	assertA(NULL != (f_sconv16be =
496 	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
497 	assertA(NULL != (f_sconv16le =
498 	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
499 	assert((a2 = archive_write_new()) != NULL);
500 	assertA(NULL != (t_sconv8 =
501 	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
502 	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
503 	    t_sconv8 == NULL) {
504 		/* We cannot continue this test. */
505 		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
506 		return;
507 	}
508 	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
509 	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
510 	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
511 	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
512 
513 	/* Open a test pattern file. */
514 	assert((fp = fopen(testdata, "r")) != NULL);
515 
516 	/*
517 	 * Read test data.
518 	 *  Test data format:
519 	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
520 	 *  Unicode pattern format:
521 	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
522 	 */
523 	while (fgets(buff, sizeof(buff), fp) != NULL) {
524 		char nfc[80], nfd[80];
525 		char utf8_nfc[80], utf8_nfd[80];
526 		char utf16be_nfc[80], utf16be_nfd[80];
527 		char utf16le_nfc[80], utf16le_nfd[80];
528 		wchar_t wc_nfc[40], wc_nfd[40];
529 		char *e, *p;
530 		const wchar_t *wp;
531 		const char *mp;
532 		size_t mplen;
533 		int should_be_nfc;
534 
535 		line++;
536 		if (buff[0] == '#')
537 			continue;
538 		p = strchr(buff, ';');
539 		if (p == NULL)
540 			continue;
541 		*p++ = '\0';
542 		/* Copy an NFC pattern */
543 		strncpy(nfc, buff, sizeof(nfc)-1);
544 		nfc[sizeof(nfc)-1] = '\0';
545 		e = p;
546 		p = strchr(p, '\n');
547 		if (p == NULL)
548 			continue;
549 		*p = '\0';
550 		/* Copy an NFD pattern */
551 		strncpy(nfd, e, sizeof(nfd)-1);
552 		nfd[sizeof(nfd)-1] = '\0';
553 
554 		/*
555 		 * Get an NFC patterns.
556 		 */
557 		should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
558 			utf16be_nfc, utf16le_nfc, nfc, 1);
559 
560 		/*
561 		 * Get an NFD patterns.
562 		 */
563 		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
564 		    nfd, 0);
565 
566 		if (locale_is_utf8) {
567 			/*
568 			 * Normalize an NFC string for import.
569 			 */
570 			assertEqualInt(0, archive_strcpy_l(
571 			    &utf8, utf8_nfc, f_sconv8));
572 			if (should_be_nfc) {
573 				failure("NFC(%s) should not be converted to"
574 				    " NFD(%s):%d", nfc, nfd, line);
575 				assertEqualUTF8String(utf8_nfc, utf8.s);
576 			} else {
577 				failure("NFC(%s) should be converted to"
578 				    " NFD(%s):%d", nfc, nfd, line);
579 				assertEqualUTF8String(utf8_nfd, utf8.s);
580 			}
581 
582 			/*
583 			 * Normalize an NFD string for import.
584 			 */
585 			assertEqualInt(0, archive_strcpy_l(
586 			    &utf8, utf8_nfd, f_sconv8));
587 			failure("NFD(%s) should not be any changed:%d",
588 			    nfd, line);
589 			assertEqualUTF8String(utf8_nfd, utf8.s);
590 
591 			/*
592 			 * Copy an NFD string for export.
593 			 */
594 			assertEqualInt(0, archive_strcpy_l(
595 			    &utf8, utf8_nfd, t_sconv8));
596 			failure("NFD(%s) should not be any changed:%d",
597 			    nfd, line);
598 			assertEqualUTF8String(utf8_nfd, utf8.s);
599 
600 			/*
601 			 * Normalize an NFC string in UTF-16BE for import.
602 			 */
603 			assertEqualInt(0, archive_strncpy_l(
604 			    &utf8, utf16be_nfc, 100000, f_sconv16be));
605 			if (should_be_nfc) {
606 				failure("NFC(%s) should not be converted to"
607 				    " NFD(%s):%d", nfc, nfd, line);
608 				assertEqualUTF8String(utf8_nfc, utf8.s);
609 			} else {
610 				failure("NFC(%s) should be converted to"
611 				    " NFD(%s):%d", nfc, nfd, line);
612 				assertEqualUTF8String(utf8_nfd, utf8.s);
613 			}
614 
615 			/*
616 			 * Normalize an NFC string in UTF-16LE for import.
617 			 */
618 			assertEqualInt(0, archive_strncpy_l(
619 			    &utf8, utf16le_nfc, 100000, f_sconv16le));
620 			if (should_be_nfc) {
621 				failure("NFC(%s) should not be converted to"
622 				    " NFD(%s):%d", nfc, nfd, line);
623 				assertEqualUTF8String(utf8_nfc, utf8.s);
624 			} else {
625 				failure("NFC(%s) should be converted to"
626 				    " NFD(%s):%d", nfc, nfd, line);
627 				assertEqualUTF8String(utf8_nfd, utf8.s);
628 			}
629 		}
630 
631 		/*
632 		 * Test for archive_mstring interface.
633 		 * In specific, Windows platform UTF-16BE is directly
634 		 * converted to/from wide-character to avoid the effect of
635 		 * current locale since windows platform cannot make
636 		 * locale UTF-8.
637 		 */
638 		if (locale_is_utf8 || wc_is_unicode) {
639 			/*
640 			 * Normalize an NFD string in UTF-8 for import.
641 			 */
642 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
643 			    &mstr, utf8_nfc, 100000, f_sconv8));
644 			assertEqualInt(0,
645 			    archive_mstring_get_wcs(a, &mstr, &wp));
646 			if (should_be_nfc) {
647 				failure("UTF-8 NFC(%s) should not be converted "
648 				    "to WCS NFD(%s):%d", nfc, nfd, line);
649 				assertEqualWString(wc_nfc, wp);
650 			} else {
651 				failure("UTF-8 NFC(%s) should be converted "
652 				    "to WCS NFD(%s):%d", nfc, nfd, line);
653 				assertEqualWString(wc_nfd, wp);
654 			}
655 
656 			/*
657 			 * Normalize an NFD string in UTF-16BE for import.
658 			 */
659 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
660 			    &mstr, utf16be_nfc, 100000, f_sconv16be));
661 			assertEqualInt(0,
662 			    archive_mstring_get_wcs(a, &mstr, &wp));
663 			if (should_be_nfc) {
664 				failure("UTF-16BE NFC(%s) should not be "
665 				    "converted to WCS NFD(%s):%d",
666 				    nfc, nfd, line);
667 				assertEqualWString(wc_nfc, wp);
668 			} else {
669 				failure("UTF-16BE NFC(%s) should be converted "
670 				    "to WCS NFD(%s):%d", nfc, nfd, line);
671 				assertEqualWString(wc_nfd, wp);
672 			}
673 
674 			/*
675 			 * Normalize an NFD string in UTF-16LE for import.
676 			 */
677 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
678 			    &mstr, utf16le_nfc, 100000, f_sconv16le));
679 			assertEqualInt(0,
680 			    archive_mstring_get_wcs(a, &mstr, &wp));
681 			if (should_be_nfc) {
682 				failure("UTF-16LE NFC(%s) should not be "
683 				    "converted to WCS NFD(%s):%d",
684 				    nfc, nfd, line);
685 				assertEqualWString(wc_nfc, wp);
686 			} else {
687 				failure("UTF-16LE NFC(%s) should be converted "
688 				    "to WCS NFD(%s):%d", nfc, nfd, line);
689 				assertEqualWString(wc_nfd, wp);
690 			}
691 
692 			/*
693 			 * Copy an NFD wide-string for export.
694 			 */
695 			assertEqualInt(0, archive_mstring_copy_wcs(
696 			    &mstr, wc_nfd));
697 			assertEqualInt(0, archive_mstring_get_mbs_l(
698 			    a, &mstr, &mp, &mplen, t_sconv8));
699 			failure("WCS NFD(%s) should be UTF-8 NFD:%d"
700 			    ,nfd, line);
701 			assertEqualUTF8String(utf8_nfd, mp);
702 		}
703 	}
704 
705 	archive_string_free(&utf8);
706 	archive_mstring_clean(&mstr);
707 	fclose(fp);
708 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
709 	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
710 }
711 
712 static void
713 test_archive_string_canonicalization(void)
714 {
715 	struct archive *a;
716 	struct archive_string_conv *sconv;
717 
718 	setlocale(LC_ALL, "en_US.UTF-8");
719 
720 	assert((a = archive_read_new()) != NULL);
721 
722 	assertA(NULL != (sconv =
723 	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
724 	failure("Charset name should be UTF-8");
725 	assertEqualString("UTF-8",
726 	    archive_string_conversion_charset_name(sconv));
727 
728 	assertA(NULL != (sconv =
729 	    archive_string_conversion_to_charset(a, "UTF8", 1)));
730 	failure("Charset name should be UTF-8");
731 	assertEqualString("UTF-8",
732 	    archive_string_conversion_charset_name(sconv));
733 
734 	assertA(NULL != (sconv =
735 	    archive_string_conversion_to_charset(a, "utf8", 1)));
736 	failure("Charset name should be UTF-8");
737 	assertEqualString("UTF-8",
738 	    archive_string_conversion_charset_name(sconv));
739 
740 	assertA(NULL != (sconv =
741 	    archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
742 	failure("Charset name should be UTF-16BE");
743 	assertEqualString("UTF-16BE",
744 	    archive_string_conversion_charset_name(sconv));
745 
746 	assertA(NULL != (sconv =
747 	    archive_string_conversion_to_charset(a, "UTF16BE", 1)));
748 	failure("Charset name should be UTF-16BE");
749 	assertEqualString("UTF-16BE",
750 	    archive_string_conversion_charset_name(sconv));
751 
752 	assertA(NULL != (sconv =
753 	    archive_string_conversion_to_charset(a, "utf16be", 1)));
754 	failure("Charset name should be UTF-16BE");
755 	assertEqualString("UTF-16BE",
756 	    archive_string_conversion_charset_name(sconv));
757 
758 	assertA(NULL != (sconv =
759 	    archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
760 	failure("Charset name should be UTF-16LE");
761 	assertEqualString("UTF-16LE",
762 	    archive_string_conversion_charset_name(sconv));
763 
764 	assertA(NULL != (sconv =
765 	    archive_string_conversion_to_charset(a, "UTF16LE", 1)));
766 	failure("Charset name should be UTF-16LE");
767 	assertEqualString("UTF-16LE",
768 	    archive_string_conversion_charset_name(sconv));
769 
770 	assertA(NULL != (sconv =
771 	    archive_string_conversion_to_charset(a, "utf16le", 1)));
772 	failure("Charset name should be UTF-16LE");
773 	assertEqualString("UTF-16LE",
774 	    archive_string_conversion_charset_name(sconv));
775 
776 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
777 
778 }
779 
780 static void
781 check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
782   const char *exp, const wchar_t *wexp)
783 {
784 	/* Do all the tests on a copy so that we can have a clear initial state every time */
785 	struct archive_mstring mstr2;
786 	const char *p = NULL;
787 	const wchar_t *wp = NULL;
788 	size_t len = 0;
789 
790 	memset(&mstr2, 0, sizeof(mstr2));
791 
792 	archive_mstring_copy(&mstr2, mstr);
793 	assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
794 	assertEqualString(exp, p);
795 	p = NULL;
796 
797 	archive_mstring_copy(&mstr2, mstr);
798 	assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
799 	assertEqualString(exp, p);
800 	p = NULL;
801 
802 	archive_mstring_copy(&mstr2, mstr);
803 	assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
804 	assertEqualWString(wexp, wp);
805 	wp = NULL;
806 
807 	archive_mstring_copy(&mstr2, mstr);
808 	assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
809 	assertEqualString(exp, p);
810 	assertEqualInt(len, strlen(exp));
811 	p = NULL;
812 	len = 0;
813 
814 	archive_mstring_clean(&mstr2);
815 }
816 
817 /*
818  * Make sure no matter what the input encoding is, the string can be
819  * converted too all the output encodings.
820  */
821 static void
822 test_archive_string_set_get(void)
823 {
824 	struct archive *a;
825 	struct archive_mstring mstr;
826 	struct archive_string_conv *sc;
827 
828 	setlocale(LC_ALL, "en_US.UTF-8");
829 
830 	assert((a = archive_read_new()) != NULL);
831 	memset(&mstr, 0, sizeof(mstr));
832 
833 	assertA(NULL != (sc =
834 	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
835 	failure("Charset name should be UTF-8");
836 	assertEqualString("UTF-8",
837 	    archive_string_conversion_charset_name(sc));
838 
839 	assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
840 	check_string(a, &mstr, sc, "AAA", L"AAA");
841 	assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
842 	check_string(a, &mstr, sc, "BBBB", L"BBBB");
843 	assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
844 	check_string(a, &mstr, sc, "CCC12", L"CCC12");
845 	assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
846 	check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
847 	assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
848 	check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
849 
850         archive_mstring_clean(&mstr);
851 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
852 
853 }
854 
855 DEFINE_TEST(test_archive_string_conversion)
856 {
857 	static const char reffile[] = "test_archive_string_conversion.txt.Z";
858 	static const char testdata[] = "testdata.txt";
859 	struct archive *a;
860 	struct archive_entry *ae;
861 	char buff[512];
862 	ssize_t size;
863 	FILE *fp;
864 
865 	/*
866 	 * Extract a test pattern file.
867 	 */
868 	extract_reference_file(reffile);
869 	assert((a = archive_read_new()) != NULL);
870 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
871 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
872         assertEqualIntA(a, ARCHIVE_OK,
873             archive_read_open_filename(a, reffile, 512));
874 
875 	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
876 	assert((fp = fopen(testdata, "w")) != NULL);
877 	while ((size = archive_read_data(a, buff, 512)) > 0)
878 		assertEqualInt(size, fwrite(buff, 1, size, fp));
879 	assertEqualInt(0, fclose(fp));
880 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
881 
882 	test_archive_string_normalization_nfc(testdata);
883 	test_archive_string_normalization_mac_nfd(testdata);
884 	test_archive_string_canonicalization();
885 	test_archive_string_set_get();
886 }
887