/*-
 * Copyright (c) 2011-2012 Michihiro NAKAJIMA
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include "test.h"

#include <locale.h>

#define __LIBARCHIVE_TEST
#include "archive_string.h"

/*
Execute the following to rebuild the data for this program:
   tail -n +36 test_archive_string_conversion.c | /bin/sh
#
# This requires http://unicode.org/Public/6.0.0/ucd/NormalizationTest.txt
#
if="NormalizationTest.txt"
if [ ! -f ${if} ]; then
  echo "Not found: \"${if}\""
  exit 0
fi
of=test_archive_string_conversion.txt.Z
awk -F ';'  '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} > ${of}.uu
exit 1
*/

/*
 * Encode the code point `uc` as UTF-8 into the buffer at `p`.
 *
 * Writes 1 to 4 bytes (no terminating NUL is added) and returns the
 * number of bytes written.  The value is not validated: everything above
 * U+FFFF is emitted as a four-byte sequence, so the caller must provide
 * room for at least 4 bytes.
 */
static int
unicode_to_utf8(char *p, uint32_t uc)
{
	/*
	 * Store through an unsigned char pointer: lead and continuation
	 * bytes are >= 0x80 and must not be implicitly narrowed into a
	 * (possibly signed) char.
	 */
	unsigned char *q = (unsigned char *)p;

	/* Translate code point to UTF8 */
	if (uc <= 0x7f) {
		*q++ = (unsigned char)uc;
	} else if (uc <= 0x7ff) {
		*q++ = (unsigned char)(0xc0 | ((uc >> 6) & 0x1f));
		*q++ = (unsigned char)(0x80 | (uc & 0x3f));
	} else if (uc <= 0xffff) {
		*q++ = (unsigned char)(0xe0 | ((uc >> 12) & 0x0f));
		*q++ = (unsigned char)(0x80 | ((uc >> 6) & 0x3f));
		*q++ = (unsigned char)(0x80 | (uc & 0x3f));
	} else {
		*q++ = (unsigned char)(0xf0 | ((uc >> 18) & 0x07));
		*q++ = (unsigned char)(0x80 | ((uc >> 12) & 0x3f));
		*q++ = (unsigned char)(0x80 | ((uc >> 6) & 0x3f));
		*q++ = (unsigned char)(0x80 | (uc & 0x3f));
	}
	return ((int)(q - (unsigned char *)p));
}

/*
 * Store the 16-bit value `u` at `pp` in big-endian byte order
 * (most significant byte first).  Exactly two bytes are written.
 */
static void
archive_be16enc(void *pp, uint16_t u)
{
	unsigned char *p = (unsigned char *)pp;

	p[0] = (unsigned char)((u >> 8) & 0xff);
	p[1] = (unsigned char)(u & 0xff);
}

/*
 * Encode the code point `uc` as UTF-16BE into the buffer at `p`.
 *
 * Returns the number of bytes written: 2 for a value that fits in one
 * 16-bit unit, 4 when a surrogate pair is needed.  No terminator is
 * written and the value is not validated.
 */
static int
unicode_to_utf16be(char *p, uint32_t uc)
{
	if (uc > 0xffff) {
		/* The code point does not fit into a single 16-bit unit;
		 * split it into a high/low surrogate pair. */
		uc -= 0x10000;
		archive_be16enc(p,
		    (uint16_t)(((uc >> 10) & 0x3ff) + 0xD800));
		archive_be16enc(p + 2,
		    (uint16_t)((uc & 0x3ff) + 0xDC00));
		return (4);
	}
	archive_be16enc(p, (uint16_t)uc);
	return (2);
}

/*
 * Store the 16-bit value `u` at `pp` in little-endian byte order
 * (least significant byte first).  Exactly two bytes are written.
 */
static void
archive_le16enc(void *pp, uint16_t u)
{
	unsigned char *p = (unsigned char *)pp;

	p[0] = (unsigned char)(u & 0xff);
	p[1] = (unsigned char)((u >> 8) & 0xff);
}

/*
 * Encode the code point `uc` as UTF-16LE into the buffer at `p`.
 *
 * Returns the number of bytes written: 2 for a value that fits in one
 * 16-bit unit, 4 when a surrogate pair is needed.  No terminator is
 * written and the value is not validated.
 */
static size_t
unicode_to_utf16le(char *p, uint32_t uc)
{
	if (uc > 0xffff) {
		/* The code point does not fit into a single 16-bit unit;
		 * split it into a high/low surrogate pair. */
		uc -= 0x10000;
		archive_le16enc(p,
		    (uint16_t)(((uc >> 10) & 0x3ff) + 0xD800));
		archive_le16enc(p + 2,
		    (uint16_t)((uc & 0x3ff) + 0xDC00));
		return (4);
	}
	archive_le16enc(p, (uint16_t)uc);
	return (2);
}

/*
 * Return the size of wchar_t in bytes as an int.  Kept as a function so
 * that callers compare a run-time value instead of a compile-time
 * constant expression.
 */
static int
wc_size(void)
{
	return ((int)sizeof(wchar_t));
}

/*
 * Store the code point `uc` at `wp` as wide character(s).
 *
 * When wchar_t is 4 bytes wide the value is stored as-is.  Otherwise a
 * value above U+FFFF is split into a surrogate pair.  Returns the number
 * of wchar_t elements written (1 or 2); no terminator is written.
 */
static int
unicode_to_wc(wchar_t *wp, uint32_t uc)
{
	if (wc_size() == 4) {
		*wp = (wchar_t)uc;
		return (1);
	}
	if (uc > 0xffff) {
		/* We have a code point that won't fit into a
		 * wchar_t; convert it to a surrogate pair. */
		uc -= 0x10000;
		wp[0] = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800);
		wp[1] = (wchar_t)((uc & 0x3ff) + 0xDC00);
		return (2);
	}
	*wp = (wchar_t)uc;
	return (1);
}

/*
 * Convert a textual Unicode pattern into four encodings at once.
 *
 * `pattern` is a sequence of upper-case hexadecimal code points; any
 * character that is not [0-9A-F] (including the terminating NUL) ends
 * the current code point.  Every code point is appended to:
 *   out    - UTF-8, NUL terminated
 *   wout   - wide characters, L'\0' terminated
 *   u16be  - UTF-16BE, terminated by two zero bytes
 *   u16le  - UTF-16LE, terminated by two zero bytes
 * The caller must supply buffers large enough for the whole pattern.
 *
 * When `mac_nfd` is non-zero, the first code point of the pattern is
 * checked against a list of code points that are not decomposed on
 * Mac OS; the function returns 1 if it is on that list and 0 otherwise.
 * With `mac_nfd` zero the return value is always 0.
 *
 * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not
 * converted to NFD on Mac OS.
 * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html
 */
static int
scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le,
    const char *pattern, int mac_nfd)
{
	unsigned uc = 0;
	const char *p = pattern;
	char *op = out;
	wchar_t *owp = wout;
	char *op16be = u16be;
	char *op16le = u16le;
	int ret = 0;

	for (;;) {
		if (*p >= '0' && *p <= '9')
			uc = (uc << 4) + (*p - '0');
		else if (*p >= 'A' && *p <= 'F')
			uc = (uc << 4) + (*p - 'A' + 0x0a);
		else {
			/* `op == out` holds only while nothing has been
			 * emitted yet, i.e. for the first code point. */
			if (mac_nfd && op == out) {
				/*
				 * These are not converted to NFD on Mac OS.
				 * U+2000 - U+2FFF
				 * U+F900 - U+FAFF
				 * U+2F800 - U+2FAFF
				 */
				switch (uc) {
				case 0x2194: case 0x219A: case 0x219B:
				case 0x21AE: case 0x21CD: case 0x21CE:
				case 0x21CF: case 0x2204: case 0x2209:
				case 0x220C: case 0x2224: case 0x2226:
				case 0x2241: case 0x2244: case 0x2247:
				case 0x2249: case 0x2260: case 0x2262:
				case 0x226D: case 0x226E: case 0x226F:
				case 0x2270: case 0x2271: case 0x2274:
				case 0x2275: case 0x2276: case 0x2278:
				case 0x2279: case 0x227A: case 0x227B:
				case 0x2280: case 0x2281: case 0x2284:
				case 0x2285: case 0x2288: case 0x2289:
				case 0x22AC: case 0x22AD: case 0x22AE:
				case 0x22AF: case 0x22E0: case 0x22E1:
				case 0x22E2: case 0x22E3: case 0x22EA:
				case 0x22EB: case 0x22EC: case 0x22ED:

				/*
				 * Those code points are not converted to
				 * NFD on Mac OS. I do not know the reason
				 * because it is undocumented.
				 *   NFC        NFD
				 *   1109A  ==> 11099 110BA
				 *   1109C  ==> 1109B 110BA
				 *   110AB  ==> 110A5 110BA
				 */
				case 0x1109A: case 0x1109C: case 0x110AB:
					ret = 1;
					break;
				default:
					break;
				}
			}
			op16be += unicode_to_utf16be(op16be, uc);
			op16le += unicode_to_utf16le(op16le, uc);
			owp += unicode_to_wc(owp, uc);
			op += unicode_to_utf8(op, uc);
			if (!*p) {
				/* End of pattern: terminate every output. */
				*op16be++ = 0;
				*op16be = 0;
				*op16le++ = 0;
				*op16le = 0;
				*owp = L'\0';
				*op = '\0';
				break;
			}
			uc = 0;
		}
		p++;
	}
	return (ret);
}

/*
 * Report whether this build is a native Windows build (_WIN32 defined
 * and __CYGWIN__ not defined).  Returns 1 in that case and 0 otherwise;
 * the callers use the result to run the wide-character checks even when
 * a UTF-8 locale could not be selected.
 */
static int
is_wc_unicode(void)
{
#if defined(_WIN32) && !defined(__CYGWIN__)
	return (1);
#else
	return (0);
#endif
}

244 /*
245  * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters.
246  * On Mac OS, the characters to be Form D.
247  * On other platforms, the characters to be Form C.
248  */
249 static void
test_archive_string_normalization_nfc(const char * testdata)250 test_archive_string_normalization_nfc(const char *testdata)
251 {
252 	struct archive *a, *a2;
253 	struct archive_string utf8;
254 	struct archive_mstring mstr;
255 	struct archive_string_conv *f_sconv8, *t_sconv8;
256 	struct archive_string_conv *f_sconv16be, *f_sconv16le;
257 	FILE *fp;
258 	char buff[512];
259 	int line = 0;
260 	int locale_is_utf8, wc_is_unicode;
261 	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_C;
262 
263 	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
264 	wc_is_unicode = is_wc_unicode();
265 	/* If it doesn't exist, just warn and return. */
266 	if (!locale_is_utf8 && !wc_is_unicode) {
267 		skipping("A test of string normalization for NFC requires "
268 		    "a suitable locale; en_US.UTF-8 not available on this "
269 		    "system");
270 		return;
271 	}
272 
273 	archive_string_init(&utf8);
274 	memset(&mstr, 0, sizeof(mstr));
275 
276 	/*
277 	 * Create string conversion objects.
278 	 */
279 	assert((a = archive_read_new()) != NULL);
280 	assertA(NULL != (f_sconv8 =
281 	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
282 	assertA(NULL != (f_sconv16be =
283 	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
284 	assertA(NULL != (f_sconv16le =
285 	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
286 	assert((a2 = archive_write_new()) != NULL);
287 	assertA(NULL != (t_sconv8 =
288 	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
289 	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
290 	    t_sconv8 == NULL) {
291 		/* We cannot continue this test. */
292 		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
293 		return;
294 	}
295 	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
296 	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
297 	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
298 	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
299 
300 	/* Open a test pattern file. */
301 	assert((fp = fopen(testdata, "r")) != NULL);
302 
303 	/*
304 	 * Read test data.
305 	 *  Test data format:
306 	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
307 	 *  Unicode pattern format:
308 	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
309 	 */
310 	while (fgets(buff, sizeof(buff), fp) != NULL) {
311 		char nfc[80], nfd[80];
312 		char utf8_nfc[80], utf8_nfd[80];
313 		char utf16be_nfc[80], utf16be_nfd[80];
314 		char utf16le_nfc[80], utf16le_nfd[80];
315 		wchar_t wc_nfc[40], wc_nfd[40];
316 		char *e, *p;
317 		const wchar_t *wp;
318 		const char *mp;
319 		size_t mplen;
320 
321 		line++;
322 		if (buff[0] == '#')
323 			continue;
324 		p = strchr(buff, ';');
325 		if (p == NULL)
326 			continue;
327 		*p++ = '\0';
328 		/* Copy an NFC pattern */
329 		strncpy(nfc, buff, sizeof(nfc)-1);
330 		nfc[sizeof(nfc)-1] = '\0';
331 		e = p;
332 		p = strchr(p, '\n');
333 		if (p == NULL)
334 			continue;
335 		*p = '\0';
336 		/* Copy an NFD pattern */
337 		strncpy(nfd, e, sizeof(nfd)-1);
338 		nfd[sizeof(nfd)-1] = '\0';
339 
340 		/*
341 		 * Get an NFC patterns.
342 		 */
343 		scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc,
344 		    nfc, 0);
345 
346 		/*
347 		 * Get an NFD patterns.
348 		 */
349 		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
350 		    nfd, 0);
351 
352 		if (locale_is_utf8) {
353 			/*
354 			 * Normalize an NFD string for import.
355 			 */
356 			assertEqualInt(0, archive_strcpy_l(
357 			    &utf8, utf8_nfd, f_sconv8));
358 			failure("NFD(%s) should be converted to NFC(%s):%d",
359 			    nfd, nfc, line);
360 			assertEqualUTF8String(utf8_nfc, utf8.s);
361 
362 			/*
363 			 * Normalize an NFC string for import.
364 			 */
365 			assertEqualInt(0, archive_strcpy_l(
366 			    &utf8, utf8_nfc, f_sconv8));
367 			failure("NFC(%s) should not be any changed:%d",
368 			    nfc, line);
369 			assertEqualUTF8String(utf8_nfc, utf8.s);
370 
371 			/*
372 			 * Copy an NFC string for export.
373 			 */
374 			assertEqualInt(0, archive_strcpy_l(
375 			    &utf8, utf8_nfc, t_sconv8));
376 			failure("NFC(%s) should not be any changed:%d",
377 			    nfc, line);
378 			assertEqualUTF8String(utf8_nfc, utf8.s);
379 
380 			/*
381 			 * Normalize an NFD string in UTF-16BE for import.
382 			 */
383 			assertEqualInt(0, archive_strncpy_l(
384 			    &utf8, utf16be_nfd, 100000, f_sconv16be));
385 			failure("NFD(%s) should be converted to NFC(%s):%d",
386 			    nfd, nfc, line);
387 			assertEqualUTF8String(utf8_nfc, utf8.s);
388 
389 			/*
390 			 * Normalize an NFD string in UTF-16LE for import.
391 			 */
392 			assertEqualInt(0, archive_strncpy_l(
393 			    &utf8, utf16le_nfd, 100000, f_sconv16le));
394 			failure("NFD(%s) should be converted to NFC(%s):%d",
395 			    nfd, nfc, line);
396 			assertEqualUTF8String(utf8_nfc, utf8.s);
397 		}
398 
399 		/*
400 		 * Test for archive_mstring interface.
401 		 * In specific, Windows platform UTF-16BE is directly
402 		 * converted to/from wide-character to avoid the effect of
403 		 * current locale since windows platform cannot make
404 		 * locale UTF-8.
405 		 */
406 		if (locale_is_utf8 || wc_is_unicode) {
407 			/*
408 			 * Normalize an NFD string in UTF-8 for import.
409 			 */
410 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
411 			    &mstr, utf8_nfd, 100000, f_sconv8));
412 			assertEqualInt(0,
413 			    archive_mstring_get_wcs(a, &mstr, &wp));
414 			failure("UTF-8 NFD(%s) should be converted "
415 			    "to WCS NFC(%s):%d", nfd, nfc, line);
416 			assertEqualWString(wc_nfc, wp);
417 
418 			/*
419 			 * Normalize an NFD string in UTF-16BE for import.
420 			 */
421 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
422 			    &mstr, utf16be_nfd, 100000, f_sconv16be));
423 			assertEqualInt(0,
424 			    archive_mstring_get_wcs(a, &mstr, &wp));
425 			failure("UTF-8 NFD(%s) should be converted "
426 			    "to WCS NFC(%s):%d", nfd, nfc, line);
427 			assertEqualWString(wc_nfc, wp);
428 
429 			/*
430 			 * Normalize an NFD string in UTF-16LE for import.
431 			 */
432 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
433 			    &mstr, utf16le_nfd, 100000, f_sconv16le));
434 			assertEqualInt(0,
435 			    archive_mstring_get_wcs(a, &mstr, &wp));
436 			failure("UTF-8 NFD(%s) should be converted "
437 			    "to WCS NFC(%s):%d", nfd, nfc, line);
438 			assertEqualWString(wc_nfc, wp);
439 
440 			/*
441 			 * Copy an NFC wide-string for export.
442 			 */
443 			assertEqualInt(0,
444 			    archive_mstring_copy_wcs(&mstr, wc_nfc));
445 			assertEqualInt(0, archive_mstring_get_mbs_l(
446 			    a, &mstr, &mp, &mplen, t_sconv8));
447 			failure("WCS NFC(%s) should be UTF-8 NFC:%d"
448 			    ,nfc, line);
449 			assertEqualUTF8String(utf8_nfc, mp);
450 		}
451 	}
452 
453 	archive_string_free(&utf8);
454 	archive_mstring_clean(&mstr);
455 	fclose(fp);
456 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
457 	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
458 }
459 
460 static void
test_archive_string_normalization_mac_nfd(const char * testdata)461 test_archive_string_normalization_mac_nfd(const char *testdata)
462 {
463 	struct archive *a, *a2;
464 	struct archive_string utf8;
465 	struct archive_mstring mstr;
466 	struct archive_string_conv *f_sconv8, *t_sconv8;
467 	struct archive_string_conv *f_sconv16be, *f_sconv16le;
468 	FILE *fp;
469 	char buff[512];
470 	int line = 0;
471 	int locale_is_utf8, wc_is_unicode;
472 	int sconv_opt = SCONV_SET_OPT_NORMALIZATION_D;
473 
474 	locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8"));
475 	wc_is_unicode = is_wc_unicode();
476 	/* If it doesn't exist, just warn and return. */
477 	if (!locale_is_utf8 && !wc_is_unicode) {
478 		skipping("A test of string normalization for NFD requires "
479 		    "a suitable locale; en_US.UTF-8 not available on this "
480 		    "system");
481 		return;
482 	}
483 
484 	archive_string_init(&utf8);
485 	memset(&mstr, 0, sizeof(mstr));
486 
487 	/*
488 	 * Create string conversion objects.
489 	 */
490 	assert((a = archive_read_new()) != NULL);
491 	assertA(NULL != (f_sconv8 =
492 	    archive_string_conversion_from_charset(a, "UTF-8", 0)));
493 	assertA(NULL != (f_sconv16be =
494 	    archive_string_conversion_from_charset(a, "UTF-16BE", 0)));
495 	assertA(NULL != (f_sconv16le =
496 	    archive_string_conversion_from_charset(a, "UTF-16LE", 0)));
497 	assert((a2 = archive_write_new()) != NULL);
498 	assertA(NULL != (t_sconv8 =
499 	    archive_string_conversion_to_charset(a2, "UTF-8", 0)));
500 	if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL ||
501 	    t_sconv8 == NULL) {
502 		/* We cannot continue this test. */
503 		assertEqualInt(ARCHIVE_OK, archive_read_free(a));
504 		return;
505 	}
506 	archive_string_conversion_set_opt(f_sconv8, sconv_opt);
507 	archive_string_conversion_set_opt(f_sconv16be, sconv_opt);
508 	archive_string_conversion_set_opt(f_sconv16le, sconv_opt);
509 	archive_string_conversion_set_opt(t_sconv8, sconv_opt);
510 
511 	/* Open a test pattern file. */
512 	assert((fp = fopen(testdata, "r")) != NULL);
513 
514 	/*
515 	 * Read test data.
516 	 *  Test data format:
517 	 *     <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n'
518 	 *  Unicode pattern format:
519 	 *     [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,}
520 	 */
521 	while (fgets(buff, sizeof(buff), fp) != NULL) {
522 		char nfc[80], nfd[80];
523 		char utf8_nfc[80], utf8_nfd[80];
524 		char utf16be_nfc[80], utf16be_nfd[80];
525 		char utf16le_nfc[80], utf16le_nfd[80];
526 		wchar_t wc_nfc[40], wc_nfd[40];
527 		char *e, *p;
528 		const wchar_t *wp;
529 		const char *mp;
530 		size_t mplen;
531 		int should_be_nfc;
532 
533 		line++;
534 		if (buff[0] == '#')
535 			continue;
536 		p = strchr(buff, ';');
537 		if (p == NULL)
538 			continue;
539 		*p++ = '\0';
540 		/* Copy an NFC pattern */
541 		strncpy(nfc, buff, sizeof(nfc)-1);
542 		nfc[sizeof(nfc)-1] = '\0';
543 		e = p;
544 		p = strchr(p, '\n');
545 		if (p == NULL)
546 			continue;
547 		*p = '\0';
548 		/* Copy an NFD pattern */
549 		strncpy(nfd, e, sizeof(nfd)-1);
550 		nfd[sizeof(nfd)-1] = '\0';
551 
552 		/*
553 		 * Get an NFC patterns.
554 		 */
555 		should_be_nfc = scan_unicode_pattern(utf8_nfc, wc_nfc,
556 			utf16be_nfc, utf16le_nfc, nfc, 1);
557 
558 		/*
559 		 * Get an NFD patterns.
560 		 */
561 		scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd,
562 		    nfd, 0);
563 
564 		if (locale_is_utf8) {
565 			/*
566 			 * Normalize an NFC string for import.
567 			 */
568 			assertEqualInt(0, archive_strcpy_l(
569 			    &utf8, utf8_nfc, f_sconv8));
570 			if (should_be_nfc) {
571 				failure("NFC(%s) should not be converted to"
572 				    " NFD(%s):%d", nfc, nfd, line);
573 				assertEqualUTF8String(utf8_nfc, utf8.s);
574 			} else {
575 				failure("NFC(%s) should be converted to"
576 				    " NFD(%s):%d", nfc, nfd, line);
577 				assertEqualUTF8String(utf8_nfd, utf8.s);
578 			}
579 
580 			/*
581 			 * Normalize an NFD string for import.
582 			 */
583 			assertEqualInt(0, archive_strcpy_l(
584 			    &utf8, utf8_nfd, f_sconv8));
585 			failure("NFD(%s) should not be any changed:%d",
586 			    nfd, line);
587 			assertEqualUTF8String(utf8_nfd, utf8.s);
588 
589 			/*
590 			 * Copy an NFD string for export.
591 			 */
592 			assertEqualInt(0, archive_strcpy_l(
593 			    &utf8, utf8_nfd, t_sconv8));
594 			failure("NFD(%s) should not be any changed:%d",
595 			    nfd, line);
596 			assertEqualUTF8String(utf8_nfd, utf8.s);
597 
598 			/*
599 			 * Normalize an NFC string in UTF-16BE for import.
600 			 */
601 			assertEqualInt(0, archive_strncpy_l(
602 			    &utf8, utf16be_nfc, 100000, f_sconv16be));
603 			if (should_be_nfc) {
604 				failure("NFC(%s) should not be converted to"
605 				    " NFD(%s):%d", nfc, nfd, line);
606 				assertEqualUTF8String(utf8_nfc, utf8.s);
607 			} else {
608 				failure("NFC(%s) should be converted to"
609 				    " NFD(%s):%d", nfc, nfd, line);
610 				assertEqualUTF8String(utf8_nfd, utf8.s);
611 			}
612 
613 			/*
614 			 * Normalize an NFC string in UTF-16LE for import.
615 			 */
616 			assertEqualInt(0, archive_strncpy_l(
617 			    &utf8, utf16le_nfc, 100000, f_sconv16le));
618 			if (should_be_nfc) {
619 				failure("NFC(%s) should not be converted to"
620 				    " NFD(%s):%d", nfc, nfd, line);
621 				assertEqualUTF8String(utf8_nfc, utf8.s);
622 			} else {
623 				failure("NFC(%s) should be converted to"
624 				    " NFD(%s):%d", nfc, nfd, line);
625 				assertEqualUTF8String(utf8_nfd, utf8.s);
626 			}
627 		}
628 
629 		/*
630 		 * Test for archive_mstring interface.
631 		 * In specific, Windows platform UTF-16BE is directly
632 		 * converted to/from wide-character to avoid the effect of
633 		 * current locale since windows platform cannot make
634 		 * locale UTF-8.
635 		 */
636 		if (locale_is_utf8 || wc_is_unicode) {
637 			/*
638 			 * Normalize an NFD string in UTF-8 for import.
639 			 */
640 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
641 			    &mstr, utf8_nfc, 100000, f_sconv8));
642 			assertEqualInt(0,
643 			    archive_mstring_get_wcs(a, &mstr, &wp));
644 			if (should_be_nfc) {
645 				failure("UTF-8 NFC(%s) should not be converted "
646 				    "to WCS NFD(%s):%d", nfc, nfd, line);
647 				assertEqualWString(wc_nfc, wp);
648 			} else {
649 				failure("UTF-8 NFC(%s) should be converted "
650 				    "to WCS NFD(%s):%d", nfc, nfd, line);
651 				assertEqualWString(wc_nfd, wp);
652 			}
653 
654 			/*
655 			 * Normalize an NFD string in UTF-16BE for import.
656 			 */
657 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
658 			    &mstr, utf16be_nfc, 100000, f_sconv16be));
659 			assertEqualInt(0,
660 			    archive_mstring_get_wcs(a, &mstr, &wp));
661 			if (should_be_nfc) {
662 				failure("UTF-16BE NFC(%s) should not be "
663 				    "converted to WCS NFD(%s):%d",
664 				    nfc, nfd, line);
665 				assertEqualWString(wc_nfc, wp);
666 			} else {
667 				failure("UTF-16BE NFC(%s) should be converted "
668 				    "to WCS NFD(%s):%d", nfc, nfd, line);
669 				assertEqualWString(wc_nfd, wp);
670 			}
671 
672 			/*
673 			 * Normalize an NFD string in UTF-16LE for import.
674 			 */
675 			assertEqualInt(0, archive_mstring_copy_mbs_len_l(
676 			    &mstr, utf16le_nfc, 100000, f_sconv16le));
677 			assertEqualInt(0,
678 			    archive_mstring_get_wcs(a, &mstr, &wp));
679 			if (should_be_nfc) {
680 				failure("UTF-16LE NFC(%s) should not be "
681 				    "converted to WCS NFD(%s):%d",
682 				    nfc, nfd, line);
683 				assertEqualWString(wc_nfc, wp);
684 			} else {
685 				failure("UTF-16LE NFC(%s) should be converted "
686 				    "to WCS NFD(%s):%d", nfc, nfd, line);
687 				assertEqualWString(wc_nfd, wp);
688 			}
689 
690 			/*
691 			 * Copy an NFD wide-string for export.
692 			 */
693 			assertEqualInt(0, archive_mstring_copy_wcs(
694 			    &mstr, wc_nfd));
695 			assertEqualInt(0, archive_mstring_get_mbs_l(
696 			    a, &mstr, &mp, &mplen, t_sconv8));
697 			failure("WCS NFD(%s) should be UTF-8 NFD:%d"
698 			    ,nfd, line);
699 			assertEqualUTF8String(utf8_nfd, mp);
700 		}
701 	}
702 
703 	archive_string_free(&utf8);
704 	archive_mstring_clean(&mstr);
705 	fclose(fp);
706 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
707 	assertEqualInt(ARCHIVE_OK, archive_write_free(a2));
708 }
709 
710 static void
test_archive_string_canonicalization(void)711 test_archive_string_canonicalization(void)
712 {
713 	struct archive *a;
714 	struct archive_string_conv *sconv;
715 
716 	setlocale(LC_ALL, "en_US.UTF-8");
717 
718 	assert((a = archive_read_new()) != NULL);
719 
720 	assertA(NULL != (sconv =
721 	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
722 	failure("Charset name should be UTF-8");
723 	assertEqualString("UTF-8",
724 	    archive_string_conversion_charset_name(sconv));
725 
726 	assertA(NULL != (sconv =
727 	    archive_string_conversion_to_charset(a, "UTF8", 1)));
728 	failure("Charset name should be UTF-8");
729 	assertEqualString("UTF-8",
730 	    archive_string_conversion_charset_name(sconv));
731 
732 	assertA(NULL != (sconv =
733 	    archive_string_conversion_to_charset(a, "utf8", 1)));
734 	failure("Charset name should be UTF-8");
735 	assertEqualString("UTF-8",
736 	    archive_string_conversion_charset_name(sconv));
737 
738 	assertA(NULL != (sconv =
739 	    archive_string_conversion_to_charset(a, "UTF-16BE", 1)));
740 	failure("Charset name should be UTF-16BE");
741 	assertEqualString("UTF-16BE",
742 	    archive_string_conversion_charset_name(sconv));
743 
744 	assertA(NULL != (sconv =
745 	    archive_string_conversion_to_charset(a, "UTF16BE", 1)));
746 	failure("Charset name should be UTF-16BE");
747 	assertEqualString("UTF-16BE",
748 	    archive_string_conversion_charset_name(sconv));
749 
750 	assertA(NULL != (sconv =
751 	    archive_string_conversion_to_charset(a, "utf16be", 1)));
752 	failure("Charset name should be UTF-16BE");
753 	assertEqualString("UTF-16BE",
754 	    archive_string_conversion_charset_name(sconv));
755 
756 	assertA(NULL != (sconv =
757 	    archive_string_conversion_to_charset(a, "UTF-16LE", 1)));
758 	failure("Charset name should be UTF-16LE");
759 	assertEqualString("UTF-16LE",
760 	    archive_string_conversion_charset_name(sconv));
761 
762 	assertA(NULL != (sconv =
763 	    archive_string_conversion_to_charset(a, "UTF16LE", 1)));
764 	failure("Charset name should be UTF-16LE");
765 	assertEqualString("UTF-16LE",
766 	    archive_string_conversion_charset_name(sconv));
767 
768 	assertA(NULL != (sconv =
769 	    archive_string_conversion_to_charset(a, "utf16le", 1)));
770 	failure("Charset name should be UTF-16LE");
771 	assertEqualString("UTF-16LE",
772 	    archive_string_conversion_charset_name(sconv));
773 
774 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
775 
776 }
777 
778 static void
check_string(struct archive * a,struct archive_mstring * mstr,struct archive_string_conv * sc,const char * exp,const wchar_t * wexp)779 check_string(struct archive *a, struct archive_mstring *mstr, struct archive_string_conv *sc,
780   const char *exp, const wchar_t *wexp)
781 {
782 	/* Do all the tests on a copy so that we can have a clear initial state every time */
783 	struct archive_mstring mstr2;
784 	const char *p = NULL;
785 	const wchar_t *wp = NULL;
786 	size_t len = 0;
787 
788 	memset(&mstr2, 0, sizeof(mstr2));
789 
790 	archive_mstring_copy(&mstr2, mstr);
791 	assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p));
792 	assertEqualString(exp, p);
793 	p = NULL;
794 
795 	archive_mstring_copy(&mstr2, mstr);
796 	assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p));
797 	assertEqualString(exp, p);
798 	p = NULL;
799 
800 	archive_mstring_copy(&mstr2, mstr);
801 	assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp));
802 	assertEqualWString(wexp, wp);
803 	wp = NULL;
804 
805 	archive_mstring_copy(&mstr2, mstr);
806 	assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc));
807 	assertEqualString(exp, p);
808 	assertEqualInt(len, strlen(exp));
809 	p = NULL;
810 	len = 0;
811 
812 	archive_mstring_clean(&mstr2);
813 }
814 
815 /*
816  * Make sure no matter what the input encoding is, the string can be
817  * converted too all the output encodings.
818  */
819 static void
test_archive_string_set_get(void)820 test_archive_string_set_get(void)
821 {
822 	struct archive *a;
823 	struct archive_mstring mstr;
824 	struct archive_string_conv *sc;
825 
826 	setlocale(LC_ALL, "en_US.UTF-8");
827 
828 	assert((a = archive_read_new()) != NULL);
829 	memset(&mstr, 0, sizeof(mstr));
830 
831 	assertA(NULL != (sc =
832 	    archive_string_conversion_to_charset(a, "UTF-8", 1)));
833 	failure("Charset name should be UTF-8");
834 	assertEqualString("UTF-8",
835 	    archive_string_conversion_charset_name(sc));
836 
837 	assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA"));
838 	check_string(a, &mstr, sc, "AAA", L"AAA");
839 	assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB"));
840 	check_string(a, &mstr, sc, "BBBB", L"BBBB");
841 	assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12"));
842 	check_string(a, &mstr, sc, "CCC12", L"CCC12");
843 	assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc));
844 	check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l");
845 	assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H"));
846 	check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H");
847 
848         archive_mstring_clean(&mstr);
849 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
850 
851 }
852 
DEFINE_TEST(test_archive_string_conversion)853 DEFINE_TEST(test_archive_string_conversion)
854 {
855 	static const char reffile[] = "test_archive_string_conversion.txt.Z";
856 	static const char testdata[] = "testdata.txt";
857 	struct archive *a;
858 	struct archive_entry *ae;
859 	char buff[512];
860 	ssize_t size;
861 	FILE *fp;
862 
863 	/*
864 	 * Extract a test pattern file.
865 	 */
866 	extract_reference_file(reffile);
867 	assert((a = archive_read_new()) != NULL);
868 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
869 	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a));
870         assertEqualIntA(a, ARCHIVE_OK,
871             archive_read_open_filename(a, reffile, 512));
872 
873 	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
874 	assert((fp = fopen(testdata, "w")) != NULL);
875 	while ((size = archive_read_data(a, buff, 512)) > 0)
876 		assertEqualInt(size, fwrite(buff, 1, size, fp));
877 	assertEqualInt(0, fclose(fp));
878 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
879 
880 	test_archive_string_normalization_nfc(testdata);
881 	test_archive_string_normalization_mac_nfd(testdata);
882 	test_archive_string_canonicalization();
883 	test_archive_string_set_get();
884 }
885 
/*
 * Native Windows only: a wide string of three Cyrillic letters
 * (U+043F U+0440 U+0438) stored in an archive_mstring must be
 * retrievable as the matching UTF-8 byte sequence.  Skipped on every
 * other platform.
 */
DEFINE_TEST(test_archive_string_conversion_utf16_utf8)
{
#if !defined(_WIN32) || defined(__CYGWIN__)
	skipping("This test is meant to verify unicode string handling on Windows");
#else
	struct archive_mstring mstr;
	const char *utf8_string;

	memset(&mstr, 0, sizeof(mstr));

	assertEqualInt(ARCHIVE_OK,
	    archive_mstring_copy_wcs(&mstr, L"\U0000043f\U00000440\U00000438"));

	/* Conversion from WCS to UTF-8 should always succeed */
	assertEqualInt(ARCHIVE_OK,
	    archive_mstring_get_utf8(NULL, &mstr, &utf8_string));
	assertEqualString("\xD0\xBF\xD1\x80\xD0\xB8", utf8_string);

	archive_mstring_clean(&mstr);
#endif
}

/*
 * Native Windows only: a UTF-8 string of three Cyrillic letters stored
 * in an archive_mstring must be retrievable as the matching wide string
 * (U+043F U+0440 U+0438).  Skipped on every other platform.
 */
DEFINE_TEST(test_archive_string_conversion_utf8_utf16)
{
#if !defined(_WIN32) || defined(__CYGWIN__)
	skipping("This test is meant to verify unicode string handling on Windows");
#else
	struct archive_mstring mstr;
	const wchar_t *wcs_string;

	memset(&mstr, 0, sizeof(mstr));

	/* The test expects the byte length of the input (6) here. */
	assertEqualInt(6,
	    archive_mstring_copy_utf8(&mstr, "\xD0\xBF\xD1\x80\xD0\xB8"));

	/* Conversion from UTF-8 to WCS should always succeed */
	assertEqualInt(ARCHIVE_OK,
	    archive_mstring_get_wcs(NULL, &mstr, &wcs_string));
	assertEqualWString(L"\U0000043f\U00000440\U00000438", wcs_string);

	archive_mstring_clean(&mstr);
#endif
}

/*
 * Native Windows only, with the C locale: archive_mstring_update_utf8()
 * is expected to return -1 and leave the MBS form unset, while still
 * setting the UTF-8 and WCS forms.  Skipped on every other platform.
 */
DEFINE_TEST(test_archive_string_update_utf8_win)
{
#if !defined(_WIN32) || defined(__CYGWIN__)
	skipping("This test is meant to verify unicode string handling on Windows"
	    " with the C locale");
#else
	static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
	static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
	struct archive_mstring mstr;
	int r;

	memset(&mstr, 0, sizeof(mstr));

	r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);

	/* On Windows, this should reliably fail with the C locale */
	assertEqualInt(-1, r);
	assertEqualInt(0, mstr.aes_set & AES_SET_MBS);

	/* NOTE: We access the internals to validate that they were set by the
	 *       'archive_mstring_update_utf8' function */
	/* UTF-8 should always be set */
	assertEqualInt(AES_SET_UTF8, mstr.aes_set & AES_SET_UTF8);
	assertEqualString(utf8_string, mstr.aes_utf8.s);
	/* WCS should always be set as well */
	assertEqualInt(AES_SET_WCS, mstr.aes_set & AES_SET_WCS);
	assertEqualWString(wcs_string, mstr.aes_wcs.s);

	archive_mstring_clean(&mstr);
#endif
}

DEFINE_TEST(test_archive_string_update_utf8_utf8)962 DEFINE_TEST(test_archive_string_update_utf8_utf8)
963 {
964 	static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
965 	static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
966 	struct archive_mstring mstr;
967 	int r;
968 
969 	memset(&mstr, 0, sizeof(mstr));
970 
971 	if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
972 		skipping("UTF-8 not supported on this system.");
973 		return;
974 	}
975 
976 	r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
977 
978 	/* All conversions should have succeeded */
979 	assertEqualInt(0, r);
980 	assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
981 	assertEqualString(utf8_string, mstr.aes_utf8.s);
982 	assertEqualString(utf8_string, mstr.aes_mbs.s);
983 	assertEqualWString(wcs_string, mstr.aes_wcs.s);
984 
985 	archive_mstring_clean(&mstr);
986 }
987 
DEFINE_TEST(test_archive_string_update_utf8_koi8)988 DEFINE_TEST(test_archive_string_update_utf8_koi8)
989 {
990 	static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
991 	static const char koi8_string[] = "\xD0\xD2\xC9";
992 	static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
993 	struct archive_mstring mstr;
994 	int r;
995 
996 	memset(&mstr, 0, sizeof(mstr));
997 
998 	if (setlocale(LC_ALL, "ru_RU.KOI8-R") == NULL) {
999 		skipping("KOI8-R locale not available on this system.");
1000 		return;
1001 	}
1002 
1003 	r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
1004 
1005 	/* All conversions should have succeeded */
1006 	assertEqualInt(0, r);
1007 	assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
1008 	assertEqualString(utf8_string, mstr.aes_utf8.s);
1009 	assertEqualString(koi8_string, mstr.aes_mbs.s);
1010 #if defined(_WIN32) && !defined(__CYGWIN__)
1011 	assertEqualWString(wcs_string, mstr.aes_wcs.s);
1012 #else
1013 	/* No guarantee of how WCS strings behave, however this test test is
1014 	 * primarily meant for Windows */
1015 	(void)wcs_string;
1016 #endif
1017 
1018 	archive_mstring_clean(&mstr);
1019 }
1020