/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* util/support/t_utf8.c - test UTF-8 boundary conditions */
/*
 * Copyright (C) 2015 by the Massachusetts Institute of Technology.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <string.h>

#include "k5-platform.h"
#include "k5-utf8.h"

/*
 * Convenience macro to allow testing of old encodings.
 *
 * "Old" means ISO/IEC 10646 prior to 2011, when the highest valid code point
 * was U+7FFFFFFF instead of U+10FFFF.
 *
 * L(x) expands to x when OLDENCODINGS is defined and to 0 otherwise.  The test
 * table uses it for the expected prefix length of several entries whose lead
 * byte is 0xf5 or above.
 */
#ifdef OLDENCODINGS
#define L(x) (x)
#else
#define L(x) 0
#endif

51*7f2fe78bSCy Schubert /*
52*7f2fe78bSCy Schubert * len is 0 for invalid encoding prefixes (KRB5_UTF8_CHARLEN2() partially
53*7f2fe78bSCy Schubert * enforces the validity of the first two bytes, based on masking the second
54*7f2fe78bSCy Schubert * byte. It doesn't check whether bit 6 is 0, though, and doesn't catch the
55*7f2fe78bSCy Schubert * range between U+110000 and U+13FFFF).
56*7f2fe78bSCy Schubert *
57*7f2fe78bSCy Schubert * ucs is 0 for invalid encodings (including ones with valid prefixes according
58*7f2fe78bSCy Schubert * to KRB5_UTF8_CHARLEN2(); krb5int_utf8_to_ucs4() will still fail on them
59*7f2fe78bSCy Schubert * because it checks more things.) Code points above U+10FFFF are excluded by
60*7f2fe78bSCy Schubert * the actual test code and remain in the table for possibly testing the old
61*7f2fe78bSCy Schubert * implementation that didn't exclude them.
62*7f2fe78bSCy Schubert *
63*7f2fe78bSCy Schubert * Neither krb5int_ucs4_to_utf8() nor krb5int_utf8_to_ucs4() excludes the
64*7f2fe78bSCy Schubert * surrogate pair range.
65*7f2fe78bSCy Schubert */
66*7f2fe78bSCy Schubert struct testcase {
67*7f2fe78bSCy Schubert const char *p;
68*7f2fe78bSCy Schubert krb5_ucs4 ucs;
69*7f2fe78bSCy Schubert int len;
70*7f2fe78bSCy Schubert } testcases[] = {
71*7f2fe78bSCy Schubert { "\x7f", 0x0000007f, 1 }, /* Lowest 1-byte encoding */
72*7f2fe78bSCy Schubert { "\xc0\x80", 0x00000000, 0 }, /* Invalid 2-byte encoding */
73*7f2fe78bSCy Schubert { "\xc2\x80", 0x00000080, 2 }, /* Lowest valid 2-byte encoding */
74*7f2fe78bSCy Schubert { "\xdf\xbf", 0x000007ff, 2 }, /* Highest valid 2-byte encoding*/
75*7f2fe78bSCy Schubert { "\xdf\xff", 0x00000000, 2 }, /* Invalid 2-byte encoding*/
76*7f2fe78bSCy Schubert { "\xe0\x80\x80", 0x00000000, 0 }, /* Invalid 3-byte encoding */
77*7f2fe78bSCy Schubert { "\xe0\xa0\x80", 0x00000800, 3 }, /* Lowest valid 3-byte encoding */
78*7f2fe78bSCy Schubert { "\xef\xbf\xbf", 0x0000ffff, 3 }, /* Highest valid 3-byte encoding */
79*7f2fe78bSCy Schubert { "\xef\xff\xff", 0x00000000, 3 }, /* Invalid 3-byte encoding */
80*7f2fe78bSCy Schubert { "\xf0\x80\x80\x80", 0x00000000, 0 }, /* Invalid 4-byte encoding */
81*7f2fe78bSCy Schubert { "\xf0\x90\x80\x80", 0x00010000, 4 }, /* Lowest valid 4-byte encoding */
82*7f2fe78bSCy Schubert { "\xf4\x8f\xbf\xbf", 0x0010ffff, 4 }, /* Highest valid 4-byte encoding */
83*7f2fe78bSCy Schubert /* Next higher 4-byte encoding (old) */
84*7f2fe78bSCy Schubert { "\xf4\x90\x80\x80", 0x00110000, 4 },
85*7f2fe78bSCy Schubert /* Highest 4-byte encoding starting with 0xf4 (old) */
86*7f2fe78bSCy Schubert { "\xf4\xbf\xbf\xbf", 0x0013ffff, 4 },
87*7f2fe78bSCy Schubert /* Next higher 4-byte prefix byte (old) */
88*7f2fe78bSCy Schubert { "\xf5\x80\x80\x80", 0x00140000, L(4) },
89*7f2fe78bSCy Schubert /* Highest valid 4-byte encoding (old) */
90*7f2fe78bSCy Schubert { "\xf7\xbf\xbf\xbf", 0x001fffff, L(4) },
91*7f2fe78bSCy Schubert /* Invalid 4-byte encoding */
92*7f2fe78bSCy Schubert { "\xf7\xff\xff\xff", 0x00000000, L(4) },
93*7f2fe78bSCy Schubert /* Invalid 5-byte encoding */
94*7f2fe78bSCy Schubert { "\xf8\x80\x80\x80\x80", 0x00000000, 0 },
95*7f2fe78bSCy Schubert /* Lowest valid 5-byte encoding (old) */
96*7f2fe78bSCy Schubert { "\xf8\x88\x80\x80\x80", 0x00200000, L(5) },
97*7f2fe78bSCy Schubert /* Highest valid 5-byte encoding (old) */
98*7f2fe78bSCy Schubert { "\xfb\xbf\xbf\xbf\xbf", 0x03ffffff, L(5) },
99*7f2fe78bSCy Schubert /* Invalid 5-byte encoding */
100*7f2fe78bSCy Schubert { "\xfb\xff\xff\xff\xff", 0x00000000, L(5) },
101*7f2fe78bSCy Schubert /* Invalid 6-byte encoding */
102*7f2fe78bSCy Schubert { "\xfc\x80\x80\x80\x80\x80", 0x00000000, 0 },
103*7f2fe78bSCy Schubert /* Lowest valid 6-byte encoding (old) */
104*7f2fe78bSCy Schubert { "\xfc\x84\x80\x80\x80\x80", 0x04000000, L(6) },
105*7f2fe78bSCy Schubert /* Highest valid 6-byte encoding (old) */
106*7f2fe78bSCy Schubert { "\xfd\xbf\xbf\xbf\xbf\xbf", 0x7fffffff, L(6) },
107*7f2fe78bSCy Schubert /* Invalid 6-byte encoding */
108*7f2fe78bSCy Schubert { "\xfd\xff\xff\xff\xff\xff", 0x00000000, L(6) },
109*7f2fe78bSCy Schubert };
110*7f2fe78bSCy Schubert
/*
 * Print each byte of the NUL-terminated string p to stdout as two lowercase
 * hex digits followed by a space.  No newline is written.
 */
static void
printhex(const char *p)
{
    for (; *p != '\0'; p++) {
        /* Go through unsigned char so bytes >= 0x80 are not sign-extended. */
        printf("%02x ", (unsigned char)*p);
    }
}

119*7f2fe78bSCy Schubert static void
printtest(struct testcase * t)120*7f2fe78bSCy Schubert printtest(struct testcase *t)
121*7f2fe78bSCy Schubert {
122*7f2fe78bSCy Schubert printhex(t->p);
123*7f2fe78bSCy Schubert printf("0x%08lx, %d\n", (unsigned long)t->ucs, t->len);
124*7f2fe78bSCy Schubert }
125*7f2fe78bSCy Schubert
126*7f2fe78bSCy Schubert static int
test_decode(struct testcase * t,int high4)127*7f2fe78bSCy Schubert test_decode(struct testcase *t, int high4)
128*7f2fe78bSCy Schubert {
129*7f2fe78bSCy Schubert int len, status = 0;
130*7f2fe78bSCy Schubert krb5_ucs4 u = 0;
131*7f2fe78bSCy Schubert
132*7f2fe78bSCy Schubert len = KRB5_UTF8_CHARLEN2(t->p, len);
133*7f2fe78bSCy Schubert if (len != t->len) {
134*7f2fe78bSCy Schubert printf("expected len=%d, got len=%d\n", t->len, len);
135*7f2fe78bSCy Schubert status = 1;
136*7f2fe78bSCy Schubert }
137*7f2fe78bSCy Schubert if ((t->len == 0 || high4) && krb5int_utf8_to_ucs4(t->p, &u) != -1) {
138*7f2fe78bSCy Schubert printf("unexpected success in utf8_to_ucs4\n");
139*7f2fe78bSCy Schubert status = 1;
140*7f2fe78bSCy Schubert }
141*7f2fe78bSCy Schubert if (krb5int_utf8_to_ucs4(t->p, &u) != 0 && t->ucs != 0 && !high4) {
142*7f2fe78bSCy Schubert printf("unexpected failure in utf8_to_ucs4\n");
143*7f2fe78bSCy Schubert status = 1;
144*7f2fe78bSCy Schubert }
145*7f2fe78bSCy Schubert if (t->ucs != u && !high4) {
146*7f2fe78bSCy Schubert printf("expected 0x%08lx, got 0x%08lx\n", (unsigned long)t->ucs,
147*7f2fe78bSCy Schubert (unsigned long)u);
148*7f2fe78bSCy Schubert status = 1;
149*7f2fe78bSCy Schubert }
150*7f2fe78bSCy Schubert return status;
151*7f2fe78bSCy Schubert }
152*7f2fe78bSCy Schubert
153*7f2fe78bSCy Schubert static int
test_encode(struct testcase * t,int high4)154*7f2fe78bSCy Schubert test_encode(struct testcase *t, int high4)
155*7f2fe78bSCy Schubert {
156*7f2fe78bSCy Schubert size_t size;
157*7f2fe78bSCy Schubert char buf[7];
158*7f2fe78bSCy Schubert
159*7f2fe78bSCy Schubert memset(buf, 0, sizeof(buf));
160*7f2fe78bSCy Schubert size = krb5int_ucs4_to_utf8(t->ucs, buf);
161*7f2fe78bSCy Schubert if (high4 && size != 0) {
162*7f2fe78bSCy Schubert printf("unexpected success beyond U+10FFFF\n");
163*7f2fe78bSCy Schubert return 1;
164*7f2fe78bSCy Schubert }
165*7f2fe78bSCy Schubert if (!high4 && size == 0) {
166*7f2fe78bSCy Schubert printf("unexpected zero size on encode\n");
167*7f2fe78bSCy Schubert return 1;
168*7f2fe78bSCy Schubert }
169*7f2fe78bSCy Schubert if (size != 0 && strcmp(t->p, buf) != 0) {
170*7f2fe78bSCy Schubert printf("expected ");
171*7f2fe78bSCy Schubert printhex(t->p);
172*7f2fe78bSCy Schubert printf("got ");
173*7f2fe78bSCy Schubert printhex(buf);
174*7f2fe78bSCy Schubert printf("\n");
175*7f2fe78bSCy Schubert return 1;
176*7f2fe78bSCy Schubert }
177*7f2fe78bSCy Schubert return 0;
178*7f2fe78bSCy Schubert }
179*7f2fe78bSCy Schubert
180*7f2fe78bSCy Schubert int
main(int argc,char ** argv)181*7f2fe78bSCy Schubert main(int argc, char **argv)
182*7f2fe78bSCy Schubert {
183*7f2fe78bSCy Schubert size_t ncases = sizeof(testcases) / sizeof(testcases[0]);
184*7f2fe78bSCy Schubert size_t i;
185*7f2fe78bSCy Schubert struct testcase *t;
186*7f2fe78bSCy Schubert int status = 0, verbose = 0;
187*7f2fe78bSCy Schubert /* Is this a "high" 4-byte encoding above U+10FFFF? */
188*7f2fe78bSCy Schubert int high4;
189*7f2fe78bSCy Schubert
190*7f2fe78bSCy Schubert if (argc == 2 && strcmp(argv[1], "-v") == 0)
191*7f2fe78bSCy Schubert verbose = 1;
192*7f2fe78bSCy Schubert for (i = 0; i < ncases; i++) {
193*7f2fe78bSCy Schubert t = &testcases[i];
194*7f2fe78bSCy Schubert if (verbose)
195*7f2fe78bSCy Schubert printtest(t);
196*7f2fe78bSCy Schubert #ifndef OLDENCODINGS
197*7f2fe78bSCy Schubert high4 = t->ucs > 0x10ffff;
198*7f2fe78bSCy Schubert #else
199*7f2fe78bSCy Schubert high4 = 0;
200*7f2fe78bSCy Schubert #endif
201*7f2fe78bSCy Schubert if (test_decode(t, high4) != 0)
202*7f2fe78bSCy Schubert status = 1;
203*7f2fe78bSCy Schubert if (t->ucs == 0)
204*7f2fe78bSCy Schubert continue;
205*7f2fe78bSCy Schubert if (test_encode(t, high4) != 0)
206*7f2fe78bSCy Schubert status = 1;
207*7f2fe78bSCy Schubert }
208*7f2fe78bSCy Schubert return status;
209*7f2fe78bSCy Schubert }
210