1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* util/support/t_utf8.c - test UTF-8 boundary conditions */ 3 /* 4 * Copyright (C) 2015 by the Massachusetts Institute of Technology. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 23 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 30 * OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 #include <stdio.h> 34 #include <string.h> 35 36 #include "k5-platform.h" 37 #include "k5-utf8.h" 38 39 /* 40 * Convenience macro to allow testing of old encodings. 41 * 42 * "Old" means ISO/IEC 10646 prior to 2011, when the highest valid code point 43 * was U+7FFFFFFF instead of U+10FFFF. 44 */ 45 #ifdef OLDENCODINGS 46 #define L(x) (x) 47 #else 48 #define L(x) 0 49 #endif 50 51 /* 52 * len is 0 for invalid encoding prefixes (KRB5_UTF8_CHARLEN2() partially 53 * enforces the validity of the first two bytes, based on masking the second 54 * byte. It doesn't check whether bit 6 is 0, though, and doesn't catch the 55 * range between U+110000 and U+13FFFF). 56 * 57 * ucs is 0 for invalid encodings (including ones with valid prefixes according 58 * to KRB5_UTF8_CHARLEN2(); krb5int_utf8_to_ucs4() will still fail on them 59 * because it checks more things.) Code points above U+10FFFF are excluded by 60 * the actual test code and remain in the table for possibly testing the old 61 * implementation that didn't exclude them. 62 * 63 * Neither krb5int_ucs4_to_utf8() nor krb5int_utf8_to_ucs4() excludes the 64 * surrogate pair range. 65 */ 66 struct testcase { 67 const char *p; 68 krb5_ucs4 ucs; 69 int len; 70 } testcases[] = { 71 { "\x7f", 0x0000007f, 1 }, /* Lowest 1-byte encoding */ 72 { "\xc0\x80", 0x00000000, 0 }, /* Invalid 2-byte encoding */ 73 { "\xc2\x80", 0x00000080, 2 }, /* Lowest valid 2-byte encoding */ 74 { "\xdf\xbf", 0x000007ff, 2 }, /* Highest valid 2-byte encoding*/ 75 { "\xdf\xff", 0x00000000, 2 }, /* Invalid 2-byte encoding*/ 76 { "\xe0\x80\x80", 0x00000000, 0 }, /* Invalid 3-byte encoding */ 77 { "\xe0\xa0\x80", 0x00000800, 3 }, /* Lowest valid 3-byte encoding */ 78 { "\xef\xbf\xbf", 0x0000ffff, 3 }, /* Highest valid 3-byte encoding */ 79 { "\xef\xff\xff", 0x00000000, 3 }, /* Invalid 3-byte encoding */ 80 { "\xf0\x80\x80\x80", 0x00000000, 0 }, /* Invalid 4-byte encoding */ 81 { "\xf0\x90\x80\x80", 0x00010000, 4 }, /* Lowest valid 4-byte encoding */ 82 { "\xf4\x8f\xbf\xbf", 0x0010ffff, 4 }, /* Highest valid 4-byte encoding */ 83 /* Next higher 4-byte encoding (old) */ 84 { "\xf4\x90\x80\x80", 0x00110000, 4 }, 85 /* Highest 4-byte encoding starting with 0xf4 (old) */ 86 { "\xf4\xbf\xbf\xbf", 0x0013ffff, 4 }, 87 /* Next higher 4-byte prefix byte (old) */ 88 { "\xf5\x80\x80\x80", 0x00140000, L(4) }, 89 /* Highest valid 4-byte encoding (old) */ 90 { "\xf7\xbf\xbf\xbf", 0x001fffff, L(4) }, 91 /* Invalid 4-byte encoding */ 92 { "\xf7\xff\xff\xff", 0x00000000, L(4) }, 93 /* Invalid 5-byte encoding */ 94 { "\xf8\x80\x80\x80\x80", 0x00000000, 0 }, 95 /* Lowest valid 5-byte encoding (old) */ 96 { "\xf8\x88\x80\x80\x80", 0x00200000, L(5) }, 97 /* Highest valid 5-byte encoding (old) */ 98 { "\xfb\xbf\xbf\xbf\xbf", 0x03ffffff, L(5) }, 99 /* Invalid 5-byte encoding */ 100 { "\xfb\xff\xff\xff\xff", 0x00000000, L(5) }, 101 /* Invalid 6-byte encoding */ 102 { "\xfc\x80\x80\x80\x80\x80", 0x00000000, 0 }, 103 /* Lowest valid 6-byte encoding (old) */ 104 { "\xfc\x84\x80\x80\x80\x80", 0x04000000, L(6) }, 105 /* Highest valid 6-byte encoding (old) */ 106 { "\xfd\xbf\xbf\xbf\xbf\xbf", 0x7fffffff, L(6) }, 107 /* Invalid 6-byte encoding */ 108 { "\xfd\xff\xff\xff\xff\xff", 0x00000000, L(6) }, 109 }; 110 111 static void 112 printhex(const char *p) 113 { 114 for (; *p != '\0'; p++) { 115 printf("%02x ", (unsigned char)*p); 116 } 117 } 118 119 static void 120 printtest(struct testcase *t) 121 { 122 printhex(t->p); 123 printf("0x%08lx, %d\n", (unsigned long)t->ucs, t->len); 124 } 125 126 static int 127 test_decode(struct testcase *t, int high4) 128 { 129 int len, status = 0; 130 krb5_ucs4 u = 0; 131 132 len = KRB5_UTF8_CHARLEN2(t->p, len); 133 if (len != t->len) { 134 printf("expected len=%d, got len=%d\n", t->len, len); 135 status = 1; 136 } 137 if ((t->len == 0 || high4) && krb5int_utf8_to_ucs4(t->p, &u) != -1) { 138 printf("unexpected success in utf8_to_ucs4\n"); 139 status = 1; 140 } 141 if (krb5int_utf8_to_ucs4(t->p, &u) != 0 && t->ucs != 0 && !high4) { 142 printf("unexpected failure in utf8_to_ucs4\n"); 143 status = 1; 144 } 145 if (t->ucs != u && !high4) { 146 printf("expected 0x%08lx, got 0x%08lx\n", (unsigned long)t->ucs, 147 (unsigned long)u); 148 status = 1; 149 } 150 return status; 151 } 152 153 static int 154 test_encode(struct testcase *t, int high4) 155 { 156 size_t size; 157 char buf[7]; 158 159 memset(buf, 0, sizeof(buf)); 160 size = krb5int_ucs4_to_utf8(t->ucs, buf); 161 if (high4 && size != 0) { 162 printf("unexpected success beyond U+10FFFF\n"); 163 return 1; 164 } 165 if (!high4 && size == 0) { 166 printf("unexpected zero size on encode\n"); 167 return 1; 168 } 169 if (size != 0 && strcmp(t->p, buf) != 0) { 170 printf("expected "); 171 printhex(t->p); 172 printf("got "); 173 printhex(buf); 174 printf("\n"); 175 return 1; 176 } 177 return 0; 178 } 179 180 int 181 main(int argc, char **argv) 182 { 183 size_t ncases = sizeof(testcases) / sizeof(testcases[0]); 184 size_t i; 185 struct testcase *t; 186 int status = 0, verbose = 0; 187 /* Is this a "high" 4-byte encoding above U+10FFFF? */ 188 int high4; 189 190 if (argc == 2 && strcmp(argv[1], "-v") == 0) 191 verbose = 1; 192 for (i = 0; i < ncases; i++) { 193 t = &testcases[i]; 194 if (verbose) 195 printtest(t); 196 #ifndef OLDENCODINGS 197 high4 = t->ucs > 0x10ffff; 198 #else 199 high4 = 0; 200 #endif 201 if (test_decode(t, high4) != 0) 202 status = 1; 203 if (t->ucs == 0) 204 continue; 205 if (test_encode(t, high4) != 0) 206 status = 1; 207 } 208 return status; 209 } 210