1 /* 2 * Kernel module for testing utf-8 support. 3 * 4 * Copyright 2017 Collabora Ltd. 5 * 6 * This software is licensed under the terms of the GNU General Public 7 * License version 2, as published by the Free Software Foundation, and 8 * may be copied, distributed, and modified under those terms. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 */ 15 16 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 17 18 #include <linux/module.h> 19 #include <linux/printk.h> 20 #include <linux/unicode.h> 21 #include <linux/dcache.h> 22 23 #include "utf8n.h" 24 25 unsigned int failed_tests; 26 unsigned int total_tests; 27 28 /* Tests will be based on this version. */ 29 #define latest_maj 12 30 #define latest_min 1 31 #define latest_rev 0 32 33 #define _test(cond, func, line, fmt, ...) do { \ 34 total_tests++; \ 35 if (!cond) { \ 36 failed_tests++; \ 37 pr_err("test %s:%d Failed: %s%s", \ 38 func, line, #cond, (fmt?":":".")); \ 39 if (fmt) \ 40 pr_err(fmt, ##__VA_ARGS__); \ 41 } \ 42 } while (0) 43 #define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) 44 #define test(cond) _test(cond, __func__, __LINE__, "") 45 46 const static struct { 47 /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 48 unsigned char str[10]; 49 unsigned char dec[10]; 50 } nfdi_test_data[] = { 51 /* Trivial sequence */ 52 { 53 /* "ABba" decomposes to itself */ 54 .str = "aBba", 55 .dec = "aBba", 56 }, 57 /* Simple equivalent sequences */ 58 { 59 /* 'VULGAR FRACTION ONE QUARTER' cannot decompose to 60 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on 61 canonical decomposition */ 62 .str = {0xc2, 0xbc, 0x00}, 63 .dec = {0xc2, 0xbc, 0x00}, 64 }, 65 { 66 /* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to 67 'LETTER A' + 'COMBINING DIAERESIS' */ 68 .str = {0xc3, 0xa4, 0x00}, 69 .dec = {0x61, 0xcc, 0x88, 0x00}, 70 }, 71 { 72 /* 'LATIN SMALL LETTER LJ' can't decompose to 73 'LETTER L' + 'LETTER J' on canonical decomposition */ 74 .str = {0xC7, 0x89, 0x00}, 75 .dec = {0xC7, 0x89, 0x00}, 76 }, 77 { 78 /* GREEK ANO TELEIA decomposes to MIDDLE DOT */ 79 .str = {0xCE, 0x87, 0x00}, 80 .dec = {0xC2, 0xB7, 0x00} 81 }, 82 /* Canonical ordering */ 83 { 84 /* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes 85 to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */ 86 .str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0}, 87 .dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0}, 88 }, 89 { 90 /* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK' 91 decomposes to 92 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */ 93 .str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00}, 94 95 .dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00}, 96 }, 97 98 }; 99 100 const static struct { 101 /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 102 unsigned char str[30]; 103 unsigned char ncf[30]; 104 } nfdicf_test_data[] = { 105 /* Trivial sequences */ 106 { 107 /* "ABba" folds to lowercase */ 108 .str = {0x41, 0x42, 0x62, 0x61, 0x00}, 109 .ncf = {0x61, 0x62, 0x62, 0x61, 0x00}, 110 }, 111 { 112 /* All ASCII folds to lower-case */ 113 .str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1", 114 .ncf = "abcdefghijklmnopqrstuvwxyz0.1", 115 }, 116 { 117 /* LATIN SMALL LETTER SHARP S folds to 118 LATIN SMALL LETTER S + LATIN SMALL LETTER S */ 119 .str = {0xc3, 0x9f, 0x00}, 120 .ncf = {0x73, 0x73, 0x00}, 121 }, 122 { 123 /* LATIN CAPITAL LETTER A WITH RING ABOVE folds to 124 LATIN SMALL LETTER A + COMBINING RING ABOVE */ 125 .str = {0xC3, 0x85, 0x00}, 126 .ncf = {0x61, 0xcc, 0x8a, 0x00}, 127 }, 128 /* Introduced by UTF-8.0.0. */ 129 /* Cherokee letters are interesting test-cases because they fold 130 to upper-case. Before 8.0.0, Cherokee lowercase were 131 undefined, thus, the folding from LC is not stable between 132 7.0.0 -> 8.0.0, but it is from UC. */ 133 { 134 /* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */ 135 .str = {0xea, 0xad, 0xb0, 0x00}, 136 .ncf = {0xe1, 0x8e, 0xa0, 0x00}, 137 }, 138 { 139 /* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */ 140 .str = {0xe1, 0x8f, 0xb8, 0x00}, 141 .ncf = {0xe1, 0x8f, 0xb0, 0x00}, 142 }, 143 { 144 /* OLD HUNGARIAN CAPITAL LETTER AMB folds to 145 OLD HUNGARIAN SMALL LETTER AMB */ 146 .str = {0xf0, 0x90, 0xb2, 0x83, 0x00}, 147 .ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00}, 148 }, 149 /* Introduced by UTF-9.0.0. */ 150 { 151 /* OSAGE CAPITAL LETTER CHA folds to 152 OSAGE SMALL LETTER CHA */ 153 .str = {0xf0, 0x90, 0x92, 0xb5, 0x00}, 154 .ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00}, 155 }, 156 { 157 /* LATIN CAPITAL LETTER SMALL CAPITAL I folds to 158 LATIN LETTER SMALL CAPITAL I */ 159 .str = {0xea, 0x9e, 0xae, 0x00}, 160 .ncf = {0xc9, 0xaa, 0x00}, 161 }, 162 /* Introduced by UTF-11.0.0. */ 163 { 164 /* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI 165 CAPITAL LETTER AN */ 166 .str = {0xe1, 0xb2, 0x90, 0x00}, 167 .ncf = {0xe1, 0x83, 0x90, 0x00}, 168 } 169 }; 170 171 static void check_utf8_nfdi(void) 172 { 173 int i; 174 struct utf8cursor u8c; 175 const struct utf8data *data; 176 177 data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev)); 178 if (!data) { 179 pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", 180 __func__, latest_maj, latest_min, latest_rev); 181 return; 182 } 183 184 for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 185 int len = strlen(nfdi_test_data[i].str); 186 int nlen = strlen(nfdi_test_data[i].dec); 187 int j = 0; 188 unsigned char c; 189 190 test((utf8len(data, nfdi_test_data[i].str) == nlen)); 191 test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen)); 192 193 if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0) 194 pr_err("can't create cursor\n"); 195 196 while ((c = utf8byte(&u8c)) > 0) { 197 test_f((c == nfdi_test_data[i].dec[j]), 198 "Unexpected byte 0x%x should be 0x%x\n", 199 c, nfdi_test_data[i].dec[j]); 200 j++; 201 } 202 203 test((j == nlen)); 204 } 205 } 206 207 static void check_utf8_nfdicf(void) 208 { 209 int i; 210 struct utf8cursor u8c; 211 const struct utf8data *data; 212 213 data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev)); 214 if (!data) { 215 pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", 216 __func__, latest_maj, latest_min, latest_rev); 217 return; 218 } 219 220 for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 221 int len = strlen(nfdicf_test_data[i].str); 222 int nlen = strlen(nfdicf_test_data[i].ncf); 223 int j = 0; 224 unsigned char c; 225 226 test((utf8len(data, nfdicf_test_data[i].str) == nlen)); 227 test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen)); 228 229 if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0) 230 pr_err("can't create cursor\n"); 231 232 while ((c = utf8byte(&u8c)) > 0) { 233 test_f((c == nfdicf_test_data[i].ncf[j]), 234 "Unexpected byte 0x%x should be 0x%x\n", 235 c, nfdicf_test_data[i].ncf[j]); 236 j++; 237 } 238 239 test((j == nlen)); 240 } 241 } 242 243 static void check_utf8_comparisons(void) 244 { 245 int i; 246 struct unicode_map *table = utf8_load("12.1.0"); 247 248 if (IS_ERR(table)) { 249 pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n", 250 __func__, latest_maj, latest_min, latest_rev); 251 return; 252 } 253 254 for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 255 const struct qstr s1 = {.name = nfdi_test_data[i].str, 256 .len = sizeof(nfdi_test_data[i].str)}; 257 const struct qstr s2 = {.name = nfdi_test_data[i].dec, 258 .len = sizeof(nfdi_test_data[i].dec)}; 259 260 test_f(!utf8_strncmp(table, &s1, &s2), 261 "%s %s comparison mismatch\n", s1.name, s2.name); 262 } 263 264 for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 265 const struct qstr s1 = {.name = nfdicf_test_data[i].str, 266 .len = sizeof(nfdicf_test_data[i].str)}; 267 const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, 268 .len = sizeof(nfdicf_test_data[i].ncf)}; 269 270 test_f(!utf8_strncasecmp(table, &s1, &s2), 271 "%s %s comparison mismatch\n", s1.name, s2.name); 272 } 273 274 utf8_unload(table); 275 } 276 277 static void check_supported_versions(void) 278 { 279 /* Unicode 7.0.0 should be supported. */ 280 test(utf8version_is_supported(7, 0, 0)); 281 282 /* Unicode 9.0.0 should be supported. */ 283 test(utf8version_is_supported(9, 0, 0)); 284 285 /* Unicode 1x.0.0 (the latest version) should be supported. */ 286 test(utf8version_is_supported(latest_maj, latest_min, latest_rev)); 287 288 /* Next versions don't exist. */ 289 test(!utf8version_is_supported(13, 0, 0)); 290 test(!utf8version_is_supported(0, 0, 0)); 291 test(!utf8version_is_supported(-1, -1, -1)); 292 } 293 294 static int __init init_test_ucd(void) 295 { 296 failed_tests = 0; 297 total_tests = 0; 298 299 check_supported_versions(); 300 check_utf8_nfdi(); 301 check_utf8_nfdicf(); 302 check_utf8_comparisons(); 303 304 if (!failed_tests) 305 pr_info("All %u tests passed\n", total_tests); 306 else 307 pr_err("%u out of %u tests failed\n", failed_tests, 308 total_tests); 309 return 0; 310 } 311 312 static void __exit exit_test_ucd(void) 313 { 314 } 315 316 module_init(init_test_ucd); 317 module_exit(exit_test_ucd); 318 319 MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); 320 MODULE_LICENSE("GPL"); 321