1 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */ 2 /* util/support/t_utf16.c - test UTF-16 conversion functions */ 3 /* 4 * Copyright (C) 2017 by the Massachusetts Institute of Technology. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * * Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 14 * * Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the 17 * distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 22 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 23 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 24 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 25 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 28 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED 30 * OF THE POSSIBILITY OF SUCH DAMAGE. 31 */ 32 33 /* 34 * This program tests conversions between UTF-8 and little-endian UTF-16, with 35 * an eye mainly towards covering UTF-16 edge cases and UTF-8 decoding results 36 * which we detect as invalid in utf8_conv.c. t_utf8.c covers more UTF-8 edge 37 * cases. 38 */ 39 40 #include <stdio.h> 41 #include <string.h> 42 43 #include "k5-platform.h" 44 #include "k5-utf8.h" 45 46 struct test { 47 const char *utf8; 48 const char *utf16; 49 size_t utf16len; 50 } tests[] = { 51 { "", "", 0 }, 52 { "abcd", "a\0b\0c\0d\0", 8 }, 53 /* From RFC 2781 (tests code point 0x12345 and some ASCII) */ 54 { "\xF0\x92\x8D\x85=Ra", "\x08\xD8\x45\xDF=\0R\0a\0", 10 }, 55 /* Lowest and highest Supplementary Plane code points */ 56 { "\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF", 57 "\x00\xD8\x00\xDC \0\xFF\xDB\xFF\xDF", 10 }, 58 /* Basic Multilingual Plane code points near and above surrogate range */ 59 { "\xED\x9F\xBF", "\xFF\xD7", 2 }, 60 { "\xEE\x80\x80 \xEE\xBF\xBF", "\x00\xE0 \0\xFF\xEF", 6 }, 61 /* Invalid UTF-8: decodes to value in surrogate pair range */ 62 { "\xED\xA0\x80", NULL, 0 }, /* 0xD800 */ 63 { "\xED\xAF\xBF", NULL, 0 }, /* 0xDBFF */ 64 { "\xED\xB0\x80", NULL, 0 }, /* 0xDC00 */ 65 { "\xED\xBF\xBF", NULL, 0 }, /* 0xDFFF */ 66 /* Invalid UTF-8: decodes to value above Unicode range */ 67 { "\xF4\x90\x80\x80", NULL, 0 }, 68 { "\xF4\xBF\xBF\xBF", NULL, 0 }, 69 { "\xF5\x80\x80\x80", NULL, 0 }, /* thrown out early due to first byte */ 70 /* Invalid UTF-16: odd numbers of UTF-16 bytes */ 71 { NULL, "\x00", 1 }, 72 { NULL, "\x01\x00\x02", 3 }, 73 /* Invalid UTF-16: high surrogate without a following low surrogate */ 74 { NULL, "\x00\xD8\x00\x00", 4 }, 75 { NULL, "\x00\xD8\xFF\xDB", 4 }, 76 { NULL, "\xFF\xDB", 2 }, 77 /* Invalid UTF-16: low surrogate without a preceding high surrogate */ 78 { NULL, "\x61\x00\x00\xDC", 4 }, 79 { NULL, "\xFF\xDF\xFF\xDB", 4 }, 80 }; 81 82 int 83 main(int argc, char **argv) 84 { 85 int ret; 86 struct test *t; 87 size_t i, utf16len; 88 uint8_t *utf16; 89 char *utf8; 90 91 for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { 92 t = &tests[i]; 93 if (t->utf8 != NULL) { 94 ret = k5_utf8_to_utf16le(t->utf8, &utf16, &utf16len); 95 if (t->utf16 == NULL) { 96 assert(ret == EINVAL); 97 } else { 98 assert(ret == 0); 99 assert(t->utf16len == utf16len); 100 assert(memcmp(t->utf16, utf16, utf16len) == 0); 101 free(utf16); 102 } 103 } 104 105 if (t->utf16 != NULL) { 106 ret = k5_utf16le_to_utf8((uint8_t *)t->utf16, t->utf16len, &utf8); 107 if (t->utf8 == NULL) { 108 assert(ret == EINVAL); 109 } else { 110 assert(ret == 0); 111 assert(strcmp(t->utf8, utf8) == 0); 112 free(utf8); 113 } 114 } 115 } 116 return 0; 117 } 118