/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */

#include <sys/types.h>

#include <stddef.h>

int looks_utf8(const char *ibuf, size_t nbytes);
int looks_utf16(const char *ibuf, size_t nbytes);
int decode_utf8(const char *ibuf);
int decode_utf16(const char *ibuf, int bigend);

#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * Only the T entries in the 7-bit range are consulted by the functions
 * in this file.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *	Decide whether some text looks like UTF-8. Returns:
 *
 *		-1: invalid UTF-8
 *		 0: uses odd control characters, so doesn't look like text
 *		 1: 7-bit text
 *		 2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *	Based on RFC 3629. UTF-8 with BOM is not accepted.
 *
 *	A multi-byte sequence cut short by the end of the buffer is not
 *	treated as an error; scanning simply stops there and the verdict
 *	is based on what was seen before it.
 *
 * PUBLIC: int looks_utf8(const char *, size_t);
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	size_t i;
	int n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		unsigned char lead = buf[i];
		/* Allowed range of the first continuation byte. */
		unsigned char lo = 0x80, hi = 0xBF;
		int following;

		if ((lead & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[lead] != T)
				ctrl = 1;
			continue;
		}

		/*
		 * 80..BF (10xxxxxx) can never start a sequence, and C0/C1
		 * could only start an overlong encoding of a 7-bit value.
		 */
		if (lead < 0xC2)
			return -1;

		if (lead < 0xE0) {		/* 110xxxxx */
			following = 1;
		} else if (lead < 0xF0) {	/* 1110xxxx */
			following = 2;
			if (lead == 0xE0)
				lo = 0xA0;	/* below is overlong */
			else if (lead == 0xED)
				hi = 0x9F;	/* above is a surrogate */
		} else if (lead < 0xF5) {	/* 11110xxx */
			following = 3;
			if (lead == 0xF0)
				lo = 0x90;	/* below is overlong */
			else if (lead == 0xF4)
				hi = 0x8F;	/* above is > U+10FFFF */
		} else {
			return -1;		/* F5~FF */
		}

		for (n = 0; n < following; n++) {
			i++;
			if (i >= nbytes)
				goto done;

			if (buf[i] < lo || buf[i] > hi)
				return -1;

			/* Only the first continuation byte is restricted. */
			lo = 0x80;
			hi = 0xBF;
		}

		gotone = 1;
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *	Decide whether some text looks like UTF-16. Returns:
 *
 *		 0: invalid UTF-16
 *		 1: Little-endian UTF-16
 *		 2: Big-endian UTF-16
 *
 *	The buffer must start with a byte order mark; the remaining
 *	16-bit units are checked for odd 7-bit control characters and
 *	for correctly paired surrogates.  A trailing odd byte, or a high
 *	surrogate that is the last unit in the buffer, is ignored.
 *
 * PUBLIC: int looks_utf16(const char *, size_t);
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	int bigend;
	size_t i;
	unsigned int c;
	unsigned int bom;
	int following = 0;	/* non-zero while a low surrogate is due */

	if (nbytes < 2)
		return 0;

	bom = (unsigned int)buf[0] << 8 | buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = (unsigned int)buf[i] << 8 | buf[i + 1];
		else
			c = buf[i] | (unsigned int)buf[i + 1] << 8;

		if (following) {
			/* The previous unit was a high surrogate. */
			if (c < 0xDC00 || c > 0xDFFF)
				return 0;
			/*
			 * Pair complete: go back to expecting an ordinary
			 * unit, otherwise everything after the first pair
			 * would be rejected.
			 */
			following = 0;
		} else if (c < 0xD800 || c > 0xDFFF) {
			/* Ordinary BMP unit; reject odd control characters. */
			if (c < 128 && text_chars[c] != T)
				return 0;
		} else if (c > 0xDBFF) {
			/* Low surrogate without a preceding high one. */
			return 0;
		} else {
			following = 1;
		}
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X

/*
 * decode_utf8 --
 *	Decode a UTF-8 character from byte string to Unicode.
 *	Returns -1 if the first byte is not a UTF-8 leader.
 *
 *	Based on RFC 3629, but without error detection: the continuation
 *	bytes are neither validated nor bounds-checked, so the caller
 *	must make sure that as many bytes as the leader announces are
 *	readable.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;

	if ((buf[0] & 0x80) == 0)		/* 0xxxxxxx */
		return buf[0];

	if ((buf[0] & 0x40) == 0)		/* 10xxxxxx, not a leader */
		return -1;

	if ((buf[0] & 0x20) == 0)		/* 110xxxxx */
		return ((buf[0] ^ 0xC0) << 6) ^ (buf[1] ^ 0x80);

	if ((buf[0] & 0x10) == 0)		/* 1110xxxx */
		return ((buf[0] ^ 0xE0) << 12) ^ ((buf[1] ^ 0x80) << 6)
		    ^ (buf[2] ^ 0x80);

	if ((buf[0] & 0x08) == 0)		/* 11110xxx */
		return ((buf[0] ^ 0xF0) << 18) ^ ((buf[1] ^ 0x80) << 12)
		    ^ ((buf[2] ^ 0x80) << 6) ^ (buf[3] ^ 0x80);

	return -1;				/* 11111xxx */
}

/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode.
 *	Returns -1 if the first unsigned integer is invalid, that is,
 *	a low surrogate.
 *
 *	bigend selects the byte order: non-zero for big-endian, zero
 *	for little-endian.
 *
 *	No error detection on supplementary bytes: when the first unit
 *	is a high surrogate, the following two bytes are read and
 *	combined without being checked.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	if (bigend)
		w1 = (unsigned int)buf[0] << 8 ^ buf[1];
	else
		w1 = buf[0] ^ (unsigned int)buf[1] << 8;

	if (w1 < 0xD800 || w1 > 0xDFFF)		/* ordinary BMP unit */
		return (int)w1;

	if (w1 > 0xDBFF)			/* lone low surrogate */
		return -1;

	if (bigend)
		w2 = (unsigned int)buf[2] << 8 ^ buf[3];
	else
		w2 = buf[2] ^ (unsigned int)buf[3] << 8;

	return (int)((((w1 ^ 0xD800) << 10) ^ (w2 ^ 0xDC00)) + 0x10000);
}