/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */

#ifndef lint
static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
#endif /* not lint */

#include <sys/types.h>

int looks_utf8(const char *ibuf, size_t nbytes);
int looks_utf16(const char *ibuf, size_t nbytes);
int decode_utf8(const char *ibuf);
int decode_utf16(const char *ibuf, int bigend);

#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Classification of every byte value, indexed by the byte itself.
 * Only the T entries are consulted by the detectors below.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *	Decide whether some text looks like UTF-8.  Returns:
 *
 *	-1: invalid UTF-8
 *	 0: uses odd control characters, so doesn't look like text
 *	 1: 7-bit text
 *	 2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *	Based on RFC 3629.  UTF-8 with BOM is not accepted.
 *
 *	A multi-byte sequence cut off by the end of the buffer is not an
 *	error; scanning stops there and the truncated sequence does not
 *	count as evidence of UTF-8.
 *
 * PUBLIC: int looks_utf8 __P((const char *, size_t));
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	size_t i;
	int n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		if ((buf[i] & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[buf[i]] != T)
				ctrl = 1;
		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
			return -1;
		} else {			/* 11xxxxxx begins UTF-8 */
			int following;
			/*
			 * Allowed range of the next continuation byte.  It is
			 * narrower than 80..BF right after a few leaders, to
			 * rule out overlong forms, surrogates and values
			 * beyond U+10FFFF.
			 */
			unsigned char lo = 0x80, hi = 0xBF;

			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
				if (buf[i] < 0xC2)		/* C0, C1 */
					return -1;
				following = 1;
			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
				following = 2;
				if (buf[i] == 0xE0)
					lo = 0xA0;	/* overlong */
				else if (buf[i] == 0xED)
					hi = 0x9F;	/* surrogates */
			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
				if (buf[i] > 0xF4)		/* F5, F6, F7 */
					return -1;
				following = 3;
				if (buf[i] == 0xF0)
					lo = 0x90;	/* overlong */
				else if (buf[i] == 0xF4)
					hi = 0x8F;	/* > U+10FFFF */
			} else
				return -1;			/* F8~FF */

			for (n = 0; n < following; n++) {
				i++;
				if (i >= nbytes)
					goto done;

				/* Must be 10xxxxxx, within the allowed range. */
				if (buf[i] < lo || buf[i] > hi)
					return -1;

				/* Only the first continuation is restricted. */
				lo = 0x80;
				hi = 0xBF;
			}

			gotone = 1;
		}
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *	Decide whether some text looks like UTF-16.  Returns:
 *
 *	 0: invalid UTF-16
 *	 1: Little-endian UTF-16
 *	 2: Big-endian UTF-16
 *
 *	The buffer must start with a byte order mark.  A trailing odd
 *	byte is ignored, and a high surrogate that is the last code unit
 *	in the buffer is not treated as an error.
 *
 * PUBLIC: int looks_utf16 __P((const char *, size_t));
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	int bigend;
	size_t i;
	unsigned int c;
	int bom;
	int following = 0;	/* set while a low surrogate is expected */

	if (nbytes < 2)
		return 0;

	bom = buf[0] << 8 ^ buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = buf[i] << 8 ^ buf[i + 1];
		else
			c = buf[i] ^ buf[i + 1] << 8;

		if (following) {
			/* A high surrogate must be followed by a low one. */
			if (c < 0xDC00 || c > 0xDFFF)
				return 0;
			/* Pair complete; go back to expecting any unit. */
			following = 0;
		} else if (c < 0xD800 || c > 0xDFFF) {
			/* Ordinary code unit: reject odd ASCII controls. */
			if (c < 128 && text_chars[c] != T)
				return 0;
		} else if (c > 0xDBFF) {
			/* Low surrogate without a preceding high one. */
			return 0;
		} else
			following = 1;
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X

/*
 * decode_utf8 --
 *	Decode a UTF-8 character from byte string to Unicode.
 *	Returns -1 if the first byte is not a UTF-8 leader.
 *
 *	Based on RFC 3629, but without error detection: the continuation
 *	bytes implied by the leader are read and combined unchecked, so
 *	the caller must supply a complete sequence.
 *
 * PUBLIC: int decode_utf8 __P((const char *));
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;

	if ((buf[0] & 0x80) == 0)		/* 0xxxxxxx */
		return buf[0];
	if ((buf[0] & 0x40) == 0)		/* 10xxxxxx is no leader */
		return -1;
	if ((buf[0] & 0x20) == 0)		/* 110xxxxx */
		return (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);
	if ((buf[0] & 0x10) == 0)		/* 1110xxxx */
		return (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6
		    ^ (buf[2] ^ 0x80);
	if ((buf[0] & 0x08) == 0)		/* 11110xxx */
		return (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
		    ^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);
	return -1;				/* 11111xxx */
}

/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode.
 *	Returns -1 if the first unsigned integer is invalid, that is,
 *	if it is a low surrogate.
 *
 *	A non-zero bigend selects big-endian byte order, zero selects
 *	little-endian.
 *
 *	No error detection on supplementary bytes.
 *
 * PUBLIC: int decode_utf16 __P((const char *, int));
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	if (bigend)
		w1 = buf[0] << 8 ^ buf[1];
	else
		w1 = buf[0] ^ buf[1] << 8;

	if (w1 < 0xD800 || w1 > 0xDFFF)		/* not a surrogate */
		return w1;
	if (w1 > 0xDBFF)			/* low surrogate first */
		return -1;

	/* High surrogate: combine with the following code unit. */
	if (bigend)
		w2 = buf[2] << 8 ^ buf[3];
	else
		w2 = buf[2] ^ buf[3] << 8;

	return ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
}