/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */
7
8 #include <sys/types.h>
9
/*
 * Prototypes for the public entry points defined in this file.
 * All of them take raw bytes through a `const char *' and return an int
 * status or code point.
 */
int looks_utf8(const char *ibuf, size_t nbytes);
int looks_utf16(const char *ibuf, size_t nbytes);
int decode_utf8(const char *ibuf);
int decode_utf16(const char *ibuf, int bigend);
14
/*
 * Per-byte classification used by the looks_*() detectors.  The one-letter
 * macros exist only to keep the table below readable.
 */
#define F 0	/* character never appears in text */
#define T 1	/* character appears in plain ASCII text */
#define I 2	/* character appears in ISO-8859 text */
#define X 3	/* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/* Indexed by byte value (0..255); read-only lookup table. */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,	/* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,	/* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,	/* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,	/* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,	/* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,	/* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,	/* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,	/* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,	/* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,	/* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,	/* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,	/* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,	/* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,	/* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,	/* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I	/* 0xfX */
};
41
42 /*
43 * looks_utf8 --
44 * Decide whether some text looks like UTF-8. Returns:
45 *
46 * -1: invalid UTF-8
47 * 0: uses odd control characters, so doesn't look like text
48 * 1: 7-bit text
49 * 2: definitely UTF-8 text (valid high-bit set bytes)
50 *
51 * Based on RFC 3629. UTF-8 with BOM is not accepted.
52 *
53 * PUBLIC: int looks_utf8(const char *, size_t);
54 */
55 int
looks_utf8(const char * ibuf,size_t nbytes)56 looks_utf8(const char *ibuf, size_t nbytes)
57 {
58 const u_char *buf = (u_char *)ibuf;
59 size_t i;
60 int n;
61 int gotone = 0, ctrl = 0;
62
63 for (i = 0; i < nbytes; i++) {
64 if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
65 /*
66 * Even if the whole file is valid UTF-8 sequences,
67 * still reject it if it uses weird control characters.
68 */
69
70 if (text_chars[buf[i]] != T)
71 ctrl = 1;
72 } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
73 return -1;
74 } else { /* 11xxxxxx begins UTF-8 */
75 int following;
76
77 if ((buf[i] & 0x20) == 0) /* 110xxxxx */
78 if (buf[i] > 0xC1) /* C0, C1 */
79 following = 1;
80 else return -1;
81 else if ((buf[i] & 0x10) == 0) /* 1110xxxx */
82 following = 2;
83 else if ((buf[i] & 0x08) == 0) /* 11110xxx */
84 if (buf[i] < 0xF5)
85 following = 3;
86 else return -1; /* F5, F6, F7 */
87 else
88 return -1; /* F8~FF */
89
90 for (n = 0; n < following; n++) {
91 i++;
92 if (i >= nbytes)
93 goto done;
94
95 if ((buf[i] & 0xc0) != 0x80) /* 10xxxxxx */
96 return -1;
97 }
98
99 gotone = 1;
100 }
101 }
102 done:
103 return ctrl ? 0 : (gotone ? 2 : 1);
104 }
105
106 /*
107 * looks_utf16 --
108 * Decide whether some text looks like UTF-16. Returns:
109 *
110 * 0: invalid UTF-16
111 * 1: Little-endian UTF-16
112 * 2: Big-endian UTF-16
113 *
114 * PUBLIC: int looks_utf16(const char *, size_t);
115 */
116 int
looks_utf16(const char * ibuf,size_t nbytes)117 looks_utf16(const char *ibuf, size_t nbytes)
118 {
119 const u_char *buf = (u_char *)ibuf;
120 int bigend;
121 size_t i;
122 unsigned int c;
123 int bom;
124 int following = 0;
125
126 if (nbytes < 2)
127 return 0;
128
129 bom = buf[0] << 8 ^ buf[1];
130 if (bom == 0xFFFE)
131 bigend = 0;
132 else if (bom == 0xFEFF)
133 bigend = 1;
134 else
135 return 0;
136
137 for (i = 2; i + 1 < nbytes; i += 2) {
138 if (bigend)
139 c = buf[i] << 8 ^ buf[i + 1];
140 else
141 c = buf[i] ^ buf[i + 1] << 8;
142
143 if (!following)
144 if (c < 0xD800 || c > 0xDFFF)
145 if (c < 128 && text_chars[c] != T)
146 return 0;
147 else
148 following = 0;
149 else if (c > 0xDBFF)
150 return 0;
151 else {
152 following = 1;
153 continue;
154 }
155 else if (c < 0xDC00 || c > 0xDFFF)
156 return 0;
157 }
158
159 return 1 + bigend;
160 }
161
/*
 * The one-letter classification macros are only meant for the text
 * detection code above; drop them so they cannot leak into what follows.
 */
#undef F
#undef T
#undef I
#undef X
166
/*
 * decode_utf8 --
 *	Decode a UTF-8 character from byte string to Unicode.
 *	Returns -1 if the first byte is not a UTF-8 leader, i.e. it is a
 *	continuation byte (10xxxxxx) or has the form 11111xxx.
 *
 *	Based on RFC 3629, but without error detection: the continuation
 *	bytes are neither checked nor bounded, so the caller must make sure
 *	that all bytes announced by the leader (up to 4 in total) are
 *	readable.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;

	if ((buf[0] & 0x80) == 0)		/* 0xxxxxxx */
		return buf[0];

	if ((buf[0] & 0xE0) == 0xC0)		/* 110xxxxx 10xxxxxx */
		return (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);

	if ((buf[0] & 0xF0) == 0xE0)		/* 1110xxxx + 2 bytes */
		return (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6
		    ^ (buf[2] ^ 0x80);

	if ((buf[0] & 0xF8) == 0xF0)		/* 11110xxx + 3 bytes */
		return (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
		    ^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);

	/* 10xxxxxx or 11111xxx: not a leader. */
	return -1;
}
198
/*
 * utf16_word --
 *	Assemble one 16-bit code unit from two bytes in the given order.
 */
static unsigned int
utf16_word(const unsigned char *p, int bigend)
{
	return bigend ?
	    ((unsigned int)p[0] << 8 | p[1]) :
	    ((unsigned int)p[1] << 8 | p[0]);
}

/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode; bigend
 *	selects big-endian (non-zero) or little-endian (zero) code units.
 *	Returns -1 if the first code unit is a low surrogate
 *	(0xDC00-0xDFFF), which cannot start a character.
 *
 *	No error detection on supplementary bytes: when the first unit is
 *	a high surrogate the next two bytes are read and combined without
 *	being checked, so the caller must make sure they are readable.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	w1 = utf16_word(buf, bigend);

	if (w1 < 0xD800 || w1 > 0xDFFF)		/* single-unit character */
		return (int)w1;

	if (w1 > 0xDBFF)			/* stray low surrogate */
		return -1;

	/* High surrogate: combine it with the following unit. */
	w2 = utf16_word(buf + 2, bigend);
	return (int)(((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000);
}
233