/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */

#include <sys/types.h>

/*
 * Forward declarations of the encoding helpers defined in this file.
 * Buffers are passed as plain char pointers; the looks_* detectors take an
 * explicit byte count, the decode_* routines read from the start of the
 * buffer they are given.
 */
int looks_utf8(const char *ibuf, size_t nbytes);
int looks_utf16(const char *ibuf, size_t nbytes);
int decode_utf8(const char *ibuf);
int decode_utf16(const char *ibuf, int bigend);

/*
 * Byte classes used by text_chars[].  These one-letter macros are only
 * meant to be visible while the table and the two looks_* detectors are
 * being compiled; they are removed again with #undef right after them.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *	Classification of every possible byte value, indexed by the byte.
 *	The detectors below only ask whether an entry equals T.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *	Decide whether some text looks like UTF-8.  Returns:
 *
 *	-1: invalid UTF-8
 *	 0: uses odd control characters, so doesn't look like text
 *	 1: 7-bit text
 *	 2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *	Based on RFC 3629: besides the lead/trail byte structure, overlong
 *	forms, encoded surrogates (U+D800..U+DFFF) and values above U+10FFFF
 *	are rejected.  UTF-8 with BOM is not accepted.
 *
 *	A multi-byte sequence that is cut off by the end of the buffer is
 *	not an error; scanning simply stops there and the verdict is based
 *	on what was seen before it.
 *
 * PUBLIC: int looks_utf8(const char *, size_t);
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	size_t i;
	int n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		unsigned char lead = buf[i];
		/* Allowed range of the FIRST trail byte; narrowed below. */
		unsigned char lo = 0x80, hi = 0xBF;
		int following;

		if ((lead & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[lead] != T)
				ctrl = 1;
			continue;
		}

		/*
		 * 80..BF is a trail byte and can never start a sequence;
		 * C0 and C1 could only start overlong 2-byte forms.
		 */
		if (lead < 0xC2)
			return -1;

		if (lead < 0xE0) {		/* 110xxxxx */
			following = 1;
		} else if (lead < 0xF0) {	/* 1110xxxx */
			following = 2;
			if (lead == 0xE0)
				lo = 0xA0;	/* below: overlong */
			else if (lead == 0xED)
				hi = 0x9F;	/* above: surrogates */
		} else if (lead < 0xF5) {	/* 11110xxx, F0..F4 only */
			following = 3;
			if (lead == 0xF0)
				lo = 0x90;	/* below: overlong */
			else if (lead == 0xF4)
				hi = 0x8F;	/* above: > U+10FFFF */
		} else
			return -1;		/* F5~FF */

		for (n = 0; n < following; n++) {
			i++;
			if (i >= nbytes)
				goto done;

			if (buf[i] < lo || buf[i] > hi)
				return -1;

			/* Every later trail byte is a plain 10xxxxxx. */
			lo = 0x80;
			hi = 0xBF;
		}

		gotone = 1;
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *	Decide whether some text looks like UTF-16.  Returns:
 *
 *	 0: invalid UTF-16
 *	 1: Little-endian UTF-16
 *	 2: Big-endian UTF-16
 *
 *	The buffer must start with a byte order mark, which also selects the
 *	byte order.  Code units below 128 must be plain-text characters, and
 *	surrogates must come as a high unit immediately followed by a low
 *	unit.  A trailing odd byte is ignored, and a high surrogate that is
 *	the last complete unit of the buffer is not treated as an error.
 *
 * PUBLIC: int looks_utf16(const char *, size_t);
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int bom, c;
	size_t i;
	int bigend;
	int following = 0;	/* set while a low surrogate is expected */

	if (nbytes < 2)
		return 0;

	bom = (unsigned int)buf[0] << 8 | buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = (unsigned int)buf[i] << 8 | buf[i + 1];
		else
			c = (unsigned int)buf[i + 1] << 8 | buf[i];

		if (following) {
			if (c < 0xDC00 || c > 0xDFFF)
				return 0;
			/*
			 * The pair is complete.  Without this reset every
			 * unit after the first pair would have to be a low
			 * surrogate, rejecting perfectly valid text.
			 */
			following = 0;
			continue;
		}

		if (c >= 0xD800 && c <= 0xDBFF) {	/* high surrogate */
			following = 1;
			continue;
		}
		if (c >= 0xDC00 && c <= 0xDFFF)		/* stray low one */
			return 0;

		if (c < 128 && text_chars[c] != T)
			return 0;
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X

/*
 * decode_utf8 --
 *	Decode a UTF-8 character from byte string to Unicode.
 *	Returns -1 if the first byte is not a UTF-8 leader, or if one of
 *	the trail bytes announced by the leader is not of the form 10xxxxxx.
 *
 *	Trail bytes are examined one at a time and decoding stops at the
 *	first bad one, so a sequence cut short by a terminating NUL is never
 *	read past that NUL.
 *
 *	Based on RFC 3629, but with structural checks only: overlong forms,
 *	encoded surrogates and values above U+10FFFF are decoded as-is.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	int following, n, u;

	if ((buf[0] & 0x80) == 0)		/* 0xxxxxxx: ASCII */
		return buf[0];

	if ((buf[0] & 0xE0) == 0xC0) {		/* 110xxxxx */
		following = 1;
		u = buf[0] & 0x1F;
	} else if ((buf[0] & 0xF0) == 0xE0) {	/* 1110xxxx */
		following = 2;
		u = buf[0] & 0x0F;
	} else if ((buf[0] & 0xF8) == 0xF0) {	/* 11110xxx */
		following = 3;
		u = buf[0] & 0x07;
	} else					/* 10xxxxxx or F8~FF */
		return -1;

	for (n = 1; n <= following; n++) {
		if ((buf[n] & 0xC0) != 0x80)
			return -1;
		/* Each trail byte contributes its low six bits. */
		u = u << 6 | (buf[n] & 0x3F);
	}

	return u;
}

/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode.
 *	A non-zero bigend selects big-endian code units, zero selects
 *	little-endian.
 *
 *	Returns -1 if the first code unit is a low surrogate, or if it is a
 *	high surrogate that is not followed by a low surrogate.  Two bytes
 *	are always read; two more are read only after a high surrogate.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	w1 = bigend ? ((unsigned int)buf[0] << 8 | buf[1])
		    : ((unsigned int)buf[1] << 8 | buf[0]);

	if (w1 < 0xD800 || w1 > 0xDFFF)		/* ordinary BMP character */
		return (int)w1;
	if (w1 > 0xDBFF)			/* starts with a low surrogate */
		return -1;

	w2 = bigend ? ((unsigned int)buf[2] << 8 | buf[3])
		    : ((unsigned int)buf[3] << 8 | buf[2]);

	/* Refuse to build a code point out of an unpaired high surrogate. */
	if (w2 < 0xDC00 || w2 > 0xDFFF)
		return -1;

	/* Ten payload bits from each half, offset past the BMP. */
	return (int)(((w1 & 0x3FF) << 10 | (w2 & 0x3FF)) + 0x10000);
}
233