xref: /freebsd/contrib/nvi/common/encoding.c (revision c271fa9295c13b3cc926562c9b204fa597dba7e6)
1f0957ccaSPeter Wemm /*-
2f0957ccaSPeter Wemm  * Copyright (c) 2011, 2012
3f0957ccaSPeter Wemm  *	Zhihao Yuan.  All rights reserved.
4f0957ccaSPeter Wemm  *
5f0957ccaSPeter Wemm  * See the LICENSE file for redistribution information.
6f0957ccaSPeter Wemm  */
7f0957ccaSPeter Wemm 
8f0957ccaSPeter Wemm #ifndef lint
9f0957ccaSPeter Wemm static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
10f0957ccaSPeter Wemm #endif /* not lint */
11f0957ccaSPeter Wemm 
12f0957ccaSPeter Wemm #include <sys/types.h>
13f0957ccaSPeter Wemm 
14*c271fa92SBaptiste Daroussin int looks_utf8(const char *, size_t);
15*c271fa92SBaptiste Daroussin int looks_utf16(const char *, size_t);
16*c271fa92SBaptiste Daroussin int decode_utf8(const char *);
17*c271fa92SBaptiste Daroussin int decode_utf16(const char *, int);
18f0957ccaSPeter Wemm 
/*
 * Character classes stored in text_chars[].  They are plain macros (not an
 * enum) because they are #undef'd again once the detection routines that
 * need them have been defined.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *  Classification of every possible byte value, indexed by the byte itself.
 *  The table is read-only lookup data, hence const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
45f0957ccaSPeter Wemm 
46f0957ccaSPeter Wemm /*
47f0957ccaSPeter Wemm  * looks_utf8 --
48f0957ccaSPeter Wemm  *  Decide whether some text looks like UTF-8. Returns:
49f0957ccaSPeter Wemm  *
50f0957ccaSPeter Wemm  *     -1: invalid UTF-8
51f0957ccaSPeter Wemm  *      0: uses odd control characters, so doesn't look like text
52f0957ccaSPeter Wemm  *      1: 7-bit text
53f0957ccaSPeter Wemm  *      2: definitely UTF-8 text (valid high-bit set bytes)
54f0957ccaSPeter Wemm  *
55f0957ccaSPeter Wemm  *  Based on RFC 3629. UTF-8 with BOM is not accepted.
56f0957ccaSPeter Wemm  *
57*c271fa92SBaptiste Daroussin  * PUBLIC: int looks_utf8(const char *, size_t);
58f0957ccaSPeter Wemm  */
59f0957ccaSPeter Wemm int
60f0957ccaSPeter Wemm looks_utf8(const char *ibuf, size_t nbytes)
61f0957ccaSPeter Wemm {
62f0957ccaSPeter Wemm 	const u_char *buf = (u_char *)ibuf;
63f0957ccaSPeter Wemm 	size_t i;
64f0957ccaSPeter Wemm 	int n;
65f0957ccaSPeter Wemm 	int gotone = 0, ctrl = 0;
66f0957ccaSPeter Wemm 
67f0957ccaSPeter Wemm 	for (i = 0; i < nbytes; i++) {
68f0957ccaSPeter Wemm 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
69f0957ccaSPeter Wemm 			/*
70f0957ccaSPeter Wemm 			 * Even if the whole file is valid UTF-8 sequences,
71f0957ccaSPeter Wemm 			 * still reject it if it uses weird control characters.
72f0957ccaSPeter Wemm 			 */
73f0957ccaSPeter Wemm 
74f0957ccaSPeter Wemm 			if (text_chars[buf[i]] != T)
75f0957ccaSPeter Wemm 				ctrl = 1;
76f0957ccaSPeter Wemm 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
77f0957ccaSPeter Wemm 			return -1;
78f0957ccaSPeter Wemm 		} else {			   /* 11xxxxxx begins UTF-8 */
79f0957ccaSPeter Wemm 			int following;
80f0957ccaSPeter Wemm 
81f0957ccaSPeter Wemm 			if ((buf[i] & 0x20) == 0)	/* 110xxxxx */
82f0957ccaSPeter Wemm 				if (buf[i] > 0xC1)	/* C0, C1 */
83f0957ccaSPeter Wemm 					following = 1;
84f0957ccaSPeter Wemm 				else return -1;
85f0957ccaSPeter Wemm 			else if ((buf[i] & 0x10) == 0)	/* 1110xxxx */
86f0957ccaSPeter Wemm 				following = 2;
87f0957ccaSPeter Wemm 			else if ((buf[i] & 0x08) == 0)	/* 11110xxx */
88f0957ccaSPeter Wemm 				if (buf[i] < 0xF5)
89f0957ccaSPeter Wemm 					following = 3;
90f0957ccaSPeter Wemm 				else return -1;		/* F5, F6, F7 */
91f0957ccaSPeter Wemm 			else
92f0957ccaSPeter Wemm 				return -1;		/* F8~FF */
93f0957ccaSPeter Wemm 
94f0957ccaSPeter Wemm 			for (n = 0; n < following; n++) {
95f0957ccaSPeter Wemm 				i++;
96f0957ccaSPeter Wemm 				if (i >= nbytes)
97f0957ccaSPeter Wemm 					goto done;
98f0957ccaSPeter Wemm 
99f0957ccaSPeter Wemm 				if (buf[i] & 0x40)	/* 10xxxxxx */
100f0957ccaSPeter Wemm 					return -1;
101f0957ccaSPeter Wemm 			}
102f0957ccaSPeter Wemm 
103f0957ccaSPeter Wemm 			gotone = 1;
104f0957ccaSPeter Wemm 		}
105f0957ccaSPeter Wemm 	}
106f0957ccaSPeter Wemm done:
107f0957ccaSPeter Wemm 	return ctrl ? 0 : (gotone ? 2 : 1);
108f0957ccaSPeter Wemm }
109f0957ccaSPeter Wemm 
110f0957ccaSPeter Wemm /*
111f0957ccaSPeter Wemm  * looks_utf16 --
112f0957ccaSPeter Wemm  *  Decide whether some text looks like UTF-16. Returns:
113f0957ccaSPeter Wemm  *
114f0957ccaSPeter Wemm  *      0: invalid UTF-16
115f0957ccaSPeter Wemm  *      1: Little-endian UTF-16
116f0957ccaSPeter Wemm  *      2: Big-endian UTF-16
117f0957ccaSPeter Wemm  *
118*c271fa92SBaptiste Daroussin  * PUBLIC: int looks_utf16(const char *, size_t);
119f0957ccaSPeter Wemm  */
120f0957ccaSPeter Wemm int
121f0957ccaSPeter Wemm looks_utf16(const char *ibuf, size_t nbytes)
122f0957ccaSPeter Wemm {
123f0957ccaSPeter Wemm 	const u_char *buf = (u_char *)ibuf;
124f0957ccaSPeter Wemm 	int bigend;
125f0957ccaSPeter Wemm 	size_t i;
126f0957ccaSPeter Wemm 	unsigned int c;
127f0957ccaSPeter Wemm 	int bom;
128f0957ccaSPeter Wemm 	int following = 0;
129f0957ccaSPeter Wemm 
130f0957ccaSPeter Wemm 	if (nbytes < 2)
131f0957ccaSPeter Wemm 		return 0;
132f0957ccaSPeter Wemm 
133f0957ccaSPeter Wemm 	bom = buf[0] << 8 ^ buf[1];
134f0957ccaSPeter Wemm 	if (bom == 0xFFFE)
135f0957ccaSPeter Wemm 		bigend = 0;
136f0957ccaSPeter Wemm 	else if (bom == 0xFEFF)
137f0957ccaSPeter Wemm 		bigend = 1;
138f0957ccaSPeter Wemm 	else
139f0957ccaSPeter Wemm 		return 0;
140f0957ccaSPeter Wemm 
141f0957ccaSPeter Wemm 	for (i = 2; i + 1 < nbytes; i += 2) {
142f0957ccaSPeter Wemm 		if (bigend)
143f0957ccaSPeter Wemm 			c = buf[i] << 8 ^ buf[i + 1];
144f0957ccaSPeter Wemm 		else
145f0957ccaSPeter Wemm 			c = buf[i] ^ buf[i + 1] << 8;
146f0957ccaSPeter Wemm 
147f0957ccaSPeter Wemm 		if (!following)
148f0957ccaSPeter Wemm 			if (c < 0xD800 || c > 0xDFFF)
149f0957ccaSPeter Wemm 				if (c < 128 && text_chars[c] != T)
150f0957ccaSPeter Wemm 					return 0;
151f0957ccaSPeter Wemm 				else
152f0957ccaSPeter Wemm 					following = 0;
153f0957ccaSPeter Wemm 			else if (c > 0xDBFF)
154f0957ccaSPeter Wemm 				return 0;
155f0957ccaSPeter Wemm 			else {
156f0957ccaSPeter Wemm 				following = 1;
157f0957ccaSPeter Wemm 				continue;
158f0957ccaSPeter Wemm 			}
159f0957ccaSPeter Wemm 		else if (c < 0xDC00 || c > 0xDFFF)
160f0957ccaSPeter Wemm 			return 0;
161f0957ccaSPeter Wemm 	}
162f0957ccaSPeter Wemm 
163f0957ccaSPeter Wemm 	return 1 + bigend;
164f0957ccaSPeter Wemm }
165f0957ccaSPeter Wemm 
166f0957ccaSPeter Wemm #undef F
167f0957ccaSPeter Wemm #undef T
168f0957ccaSPeter Wemm #undef I
169f0957ccaSPeter Wemm #undef X
170f0957ccaSPeter Wemm 
/*
 * decode_utf8 --
 *  Decode the UTF-8 character starting at ibuf into a Unicode code point.
 *  Returns -1 if the first byte is not a UTF-8 leader, i.e. it is a
 *  continuation byte (10xxxxxx) or has the form 11111xxx.
 *
 *  Based on RFC 3629, but without error detection: the one to three bytes
 *  following a multi-byte leader are read unconditionally and are not
 *  checked for being continuation bytes.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const int lead = p[0];

	if (lead < 0x80)		/* 0xxxxxxx: ASCII, stands for itself */
		return lead;

	if (lead < 0xC0)		/* 10xxxxxx: continuation, not a leader */
		return -1;

	if (lead < 0xE0)		/* 110xxxxx: two-byte sequence */
		return (lead ^ 0xC0) << 6 ^ (p[1] ^ 0x80);

	if (lead < 0xF0)		/* 1110xxxx: three-byte sequence */
		return (lead ^ 0xE0) << 12 ^ (p[1] ^ 0x80) << 6 ^
		    (p[2] ^ 0x80);

	if (lead < 0xF8)		/* 11110xxx: four-byte sequence */
		return (lead ^ 0xF0) << 18 ^ (p[1] ^ 0x80) << 12 ^
		    (p[2] ^ 0x80) << 6 ^ (p[3] ^ 0x80);

	return -1;			/* 11111xxx: never a leader */
}
202f0957ccaSPeter Wemm 
/*
 * decode_utf16 --
 *  Decode the UTF-16 character starting at ibuf into a Unicode code point.
 *  bigend selects the byte order: non-zero for big-endian, zero for
 *  little-endian.  Returns -1 if the first 16-bit unit is a low surrogate
 *  (DC00..DFFF), which cannot start a character.
 *
 *  No error detection on supplementary bytes: after a high surrogate the
 *  next two bytes are read unconditionally and are not checked for being
 *  a low surrogate.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char* ibuf, int bigend)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	/* Offsets of the most and least significant byte inside one unit. */
	const int msb = bigend ? 0 : 1;
	const int lsb = 1 - msb;
	unsigned int lead, trail;

	lead = (unsigned int)p[msb] << 8 ^ p[lsb];

	/* Anything outside the surrogate range is the code point itself. */
	if (lead < 0xD800 || lead > 0xDFFF)
		return (int)lead;

	/* A low surrogate may not come first. */
	if (lead > 0xDBFF)
		return -1;

	/* High surrogate: combine it with the unit that follows. */
	trail = (unsigned int)p[2 + msb] << 8 ^ p[2 + lsb];

	return (int)(((lead ^ 0xD800) << 10 ^ (trail ^ 0xDC00)) + 0x10000);
}
237