/* xref: /freebsd/contrib/nvi/common/encoding.c (revision 110d525ec6188f3c9dc4f54c4bc1cced2f7184cd) */
/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */

#include <sys/types.h>

/* Forward declarations of the entry points defined in this file. */
int looks_utf8(const char *, size_t);
int looks_utf16(const char *, size_t);
int decode_utf8(const char *);
int decode_utf16(const char *, int);

/*
 * Byte classes used by text_chars[].  The macros are private to the
 * detection routines and are #undef'd once those routines are defined.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/* Classification of every byte value; indexed by the unsigned byte. */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *  Decide whether the first nbytes bytes of ibuf look like UTF-8.
 *  Returns:
 *
 *     -1: invalid UTF-8
 *      0: uses odd control characters, so doesn't look like text
 *      1: 7-bit text
 *      2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *  Validation follows the well-formedness rules of RFC 3629: overlong
 *  encodings, encoded surrogates (U+D800..U+DFFF) and code points above
 *  U+10FFFF are rejected.  A multi-byte sequence that is cut off by the
 *  end of the buffer is tolerated and simply ends the scan.  A leading
 *  byte-order mark gets no special treatment: EF BB BF is handled like
 *  any other three-byte sequence.
 *
 * PUBLIC: int looks_utf8(const char *, size_t);
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned char lead, lo, hi;
	size_t i;
	int following, n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		lead = buf[i];

		if ((lead & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[lead] != T)
				ctrl = 1;
			continue;
		}

		/*
		 * 80..BF are continuation bytes and can never start a
		 * sequence; C0 and C1 could only start overlong two-byte
		 * forms; F5..FF would encode values beyond U+10FFFF or are
		 * not UTF-8 lead bytes at all.
		 */
		if (lead < 0xC2 || lead > 0xF4)
			return -1;

		if (lead < 0xE0)		/* 110xxxxx */
			following = 1;
		else if (lead < 0xF0)		/* 1110xxxx */
			following = 2;
		else				/* 11110xxx */
			following = 3;

		/*
		 * Four lead bytes narrow the legal range of the byte that
		 * follows them; every other continuation byte is 80..BF.
		 */
		lo = 0x80;
		hi = 0xBF;
		switch (lead) {
		case 0xE0:			/* no overlong 3-byte forms */
			lo = 0xA0;
			break;
		case 0xED:			/* no UTF-16 surrogates */
			hi = 0x9F;
			break;
		case 0xF0:			/* no overlong 4-byte forms */
			lo = 0x90;
			break;
		case 0xF4:			/* nothing above U+10FFFF */
			hi = 0x8F;
			break;
		default:
			break;
		}

		for (n = 0; n < following; n++) {
			i++;
			if (i >= nbytes)	/* sequence truncated by buffer end */
				goto done;

			if (buf[i] < lo || buf[i] > hi)
				return -1;

			/* Only the first continuation byte is restricted. */
			lo = 0x80;
			hi = 0xBF;
		}

		gotone = 1;
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *  Decide whether the first nbytes bytes of ibuf look like UTF-16.
 *  The buffer must start with a byte-order mark, which also selects the
 *  byte order used for the rest of the scan.  Returns:
 *
 *      0: invalid UTF-16 (no BOM, an odd control character, or a
 *         malformed surrogate pair)
 *      1: Little-endian UTF-16
 *      2: Big-endian UTF-16
 *
 *  A trailing odd byte and a high surrogate cut off by the end of the
 *  buffer are ignored.
 *
 * PUBLIC: int looks_utf16(const char *, size_t);
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int bom, c;
	size_t i;
	int bigend;
	int following = 0;	/* set while a low surrogate is expected */

	if (nbytes < 2)
		return 0;

	bom = (unsigned int)buf[0] << 8 | buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = (unsigned int)buf[i] << 8 | buf[i + 1];
		else
			c = (unsigned int)buf[i + 1] << 8 | buf[i];

		if (following) {
			/* A high surrogate must be followed by a low one. */
			if (c < 0xDC00 || c > 0xDFFF)
				return 0;
			/* Pair complete: go back to expecting any unit. */
			following = 0;
			continue;
		}

		if (c >= 0xD800 && c <= 0xDBFF) {	/* high surrogate */
			following = 1;
			continue;
		}
		if (c >= 0xDC00 && c <= 0xDFFF)		/* stray low surrogate */
			return 0;

		/* ASCII-range units must be ordinary text characters. */
		if (c < 128 && text_chars[c] != T)
			return 0;
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X

/*
 * decode_utf8 --
 *  Decode one UTF-8 character from a byte string to its Unicode code
 *  point.  Returns -1 if the first byte is not a UTF-8 leader, i.e. it
 *  is a continuation byte (10xxxxxx) or has the form 11111xxx.
 *
 *  Based on RFC 3629, but without error detection: the continuation
 *  bytes are neither validated nor bounds-checked, so the caller must
 *  make sure that as many bytes as the leader announces (up to four in
 *  total) are readable.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;

	if ((buf[0] & 0x80) == 0)		/* 0xxxxxxx: ASCII */
		return buf[0];

	if ((buf[0] & 0x40) == 0)		/* 10xxxxxx: not a leader */
		return -1;

	/*
	 * XOR with the expected tag strips the marker bits from each byte,
	 * leaving only its payload bits to be shifted into place.
	 */
	if ((buf[0] & 0x20) == 0)		/* 110xxxxx: 2 bytes */
		return (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);

	if ((buf[0] & 0x10) == 0)		/* 1110xxxx: 3 bytes */
		return (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6
		     ^ (buf[2] ^ 0x80);

	if ((buf[0] & 0x08) == 0)		/* 11110xxx: 4 bytes */
		return (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
		     ^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);

	return -1;				/* 11111xxx: not a leader */
}

/*
 * decode_utf16 --
 *  Decode one UTF-16 character from a byte string to its Unicode code
 *  point.  bigend selects the byte order: non-zero for big-endian,
 *  zero for little-endian.  Returns -1 if the first 16-bit unit is
 *  invalid, i.e. a low surrogate (0xDC00..0xDFFF).
 *
 *  No error detection on supplementary bytes: when the first unit is a
 *  high surrogate the next two bytes are read and combined without
 *  checking that they form a low surrogate, so the caller must make
 *  sure four bytes are readable in that case.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	if (bigend)
		w1 = buf[0] << 8 ^ buf[1];
	else
		w1 = buf[0] ^ buf[1] << 8;

	/* Anything outside the surrogate range is the code point itself. */
	if (w1 < 0xD800 || w1 > 0xDFFF)
		return w1;

	/* A low surrogate cannot start a character. */
	if (w1 > 0xDBFF)
		return -1;

	if (bigend)
		w2 = buf[2] << 8 ^ buf[3];
	else
		w2 = buf[2] ^ buf[3] << 8;

	/* Ten payload bits from each half, offset past the BMP. */
	return ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
}