xref: /freebsd/contrib/nvi/common/encoding.c (revision c66ec88fed842fbaad62c30d510644ceb7bd2d71)
1 /*-
2  * Copyright (c) 2011, 2012
3  *	Zhihao Yuan.  All rights reserved.
4  *
5  * See the LICENSE file for redistribution information.
6  */
7 
8 #include <sys/types.h>
9 
/* Prototypes for the externally visible routines defined in this file. */
int looks_utf8(const char *, size_t);
int looks_utf16(const char *, size_t);
int decode_utf8(const char *);
int decode_utf16(const char *, int);
14 
/*
 * Classification codes stored in text_chars[].  The single-letter names
 * exist only to keep the table below readable.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *  Maps every possible byte value to one of the classes above.  The table
 *  is read-only: it is indexed by the encoding detectors and never written.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
41 
42 /*
43  * looks_utf8 --
44  *  Decide whether some text looks like UTF-8. Returns:
45  *
46  *     -1: invalid UTF-8
47  *      0: uses odd control characters, so doesn't look like text
48  *      1: 7-bit text
49  *      2: definitely UTF-8 text (valid high-bit set bytes)
50  *
51  *  Based on RFC 3629. UTF-8 with BOM is not accepted.
52  *
53  * PUBLIC: int looks_utf8(const char *, size_t);
54  */
55 int
56 looks_utf8(const char *ibuf, size_t nbytes)
57 {
58 	const u_char *buf = (u_char *)ibuf;
59 	size_t i;
60 	int n;
61 	int gotone = 0, ctrl = 0;
62 
63 	for (i = 0; i < nbytes; i++) {
64 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
65 			/*
66 			 * Even if the whole file is valid UTF-8 sequences,
67 			 * still reject it if it uses weird control characters.
68 			 */
69 
70 			if (text_chars[buf[i]] != T)
71 				ctrl = 1;
72 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
73 			return -1;
74 		} else {			   /* 11xxxxxx begins UTF-8 */
75 			int following;
76 
77 			if ((buf[i] & 0x20) == 0)	/* 110xxxxx */
78 				if (buf[i] > 0xC1)	/* C0, C1 */
79 					following = 1;
80 				else return -1;
81 			else if ((buf[i] & 0x10) == 0)	/* 1110xxxx */
82 				following = 2;
83 			else if ((buf[i] & 0x08) == 0)	/* 11110xxx */
84 				if (buf[i] < 0xF5)
85 					following = 3;
86 				else return -1;		/* F5, F6, F7 */
87 			else
88 				return -1;		/* F8~FF */
89 
90 			for (n = 0; n < following; n++) {
91 				i++;
92 				if (i >= nbytes)
93 					goto done;
94 
95 				if ((buf[i] & 0xc0) != 0x80)	/* 10xxxxxx */
96 					return -1;
97 			}
98 
99 			gotone = 1;
100 		}
101 	}
102 done:
103 	return ctrl ? 0 : (gotone ? 2 : 1);
104 }
105 
106 /*
107  * looks_utf16 --
108  *  Decide whether some text looks like UTF-16. Returns:
109  *
110  *      0: invalid UTF-16
111  *      1: Little-endian UTF-16
112  *      2: Big-endian UTF-16
113  *
114  * PUBLIC: int looks_utf16(const char *, size_t);
115  */
116 int
117 looks_utf16(const char *ibuf, size_t nbytes)
118 {
119 	const u_char *buf = (u_char *)ibuf;
120 	int bigend;
121 	size_t i;
122 	unsigned int c;
123 	int bom;
124 	int following = 0;
125 
126 	if (nbytes < 2)
127 		return 0;
128 
129 	bom = buf[0] << 8 ^ buf[1];
130 	if (bom == 0xFFFE)
131 		bigend = 0;
132 	else if (bom == 0xFEFF)
133 		bigend = 1;
134 	else
135 		return 0;
136 
137 	for (i = 2; i + 1 < nbytes; i += 2) {
138 		if (bigend)
139 			c = buf[i] << 8 ^ buf[i + 1];
140 		else
141 			c = buf[i] ^ buf[i + 1] << 8;
142 
143 		if (!following)
144 			if (c < 0xD800 || c > 0xDFFF)
145 				if (c < 128 && text_chars[c] != T)
146 					return 0;
147 				else
148 					following = 0;
149 			else if (c > 0xDBFF)
150 				return 0;
151 			else {
152 				following = 1;
153 				continue;
154 			}
155 		else if (c < 0xDC00 || c > 0xDFFF)
156 			return 0;
157 	}
158 
159 	return 1 + bigend;
160 }
161 
/*
 * The classification macros are needed only by text_chars[] and the
 * looks_* routines above; remove the single-letter names so they are not
 * visible to the code that follows.
 */
#undef F
#undef T
#undef I
#undef X
166 
/*
 * decode_utf8 --
 *  Convert the UTF-8 sequence that starts at ibuf into a Unicode value.
 *
 *  The first byte alone determines how many bytes are consumed (one to
 *  four).  Returns -1 if the first byte is not a UTF-8 leader, i.e. it is
 *  a continuation byte (10xxxxxx) or has the form 11111xxx.
 *
 *  Based on RFC 3629, but without error detection: the trailing bytes are
 *  read and folded in unchecked, so the caller must make sure they exist.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const unsigned int lead = p[0];
	int trail;	/* number of bytes after the leader */
	int k;
	int u;

	if (lead < 0x80)		/* 0xxxxxxx: ASCII */
		return (int)lead;
	if (lead < 0xC0)		/* 10xxxxxx: not a leader */
		return -1;

	if (lead < 0xE0) {		/* 110xxxxx */
		u = (int)(lead & 0x1F);
		trail = 1;
	} else if (lead < 0xF0) {	/* 1110xxxx */
		u = (int)(lead & 0x0F);
		trail = 2;
	} else if (lead < 0xF8) {	/* 11110xxx */
		u = (int)(lead & 0x07);
		trail = 3;
	} else {			/* 11111xxx: not a leader */
		return -1;
	}

	/* Strip the 10 marker from each trail byte and append its bits. */
	for (k = 1; k <= trail; k++)
		u = (u << 6) ^ (p[k] ^ 0x80);

	return u;
}
198 
/*
 * decode_utf16 --
 *  Convert the UTF-16 sequence that starts at ibuf into a Unicode value.
 *  A non-zero bigend selects big-endian byte order, zero little-endian.
 *
 *  One 16-bit unit is consumed for a value outside the surrogate range,
 *  two when the first unit is a high surrogate.  Returns -1 if the first
 *  unit is a low surrogate (DC00..DFFF).
 *
 *  No error detection on supplementary bytes: the second unit is combined
 *  without checking that it really is a low surrogate.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	/* Offsets of the high- and low-order byte inside one unit. */
	const int hi = bigend ? 0 : 1;
	const int lo = 1 - hi;
	unsigned int lead, trail;

	lead = (unsigned int)p[hi] << 8 | p[lo];

	/* Anything outside the surrogate range stands for itself. */
	if (lead < 0xD800 || lead > 0xDFFF)
		return (int)lead;

	/* A low surrogate cannot come first. */
	if (lead > 0xDBFF)
		return -1;

	trail = (unsigned int)p[2 + hi] << 8 | p[2 + lo];

	return (int)((((lead ^ 0xD800) << 10) ^ (trail ^ 0xDC00)) + 0x10000);
}
233