xref: /freebsd/contrib/nvi/common/encoding.c (revision 0b3105a37d7adcadcb720112fed4dc4e8040be99)
1 /*-
2  * Copyright (c) 2011, 2012
3  *	Zhihao Yuan.  All rights reserved.
4  *
5  * See the LICENSE file for redistribution information.
6  */
7 
8 #ifndef lint
9 static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
10 #endif /* not lint */
11 
12 #include <sys/types.h>
13 
14 int looks_utf8(const char *, size_t);
15 int looks_utf16(const char *, size_t);
16 int decode_utf8(const char *);
17 int decode_utf16(const char *, int);
18 
/*
 * Byte classification used by the encoding detectors in this file.
 * Only the T class is tested by looks_utf8() and looks_utf16(); the
 * other classes are kept so the table documents every byte value.
 */
#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * text_chars --
 *  Class (F, T, I or X) of each of the 256 possible byte values,
 *  indexed by the byte value itself.  The table is never written,
 *  so it is declared const.
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};
45 
46 /*
47  * looks_utf8 --
48  *  Decide whether some text looks like UTF-8. Returns:
49  *
50  *     -1: invalid UTF-8
51  *      0: uses odd control characters, so doesn't look like text
52  *      1: 7-bit text
53  *      2: definitely UTF-8 text (valid high-bit set bytes)
54  *
55  *  Based on RFC 3629. UTF-8 with BOM is not accepted.
56  *
57  * PUBLIC: int looks_utf8(const char *, size_t);
58  */
59 int
60 looks_utf8(const char *ibuf, size_t nbytes)
61 {
62 	const u_char *buf = (u_char *)ibuf;
63 	size_t i;
64 	int n;
65 	int gotone = 0, ctrl = 0;
66 
67 	for (i = 0; i < nbytes; i++) {
68 		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
69 			/*
70 			 * Even if the whole file is valid UTF-8 sequences,
71 			 * still reject it if it uses weird control characters.
72 			 */
73 
74 			if (text_chars[buf[i]] != T)
75 				ctrl = 1;
76 		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
77 			return -1;
78 		} else {			   /* 11xxxxxx begins UTF-8 */
79 			int following;
80 
81 			if ((buf[i] & 0x20) == 0)	/* 110xxxxx */
82 				if (buf[i] > 0xC1)	/* C0, C1 */
83 					following = 1;
84 				else return -1;
85 			else if ((buf[i] & 0x10) == 0)	/* 1110xxxx */
86 				following = 2;
87 			else if ((buf[i] & 0x08) == 0)	/* 11110xxx */
88 				if (buf[i] < 0xF5)
89 					following = 3;
90 				else return -1;		/* F5, F6, F7 */
91 			else
92 				return -1;		/* F8~FF */
93 
94 			for (n = 0; n < following; n++) {
95 				i++;
96 				if (i >= nbytes)
97 					goto done;
98 
99 				if (buf[i] & 0x40)	/* 10xxxxxx */
100 					return -1;
101 			}
102 
103 			gotone = 1;
104 		}
105 	}
106 done:
107 	return ctrl ? 0 : (gotone ? 2 : 1);
108 }
109 
110 /*
111  * looks_utf16 --
112  *  Decide whether some text looks like UTF-16. Returns:
113  *
114  *      0: invalid UTF-16
115  *      1: Little-endian UTF-16
116  *      2: Big-endian UTF-16
117  *
118  * PUBLIC: int looks_utf16(const char *, size_t);
119  */
120 int
121 looks_utf16(const char *ibuf, size_t nbytes)
122 {
123 	const u_char *buf = (u_char *)ibuf;
124 	int bigend;
125 	size_t i;
126 	unsigned int c;
127 	int bom;
128 	int following = 0;
129 
130 	if (nbytes < 2)
131 		return 0;
132 
133 	bom = buf[0] << 8 ^ buf[1];
134 	if (bom == 0xFFFE)
135 		bigend = 0;
136 	else if (bom == 0xFEFF)
137 		bigend = 1;
138 	else
139 		return 0;
140 
141 	for (i = 2; i + 1 < nbytes; i += 2) {
142 		if (bigend)
143 			c = buf[i] << 8 ^ buf[i + 1];
144 		else
145 			c = buf[i] ^ buf[i + 1] << 8;
146 
147 		if (!following)
148 			if (c < 0xD800 || c > 0xDFFF)
149 				if (c < 128 && text_chars[c] != T)
150 					return 0;
151 				else
152 					following = 0;
153 			else if (c > 0xDBFF)
154 				return 0;
155 			else {
156 				following = 1;
157 				continue;
158 			}
159 		else if (c < 0xDC00 || c > 0xDFFF)
160 			return 0;
161 	}
162 
163 	return 1 + bigend;
164 }
165 
166 #undef F
167 #undef T
168 #undef I
169 #undef X
170 
/*
 * decode_utf8 --
 *  Decode one UTF-8 character from a byte string into its Unicode
 *  code point.
 *  Returns -1 if the first byte is not a UTF-8 leader, i.e. it is a
 *  continuation byte (10xxxxxx) or lies in F8..FF.
 *
 *  Based on RFC 3629, but without error detection: the number of
 *  trailing bytes read (one to three) is chosen from the first byte
 *  alone, and those bytes are combined without being validated.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	const unsigned char lead = p[0];

	if (lead < 0x80)		/* 0xxxxxxx: ASCII */
		return lead;

	if (lead < 0xC0)		/* 10xxxxxx: not a leader */
		return -1;

	if (lead < 0xE0)		/* 110xxxxx: two bytes */
		return ((lead ^ 0xC0) << 6) ^ (p[1] ^ 0x80);

	if (lead < 0xF0)		/* 1110xxxx: three bytes */
		return ((lead ^ 0xE0) << 12) ^ ((p[1] ^ 0x80) << 6) ^
		    (p[2] ^ 0x80);

	if (lead < 0xF8)		/* 11110xxx: four bytes */
		return ((lead ^ 0xF0) << 18) ^ ((p[1] ^ 0x80) << 12) ^
		    ((p[2] ^ 0x80) << 6) ^ (p[3] ^ 0x80);

	return -1;			/* 11111xxx: not a leader */
}
202 
/*
 * decode_utf16 --
 *  Decode one UTF-16 character from a byte string into its Unicode
 *  code point.  A non-zero bigend selects big-endian byte order,
 *  zero selects little-endian.
 *  Returns -1 if the first 16-bit unit is a low surrogate
 *  (DC00..DFFF), which cannot start a character.
 *
 *  When the first unit is a high surrogate the next two bytes are
 *  read and combined with it without being checked.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *p = (const unsigned char *)ibuf;
	/* Offsets of the high- and low-order byte inside one 16-bit unit. */
	const int hi = bigend ? 0 : 1;
	const int lo = 1 - hi;
	unsigned int lead, trail;

	lead = ((unsigned int)p[hi] << 8) ^ p[lo];

	/* Outside the surrogate range the unit is the code point itself. */
	if (lead < 0xD800 || lead > 0xDFFF)
		return (int)lead;

	/* A low surrogate cannot come first. */
	if (lead > 0xDBFF)
		return -1;

	/* High surrogate: fold in the following unit. */
	trail = ((unsigned int)p[2 + hi] << 8) ^ p[2 + lo];

	return (int)((((lead ^ 0xD800) << 10) ^ (trail ^ 0xDC00)) + 0x10000);
}
237