encoding.c - OpenGrok cross reference for /freebsd/contrib/file/src/encoding.c

Lines Matching +full:iso +full:- +full:8 +full:x16
2  * Copyright (c) Ian F. Darwin 1986-1995.
4  * maintained 1995-present by Christos Zoulas and others.
29  * Encoding -- determine the character encoding of a text file.
31  * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
71  * the text converted into one-file_unichar_t-per-character Unicode in
79 	const unsigned char *buf = CAST(const unsigned char *, b->fbuf);  in file_encoding()
80 	size_t nbytes = b->flen;  in file_encoding()
96 	if (nbytes > ms->encoding_max)  in file_encoding()
97 		nbytes = ms->encoding_max;  in file_encoding()
107 			DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen));  in file_encoding()
108 			*code = "Unicode text, UTF-7";  in file_encoding()
109 			*code_mime = "utf-7";  in file_encoding()
113 			*code_mime = "us-ascii";  in file_encoding()
117 		*code = "Unicode text, UTF-8 (with BOM)";  in file_encoding()
118 		*code_mime = "utf-8";  in file_encoding()
121 		*code = "Unicode text, UTF-8";  in file_encoding()
122 		*code_mime = "utf-8";  in file_encoding()
125 			*code = "Unicode text, UTF-32, little-endian";  in file_encoding()
126 			*code_mime = "utf-32le";  in file_encoding()
128 			*code = "Unicode text, UTF-32, big-endian";  in file_encoding()
129 			*code_mime = "utf-32be";  in file_encoding()
134 			*code = "Unicode text, UTF-16, little-endian";  in file_encoding()
135 			*code_mime = "utf-16le";  in file_encoding()
137 			*code = "Unicode text, UTF-16, big-endian";  in file_encoding()
138 			*code_mime = "utf-16be";  in file_encoding()
143 		*code = "ISO-8859";  in file_encoding()
144 		*code_mime = "iso-8859-1";  in file_encoding()
147 		*code = "Non-ISO extended-ASCII";  in file_encoding()
148 		*code_mime = "unknown-8bit";  in file_encoding()
214  * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
215  * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
221  * make a real mess on VT100-style displays if they're not paired properly,
224  * A file is considered to be ISO-8859 text if its characters are all
226  * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
229  * character code if its characters are all either ISO-8859 (according to
231  * ISO-8859 considers to be control characters but the IBM PC and Macintosh
237 #define I 2   /* character appears in ISO-8859 text */
238 #define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
287  * Decide whether some text looks like UTF-8. Returns:
289  *     -1: invalid UTF-8
291  *      1: 7-bit text
292  *      2: definitely UTF-8 text (valid high-bit set bytes)
294  * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
313 // first is information about the first byte in a UTF-8 sequence.
315     //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
316     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
317     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
318     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
319     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
320     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
321     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
322     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
323     AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
324     //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
325     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
326     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
327     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
328     XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
329     XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
330     S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
331     S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
332     S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
335 // acceptRange gives the range of valid values for the second byte in a UTF-8
364 			 * Even if the whole file is valid UTF-8 sequences,  in file_looks_utf8()
374 			return -1;  in file_looks_utf8()
375 		} else {			   /* 11xxxxxx begins UTF-8 */  in file_looks_utf8()
381 				return -1;  in file_looks_utf8()
399 				return -1;  in file_looks_utf8()
407 				     (buf[i] < ar->lo || buf[i] > ar->hi))  in file_looks_utf8()
408 					return -1;  in file_looks_utf8()
411 					return -1;  in file_looks_utf8()
426  * Decide whether some text looks like UTF-8 with BOM. If there is no
427  * BOM, return -1; otherwise return the result of looks_utf8 on the
435 		return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);  in looks_utf8_with_BOM()
437 		return -1;  in looks_utf8_with_BOM()
446 		case '8':  in looks_utf7()
454 			return -1;  in looks_utf7()
457 		return -1;  in looks_utf7()
490 			    bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8));  in looks_ucs16()
493 			    bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8));  in looks_ucs16()
509 			uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00);  in looks_ucs16()
516 			hi = uc - 0xd800 + 1;  in looks_ucs16()
548 			    | (CAST(file_unichar_t, bf[i + 2]) << 8)  in looks_ucs32()
553 			    | (CAST(file_unichar_t, bf[i + 1]) << 8)   in looks_ucs32()
557 		if (ubf[*ulen - 1] == 0xfffe)  in looks_ucs32()
559 		if (ubf[*ulen - 1] < 128 &&  in looks_ucs32()
560 		    text_chars[CAST(size_t, ubf[*ulen - 1])] != T)  in looks_ucs32()
572  * This table maps each EBCDIC character to an (8-bit extended) ASCII
578  * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
579  * Edition, July, 1999, pp. I-1 - I-4.
582  * on most of the printing characters that also appear in (7-bit) ASCII.
590  * between old-style and internationalized examples of text.
595  16,  17,  18,  19, 157, 133,   8, 135,  24,  25, 146, 143,  28,  29,  30,  31,
600 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
609 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
614  * The following EBCDIC-to-ASCII table may relate more closely to reality,
620  * Unix-derived software on IBM's 390 systems) to the corresponding
621  * characters from ISO 8859-1.
631 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
648  * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.