Lines Matching +full:iso +full:- +full:8 +full:x16
2 * Copyright (c) Ian F. Darwin 1986-1995.
4 * maintained 1995-present by Christos Zoulas and others.
29 * Encoding -- determine the character encoding of a text file.
31 * Joerg Wunsch <joerg@freebsd.org> wrote the original support for 8-bit
71 * the text converted into one-file_unichar_t-per-character Unicode in
79 const unsigned char *buf = CAST(const unsigned char *, b->fbuf); in file_encoding()
80 size_t nbytes = b->flen; in file_encoding()
96 if (nbytes > ms->encoding_max) in file_encoding()
97 nbytes = ms->encoding_max; in file_encoding()
107 DPRINTF(("utf-7 %" SIZE_T_FORMAT "u\n", *ulen)); in file_encoding()
108 *code = "Unicode text, UTF-7"; in file_encoding()
109 *code_mime = "utf-7"; in file_encoding()
113 *code_mime = "us-ascii"; in file_encoding()
117 *code = "Unicode text, UTF-8 (with BOM)"; in file_encoding()
118 *code_mime = "utf-8"; in file_encoding()
121 *code = "Unicode text, UTF-8"; in file_encoding()
122 *code_mime = "utf-8"; in file_encoding()
125 *code = "Unicode text, UTF-32, little-endian"; in file_encoding()
126 *code_mime = "utf-32le"; in file_encoding()
128 *code = "Unicode text, UTF-32, big-endian"; in file_encoding()
129 *code_mime = "utf-32be"; in file_encoding()
134 *code = "Unicode text, UTF-16, little-endian"; in file_encoding()
135 *code_mime = "utf-16le"; in file_encoding()
137 *code = "Unicode text, UTF-16, big-endian"; in file_encoding()
138 *code_mime = "utf-16be"; in file_encoding()
143 *code = "ISO-8859"; in file_encoding()
144 *code_mime = "iso-8859-1"; in file_encoding()
147 *code = "Non-ISO extended-ASCII"; in file_encoding()
148 *code_mime = "unknown-8bit"; in file_encoding()
214 * include, with hesitation, the X3.64/ECMA-43 control nextline (0x85),
215 * because that's what the dd EBCDIC->ASCII table maps the EBCDIC newline
221 * make a real mess on VT100-style displays if they're not paired properly,
224 * A file is considered to be ISO-8859 text if its characters are all
226 * from the ISO-8859 8-bit extension, characters 0xA0 ... 0xFF.
229 * character code if its characters are all either ISO-8859 (according to
231 * ISO-8859 considers to be control characters but the IBM PC and Macintosh
237 #define I 2 /* character appears in ISO-8859 text */
238 #define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
287 * Decide whether some text looks like UTF-8. Returns:
289 * -1: invalid UTF-8
291 * 1: 7-bit text
292 * 2: definitely UTF-8 text (valid high-bit set bytes)
294 * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
313 // first is information about the first byte in a UTF-8 sequence.
315 // 1 2 3 4 5 6 7 8 9 A B C D E F
316 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x00-0x0F
317 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x10-0x1F
318 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x20-0x2F
319 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x30-0x3F
320 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x40-0x4F
321 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x50-0x5F
322 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x60-0x6F
323 AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, AS, // 0x70-0x7F
324 // 1 2 3 4 5 6 7 8 9 A B C D E F
325 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x80-0x8F
326 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0x90-0x9F
327 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xA0-0xAF
328 XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xB0-0xBF
329 XX, XX, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xC0-0xCF
330 S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, S1, // 0xD0-0xDF
331 S2, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S3, S4, S3, S3, // 0xE0-0xEF
332 S5, S6, S6, S6, S7, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, XX, // 0xF0-0xFF
335 // acceptRange gives the range of valid values for the second byte in a UTF-8
364 * Even if the whole file is valid UTF-8 sequences, in file_looks_utf8()
374 return -1; in file_looks_utf8()
375 } else { /* 11xxxxxx begins UTF-8 */ in file_looks_utf8()
381 return -1; in file_looks_utf8()
399 return -1; in file_looks_utf8()
407 (buf[i] < ar->lo || buf[i] > ar->hi)) in file_looks_utf8()
408 return -1; in file_looks_utf8()
411 return -1; in file_looks_utf8()
426 * Decide whether some text looks like UTF-8 with BOM. If there is no
427 * BOM, return -1; otherwise return the result of looks_utf8 on the
435 return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); in looks_utf8_with_BOM()
437 return -1; in looks_utf8_with_BOM()
446 case '8': in looks_utf7()
454 return -1; in looks_utf7()
457 return -1; in looks_utf7()
490 bf[i + 1] | (CAST(file_unichar_t, bf[i]) << 8)); in looks_ucs16()
493 bf[i] | (CAST(file_unichar_t, bf[i + 1]) << 8)); in looks_ucs16()
509 uc = 0x10000 + 0x400 * (hi - 1) + (uc - 0xdc00); in looks_ucs16()
516 hi = uc - 0xd800 + 1; in looks_ucs16()
548 | (CAST(file_unichar_t, bf[i + 2]) << 8) in looks_ucs32()
553 | (CAST(file_unichar_t, bf[i + 1]) << 8) in looks_ucs32()
557 if (ubf[*ulen - 1] == 0xfffe) in looks_ucs32()
559 if (ubf[*ulen - 1] < 128 && in looks_ucs32()
560 text_chars[CAST(size_t, ubf[*ulen - 1])] != T) in looks_ucs32()
572 * This table maps each EBCDIC character to an (8-bit extended) ASCII
578 * Architecture/390: Principles of Operation_, SA22-7201-06, Seventh
579 * Edition, July, 1999, pp. I-1 - I-4.
582 * on most of the printing characters that also appear in (7-bit) ASCII.
590 * between old-style and internationalized examples of text.
595 16, 17, 18, 19, 157, 133, 8, 135, 24, 25, 146, 143, 28, 29, 30, 31,
600 '-', '/', 178, 179, 180, 181, 182, 183, 184, 185, 203, ',', '%', '_', '>', '?',
609 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 250, 251, 252, 253, 254, 255
614 * The following EBCDIC-to-ASCII table may relate more closely to reality,
620 * Unix-derived software on IBM's 390 systems) to the corresponding
621 * characters from ISO 8859-1.
631 0x90,0x91,0x16,0x93,0x94,0x95,0x96,0x04,0x98,0x99,0x9A,0x9B,0x14,0x15,0x9E,0x1A,
648 * Copy buf[0 ... nbytes-1] into out[], translating EBCDIC to ASCII.