/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */

#ifndef lint
static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
#endif /* not lint */

#include <sys/types.h>

int looks_utf8(const char *, size_t);
int looks_utf16(const char *, size_t);
int decode_utf8(const char *);
int decode_utf16(const char *, int);

#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

/*
 * Per-byte classification table, indexed by byte value (0..255).
 * The detectors below only distinguish "T" from "not T".
 */
static const char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *	Decide whether some text looks like UTF-8.  Returns:
 *
 *		-1: invalid UTF-8
 *		 0: uses odd control characters, so doesn't look like text
 *		 1: 7-bit text
 *		 2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *	Based on RFC 3629.  UTF-8 with BOM is not accepted.
 *
 *	A multi-byte sequence cut off by the end of the buffer is not
 *	treated as an error; scanning simply stops there and the verdict
 *	is based on what was seen before it.
 *
 * PUBLIC: int looks_utf8(const char *, size_t);
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	size_t i;
	int n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		if ((buf[i] & 0x80) == 0) {	/* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */
			if (text_chars[buf[i]] != T)
				ctrl = 1;
		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
			return -1;
		} else {			/* 11xxxxxx begins UTF-8 */
			int following;

			if ((buf[i] & 0x20) == 0) {		/* 110xxxxx */
				if (buf[i] <= 0xC1)		/* C0, C1 */
					return -1;
				following = 1;
			} else if ((buf[i] & 0x10) == 0) {	/* 1110xxxx */
				following = 2;
			} else if ((buf[i] & 0x08) == 0) {	/* 11110xxx */
				if (buf[i] >= 0xF5)		/* F5, F6, F7 */
					return -1;
				following = 3;
			} else {
				return -1;			/* F8~FF */
			}

			for (n = 0; n < following; n++) {
				i++;
				if (i >= nbytes)
					goto done;

				/*
				 * A trail byte must be exactly 10xxxxxx.
				 * Testing bit 6 alone would let ASCII bytes
				 * such as ' ' or '1' pass as continuations.
				 */
				if ((buf[i] & 0xC0) != 0x80)
					return -1;
			}

			gotone = 1;
		}
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *	Decide whether some text looks like UTF-16.  Returns:
 *
 *		 0: invalid UTF-16
 *		 1: Little-endian UTF-16
 *		 2: Big-endian UTF-16
 *
 *	The buffer must start with a byte order mark.  Code units below
 *	128 must be plain-text characters, and surrogates must come in
 *	high/low pairs.  A trailing odd byte, or a high surrogate that is
 *	the last code unit in the buffer, is tolerated.
 *
 * PUBLIC: int looks_utf16(const char *, size_t);
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	int bigend;
	size_t i;
	unsigned int c;
	int bom;
	int following = 0;	/* 1 while a low surrogate is expected */

	if (nbytes < 2)
		return 0;

	/* Read the first two bytes as a big-endian 16-bit value. */
	bom = buf[0] << 8 ^ buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = buf[i] << 8 ^ buf[i + 1];
		else
			c = buf[i] ^ buf[i + 1] << 8;

		if (following) {
			/* Second half of a pair: must be a low surrogate. */
			if (c < 0xDC00 || c > 0xDFFF)
				return 0;
			/* Pair complete; go back to expecting any unit. */
			following = 0;
		} else if (c < 0xD800 || c > 0xDFFF) {
			/* Ordinary BMP code unit. */
			if (c < 128 && text_chars[c] != T)
				return 0;
		} else if (c > 0xDBFF) {
			/* Low surrogate with no preceding high surrogate. */
			return 0;
		} else {
			/* High surrogate: the next unit must complete it. */
			following = 1;
		}
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X

/*
 * decode_utf8 --
 *	Decode a UTF-8 character from byte string to Unicode.
 *	Returns -1 if the first byte is not a UTF-8 leader.
 *
 *	Based on RFC 3629, but without error detection: the trail bytes
 *	implied by the leader (1 to 3 of them) are read unconditionally
 *	and are not validated, so the caller must supply a complete
 *	sequence.
 *
 * PUBLIC: int decode_utf8(const char *);
 */
int
decode_utf8(const char *ibuf)
{
	const unsigned char *buf = (const unsigned char *)ibuf;

	if ((buf[0] & 0x80) == 0)		/* 0xxxxxxx: ASCII */
		return buf[0];
	if ((buf[0] & 0x40) == 0)		/* 10xxxxxx: trail byte */
		return -1;

	/*
	 * XOR strips the fixed marker bits from the leader (110, 1110,
	 * 11110) and from each trail byte (10) before the payload bits
	 * are shifted into place.
	 */
	if ((buf[0] & 0x20) == 0)		/* 110xxxxx */
		return (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);
	if ((buf[0] & 0x10) == 0)		/* 1110xxxx */
		return (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6
		    ^ (buf[2] ^ 0x80);
	if ((buf[0] & 0x08) == 0)		/* 11110xxx */
		return (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
		    ^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);

	return -1;				/* F8~FF */
}

/*
 * decode_utf16 --
 *	Decode a UTF-16 character from byte string to Unicode.
 *	Returns -1 if the first unsigned integer is invalid, i.e. a
 *	low surrogate (DC00~DFFF).
 *
 *	bigend selects the byte order: non-zero for big-endian, zero
 *	for little-endian.
 *
 *	No error detection on supplementary bytes: when the first code
 *	unit is a high surrogate, two more bytes are read and combined
 *	without checking that they form a low surrogate.
 *
 * PUBLIC: int decode_utf16(const char *, int);
 */
int
decode_utf16(const char *ibuf, int bigend)
{
	const unsigned char *buf = (const unsigned char *)ibuf;
	unsigned int w1, w2;

	if (bigend)
		w1 = buf[0] << 8 ^ buf[1];
	else
		w1 = buf[0] ^ buf[1] << 8;

	if (w1 < 0xD800 || w1 > 0xDFFF)		/* BMP code unit */
		return w1;
	if (w1 > 0xDBFF)			/* lone low surrogate */
		return -1;

	/* High surrogate: fetch the second code unit of the pair. */
	if (bigend)
		w2 = buf[2] << 8 ^ buf[3];
	else
		w2 = buf[2] ^ buf[3] << 8;

	/* Ten payload bits from each half, offset past the BMP. */
	return ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
}