/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms version 1.0
 * of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2010 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * The functions in this file convert from the standard multibyte forms
 * to the wide character forms used internally by libc.  Unfortunately,
 * this approach means that we need a method for each and every encoding.
20*6b5e5868SGarrett D'Amore */ 21*6b5e5868SGarrett D'Amore 22*6b5e5868SGarrett D'Amore #include <stdlib.h> 23*6b5e5868SGarrett D'Amore #include <wchar.h> 24*6b5e5868SGarrett D'Amore #include <string.h> 25*6b5e5868SGarrett D'Amore #include <sys/types.h> 26*6b5e5868SGarrett D'Amore #include "localedef.h" 27*6b5e5868SGarrett D'Amore 28*6b5e5868SGarrett D'Amore static int towide_none(wchar_t *, const char *, int); 29*6b5e5868SGarrett D'Amore static int towide_utf8(wchar_t *, const char *, int); 30*6b5e5868SGarrett D'Amore static int towide_big5(wchar_t *, const char *, int); 31*6b5e5868SGarrett D'Amore static int towide_gbk(wchar_t *, const char *, int); 32*6b5e5868SGarrett D'Amore static int towide_gb2312(wchar_t *, const char *, int); 33*6b5e5868SGarrett D'Amore static int towide_gb18030(wchar_t *, const char *, int); 34*6b5e5868SGarrett D'Amore static int towide_mskanji(wchar_t *, const char *, int); 35*6b5e5868SGarrett D'Amore static int towide_euccn(wchar_t *, const char *, int); 36*6b5e5868SGarrett D'Amore static int towide_eucjp(wchar_t *, const char *, int); 37*6b5e5868SGarrett D'Amore static int towide_euckr(wchar_t *, const char *, int); 38*6b5e5868SGarrett D'Amore static int towide_euctw(wchar_t *, const char *, int); 39*6b5e5868SGarrett D'Amore 40*6b5e5868SGarrett D'Amore static int tomb_none(char *, wchar_t); 41*6b5e5868SGarrett D'Amore static int tomb_utf8(char *, wchar_t); 42*6b5e5868SGarrett D'Amore static int tomb_mbs(char *, wchar_t); 43*6b5e5868SGarrett D'Amore 44*6b5e5868SGarrett D'Amore static int (*_towide)(wchar_t *, const char *, int) = towide_none; 45*6b5e5868SGarrett D'Amore static int (*_tomb)(char *, wchar_t) = tomb_none; 46*6b5e5868SGarrett D'Amore static const char *_encoding = "NONE"; 47*6b5e5868SGarrett D'Amore 48*6b5e5868SGarrett D'Amore /* 49*6b5e5868SGarrett D'Amore * Table of supported encodings. We only bother to list the multibyte 50*6b5e5868SGarrett D'Amore * encodings here, because single byte locales are handed by "NONE". 
51*6b5e5868SGarrett D'Amore */ 52*6b5e5868SGarrett D'Amore static struct { 53*6b5e5868SGarrett D'Amore const char *name; 54*6b5e5868SGarrett D'Amore /* the name that the underlying libc implemenation uses */ 55*6b5e5868SGarrett D'Amore const char *cname; 56*6b5e5868SGarrett D'Amore int (*towide)(wchar_t *, const char *, int); 57*6b5e5868SGarrett D'Amore int (*tomb)(char *, wchar_t); 58*6b5e5868SGarrett D'Amore } mb_encodings[] = { 59*6b5e5868SGarrett D'Amore { "UTF-8", "UTF-8", towide_utf8, tomb_utf8 }, 60*6b5e5868SGarrett D'Amore { "UTF8", "UTF-8", towide_utf8, tomb_utf8 }, 61*6b5e5868SGarrett D'Amore { "utf8", "UTF-8", towide_utf8, tomb_utf8 }, 62*6b5e5868SGarrett D'Amore { "utf-8", "UTF-8", towide_utf8, tomb_utf8 }, 63*6b5e5868SGarrett D'Amore 64*6b5e5868SGarrett D'Amore { "EUC-CN", "EUC-CN", towide_euccn, tomb_mbs }, 65*6b5e5868SGarrett D'Amore { "eucCN", "EUC-CN", towide_euccn, tomb_mbs }, 66*6b5e5868SGarrett D'Amore 67*6b5e5868SGarrett D'Amore { "EUC-JP", "EUC-JP", towide_eucjp, tomb_mbs }, 68*6b5e5868SGarrett D'Amore { "eucJP", "EUC-JP", towide_eucjp, tomb_mbs }, 69*6b5e5868SGarrett D'Amore 70*6b5e5868SGarrett D'Amore { "EUC-KR", "EUC-KR", towide_euckr, tomb_mbs }, 71*6b5e5868SGarrett D'Amore { "eucKR", "EUC-KR", towide_euckr, tomb_mbs }, 72*6b5e5868SGarrett D'Amore 73*6b5e5868SGarrett D'Amore { "EUC-TW", "EUC-TW", towide_euctw, tomb_mbs }, 74*6b5e5868SGarrett D'Amore { "eucTW", "EUC-TW", towide_euctw, tomb_mbs }, 75*6b5e5868SGarrett D'Amore 76*6b5e5868SGarrett D'Amore { "MS_Kanji", "MSKanji", towide_mskanji, tomb_mbs }, 77*6b5e5868SGarrett D'Amore { "MSKanji", "MSKanji", towide_mskanji, tomb_mbs }, 78*6b5e5868SGarrett D'Amore { "PCK", "MSKanji", towide_mskanji, tomb_mbs }, 79*6b5e5868SGarrett D'Amore { "SJIS", "MSKanji", towide_mskanji, tomb_mbs }, 80*6b5e5868SGarrett D'Amore { "Shift_JIS", "MSKanji", towide_mskanji, tomb_mbs }, 81*6b5e5868SGarrett D'Amore 82*6b5e5868SGarrett D'Amore { "BIG5", "BIG5", towide_big5, tomb_mbs }, 83*6b5e5868SGarrett D'Amore { 
"big5", "BIG5", towide_big5, tomb_mbs }, 84*6b5e5868SGarrett D'Amore { "Big5", "BIG5", towide_big5, tomb_mbs }, 85*6b5e5868SGarrett D'Amore 86*6b5e5868SGarrett D'Amore { "GBK", "GBK", towide_gbk, tomb_mbs }, 87*6b5e5868SGarrett D'Amore 88*6b5e5868SGarrett D'Amore { "GB18030", "GB18030", towide_gb18030, tomb_mbs }, 89*6b5e5868SGarrett D'Amore 90*6b5e5868SGarrett D'Amore { "GB2312", "GB2312", towide_gb2312, tomb_mbs }, 91*6b5e5868SGarrett D'Amore 92*6b5e5868SGarrett D'Amore { "ASCII", "ASCII", towide_none, tomb_none }, 93*6b5e5868SGarrett D'Amore { "US-ASCII", "ASCII", towide_none, tomb_none }, 94*6b5e5868SGarrett D'Amore { "646", "ASCII", towide_none, tomb_none }, 95*6b5e5868SGarrett D'Amore 96*6b5e5868SGarrett D'Amore { NULL, NULL }, 97*6b5e5868SGarrett D'Amore }; 98*6b5e5868SGarrett D'Amore 99*6b5e5868SGarrett D'Amore static char * 100*6b5e5868SGarrett D'Amore show_mb(const char *mb) 101*6b5e5868SGarrett D'Amore { 102*6b5e5868SGarrett D'Amore static char buf[64]; 103*6b5e5868SGarrett D'Amore 104*6b5e5868SGarrett D'Amore /* ASCII stuff we just print */ 105*6b5e5868SGarrett D'Amore if (isascii(*mb) && isgraph(*mb)) { 106*6b5e5868SGarrett D'Amore buf[0] = *mb; 107*6b5e5868SGarrett D'Amore buf[1] = 0; 108*6b5e5868SGarrett D'Amore return (buf); 109*6b5e5868SGarrett D'Amore } 110*6b5e5868SGarrett D'Amore buf[0] = 0; 111*6b5e5868SGarrett D'Amore while (*mb != 0) { 112*6b5e5868SGarrett D'Amore char scr[8]; 113*6b5e5868SGarrett D'Amore (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb); 114*6b5e5868SGarrett D'Amore (void) strlcat(buf, scr, sizeof (buf)); 115*6b5e5868SGarrett D'Amore mb++; 116*6b5e5868SGarrett D'Amore } 117*6b5e5868SGarrett D'Amore return (buf); 118*6b5e5868SGarrett D'Amore } 119*6b5e5868SGarrett D'Amore 120*6b5e5868SGarrett D'Amore static char *widemsg; 121*6b5e5868SGarrett D'Amore 122*6b5e5868SGarrett D'Amore void 123*6b5e5868SGarrett D'Amore werr(const char *fmt, ...) 
124*6b5e5868SGarrett D'Amore { 125*6b5e5868SGarrett D'Amore char *msg; 126*6b5e5868SGarrett D'Amore 127*6b5e5868SGarrett D'Amore va_list va; 128*6b5e5868SGarrett D'Amore va_start(va, fmt); 129*6b5e5868SGarrett D'Amore (void) vasprintf(&msg, fmt, va); 130*6b5e5868SGarrett D'Amore va_end(va); 131*6b5e5868SGarrett D'Amore 132*6b5e5868SGarrett D'Amore free(widemsg); 133*6b5e5868SGarrett D'Amore widemsg = msg; 134*6b5e5868SGarrett D'Amore } 135*6b5e5868SGarrett D'Amore 136*6b5e5868SGarrett D'Amore /* 137*6b5e5868SGarrett D'Amore * This is used for 8-bit encodings. 138*6b5e5868SGarrett D'Amore */ 139*6b5e5868SGarrett D'Amore int 140*6b5e5868SGarrett D'Amore towide_none(wchar_t *c, const char *mb, int n) 141*6b5e5868SGarrett D'Amore { 142*6b5e5868SGarrett D'Amore if (mb_cur_max != 1) { 143*6b5e5868SGarrett D'Amore werr("invalid or unsupported multibyte locale"); 144*6b5e5868SGarrett D'Amore return (-1); 145*6b5e5868SGarrett D'Amore } 146*6b5e5868SGarrett D'Amore if (n < 1) { 147*6b5e5868SGarrett D'Amore werr("no character data"); 148*6b5e5868SGarrett D'Amore return (-1); 149*6b5e5868SGarrett D'Amore } 150*6b5e5868SGarrett D'Amore *c = (uint8_t)*mb; 151*6b5e5868SGarrett D'Amore return (1); 152*6b5e5868SGarrett D'Amore } 153*6b5e5868SGarrett D'Amore 154*6b5e5868SGarrett D'Amore int 155*6b5e5868SGarrett D'Amore tomb_none(char *mb, wchar_t wc) 156*6b5e5868SGarrett D'Amore { 157*6b5e5868SGarrett D'Amore if (mb_cur_max != 1) { 158*6b5e5868SGarrett D'Amore werr("invalid or unsupported multibyte locale"); 159*6b5e5868SGarrett D'Amore return (-1); 160*6b5e5868SGarrett D'Amore } 161*6b5e5868SGarrett D'Amore *(uint8_t *)mb = (wc & 0xff); 162*6b5e5868SGarrett D'Amore mb[1] = 0; 163*6b5e5868SGarrett D'Amore return (1); 164*6b5e5868SGarrett D'Amore } 165*6b5e5868SGarrett D'Amore 166*6b5e5868SGarrett D'Amore /* 167*6b5e5868SGarrett D'Amore * UTF-8 stores wide characters in UTF-32 form. 
168*6b5e5868SGarrett D'Amore */ 169*6b5e5868SGarrett D'Amore int 170*6b5e5868SGarrett D'Amore towide_utf8(wchar_t *wc, const char *mb, int n) 171*6b5e5868SGarrett D'Amore { 172*6b5e5868SGarrett D'Amore wchar_t c; 173*6b5e5868SGarrett D'Amore int nb; 174*6b5e5868SGarrett D'Amore int lv; /* lowest legal value */ 175*6b5e5868SGarrett D'Amore int i; 176*6b5e5868SGarrett D'Amore const uint8_t *s = (const uint8_t *)mb; 177*6b5e5868SGarrett D'Amore 178*6b5e5868SGarrett D'Amore if (n < 1) { 179*6b5e5868SGarrett D'Amore werr("no utf8 data"); 180*6b5e5868SGarrett D'Amore return (-1); 181*6b5e5868SGarrett D'Amore } 182*6b5e5868SGarrett D'Amore c = *s; 183*6b5e5868SGarrett D'Amore 184*6b5e5868SGarrett D'Amore if ((c & 0x80) == 0) { 185*6b5e5868SGarrett D'Amore /* 7-bit ASCII */ 186*6b5e5868SGarrett D'Amore *wc = c; 187*6b5e5868SGarrett D'Amore return (1); 188*6b5e5868SGarrett D'Amore } else if ((c & 0xe0) == 0xc0) { 189*6b5e5868SGarrett D'Amore /* u80-u7ff - two bytes encoded */ 190*6b5e5868SGarrett D'Amore nb = 2; 191*6b5e5868SGarrett D'Amore lv = 0x80; 192*6b5e5868SGarrett D'Amore c &= ~0xe0; 193*6b5e5868SGarrett D'Amore } else if ((c & 0xf0) == 0xe0) { 194*6b5e5868SGarrett D'Amore /* u800-uffff - three bytes encoded */ 195*6b5e5868SGarrett D'Amore nb = 3; 196*6b5e5868SGarrett D'Amore lv = 0x800; 197*6b5e5868SGarrett D'Amore c &= ~0xf0; 198*6b5e5868SGarrett D'Amore } else if ((c & 0xf8) == 0xf0) { 199*6b5e5868SGarrett D'Amore /* u1000-u1fffff - four bytes encoded */ 200*6b5e5868SGarrett D'Amore nb = 4; 201*6b5e5868SGarrett D'Amore lv = 0x1000; 202*6b5e5868SGarrett D'Amore c &= ~0xf8; 203*6b5e5868SGarrett D'Amore } else { 204*6b5e5868SGarrett D'Amore /* 5 and 6 byte encodings are not legal unicode */ 205*6b5e5868SGarrett D'Amore werr("utf8 encoding too large (%s)", show_mb(mb)); 206*6b5e5868SGarrett D'Amore return (-1); 207*6b5e5868SGarrett D'Amore } 208*6b5e5868SGarrett D'Amore if (nb > n) { 209*6b5e5868SGarrett D'Amore werr("incomplete utf8 sequence (%s)", show_mb(mb)); 
210*6b5e5868SGarrett D'Amore return (-1); 211*6b5e5868SGarrett D'Amore } 212*6b5e5868SGarrett D'Amore 213*6b5e5868SGarrett D'Amore for (i = 1; i < nb; i++) { 214*6b5e5868SGarrett D'Amore if (((s[i]) & 0xc0) != 0x80) { 215*6b5e5868SGarrett D'Amore werr("illegal utf8 byte (%x)", s[i]); 216*6b5e5868SGarrett D'Amore return (-1); 217*6b5e5868SGarrett D'Amore } 218*6b5e5868SGarrett D'Amore c <<= 6; 219*6b5e5868SGarrett D'Amore c |= (s[i] & 0x3f); 220*6b5e5868SGarrett D'Amore } 221*6b5e5868SGarrett D'Amore 222*6b5e5868SGarrett D'Amore if (c < lv) { 223*6b5e5868SGarrett D'Amore werr("illegal redundant utf8 encoding (%s)", show_mb(mb)); 224*6b5e5868SGarrett D'Amore return (-1); 225*6b5e5868SGarrett D'Amore } 226*6b5e5868SGarrett D'Amore *wc = c; 227*6b5e5868SGarrett D'Amore return (nb); 228*6b5e5868SGarrett D'Amore } 229*6b5e5868SGarrett D'Amore 230*6b5e5868SGarrett D'Amore int 231*6b5e5868SGarrett D'Amore tomb_utf8(char *mb, wchar_t wc) 232*6b5e5868SGarrett D'Amore { 233*6b5e5868SGarrett D'Amore uint8_t *s = (uint8_t *)mb; 234*6b5e5868SGarrett D'Amore uint8_t msk; 235*6b5e5868SGarrett D'Amore int cnt; 236*6b5e5868SGarrett D'Amore int i; 237*6b5e5868SGarrett D'Amore 238*6b5e5868SGarrett D'Amore if (wc <= 0x7f) { 239*6b5e5868SGarrett D'Amore s[0] = wc & 0x7f; 240*6b5e5868SGarrett D'Amore s[1] = 0; 241*6b5e5868SGarrett D'Amore return (1); 242*6b5e5868SGarrett D'Amore } 243*6b5e5868SGarrett D'Amore if (wc <= 0x7ff) { 244*6b5e5868SGarrett D'Amore cnt = 2; 245*6b5e5868SGarrett D'Amore msk = 0xc0; 246*6b5e5868SGarrett D'Amore } else if (wc <= 0xffff) { 247*6b5e5868SGarrett D'Amore cnt = 3; 248*6b5e5868SGarrett D'Amore msk = 0xe0; 249*6b5e5868SGarrett D'Amore } else if (wc <= 0x1fffff) { 250*6b5e5868SGarrett D'Amore cnt = 4; 251*6b5e5868SGarrett D'Amore msk = 0xf0; 252*6b5e5868SGarrett D'Amore } else { 253*6b5e5868SGarrett D'Amore werr("illegal uf8 char (%x)", wc); 254*6b5e5868SGarrett D'Amore return (-1); 255*6b5e5868SGarrett D'Amore } 256*6b5e5868SGarrett D'Amore for (i = cnt - 
1; i; i--) { 257*6b5e5868SGarrett D'Amore s[i] = (wc & 0x3f) | 0x80; 258*6b5e5868SGarrett D'Amore wc >>= 6; 259*6b5e5868SGarrett D'Amore } 260*6b5e5868SGarrett D'Amore s[0] = (msk) | wc; 261*6b5e5868SGarrett D'Amore s[cnt] = 0; 262*6b5e5868SGarrett D'Amore return (cnt); 263*6b5e5868SGarrett D'Amore } 264*6b5e5868SGarrett D'Amore 265*6b5e5868SGarrett D'Amore /* 266*6b5e5868SGarrett D'Amore * Several encodings share a simplistic dual byte encoding. In these 267*6b5e5868SGarrett D'Amore * forms, they all indicate that a two byte sequence is to be used if 268*6b5e5868SGarrett D'Amore * the first byte has its high bit set. They all store this simple 269*6b5e5868SGarrett D'Amore * encoding as a 16-bit value, although a great many of the possible 270*6b5e5868SGarrett D'Amore * code points are not used in most character sets. This gives a possible 271*6b5e5868SGarrett D'Amore * set of just over 32,000 valid code points. 272*6b5e5868SGarrett D'Amore * 273*6b5e5868SGarrett D'Amore * 0x00 - 0x7f - 1 byte encoding 274*6b5e5868SGarrett D'Amore * 0x80 - 0x7fff - illegal 275*6b5e5868SGarrett D'Amore * 0x8000 - 0xffff - 2 byte encoding 276*6b5e5868SGarrett D'Amore */ 277*6b5e5868SGarrett D'Amore static int 278*6b5e5868SGarrett D'Amore towide_dbcs(wchar_t *wc, const char *mb, int n) 279*6b5e5868SGarrett D'Amore { 280*6b5e5868SGarrett D'Amore wchar_t c; 281*6b5e5868SGarrett D'Amore 282*6b5e5868SGarrett D'Amore c = *(uint8_t *)mb; 283*6b5e5868SGarrett D'Amore 284*6b5e5868SGarrett D'Amore if (n < 1) { 285*6b5e5868SGarrett D'Amore werr("no character data"); 286*6b5e5868SGarrett D'Amore return (-1); 287*6b5e5868SGarrett D'Amore } 288*6b5e5868SGarrett D'Amore if ((c & 0x80) == 0) { 289*6b5e5868SGarrett D'Amore /* 7-bit */ 290*6b5e5868SGarrett D'Amore *wc = c; 291*6b5e5868SGarrett D'Amore return (1); 292*6b5e5868SGarrett D'Amore } 293*6b5e5868SGarrett D'Amore if (n < 2) { 294*6b5e5868SGarrett D'Amore werr("incomplete character sequence (%s)", show_mb(mb)); 295*6b5e5868SGarrett D'Amore 
return (-1); 296*6b5e5868SGarrett D'Amore } 297*6b5e5868SGarrett D'Amore 298*6b5e5868SGarrett D'Amore /* Store both bytes as a single 16-bit wide. */ 299*6b5e5868SGarrett D'Amore c <<= 8; 300*6b5e5868SGarrett D'Amore c |= (uint8_t)(mb[1]); 301*6b5e5868SGarrett D'Amore *wc = c; 302*6b5e5868SGarrett D'Amore return (2); 303*6b5e5868SGarrett D'Amore } 304*6b5e5868SGarrett D'Amore 305*6b5e5868SGarrett D'Amore /* 306*6b5e5868SGarrett D'Amore * Most multibyte locales just convert the wide character to the multibyte 307*6b5e5868SGarrett D'Amore * form by stripping leading null bytes, and writing the 32-bit quantity 308*6b5e5868SGarrett D'Amore * in big-endian order. 309*6b5e5868SGarrett D'Amore */ 310*6b5e5868SGarrett D'Amore int 311*6b5e5868SGarrett D'Amore tomb_mbs(char *mb, wchar_t wc) 312*6b5e5868SGarrett D'Amore { 313*6b5e5868SGarrett D'Amore uint8_t *s = (uint8_t *)mb; 314*6b5e5868SGarrett D'Amore int n = 0, c; 315*6b5e5868SGarrett D'Amore 316*6b5e5868SGarrett D'Amore if ((wc & 0xff000000U) != 0) { 317*6b5e5868SGarrett D'Amore n = 4; 318*6b5e5868SGarrett D'Amore } else if ((wc & 0x00ff0000U) != 0) { 319*6b5e5868SGarrett D'Amore n = 3; 320*6b5e5868SGarrett D'Amore } else if ((wc & 0x0000ff00U) != 0) { 321*6b5e5868SGarrett D'Amore n = 2; 322*6b5e5868SGarrett D'Amore } else { 323*6b5e5868SGarrett D'Amore n = 1; 324*6b5e5868SGarrett D'Amore } 325*6b5e5868SGarrett D'Amore c = n; 326*6b5e5868SGarrett D'Amore while (n) { 327*6b5e5868SGarrett D'Amore n--; 328*6b5e5868SGarrett D'Amore s[n] = wc & 0xff; 329*6b5e5868SGarrett D'Amore wc >>= 8; 330*6b5e5868SGarrett D'Amore } 331*6b5e5868SGarrett D'Amore /* ensure null termination */ 332*6b5e5868SGarrett D'Amore s[c] = 0; 333*6b5e5868SGarrett D'Amore return (c); 334*6b5e5868SGarrett D'Amore } 335*6b5e5868SGarrett D'Amore 336*6b5e5868SGarrett D'Amore 337*6b5e5868SGarrett D'Amore /* 338*6b5e5868SGarrett D'Amore * big5 is a simple dual byte character set. 
339*6b5e5868SGarrett D'Amore */ 340*6b5e5868SGarrett D'Amore int 341*6b5e5868SGarrett D'Amore towide_big5(wchar_t *wc, const char *mb, int n) 342*6b5e5868SGarrett D'Amore { 343*6b5e5868SGarrett D'Amore return (towide_dbcs(wc, mb, n)); 344*6b5e5868SGarrett D'Amore } 345*6b5e5868SGarrett D'Amore 346*6b5e5868SGarrett D'Amore /* 347*6b5e5868SGarrett D'Amore * GBK encodes wides in the same way that big5 does, the high order 348*6b5e5868SGarrett D'Amore * bit of the first byte indicates a double byte character. 349*6b5e5868SGarrett D'Amore */ 350*6b5e5868SGarrett D'Amore int 351*6b5e5868SGarrett D'Amore towide_gbk(wchar_t *wc, const char *mb, int n) 352*6b5e5868SGarrett D'Amore { 353*6b5e5868SGarrett D'Amore return (towide_dbcs(wc, mb, n)); 354*6b5e5868SGarrett D'Amore } 355*6b5e5868SGarrett D'Amore 356*6b5e5868SGarrett D'Amore /* 357*6b5e5868SGarrett D'Amore * GB2312 is another DBCS. Its cleaner than others in that the second 358*6b5e5868SGarrett D'Amore * byte does not encode ASCII, but it supports characters. 359*6b5e5868SGarrett D'Amore */ 360*6b5e5868SGarrett D'Amore int 361*6b5e5868SGarrett D'Amore towide_gb2312(wchar_t *wc, const char *mb, int n) 362*6b5e5868SGarrett D'Amore { 363*6b5e5868SGarrett D'Amore return (towide_dbcs(wc, mb, n)); 364*6b5e5868SGarrett D'Amore } 365*6b5e5868SGarrett D'Amore 366*6b5e5868SGarrett D'Amore /* 367*6b5e5868SGarrett D'Amore * GB18030. This encodes as 8, 16, or 32-bits. 368*6b5e5868SGarrett D'Amore * 7-bit values are in 1 byte, 4 byte sequences are used when 369*6b5e5868SGarrett D'Amore * the second byte encodes 0x30-39 and all other sequences are 2 bytes. 
370*6b5e5868SGarrett D'Amore */ 371*6b5e5868SGarrett D'Amore int 372*6b5e5868SGarrett D'Amore towide_gb18030(wchar_t *wc, const char *mb, int n) 373*6b5e5868SGarrett D'Amore { 374*6b5e5868SGarrett D'Amore wchar_t c; 375*6b5e5868SGarrett D'Amore 376*6b5e5868SGarrett D'Amore c = *(uint8_t *)mb; 377*6b5e5868SGarrett D'Amore 378*6b5e5868SGarrett D'Amore if (n < 1) { 379*6b5e5868SGarrett D'Amore werr("no character data"); 380*6b5e5868SGarrett D'Amore return (-1); 381*6b5e5868SGarrett D'Amore } 382*6b5e5868SGarrett D'Amore if ((c & 0x80) == 0) { 383*6b5e5868SGarrett D'Amore /* 7-bit */ 384*6b5e5868SGarrett D'Amore *wc = c; 385*6b5e5868SGarrett D'Amore return (1); 386*6b5e5868SGarrett D'Amore } 387*6b5e5868SGarrett D'Amore if (n < 2) { 388*6b5e5868SGarrett D'Amore werr("incomplete character sequence (%s)", show_mb(mb)); 389*6b5e5868SGarrett D'Amore return (-1); 390*6b5e5868SGarrett D'Amore } 391*6b5e5868SGarrett D'Amore 392*6b5e5868SGarrett D'Amore /* pull in the second byte */ 393*6b5e5868SGarrett D'Amore c <<= 8; 394*6b5e5868SGarrett D'Amore c |= (uint8_t)(mb[1]); 395*6b5e5868SGarrett D'Amore 396*6b5e5868SGarrett D'Amore if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) { 397*6b5e5868SGarrett D'Amore if (n < 4) { 398*6b5e5868SGarrett D'Amore werr("incomplete 4-byte character sequence (%s)", 399*6b5e5868SGarrett D'Amore show_mb(mb)); 400*6b5e5868SGarrett D'Amore return (-1); 401*6b5e5868SGarrett D'Amore } 402*6b5e5868SGarrett D'Amore c <<= 8; 403*6b5e5868SGarrett D'Amore c |= (uint8_t)(mb[2]); 404*6b5e5868SGarrett D'Amore c <<= 8; 405*6b5e5868SGarrett D'Amore c |= (uint8_t)(mb[3]); 406*6b5e5868SGarrett D'Amore *wc = c; 407*6b5e5868SGarrett D'Amore return (4); 408*6b5e5868SGarrett D'Amore } 409*6b5e5868SGarrett D'Amore 410*6b5e5868SGarrett D'Amore *wc = c; 411*6b5e5868SGarrett D'Amore return (2); 412*6b5e5868SGarrett D'Amore } 413*6b5e5868SGarrett D'Amore 414*6b5e5868SGarrett D'Amore /* 415*6b5e5868SGarrett D'Amore * MS-Kanji (aka SJIS) is almost a clean DBCS like the 
others, but it 416*6b5e5868SGarrett D'Amore * also has a range of single byte characters above 0x80. (0xa1-0xdf). 417*6b5e5868SGarrett D'Amore */ 418*6b5e5868SGarrett D'Amore int 419*6b5e5868SGarrett D'Amore towide_mskanji(wchar_t *wc, const char *mb, int n) 420*6b5e5868SGarrett D'Amore { 421*6b5e5868SGarrett D'Amore wchar_t c; 422*6b5e5868SGarrett D'Amore 423*6b5e5868SGarrett D'Amore c = *(uint8_t *)mb; 424*6b5e5868SGarrett D'Amore 425*6b5e5868SGarrett D'Amore if (n < 1) { 426*6b5e5868SGarrett D'Amore werr("no character data"); 427*6b5e5868SGarrett D'Amore return (-1); 428*6b5e5868SGarrett D'Amore } 429*6b5e5868SGarrett D'Amore if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) { 430*6b5e5868SGarrett D'Amore /* 7-bit */ 431*6b5e5868SGarrett D'Amore *wc = c; 432*6b5e5868SGarrett D'Amore return (-1); 433*6b5e5868SGarrett D'Amore } 434*6b5e5868SGarrett D'Amore 435*6b5e5868SGarrett D'Amore if (n < 2) { 436*6b5e5868SGarrett D'Amore werr("incomplete character sequence (%s)", show_mb(mb)); 437*6b5e5868SGarrett D'Amore return (-1); 438*6b5e5868SGarrett D'Amore } 439*6b5e5868SGarrett D'Amore 440*6b5e5868SGarrett D'Amore /* Store both bytes as a single 16-bit wide. */ 441*6b5e5868SGarrett D'Amore c <<= 8; 442*6b5e5868SGarrett D'Amore c |= (uint8_t)(mb[1]); 443*6b5e5868SGarrett D'Amore *wc = c; 444*6b5e5868SGarrett D'Amore return (2); 445*6b5e5868SGarrett D'Amore } 446*6b5e5868SGarrett D'Amore 447*6b5e5868SGarrett D'Amore /* 448*6b5e5868SGarrett D'Amore * EUC forms. EUC encodings are "variable". FreeBSD carries some additional 449*6b5e5868SGarrett D'Amore * variable data to encode these, but we're going to treat each as independent 450*6b5e5868SGarrett D'Amore * instead. Its the only way we can sensibly move forward. 451*6b5e5868SGarrett D'Amore * 452*6b5e5868SGarrett D'Amore * Note that the way in which the different EUC forms vary is how wide 453*6b5e5868SGarrett D'Amore * CS2 and CS3 are and what the first byte of them is. 
454*6b5e5868SGarrett D'Amore */ 455*6b5e5868SGarrett D'Amore static int 456*6b5e5868SGarrett D'Amore towide_euc_impl(wchar_t *wc, const char *mb, int n, 457*6b5e5868SGarrett D'Amore uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) 458*6b5e5868SGarrett D'Amore { 459*6b5e5868SGarrett D'Amore int i; 460*6b5e5868SGarrett D'Amore int width; 461*6b5e5868SGarrett D'Amore wchar_t c; 462*6b5e5868SGarrett D'Amore 463*6b5e5868SGarrett D'Amore c = *(uint8_t *)mb; 464*6b5e5868SGarrett D'Amore 465*6b5e5868SGarrett D'Amore if (n < 1) { 466*6b5e5868SGarrett D'Amore werr("no character data"); 467*6b5e5868SGarrett D'Amore return (-1); 468*6b5e5868SGarrett D'Amore } 469*6b5e5868SGarrett D'Amore 470*6b5e5868SGarrett D'Amore /* 471*6b5e5868SGarrett D'Amore * All variations of EUC encode 7-bit ASCII as one byte, and use 472*6b5e5868SGarrett D'Amore * additional bytes for more than that. 473*6b5e5868SGarrett D'Amore */ 474*6b5e5868SGarrett D'Amore if ((c & 0x80) == 0) { 475*6b5e5868SGarrett D'Amore /* 7-bit */ 476*6b5e5868SGarrett D'Amore *wc = c; 477*6b5e5868SGarrett D'Amore return (1); 478*6b5e5868SGarrett D'Amore } 479*6b5e5868SGarrett D'Amore 480*6b5e5868SGarrett D'Amore /* 481*6b5e5868SGarrett D'Amore * All EUC variants reserve 0xa1-0xff to identify CS1, which 482*6b5e5868SGarrett D'Amore * is always two bytes wide. Note that unused CS will be zero, 483*6b5e5868SGarrett D'Amore * and that cannot be true because we know that the high order 484*6b5e5868SGarrett D'Amore * bit must be set. 
485*6b5e5868SGarrett D'Amore */ 486*6b5e5868SGarrett D'Amore if (c >= 0xa1) { 487*6b5e5868SGarrett D'Amore width = 2; 488*6b5e5868SGarrett D'Amore } else if (c == cs2) { 489*6b5e5868SGarrett D'Amore width = cs2width; 490*6b5e5868SGarrett D'Amore } else if (c == cs3) { 491*6b5e5868SGarrett D'Amore width = cs3width; 492*6b5e5868SGarrett D'Amore } 493*6b5e5868SGarrett D'Amore 494*6b5e5868SGarrett D'Amore if (n < width) { 495*6b5e5868SGarrett D'Amore werr("incomplete character sequence (%s)", show_mb(mb)); 496*6b5e5868SGarrett D'Amore return (-1); 497*6b5e5868SGarrett D'Amore } 498*6b5e5868SGarrett D'Amore 499*6b5e5868SGarrett D'Amore for (i = 1; i < width; i++) { 500*6b5e5868SGarrett D'Amore /* pull in the next byte */ 501*6b5e5868SGarrett D'Amore c <<= 8; 502*6b5e5868SGarrett D'Amore c |= (uint8_t)(mb[i]); 503*6b5e5868SGarrett D'Amore } 504*6b5e5868SGarrett D'Amore 505*6b5e5868SGarrett D'Amore *wc = c; 506*6b5e5868SGarrett D'Amore return (width); 507*6b5e5868SGarrett D'Amore } 508*6b5e5868SGarrett D'Amore 509*6b5e5868SGarrett D'Amore /* 510*6b5e5868SGarrett D'Amore * EUC-CN encodes as follows: 511*6b5e5868SGarrett D'Amore * 512*6b5e5868SGarrett D'Amore * Code set 0 (ASCII): 0x21-0x7E 513*6b5e5868SGarrett D'Amore * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 514*6b5e5868SGarrett D'Amore * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE 515*6b5e5868SGarrett D'Amore * Code set 3: unused 516*6b5e5868SGarrett D'Amore */ 517*6b5e5868SGarrett D'Amore int 518*6b5e5868SGarrett D'Amore towide_euccn(wchar_t *wc, const char *mb, int n) 519*6b5e5868SGarrett D'Amore { 520*6b5e5868SGarrett D'Amore return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 521*6b5e5868SGarrett D'Amore } 522*6b5e5868SGarrett D'Amore 523*6b5e5868SGarrett D'Amore /* 524*6b5e5868SGarrett D'Amore * EUC-JP encodes as follows: 525*6b5e5868SGarrett D'Amore * 526*6b5e5868SGarrett D'Amore * Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E 527*6b5e5868SGarrett D'Amore * Code set 1 (JIS X 
0208): 0xA1A1-0xFEFE 528*6b5e5868SGarrett D'Amore * Code set 2 (half-width katakana): 0x8EA1-0x8EDF 529*6b5e5868SGarrett D'Amore * Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE 530*6b5e5868SGarrett D'Amore */ 531*6b5e5868SGarrett D'Amore int 532*6b5e5868SGarrett D'Amore towide_eucjp(wchar_t *wc, const char *mb, int n) 533*6b5e5868SGarrett D'Amore { 534*6b5e5868SGarrett D'Amore return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3)); 535*6b5e5868SGarrett D'Amore } 536*6b5e5868SGarrett D'Amore 537*6b5e5868SGarrett D'Amore /* 538*6b5e5868SGarrett D'Amore * EUC-KR encodes as follows: 539*6b5e5868SGarrett D'Amore * 540*6b5e5868SGarrett D'Amore * Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E 541*6b5e5868SGarrett D'Amore * Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE 542*6b5e5868SGarrett D'Amore * Code set 2: unused 543*6b5e5868SGarrett D'Amore * Code set 3: unused 544*6b5e5868SGarrett D'Amore */ 545*6b5e5868SGarrett D'Amore int 546*6b5e5868SGarrett D'Amore towide_euckr(wchar_t *wc, const char *mb, int n) 547*6b5e5868SGarrett D'Amore { 548*6b5e5868SGarrett D'Amore return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0)); 549*6b5e5868SGarrett D'Amore } 550*6b5e5868SGarrett D'Amore 551*6b5e5868SGarrett D'Amore /* 552*6b5e5868SGarrett D'Amore * EUC-TW encodes as follows: 553*6b5e5868SGarrett D'Amore * 554*6b5e5868SGarrett D'Amore * Code set 0 (ASCII): 0x21-0x7E 555*6b5e5868SGarrett D'Amore * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE 556*6b5e5868SGarrett D'Amore * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE 557*6b5e5868SGarrett D'Amore * Code set 3: unused 558*6b5e5868SGarrett D'Amore */ 559*6b5e5868SGarrett D'Amore int 560*6b5e5868SGarrett D'Amore towide_euctw(wchar_t *wc, const char *mb, int n) 561*6b5e5868SGarrett D'Amore { 562*6b5e5868SGarrett D'Amore return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); 563*6b5e5868SGarrett D'Amore } 564*6b5e5868SGarrett D'Amore 565*6b5e5868SGarrett D'Amore /* 566*6b5e5868SGarrett D'Amore * Public entry points. 
567*6b5e5868SGarrett D'Amore */ 568*6b5e5868SGarrett D'Amore 569*6b5e5868SGarrett D'Amore int 570*6b5e5868SGarrett D'Amore to_wide(wchar_t *wc, const char *mb) 571*6b5e5868SGarrett D'Amore { 572*6b5e5868SGarrett D'Amore /* this won't fail hard */ 573*6b5e5868SGarrett D'Amore return (_towide(wc, mb, strlen(mb) + 1)); 574*6b5e5868SGarrett D'Amore } 575*6b5e5868SGarrett D'Amore 576*6b5e5868SGarrett D'Amore int 577*6b5e5868SGarrett D'Amore to_mb(char *mb, wchar_t wc) 578*6b5e5868SGarrett D'Amore { 579*6b5e5868SGarrett D'Amore int rv; 580*6b5e5868SGarrett D'Amore 581*6b5e5868SGarrett D'Amore if ((rv = _tomb(mb, wc)) < 0) { 582*6b5e5868SGarrett D'Amore errf(widemsg); 583*6b5e5868SGarrett D'Amore free(widemsg); 584*6b5e5868SGarrett D'Amore widemsg = NULL; 585*6b5e5868SGarrett D'Amore } 586*6b5e5868SGarrett D'Amore return (rv); 587*6b5e5868SGarrett D'Amore } 588*6b5e5868SGarrett D'Amore 589*6b5e5868SGarrett D'Amore char * 590*6b5e5868SGarrett D'Amore to_mb_string(const wchar_t *wcs) 591*6b5e5868SGarrett D'Amore { 592*6b5e5868SGarrett D'Amore char *mbs; 593*6b5e5868SGarrett D'Amore char *ptr; 594*6b5e5868SGarrett D'Amore int len; 595*6b5e5868SGarrett D'Amore 596*6b5e5868SGarrett D'Amore mbs = malloc((wcslen(wcs) * mb_cur_max) + 1); 597*6b5e5868SGarrett D'Amore if (mbs == NULL) { 598*6b5e5868SGarrett D'Amore errf("out of memory"); 599*6b5e5868SGarrett D'Amore return (NULL); 600*6b5e5868SGarrett D'Amore } 601*6b5e5868SGarrett D'Amore ptr = mbs; 602*6b5e5868SGarrett D'Amore while (*wcs) { 603*6b5e5868SGarrett D'Amore if ((len = to_mb(ptr, *wcs)) < 0) { 604*6b5e5868SGarrett D'Amore INTERR; 605*6b5e5868SGarrett D'Amore free(mbs); 606*6b5e5868SGarrett D'Amore return (NULL); 607*6b5e5868SGarrett D'Amore } 608*6b5e5868SGarrett D'Amore wcs++; 609*6b5e5868SGarrett D'Amore ptr += len; 610*6b5e5868SGarrett D'Amore } 611*6b5e5868SGarrett D'Amore *ptr = 0; 612*6b5e5868SGarrett D'Amore return (mbs); 613*6b5e5868SGarrett D'Amore } 614*6b5e5868SGarrett D'Amore 615*6b5e5868SGarrett D'Amore 
void 616*6b5e5868SGarrett D'Amore set_wide_encoding(const char *encoding) 617*6b5e5868SGarrett D'Amore { 618*6b5e5868SGarrett D'Amore int i; 619*6b5e5868SGarrett D'Amore 620*6b5e5868SGarrett D'Amore _towide = towide_none; 621*6b5e5868SGarrett D'Amore _tomb = tomb_none; 622*6b5e5868SGarrett D'Amore _encoding = "NONE"; 623*6b5e5868SGarrett D'Amore 624*6b5e5868SGarrett D'Amore for (i = 0; mb_encodings[i].name; i++) { 625*6b5e5868SGarrett D'Amore if (strcasecmp(encoding, mb_encodings[i].name) == 0) { 626*6b5e5868SGarrett D'Amore _towide = mb_encodings[i].towide; 627*6b5e5868SGarrett D'Amore _tomb = mb_encodings[i].tomb; 628*6b5e5868SGarrett D'Amore _encoding = mb_encodings[i].cname; 629*6b5e5868SGarrett D'Amore } 630*6b5e5868SGarrett D'Amore } 631*6b5e5868SGarrett D'Amore } 632*6b5e5868SGarrett D'Amore 633*6b5e5868SGarrett D'Amore const char * 634*6b5e5868SGarrett D'Amore get_wide_encoding(void) 635*6b5e5868SGarrett D'Amore { 636*6b5e5868SGarrett D'Amore return (_encoding); 637*6b5e5868SGarrett D'Amore } 638