1*91e1e26aSAlexander Pyhalov /* 2*91e1e26aSAlexander Pyhalov * CDDL HEADER START 3*91e1e26aSAlexander Pyhalov * 4*91e1e26aSAlexander Pyhalov * The contents of this file are subject to the terms of the 5*91e1e26aSAlexander Pyhalov * Common Development and Distribution License (the "License"). 6*91e1e26aSAlexander Pyhalov * You may not use this file except in compliance with the License. 7*91e1e26aSAlexander Pyhalov * 8*91e1e26aSAlexander Pyhalov * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9*91e1e26aSAlexander Pyhalov * or http://www.opensolaris.org/os/licensing. 10*91e1e26aSAlexander Pyhalov * See the License for the specific language governing permissions 11*91e1e26aSAlexander Pyhalov * and limitations under the License. 12*91e1e26aSAlexander Pyhalov * 13*91e1e26aSAlexander Pyhalov * When distributing Covered Code, include this CDDL HEADER in each 14*91e1e26aSAlexander Pyhalov * file and include the License file at src/OPENSOLARIS.LICENSE. 15*91e1e26aSAlexander Pyhalov * If applicable, add the following below this CDDL HEADER, with the 16*91e1e26aSAlexander Pyhalov * fields enclosed by brackets "[]" replaced with your own identifying 17*91e1e26aSAlexander Pyhalov * information: Portions Copyright [yyyy] [name of copyright owner] 18*91e1e26aSAlexander Pyhalov * 19*91e1e26aSAlexander Pyhalov * CDDL HEADER END 20*91e1e26aSAlexander Pyhalov */ 21*91e1e26aSAlexander Pyhalov 22*91e1e26aSAlexander Pyhalov /* 23*91e1e26aSAlexander Pyhalov * Copyright (c) 1997, by Sun Microsystems, Inc. 24*91e1e26aSAlexander Pyhalov * All rights reserved. 25*91e1e26aSAlexander Pyhalov */ 26*91e1e26aSAlexander Pyhalov 27*91e1e26aSAlexander Pyhalov #include <stdio.h> 28*91e1e26aSAlexander Pyhalov #include <stdlib.h> 29*91e1e26aSAlexander Pyhalov #include <errno.h> 30*91e1e26aSAlexander Pyhalov #include <sys/types.h> 31*91e1e26aSAlexander Pyhalov 32*91e1e26aSAlexander Pyhalov #include "tab_lookup.h" /* table lookup data types */ 33*91e1e26aSAlexander Pyhalov 34*91e1e26aSAlexander Pyhalov #define MSB 0x80 /* most significant bit */ 35*91e1e26aSAlexander Pyhalov #define ONEBYTE 0xff /* right most byte */ 36*91e1e26aSAlexander Pyhalov 37*91e1e26aSAlexander Pyhalov enum _USTATE { U0, U1, U11, U2, U3, U4 }; 38*91e1e26aSAlexander Pyhalov 39*91e1e26aSAlexander Pyhalov 40*91e1e26aSAlexander Pyhalov int get_ibm_by_utf(_icv_state *st, char c1, char c2, int *unidx, 41*91e1e26aSAlexander Pyhalov unsigned long *ibm_code); 42*91e1e26aSAlexander Pyhalov 43*91e1e26aSAlexander Pyhalov int bisearch(unsigned long val, _icv_state *st, int n); 44*91e1e26aSAlexander Pyhalov 45*91e1e26aSAlexander Pyhalov int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf, 46*91e1e26aSAlexander Pyhalov size_t buflen, _icv_state *st); 47*91e1e26aSAlexander Pyhalov 48*91e1e26aSAlexander Pyhalov /* 49*91e1e26aSAlexander Pyhalov * Actual conversion; called from iconv() 50*91e1e26aSAlexander Pyhalov * Input is UTF-8 data. 51*91e1e26aSAlexander Pyhalov * first convert to UCS2 52*91e1e26aSAlexander Pyhalov */ 53*91e1e26aSAlexander Pyhalov size_t 54*91e1e26aSAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft, 55*91e1e26aSAlexander Pyhalov char **outbuf, size_t *outbytesleft) 56*91e1e26aSAlexander Pyhalov { 57*91e1e26aSAlexander Pyhalov /* 58*91e1e26aSAlexander Pyhalov * Actual conversion; called from iconv() 59*91e1e26aSAlexander Pyhalov */ 60*91e1e26aSAlexander Pyhalov /*========================================================= 61*91e1e26aSAlexander Pyhalov * 62*91e1e26aSAlexander Pyhalov * State Machine for interpreting UTF8 code 63*91e1e26aSAlexander Pyhalov * 64*91e1e26aSAlexander Pyhalov *========================================================= 65*91e1e26aSAlexander Pyhalov * 66*91e1e26aSAlexander Pyhalov * 3 byte unicode 67*91e1e26aSAlexander Pyhalov * +----->------->-------+ 68*91e1e26aSAlexander Pyhalov * | | 69*91e1e26aSAlexander Pyhalov * ^ v 70*91e1e26aSAlexander Pyhalov * | 2 byte U2 ---> U3 71*91e1e26aSAlexander Pyhalov * | unicode v 72*91e1e26aSAlexander Pyhalov * +------> U0 -------> U1 +-------->U4---+ 73*91e1e26aSAlexander Pyhalov * ^ ascii | | ^ | 74*91e1e26aSAlexander Pyhalov * | | +-------->--------->--------+ | 75*91e1e26aSAlexander Pyhalov * | v v 76*91e1e26aSAlexander Pyhalov * +----<---+-----<------------<------------<------------+ 77*91e1e26aSAlexander Pyhalov * 78*91e1e26aSAlexander Pyhalov * +----<---+-----<------------<------------<------------+ 79*91e1e26aSAlexander Pyhalov * 80*91e1e26aSAlexander Pyhalov *=========================================================*/ 81*91e1e26aSAlexander Pyhalov 82*91e1e26aSAlexander Pyhalov char c1 = '\0', c2 = '\0'; 83*91e1e26aSAlexander Pyhalov int n, unidx; 84*91e1e26aSAlexander Pyhalov unsigned long ibm_code; 85*91e1e26aSAlexander Pyhalov 86*91e1e26aSAlexander Pyhalov #ifdef DEBUG 87*91e1e26aSAlexander Pyhalov fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n"); 88*91e1e26aSAlexander Pyhalov #endif 89*91e1e26aSAlexander Pyhalov 90*91e1e26aSAlexander Pyhalov if (st == NULL) { 91*91e1e26aSAlexander Pyhalov errno = EBADF; 92*91e1e26aSAlexander Pyhalov return ((size_t) -1); 93*91e1e26aSAlexander Pyhalov } 94*91e1e26aSAlexander Pyhalov 95*91e1e26aSAlexander Pyhalov if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 96*91e1e26aSAlexander Pyhalov st->ustate = U0; 97*91e1e26aSAlexander Pyhalov st->_errno = 0; 98*91e1e26aSAlexander Pyhalov st->shift = SHIFT_IN; 99*91e1e26aSAlexander Pyhalov return ((size_t) 0); 100*91e1e26aSAlexander Pyhalov } 101*91e1e26aSAlexander Pyhalov 102*91e1e26aSAlexander Pyhalov st->_errno = 0; /* reset internal errno */ 103*91e1e26aSAlexander Pyhalov errno = 0; /* reset external errno */ 104*91e1e26aSAlexander Pyhalov 105*91e1e26aSAlexander Pyhalov /* a state machine for interpreting UTF8 code */ 106*91e1e26aSAlexander Pyhalov while (*inbytesleft > 0 && *outbytesleft > 0) { 107*91e1e26aSAlexander Pyhalov switch (st->ustate) { 108*91e1e26aSAlexander Pyhalov case U0: 109*91e1e26aSAlexander Pyhalov /* it is ascii, convert it immediately */ 110*91e1e26aSAlexander Pyhalov if ((**inbuf & MSB) == 0) { /* ASCII */ 111*91e1e26aSAlexander Pyhalov st->ustate = U4; 112*91e1e26aSAlexander Pyhalov st->keepc[0] = **inbuf; 113*91e1e26aSAlexander Pyhalov c1 = 0x0; 114*91e1e26aSAlexander Pyhalov c2 = **inbuf; 115*91e1e26aSAlexander Pyhalov continue; 116*91e1e26aSAlexander Pyhalov } else { /* Chinese character */ 117*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */ 118*91e1e26aSAlexander Pyhalov st->ustate = U1; 119*91e1e26aSAlexander Pyhalov st->keepc[0] = **inbuf; 120*91e1e26aSAlexander Pyhalov } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */ 121*91e1e26aSAlexander Pyhalov st->ustate = U2; 122*91e1e26aSAlexander Pyhalov st->keepc[0] = **inbuf; 123*91e1e26aSAlexander Pyhalov } else { /* illegal unicode */ 124*91e1e26aSAlexander Pyhalov /* st->_errno = errno = EINVAL; */ 125*91e1e26aSAlexander Pyhalov /* possible UNICODE ko_KR-UTF8 */ 126*91e1e26aSAlexander Pyhalov c1 =st->keepc[0] = **inbuf; 127*91e1e26aSAlexander Pyhalov st->ustate = U11; 128*91e1e26aSAlexander Pyhalov break; 129*91e1e26aSAlexander Pyhalov } 130*91e1e26aSAlexander Pyhalov } 131*91e1e26aSAlexander Pyhalov break; 132*91e1e26aSAlexander Pyhalov case U1: /* 2 byte unicode */ 133*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) { 134*91e1e26aSAlexander Pyhalov st->ustate = U4; 135*91e1e26aSAlexander Pyhalov st->keepc[1] = **inbuf; 136*91e1e26aSAlexander Pyhalov c1 = (st->keepc[0]&0x1c)>>2; 137*91e1e26aSAlexander Pyhalov c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f); 138*91e1e26aSAlexander Pyhalov #ifdef DEBUG 139*91e1e26aSAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x --> ", 140*91e1e26aSAlexander Pyhalov st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 141*91e1e26aSAlexander Pyhalov #endif 142*91e1e26aSAlexander Pyhalov continue; /* should not advance *inbuf */ 143*91e1e26aSAlexander Pyhalov } else { 144*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL; 145*91e1e26aSAlexander Pyhalov } 146*91e1e26aSAlexander Pyhalov break; 147*91e1e26aSAlexander Pyhalov case U11: /* 3 byte unicode - 2nd byte */ 148*91e1e26aSAlexander Pyhalov c2 =st->keepc[1] = **inbuf; 149*91e1e26aSAlexander Pyhalov st->ustate = U4; 150*91e1e26aSAlexander Pyhalov continue; 151*91e1e26aSAlexander Pyhalov break; 152*91e1e26aSAlexander Pyhalov case U2: /* 3 byte unicode - 2nd byte */ 153*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) { 154*91e1e26aSAlexander Pyhalov st->ustate = U3; 155*91e1e26aSAlexander Pyhalov st->keepc[1] = **inbuf; 156*91e1e26aSAlexander Pyhalov } else { 157*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL; 158*91e1e26aSAlexander Pyhalov } 159*91e1e26aSAlexander Pyhalov break; 160*91e1e26aSAlexander Pyhalov case U3: /* 3 byte unicode - 3rd byte */ 161*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) { 162*91e1e26aSAlexander Pyhalov st->ustate = U4; 163*91e1e26aSAlexander Pyhalov st->keepc[2] = **inbuf; 164*91e1e26aSAlexander Pyhalov c1 = ((st->keepc[0]&0x0f)<<4) | 165*91e1e26aSAlexander Pyhalov ((st->keepc[1]&0x3c)>>2); 166*91e1e26aSAlexander Pyhalov c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 167*91e1e26aSAlexander Pyhalov #ifdef DEBUG 168*91e1e26aSAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 169*91e1e26aSAlexander Pyhalov st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 170*91e1e26aSAlexander Pyhalov #endif 171*91e1e26aSAlexander Pyhalov continue; /* should not advance *inbuf */ 172*91e1e26aSAlexander Pyhalov } else { 173*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL; 174*91e1e26aSAlexander Pyhalov } 175*91e1e26aSAlexander Pyhalov break; 176*91e1e26aSAlexander Pyhalov case U4: 177*91e1e26aSAlexander Pyhalov n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code); 178*91e1e26aSAlexander Pyhalov if (n != 0) { /* legal unicode;illegal Big5 */ 179*91e1e26aSAlexander Pyhalov st->_errno = errno = EILSEQ; 180*91e1e26aSAlexander Pyhalov break; 181*91e1e26aSAlexander Pyhalov } 182*91e1e26aSAlexander Pyhalov 183*91e1e26aSAlexander Pyhalov n = utf8_to_ibm(unidx, ibm_code, 184*91e1e26aSAlexander Pyhalov *outbuf, *outbytesleft, st); 185*91e1e26aSAlexander Pyhalov if (n > 0) { 186*91e1e26aSAlexander Pyhalov (*outbuf) += n; 187*91e1e26aSAlexander Pyhalov (*outbytesleft) -= n; 188*91e1e26aSAlexander Pyhalov } else { 189*91e1e26aSAlexander Pyhalov st->_errno = errno; 190*91e1e26aSAlexander Pyhalov return((size_t)-1); 191*91e1e26aSAlexander Pyhalov } 192*91e1e26aSAlexander Pyhalov st->ustate = U0; 193*91e1e26aSAlexander Pyhalov st->_errno = 0; 194*91e1e26aSAlexander Pyhalov break; 195*91e1e26aSAlexander Pyhalov default: /* should never come here */ 196*91e1e26aSAlexander Pyhalov st->_errno = errno = EILSEQ; 197*91e1e26aSAlexander Pyhalov st->ustate = U0; /* reset state */ 198*91e1e26aSAlexander Pyhalov break; 199*91e1e26aSAlexander Pyhalov } 200*91e1e26aSAlexander Pyhalov 201*91e1e26aSAlexander Pyhalov (*inbuf)++; 202*91e1e26aSAlexander Pyhalov (*inbytesleft)--; 203*91e1e26aSAlexander Pyhalov 204*91e1e26aSAlexander Pyhalov if (st->_errno) { 205*91e1e26aSAlexander Pyhalov #ifdef DEBUG 206*91e1e26aSAlexander Pyhalov fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 207*91e1e26aSAlexander Pyhalov st->_errno, st->ustate); 208*91e1e26aSAlexander Pyhalov #endif 209*91e1e26aSAlexander Pyhalov break; 210*91e1e26aSAlexander Pyhalov } 211*91e1e26aSAlexander Pyhalov 212*91e1e26aSAlexander Pyhalov if (errno) 213*91e1e26aSAlexander Pyhalov return((size_t)-1); 214*91e1e26aSAlexander Pyhalov } 215*91e1e26aSAlexander Pyhalov 216*91e1e26aSAlexander Pyhalov if (*outbytesleft == 0) { 217*91e1e26aSAlexander Pyhalov errno = E2BIG; 218*91e1e26aSAlexander Pyhalov return((size_t)-1); 219*91e1e26aSAlexander Pyhalov } 220*91e1e26aSAlexander Pyhalov return (*inbytesleft); 221*91e1e26aSAlexander Pyhalov } 222*91e1e26aSAlexander Pyhalov 223*91e1e26aSAlexander Pyhalov 224*91e1e26aSAlexander Pyhalov /* 225*91e1e26aSAlexander Pyhalov * Match IBM code by UTF8 code; 226*91e1e26aSAlexander Pyhalov * Return: = 0 - match from Unicode to IBM found 227*91e1e26aSAlexander Pyhalov * = 1 - match from Unicode to IBM NOT found 228*91e1e26aSAlexander Pyhalov * 229*91e1e26aSAlexander Pyhalov * Since binary search of the UTF8 to IBM table is necessary, might as well 230*91e1e26aSAlexander Pyhalov * return index and IBM code matching to the unicode. 231*91e1e26aSAlexander Pyhalov */ 232*91e1e26aSAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code) 233*91e1e26aSAlexander Pyhalov _icv_state *st; 234*91e1e26aSAlexander Pyhalov char c1, c2; 235*91e1e26aSAlexander Pyhalov int *unidx; 236*91e1e26aSAlexander Pyhalov unsigned long *ibm_code; 237*91e1e26aSAlexander Pyhalov { 238*91e1e26aSAlexander Pyhalov unsigned long unicode; 239*91e1e26aSAlexander Pyhalov 240*91e1e26aSAlexander Pyhalov unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 241*91e1e26aSAlexander Pyhalov *unidx = bisearch(unicode, st, st->table_size); 242*91e1e26aSAlexander Pyhalov if ((*unidx) >= 0) 243*91e1e26aSAlexander Pyhalov { 244*91e1e26aSAlexander Pyhalov if ( st->left_to_right ) 245*91e1e26aSAlexander Pyhalov *ibm_code = st->table[*unidx].right_code; 246*91e1e26aSAlexander Pyhalov else 247*91e1e26aSAlexander Pyhalov *ibm_code = st->table[*unidx].left_code; 248*91e1e26aSAlexander Pyhalov } 249*91e1e26aSAlexander Pyhalov #ifdef DEBUG 250*91e1e26aSAlexander Pyhalov fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code); 251*91e1e26aSAlexander Pyhalov #endif 252*91e1e26aSAlexander Pyhalov 253*91e1e26aSAlexander Pyhalov return(0); 254*91e1e26aSAlexander Pyhalov } 255*91e1e26aSAlexander Pyhalov 256*91e1e26aSAlexander Pyhalov 257*91e1e26aSAlexander Pyhalov /* 258*91e1e26aSAlexander Pyhalov * ISO/IEC 10646 (Unicode) --> IBM 259*91e1e26aSAlexander Pyhalov * Unicode --> UTF8 (FSS-UTF) 260*91e1e26aSAlexander Pyhalov * (File System Safe Universal Character Set Transformation Format) 261*91e1e26aSAlexander Pyhalov * Return: > 0 - converted with enough space in output buffer 262*91e1e26aSAlexander Pyhalov * = 0 - no space in outbuf 263*91e1e26aSAlexander Pyhalov */ 264*91e1e26aSAlexander Pyhalov int utf8_to_ibm(unidx, ibm_code, buf, buflen, st) 265*91e1e26aSAlexander Pyhalov int unidx; 266*91e1e26aSAlexander Pyhalov unsigned long ibm_code; 267*91e1e26aSAlexander Pyhalov char *buf; 268*91e1e26aSAlexander Pyhalov size_t buflen; 269*91e1e26aSAlexander Pyhalov _icv_state *st; 270*91e1e26aSAlexander Pyhalov 271*91e1e26aSAlexander Pyhalov { 272*91e1e26aSAlexander Pyhalov unsigned long val; /* IBM value */ 273*91e1e26aSAlexander Pyhalov char c1, c2, ibm_str[3]; 274*91e1e26aSAlexander Pyhalov 275*91e1e26aSAlexander Pyhalov if (unidx < 0) /* no match from UTF8 to IBM */ 276*91e1e26aSAlexander Pyhalov ibm_code = (unsigned long)NON_ID_CHAR; 277*91e1e26aSAlexander Pyhalov 278*91e1e26aSAlexander Pyhalov { 279*91e1e26aSAlexander Pyhalov val = ibm_code & 0xffff; 280*91e1e26aSAlexander Pyhalov c1 = (char) ((val & 0xff00) >> 8); 281*91e1e26aSAlexander Pyhalov c2 = (char) (val & 0xff); 282*91e1e26aSAlexander Pyhalov } 283*91e1e26aSAlexander Pyhalov 284*91e1e26aSAlexander Pyhalov /* it is single byte ascii */ 285*91e1e26aSAlexander Pyhalov if ( c1 == 0x0 ) { 286*91e1e26aSAlexander Pyhalov if ( st->shift == SHIFT_OUT ) { 287*91e1e26aSAlexander Pyhalov if (buflen < 2) { 288*91e1e26aSAlexander Pyhalov errno = E2BIG; 289*91e1e26aSAlexander Pyhalov return 0; 290*91e1e26aSAlexander Pyhalov } 291*91e1e26aSAlexander Pyhalov *buf = SHIFT_IN; 292*91e1e26aSAlexander Pyhalov *(buf+1) = c2; 293*91e1e26aSAlexander Pyhalov st->shift = SHIFT_IN; 294*91e1e26aSAlexander Pyhalov return 2; 295*91e1e26aSAlexander Pyhalov } 296*91e1e26aSAlexander Pyhalov if (buflen < 1) { 297*91e1e26aSAlexander Pyhalov errno = E2BIG; 298*91e1e26aSAlexander Pyhalov return 0; 299*91e1e26aSAlexander Pyhalov } 300*91e1e26aSAlexander Pyhalov *buf = c2; 301*91e1e26aSAlexander Pyhalov return 1; 302*91e1e26aSAlexander Pyhalov } 303*91e1e26aSAlexander Pyhalov 304*91e1e26aSAlexander Pyhalov /* it is the first two bytes character */ 305*91e1e26aSAlexander Pyhalov if ( st->shift == SHIFT_IN ) { 306*91e1e26aSAlexander Pyhalov if (buflen < 3) { 307*91e1e26aSAlexander Pyhalov errno = E2BIG; 308*91e1e26aSAlexander Pyhalov return 0; 309*91e1e26aSAlexander Pyhalov } 310*91e1e26aSAlexander Pyhalov *buf = SHIFT_OUT; 311*91e1e26aSAlexander Pyhalov st->shift = SHIFT_OUT; 312*91e1e26aSAlexander Pyhalov *(buf+1) = c1; 313*91e1e26aSAlexander Pyhalov *(buf+2) = c2; 314*91e1e26aSAlexander Pyhalov return 3; 315*91e1e26aSAlexander Pyhalov } 316*91e1e26aSAlexander Pyhalov 317*91e1e26aSAlexander Pyhalov *buf = ibm_str[0] = c1; 318*91e1e26aSAlexander Pyhalov *(buf+1) = ibm_str[1] = c2; 319*91e1e26aSAlexander Pyhalov ibm_str[2] = NULL; 320*91e1e26aSAlexander Pyhalov 321*91e1e26aSAlexander Pyhalov #ifdef DEBUG 322*91e1e26aSAlexander Pyhalov fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1)); 323*91e1e26aSAlexander Pyhalov #endif 324*91e1e26aSAlexander Pyhalov 325*91e1e26aSAlexander Pyhalov 326*91e1e26aSAlexander Pyhalov if (buflen < 2) { 327*91e1e26aSAlexander Pyhalov errno = E2BIG; 328*91e1e26aSAlexander Pyhalov return(0); 329*91e1e26aSAlexander Pyhalov } 330*91e1e26aSAlexander Pyhalov 331*91e1e26aSAlexander Pyhalov return(2); 332*91e1e26aSAlexander Pyhalov } 333