1*91e1e26aSAlexander Pyhalov /* 2*91e1e26aSAlexander Pyhalov * CDDL HEADER START 3*91e1e26aSAlexander Pyhalov * 4*91e1e26aSAlexander Pyhalov * The contents of this file are subject to the terms of the 5*91e1e26aSAlexander Pyhalov * Common Development and Distribution License (the "License"). 6*91e1e26aSAlexander Pyhalov * You may not use this file except in compliance with the License. 7*91e1e26aSAlexander Pyhalov * 8*91e1e26aSAlexander Pyhalov * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9*91e1e26aSAlexander Pyhalov * or http://www.opensolaris.org/os/licensing. 10*91e1e26aSAlexander Pyhalov * See the License for the specific language governing permissions 11*91e1e26aSAlexander Pyhalov * and limitations under the License. 12*91e1e26aSAlexander Pyhalov * 13*91e1e26aSAlexander Pyhalov * When distributing Covered Code, include this CDDL HEADER in each 14*91e1e26aSAlexander Pyhalov * file and include the License file at src/OPENSOLARIS.LICENSE. 15*91e1e26aSAlexander Pyhalov * If applicable, add the following below this CDDL HEADER, with the 16*91e1e26aSAlexander Pyhalov * fields enclosed by brackets "[]" replaced with your own identifying 17*91e1e26aSAlexander Pyhalov * information: Portions Copyright [yyyy] [name of copyright owner] 18*91e1e26aSAlexander Pyhalov * 19*91e1e26aSAlexander Pyhalov * CDDL HEADER END 20*91e1e26aSAlexander Pyhalov */ 21*91e1e26aSAlexander Pyhalov 22*91e1e26aSAlexander Pyhalov /* 23*91e1e26aSAlexander Pyhalov * Copyright (c) 1997, by Sun Microsystems, Inc. 24*91e1e26aSAlexander Pyhalov * All rights reserved. 25*91e1e26aSAlexander Pyhalov */ 26*91e1e26aSAlexander Pyhalov 27*91e1e26aSAlexander Pyhalov #include <stdio.h> 28*91e1e26aSAlexander Pyhalov #include <stdlib.h> 29*91e1e26aSAlexander Pyhalov #include <errno.h> 30*91e1e26aSAlexander Pyhalov #include <sys/types.h> 31*91e1e26aSAlexander Pyhalov 32*91e1e26aSAlexander Pyhalov #include "tab_lookup.h" /* table lookup data types */ 33*91e1e26aSAlexander Pyhalov 34*91e1e26aSAlexander Pyhalov #define MSB 0x80 /* most significant bit */ 35*91e1e26aSAlexander Pyhalov #define ONEBYTE 0xff /* right most byte */ 36*91e1e26aSAlexander Pyhalov 37*91e1e26aSAlexander Pyhalov enum _USTATE { U0, U1, U11, U2, U3, U4 }; 38*91e1e26aSAlexander Pyhalov 39*91e1e26aSAlexander Pyhalov 40*91e1e26aSAlexander Pyhalov 41*91e1e26aSAlexander Pyhalov 42*91e1e26aSAlexander Pyhalov /* 43*91e1e26aSAlexander Pyhalov * Actual conversion; called from iconv() 44*91e1e26aSAlexander Pyhalov * Input is UTF-8 data. 45*91e1e26aSAlexander Pyhalov * first convert to UCS2 46*91e1e26aSAlexander Pyhalov */ 47*91e1e26aSAlexander Pyhalov size_t 48*91e1e26aSAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft, 49*91e1e26aSAlexander Pyhalov char **outbuf, size_t *outbytesleft) 50*91e1e26aSAlexander Pyhalov { 51*91e1e26aSAlexander Pyhalov /* 52*91e1e26aSAlexander Pyhalov * Actual conversion; called from iconv() 53*91e1e26aSAlexander Pyhalov */ 54*91e1e26aSAlexander Pyhalov /*========================================================= 55*91e1e26aSAlexander Pyhalov * 56*91e1e26aSAlexander Pyhalov * State Machine for interpreting UTF8 code 57*91e1e26aSAlexander Pyhalov * 58*91e1e26aSAlexander Pyhalov *========================================================= 59*91e1e26aSAlexander Pyhalov * 60*91e1e26aSAlexander Pyhalov * 3 byte unicode 61*91e1e26aSAlexander Pyhalov * +----->------->-------+ 62*91e1e26aSAlexander Pyhalov * | | 63*91e1e26aSAlexander Pyhalov * ^ v 64*91e1e26aSAlexander Pyhalov * | 2 byte U2 ---> U3 65*91e1e26aSAlexander Pyhalov * | unicode v 66*91e1e26aSAlexander Pyhalov * +------> U0 -------> U1 +-------->U4---+ 67*91e1e26aSAlexander Pyhalov * ^ ascii | | ^ | 68*91e1e26aSAlexander Pyhalov * | | +-------->--------->--------+ | 69*91e1e26aSAlexander Pyhalov * | v v 70*91e1e26aSAlexander Pyhalov * +----<---+-----<------------<------------<------------+ 71*91e1e26aSAlexander Pyhalov * 72*91e1e26aSAlexander Pyhalov * +----<---+-----<------------<------------<------------+ 73*91e1e26aSAlexander Pyhalov * 74*91e1e26aSAlexander Pyhalov *=========================================================*/ 75*91e1e26aSAlexander Pyhalov 76*91e1e26aSAlexander Pyhalov char c1, c2; 77*91e1e26aSAlexander Pyhalov int n, unidx; 78*91e1e26aSAlexander Pyhalov unsigned long ibm_code; 79*91e1e26aSAlexander Pyhalov 80*91e1e26aSAlexander Pyhalov #ifdef DEBUG 81*91e1e26aSAlexander Pyhalov fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n"); 82*91e1e26aSAlexander Pyhalov #endif 83*91e1e26aSAlexander Pyhalov 84*91e1e26aSAlexander Pyhalov if (st == NULL) { 85*91e1e26aSAlexander Pyhalov errno = EBADF; 86*91e1e26aSAlexander Pyhalov return ((size_t) -1); 87*91e1e26aSAlexander Pyhalov } 88*91e1e26aSAlexander Pyhalov 89*91e1e26aSAlexander Pyhalov if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 90*91e1e26aSAlexander Pyhalov st->ustate = U0; 91*91e1e26aSAlexander Pyhalov st->_errno = 0; 92*91e1e26aSAlexander Pyhalov return ((size_t) 0); 93*91e1e26aSAlexander Pyhalov } 94*91e1e26aSAlexander Pyhalov 95*91e1e26aSAlexander Pyhalov st->_errno = 0; /* reset internal errno */ 96*91e1e26aSAlexander Pyhalov errno = 0; /* reset external errno */ 97*91e1e26aSAlexander Pyhalov 98*91e1e26aSAlexander Pyhalov /* a state machine for interpreting UTF8 code */ 99*91e1e26aSAlexander Pyhalov while (*inbytesleft > 0 && *outbytesleft > 0) { 100*91e1e26aSAlexander Pyhalov switch (st->ustate) { 101*91e1e26aSAlexander Pyhalov case U0: /* assuming ASCII in the beginning */ 102*91e1e26aSAlexander Pyhalov if ((**inbuf & MSB) == 0) { /* ASCII */ 103*91e1e26aSAlexander Pyhalov **outbuf = **inbuf; 104*91e1e26aSAlexander Pyhalov (*outbuf)++; 105*91e1e26aSAlexander Pyhalov (*outbytesleft)--; 106*91e1e26aSAlexander Pyhalov } else { /* Chinese character */ 107*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */ 108*91e1e26aSAlexander Pyhalov st->ustate = U1; 109*91e1e26aSAlexander Pyhalov st->keepc[0] = **inbuf; 110*91e1e26aSAlexander Pyhalov } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */ 111*91e1e26aSAlexander Pyhalov st->ustate = U2; 112*91e1e26aSAlexander Pyhalov st->keepc[0] = **inbuf; 113*91e1e26aSAlexander Pyhalov } else { /* illegal unicode */ 114*91e1e26aSAlexander Pyhalov /* st->_errno = errno = EINVAL; */ 115*91e1e26aSAlexander Pyhalov /* possible UNICODE ko_KR-UTF8 */ 116*91e1e26aSAlexander Pyhalov c1 =st->keepc[0] = **inbuf; 117*91e1e26aSAlexander Pyhalov st->ustate = U11; 118*91e1e26aSAlexander Pyhalov break; 119*91e1e26aSAlexander Pyhalov } 120*91e1e26aSAlexander Pyhalov } 121*91e1e26aSAlexander Pyhalov break; 122*91e1e26aSAlexander Pyhalov case U1: /* 2 byte unicode */ 123*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) { 124*91e1e26aSAlexander Pyhalov st->ustate = U4; 125*91e1e26aSAlexander Pyhalov st->keepc[1] = **inbuf; 126*91e1e26aSAlexander Pyhalov c1 = (st->keepc[0]&0x1c)>>2; 127*91e1e26aSAlexander Pyhalov c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f); 128*91e1e26aSAlexander Pyhalov #ifdef DEBUG 129*91e1e26aSAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x --> ", 130*91e1e26aSAlexander Pyhalov st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE); 131*91e1e26aSAlexander Pyhalov #endif 132*91e1e26aSAlexander Pyhalov continue; /* should not advance *inbuf */ 133*91e1e26aSAlexander Pyhalov } else { 134*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL; 135*91e1e26aSAlexander Pyhalov } 136*91e1e26aSAlexander Pyhalov break; 137*91e1e26aSAlexander Pyhalov case U11: /* 3 byte unicode - 2nd byte */ 138*91e1e26aSAlexander Pyhalov c2 =st->keepc[1] = **inbuf; 139*91e1e26aSAlexander Pyhalov st->ustate = U4; 140*91e1e26aSAlexander Pyhalov continue; 141*91e1e26aSAlexander Pyhalov break; 142*91e1e26aSAlexander Pyhalov case U2: /* 3 byte unicode - 2nd byte */ 143*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) { 144*91e1e26aSAlexander Pyhalov st->ustate = U3; 145*91e1e26aSAlexander Pyhalov st->keepc[1] = **inbuf; 146*91e1e26aSAlexander Pyhalov } else { 147*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL; 148*91e1e26aSAlexander Pyhalov } 149*91e1e26aSAlexander Pyhalov break; 150*91e1e26aSAlexander Pyhalov case U3: /* 3 byte unicode - 3rd byte */ 151*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) { 152*91e1e26aSAlexander Pyhalov st->ustate = U4; 153*91e1e26aSAlexander Pyhalov st->keepc[2] = **inbuf; 154*91e1e26aSAlexander Pyhalov c1 = ((st->keepc[0]&0x0f)<<4) | 155*91e1e26aSAlexander Pyhalov ((st->keepc[1]&0x3c)>>2); 156*91e1e26aSAlexander Pyhalov c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f); 157*91e1e26aSAlexander Pyhalov #ifdef DEBUG 158*91e1e26aSAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE, 159*91e1e26aSAlexander Pyhalov st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE); 160*91e1e26aSAlexander Pyhalov #endif 161*91e1e26aSAlexander Pyhalov continue; /* should not advance *inbuf */ 162*91e1e26aSAlexander Pyhalov } else { 163*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL; 164*91e1e26aSAlexander Pyhalov } 165*91e1e26aSAlexander Pyhalov break; 166*91e1e26aSAlexander Pyhalov case U4: 167*91e1e26aSAlexander Pyhalov n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code); 168*91e1e26aSAlexander Pyhalov if (n != 0) { /* legal unicode;illegal Big5 */ 169*91e1e26aSAlexander Pyhalov st->_errno = errno = EILSEQ; 170*91e1e26aSAlexander Pyhalov break; 171*91e1e26aSAlexander Pyhalov } 172*91e1e26aSAlexander Pyhalov 173*91e1e26aSAlexander Pyhalov n = utf8_to_ibm(unidx, ibm_code, 174*91e1e26aSAlexander Pyhalov *outbuf, *outbytesleft); 175*91e1e26aSAlexander Pyhalov if (n > 0) { 176*91e1e26aSAlexander Pyhalov (*outbuf) += n; 177*91e1e26aSAlexander Pyhalov (*outbytesleft) -= n; 178*91e1e26aSAlexander Pyhalov } else { 179*91e1e26aSAlexander Pyhalov st->_errno = errno; 180*91e1e26aSAlexander Pyhalov return((size_t)-1); 181*91e1e26aSAlexander Pyhalov } 182*91e1e26aSAlexander Pyhalov st->ustate = U0; 183*91e1e26aSAlexander Pyhalov st->_errno = 0; 184*91e1e26aSAlexander Pyhalov break; 185*91e1e26aSAlexander Pyhalov default: /* should never come here */ 186*91e1e26aSAlexander Pyhalov st->_errno = errno = EILSEQ; 187*91e1e26aSAlexander Pyhalov st->ustate = U0; /* reset state */ 188*91e1e26aSAlexander Pyhalov break; 189*91e1e26aSAlexander Pyhalov } 190*91e1e26aSAlexander Pyhalov 191*91e1e26aSAlexander Pyhalov (*inbuf)++; 192*91e1e26aSAlexander Pyhalov (*inbytesleft)--; 193*91e1e26aSAlexander Pyhalov 194*91e1e26aSAlexander Pyhalov if (st->_errno) { 195*91e1e26aSAlexander Pyhalov #ifdef DEBUG 196*91e1e26aSAlexander Pyhalov fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n", 197*91e1e26aSAlexander Pyhalov st->_errno, st->ustate); 198*91e1e26aSAlexander Pyhalov #endif 199*91e1e26aSAlexander Pyhalov break; 200*91e1e26aSAlexander Pyhalov } 201*91e1e26aSAlexander Pyhalov 202*91e1e26aSAlexander Pyhalov if (errno) 203*91e1e26aSAlexander Pyhalov return((size_t)-1); 204*91e1e26aSAlexander Pyhalov } 205*91e1e26aSAlexander Pyhalov 206*91e1e26aSAlexander Pyhalov if (*outbytesleft == 0) { 207*91e1e26aSAlexander Pyhalov errno = E2BIG; 208*91e1e26aSAlexander Pyhalov return((size_t)-1); 209*91e1e26aSAlexander Pyhalov } 210*91e1e26aSAlexander Pyhalov return (*inbytesleft); 211*91e1e26aSAlexander Pyhalov } 212*91e1e26aSAlexander Pyhalov 213*91e1e26aSAlexander Pyhalov 214*91e1e26aSAlexander Pyhalov /* 215*91e1e26aSAlexander Pyhalov * Match IBM code by UTF8 code; 216*91e1e26aSAlexander Pyhalov * Return: = 0 - match from Unicode to IBM found 217*91e1e26aSAlexander Pyhalov * = 1 - match from Unicode to IBM NOT found 218*91e1e26aSAlexander Pyhalov * 219*91e1e26aSAlexander Pyhalov * Since binary search of the UTF8 to IBM table is necessary, might as well 220*91e1e26aSAlexander Pyhalov * return index and IBM code matching to the unicode. 221*91e1e26aSAlexander Pyhalov */ 222*91e1e26aSAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code) 223*91e1e26aSAlexander Pyhalov _icv_state *st; 224*91e1e26aSAlexander Pyhalov char c1, c2; 225*91e1e26aSAlexander Pyhalov int *unidx; 226*91e1e26aSAlexander Pyhalov unsigned long *ibm_code; 227*91e1e26aSAlexander Pyhalov { 228*91e1e26aSAlexander Pyhalov unsigned long unicode; 229*91e1e26aSAlexander Pyhalov 230*91e1e26aSAlexander Pyhalov unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE); 231*91e1e26aSAlexander Pyhalov *unidx = bisearch(unicode, st, st->table_size); 232*91e1e26aSAlexander Pyhalov if ((*unidx) >= 0) 233*91e1e26aSAlexander Pyhalov { 234*91e1e26aSAlexander Pyhalov if ( st->left_to_right ) 235*91e1e26aSAlexander Pyhalov *ibm_code = st->table[*unidx].right_code; 236*91e1e26aSAlexander Pyhalov else 237*91e1e26aSAlexander Pyhalov *ibm_code = st->table[*unidx].left_code; 238*91e1e26aSAlexander Pyhalov } 239*91e1e26aSAlexander Pyhalov else 240*91e1e26aSAlexander Pyhalov ; /* match from UTF8 to IBM not found */ 241*91e1e26aSAlexander Pyhalov #ifdef DEBUG 242*91e1e26aSAlexander Pyhalov fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code); 243*91e1e26aSAlexander Pyhalov #endif 244*91e1e26aSAlexander Pyhalov 245*91e1e26aSAlexander Pyhalov return(0); 246*91e1e26aSAlexander Pyhalov } 247*91e1e26aSAlexander Pyhalov 248*91e1e26aSAlexander Pyhalov 249*91e1e26aSAlexander Pyhalov /* 250*91e1e26aSAlexander Pyhalov * ISO/IEC 10646 (Unicode) --> IBM 251*91e1e26aSAlexander Pyhalov * Unicode --> UTF8 (FSS-UTF) 252*91e1e26aSAlexander Pyhalov * (File System Safe Universal Character Set Transformation Format) 253*91e1e26aSAlexander Pyhalov * Return: > 0 - converted with enough space in output buffer 254*91e1e26aSAlexander Pyhalov * = 0 - no space in outbuf 255*91e1e26aSAlexander Pyhalov */ 256*91e1e26aSAlexander Pyhalov int utf8_to_ibm(unidx, ibm_code, buf, buflen) 257*91e1e26aSAlexander Pyhalov int unidx; 258*91e1e26aSAlexander Pyhalov unsigned long ibm_code; 259*91e1e26aSAlexander Pyhalov char *buf; 260*91e1e26aSAlexander Pyhalov size_t buflen; 261*91e1e26aSAlexander Pyhalov 262*91e1e26aSAlexander Pyhalov { 263*91e1e26aSAlexander Pyhalov unsigned long val; /* IBM value */ 264*91e1e26aSAlexander Pyhalov char c1, c2, ibm_str[3]; 265*91e1e26aSAlexander Pyhalov 266*91e1e26aSAlexander Pyhalov if (unidx < 0) /* no match from UTF8 to IBM */ 267*91e1e26aSAlexander Pyhalov ibm_code = (unsigned long)NON_ID_CHAR; 268*91e1e26aSAlexander Pyhalov 269*91e1e26aSAlexander Pyhalov { 270*91e1e26aSAlexander Pyhalov val = ibm_code & 0xffff; 271*91e1e26aSAlexander Pyhalov c1 = (char) ((val & 0xff00) >> 8); 272*91e1e26aSAlexander Pyhalov c2 = (char) (val & 0xff); 273*91e1e26aSAlexander Pyhalov } 274*91e1e26aSAlexander Pyhalov 275*91e1e26aSAlexander Pyhalov *buf = ibm_str[0] = c1; 276*91e1e26aSAlexander Pyhalov *(buf+1) = ibm_str[1] = c2; 277*91e1e26aSAlexander Pyhalov ibm_str[2] = NULL; 278*91e1e26aSAlexander Pyhalov 279*91e1e26aSAlexander Pyhalov #ifdef DEBUG 280*91e1e26aSAlexander Pyhalov fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1)); 281*91e1e26aSAlexander Pyhalov #endif 282*91e1e26aSAlexander Pyhalov 283*91e1e26aSAlexander Pyhalov 284*91e1e26aSAlexander Pyhalov if (buflen < 2) { 285*91e1e26aSAlexander Pyhalov errno = E2BIG; 286*91e1e26aSAlexander Pyhalov return(0); 287*91e1e26aSAlexander Pyhalov } 288*91e1e26aSAlexander Pyhalov 289*91e1e26aSAlexander Pyhalov return(2); 290*91e1e26aSAlexander Pyhalov } 291