1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright(c) 2001 Sun Microsystems, Inc. 23 * All rights reserved. 24 */ 25 #include <stdio.h> 26 #include <errno.h> 27 #include <stdlib.h> 28 #include <strings.h> 29 #include <sys/types.h> 30 #include "iscii.h" 31 #include "common_defs.h" 32 33 #define MSB 0x80 /* most significant bit */ 34 #define ONEBYTE 0xff /* right most byte */ 35 36 #define REPLACE_CHAR '?' 37 38 #define utf8_len(Ch) (Ch < 0x80 ? 1 : (Ch < 0xe0 ? 2 : (Ch < 0xf0 ? 3 : (Ch < 0xf8 ? 4 : (Ch < 0xfc ? 5 : 6))))) 39 40 #define analyze_utf8(Ch, Mask, nBytes) \ 41 if (Ch < 128) { \ 42 nBytes = 1; \ 43 Mask = 0x7f; \ 44 } else if ((Ch & 0xe0) == 0xc0) { \ 45 nBytes = 2; \ 46 Mask = 0x1f; \ 47 } else if ((Ch & 0xf0) == 0xe0) { \ 48 nBytes = 3; \ 49 Mask = 0x0f; \ 50 } else if ((Ch & 0xf8) == 0xf0) { \ 51 nBytes = 4; \ 52 Mask = 0x07; \ 53 } else if ((Ch & 0xfc) == 0xf8) { \ 54 nBytes = 5; \ 55 Mask = 0x03; \ 56 } else if ((Ch & 0xfe) == 0xfc) { \ 57 nBytes = 6; \ 58 Mask = 0x01; \ 59 } else \ 60 nBytes = -1; 61 62 #define ucs2_from_utf8(mUCS, Ch, Ct, Mask, Len) \ 63 (mUCS) = (Ch)[0] & (Mask); \ 64 for ((Ct) = 1; (Ct) < (Len); ++(Ct)) { \ 65 if ( ( (Ch)[(Ct)] & 0xc0) != 0x80) { \ 66 (mUCS) = -1; \ 67 break; \ 68 } \ 69 (mUCS) <<= 6; \ 70 (mUCS) |= ((Ch)[(Ct)] & 0x3f); \ 71 } \ 72 73 74 typedef struct _icv_state { 75 char aATR; 76 uchar_t keepc[4]; 77 int halant_context; /* preceded by the Halant character or not */ 78 int _ustate; 79 int _errno; 80 } _iconv_st; 81 82 enum _CSTATE { U0, U1, U2, U3, U4, U5, U6 }; 83 84 /* 85 * Open; called from iconv_open() 86 */ 87 void * 88 _icv_open() 89 { 90 _iconv_st *st; 91 92 if ((st = (_iconv_st*)malloc(sizeof(_iconv_st))) == NULL) { 93 errno = ENOMEM; 94 return ((void*)-1); 95 } 96 97 bzero(st, sizeof(_iconv_st)); 98 st->aATR = 0x42; /* Devanagiri */ 99 100 return ((void*)st); 101 } 102 103 typedef enum { t_NONE, t_NUKTA, t_EXT, t_HALANT, t_DOUBLE_DANDA } Type; 104 105 static int 106 traverse_table(Entry *entry, int num, ucs_t ucs, Type *type) 107 { 108 int i=0; 109 int retc=0; 110 111 *type = t_NONE; 112 113 for ( ; i < num; ++i ) { 114 Entry en = entry[i]; 115 116 if (en.count == NUKTA || en.count == EXT || en.count == HALANT || en.count == DOUBLE_DANDA) { 117 if ( ucs < en.ucs ) break; 118 if ( ucs == en.ucs ) { /* found */ 119 if ( en.count == NUKTA ) *type = t_NUKTA; 120 if ( en.count == EXT ) *type = t_EXT; 121 if ( en.count == HALANT ) *type = t_HALANT; 122 if ( en.count == DOUBLE_DANDA ) *type = t_DOUBLE_DANDA; 123 retc = en.iscii; 124 break; 125 } 126 } else { 127 if ( ucs < en.ucs ) break; 128 if ( ucs >= en.ucs && ucs < en.ucs + en.count ) { 129 retc = en.iscii + ( ucs - en.ucs ); 130 break; 131 } 132 } 133 } 134 135 return retc; 136 } 137 138 static int 139 ucs_to_iscii(ucs_t uiid, char **outbuf, size_t *outbytesleft, int isc_type, int *halant_context) 140 { 141 int nBytesRet = 0 ; 142 Type type = t_NONE; 143 int iscii; 144 Entries en = unicode_table[isc_type]; 145 146 if ( *outbytesleft == 0 ) { 147 errno = E2BIG; 148 return 0; 149 } 150 151 iscii = traverse_table(en.entry, en.items, uiid, &type); 152 if ( iscii == 0 ) { 153 **outbuf = REPLACE_CHAR; 154 nBytesRet ++; 155 } else { 156 if ( type != t_NONE ) { 157 158 /* buggy code */ 159 if ( *outbytesleft < 2 ) { 160 errno = E2BIG; 161 return 0; 162 } 163 164 switch (type) 165 { 166 case t_NUKTA: 167 **outbuf = (uchar_t) iscii; 168 *(*outbuf+1) = ISC_nukta; 169 nBytesRet = 2; 170 171 break; 172 case t_EXT: 173 **outbuf = ISC_ext; 174 *(*outbuf+1) = (uchar_t) iscii; 175 nBytesRet = 2; 176 177 break; 178 case t_HALANT: 179 if ( (uiid == UNI_ZWJ || uiid == UNI_ZWNJ) && *halant_context ) 180 { 181 if ( uiid == UNI_ZWJ ) **outbuf = ISC_nukta; /* soft halant */ 182 else **outbuf = ISC_halant; /* explicit halant */ 183 184 nBytesRet = 1; 185 } /* consume the UNI_ZWNJ or UNI_ZWJ if *halant_context is 0 */ 186 187 break; 188 case t_DOUBLE_DANDA: 189 **outbuf = ISC_danda; 190 *(*outbuf+1) = (uchar_t) iscii; 191 nBytesRet = 2; 192 break; 193 case t_NONE: 194 /* Not reached */ 195 break; 196 } 197 } else { 198 **outbuf = (uchar_t) iscii; 199 nBytesRet = 1; 200 } 201 } 202 203 /* if iscii == ISC_halant but type == t_HALANT, set *halant_context to 0 */ 204 if ( iscii == ISC_halant && type == t_NONE ) *halant_context = 1; 205 else *halant_context = 0; 206 207 return nBytesRet; 208 } 209 210 /* 211 * Close; called from iconv_close() 212 */ 213 void 214 _icv_close(_iconv_st *st) 215 { 216 if (!st) 217 errno = EBADF; 218 else 219 free(st); 220 } 221 222 /* 223 * Conversion routine; called from iconv() 224 */ 225 size_t 226 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, 227 char **outbuf, size_t *outbytesleft) 228 { 229 int n=0; 230 231 if (st == NULL) { 232 errno = EBADF; 233 return ((size_t) -1); 234 } 235 236 237 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ 238 st->aATR = 0x42; /* Devangiri */ 239 st->_ustate = U0; 240 st->_errno = 0; 241 return ((size_t) 0); 242 } 243 244 st->_errno = errno = 0; 245 246 while (*inbytesleft > 0 && *outbytesleft > 0) { 247 248 uchar_t first_byte; 249 250 switch ( st->_ustate ) { 251 case U0: 252 if ((**inbuf & MSB) == 0) { /* ASCII */ 253 **outbuf = **inbuf; 254 (*outbuf)++; (*outbytesleft)--; 255 } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */ 256 257 /* invalid sequence if the first byte is either 0xc0 or 0xc1 */ 258 if ( number_of_bytes_in_utf8_char[((uchar_t) **inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 259 errno = EILSEQ; 260 else { 261 st->_ustate = U1; 262 st->keepc[0] = **inbuf; 263 } 264 } else if ((**inbuf & 0xf0) == 0xe0) { 265 st->_ustate = U2; 266 st->keepc[0] = **inbuf; 267 } else { 268 /* four bytes of UTF-8 sequences */ 269 if ( number_of_bytes_in_utf8_char[((uchar_t) **inbuf)] == ICV_TYPE_ILLEGAL_CHAR ) 270 errno = EILSEQ; 271 else { 272 st->_ustate = U4; 273 st->keepc[0] = **inbuf; 274 } 275 } 276 break; 277 case U1: 278 if ((**inbuf & 0xc0) == MSB) { /* U+0080 -- U+07FF */ 279 **outbuf = REPLACE_CHAR; 280 (*outbuf)++; 281 (*outbytesleft)--; 282 st->_ustate = U0; 283 } else { 284 errno = EILSEQ; 285 } 286 break; 287 case U2: 288 289 first_byte = st->keepc[0]; 290 291 /* if the first byte is 0xed, it is illegal sequence if the second 292 * one is between 0xa0 and 0xbf because surrogate section is ill-formed 293 */ 294 if (((uchar_t) **inbuf) < valid_min_2nd_byte[first_byte] || 295 ((uchar_t) **inbuf) > valid_max_2nd_byte[first_byte] ) 296 errno = EILSEQ; 297 else { 298 st->_ustate = U3; 299 st->keepc[1] = **inbuf; 300 } 301 break; 302 case U3: 303 if ((**inbuf & 0xc0) == MSB) { 304 unsigned char mChar = st->keepc[0]; 305 ucs_t ucsid = 0; 306 int i=0, mask=0, len=0; 307 ISCII isc_type; 308 309 st->keepc[2] = **inbuf; 310 311 analyze_utf8(mChar, mask, len); 312 313 ucs2_from_utf8(ucsid, (char *)&st->keepc[0], i, mask, len); 314 315 /* 0xfffe and 0xffff should not be allowed */ 316 if ( ucsid == 0xFFFE || ucsid == 0xFFFF ) 317 { 318 errno = EILSEQ; 319 break; 320 } 321 322 get_script_types(ucsid, isc_type); 323 if ( isc_type != NUM_ISCII && st->aATR != aTRs[isc_type] ) { 324 if ( *outbytesleft < 2 ) { 325 errno = E2BIG; 326 return (size_t)-1; 327 } 328 329 **outbuf = (uchar_t)ISC_atr; 330 (*outbuf)++; 331 **outbuf = aTRs[isc_type]; 332 (*outbuf)++; 333 (*outbytesleft)-=2; 334 st->aATR = aTRs[isc_type]; 335 } 336 337 /* UNI_INV, UNI_ZWJ, UNI_ZWNJ would occur within any India Script as 338 Consonant invisible, explicit halant and soft halant */ 339 if ( ucsid == UNI_INV || ucsid == UNI_ZWNJ || ucsid == UNI_ZWJ ) 340 isc_type = isc_TYPE[ st->aATR - 0x42 ]; 341 342 if ( isc_type == NUM_ISCII ) { 343 if ( *outbytesleft < 1 ) { 344 errno = E2BIG; 345 return (size_t)-1; 346 } 347 348 **outbuf = REPLACE_CHAR; 349 (*outbuf)++; 350 (*outbytesleft)--; 351 } else { 352 n = ucs_to_iscii(ucsid, outbuf, outbytesleft, isc_type, &st->halant_context); 353 if ( n > 0 ) { 354 (*outbuf) += n; 355 (*outbytesleft) -= n; 356 } else if ( errno == E2BIG ) { 357 /* n == 0 if the ZWJ or ZWNJ has been consumed without error */ 358 st->_errno = errno; 359 errno = E2BIG; 360 return (size_t)-1; 361 } 362 } 363 } else { 364 errno = EILSEQ; 365 return (size_t)-1; 366 } 367 st->_ustate = U0; 368 break; 369 case U4: 370 371 first_byte = st->keepc[0]; 372 373 /* if the first byte is 0xf0, it is illegal sequence if 374 * the second one is between 0x80 and 0x8f 375 * for Four-Byte UTF: U+10000..U+10FFFF 376 */ 377 if (((uchar_t) **inbuf) < valid_min_2nd_byte[first_byte] || 378 ((uchar_t) **inbuf) > valid_max_2nd_byte[first_byte] ) 379 errno = EILSEQ; 380 else { 381 st->_ustate = U5; 382 st->keepc[1] = **inbuf; 383 } 384 break; 385 case U5: 386 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 387 { 388 st->_ustate = U6; 389 st->keepc[2] = **inbuf; 390 } 391 else 392 errno = EILSEQ; 393 break; 394 case U6: 395 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */ 396 { 397 st->keepc[3] = **inbuf; 398 st->_ustate = U0; 399 400 /* replace with REPLACE_CHAR */ 401 **outbuf = REPLACE_CHAR; 402 (*outbuf)++; 403 (*outbytesleft)--; 404 } 405 else 406 errno = EILSEQ; 407 break; 408 } 409 410 if (errno) 411 break; 412 413 (*inbuf)++; 414 (*inbytesleft)--; 415 } /* end of while loop */ 416 417 if (errno) return (size_t) -1; 418 419 if (*inbytesleft == 0 && st->_ustate != U0) { 420 errno = EINVAL; 421 return (size_t)-1; 422 } 423 424 if (*inbytesleft > 0 && *outbytesleft == 0) { 425 errno = E2BIG; 426 return((size_t)-1); 427 } 428 429 return (size_t)(*inbytesleft); 430 } 431