/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2000, by Sun Microsystems, Inc. * All rights reserved. */ #include #include #include #include #include #include "common_defs.h" #include "big5hk_unicode.h" /* HKSCS to Unicode mapping table */ #define MSB 0x80 /* most significant bit */ #define MBYTE 0x8e /* multi-byte (4 byte character) */ #define PMASK 0xa0 /* plane number mask */ #define ONEBYTE 0xff /* right most byte */ /* non-identified character */ #define UTF8_NON_ID_CHAR1 0xEF #define UTF8_NON_ID_CHAR2 0xBF #define UTF8_NON_ID_CHAR3 0xBD typedef struct _icv_state { char keepc[2]; /* maximum # byte of HKSCS code */ short cstate; /* state machine id */ int _errno; /* internal errno */ boolean little_endian; boolean bom_written; }_iconv_st; enum _CSTATE { C0, C1 }; static int hkscs_2nd_byte(char); static int hkscs_to_utf8(_iconv_st *, char*, size_t, int *); static int binsearch(unsigned long, hkscs_utf[], int); /* * Open; called from iconv_open() */ void * _icv_open() { _iconv_st *st; if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) { errno = ENOMEM; return ((void *) -1); } st->cstate = C0; st->_errno = 0; st->little_endian = false; st->bom_written = false; #if defined(UCS_2LE) st->little_endian = true; st->bom_written = true; #endif return ((void *) st); } /* * Close; called from iconv_close() */ void _icv_close(_iconv_st *st) { if (!st) errno = EBADF; else free(st); } /* * Actual conversion; called from iconv() */ /*======================================================= * * State Machine for interpreting HKSCS code * *======================================================= * * 1st C * +--------> C0 ----------> C1 * | ascii | 2nd C | * ^ v v * +----<-----+-----<--------+ * *=======================================================*/ /* * HKSCS encoding range: * High byte: 0x81 - 0xFE * Low byte: 0x40 - 0x7E, 0xA1 - 0xFE * * For HKSCS: * 0x8140 - 0x8DFE ( 641 encoding space) * 0x8E40 - 0xA0FE ( 2898 encoding space) * 0xC6A1 - 0xC8FE ( 359 encoding space) * 0xF9D6 - 0xF9FE ( 41 encoding space) * 0xFA40 - 0xFEFE ( 763 encoding space) * Total: 4702 * For BIG5 * 0xA140 - 0xC8FE * 0xC940 - 0xFEFE */ size_t _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { int n; int uconv_num = 0; #ifdef DEBUG fprintf(stderr, "========== iconv(): HKSCS --> UTF2 ==========\n"); #endif if (st == NULL) { errno = EBADF; return ((size_t) -1); } if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */ st->cstate = C0; st->_errno = 0; return ((size_t) 0); } st->_errno = 0; /* reset internal errno */ errno = 0; /* reset external errno */ /* a state machine for interpreting CNS 11643 code */ while (*inbytesleft > 0 && *outbytesleft > 0) { switch (st->cstate) { case C0: /* assuming ASCII in the beginning */ if (**inbuf & MSB) { st->keepc[0] = (**inbuf); st->cstate = C1; } else { /* real ASCII */ /* * code conversion for UCS-2LE to support Samba */ if (st->little_endian) { if (!st->bom_written) { if (*outbytesleft < 4) errno = E2BIG; else { *(*outbuf)++ = (uchar_t)0xff; *(*outbuf)++ = (uchar_t)0xfe; *outbytesleft -= 2; st->bom_written = true; } } if (*outbytesleft < 2) errno = E2BIG; else { *(*outbuf)++ = **inbuf; *(*outbuf)++ = (uchar_t)0x0; *outbytesleft -= 2; } } else { **outbuf = **inbuf; (*outbuf)++; (*outbytesleft)--; } } break; case C1: /* Chinese characters: 2nd byte */ if (hkscs_2nd_byte(**inbuf) == 0) { int uconv_num_internal = 0; st->keepc[1] = (**inbuf); n = hkscs_to_utf8(st, *outbuf, *outbytesleft, &uconv_num_internal); if (n > 0) { (*outbuf) += n; (*outbytesleft) -= n; uconv_num += uconv_num_internal; st->cstate = C0; } else { /* don't reset state */ st->_errno = errno = E2BIG; } } else { /* input char doesn't belong * to the input code set */ st->_errno = errno = EILSEQ; } break; default: /* should never come here */ st->_errno = errno = EILSEQ; st->cstate = C0; /* reset state */ break; } if (st->_errno) { #ifdef DEBUG fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n", st->_errno, st->cstate); #endif break; } (*inbuf)++; (*inbytesleft)--; } if (*inbytesleft == 0 && st->cstate != C0) errno = EINVAL; if (*inbytesleft > 0 && *outbytesleft == 0) errno = E2BIG; if (errno) { /* * if error, *inbuf points to the byte following the last byte * successfully used in the conversion. */ *inbuf -= (st->cstate - C0); *inbytesleft += (st->cstate - C0); st->cstate = C0; return ((size_t) -1); } return uconv_num; } /* * Test whether inbuf is a valid character for 2nd byte HKSCS code * Return: = 0 - valid HKSCS 2nd byte * = 1 - invalid HKSCS 2nd byte */ static int hkscs_2nd_byte(char inbuf) { unsigned int buf = (unsigned int) (inbuf & ONEBYTE); if ((buf >= 0x40) && (buf <= 0x7E)) return (0); if ((buf >= 0xA1) && (buf <= 0xFE)) return (0); return(1); } #ifdef UDC_SUPPORT typedef struct _udc_sect { unsigned int start, end, count; } UDC; UDC udc[] = { { 0x8140, 0x84FE, 0x274 } }; #define START_UNICODE 0xF0000 static int ifUDC(UDC *udc, unsigned int code) { int i; for (i=0; i < 1; ++i) if (code >= udc[i].start && code <= udc[i].end) { unsigned char c1, c2, leading_c1; c1 = (unsigned char)(code >> 8); c2 = (unsigned char)code; leading_c1 = (unsigned char) (udc[i].start >> 8); return START_UNICODE + (i ? udc[i-1].count : 0) + \ (c1 - leading_c1) * 157 + ((c2 <= 0x7E) ? (c2 - 0x40) : ((c2 - 0x40) - (0xA1 - 0x7F))); } return 0; } #endif /* * HKSCS code --> ISO/IEC 10646 (Unicode) * Unicode --> UTF8 (FSS-UTF) * (File System Safe Universal Character Set Transformation Format) * Return: > 0 - converted with enough space in output buffer * = 0 - no space in outbuf */ static int hkscs_to_utf8(_iconv_st *st, char *buf, size_t buflen, int *uconv_num) { unsigned long hkscs_val; /* HKSCS value */ int unidx = 0; /* Unicode index */ unsigned long uni_val = 0; /* Unicode */ char *keepc = st->keepc; hkscs_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE); #ifdef DEBUG fprintf(stderr, "%x\t", hkscs_val); #endif #ifdef UDC_SUPPORT if ((uni_val = ifUDC(udc, hkscs_val)) == 0) { #endif unidx = binsearch(hkscs_val, hkscs_utf_tab, MAX_HKSCS_NUM); if (unidx >= 0) uni_val = hkscs_utf_tab[unidx].unicode; #ifdef UDC_SUPPORT } #endif #ifdef DEBUG fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val); #endif /* * Code version for UCS-2LE to support Samba */ if (st->little_endian) { int size = 0; if (unidx < 0 || uni_val > 0x00ffff ) { uni_val = ICV_CHAR_UCS2_REPLACEMENT; *uconv_num = 1; } if (!st->bom_written) { if (buflen < 4) return 0; *(buf + size++) = (uchar_t)0xff; *(buf + size++) = (uchar_t)0xfe; st->bom_written = true; } if (buflen < 2) return 0; *(buf + size++) = (uchar_t)(uni_val & 0xff); *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff); return size; } if (unidx >= 0) { /* do Unicode to UTF8 conversion */ if (uni_val >= 0x0080 && uni_val <= 0x07ff) { if (buflen < 2) { #ifdef DEBUG fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n"); #endif errno = E2BIG; return(0); } *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0; *(buf+1) = (char)(uni_val & 0x3f) | 0x80; #ifdef DEBUG fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE); #endif return(2); } if (uni_val >= 0x0800 && uni_val <= 0xffff) { if (buflen < 3) { #ifdef DEBUG fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n"); #endif errno = E2BIG; return(0); } *buf = (char)((uni_val >> 12) & 0xf) | 0xe0; *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80; *(buf+2) = (char)(uni_val & 0x3f) | 0x80; #ifdef DEBUG fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE); #endif return(3); } if (uni_val >= 0x10000 && uni_val <= 0x10ffff) { if (buflen < 4) { errno = E2BIG; return 0; } *buf = (char)((uni_val >> 18) & 0x7) | 0xf0; *(buf+1) = (char)((uni_val >> 12) & 0x3f) | 0x80; *(buf+2) = (char)((uni_val >>6) & 0x3f) | 0x80; *(buf+3) = (char)(uni_val & 0x3f) | 0x80; return(4); } } /* can't find a match in HKSCS --> UTF8 table or illegal UTF8 code */ if (buflen < 3) { #ifdef DEBUG fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n"); #endif errno = E2BIG; return(0); } *(unsigned char*) buf = UTF8_NON_ID_CHAR1; *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2; *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3; /* non-identical conversion */ *uconv_num = 1; #ifdef DEBUG fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2)); #endif return(3); } /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */ static int binsearch(unsigned long x, hkscs_utf v[], int n) { int low, high, mid; low = 0; high = n - 1; while (low <= high) { mid = (low + high) / 2; if (x < v[mid].hkscscode) high = mid - 1; else if (x > v[mid].hkscscode) low = mid + 1; else /* found match */ return mid; } return (-1); /* no match */ }