116d86563SAlexander Pyhalov /*
216d86563SAlexander Pyhalov * CDDL HEADER START
316d86563SAlexander Pyhalov *
416d86563SAlexander Pyhalov * The contents of this file are subject to the terms of the
516d86563SAlexander Pyhalov * Common Development and Distribution License (the "License").
616d86563SAlexander Pyhalov * You may not use this file except in compliance with the License.
716d86563SAlexander Pyhalov *
816d86563SAlexander Pyhalov * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
916d86563SAlexander Pyhalov * or http://www.opensolaris.org/os/licensing.
1016d86563SAlexander Pyhalov * See the License for the specific language governing permissions
1116d86563SAlexander Pyhalov * and limitations under the License.
1216d86563SAlexander Pyhalov *
1316d86563SAlexander Pyhalov * When distributing Covered Code, include this CDDL HEADER in each
1416d86563SAlexander Pyhalov * file and include the License file at src/OPENSOLARIS.LICENSE.
1516d86563SAlexander Pyhalov * If applicable, add the following below this CDDL HEADER, with the
1616d86563SAlexander Pyhalov * fields enclosed by brackets "[]" replaced with your own identifying
1716d86563SAlexander Pyhalov * information: Portions Copyright [yyyy] [name of copyright owner]
1816d86563SAlexander Pyhalov *
1916d86563SAlexander Pyhalov * CDDL HEADER END
2016d86563SAlexander Pyhalov */
2116d86563SAlexander Pyhalov
2216d86563SAlexander Pyhalov /*
2316d86563SAlexander Pyhalov * Copyright (c) 1997, by Sun Microsystems, Inc.
2416d86563SAlexander Pyhalov * All rights reserved.
2516d86563SAlexander Pyhalov */
2616d86563SAlexander Pyhalov
2716d86563SAlexander Pyhalov #include <stdio.h>
2816d86563SAlexander Pyhalov #include <stdlib.h>
2916d86563SAlexander Pyhalov #include <errno.h>
3016d86563SAlexander Pyhalov #include <sys/types.h>
3116d86563SAlexander Pyhalov
3216d86563SAlexander Pyhalov #include "tab_lookup.h" /* table lookup data types */
3316d86563SAlexander Pyhalov
#define MSB	0x80	/* most significant bit */
#define ONEBYTE	0xff	/* right most byte */

/*
 * States of the UTF-8 decoder in _icv_iconv():
 *   U0  - expecting the first byte of a character
 *   U1  - lead byte of a 2-byte sequence seen; expecting its 2nd byte
 *   U11 - first byte was neither ASCII nor a 2-/3-byte lead byte;
 *         expecting one more byte, the pair is looked up as-is
 *   U2  - lead byte of a 3-byte sequence seen; expecting its 2nd byte
 *   U3  - expecting the 3rd byte of a 3-byte sequence
 *   U4  - a complete character is available; look it up and emit it
 */
enum _USTATE	{ U0, U1, U11, U2, U3, U4 };


/*
 * Look up the table entry for the 16-bit value (c1 << 8) | c2; the index
 * is stored in *unidx and the matching code in *ibm_code.
 */
int get_ibm_by_utf(_icv_state *st, char c1, char c2, int *unidx,
	unsigned long	*ibm_code);

/*
 * Table search helper, defined outside this file.
 * NOTE(review): presumably returns a negative index when val is not in the
 * table (callers test the result with ">= 0") - confirm against its source.
 */
int bisearch(unsigned long val, _icv_state *st, int n);

/*
 * Write ibm_code (or NON_ID_CHAR when unidx < 0) into buf, inserting
 * SHIFT_IN / SHIFT_OUT bytes as required by st->shift.
 */
int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf,
	size_t buflen, _icv_state *st);
4716d86563SAlexander Pyhalov
4816d86563SAlexander Pyhalov /*
4916d86563SAlexander Pyhalov * Actual conversion; called from iconv()
5016d86563SAlexander Pyhalov * Input is UTF-8 data.
5116d86563SAlexander Pyhalov * first convert to UCS2
5216d86563SAlexander Pyhalov */
5316d86563SAlexander Pyhalov size_t
_icv_iconv(_icv_state * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)5416d86563SAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
5516d86563SAlexander Pyhalov char **outbuf, size_t *outbytesleft)
5616d86563SAlexander Pyhalov {
5716d86563SAlexander Pyhalov /*
5816d86563SAlexander Pyhalov * Actual conversion; called from iconv()
5916d86563SAlexander Pyhalov */
6016d86563SAlexander Pyhalov /*=========================================================
6116d86563SAlexander Pyhalov *
6216d86563SAlexander Pyhalov * State Machine for interpreting UTF8 code
6316d86563SAlexander Pyhalov *
6416d86563SAlexander Pyhalov *=========================================================
6516d86563SAlexander Pyhalov *
6616d86563SAlexander Pyhalov * 3 byte unicode
6716d86563SAlexander Pyhalov * +----->------->-------+
6816d86563SAlexander Pyhalov * | |
6916d86563SAlexander Pyhalov * ^ v
7016d86563SAlexander Pyhalov * | 2 byte U2 ---> U3
7116d86563SAlexander Pyhalov * | unicode v
7216d86563SAlexander Pyhalov * +------> U0 -------> U1 +-------->U4---+
7316d86563SAlexander Pyhalov * ^ ascii | | ^ |
7416d86563SAlexander Pyhalov * | | +-------->--------->--------+ |
7516d86563SAlexander Pyhalov * | v v
7616d86563SAlexander Pyhalov * +----<---+-----<------------<------------<------------+
7716d86563SAlexander Pyhalov *
7816d86563SAlexander Pyhalov * +----<---+-----<------------<------------<------------+
7916d86563SAlexander Pyhalov *
8016d86563SAlexander Pyhalov *=========================================================*/
8116d86563SAlexander Pyhalov
8216d86563SAlexander Pyhalov char c1 = '\0', c2 = '\0';
8316d86563SAlexander Pyhalov int n, unidx;
8416d86563SAlexander Pyhalov unsigned long ibm_code;
8516d86563SAlexander Pyhalov
8616d86563SAlexander Pyhalov #ifdef DEBUG
8716d86563SAlexander Pyhalov fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n");
8816d86563SAlexander Pyhalov #endif
8916d86563SAlexander Pyhalov
9016d86563SAlexander Pyhalov if (st == NULL) {
9116d86563SAlexander Pyhalov errno = EBADF;
9216d86563SAlexander Pyhalov return ((size_t) -1);
9316d86563SAlexander Pyhalov }
9416d86563SAlexander Pyhalov
9516d86563SAlexander Pyhalov if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
9616d86563SAlexander Pyhalov st->ustate = U0;
9716d86563SAlexander Pyhalov st->_errno = 0;
9816d86563SAlexander Pyhalov st->shift = SHIFT_IN;
9916d86563SAlexander Pyhalov return ((size_t) 0);
10016d86563SAlexander Pyhalov }
10116d86563SAlexander Pyhalov
10216d86563SAlexander Pyhalov st->_errno = 0; /* reset internal errno */
10316d86563SAlexander Pyhalov errno = 0; /* reset external errno */
10416d86563SAlexander Pyhalov
10516d86563SAlexander Pyhalov /* a state machine for interpreting UTF8 code */
10616d86563SAlexander Pyhalov while (*inbytesleft > 0 && *outbytesleft > 0) {
10716d86563SAlexander Pyhalov switch (st->ustate) {
10816d86563SAlexander Pyhalov case U0:
10916d86563SAlexander Pyhalov /* it is ascii, convert it immediately */
11016d86563SAlexander Pyhalov if ((**inbuf & MSB) == 0) { /* ASCII */
11116d86563SAlexander Pyhalov st->ustate = U4;
11216d86563SAlexander Pyhalov st->keepc[0] = **inbuf;
11316d86563SAlexander Pyhalov c1 = 0x0;
11416d86563SAlexander Pyhalov c2 = **inbuf;
11516d86563SAlexander Pyhalov continue;
11616d86563SAlexander Pyhalov } else { /* Chinese character */
11716d86563SAlexander Pyhalov if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
11816d86563SAlexander Pyhalov st->ustate = U1;
11916d86563SAlexander Pyhalov st->keepc[0] = **inbuf;
12016d86563SAlexander Pyhalov } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */
12116d86563SAlexander Pyhalov st->ustate = U2;
12216d86563SAlexander Pyhalov st->keepc[0] = **inbuf;
12316d86563SAlexander Pyhalov } else { /* illegal unicode */
12416d86563SAlexander Pyhalov /* st->_errno = errno = EINVAL; */
12516d86563SAlexander Pyhalov /* possible UNICODE ko_KR-UTF8 */
12616d86563SAlexander Pyhalov c1 =st->keepc[0] = **inbuf;
12716d86563SAlexander Pyhalov st->ustate = U11;
12816d86563SAlexander Pyhalov break;
12916d86563SAlexander Pyhalov }
13016d86563SAlexander Pyhalov }
13116d86563SAlexander Pyhalov break;
13216d86563SAlexander Pyhalov case U1: /* 2 byte unicode */
13316d86563SAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
13416d86563SAlexander Pyhalov st->ustate = U4;
13516d86563SAlexander Pyhalov st->keepc[1] = **inbuf;
13616d86563SAlexander Pyhalov c1 = (st->keepc[0]&0x1c)>>2;
13716d86563SAlexander Pyhalov c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
13816d86563SAlexander Pyhalov #ifdef DEBUG
13916d86563SAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x --> ",
14016d86563SAlexander Pyhalov st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
14116d86563SAlexander Pyhalov #endif
14216d86563SAlexander Pyhalov continue; /* should not advance *inbuf */
14316d86563SAlexander Pyhalov } else {
14416d86563SAlexander Pyhalov st->_errno = errno = EINVAL;
14516d86563SAlexander Pyhalov }
14616d86563SAlexander Pyhalov break;
14716d86563SAlexander Pyhalov case U11: /* 3 byte unicode - 2nd byte */
14816d86563SAlexander Pyhalov c2 =st->keepc[1] = **inbuf;
14916d86563SAlexander Pyhalov st->ustate = U4;
15016d86563SAlexander Pyhalov continue;
15116d86563SAlexander Pyhalov break;
15216d86563SAlexander Pyhalov case U2: /* 3 byte unicode - 2nd byte */
15316d86563SAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
15416d86563SAlexander Pyhalov st->ustate = U3;
15516d86563SAlexander Pyhalov st->keepc[1] = **inbuf;
15616d86563SAlexander Pyhalov } else {
15716d86563SAlexander Pyhalov st->_errno = errno = EINVAL;
15816d86563SAlexander Pyhalov }
15916d86563SAlexander Pyhalov break;
16016d86563SAlexander Pyhalov case U3: /* 3 byte unicode - 3rd byte */
16116d86563SAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
16216d86563SAlexander Pyhalov st->ustate = U4;
16316d86563SAlexander Pyhalov st->keepc[2] = **inbuf;
16416d86563SAlexander Pyhalov c1 = ((st->keepc[0]&0x0f)<<4) |
16516d86563SAlexander Pyhalov ((st->keepc[1]&0x3c)>>2);
16616d86563SAlexander Pyhalov c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
16716d86563SAlexander Pyhalov #ifdef DEBUG
16816d86563SAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
16916d86563SAlexander Pyhalov st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
17016d86563SAlexander Pyhalov #endif
17116d86563SAlexander Pyhalov continue; /* should not advance *inbuf */
17216d86563SAlexander Pyhalov } else {
17316d86563SAlexander Pyhalov st->_errno = errno = EINVAL;
17416d86563SAlexander Pyhalov }
17516d86563SAlexander Pyhalov break;
17616d86563SAlexander Pyhalov case U4:
17716d86563SAlexander Pyhalov n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
17816d86563SAlexander Pyhalov if (n != 0) { /* legal unicode;illegal Big5 */
17916d86563SAlexander Pyhalov st->_errno = errno = EILSEQ;
18016d86563SAlexander Pyhalov break;
18116d86563SAlexander Pyhalov }
18216d86563SAlexander Pyhalov
18316d86563SAlexander Pyhalov n = utf8_to_ibm(unidx, ibm_code,
18416d86563SAlexander Pyhalov *outbuf, *outbytesleft, st);
18516d86563SAlexander Pyhalov if (n > 0) {
18616d86563SAlexander Pyhalov (*outbuf) += n;
18716d86563SAlexander Pyhalov (*outbytesleft) -= n;
18816d86563SAlexander Pyhalov } else {
18916d86563SAlexander Pyhalov st->_errno = errno;
19016d86563SAlexander Pyhalov return((size_t)-1);
19116d86563SAlexander Pyhalov }
19216d86563SAlexander Pyhalov st->ustate = U0;
19316d86563SAlexander Pyhalov st->_errno = 0;
19416d86563SAlexander Pyhalov break;
19516d86563SAlexander Pyhalov default: /* should never come here */
19616d86563SAlexander Pyhalov st->_errno = errno = EILSEQ;
19716d86563SAlexander Pyhalov st->ustate = U0; /* reset state */
19816d86563SAlexander Pyhalov break;
19916d86563SAlexander Pyhalov }
20016d86563SAlexander Pyhalov
20116d86563SAlexander Pyhalov (*inbuf)++;
20216d86563SAlexander Pyhalov (*inbytesleft)--;
20316d86563SAlexander Pyhalov
20416d86563SAlexander Pyhalov if (st->_errno) {
20516d86563SAlexander Pyhalov #ifdef DEBUG
20616d86563SAlexander Pyhalov fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
20716d86563SAlexander Pyhalov st->_errno, st->ustate);
20816d86563SAlexander Pyhalov #endif
20916d86563SAlexander Pyhalov break;
21016d86563SAlexander Pyhalov }
21116d86563SAlexander Pyhalov
21216d86563SAlexander Pyhalov if (errno)
21316d86563SAlexander Pyhalov return((size_t)-1);
21416d86563SAlexander Pyhalov }
21516d86563SAlexander Pyhalov
21616d86563SAlexander Pyhalov if (*outbytesleft == 0) {
21716d86563SAlexander Pyhalov errno = E2BIG;
21816d86563SAlexander Pyhalov return((size_t)-1);
21916d86563SAlexander Pyhalov }
22016d86563SAlexander Pyhalov return (*inbytesleft);
22116d86563SAlexander Pyhalov }
22216d86563SAlexander Pyhalov
22316d86563SAlexander Pyhalov
22416d86563SAlexander Pyhalov /*
22516d86563SAlexander Pyhalov * Match IBM code by UTF8 code;
22616d86563SAlexander Pyhalov * Return: = 0 - match from Unicode to IBM found
22716d86563SAlexander Pyhalov * = 1 - match from Unicode to IBM NOT found
22816d86563SAlexander Pyhalov *
22916d86563SAlexander Pyhalov * Since binary search of the UTF8 to IBM table is necessary, might as well
23016d86563SAlexander Pyhalov * return index and IBM code matching to the unicode.
23116d86563SAlexander Pyhalov */
get_ibm_by_utf(st,c1,c2,unidx,ibm_code)23216d86563SAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
23316d86563SAlexander Pyhalov _icv_state *st;
23416d86563SAlexander Pyhalov char c1, c2;
23516d86563SAlexander Pyhalov int *unidx;
23616d86563SAlexander Pyhalov unsigned long *ibm_code;
23716d86563SAlexander Pyhalov {
23816d86563SAlexander Pyhalov unsigned long unicode;
23916d86563SAlexander Pyhalov
24016d86563SAlexander Pyhalov unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
24116d86563SAlexander Pyhalov *unidx = bisearch(unicode, st, st->table_size);
24216d86563SAlexander Pyhalov if ((*unidx) >= 0)
24316d86563SAlexander Pyhalov {
24416d86563SAlexander Pyhalov if ( st->left_to_right )
24516d86563SAlexander Pyhalov *ibm_code = st->table[*unidx].right_code;
24616d86563SAlexander Pyhalov else
24716d86563SAlexander Pyhalov *ibm_code = st->table[*unidx].left_code;
24816d86563SAlexander Pyhalov }
24916d86563SAlexander Pyhalov #ifdef DEBUG
25016d86563SAlexander Pyhalov fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
25116d86563SAlexander Pyhalov #endif
25216d86563SAlexander Pyhalov
25316d86563SAlexander Pyhalov return(0);
25416d86563SAlexander Pyhalov }
25516d86563SAlexander Pyhalov
25616d86563SAlexander Pyhalov
25716d86563SAlexander Pyhalov /*
25816d86563SAlexander Pyhalov * ISO/IEC 10646 (Unicode) --> IBM
25916d86563SAlexander Pyhalov * Unicode --> UTF8 (FSS-UTF)
26016d86563SAlexander Pyhalov * (File System Safe Universal Character Set Transformation Format)
26116d86563SAlexander Pyhalov * Return: > 0 - converted with enough space in output buffer
26216d86563SAlexander Pyhalov * = 0 - no space in outbuf
26316d86563SAlexander Pyhalov */
utf8_to_ibm(int unidx,unsigned long ibm_code,char * buf,size_t buflen,_icv_state * st)264*f642269fSToomas Soome int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf, size_t buflen,
265*f642269fSToomas Soome _icv_state *st)
26616d86563SAlexander Pyhalov {
26716d86563SAlexander Pyhalov unsigned long val; /* IBM value */
26816d86563SAlexander Pyhalov char c1, c2, ibm_str[3];
26916d86563SAlexander Pyhalov
27016d86563SAlexander Pyhalov if (unidx < 0) /* no match from UTF8 to IBM */
27116d86563SAlexander Pyhalov ibm_code = (unsigned long)NON_ID_CHAR;
27216d86563SAlexander Pyhalov
27316d86563SAlexander Pyhalov {
27416d86563SAlexander Pyhalov val = ibm_code & 0xffff;
27516d86563SAlexander Pyhalov c1 = (char) ((val & 0xff00) >> 8);
27616d86563SAlexander Pyhalov c2 = (char) (val & 0xff);
27716d86563SAlexander Pyhalov }
27816d86563SAlexander Pyhalov
27916d86563SAlexander Pyhalov /* it is single byte ascii */
28016d86563SAlexander Pyhalov if ( c1 == 0x0 ) {
28116d86563SAlexander Pyhalov if ( st->shift == SHIFT_OUT ) {
28216d86563SAlexander Pyhalov if (buflen < 2) {
28316d86563SAlexander Pyhalov errno = E2BIG;
28416d86563SAlexander Pyhalov return 0;
28516d86563SAlexander Pyhalov }
28616d86563SAlexander Pyhalov *buf = SHIFT_IN;
28716d86563SAlexander Pyhalov *(buf+1) = c2;
28816d86563SAlexander Pyhalov st->shift = SHIFT_IN;
28916d86563SAlexander Pyhalov return 2;
29016d86563SAlexander Pyhalov }
29116d86563SAlexander Pyhalov if (buflen < 1) {
29216d86563SAlexander Pyhalov errno = E2BIG;
29316d86563SAlexander Pyhalov return 0;
29416d86563SAlexander Pyhalov }
29516d86563SAlexander Pyhalov *buf = c2;
29616d86563SAlexander Pyhalov return 1;
29716d86563SAlexander Pyhalov }
29816d86563SAlexander Pyhalov
29916d86563SAlexander Pyhalov /* it is the first two bytes character */
30016d86563SAlexander Pyhalov if ( st->shift == SHIFT_IN ) {
30116d86563SAlexander Pyhalov if (buflen < 3) {
30216d86563SAlexander Pyhalov errno = E2BIG;
30316d86563SAlexander Pyhalov return 0;
30416d86563SAlexander Pyhalov }
30516d86563SAlexander Pyhalov *buf = SHIFT_OUT;
30616d86563SAlexander Pyhalov st->shift = SHIFT_OUT;
30716d86563SAlexander Pyhalov *(buf+1) = c1;
30816d86563SAlexander Pyhalov *(buf+2) = c2;
30916d86563SAlexander Pyhalov return 3;
31016d86563SAlexander Pyhalov }
31116d86563SAlexander Pyhalov
31216d86563SAlexander Pyhalov *buf = ibm_str[0] = c1;
31316d86563SAlexander Pyhalov *(buf+1) = ibm_str[1] = c2;
314*f642269fSToomas Soome ibm_str[2] = '\0';
31516d86563SAlexander Pyhalov
31616d86563SAlexander Pyhalov #ifdef DEBUG
31716d86563SAlexander Pyhalov fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
31816d86563SAlexander Pyhalov #endif
31916d86563SAlexander Pyhalov
32016d86563SAlexander Pyhalov
32116d86563SAlexander Pyhalov if (buflen < 2) {
32216d86563SAlexander Pyhalov errno = E2BIG;
32316d86563SAlexander Pyhalov return(0);
32416d86563SAlexander Pyhalov }
32516d86563SAlexander Pyhalov
32616d86563SAlexander Pyhalov return(2);
32716d86563SAlexander Pyhalov }
328