xref: /illumos-gate/usr/src/lib/iconv_modules/common/utf8%ibm.c (revision f642269fe771b10890afea92038f4531cd50cfd9)
116d86563SAlexander Pyhalov /*
216d86563SAlexander Pyhalov  * CDDL HEADER START
316d86563SAlexander Pyhalov  *
416d86563SAlexander Pyhalov  * The contents of this file are subject to the terms of the
516d86563SAlexander Pyhalov  * Common Development and Distribution License (the "License").
616d86563SAlexander Pyhalov  * You may not use this file except in compliance with the License.
716d86563SAlexander Pyhalov  *
816d86563SAlexander Pyhalov  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
916d86563SAlexander Pyhalov  * or http://www.opensolaris.org/os/licensing.
1016d86563SAlexander Pyhalov  * See the License for the specific language governing permissions
1116d86563SAlexander Pyhalov  * and limitations under the License.
1216d86563SAlexander Pyhalov  *
1316d86563SAlexander Pyhalov  * When distributing Covered Code, include this CDDL HEADER in each
1416d86563SAlexander Pyhalov  * file and include the License file at src/OPENSOLARIS.LICENSE.
1516d86563SAlexander Pyhalov  * If applicable, add the following below this CDDL HEADER, with the
1616d86563SAlexander Pyhalov  * fields enclosed by brackets "[]" replaced with your own identifying
1716d86563SAlexander Pyhalov  * information: Portions Copyright [yyyy] [name of copyright owner]
1816d86563SAlexander Pyhalov  *
1916d86563SAlexander Pyhalov  * CDDL HEADER END
2016d86563SAlexander Pyhalov  */
2116d86563SAlexander Pyhalov 
2216d86563SAlexander Pyhalov /*
2316d86563SAlexander Pyhalov  * Copyright (c) 1997, by Sun Microsystems, Inc.
2416d86563SAlexander Pyhalov  * All rights reserved.
2516d86563SAlexander Pyhalov  */
2616d86563SAlexander Pyhalov 
2716d86563SAlexander Pyhalov #include <stdio.h>
2816d86563SAlexander Pyhalov #include <stdlib.h>
2916d86563SAlexander Pyhalov #include <errno.h>
3016d86563SAlexander Pyhalov #include <sys/types.h>
3116d86563SAlexander Pyhalov 
3216d86563SAlexander Pyhalov #include "tab_lookup.h"	/* table lookup data types */
3316d86563SAlexander Pyhalov 
#define	MSB	0x80	/* most significant bit of a byte */
#define	ONEBYTE	0xff	/* mask selecting the low-order byte */

/*
 * States of the UTF-8 decoder in _icv_iconv().
 * The enumerator order (and therefore the numeric values) is unchanged.
 */
enum _USTATE {
	U0,	/* at the start of a character */
	U1,	/* lead byte of a 2 byte sequence seen */
	U11,	/* unrecognized lead byte seen; next byte is taken as is */
	U2,	/* lead byte of a 3 byte sequence seen */
	U3,	/* first two bytes of a 3 byte sequence seen */
	U4	/* 16-bit value assembled; look it up and emit it */
};
3816d86563SAlexander Pyhalov 
3916d86563SAlexander Pyhalov 
4016d86563SAlexander Pyhalov int get_ibm_by_utf(_icv_state	*st, char c1, char c2, int *unidx,
4116d86563SAlexander Pyhalov     unsigned long   *ibm_code);
4216d86563SAlexander Pyhalov 
4316d86563SAlexander Pyhalov int bisearch(unsigned long val, _icv_state *st, int n);
4416d86563SAlexander Pyhalov 
4516d86563SAlexander Pyhalov int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf,
4616d86563SAlexander Pyhalov     size_t buflen, _icv_state *st);
4716d86563SAlexander Pyhalov 
4816d86563SAlexander Pyhalov /*
4916d86563SAlexander Pyhalov  * Actual conversion; called from iconv()
5016d86563SAlexander Pyhalov  * Input is UTF-8 data.
5116d86563SAlexander Pyhalov  * first convert to UCS2
5216d86563SAlexander Pyhalov  */
5316d86563SAlexander Pyhalov size_t
_icv_iconv(_icv_state * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)5416d86563SAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
5516d86563SAlexander Pyhalov                         char **outbuf, size_t *outbytesleft)
5616d86563SAlexander Pyhalov {
5716d86563SAlexander Pyhalov /*
5816d86563SAlexander Pyhalov  * Actual conversion; called from iconv()
5916d86563SAlexander Pyhalov  */
6016d86563SAlexander Pyhalov /*=========================================================
6116d86563SAlexander Pyhalov  *
6216d86563SAlexander Pyhalov  *       State Machine for interpreting UTF8 code
6316d86563SAlexander Pyhalov  *
6416d86563SAlexander Pyhalov  *=========================================================
6516d86563SAlexander Pyhalov  *
6616d86563SAlexander Pyhalov  *               3 byte unicode
6716d86563SAlexander Pyhalov  *          +----->------->-------+
6816d86563SAlexander Pyhalov  *          |                     |
6916d86563SAlexander Pyhalov  *          ^                     v
7016d86563SAlexander Pyhalov  *          |  2 byte             U2 ---> U3
7116d86563SAlexander Pyhalov  *          |  unicode                    v
7216d86563SAlexander Pyhalov  * +------> U0 -------> U1                +-------->U4---+
7316d86563SAlexander Pyhalov  * ^  ascii |           |                           ^    |
7416d86563SAlexander Pyhalov  * |        |           +-------->--------->--------+    |
7516d86563SAlexander Pyhalov  * |        v                                            v
7616d86563SAlexander Pyhalov  * +----<---+-----<------------<------------<------------+
7716d86563SAlexander Pyhalov  *
7816d86563SAlexander Pyhalov  * +----<---+-----<------------<------------<------------+
7916d86563SAlexander Pyhalov  *
8016d86563SAlexander Pyhalov  *=========================================================*/
8116d86563SAlexander Pyhalov 
8216d86563SAlexander Pyhalov         char            c1 = '\0', c2 = '\0';
8316d86563SAlexander Pyhalov         int             n, unidx;
8416d86563SAlexander Pyhalov         unsigned long   ibm_code;
8516d86563SAlexander Pyhalov 
8616d86563SAlexander Pyhalov #ifdef DEBUG
8716d86563SAlexander Pyhalov     fprintf(stderr, "==========     iconv(): UTF8 --> IBM     ==========\n");
8816d86563SAlexander Pyhalov #endif
8916d86563SAlexander Pyhalov 
9016d86563SAlexander Pyhalov         if (st == NULL) {
9116d86563SAlexander Pyhalov                 errno = EBADF;
9216d86563SAlexander Pyhalov                 return ((size_t) -1);
9316d86563SAlexander Pyhalov         }
9416d86563SAlexander Pyhalov 
9516d86563SAlexander Pyhalov         if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
9616d86563SAlexander Pyhalov                 st->ustate = U0;
9716d86563SAlexander Pyhalov                 st->_errno = 0;
9816d86563SAlexander Pyhalov 		st->shift = SHIFT_IN;
9916d86563SAlexander Pyhalov                 return ((size_t) 0);
10016d86563SAlexander Pyhalov         }
10116d86563SAlexander Pyhalov 
10216d86563SAlexander Pyhalov         st->_errno = 0;         /* reset internal errno */
10316d86563SAlexander Pyhalov         errno = 0;              /* reset external errno */
10416d86563SAlexander Pyhalov 
10516d86563SAlexander Pyhalov         /* a state machine for interpreting UTF8 code */
10616d86563SAlexander Pyhalov         while (*inbytesleft > 0 && *outbytesleft > 0) {
10716d86563SAlexander Pyhalov                 switch (st->ustate) {
10816d86563SAlexander Pyhalov                 case U0:
10916d86563SAlexander Pyhalov 			/* it is ascii, convert it immediately */
11016d86563SAlexander Pyhalov                         if ((**inbuf & MSB) == 0) {     /* ASCII */
11116d86563SAlexander Pyhalov 				st->ustate = U4;
11216d86563SAlexander Pyhalov 				st->keepc[0] = **inbuf;
11316d86563SAlexander Pyhalov 				c1 = 0x0;
11416d86563SAlexander Pyhalov 				c2 = **inbuf;
11516d86563SAlexander Pyhalov 				continue;
11616d86563SAlexander Pyhalov                         } else {        /* Chinese character */
11716d86563SAlexander Pyhalov                                 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
11816d86563SAlexander Pyhalov                                         st->ustate = U1;
11916d86563SAlexander Pyhalov                                         st->keepc[0] = **inbuf;
12016d86563SAlexander Pyhalov                                 } else if ((**inbuf & 0xf0) == 0xe0) {  /* 3 byte */
12116d86563SAlexander Pyhalov                                         st->ustate = U2;
12216d86563SAlexander Pyhalov                                         st->keepc[0] = **inbuf;
12316d86563SAlexander Pyhalov                                 } else {        /* illegal unicode */
12416d86563SAlexander Pyhalov                                         /* st->_errno = errno = EINVAL; */
12516d86563SAlexander Pyhalov 				/* possible UNICODE ko_KR-UTF8 */
12616d86563SAlexander Pyhalov 				c1 =st->keepc[0] = **inbuf;
12716d86563SAlexander Pyhalov                                 st->ustate = U11;
12816d86563SAlexander Pyhalov                                         break;
12916d86563SAlexander Pyhalov                                 }
13016d86563SAlexander Pyhalov                         }
13116d86563SAlexander Pyhalov                         break;
13216d86563SAlexander Pyhalov                 case U1:                /* 2 byte unicode */
13316d86563SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
13416d86563SAlexander Pyhalov                                 st->ustate = U4;
13516d86563SAlexander Pyhalov                                 st->keepc[1] = **inbuf;
13616d86563SAlexander Pyhalov                                 c1 = (st->keepc[0]&0x1c)>>2;
13716d86563SAlexander Pyhalov                                 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
13816d86563SAlexander Pyhalov #ifdef DEBUG
13916d86563SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x   --> ",
14016d86563SAlexander Pyhalov         st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
14116d86563SAlexander Pyhalov #endif
14216d86563SAlexander Pyhalov                                 continue;       /* should not advance *inbuf */
14316d86563SAlexander Pyhalov                         } else {
14416d86563SAlexander Pyhalov                                  st->_errno = errno = EINVAL;
14516d86563SAlexander Pyhalov                         }
14616d86563SAlexander Pyhalov                         break;
14716d86563SAlexander Pyhalov                 case U11:                /* 3 byte unicode - 2nd byte */
14816d86563SAlexander Pyhalov 				c2 =st->keepc[1] = **inbuf;
14916d86563SAlexander Pyhalov                                 st->ustate = U4;
15016d86563SAlexander Pyhalov 				continue;
15116d86563SAlexander Pyhalov 			break;
15216d86563SAlexander Pyhalov                 case U2:                /* 3 byte unicode - 2nd byte */
15316d86563SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
15416d86563SAlexander Pyhalov                                 st->ustate = U3;
15516d86563SAlexander Pyhalov                                 st->keepc[1] = **inbuf;
15616d86563SAlexander Pyhalov                         } else {
15716d86563SAlexander Pyhalov                                 st->_errno = errno = EINVAL;
15816d86563SAlexander Pyhalov                         }
15916d86563SAlexander Pyhalov                         break;
16016d86563SAlexander Pyhalov                 case U3:                /* 3 byte unicode - 3rd byte */
16116d86563SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
16216d86563SAlexander Pyhalov                                 st->ustate = U4;
16316d86563SAlexander Pyhalov                                 st->keepc[2] = **inbuf;
16416d86563SAlexander Pyhalov                                 c1 = ((st->keepc[0]&0x0f)<<4) |
16516d86563SAlexander Pyhalov                                         ((st->keepc[1]&0x3c)>>2);
16616d86563SAlexander Pyhalov                                 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
16716d86563SAlexander Pyhalov #ifdef DEBUG
16816d86563SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
16916d86563SAlexander Pyhalov                 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
17016d86563SAlexander Pyhalov #endif
17116d86563SAlexander Pyhalov                                 continue;       /* should not advance *inbuf */
17216d86563SAlexander Pyhalov                         } else {
17316d86563SAlexander Pyhalov                                 st->_errno = errno = EINVAL;
17416d86563SAlexander Pyhalov                         }
17516d86563SAlexander Pyhalov                         break;
17616d86563SAlexander Pyhalov                 case U4:
17716d86563SAlexander Pyhalov                         n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
17816d86563SAlexander Pyhalov                         if (n != 0) {   /* legal unicode;illegal Big5 */
17916d86563SAlexander Pyhalov                                 st->_errno = errno = EILSEQ;
18016d86563SAlexander Pyhalov                                 break;
18116d86563SAlexander Pyhalov                         }
18216d86563SAlexander Pyhalov 
18316d86563SAlexander Pyhalov                         n = utf8_to_ibm(unidx, ibm_code,
18416d86563SAlexander Pyhalov                                         *outbuf, *outbytesleft, st);
18516d86563SAlexander Pyhalov                         if (n > 0) {
18616d86563SAlexander Pyhalov                                 (*outbuf) += n;
18716d86563SAlexander Pyhalov                                 (*outbytesleft) -= n;
18816d86563SAlexander Pyhalov                         } else {
18916d86563SAlexander Pyhalov                                 st->_errno = errno;
19016d86563SAlexander Pyhalov                                 return((size_t)-1);
19116d86563SAlexander Pyhalov                         }
19216d86563SAlexander Pyhalov                         st->ustate = U0;
19316d86563SAlexander Pyhalov                         st->_errno = 0;
19416d86563SAlexander Pyhalov                         break;
19516d86563SAlexander Pyhalov                 default:                        /* should never come here */
19616d86563SAlexander Pyhalov                         st->_errno = errno = EILSEQ;
19716d86563SAlexander Pyhalov                         st->ustate = U0;        /* reset state */
19816d86563SAlexander Pyhalov                         break;
19916d86563SAlexander Pyhalov                 }
20016d86563SAlexander Pyhalov 
20116d86563SAlexander Pyhalov                 (*inbuf)++;
20216d86563SAlexander Pyhalov                 (*inbytesleft)--;
20316d86563SAlexander Pyhalov 
20416d86563SAlexander Pyhalov                 if (st->_errno) {
20516d86563SAlexander Pyhalov #ifdef DEBUG
20616d86563SAlexander Pyhalov     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
20716d86563SAlexander Pyhalov                 st->_errno, st->ustate);
20816d86563SAlexander Pyhalov #endif
20916d86563SAlexander Pyhalov                         break;
21016d86563SAlexander Pyhalov                 }
21116d86563SAlexander Pyhalov 
21216d86563SAlexander Pyhalov                 if (errno)
21316d86563SAlexander Pyhalov                         return((size_t)-1);
21416d86563SAlexander Pyhalov         }
21516d86563SAlexander Pyhalov 
21616d86563SAlexander Pyhalov         if (*outbytesleft == 0) {
21716d86563SAlexander Pyhalov                 errno = E2BIG;
21816d86563SAlexander Pyhalov                 return((size_t)-1);
21916d86563SAlexander Pyhalov         }
22016d86563SAlexander Pyhalov         return (*inbytesleft);
22116d86563SAlexander Pyhalov }
22216d86563SAlexander Pyhalov 
22316d86563SAlexander Pyhalov 
22416d86563SAlexander Pyhalov /*
22516d86563SAlexander Pyhalov  * Match IBM code by UTF8 code;
22616d86563SAlexander Pyhalov  * Return: = 0 - match from Unicode to IBM found
22716d86563SAlexander Pyhalov  *         = 1 - match from Unicode to IBM NOT found
22816d86563SAlexander Pyhalov  *
22916d86563SAlexander Pyhalov  * Since binary search of the UTF8 to IBM table is necessary, might as well
23016d86563SAlexander Pyhalov  * return index and IBM code matching to the unicode.
23116d86563SAlexander Pyhalov  */
get_ibm_by_utf(st,c1,c2,unidx,ibm_code)23216d86563SAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
23316d86563SAlexander Pyhalov _icv_state	*st;
23416d86563SAlexander Pyhalov char            c1, c2;
23516d86563SAlexander Pyhalov int             *unidx;
23616d86563SAlexander Pyhalov unsigned long   *ibm_code;
23716d86563SAlexander Pyhalov {
23816d86563SAlexander Pyhalov         unsigned long   unicode;
23916d86563SAlexander Pyhalov 
24016d86563SAlexander Pyhalov         unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
24116d86563SAlexander Pyhalov         *unidx = bisearch(unicode, st, st->table_size);
24216d86563SAlexander Pyhalov         if ((*unidx) >= 0)
24316d86563SAlexander Pyhalov 	{
24416d86563SAlexander Pyhalov             if ( st->left_to_right )
24516d86563SAlexander Pyhalov                 *ibm_code = st->table[*unidx].right_code;
24616d86563SAlexander Pyhalov 	    else
24716d86563SAlexander Pyhalov                 *ibm_code = st->table[*unidx].left_code;
24816d86563SAlexander Pyhalov 	}
24916d86563SAlexander Pyhalov #ifdef DEBUG
25016d86563SAlexander Pyhalov     fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
25116d86563SAlexander Pyhalov #endif
25216d86563SAlexander Pyhalov 
25316d86563SAlexander Pyhalov         return(0);
25416d86563SAlexander Pyhalov }
25516d86563SAlexander Pyhalov 
25616d86563SAlexander Pyhalov 
25716d86563SAlexander Pyhalov /*
25816d86563SAlexander Pyhalov  * ISO/IEC 10646 (Unicode) --> IBM
25916d86563SAlexander Pyhalov  * Unicode --> UTF8 (FSS-UTF)
26016d86563SAlexander Pyhalov  *             (File System Safe Universal Character Set Transformation Format)
26116d86563SAlexander Pyhalov  * Return: > 0 - converted with enough space in output buffer
26216d86563SAlexander Pyhalov  *         = 0 - no space in outbuf
26316d86563SAlexander Pyhalov  */
utf8_to_ibm(int unidx,unsigned long ibm_code,char * buf,size_t buflen,_icv_state * st)264*f642269fSToomas Soome int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf, size_t buflen,
265*f642269fSToomas Soome     _icv_state *st)
26616d86563SAlexander Pyhalov {
26716d86563SAlexander Pyhalov         unsigned long   val;            /* IBM value */
26816d86563SAlexander Pyhalov         char            c1, c2, ibm_str[3];
26916d86563SAlexander Pyhalov 
27016d86563SAlexander Pyhalov         if (unidx < 0)         /* no match from UTF8 to IBM */
27116d86563SAlexander Pyhalov 	    ibm_code = (unsigned long)NON_ID_CHAR;
27216d86563SAlexander Pyhalov 
27316d86563SAlexander Pyhalov         {
27416d86563SAlexander Pyhalov                 val = ibm_code & 0xffff;
27516d86563SAlexander Pyhalov                 c1 = (char) ((val & 0xff00) >> 8);
27616d86563SAlexander Pyhalov                 c2 = (char) (val & 0xff);
27716d86563SAlexander Pyhalov         }
27816d86563SAlexander Pyhalov 
27916d86563SAlexander Pyhalov 	/* it is single byte ascii */
28016d86563SAlexander Pyhalov 	if ( c1 == 0x0 ) {
28116d86563SAlexander Pyhalov 		if ( st->shift == SHIFT_OUT ) {
28216d86563SAlexander Pyhalov 			if (buflen < 2) {
28316d86563SAlexander Pyhalov 				errno = E2BIG;
28416d86563SAlexander Pyhalov 				return 0;
28516d86563SAlexander Pyhalov 			}
28616d86563SAlexander Pyhalov 			*buf = SHIFT_IN;
28716d86563SAlexander Pyhalov 			*(buf+1) = c2;
28816d86563SAlexander Pyhalov 			st->shift = SHIFT_IN;
28916d86563SAlexander Pyhalov 			return 2;
29016d86563SAlexander Pyhalov 		}
29116d86563SAlexander Pyhalov 		if (buflen < 1) {
29216d86563SAlexander Pyhalov 			errno = E2BIG;
29316d86563SAlexander Pyhalov 			return 0;
29416d86563SAlexander Pyhalov 		}
29516d86563SAlexander Pyhalov 		*buf = c2;
29616d86563SAlexander Pyhalov 		return 1;
29716d86563SAlexander Pyhalov        }
29816d86563SAlexander Pyhalov 
29916d86563SAlexander Pyhalov 	/* it is the first two bytes character */
30016d86563SAlexander Pyhalov 	if ( st->shift == SHIFT_IN ) {
30116d86563SAlexander Pyhalov 		if (buflen < 3) {
30216d86563SAlexander Pyhalov 			errno = E2BIG;
30316d86563SAlexander Pyhalov 			return 0;
30416d86563SAlexander Pyhalov 		}
30516d86563SAlexander Pyhalov 		*buf = SHIFT_OUT;
30616d86563SAlexander Pyhalov 		st->shift = SHIFT_OUT;
30716d86563SAlexander Pyhalov 		*(buf+1) = c1;
30816d86563SAlexander Pyhalov 		*(buf+2) = c2;
30916d86563SAlexander Pyhalov 		return 3;
31016d86563SAlexander Pyhalov 	}
31116d86563SAlexander Pyhalov 
31216d86563SAlexander Pyhalov         *buf = ibm_str[0] = c1;
31316d86563SAlexander Pyhalov         *(buf+1) = ibm_str[1] = c2;
314*f642269fSToomas Soome         ibm_str[2] = '\0';
31516d86563SAlexander Pyhalov 
31616d86563SAlexander Pyhalov #ifdef DEBUG
31716d86563SAlexander Pyhalov     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
31816d86563SAlexander Pyhalov #endif
31916d86563SAlexander Pyhalov 
32016d86563SAlexander Pyhalov 
32116d86563SAlexander Pyhalov         if (buflen < 2) {
32216d86563SAlexander Pyhalov                 errno = E2BIG;
32316d86563SAlexander Pyhalov                 return(0);
32416d86563SAlexander Pyhalov         }
32516d86563SAlexander Pyhalov 
32616d86563SAlexander Pyhalov         return(2);
32716d86563SAlexander Pyhalov }
328