xref: /titanic_50/usr/src/lib/iconv_modules/common/utf8%ibm.c (revision 880d797826457b77414b37d531cc3e1aa166ecbe)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1997, by Sun Microsystems, Inc.
 * All rights reserved.
 */
26*880d7978SAlexander Pyhalov 
27*880d7978SAlexander Pyhalov #include <stdio.h>
28*880d7978SAlexander Pyhalov #include <stdlib.h>
29*880d7978SAlexander Pyhalov #include <errno.h>
30*880d7978SAlexander Pyhalov #include <sys/types.h>
31*880d7978SAlexander Pyhalov 
32*880d7978SAlexander Pyhalov #include "tab_lookup.h"   	/* table lookup data types */
33*880d7978SAlexander Pyhalov 
34*880d7978SAlexander Pyhalov #define MSB     0x80    /* most significant bit */
35*880d7978SAlexander Pyhalov #define ONEBYTE 0xff    /* right most byte */
36*880d7978SAlexander Pyhalov 
37*880d7978SAlexander Pyhalov enum _USTATE    { U0, U1, U11, U2, U3, U4 };
38*880d7978SAlexander Pyhalov 
39*880d7978SAlexander Pyhalov 
40*880d7978SAlexander Pyhalov int get_ibm_by_utf(_icv_state	*st, char c1, char c2, int *unidx,
41*880d7978SAlexander Pyhalov     unsigned long   *ibm_code);
42*880d7978SAlexander Pyhalov 
43*880d7978SAlexander Pyhalov int bisearch(unsigned long val, _icv_state *st, int n);
44*880d7978SAlexander Pyhalov 
45*880d7978SAlexander Pyhalov int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf,
46*880d7978SAlexander Pyhalov     size_t buflen, _icv_state *st);
47*880d7978SAlexander Pyhalov 
48*880d7978SAlexander Pyhalov /*
49*880d7978SAlexander Pyhalov  * Actual conversion; called from iconv()
50*880d7978SAlexander Pyhalov  * Input is UTF-8 data.
51*880d7978SAlexander Pyhalov  * first convert to UCS2
52*880d7978SAlexander Pyhalov  */
53*880d7978SAlexander Pyhalov size_t
_icv_iconv(_icv_state * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)54*880d7978SAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
55*880d7978SAlexander Pyhalov                         char **outbuf, size_t *outbytesleft)
56*880d7978SAlexander Pyhalov {
57*880d7978SAlexander Pyhalov /*
58*880d7978SAlexander Pyhalov  * Actual conversion; called from iconv()
59*880d7978SAlexander Pyhalov  */
60*880d7978SAlexander Pyhalov /*=========================================================
61*880d7978SAlexander Pyhalov  *
62*880d7978SAlexander Pyhalov  *       State Machine for interpreting UTF8 code
63*880d7978SAlexander Pyhalov  *
64*880d7978SAlexander Pyhalov  *=========================================================
65*880d7978SAlexander Pyhalov  *
66*880d7978SAlexander Pyhalov  *               3 byte unicode
67*880d7978SAlexander Pyhalov  *          +----->------->-------+
68*880d7978SAlexander Pyhalov  *          |                     |
69*880d7978SAlexander Pyhalov  *          ^                     v
70*880d7978SAlexander Pyhalov  *          |  2 byte             U2 ---> U3
71*880d7978SAlexander Pyhalov  *          |  unicode                    v
72*880d7978SAlexander Pyhalov  * +------> U0 -------> U1                +-------->U4---+
73*880d7978SAlexander Pyhalov  * ^  ascii |           |                           ^    |
74*880d7978SAlexander Pyhalov  * |        |           +-------->--------->--------+    |
75*880d7978SAlexander Pyhalov  * |        v                                            v
76*880d7978SAlexander Pyhalov  * +----<---+-----<------------<------------<------------+
77*880d7978SAlexander Pyhalov  *
78*880d7978SAlexander Pyhalov  * +----<---+-----<------------<------------<------------+
79*880d7978SAlexander Pyhalov  *
80*880d7978SAlexander Pyhalov  *=========================================================*/
81*880d7978SAlexander Pyhalov 
82*880d7978SAlexander Pyhalov         char            c1 = '\0', c2 = '\0';
83*880d7978SAlexander Pyhalov         int             n, unidx;
84*880d7978SAlexander Pyhalov         unsigned long   ibm_code;
85*880d7978SAlexander Pyhalov 
86*880d7978SAlexander Pyhalov #ifdef DEBUG
87*880d7978SAlexander Pyhalov     fprintf(stderr, "==========     iconv(): UTF8 --> IBM     ==========\n");
88*880d7978SAlexander Pyhalov #endif
89*880d7978SAlexander Pyhalov 
90*880d7978SAlexander Pyhalov         if (st == NULL) {
91*880d7978SAlexander Pyhalov                 errno = EBADF;
92*880d7978SAlexander Pyhalov                 return ((size_t) -1);
93*880d7978SAlexander Pyhalov         }
94*880d7978SAlexander Pyhalov 
95*880d7978SAlexander Pyhalov         if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
96*880d7978SAlexander Pyhalov                 st->ustate = U0;
97*880d7978SAlexander Pyhalov                 st->_errno = 0;
98*880d7978SAlexander Pyhalov 		st->shift = SHIFT_IN;
99*880d7978SAlexander Pyhalov                 return ((size_t) 0);
100*880d7978SAlexander Pyhalov         }
101*880d7978SAlexander Pyhalov 
102*880d7978SAlexander Pyhalov         st->_errno = 0;         /* reset internal errno */
103*880d7978SAlexander Pyhalov         errno = 0;              /* reset external errno */
104*880d7978SAlexander Pyhalov 
105*880d7978SAlexander Pyhalov         /* a state machine for interpreting UTF8 code */
106*880d7978SAlexander Pyhalov         while (*inbytesleft > 0 && *outbytesleft > 0) {
107*880d7978SAlexander Pyhalov                 switch (st->ustate) {
108*880d7978SAlexander Pyhalov                 case U0:
109*880d7978SAlexander Pyhalov 			/* it is ascii, convert it immediately */
110*880d7978SAlexander Pyhalov                         if ((**inbuf & MSB) == 0) {     /* ASCII */
111*880d7978SAlexander Pyhalov 				st->ustate = U4;
112*880d7978SAlexander Pyhalov 				st->keepc[0] = **inbuf;
113*880d7978SAlexander Pyhalov 				c1 = 0x0;
114*880d7978SAlexander Pyhalov 				c2 = **inbuf;
115*880d7978SAlexander Pyhalov 				continue;
116*880d7978SAlexander Pyhalov                         } else {        /* Chinese character */
117*880d7978SAlexander Pyhalov                                 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
118*880d7978SAlexander Pyhalov                                         st->ustate = U1;
119*880d7978SAlexander Pyhalov                                         st->keepc[0] = **inbuf;
120*880d7978SAlexander Pyhalov                                 } else if ((**inbuf & 0xf0) == 0xe0) {  /* 3 byte */
121*880d7978SAlexander Pyhalov                                         st->ustate = U2;
122*880d7978SAlexander Pyhalov                                         st->keepc[0] = **inbuf;
123*880d7978SAlexander Pyhalov                                 } else {        /* illegal unicode */
124*880d7978SAlexander Pyhalov                                         /* st->_errno = errno = EINVAL; */
125*880d7978SAlexander Pyhalov 				/* possible UNICODE ko_KR-UTF8 */
126*880d7978SAlexander Pyhalov 				c1 =st->keepc[0] = **inbuf;
127*880d7978SAlexander Pyhalov                                 st->ustate = U11;
128*880d7978SAlexander Pyhalov                                         break;
129*880d7978SAlexander Pyhalov                                 }
130*880d7978SAlexander Pyhalov                         }
131*880d7978SAlexander Pyhalov                         break;
132*880d7978SAlexander Pyhalov                 case U1:                /* 2 byte unicode */
133*880d7978SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
134*880d7978SAlexander Pyhalov                                 st->ustate = U4;
135*880d7978SAlexander Pyhalov                                 st->keepc[1] = **inbuf;
136*880d7978SAlexander Pyhalov                                 c1 = (st->keepc[0]&0x1c)>>2;
137*880d7978SAlexander Pyhalov                                 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
138*880d7978SAlexander Pyhalov #ifdef DEBUG
139*880d7978SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x   --> ",
140*880d7978SAlexander Pyhalov         st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
141*880d7978SAlexander Pyhalov #endif
142*880d7978SAlexander Pyhalov                                 continue;       /* should not advance *inbuf */
143*880d7978SAlexander Pyhalov                         } else {
144*880d7978SAlexander Pyhalov                                  st->_errno = errno = EINVAL;
145*880d7978SAlexander Pyhalov                         }
146*880d7978SAlexander Pyhalov                         break;
147*880d7978SAlexander Pyhalov                 case U11:                /* 3 byte unicode - 2nd byte */
148*880d7978SAlexander Pyhalov 				c2 =st->keepc[1] = **inbuf;
149*880d7978SAlexander Pyhalov                                 st->ustate = U4;
150*880d7978SAlexander Pyhalov 				continue;
151*880d7978SAlexander Pyhalov 			break;
152*880d7978SAlexander Pyhalov                 case U2:                /* 3 byte unicode - 2nd byte */
153*880d7978SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
154*880d7978SAlexander Pyhalov                                 st->ustate = U3;
155*880d7978SAlexander Pyhalov                                 st->keepc[1] = **inbuf;
156*880d7978SAlexander Pyhalov                         } else {
157*880d7978SAlexander Pyhalov                                 st->_errno = errno = EINVAL;
158*880d7978SAlexander Pyhalov                         }
159*880d7978SAlexander Pyhalov                         break;
160*880d7978SAlexander Pyhalov                 case U3:                /* 3 byte unicode - 3rd byte */
161*880d7978SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
162*880d7978SAlexander Pyhalov                                 st->ustate = U4;
163*880d7978SAlexander Pyhalov                                 st->keepc[2] = **inbuf;
164*880d7978SAlexander Pyhalov                                 c1 = ((st->keepc[0]&0x0f)<<4) |
165*880d7978SAlexander Pyhalov                                         ((st->keepc[1]&0x3c)>>2);
166*880d7978SAlexander Pyhalov                                 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
167*880d7978SAlexander Pyhalov #ifdef DEBUG
168*880d7978SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
169*880d7978SAlexander Pyhalov                 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
170*880d7978SAlexander Pyhalov #endif
171*880d7978SAlexander Pyhalov                                 continue;       /* should not advance *inbuf */
172*880d7978SAlexander Pyhalov                         } else {
173*880d7978SAlexander Pyhalov                                 st->_errno = errno = EINVAL;
174*880d7978SAlexander Pyhalov                         }
175*880d7978SAlexander Pyhalov                         break;
176*880d7978SAlexander Pyhalov                 case U4:
177*880d7978SAlexander Pyhalov                         n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
178*880d7978SAlexander Pyhalov                         if (n != 0) {   /* legal unicode;illegal Big5 */
179*880d7978SAlexander Pyhalov                                 st->_errno = errno = EILSEQ;
180*880d7978SAlexander Pyhalov                                 break;
181*880d7978SAlexander Pyhalov                         }
182*880d7978SAlexander Pyhalov 
183*880d7978SAlexander Pyhalov                         n = utf8_to_ibm(unidx, ibm_code,
184*880d7978SAlexander Pyhalov                                         *outbuf, *outbytesleft, st);
185*880d7978SAlexander Pyhalov                         if (n > 0) {
186*880d7978SAlexander Pyhalov                                 (*outbuf) += n;
187*880d7978SAlexander Pyhalov                                 (*outbytesleft) -= n;
188*880d7978SAlexander Pyhalov                         } else {
189*880d7978SAlexander Pyhalov                                 st->_errno = errno;
190*880d7978SAlexander Pyhalov                                 return((size_t)-1);
191*880d7978SAlexander Pyhalov                         }
192*880d7978SAlexander Pyhalov                         st->ustate = U0;
193*880d7978SAlexander Pyhalov                         st->_errno = 0;
194*880d7978SAlexander Pyhalov                         break;
195*880d7978SAlexander Pyhalov                 default:                        /* should never come here */
196*880d7978SAlexander Pyhalov                         st->_errno = errno = EILSEQ;
197*880d7978SAlexander Pyhalov                         st->ustate = U0;        /* reset state */
198*880d7978SAlexander Pyhalov                         break;
199*880d7978SAlexander Pyhalov                 }
200*880d7978SAlexander Pyhalov 
201*880d7978SAlexander Pyhalov                 (*inbuf)++;
202*880d7978SAlexander Pyhalov                 (*inbytesleft)--;
203*880d7978SAlexander Pyhalov 
204*880d7978SAlexander Pyhalov                 if (st->_errno) {
205*880d7978SAlexander Pyhalov #ifdef DEBUG
206*880d7978SAlexander Pyhalov     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
207*880d7978SAlexander Pyhalov                 st->_errno, st->ustate);
208*880d7978SAlexander Pyhalov #endif
209*880d7978SAlexander Pyhalov                         break;
210*880d7978SAlexander Pyhalov                 }
211*880d7978SAlexander Pyhalov 
212*880d7978SAlexander Pyhalov                 if (errno)
213*880d7978SAlexander Pyhalov                         return((size_t)-1);
214*880d7978SAlexander Pyhalov         }
215*880d7978SAlexander Pyhalov 
216*880d7978SAlexander Pyhalov         if (*outbytesleft == 0) {
217*880d7978SAlexander Pyhalov                 errno = E2BIG;
218*880d7978SAlexander Pyhalov                 return((size_t)-1);
219*880d7978SAlexander Pyhalov         }
220*880d7978SAlexander Pyhalov         return (*inbytesleft);
221*880d7978SAlexander Pyhalov }
222*880d7978SAlexander Pyhalov 
223*880d7978SAlexander Pyhalov 
224*880d7978SAlexander Pyhalov /*
225*880d7978SAlexander Pyhalov  * Match IBM code by UTF8 code;
226*880d7978SAlexander Pyhalov  * Return: = 0 - match from Unicode to IBM found
227*880d7978SAlexander Pyhalov  *         = 1 - match from Unicode to IBM NOT found
228*880d7978SAlexander Pyhalov  *
229*880d7978SAlexander Pyhalov  * Since binary search of the UTF8 to IBM table is necessary, might as well
230*880d7978SAlexander Pyhalov  * return index and IBM code matching to the unicode.
231*880d7978SAlexander Pyhalov  */
get_ibm_by_utf(st,c1,c2,unidx,ibm_code)232*880d7978SAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
233*880d7978SAlexander Pyhalov _icv_state	*st;
234*880d7978SAlexander Pyhalov char            c1, c2;
235*880d7978SAlexander Pyhalov int             *unidx;
236*880d7978SAlexander Pyhalov unsigned long   *ibm_code;
237*880d7978SAlexander Pyhalov {
238*880d7978SAlexander Pyhalov         unsigned long   unicode;
239*880d7978SAlexander Pyhalov 
240*880d7978SAlexander Pyhalov         unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
241*880d7978SAlexander Pyhalov         *unidx = bisearch(unicode, st, st->table_size);
242*880d7978SAlexander Pyhalov         if ((*unidx) >= 0)
243*880d7978SAlexander Pyhalov 	{
244*880d7978SAlexander Pyhalov             if ( st->left_to_right )
245*880d7978SAlexander Pyhalov                 *ibm_code = st->table[*unidx].right_code;
246*880d7978SAlexander Pyhalov 	    else
247*880d7978SAlexander Pyhalov                 *ibm_code = st->table[*unidx].left_code;
248*880d7978SAlexander Pyhalov 	}
249*880d7978SAlexander Pyhalov #ifdef DEBUG
250*880d7978SAlexander Pyhalov     fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
251*880d7978SAlexander Pyhalov #endif
252*880d7978SAlexander Pyhalov 
253*880d7978SAlexander Pyhalov         return(0);
254*880d7978SAlexander Pyhalov }
255*880d7978SAlexander Pyhalov 
256*880d7978SAlexander Pyhalov 
257*880d7978SAlexander Pyhalov /*
258*880d7978SAlexander Pyhalov  * ISO/IEC 10646 (Unicode) --> IBM
259*880d7978SAlexander Pyhalov  * Unicode --> UTF8 (FSS-UTF)
260*880d7978SAlexander Pyhalov  *             (File System Safe Universal Character Set Transformation Format)
261*880d7978SAlexander Pyhalov  * Return: > 0 - converted with enough space in output buffer
262*880d7978SAlexander Pyhalov  *         = 0 - no space in outbuf
263*880d7978SAlexander Pyhalov  */
utf8_to_ibm(unidx,ibm_code,buf,buflen,st)264*880d7978SAlexander Pyhalov int utf8_to_ibm(unidx, ibm_code, buf, buflen, st)
265*880d7978SAlexander Pyhalov int             unidx;
266*880d7978SAlexander Pyhalov unsigned long   ibm_code;
267*880d7978SAlexander Pyhalov char            *buf;
268*880d7978SAlexander Pyhalov size_t          buflen;
269*880d7978SAlexander Pyhalov _icv_state 	*st;
270*880d7978SAlexander Pyhalov 
271*880d7978SAlexander Pyhalov {
272*880d7978SAlexander Pyhalov         unsigned long   val;            /* IBM value */
273*880d7978SAlexander Pyhalov         char            c1, c2, ibm_str[3];
274*880d7978SAlexander Pyhalov 
275*880d7978SAlexander Pyhalov         if (unidx < 0)         /* no match from UTF8 to IBM */
276*880d7978SAlexander Pyhalov 	    ibm_code = (unsigned long)NON_ID_CHAR;
277*880d7978SAlexander Pyhalov 
278*880d7978SAlexander Pyhalov         {
279*880d7978SAlexander Pyhalov                 val = ibm_code & 0xffff;
280*880d7978SAlexander Pyhalov                 c1 = (char) ((val & 0xff00) >> 8);
281*880d7978SAlexander Pyhalov                 c2 = (char) (val & 0xff);
282*880d7978SAlexander Pyhalov         }
283*880d7978SAlexander Pyhalov 
284*880d7978SAlexander Pyhalov 	/* it is single byte ascii */
285*880d7978SAlexander Pyhalov 	if ( c1 == 0x0 ) {
286*880d7978SAlexander Pyhalov 		if ( st->shift == SHIFT_OUT ) {
287*880d7978SAlexander Pyhalov 			if (buflen < 2) {
288*880d7978SAlexander Pyhalov 				errno = E2BIG;
289*880d7978SAlexander Pyhalov 				return 0;
290*880d7978SAlexander Pyhalov 			}
291*880d7978SAlexander Pyhalov 			*buf = SHIFT_IN;
292*880d7978SAlexander Pyhalov 			*(buf+1) = c2;
293*880d7978SAlexander Pyhalov 			st->shift = SHIFT_IN;
294*880d7978SAlexander Pyhalov 			return 2;
295*880d7978SAlexander Pyhalov 		}
296*880d7978SAlexander Pyhalov 		if (buflen < 1) {
297*880d7978SAlexander Pyhalov 			errno = E2BIG;
298*880d7978SAlexander Pyhalov 			return 0;
299*880d7978SAlexander Pyhalov 		}
300*880d7978SAlexander Pyhalov 		*buf = c2;
301*880d7978SAlexander Pyhalov 		return 1;
302*880d7978SAlexander Pyhalov        }
303*880d7978SAlexander Pyhalov 
304*880d7978SAlexander Pyhalov 	/* it is the first two bytes character */
305*880d7978SAlexander Pyhalov 	if ( st->shift == SHIFT_IN ) {
306*880d7978SAlexander Pyhalov 		if (buflen < 3) {
307*880d7978SAlexander Pyhalov 			errno = E2BIG;
308*880d7978SAlexander Pyhalov 			return 0;
309*880d7978SAlexander Pyhalov 		}
310*880d7978SAlexander Pyhalov 		*buf = SHIFT_OUT;
311*880d7978SAlexander Pyhalov 		st->shift = SHIFT_OUT;
312*880d7978SAlexander Pyhalov 		*(buf+1) = c1;
313*880d7978SAlexander Pyhalov 		*(buf+2) = c2;
314*880d7978SAlexander Pyhalov 		return 3;
315*880d7978SAlexander Pyhalov 	}
316*880d7978SAlexander Pyhalov 
317*880d7978SAlexander Pyhalov         *buf = ibm_str[0] = c1;
318*880d7978SAlexander Pyhalov         *(buf+1) = ibm_str[1] = c2;
319*880d7978SAlexander Pyhalov         ibm_str[2] = NULL;
320*880d7978SAlexander Pyhalov 
321*880d7978SAlexander Pyhalov #ifdef DEBUG
322*880d7978SAlexander Pyhalov     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
323*880d7978SAlexander Pyhalov #endif
324*880d7978SAlexander Pyhalov 
325*880d7978SAlexander Pyhalov 
326*880d7978SAlexander Pyhalov         if (buflen < 2) {
327*880d7978SAlexander Pyhalov                 errno = E2BIG;
328*880d7978SAlexander Pyhalov                 return(0);
329*880d7978SAlexander Pyhalov         }
330*880d7978SAlexander Pyhalov 
331*880d7978SAlexander Pyhalov         return(2);
332*880d7978SAlexander Pyhalov }
333