xref: /illumos-gate/usr/src/lib/iconv_modules/common/utf8%ibm.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1*16d86563SAlexander Pyhalov /*
2*16d86563SAlexander Pyhalov  * CDDL HEADER START
3*16d86563SAlexander Pyhalov  *
4*16d86563SAlexander Pyhalov  * The contents of this file are subject to the terms of the
5*16d86563SAlexander Pyhalov  * Common Development and Distribution License (the "License").
6*16d86563SAlexander Pyhalov  * You may not use this file except in compliance with the License.
7*16d86563SAlexander Pyhalov  *
8*16d86563SAlexander Pyhalov  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9*16d86563SAlexander Pyhalov  * or http://www.opensolaris.org/os/licensing.
10*16d86563SAlexander Pyhalov  * See the License for the specific language governing permissions
11*16d86563SAlexander Pyhalov  * and limitations under the License.
12*16d86563SAlexander Pyhalov  *
13*16d86563SAlexander Pyhalov  * When distributing Covered Code, include this CDDL HEADER in each
14*16d86563SAlexander Pyhalov  * file and include the License file at src/OPENSOLARIS.LICENSE.
15*16d86563SAlexander Pyhalov  * If applicable, add the following below this CDDL HEADER, with the
16*16d86563SAlexander Pyhalov  * fields enclosed by brackets "[]" replaced with your own identifying
17*16d86563SAlexander Pyhalov  * information: Portions Copyright [yyyy] [name of copyright owner]
18*16d86563SAlexander Pyhalov  *
19*16d86563SAlexander Pyhalov  * CDDL HEADER END
20*16d86563SAlexander Pyhalov  */
21*16d86563SAlexander Pyhalov 
22*16d86563SAlexander Pyhalov /*
23*16d86563SAlexander Pyhalov  * Copyright (c) 1997, by Sun Microsystems, Inc.
24*16d86563SAlexander Pyhalov  * All rights reserved.
25*16d86563SAlexander Pyhalov  */
26*16d86563SAlexander Pyhalov 
27*16d86563SAlexander Pyhalov #include <stdio.h>
28*16d86563SAlexander Pyhalov #include <stdlib.h>
29*16d86563SAlexander Pyhalov #include <errno.h>
30*16d86563SAlexander Pyhalov #include <sys/types.h>
31*16d86563SAlexander Pyhalov 
32*16d86563SAlexander Pyhalov #include "tab_lookup.h"   	/* table lookup data types */
33*16d86563SAlexander Pyhalov 
34*16d86563SAlexander Pyhalov #define MSB     0x80    /* most significant bit */
35*16d86563SAlexander Pyhalov #define ONEBYTE 0xff    /* right most byte */
36*16d86563SAlexander Pyhalov 
37*16d86563SAlexander Pyhalov enum _USTATE    { U0, U1, U11, U2, U3, U4 };
38*16d86563SAlexander Pyhalov 
39*16d86563SAlexander Pyhalov 
40*16d86563SAlexander Pyhalov int get_ibm_by_utf(_icv_state	*st, char c1, char c2, int *unidx,
41*16d86563SAlexander Pyhalov     unsigned long   *ibm_code);
42*16d86563SAlexander Pyhalov 
43*16d86563SAlexander Pyhalov int bisearch(unsigned long val, _icv_state *st, int n);
44*16d86563SAlexander Pyhalov 
45*16d86563SAlexander Pyhalov int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf,
46*16d86563SAlexander Pyhalov     size_t buflen, _icv_state *st);
47*16d86563SAlexander Pyhalov 
48*16d86563SAlexander Pyhalov /*
49*16d86563SAlexander Pyhalov  * Actual conversion; called from iconv()
50*16d86563SAlexander Pyhalov  * Input is UTF-8 data.
51*16d86563SAlexander Pyhalov  * first convert to UCS2
52*16d86563SAlexander Pyhalov  */
53*16d86563SAlexander Pyhalov size_t
54*16d86563SAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
55*16d86563SAlexander Pyhalov                         char **outbuf, size_t *outbytesleft)
56*16d86563SAlexander Pyhalov {
57*16d86563SAlexander Pyhalov /*
58*16d86563SAlexander Pyhalov  * Actual conversion; called from iconv()
59*16d86563SAlexander Pyhalov  */
60*16d86563SAlexander Pyhalov /*=========================================================
61*16d86563SAlexander Pyhalov  *
62*16d86563SAlexander Pyhalov  *       State Machine for interpreting UTF8 code
63*16d86563SAlexander Pyhalov  *
64*16d86563SAlexander Pyhalov  *=========================================================
65*16d86563SAlexander Pyhalov  *
66*16d86563SAlexander Pyhalov  *               3 byte unicode
67*16d86563SAlexander Pyhalov  *          +----->------->-------+
68*16d86563SAlexander Pyhalov  *          |                     |
69*16d86563SAlexander Pyhalov  *          ^                     v
70*16d86563SAlexander Pyhalov  *          |  2 byte             U2 ---> U3
71*16d86563SAlexander Pyhalov  *          |  unicode                    v
72*16d86563SAlexander Pyhalov  * +------> U0 -------> U1                +-------->U4---+
73*16d86563SAlexander Pyhalov  * ^  ascii |           |                           ^    |
74*16d86563SAlexander Pyhalov  * |        |           +-------->--------->--------+    |
75*16d86563SAlexander Pyhalov  * |        v                                            v
76*16d86563SAlexander Pyhalov  * +----<---+-----<------------<------------<------------+
77*16d86563SAlexander Pyhalov  *
78*16d86563SAlexander Pyhalov  * +----<---+-----<------------<------------<------------+
79*16d86563SAlexander Pyhalov  *
80*16d86563SAlexander Pyhalov  *=========================================================*/
81*16d86563SAlexander Pyhalov 
82*16d86563SAlexander Pyhalov         char            c1 = '\0', c2 = '\0';
83*16d86563SAlexander Pyhalov         int             n, unidx;
84*16d86563SAlexander Pyhalov         unsigned long   ibm_code;
85*16d86563SAlexander Pyhalov 
86*16d86563SAlexander Pyhalov #ifdef DEBUG
87*16d86563SAlexander Pyhalov     fprintf(stderr, "==========     iconv(): UTF8 --> IBM     ==========\n");
88*16d86563SAlexander Pyhalov #endif
89*16d86563SAlexander Pyhalov 
90*16d86563SAlexander Pyhalov         if (st == NULL) {
91*16d86563SAlexander Pyhalov                 errno = EBADF;
92*16d86563SAlexander Pyhalov                 return ((size_t) -1);
93*16d86563SAlexander Pyhalov         }
94*16d86563SAlexander Pyhalov 
95*16d86563SAlexander Pyhalov         if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
96*16d86563SAlexander Pyhalov                 st->ustate = U0;
97*16d86563SAlexander Pyhalov                 st->_errno = 0;
98*16d86563SAlexander Pyhalov 		st->shift = SHIFT_IN;
99*16d86563SAlexander Pyhalov                 return ((size_t) 0);
100*16d86563SAlexander Pyhalov         }
101*16d86563SAlexander Pyhalov 
102*16d86563SAlexander Pyhalov         st->_errno = 0;         /* reset internal errno */
103*16d86563SAlexander Pyhalov         errno = 0;              /* reset external errno */
104*16d86563SAlexander Pyhalov 
105*16d86563SAlexander Pyhalov         /* a state machine for interpreting UTF8 code */
106*16d86563SAlexander Pyhalov         while (*inbytesleft > 0 && *outbytesleft > 0) {
107*16d86563SAlexander Pyhalov                 switch (st->ustate) {
108*16d86563SAlexander Pyhalov                 case U0:
109*16d86563SAlexander Pyhalov 			/* it is ascii, convert it immediately */
110*16d86563SAlexander Pyhalov                         if ((**inbuf & MSB) == 0) {     /* ASCII */
111*16d86563SAlexander Pyhalov 				st->ustate = U4;
112*16d86563SAlexander Pyhalov 				st->keepc[0] = **inbuf;
113*16d86563SAlexander Pyhalov 				c1 = 0x0;
114*16d86563SAlexander Pyhalov 				c2 = **inbuf;
115*16d86563SAlexander Pyhalov 				continue;
116*16d86563SAlexander Pyhalov                         } else {        /* Chinese character */
117*16d86563SAlexander Pyhalov                                 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
118*16d86563SAlexander Pyhalov                                         st->ustate = U1;
119*16d86563SAlexander Pyhalov                                         st->keepc[0] = **inbuf;
120*16d86563SAlexander Pyhalov                                 } else if ((**inbuf & 0xf0) == 0xe0) {  /* 3 byte */
121*16d86563SAlexander Pyhalov                                         st->ustate = U2;
122*16d86563SAlexander Pyhalov                                         st->keepc[0] = **inbuf;
123*16d86563SAlexander Pyhalov                                 } else {        /* illegal unicode */
124*16d86563SAlexander Pyhalov                                         /* st->_errno = errno = EINVAL; */
125*16d86563SAlexander Pyhalov 				/* possible UNICODE ko_KR-UTF8 */
126*16d86563SAlexander Pyhalov 				c1 =st->keepc[0] = **inbuf;
127*16d86563SAlexander Pyhalov                                 st->ustate = U11;
128*16d86563SAlexander Pyhalov                                         break;
129*16d86563SAlexander Pyhalov                                 }
130*16d86563SAlexander Pyhalov                         }
131*16d86563SAlexander Pyhalov                         break;
132*16d86563SAlexander Pyhalov                 case U1:                /* 2 byte unicode */
133*16d86563SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
134*16d86563SAlexander Pyhalov                                 st->ustate = U4;
135*16d86563SAlexander Pyhalov                                 st->keepc[1] = **inbuf;
136*16d86563SAlexander Pyhalov                                 c1 = (st->keepc[0]&0x1c)>>2;
137*16d86563SAlexander Pyhalov                                 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
138*16d86563SAlexander Pyhalov #ifdef DEBUG
139*16d86563SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x   --> ",
140*16d86563SAlexander Pyhalov         st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
141*16d86563SAlexander Pyhalov #endif
142*16d86563SAlexander Pyhalov                                 continue;       /* should not advance *inbuf */
143*16d86563SAlexander Pyhalov                         } else {
144*16d86563SAlexander Pyhalov                                  st->_errno = errno = EINVAL;
145*16d86563SAlexander Pyhalov                         }
146*16d86563SAlexander Pyhalov                         break;
147*16d86563SAlexander Pyhalov                 case U11:                /* 3 byte unicode - 2nd byte */
148*16d86563SAlexander Pyhalov 				c2 =st->keepc[1] = **inbuf;
149*16d86563SAlexander Pyhalov                                 st->ustate = U4;
150*16d86563SAlexander Pyhalov 				continue;
151*16d86563SAlexander Pyhalov 			break;
152*16d86563SAlexander Pyhalov                 case U2:                /* 3 byte unicode - 2nd byte */
153*16d86563SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
154*16d86563SAlexander Pyhalov                                 st->ustate = U3;
155*16d86563SAlexander Pyhalov                                 st->keepc[1] = **inbuf;
156*16d86563SAlexander Pyhalov                         } else {
157*16d86563SAlexander Pyhalov                                 st->_errno = errno = EINVAL;
158*16d86563SAlexander Pyhalov                         }
159*16d86563SAlexander Pyhalov                         break;
160*16d86563SAlexander Pyhalov                 case U3:                /* 3 byte unicode - 3rd byte */
161*16d86563SAlexander Pyhalov                         if ((**inbuf & 0xc0) == MSB) {
162*16d86563SAlexander Pyhalov                                 st->ustate = U4;
163*16d86563SAlexander Pyhalov                                 st->keepc[2] = **inbuf;
164*16d86563SAlexander Pyhalov                                 c1 = ((st->keepc[0]&0x0f)<<4) |
165*16d86563SAlexander Pyhalov                                         ((st->keepc[1]&0x3c)>>2);
166*16d86563SAlexander Pyhalov                                 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
167*16d86563SAlexander Pyhalov #ifdef DEBUG
168*16d86563SAlexander Pyhalov     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
169*16d86563SAlexander Pyhalov                 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
170*16d86563SAlexander Pyhalov #endif
171*16d86563SAlexander Pyhalov                                 continue;       /* should not advance *inbuf */
172*16d86563SAlexander Pyhalov                         } else {
173*16d86563SAlexander Pyhalov                                 st->_errno = errno = EINVAL;
174*16d86563SAlexander Pyhalov                         }
175*16d86563SAlexander Pyhalov                         break;
176*16d86563SAlexander Pyhalov                 case U4:
177*16d86563SAlexander Pyhalov                         n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
178*16d86563SAlexander Pyhalov                         if (n != 0) {   /* legal unicode;illegal Big5 */
179*16d86563SAlexander Pyhalov                                 st->_errno = errno = EILSEQ;
180*16d86563SAlexander Pyhalov                                 break;
181*16d86563SAlexander Pyhalov                         }
182*16d86563SAlexander Pyhalov 
183*16d86563SAlexander Pyhalov                         n = utf8_to_ibm(unidx, ibm_code,
184*16d86563SAlexander Pyhalov                                         *outbuf, *outbytesleft, st);
185*16d86563SAlexander Pyhalov                         if (n > 0) {
186*16d86563SAlexander Pyhalov                                 (*outbuf) += n;
187*16d86563SAlexander Pyhalov                                 (*outbytesleft) -= n;
188*16d86563SAlexander Pyhalov                         } else {
189*16d86563SAlexander Pyhalov                                 st->_errno = errno;
190*16d86563SAlexander Pyhalov                                 return((size_t)-1);
191*16d86563SAlexander Pyhalov                         }
192*16d86563SAlexander Pyhalov                         st->ustate = U0;
193*16d86563SAlexander Pyhalov                         st->_errno = 0;
194*16d86563SAlexander Pyhalov                         break;
195*16d86563SAlexander Pyhalov                 default:                        /* should never come here */
196*16d86563SAlexander Pyhalov                         st->_errno = errno = EILSEQ;
197*16d86563SAlexander Pyhalov                         st->ustate = U0;        /* reset state */
198*16d86563SAlexander Pyhalov                         break;
199*16d86563SAlexander Pyhalov                 }
200*16d86563SAlexander Pyhalov 
201*16d86563SAlexander Pyhalov                 (*inbuf)++;
202*16d86563SAlexander Pyhalov                 (*inbytesleft)--;
203*16d86563SAlexander Pyhalov 
204*16d86563SAlexander Pyhalov                 if (st->_errno) {
205*16d86563SAlexander Pyhalov #ifdef DEBUG
206*16d86563SAlexander Pyhalov     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
207*16d86563SAlexander Pyhalov                 st->_errno, st->ustate);
208*16d86563SAlexander Pyhalov #endif
209*16d86563SAlexander Pyhalov                         break;
210*16d86563SAlexander Pyhalov                 }
211*16d86563SAlexander Pyhalov 
212*16d86563SAlexander Pyhalov                 if (errno)
213*16d86563SAlexander Pyhalov                         return((size_t)-1);
214*16d86563SAlexander Pyhalov         }
215*16d86563SAlexander Pyhalov 
216*16d86563SAlexander Pyhalov         if (*outbytesleft == 0) {
217*16d86563SAlexander Pyhalov                 errno = E2BIG;
218*16d86563SAlexander Pyhalov                 return((size_t)-1);
219*16d86563SAlexander Pyhalov         }
220*16d86563SAlexander Pyhalov         return (*inbytesleft);
221*16d86563SAlexander Pyhalov }
222*16d86563SAlexander Pyhalov 
223*16d86563SAlexander Pyhalov 
224*16d86563SAlexander Pyhalov /*
225*16d86563SAlexander Pyhalov  * Match IBM code by UTF8 code;
226*16d86563SAlexander Pyhalov  * Return: = 0 - match from Unicode to IBM found
227*16d86563SAlexander Pyhalov  *         = 1 - match from Unicode to IBM NOT found
228*16d86563SAlexander Pyhalov  *
229*16d86563SAlexander Pyhalov  * Since binary search of the UTF8 to IBM table is necessary, might as well
230*16d86563SAlexander Pyhalov  * return index and IBM code matching to the unicode.
231*16d86563SAlexander Pyhalov  */
232*16d86563SAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
233*16d86563SAlexander Pyhalov _icv_state	*st;
234*16d86563SAlexander Pyhalov char            c1, c2;
235*16d86563SAlexander Pyhalov int             *unidx;
236*16d86563SAlexander Pyhalov unsigned long   *ibm_code;
237*16d86563SAlexander Pyhalov {
238*16d86563SAlexander Pyhalov         unsigned long   unicode;
239*16d86563SAlexander Pyhalov 
240*16d86563SAlexander Pyhalov         unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
241*16d86563SAlexander Pyhalov         *unidx = bisearch(unicode, st, st->table_size);
242*16d86563SAlexander Pyhalov         if ((*unidx) >= 0)
243*16d86563SAlexander Pyhalov 	{
244*16d86563SAlexander Pyhalov             if ( st->left_to_right )
245*16d86563SAlexander Pyhalov                 *ibm_code = st->table[*unidx].right_code;
246*16d86563SAlexander Pyhalov 	    else
247*16d86563SAlexander Pyhalov                 *ibm_code = st->table[*unidx].left_code;
248*16d86563SAlexander Pyhalov 	}
249*16d86563SAlexander Pyhalov #ifdef DEBUG
250*16d86563SAlexander Pyhalov     fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
251*16d86563SAlexander Pyhalov #endif
252*16d86563SAlexander Pyhalov 
253*16d86563SAlexander Pyhalov         return(0);
254*16d86563SAlexander Pyhalov }
255*16d86563SAlexander Pyhalov 
256*16d86563SAlexander Pyhalov 
257*16d86563SAlexander Pyhalov /*
258*16d86563SAlexander Pyhalov  * ISO/IEC 10646 (Unicode) --> IBM
259*16d86563SAlexander Pyhalov  * Unicode --> UTF8 (FSS-UTF)
260*16d86563SAlexander Pyhalov  *             (File System Safe Universal Character Set Transformation Format)
261*16d86563SAlexander Pyhalov  * Return: > 0 - converted with enough space in output buffer
262*16d86563SAlexander Pyhalov  *         = 0 - no space in outbuf
263*16d86563SAlexander Pyhalov  */
264*16d86563SAlexander Pyhalov int utf8_to_ibm(unidx, ibm_code, buf, buflen, st)
265*16d86563SAlexander Pyhalov int             unidx;
266*16d86563SAlexander Pyhalov unsigned long   ibm_code;
267*16d86563SAlexander Pyhalov char            *buf;
268*16d86563SAlexander Pyhalov size_t          buflen;
269*16d86563SAlexander Pyhalov _icv_state 	*st;
270*16d86563SAlexander Pyhalov 
271*16d86563SAlexander Pyhalov {
272*16d86563SAlexander Pyhalov         unsigned long   val;            /* IBM value */
273*16d86563SAlexander Pyhalov         char            c1, c2, ibm_str[3];
274*16d86563SAlexander Pyhalov 
275*16d86563SAlexander Pyhalov         if (unidx < 0)         /* no match from UTF8 to IBM */
276*16d86563SAlexander Pyhalov 	    ibm_code = (unsigned long)NON_ID_CHAR;
277*16d86563SAlexander Pyhalov 
278*16d86563SAlexander Pyhalov         {
279*16d86563SAlexander Pyhalov                 val = ibm_code & 0xffff;
280*16d86563SAlexander Pyhalov                 c1 = (char) ((val & 0xff00) >> 8);
281*16d86563SAlexander Pyhalov                 c2 = (char) (val & 0xff);
282*16d86563SAlexander Pyhalov         }
283*16d86563SAlexander Pyhalov 
284*16d86563SAlexander Pyhalov 	/* it is single byte ascii */
285*16d86563SAlexander Pyhalov 	if ( c1 == 0x0 ) {
286*16d86563SAlexander Pyhalov 		if ( st->shift == SHIFT_OUT ) {
287*16d86563SAlexander Pyhalov 			if (buflen < 2) {
288*16d86563SAlexander Pyhalov 				errno = E2BIG;
289*16d86563SAlexander Pyhalov 				return 0;
290*16d86563SAlexander Pyhalov 			}
291*16d86563SAlexander Pyhalov 			*buf = SHIFT_IN;
292*16d86563SAlexander Pyhalov 			*(buf+1) = c2;
293*16d86563SAlexander Pyhalov 			st->shift = SHIFT_IN;
294*16d86563SAlexander Pyhalov 			return 2;
295*16d86563SAlexander Pyhalov 		}
296*16d86563SAlexander Pyhalov 		if (buflen < 1) {
297*16d86563SAlexander Pyhalov 			errno = E2BIG;
298*16d86563SAlexander Pyhalov 			return 0;
299*16d86563SAlexander Pyhalov 		}
300*16d86563SAlexander Pyhalov 		*buf = c2;
301*16d86563SAlexander Pyhalov 		return 1;
302*16d86563SAlexander Pyhalov        }
303*16d86563SAlexander Pyhalov 
304*16d86563SAlexander Pyhalov 	/* it is the first two bytes character */
305*16d86563SAlexander Pyhalov 	if ( st->shift == SHIFT_IN ) {
306*16d86563SAlexander Pyhalov 		if (buflen < 3) {
307*16d86563SAlexander Pyhalov 			errno = E2BIG;
308*16d86563SAlexander Pyhalov 			return 0;
309*16d86563SAlexander Pyhalov 		}
310*16d86563SAlexander Pyhalov 		*buf = SHIFT_OUT;
311*16d86563SAlexander Pyhalov 		st->shift = SHIFT_OUT;
312*16d86563SAlexander Pyhalov 		*(buf+1) = c1;
313*16d86563SAlexander Pyhalov 		*(buf+2) = c2;
314*16d86563SAlexander Pyhalov 		return 3;
315*16d86563SAlexander Pyhalov 	}
316*16d86563SAlexander Pyhalov 
317*16d86563SAlexander Pyhalov         *buf = ibm_str[0] = c1;
318*16d86563SAlexander Pyhalov         *(buf+1) = ibm_str[1] = c2;
319*16d86563SAlexander Pyhalov         ibm_str[2] = NULL;
320*16d86563SAlexander Pyhalov 
321*16d86563SAlexander Pyhalov #ifdef DEBUG
322*16d86563SAlexander Pyhalov     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
323*16d86563SAlexander Pyhalov #endif
324*16d86563SAlexander Pyhalov 
325*16d86563SAlexander Pyhalov 
326*16d86563SAlexander Pyhalov         if (buflen < 2) {
327*16d86563SAlexander Pyhalov                 errno = E2BIG;
328*16d86563SAlexander Pyhalov                 return(0);
329*16d86563SAlexander Pyhalov         }
330*16d86563SAlexander Pyhalov 
331*16d86563SAlexander Pyhalov         return(2);
332*16d86563SAlexander Pyhalov }
333