xref: /illumos-gate/usr/src/lib/iconv_modules/common/utf8%ibm.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1997, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include <sys/types.h>
31 
32 #include "tab_lookup.h"   	/* table lookup data types */
33 
34 #define MSB     0x80    /* most significant bit */
35 #define ONEBYTE 0xff    /* right most byte */
36 
37 enum _USTATE    { U0, U1, U11, U2, U3, U4 };
38 
39 
40 int get_ibm_by_utf(_icv_state	*st, char c1, char c2, int *unidx,
41     unsigned long   *ibm_code);
42 
43 int bisearch(unsigned long val, _icv_state *st, int n);
44 
45 int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf,
46     size_t buflen, _icv_state *st);
47 
48 /*
49  * Actual conversion; called from iconv()
50  * Input is UTF-8 data.
51  * first convert to UCS2
52  */
53 size_t
54 _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
55                         char **outbuf, size_t *outbytesleft)
56 {
57 /*
58  * Actual conversion; called from iconv()
59  */
60 /*=========================================================
61  *
62  *       State Machine for interpreting UTF8 code
63  *
64  *=========================================================
65  *
66  *               3 byte unicode
67  *          +----->------->-------+
68  *          |                     |
69  *          ^                     v
70  *          |  2 byte             U2 ---> U3
71  *          |  unicode                    v
72  * +------> U0 -------> U1                +-------->U4---+
73  * ^  ascii |           |                           ^    |
74  * |        |           +-------->--------->--------+    |
75  * |        v                                            v
76  * +----<---+-----<------------<------------<------------+
77  *
78  * +----<---+-----<------------<------------<------------+
79  *
80  *=========================================================*/
81 
82         char            c1 = '\0', c2 = '\0';
83         int             n, unidx;
84         unsigned long   ibm_code;
85 
86 #ifdef DEBUG
87     fprintf(stderr, "==========     iconv(): UTF8 --> IBM     ==========\n");
88 #endif
89 
90         if (st == NULL) {
91                 errno = EBADF;
92                 return ((size_t) -1);
93         }
94 
95         if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
96                 st->ustate = U0;
97                 st->_errno = 0;
98 		st->shift = SHIFT_IN;
99                 return ((size_t) 0);
100         }
101 
102         st->_errno = 0;         /* reset internal errno */
103         errno = 0;              /* reset external errno */
104 
105         /* a state machine for interpreting UTF8 code */
106         while (*inbytesleft > 0 && *outbytesleft > 0) {
107                 switch (st->ustate) {
108                 case U0:
109 			/* it is ascii, convert it immediately */
110                         if ((**inbuf & MSB) == 0) {     /* ASCII */
111 				st->ustate = U4;
112 				st->keepc[0] = **inbuf;
113 				c1 = 0x0;
114 				c2 = **inbuf;
115 				continue;
116                         } else {        /* Chinese character */
117                                 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
118                                         st->ustate = U1;
119                                         st->keepc[0] = **inbuf;
120                                 } else if ((**inbuf & 0xf0) == 0xe0) {  /* 3 byte */
121                                         st->ustate = U2;
122                                         st->keepc[0] = **inbuf;
123                                 } else {        /* illegal unicode */
124                                         /* st->_errno = errno = EINVAL; */
125 				/* possible UNICODE ko_KR-UTF8 */
126 				c1 =st->keepc[0] = **inbuf;
127                                 st->ustate = U11;
128                                         break;
129                                 }
130                         }
131                         break;
132                 case U1:                /* 2 byte unicode */
133                         if ((**inbuf & 0xc0) == MSB) {
134                                 st->ustate = U4;
135                                 st->keepc[1] = **inbuf;
136                                 c1 = (st->keepc[0]&0x1c)>>2;
137                                 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
138 #ifdef DEBUG
139     fprintf(stderr, "UTF8: %02x%02x   --> ",
140         st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
141 #endif
142                                 continue;       /* should not advance *inbuf */
143                         } else {
144                                  st->_errno = errno = EINVAL;
145                         }
146                         break;
147                 case U11:                /* 3 byte unicode - 2nd byte */
148 				c2 =st->keepc[1] = **inbuf;
149                                 st->ustate = U4;
150 				continue;
151 			break;
152                 case U2:                /* 3 byte unicode - 2nd byte */
153                         if ((**inbuf & 0xc0) == MSB) {
154                                 st->ustate = U3;
155                                 st->keepc[1] = **inbuf;
156                         } else {
157                                 st->_errno = errno = EINVAL;
158                         }
159                         break;
160                 case U3:                /* 3 byte unicode - 3rd byte */
161                         if ((**inbuf & 0xc0) == MSB) {
162                                 st->ustate = U4;
163                                 st->keepc[2] = **inbuf;
164                                 c1 = ((st->keepc[0]&0x0f)<<4) |
165                                         ((st->keepc[1]&0x3c)>>2);
166                                 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
167 #ifdef DEBUG
168     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
169                 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
170 #endif
171                                 continue;       /* should not advance *inbuf */
172                         } else {
173                                 st->_errno = errno = EINVAL;
174                         }
175                         break;
176                 case U4:
177                         n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
178                         if (n != 0) {   /* legal unicode;illegal Big5 */
179                                 st->_errno = errno = EILSEQ;
180                                 break;
181                         }
182 
183                         n = utf8_to_ibm(unidx, ibm_code,
184                                         *outbuf, *outbytesleft, st);
185                         if (n > 0) {
186                                 (*outbuf) += n;
187                                 (*outbytesleft) -= n;
188                         } else {
189                                 st->_errno = errno;
190                                 return((size_t)-1);
191                         }
192                         st->ustate = U0;
193                         st->_errno = 0;
194                         break;
195                 default:                        /* should never come here */
196                         st->_errno = errno = EILSEQ;
197                         st->ustate = U0;        /* reset state */
198                         break;
199                 }
200 
201                 (*inbuf)++;
202                 (*inbytesleft)--;
203 
204                 if (st->_errno) {
205 #ifdef DEBUG
206     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
207                 st->_errno, st->ustate);
208 #endif
209                         break;
210                 }
211 
212                 if (errno)
213                         return((size_t)-1);
214         }
215 
216         if (*outbytesleft == 0) {
217                 errno = E2BIG;
218                 return((size_t)-1);
219         }
220         return (*inbytesleft);
221 }
222 
223 
224 /*
225  * Match IBM code by UTF8 code;
226  * Return: = 0 - match from Unicode to IBM found
227  *         = 1 - match from Unicode to IBM NOT found
228  *
229  * Since binary search of the UTF8 to IBM table is necessary, might as well
230  * return index and IBM code matching to the unicode.
231  */
232 int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
233 _icv_state	*st;
234 char            c1, c2;
235 int             *unidx;
236 unsigned long   *ibm_code;
237 {
238         unsigned long   unicode;
239 
240         unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
241         *unidx = bisearch(unicode, st, st->table_size);
242         if ((*unidx) >= 0)
243 	{
244             if ( st->left_to_right )
245                 *ibm_code = st->table[*unidx].right_code;
246 	    else
247                 *ibm_code = st->table[*unidx].left_code;
248 	}
249 #ifdef DEBUG
250     fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
251 #endif
252 
253         return(0);
254 }
255 
256 
257 /*
258  * ISO/IEC 10646 (Unicode) --> IBM
259  * Unicode --> UTF8 (FSS-UTF)
260  *             (File System Safe Universal Character Set Transformation Format)
261  * Return: > 0 - converted with enough space in output buffer
262  *         = 0 - no space in outbuf
263  */
264 int utf8_to_ibm(unidx, ibm_code, buf, buflen, st)
265 int             unidx;
266 unsigned long   ibm_code;
267 char            *buf;
268 size_t          buflen;
269 _icv_state 	*st;
270 
271 {
272         unsigned long   val;            /* IBM value */
273         char            c1, c2, ibm_str[3];
274 
275         if (unidx < 0)         /* no match from UTF8 to IBM */
276 	    ibm_code = (unsigned long)NON_ID_CHAR;
277 
278         {
279                 val = ibm_code & 0xffff;
280                 c1 = (char) ((val & 0xff00) >> 8);
281                 c2 = (char) (val & 0xff);
282         }
283 
284 	/* it is single byte ascii */
285 	if ( c1 == 0x0 ) {
286 		if ( st->shift == SHIFT_OUT ) {
287 			if (buflen < 2) {
288 				errno = E2BIG;
289 				return 0;
290 			}
291 			*buf = SHIFT_IN;
292 			*(buf+1) = c2;
293 			st->shift = SHIFT_IN;
294 			return 2;
295 		}
296 		if (buflen < 1) {
297 			errno = E2BIG;
298 			return 0;
299 		}
300 		*buf = c2;
301 		return 1;
302        }
303 
304 	/* it is the first two bytes character */
305 	if ( st->shift == SHIFT_IN ) {
306 		if (buflen < 3) {
307 			errno = E2BIG;
308 			return 0;
309 		}
310 		*buf = SHIFT_OUT;
311 		st->shift = SHIFT_OUT;
312 		*(buf+1) = c1;
313 		*(buf+2) = c2;
314 		return 3;
315 	}
316 
317         *buf = ibm_str[0] = c1;
318         *(buf+1) = ibm_str[1] = c2;
319         ibm_str[2] = NULL;
320 
321 #ifdef DEBUG
322     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
323 #endif
324 
325 
326         if (buflen < 2) {
327                 errno = E2BIG;
328                 return(0);
329         }
330 
331         return(2);
332 }
333