xref: /illumos-gate/usr/src/lib/iconv_modules/common/cnv_utf8ibm.c (revision 45ede40b2394db7967e59f19288fae9b62efd4aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1997, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include <sys/types.h>
31 
32 #include "tab_lookup.h"   	/* table lookup data types */
33 
/* Bit masks used while taking UTF-8 bytes apart. */
#define MSB	0x80	/* most significant bit of a byte */
#define ONEBYTE	0xff	/* keeps only the right-most (low-order) byte */

/*
 * Decoder states used by _icv_iconv().  The enumerator order (and therefore
 * the numeric values) is unchanged.
 */
enum _USTATE {
	U0,	/* between characters; next byte is ASCII or a lead byte */
	U1,	/* lead byte of a 2-byte sequence seen; expecting its 2nd byte */
	U11,	/* unrecognized high-bit byte seen; next byte is taken as-is */
	U2,	/* lead byte of a 3-byte sequence seen; expecting its 2nd byte */
	U3,	/* 2nd byte of a 3-byte sequence seen; expecting its 3rd byte */
	U4	/* all bytes of a character collected; look up and emit */
};
38 
39 
40 
41 
42 /*
43  * Actual conversion; called from iconv()
44  * Input is UTF-8 data.
45  * first convert to UCS2
46  */
47 size_t
48 _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
49                         char **outbuf, size_t *outbytesleft)
50 {
51 /*
52  * Actual conversion; called from iconv()
53  */
54 /*=========================================================
55  *
56  *       State Machine for interpreting UTF8 code
57  *
58  *=========================================================
59  *
60  *               3 byte unicode
61  *          +----->------->-------+
62  *          |                     |
63  *          ^                     v
64  *          |  2 byte             U2 ---> U3
65  *          |  unicode                    v
66  * +------> U0 -------> U1                +-------->U4---+
67  * ^  ascii |           |                           ^    |
68  * |        |           +-------->--------->--------+    |
69  * |        v                                            v
70  * +----<---+-----<------------<------------<------------+
71  *
72  * +----<---+-----<------------<------------<------------+
73  *
74  *=========================================================*/
75 
76         char            c1, c2;
77         int             n, unidx;
78         unsigned long   ibm_code;
79 
80 #ifdef DEBUG
81     fprintf(stderr, "==========     iconv(): UTF8 --> IBM     ==========\n");
82 #endif
83 
84         if (st == NULL) {
85                 errno = EBADF;
86                 return ((size_t) -1);
87         }
88 
89         if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
90                 st->ustate = U0;
91                 st->_errno = 0;
92                 return ((size_t) 0);
93         }
94 
95         st->_errno = 0;         /* reset internal errno */
96         errno = 0;              /* reset external errno */
97 
98         /* a state machine for interpreting UTF8 code */
99         while (*inbytesleft > 0 && *outbytesleft > 0) {
100                 switch (st->ustate) {
101                 case U0:                /* assuming ASCII in the beginning */
102                         if ((**inbuf & MSB) == 0) {     /* ASCII */
103                                 **outbuf = **inbuf;
104                                 (*outbuf)++;
105                                 (*outbytesleft)--;
106                         } else {        /* Chinese character */
107                                 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
108                                         st->ustate = U1;
109                                         st->keepc[0] = **inbuf;
110                                 } else if ((**inbuf & 0xf0) == 0xe0) {  /* 3 byte */
111                                         st->ustate = U2;
112                                         st->keepc[0] = **inbuf;
113                                 } else {        /* illegal unicode */
114                                         /* st->_errno = errno = EINVAL; */
115 				/* possible UNICODE ko_KR-UTF8 */
116 				c1 =st->keepc[0] = **inbuf;
117                                 st->ustate = U11;
118                                         break;
119                                 }
120                         }
121                         break;
122                 case U1:                /* 2 byte unicode */
123                         if ((**inbuf & 0xc0) == MSB) {
124                                 st->ustate = U4;
125                                 st->keepc[1] = **inbuf;
126                                 c1 = (st->keepc[0]&0x1c)>>2;
127                                 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
128 #ifdef DEBUG
129     fprintf(stderr, "UTF8: %02x%02x   --> ",
130         st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
131 #endif
132                                 continue;       /* should not advance *inbuf */
133                         } else {
134                                  st->_errno = errno = EINVAL;
135                         }
136                         break;
137                 case U11:                /* 3 byte unicode - 2nd byte */
138 				c2 =st->keepc[1] = **inbuf;
139                                 st->ustate = U4;
140 				continue;
141 			break;
142                 case U2:                /* 3 byte unicode - 2nd byte */
143                         if ((**inbuf & 0xc0) == MSB) {
144                                 st->ustate = U3;
145                                 st->keepc[1] = **inbuf;
146                         } else {
147                                 st->_errno = errno = EINVAL;
148                         }
149                         break;
150                 case U3:                /* 3 byte unicode - 3rd byte */
151                         if ((**inbuf & 0xc0) == MSB) {
152                                 st->ustate = U4;
153                                 st->keepc[2] = **inbuf;
154                                 c1 = ((st->keepc[0]&0x0f)<<4) |
155                                         ((st->keepc[1]&0x3c)>>2);
156                                 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
157 #ifdef DEBUG
158     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
159                 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
160 #endif
161                                 continue;       /* should not advance *inbuf */
162                         } else {
163                                 st->_errno = errno = EINVAL;
164                         }
165                         break;
166                 case U4:
167                         n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
168                         if (n != 0) {   /* legal unicode;illegal Big5 */
169                                 st->_errno = errno = EILSEQ;
170                                 break;
171                         }
172 
173                         n = utf8_to_ibm(unidx, ibm_code,
174                                         *outbuf, *outbytesleft);
175                         if (n > 0) {
176                                 (*outbuf) += n;
177                                 (*outbytesleft) -= n;
178                         } else {
179                                 st->_errno = errno;
180                                 return((size_t)-1);
181                         }
182                         st->ustate = U0;
183                         st->_errno = 0;
184                         break;
185                 default:                        /* should never come here */
186                         st->_errno = errno = EILSEQ;
187                         st->ustate = U0;        /* reset state */
188                         break;
189                 }
190 
191                 (*inbuf)++;
192                 (*inbytesleft)--;
193 
194                 if (st->_errno) {
195 #ifdef DEBUG
196     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
197                 st->_errno, st->ustate);
198 #endif
199                         break;
200                 }
201 
202                 if (errno)
203                         return((size_t)-1);
204         }
205 
206         if (*outbytesleft == 0) {
207                 errno = E2BIG;
208                 return((size_t)-1);
209         }
210         return (*inbytesleft);
211 }
212 
213 
214 /*
215  * Match IBM code by UTF8 code;
216  * Return: = 0 - match from Unicode to IBM found
217  *         = 1 - match from Unicode to IBM NOT found
218  *
219  * Since binary search of the UTF8 to IBM table is necessary, might as well
220  * return index and IBM code matching to the unicode.
221  */
222 int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
223 _icv_state	*st;
224 char            c1, c2;
225 int             *unidx;
226 unsigned long   *ibm_code;
227 {
228         unsigned long   unicode;
229 
230         unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
231         *unidx = bisearch(unicode, st, st->table_size);
232         if ((*unidx) >= 0)
233 	{
234             if ( st->left_to_right )
235                 *ibm_code = st->table[*unidx].right_code;
236 	    else
237                 *ibm_code = st->table[*unidx].left_code;
238 	}
239         else
240                 ;      /* match from UTF8 to IBM not found */
241 #ifdef DEBUG
242     fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
243 #endif
244 
245         return(0);
246 }
247 
248 
249 /*
250  * ISO/IEC 10646 (Unicode) --> IBM
251  * Unicode --> UTF8 (FSS-UTF)
252  *             (File System Safe Universal Character Set Transformation Format)
253  * Return: > 0 - converted with enough space in output buffer
254  *         = 0 - no space in outbuf
255  */
256 int utf8_to_ibm(unidx, ibm_code, buf, buflen)
257 int             unidx;
258 unsigned long   ibm_code;
259 char            *buf;
260 size_t          buflen;
261 
262 {
263         unsigned long   val;            /* IBM value */
264         char            c1, c2, ibm_str[3];
265 
266         if (unidx < 0)         /* no match from UTF8 to IBM */
267 	    ibm_code = (unsigned long)NON_ID_CHAR;
268 
269         {
270                 val = ibm_code & 0xffff;
271                 c1 = (char) ((val & 0xff00) >> 8);
272                 c2 = (char) (val & 0xff);
273         }
274 
275         *buf = ibm_str[0] = c1;
276         *(buf+1) = ibm_str[1] = c2;
277         ibm_str[2] = NULL;
278 
279 #ifdef DEBUG
280     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
281 #endif
282 
283 
284         if (buflen < 2) {
285                 errno = E2BIG;
286                 return(0);
287         }
288 
289         return(2);
290 }
291