/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1997, by Sun Microsystems, Inc.
 * All rights reserved.
 */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>

#include "tab_lookup.h"	/* table lookup data types */

/* Bit masks applied to single bytes of the input stream. */
#define	MSB	0x80	/* most significant bit */
#define	ONEBYTE	0xff	/* right most byte */

/*
 * States of the UTF-8 decoder driven by _icv_iconv().  The current state
 * is kept in the conversion descriptor (st->ustate) between bytes.
 *
 *   U0  - at a character boundary: the next byte is either ASCII or the
 *         first byte of a multi-byte sequence
 *   U1  - first byte of a 2-byte sequence stored; expecting its 2nd byte
 *   U11 - a high-bit byte that is neither a 2-byte nor a 3-byte lead was
 *         stored; the next byte is taken verbatim as its partner
 *   U2  - first byte of a 3-byte sequence stored; expecting its 2nd byte
 *   U3  - expecting the 3rd byte of a 3-byte sequence
 *   U4  - a complete sequence is buffered and awaits table lookup/output
 */
enum _USTATE { U0, U1, U11, U2, U3, U4 };



42*91e1e26aSAlexander Pyhalov /*
43*91e1e26aSAlexander Pyhalov * Actual conversion; called from iconv()
44*91e1e26aSAlexander Pyhalov * Input is UTF-8 data.
45*91e1e26aSAlexander Pyhalov * first convert to UCS2
46*91e1e26aSAlexander Pyhalov */
47*91e1e26aSAlexander Pyhalov size_t
_icv_iconv(_icv_state * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)48*91e1e26aSAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
49*91e1e26aSAlexander Pyhalov char **outbuf, size_t *outbytesleft)
50*91e1e26aSAlexander Pyhalov {
51*91e1e26aSAlexander Pyhalov /*
52*91e1e26aSAlexander Pyhalov * Actual conversion; called from iconv()
53*91e1e26aSAlexander Pyhalov */
54*91e1e26aSAlexander Pyhalov /*=========================================================
55*91e1e26aSAlexander Pyhalov *
56*91e1e26aSAlexander Pyhalov * State Machine for interpreting UTF8 code
57*91e1e26aSAlexander Pyhalov *
58*91e1e26aSAlexander Pyhalov *=========================================================
59*91e1e26aSAlexander Pyhalov *
60*91e1e26aSAlexander Pyhalov * 3 byte unicode
61*91e1e26aSAlexander Pyhalov * +----->------->-------+
62*91e1e26aSAlexander Pyhalov * | |
63*91e1e26aSAlexander Pyhalov * ^ v
64*91e1e26aSAlexander Pyhalov * | 2 byte U2 ---> U3
65*91e1e26aSAlexander Pyhalov * | unicode v
66*91e1e26aSAlexander Pyhalov * +------> U0 -------> U1 +-------->U4---+
67*91e1e26aSAlexander Pyhalov * ^ ascii | | ^ |
68*91e1e26aSAlexander Pyhalov * | | +-------->--------->--------+ |
69*91e1e26aSAlexander Pyhalov * | v v
70*91e1e26aSAlexander Pyhalov * +----<---+-----<------------<------------<------------+
71*91e1e26aSAlexander Pyhalov *
72*91e1e26aSAlexander Pyhalov * +----<---+-----<------------<------------<------------+
73*91e1e26aSAlexander Pyhalov *
74*91e1e26aSAlexander Pyhalov *=========================================================*/
75*91e1e26aSAlexander Pyhalov
76*91e1e26aSAlexander Pyhalov char c1, c2;
77*91e1e26aSAlexander Pyhalov int n, unidx;
78*91e1e26aSAlexander Pyhalov unsigned long ibm_code;
79*91e1e26aSAlexander Pyhalov
80*91e1e26aSAlexander Pyhalov #ifdef DEBUG
81*91e1e26aSAlexander Pyhalov fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n");
82*91e1e26aSAlexander Pyhalov #endif
83*91e1e26aSAlexander Pyhalov
84*91e1e26aSAlexander Pyhalov if (st == NULL) {
85*91e1e26aSAlexander Pyhalov errno = EBADF;
86*91e1e26aSAlexander Pyhalov return ((size_t) -1);
87*91e1e26aSAlexander Pyhalov }
88*91e1e26aSAlexander Pyhalov
89*91e1e26aSAlexander Pyhalov if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
90*91e1e26aSAlexander Pyhalov st->ustate = U0;
91*91e1e26aSAlexander Pyhalov st->_errno = 0;
92*91e1e26aSAlexander Pyhalov return ((size_t) 0);
93*91e1e26aSAlexander Pyhalov }
94*91e1e26aSAlexander Pyhalov
95*91e1e26aSAlexander Pyhalov st->_errno = 0; /* reset internal errno */
96*91e1e26aSAlexander Pyhalov errno = 0; /* reset external errno */
97*91e1e26aSAlexander Pyhalov
98*91e1e26aSAlexander Pyhalov /* a state machine for interpreting UTF8 code */
99*91e1e26aSAlexander Pyhalov while (*inbytesleft > 0 && *outbytesleft > 0) {
100*91e1e26aSAlexander Pyhalov switch (st->ustate) {
101*91e1e26aSAlexander Pyhalov case U0: /* assuming ASCII in the beginning */
102*91e1e26aSAlexander Pyhalov if ((**inbuf & MSB) == 0) { /* ASCII */
103*91e1e26aSAlexander Pyhalov **outbuf = **inbuf;
104*91e1e26aSAlexander Pyhalov (*outbuf)++;
105*91e1e26aSAlexander Pyhalov (*outbytesleft)--;
106*91e1e26aSAlexander Pyhalov } else { /* Chinese character */
107*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
108*91e1e26aSAlexander Pyhalov st->ustate = U1;
109*91e1e26aSAlexander Pyhalov st->keepc[0] = **inbuf;
110*91e1e26aSAlexander Pyhalov } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */
111*91e1e26aSAlexander Pyhalov st->ustate = U2;
112*91e1e26aSAlexander Pyhalov st->keepc[0] = **inbuf;
113*91e1e26aSAlexander Pyhalov } else { /* illegal unicode */
114*91e1e26aSAlexander Pyhalov /* st->_errno = errno = EINVAL; */
115*91e1e26aSAlexander Pyhalov /* possible UNICODE ko_KR-UTF8 */
116*91e1e26aSAlexander Pyhalov c1 =st->keepc[0] = **inbuf;
117*91e1e26aSAlexander Pyhalov st->ustate = U11;
118*91e1e26aSAlexander Pyhalov break;
119*91e1e26aSAlexander Pyhalov }
120*91e1e26aSAlexander Pyhalov }
121*91e1e26aSAlexander Pyhalov break;
122*91e1e26aSAlexander Pyhalov case U1: /* 2 byte unicode */
123*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
124*91e1e26aSAlexander Pyhalov st->ustate = U4;
125*91e1e26aSAlexander Pyhalov st->keepc[1] = **inbuf;
126*91e1e26aSAlexander Pyhalov c1 = (st->keepc[0]&0x1c)>>2;
127*91e1e26aSAlexander Pyhalov c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
128*91e1e26aSAlexander Pyhalov #ifdef DEBUG
129*91e1e26aSAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x --> ",
130*91e1e26aSAlexander Pyhalov st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
131*91e1e26aSAlexander Pyhalov #endif
132*91e1e26aSAlexander Pyhalov continue; /* should not advance *inbuf */
133*91e1e26aSAlexander Pyhalov } else {
134*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL;
135*91e1e26aSAlexander Pyhalov }
136*91e1e26aSAlexander Pyhalov break;
137*91e1e26aSAlexander Pyhalov case U11: /* 3 byte unicode - 2nd byte */
138*91e1e26aSAlexander Pyhalov c2 =st->keepc[1] = **inbuf;
139*91e1e26aSAlexander Pyhalov st->ustate = U4;
140*91e1e26aSAlexander Pyhalov continue;
141*91e1e26aSAlexander Pyhalov break;
142*91e1e26aSAlexander Pyhalov case U2: /* 3 byte unicode - 2nd byte */
143*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
144*91e1e26aSAlexander Pyhalov st->ustate = U3;
145*91e1e26aSAlexander Pyhalov st->keepc[1] = **inbuf;
146*91e1e26aSAlexander Pyhalov } else {
147*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL;
148*91e1e26aSAlexander Pyhalov }
149*91e1e26aSAlexander Pyhalov break;
150*91e1e26aSAlexander Pyhalov case U3: /* 3 byte unicode - 3rd byte */
151*91e1e26aSAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
152*91e1e26aSAlexander Pyhalov st->ustate = U4;
153*91e1e26aSAlexander Pyhalov st->keepc[2] = **inbuf;
154*91e1e26aSAlexander Pyhalov c1 = ((st->keepc[0]&0x0f)<<4) |
155*91e1e26aSAlexander Pyhalov ((st->keepc[1]&0x3c)>>2);
156*91e1e26aSAlexander Pyhalov c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
157*91e1e26aSAlexander Pyhalov #ifdef DEBUG
158*91e1e26aSAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
159*91e1e26aSAlexander Pyhalov st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
160*91e1e26aSAlexander Pyhalov #endif
161*91e1e26aSAlexander Pyhalov continue; /* should not advance *inbuf */
162*91e1e26aSAlexander Pyhalov } else {
163*91e1e26aSAlexander Pyhalov st->_errno = errno = EINVAL;
164*91e1e26aSAlexander Pyhalov }
165*91e1e26aSAlexander Pyhalov break;
166*91e1e26aSAlexander Pyhalov case U4:
167*91e1e26aSAlexander Pyhalov n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
168*91e1e26aSAlexander Pyhalov if (n != 0) { /* legal unicode;illegal Big5 */
169*91e1e26aSAlexander Pyhalov st->_errno = errno = EILSEQ;
170*91e1e26aSAlexander Pyhalov break;
171*91e1e26aSAlexander Pyhalov }
172*91e1e26aSAlexander Pyhalov
173*91e1e26aSAlexander Pyhalov n = utf8_to_ibm(unidx, ibm_code,
174*91e1e26aSAlexander Pyhalov *outbuf, *outbytesleft);
175*91e1e26aSAlexander Pyhalov if (n > 0) {
176*91e1e26aSAlexander Pyhalov (*outbuf) += n;
177*91e1e26aSAlexander Pyhalov (*outbytesleft) -= n;
178*91e1e26aSAlexander Pyhalov } else {
179*91e1e26aSAlexander Pyhalov st->_errno = errno;
180*91e1e26aSAlexander Pyhalov return((size_t)-1);
181*91e1e26aSAlexander Pyhalov }
182*91e1e26aSAlexander Pyhalov st->ustate = U0;
183*91e1e26aSAlexander Pyhalov st->_errno = 0;
184*91e1e26aSAlexander Pyhalov break;
185*91e1e26aSAlexander Pyhalov default: /* should never come here */
186*91e1e26aSAlexander Pyhalov st->_errno = errno = EILSEQ;
187*91e1e26aSAlexander Pyhalov st->ustate = U0; /* reset state */
188*91e1e26aSAlexander Pyhalov break;
189*91e1e26aSAlexander Pyhalov }
190*91e1e26aSAlexander Pyhalov
191*91e1e26aSAlexander Pyhalov (*inbuf)++;
192*91e1e26aSAlexander Pyhalov (*inbytesleft)--;
193*91e1e26aSAlexander Pyhalov
194*91e1e26aSAlexander Pyhalov if (st->_errno) {
195*91e1e26aSAlexander Pyhalov #ifdef DEBUG
196*91e1e26aSAlexander Pyhalov fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
197*91e1e26aSAlexander Pyhalov st->_errno, st->ustate);
198*91e1e26aSAlexander Pyhalov #endif
199*91e1e26aSAlexander Pyhalov break;
200*91e1e26aSAlexander Pyhalov }
201*91e1e26aSAlexander Pyhalov
202*91e1e26aSAlexander Pyhalov if (errno)
203*91e1e26aSAlexander Pyhalov return((size_t)-1);
204*91e1e26aSAlexander Pyhalov }
205*91e1e26aSAlexander Pyhalov
206*91e1e26aSAlexander Pyhalov if (*outbytesleft == 0) {
207*91e1e26aSAlexander Pyhalov errno = E2BIG;
208*91e1e26aSAlexander Pyhalov return((size_t)-1);
209*91e1e26aSAlexander Pyhalov }
210*91e1e26aSAlexander Pyhalov return (*inbytesleft);
211*91e1e26aSAlexander Pyhalov }
212*91e1e26aSAlexander Pyhalov
213*91e1e26aSAlexander Pyhalov
214*91e1e26aSAlexander Pyhalov /*
215*91e1e26aSAlexander Pyhalov * Match IBM code by UTF8 code;
216*91e1e26aSAlexander Pyhalov * Return: = 0 - match from Unicode to IBM found
217*91e1e26aSAlexander Pyhalov * = 1 - match from Unicode to IBM NOT found
218*91e1e26aSAlexander Pyhalov *
219*91e1e26aSAlexander Pyhalov * Since binary search of the UTF8 to IBM table is necessary, might as well
220*91e1e26aSAlexander Pyhalov * return index and IBM code matching to the unicode.
221*91e1e26aSAlexander Pyhalov */
get_ibm_by_utf(st,c1,c2,unidx,ibm_code)222*91e1e26aSAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
223*91e1e26aSAlexander Pyhalov _icv_state *st;
224*91e1e26aSAlexander Pyhalov char c1, c2;
225*91e1e26aSAlexander Pyhalov int *unidx;
226*91e1e26aSAlexander Pyhalov unsigned long *ibm_code;
227*91e1e26aSAlexander Pyhalov {
228*91e1e26aSAlexander Pyhalov unsigned long unicode;
229*91e1e26aSAlexander Pyhalov
230*91e1e26aSAlexander Pyhalov unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
231*91e1e26aSAlexander Pyhalov *unidx = bisearch(unicode, st, st->table_size);
232*91e1e26aSAlexander Pyhalov if ((*unidx) >= 0)
233*91e1e26aSAlexander Pyhalov {
234*91e1e26aSAlexander Pyhalov if ( st->left_to_right )
235*91e1e26aSAlexander Pyhalov *ibm_code = st->table[*unidx].right_code;
236*91e1e26aSAlexander Pyhalov else
237*91e1e26aSAlexander Pyhalov *ibm_code = st->table[*unidx].left_code;
238*91e1e26aSAlexander Pyhalov }
239*91e1e26aSAlexander Pyhalov else
240*91e1e26aSAlexander Pyhalov ; /* match from UTF8 to IBM not found */
241*91e1e26aSAlexander Pyhalov #ifdef DEBUG
242*91e1e26aSAlexander Pyhalov fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
243*91e1e26aSAlexander Pyhalov #endif
244*91e1e26aSAlexander Pyhalov
245*91e1e26aSAlexander Pyhalov return(0);
246*91e1e26aSAlexander Pyhalov }
247*91e1e26aSAlexander Pyhalov
248*91e1e26aSAlexander Pyhalov
249*91e1e26aSAlexander Pyhalov /*
250*91e1e26aSAlexander Pyhalov * ISO/IEC 10646 (Unicode) --> IBM
251*91e1e26aSAlexander Pyhalov * Unicode --> UTF8 (FSS-UTF)
252*91e1e26aSAlexander Pyhalov * (File System Safe Universal Character Set Transformation Format)
253*91e1e26aSAlexander Pyhalov * Return: > 0 - converted with enough space in output buffer
254*91e1e26aSAlexander Pyhalov * = 0 - no space in outbuf
255*91e1e26aSAlexander Pyhalov */
utf8_to_ibm(unidx,ibm_code,buf,buflen)256*91e1e26aSAlexander Pyhalov int utf8_to_ibm(unidx, ibm_code, buf, buflen)
257*91e1e26aSAlexander Pyhalov int unidx;
258*91e1e26aSAlexander Pyhalov unsigned long ibm_code;
259*91e1e26aSAlexander Pyhalov char *buf;
260*91e1e26aSAlexander Pyhalov size_t buflen;
261*91e1e26aSAlexander Pyhalov
262*91e1e26aSAlexander Pyhalov {
263*91e1e26aSAlexander Pyhalov unsigned long val; /* IBM value */
264*91e1e26aSAlexander Pyhalov char c1, c2, ibm_str[3];
265*91e1e26aSAlexander Pyhalov
266*91e1e26aSAlexander Pyhalov if (unidx < 0) /* no match from UTF8 to IBM */
267*91e1e26aSAlexander Pyhalov ibm_code = (unsigned long)NON_ID_CHAR;
268*91e1e26aSAlexander Pyhalov
269*91e1e26aSAlexander Pyhalov {
270*91e1e26aSAlexander Pyhalov val = ibm_code & 0xffff;
271*91e1e26aSAlexander Pyhalov c1 = (char) ((val & 0xff00) >> 8);
272*91e1e26aSAlexander Pyhalov c2 = (char) (val & 0xff);
273*91e1e26aSAlexander Pyhalov }
274*91e1e26aSAlexander Pyhalov
275*91e1e26aSAlexander Pyhalov *buf = ibm_str[0] = c1;
276*91e1e26aSAlexander Pyhalov *(buf+1) = ibm_str[1] = c2;
277*91e1e26aSAlexander Pyhalov ibm_str[2] = NULL;
278*91e1e26aSAlexander Pyhalov
279*91e1e26aSAlexander Pyhalov #ifdef DEBUG
280*91e1e26aSAlexander Pyhalov fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
281*91e1e26aSAlexander Pyhalov #endif
282*91e1e26aSAlexander Pyhalov
283*91e1e26aSAlexander Pyhalov
284*91e1e26aSAlexander Pyhalov if (buflen < 2) {
285*91e1e26aSAlexander Pyhalov errno = E2BIG;
286*91e1e26aSAlexander Pyhalov return(0);
287*91e1e26aSAlexander Pyhalov }
288*91e1e26aSAlexander Pyhalov
289*91e1e26aSAlexander Pyhalov return(2);
290*91e1e26aSAlexander Pyhalov }
291