/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1997, by Sun Microsystems, Inc.
 * All rights reserved.
 */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>

#include "tab_lookup.h"	/* table lookup data types */

#define	MSB	0x80	/* most significant bit */
#define	ONEBYTE	0xff	/* right most byte */

/*
 * States of the UTF-8 decoder in _icv_iconv():
 *   U0  - waiting for the first byte of a character
 *   U1  - lead byte of a 2-byte sequence seen, waiting for its 2nd byte
 *   U11 - unrecognised lead byte seen, waiting for one more raw byte
 *   U2  - lead byte of a 3-byte sequence seen, waiting for its 2nd byte
 *   U3  - waiting for the 3rd byte of a 3-byte sequence
 *   U4  - a complete character is buffered and ready to be converted
 */
enum _USTATE { U0, U1, U11, U2, U3, U4 };

40*880d7978SAlexander Pyhalov int get_ibm_by_utf(_icv_state *st, char c1, char c2, int *unidx,
41*880d7978SAlexander Pyhalov unsigned long *ibm_code);
42*880d7978SAlexander Pyhalov
43*880d7978SAlexander Pyhalov int bisearch(unsigned long val, _icv_state *st, int n);
44*880d7978SAlexander Pyhalov
45*880d7978SAlexander Pyhalov int utf8_to_ibm(int unidx, unsigned long ibm_code, char *buf,
46*880d7978SAlexander Pyhalov size_t buflen, _icv_state *st);
47*880d7978SAlexander Pyhalov
48*880d7978SAlexander Pyhalov /*
49*880d7978SAlexander Pyhalov * Actual conversion; called from iconv()
50*880d7978SAlexander Pyhalov * Input is UTF-8 data.
51*880d7978SAlexander Pyhalov * first convert to UCS2
52*880d7978SAlexander Pyhalov */
53*880d7978SAlexander Pyhalov size_t
_icv_iconv(_icv_state * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)54*880d7978SAlexander Pyhalov _icv_iconv(_icv_state *st, char **inbuf, size_t *inbytesleft,
55*880d7978SAlexander Pyhalov char **outbuf, size_t *outbytesleft)
56*880d7978SAlexander Pyhalov {
57*880d7978SAlexander Pyhalov /*
58*880d7978SAlexander Pyhalov * Actual conversion; called from iconv()
59*880d7978SAlexander Pyhalov */
60*880d7978SAlexander Pyhalov /*=========================================================
61*880d7978SAlexander Pyhalov *
62*880d7978SAlexander Pyhalov * State Machine for interpreting UTF8 code
63*880d7978SAlexander Pyhalov *
64*880d7978SAlexander Pyhalov *=========================================================
65*880d7978SAlexander Pyhalov *
66*880d7978SAlexander Pyhalov * 3 byte unicode
67*880d7978SAlexander Pyhalov * +----->------->-------+
68*880d7978SAlexander Pyhalov * | |
69*880d7978SAlexander Pyhalov * ^ v
70*880d7978SAlexander Pyhalov * | 2 byte U2 ---> U3
71*880d7978SAlexander Pyhalov * | unicode v
72*880d7978SAlexander Pyhalov * +------> U0 -------> U1 +-------->U4---+
73*880d7978SAlexander Pyhalov * ^ ascii | | ^ |
74*880d7978SAlexander Pyhalov * | | +-------->--------->--------+ |
75*880d7978SAlexander Pyhalov * | v v
76*880d7978SAlexander Pyhalov * +----<---+-----<------------<------------<------------+
77*880d7978SAlexander Pyhalov *
78*880d7978SAlexander Pyhalov * +----<---+-----<------------<------------<------------+
79*880d7978SAlexander Pyhalov *
80*880d7978SAlexander Pyhalov *=========================================================*/
81*880d7978SAlexander Pyhalov
82*880d7978SAlexander Pyhalov char c1 = '\0', c2 = '\0';
83*880d7978SAlexander Pyhalov int n, unidx;
84*880d7978SAlexander Pyhalov unsigned long ibm_code;
85*880d7978SAlexander Pyhalov
86*880d7978SAlexander Pyhalov #ifdef DEBUG
87*880d7978SAlexander Pyhalov fprintf(stderr, "========== iconv(): UTF8 --> IBM ==========\n");
88*880d7978SAlexander Pyhalov #endif
89*880d7978SAlexander Pyhalov
90*880d7978SAlexander Pyhalov if (st == NULL) {
91*880d7978SAlexander Pyhalov errno = EBADF;
92*880d7978SAlexander Pyhalov return ((size_t) -1);
93*880d7978SAlexander Pyhalov }
94*880d7978SAlexander Pyhalov
95*880d7978SAlexander Pyhalov if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
96*880d7978SAlexander Pyhalov st->ustate = U0;
97*880d7978SAlexander Pyhalov st->_errno = 0;
98*880d7978SAlexander Pyhalov st->shift = SHIFT_IN;
99*880d7978SAlexander Pyhalov return ((size_t) 0);
100*880d7978SAlexander Pyhalov }
101*880d7978SAlexander Pyhalov
102*880d7978SAlexander Pyhalov st->_errno = 0; /* reset internal errno */
103*880d7978SAlexander Pyhalov errno = 0; /* reset external errno */
104*880d7978SAlexander Pyhalov
105*880d7978SAlexander Pyhalov /* a state machine for interpreting UTF8 code */
106*880d7978SAlexander Pyhalov while (*inbytesleft > 0 && *outbytesleft > 0) {
107*880d7978SAlexander Pyhalov switch (st->ustate) {
108*880d7978SAlexander Pyhalov case U0:
109*880d7978SAlexander Pyhalov /* it is ascii, convert it immediately */
110*880d7978SAlexander Pyhalov if ((**inbuf & MSB) == 0) { /* ASCII */
111*880d7978SAlexander Pyhalov st->ustate = U4;
112*880d7978SAlexander Pyhalov st->keepc[0] = **inbuf;
113*880d7978SAlexander Pyhalov c1 = 0x0;
114*880d7978SAlexander Pyhalov c2 = **inbuf;
115*880d7978SAlexander Pyhalov continue;
116*880d7978SAlexander Pyhalov } else { /* Chinese character */
117*880d7978SAlexander Pyhalov if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode */
118*880d7978SAlexander Pyhalov st->ustate = U1;
119*880d7978SAlexander Pyhalov st->keepc[0] = **inbuf;
120*880d7978SAlexander Pyhalov } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte */
121*880d7978SAlexander Pyhalov st->ustate = U2;
122*880d7978SAlexander Pyhalov st->keepc[0] = **inbuf;
123*880d7978SAlexander Pyhalov } else { /* illegal unicode */
124*880d7978SAlexander Pyhalov /* st->_errno = errno = EINVAL; */
125*880d7978SAlexander Pyhalov /* possible UNICODE ko_KR-UTF8 */
126*880d7978SAlexander Pyhalov c1 =st->keepc[0] = **inbuf;
127*880d7978SAlexander Pyhalov st->ustate = U11;
128*880d7978SAlexander Pyhalov break;
129*880d7978SAlexander Pyhalov }
130*880d7978SAlexander Pyhalov }
131*880d7978SAlexander Pyhalov break;
132*880d7978SAlexander Pyhalov case U1: /* 2 byte unicode */
133*880d7978SAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
134*880d7978SAlexander Pyhalov st->ustate = U4;
135*880d7978SAlexander Pyhalov st->keepc[1] = **inbuf;
136*880d7978SAlexander Pyhalov c1 = (st->keepc[0]&0x1c)>>2;
137*880d7978SAlexander Pyhalov c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
138*880d7978SAlexander Pyhalov #ifdef DEBUG
139*880d7978SAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x --> ",
140*880d7978SAlexander Pyhalov st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
141*880d7978SAlexander Pyhalov #endif
142*880d7978SAlexander Pyhalov continue; /* should not advance *inbuf */
143*880d7978SAlexander Pyhalov } else {
144*880d7978SAlexander Pyhalov st->_errno = errno = EINVAL;
145*880d7978SAlexander Pyhalov }
146*880d7978SAlexander Pyhalov break;
147*880d7978SAlexander Pyhalov case U11: /* 3 byte unicode - 2nd byte */
148*880d7978SAlexander Pyhalov c2 =st->keepc[1] = **inbuf;
149*880d7978SAlexander Pyhalov st->ustate = U4;
150*880d7978SAlexander Pyhalov continue;
151*880d7978SAlexander Pyhalov break;
152*880d7978SAlexander Pyhalov case U2: /* 3 byte unicode - 2nd byte */
153*880d7978SAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
154*880d7978SAlexander Pyhalov st->ustate = U3;
155*880d7978SAlexander Pyhalov st->keepc[1] = **inbuf;
156*880d7978SAlexander Pyhalov } else {
157*880d7978SAlexander Pyhalov st->_errno = errno = EINVAL;
158*880d7978SAlexander Pyhalov }
159*880d7978SAlexander Pyhalov break;
160*880d7978SAlexander Pyhalov case U3: /* 3 byte unicode - 3rd byte */
161*880d7978SAlexander Pyhalov if ((**inbuf & 0xc0) == MSB) {
162*880d7978SAlexander Pyhalov st->ustate = U4;
163*880d7978SAlexander Pyhalov st->keepc[2] = **inbuf;
164*880d7978SAlexander Pyhalov c1 = ((st->keepc[0]&0x0f)<<4) |
165*880d7978SAlexander Pyhalov ((st->keepc[1]&0x3c)>>2);
166*880d7978SAlexander Pyhalov c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
167*880d7978SAlexander Pyhalov #ifdef DEBUG
168*880d7978SAlexander Pyhalov fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
169*880d7978SAlexander Pyhalov st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
170*880d7978SAlexander Pyhalov #endif
171*880d7978SAlexander Pyhalov continue; /* should not advance *inbuf */
172*880d7978SAlexander Pyhalov } else {
173*880d7978SAlexander Pyhalov st->_errno = errno = EINVAL;
174*880d7978SAlexander Pyhalov }
175*880d7978SAlexander Pyhalov break;
176*880d7978SAlexander Pyhalov case U4:
177*880d7978SAlexander Pyhalov n = get_ibm_by_utf(st, c1, c2, &unidx, &ibm_code);
178*880d7978SAlexander Pyhalov if (n != 0) { /* legal unicode;illegal Big5 */
179*880d7978SAlexander Pyhalov st->_errno = errno = EILSEQ;
180*880d7978SAlexander Pyhalov break;
181*880d7978SAlexander Pyhalov }
182*880d7978SAlexander Pyhalov
183*880d7978SAlexander Pyhalov n = utf8_to_ibm(unidx, ibm_code,
184*880d7978SAlexander Pyhalov *outbuf, *outbytesleft, st);
185*880d7978SAlexander Pyhalov if (n > 0) {
186*880d7978SAlexander Pyhalov (*outbuf) += n;
187*880d7978SAlexander Pyhalov (*outbytesleft) -= n;
188*880d7978SAlexander Pyhalov } else {
189*880d7978SAlexander Pyhalov st->_errno = errno;
190*880d7978SAlexander Pyhalov return((size_t)-1);
191*880d7978SAlexander Pyhalov }
192*880d7978SAlexander Pyhalov st->ustate = U0;
193*880d7978SAlexander Pyhalov st->_errno = 0;
194*880d7978SAlexander Pyhalov break;
195*880d7978SAlexander Pyhalov default: /* should never come here */
196*880d7978SAlexander Pyhalov st->_errno = errno = EILSEQ;
197*880d7978SAlexander Pyhalov st->ustate = U0; /* reset state */
198*880d7978SAlexander Pyhalov break;
199*880d7978SAlexander Pyhalov }
200*880d7978SAlexander Pyhalov
201*880d7978SAlexander Pyhalov (*inbuf)++;
202*880d7978SAlexander Pyhalov (*inbytesleft)--;
203*880d7978SAlexander Pyhalov
204*880d7978SAlexander Pyhalov if (st->_errno) {
205*880d7978SAlexander Pyhalov #ifdef DEBUG
206*880d7978SAlexander Pyhalov fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
207*880d7978SAlexander Pyhalov st->_errno, st->ustate);
208*880d7978SAlexander Pyhalov #endif
209*880d7978SAlexander Pyhalov break;
210*880d7978SAlexander Pyhalov }
211*880d7978SAlexander Pyhalov
212*880d7978SAlexander Pyhalov if (errno)
213*880d7978SAlexander Pyhalov return((size_t)-1);
214*880d7978SAlexander Pyhalov }
215*880d7978SAlexander Pyhalov
216*880d7978SAlexander Pyhalov if (*outbytesleft == 0) {
217*880d7978SAlexander Pyhalov errno = E2BIG;
218*880d7978SAlexander Pyhalov return((size_t)-1);
219*880d7978SAlexander Pyhalov }
220*880d7978SAlexander Pyhalov return (*inbytesleft);
221*880d7978SAlexander Pyhalov }
222*880d7978SAlexander Pyhalov
223*880d7978SAlexander Pyhalov
224*880d7978SAlexander Pyhalov /*
225*880d7978SAlexander Pyhalov * Match IBM code by UTF8 code;
226*880d7978SAlexander Pyhalov * Return: = 0 - match from Unicode to IBM found
227*880d7978SAlexander Pyhalov * = 1 - match from Unicode to IBM NOT found
228*880d7978SAlexander Pyhalov *
229*880d7978SAlexander Pyhalov * Since binary search of the UTF8 to IBM table is necessary, might as well
230*880d7978SAlexander Pyhalov * return index and IBM code matching to the unicode.
231*880d7978SAlexander Pyhalov */
get_ibm_by_utf(st,c1,c2,unidx,ibm_code)232*880d7978SAlexander Pyhalov int get_ibm_by_utf(st, c1, c2, unidx, ibm_code)
233*880d7978SAlexander Pyhalov _icv_state *st;
234*880d7978SAlexander Pyhalov char c1, c2;
235*880d7978SAlexander Pyhalov int *unidx;
236*880d7978SAlexander Pyhalov unsigned long *ibm_code;
237*880d7978SAlexander Pyhalov {
238*880d7978SAlexander Pyhalov unsigned long unicode;
239*880d7978SAlexander Pyhalov
240*880d7978SAlexander Pyhalov unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
241*880d7978SAlexander Pyhalov *unidx = bisearch(unicode, st, st->table_size);
242*880d7978SAlexander Pyhalov if ((*unidx) >= 0)
243*880d7978SAlexander Pyhalov {
244*880d7978SAlexander Pyhalov if ( st->left_to_right )
245*880d7978SAlexander Pyhalov *ibm_code = st->table[*unidx].right_code;
246*880d7978SAlexander Pyhalov else
247*880d7978SAlexander Pyhalov *ibm_code = st->table[*unidx].left_code;
248*880d7978SAlexander Pyhalov }
249*880d7978SAlexander Pyhalov #ifdef DEBUG
250*880d7978SAlexander Pyhalov fprintf(stderr, "Unicode=%04x, idx=%5d, IBM=%x ", unicode, *unidx, *ibm_code);
251*880d7978SAlexander Pyhalov #endif
252*880d7978SAlexander Pyhalov
253*880d7978SAlexander Pyhalov return(0);
254*880d7978SAlexander Pyhalov }
255*880d7978SAlexander Pyhalov
256*880d7978SAlexander Pyhalov
257*880d7978SAlexander Pyhalov /*
258*880d7978SAlexander Pyhalov * ISO/IEC 10646 (Unicode) --> IBM
259*880d7978SAlexander Pyhalov * Unicode --> UTF8 (FSS-UTF)
260*880d7978SAlexander Pyhalov * (File System Safe Universal Character Set Transformation Format)
261*880d7978SAlexander Pyhalov * Return: > 0 - converted with enough space in output buffer
262*880d7978SAlexander Pyhalov * = 0 - no space in outbuf
263*880d7978SAlexander Pyhalov */
utf8_to_ibm(unidx,ibm_code,buf,buflen,st)264*880d7978SAlexander Pyhalov int utf8_to_ibm(unidx, ibm_code, buf, buflen, st)
265*880d7978SAlexander Pyhalov int unidx;
266*880d7978SAlexander Pyhalov unsigned long ibm_code;
267*880d7978SAlexander Pyhalov char *buf;
268*880d7978SAlexander Pyhalov size_t buflen;
269*880d7978SAlexander Pyhalov _icv_state *st;
270*880d7978SAlexander Pyhalov
271*880d7978SAlexander Pyhalov {
272*880d7978SAlexander Pyhalov unsigned long val; /* IBM value */
273*880d7978SAlexander Pyhalov char c1, c2, ibm_str[3];
274*880d7978SAlexander Pyhalov
275*880d7978SAlexander Pyhalov if (unidx < 0) /* no match from UTF8 to IBM */
276*880d7978SAlexander Pyhalov ibm_code = (unsigned long)NON_ID_CHAR;
277*880d7978SAlexander Pyhalov
278*880d7978SAlexander Pyhalov {
279*880d7978SAlexander Pyhalov val = ibm_code & 0xffff;
280*880d7978SAlexander Pyhalov c1 = (char) ((val & 0xff00) >> 8);
281*880d7978SAlexander Pyhalov c2 = (char) (val & 0xff);
282*880d7978SAlexander Pyhalov }
283*880d7978SAlexander Pyhalov
284*880d7978SAlexander Pyhalov /* it is single byte ascii */
285*880d7978SAlexander Pyhalov if ( c1 == 0x0 ) {
286*880d7978SAlexander Pyhalov if ( st->shift == SHIFT_OUT ) {
287*880d7978SAlexander Pyhalov if (buflen < 2) {
288*880d7978SAlexander Pyhalov errno = E2BIG;
289*880d7978SAlexander Pyhalov return 0;
290*880d7978SAlexander Pyhalov }
291*880d7978SAlexander Pyhalov *buf = SHIFT_IN;
292*880d7978SAlexander Pyhalov *(buf+1) = c2;
293*880d7978SAlexander Pyhalov st->shift = SHIFT_IN;
294*880d7978SAlexander Pyhalov return 2;
295*880d7978SAlexander Pyhalov }
296*880d7978SAlexander Pyhalov if (buflen < 1) {
297*880d7978SAlexander Pyhalov errno = E2BIG;
298*880d7978SAlexander Pyhalov return 0;
299*880d7978SAlexander Pyhalov }
300*880d7978SAlexander Pyhalov *buf = c2;
301*880d7978SAlexander Pyhalov return 1;
302*880d7978SAlexander Pyhalov }
303*880d7978SAlexander Pyhalov
304*880d7978SAlexander Pyhalov /* it is the first two bytes character */
305*880d7978SAlexander Pyhalov if ( st->shift == SHIFT_IN ) {
306*880d7978SAlexander Pyhalov if (buflen < 3) {
307*880d7978SAlexander Pyhalov errno = E2BIG;
308*880d7978SAlexander Pyhalov return 0;
309*880d7978SAlexander Pyhalov }
310*880d7978SAlexander Pyhalov *buf = SHIFT_OUT;
311*880d7978SAlexander Pyhalov st->shift = SHIFT_OUT;
312*880d7978SAlexander Pyhalov *(buf+1) = c1;
313*880d7978SAlexander Pyhalov *(buf+2) = c2;
314*880d7978SAlexander Pyhalov return 3;
315*880d7978SAlexander Pyhalov }
316*880d7978SAlexander Pyhalov
317*880d7978SAlexander Pyhalov *buf = ibm_str[0] = c1;
318*880d7978SAlexander Pyhalov *(buf+1) = ibm_str[1] = c2;
319*880d7978SAlexander Pyhalov ibm_str[2] = NULL;
320*880d7978SAlexander Pyhalov
321*880d7978SAlexander Pyhalov #ifdef DEBUG
322*880d7978SAlexander Pyhalov fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
323*880d7978SAlexander Pyhalov #endif
324*880d7978SAlexander Pyhalov
325*880d7978SAlexander Pyhalov
326*880d7978SAlexander Pyhalov if (buflen < 2) {
327*880d7978SAlexander Pyhalov errno = E2BIG;
328*880d7978SAlexander Pyhalov return(0);
329*880d7978SAlexander Pyhalov }
330*880d7978SAlexander Pyhalov
331*880d7978SAlexander Pyhalov return(2);
332*880d7978SAlexander Pyhalov }
333