xref: /illumos-gate/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_CN.iso2022-7.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1*16d86563SAlexander Pyhalov /*
2*16d86563SAlexander Pyhalov  * CDDL HEADER START
3*16d86563SAlexander Pyhalov  *
4*16d86563SAlexander Pyhalov  * The contents of this file are subject to the terms of the
5*16d86563SAlexander Pyhalov  * Common Development and Distribution License (the "License").
6*16d86563SAlexander Pyhalov  * You may not use this file except in compliance with the License.
7*16d86563SAlexander Pyhalov  *
8*16d86563SAlexander Pyhalov  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9*16d86563SAlexander Pyhalov  * or http://www.opensolaris.org/os/licensing.
10*16d86563SAlexander Pyhalov  * See the License for the specific language governing permissions
11*16d86563SAlexander Pyhalov  * and limitations under the License.
12*16d86563SAlexander Pyhalov  *
13*16d86563SAlexander Pyhalov  * When distributing Covered Code, include this CDDL HEADER in each
14*16d86563SAlexander Pyhalov  * file and include the License file at src/OPENSOLARIS.LICENSE.
15*16d86563SAlexander Pyhalov  * If applicable, add the following below this CDDL HEADER, with the
16*16d86563SAlexander Pyhalov  * fields enclosed by brackets "[]" replaced with your own identifying
17*16d86563SAlexander Pyhalov  * information: Portions Copyright [yyyy] [name of copyright owner]
18*16d86563SAlexander Pyhalov  *
19*16d86563SAlexander Pyhalov  * CDDL HEADER END
20*16d86563SAlexander Pyhalov  */
21*16d86563SAlexander Pyhalov /*
22*16d86563SAlexander Pyhalov  * Copyright(c) 1998 Sun Microsystems, Inc.
23*16d86563SAlexander Pyhalov  */
24*16d86563SAlexander Pyhalov 
25*16d86563SAlexander Pyhalov #include <stdio.h>
26*16d86563SAlexander Pyhalov #include <errno.h>
27*16d86563SAlexander Pyhalov #include <stdlib.h>
28*16d86563SAlexander Pyhalov #include <sys/types.h>
29*16d86563SAlexander Pyhalov #include <unicode_gb2312.h>
30*16d86563SAlexander Pyhalov #include "common_defs.h"
31*16d86563SAlexander Pyhalov 
/* Control bytes used in the 7-bit output stream. */
#define SI	0x0f	/* shift in */
#define SO	0x0e	/* shift out */
#define ESC	0x1b	/* first byte of the designation escape sequence */
#define MSB	0x80	/* high bit; clear for single-byte ASCII input */

/* Substitute written for characters that cannot be mapped. */
#define NON_ID_CHAR '?'

/*
 * Per-descriptor conversion state, allocated by _icv_open().
 */
typedef struct _icv_state {
	short	_ustate;	/* UTF-8 decoder state, one of enum _USTATE */
	short	_istate;	/* output shift state, one of enum _ISTATE */
	short	_gstate;	/* designation state, one of enum _GSTATE */
	char	_cbuf[3];	/* bytes of the UTF-8 character collected so far */
} _iconv_st;

/* UTF-8 decoder states: U0 is "between characters". */
enum	_USTATE	{
	U0,
	U1,
	U2,
	U3,
	U4,
	U5,
	U6
};

/* Shift state of the output: IN after SI, OUT after SO. */
enum	_ISTATE	{
	IN,
	OUT
};

/* G0 until the designation escape sequence has been written, then G1. */
enum	_GSTATE	{
	G0,
	G1
};

int unicode_to_iso(char in_byte1, char in_byte2, char *buf, int buflen);
51*16d86563SAlexander Pyhalov 
52*16d86563SAlexander Pyhalov /*
53*16d86563SAlexander Pyhalov  * Open; called from iconv_open()
54*16d86563SAlexander Pyhalov  */
55*16d86563SAlexander Pyhalov void *
_icv_open()56*16d86563SAlexander Pyhalov _icv_open()
57*16d86563SAlexander Pyhalov {
58*16d86563SAlexander Pyhalov 	_iconv_st *st;
59*16d86563SAlexander Pyhalov 
60*16d86563SAlexander Pyhalov 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
61*16d86563SAlexander Pyhalov 		errno = ENOMEM;
62*16d86563SAlexander Pyhalov 		return ((void *) -1);
63*16d86563SAlexander Pyhalov 	}
64*16d86563SAlexander Pyhalov 
65*16d86563SAlexander Pyhalov 	st->_ustate = U0;
66*16d86563SAlexander Pyhalov 	st->_istate = IN;
67*16d86563SAlexander Pyhalov 	st->_gstate = G0;
68*16d86563SAlexander Pyhalov 
69*16d86563SAlexander Pyhalov 	return ((void *)st);
70*16d86563SAlexander Pyhalov }
71*16d86563SAlexander Pyhalov 
72*16d86563SAlexander Pyhalov 
73*16d86563SAlexander Pyhalov /*
74*16d86563SAlexander Pyhalov  * Close; called from iconv_close()
75*16d86563SAlexander Pyhalov  */
76*16d86563SAlexander Pyhalov void
_icv_close(_iconv_st * st)77*16d86563SAlexander Pyhalov _icv_close(_iconv_st *st)
78*16d86563SAlexander Pyhalov {
79*16d86563SAlexander Pyhalov 	if (st == NULL)
80*16d86563SAlexander Pyhalov 		errno = EBADF;
81*16d86563SAlexander Pyhalov 	else
82*16d86563SAlexander Pyhalov 		free(st);
83*16d86563SAlexander Pyhalov }
84*16d86563SAlexander Pyhalov 
85*16d86563SAlexander Pyhalov 
86*16d86563SAlexander Pyhalov /*
87*16d86563SAlexander Pyhalov  * Actual conversion; called from iconv()
88*16d86563SAlexander Pyhalov  */
89*16d86563SAlexander Pyhalov size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)90*16d86563SAlexander Pyhalov _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
91*16d86563SAlexander Pyhalov 			char **outbuf, size_t*outbytesleft)
92*16d86563SAlexander Pyhalov {
93*16d86563SAlexander Pyhalov 	char	c1, c2;
94*16d86563SAlexander Pyhalov 	int	n;
95*16d86563SAlexander Pyhalov 
96*16d86563SAlexander Pyhalov 	if (st == NULL) {
97*16d86563SAlexander Pyhalov 		errno = EBADF;
98*16d86563SAlexander Pyhalov 		return ((size_t)-1);
99*16d86563SAlexander Pyhalov 	}
100*16d86563SAlexander Pyhalov 
101*16d86563SAlexander Pyhalov 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
102*16d86563SAlexander Pyhalov 		st->_ustate = U0;
103*16d86563SAlexander Pyhalov 		st->_istate = IN;
104*16d86563SAlexander Pyhalov 		st->_gstate = G0;
105*16d86563SAlexander Pyhalov 		return ((size_t)0);
106*16d86563SAlexander Pyhalov 	}
107*16d86563SAlexander Pyhalov 
108*16d86563SAlexander Pyhalov 	errno = 0;
109*16d86563SAlexander Pyhalov 
110*16d86563SAlexander Pyhalov 	while (*inbytesleft > 0 && *outbytesleft > 0) {
111*16d86563SAlexander Pyhalov 
112*16d86563SAlexander Pyhalov 	    uchar_t  first_byte;
113*16d86563SAlexander Pyhalov 
114*16d86563SAlexander Pyhalov 	    switch (st->_ustate) {
115*16d86563SAlexander Pyhalov 	    case U0:
116*16d86563SAlexander Pyhalov 		if ((**inbuf & MSB) == 0) {	/* ASCII */
117*16d86563SAlexander Pyhalov 		    if (st->_istate == OUT) {
118*16d86563SAlexander Pyhalov 			st->_istate = IN;
119*16d86563SAlexander Pyhalov 			**outbuf = SI;
120*16d86563SAlexander Pyhalov 			(*outbuf)++, (*outbytesleft)--;
121*16d86563SAlexander Pyhalov 			if (*outbytesleft <= 0) {
122*16d86563SAlexander Pyhalov 			    errno = E2BIG;
123*16d86563SAlexander Pyhalov 			    return ((size_t)-1);
124*16d86563SAlexander Pyhalov 			}
125*16d86563SAlexander Pyhalov 		    }
126*16d86563SAlexander Pyhalov 		    **outbuf = **inbuf;
127*16d86563SAlexander Pyhalov 		    (*outbuf)++, (*outbytesleft)--;
128*16d86563SAlexander Pyhalov 	        } else {
129*16d86563SAlexander Pyhalov 		    if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */
130*16d86563SAlexander Pyhalov 
131*16d86563SAlexander Pyhalov 		        /* invalid sequence if the first char is either 0xc0 or 0xc1 */
132*16d86563SAlexander Pyhalov 		        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
133*16d86563SAlexander Pyhalov 			 {
134*16d86563SAlexander Pyhalov 			    errno = EILSEQ;
135*16d86563SAlexander Pyhalov 		            break;
136*16d86563SAlexander Pyhalov 			 }
137*16d86563SAlexander Pyhalov 		        else
138*16d86563SAlexander Pyhalov 			 {
139*16d86563SAlexander Pyhalov 			     st->_ustate = U1;
140*16d86563SAlexander Pyhalov 			     st->_cbuf[0] = **inbuf;
141*16d86563SAlexander Pyhalov 			 }
142*16d86563SAlexander Pyhalov 		    } else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */
143*16d86563SAlexander Pyhalov 			st->_ustate = U2;
144*16d86563SAlexander Pyhalov 			st->_cbuf[0] = **inbuf;
145*16d86563SAlexander Pyhalov 		    } else {
146*16d86563SAlexander Pyhalov 		        /* four bytes of UTF-8 sequences */
147*16d86563SAlexander Pyhalov 		        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
148*16d86563SAlexander Pyhalov 			  {
149*16d86563SAlexander Pyhalov 			    errno = EILSEQ;
150*16d86563SAlexander Pyhalov 		            break;
151*16d86563SAlexander Pyhalov 		           }
152*16d86563SAlexander Pyhalov 		        else {
153*16d86563SAlexander Pyhalov 			   st->_ustate = U4;
154*16d86563SAlexander Pyhalov 			   st->_cbuf[0] = **inbuf;
155*16d86563SAlexander Pyhalov 			}
156*16d86563SAlexander Pyhalov 		    }
157*16d86563SAlexander Pyhalov 		    if (st->_istate == IN) {
158*16d86563SAlexander Pyhalov 			if (st->_gstate == G0) {
159*16d86563SAlexander Pyhalov 			    if (*outbytesleft < 4) {
160*16d86563SAlexander Pyhalov 				errno = E2BIG;
161*16d86563SAlexander Pyhalov 				return ((size_t)-1);
162*16d86563SAlexander Pyhalov 			    }
163*16d86563SAlexander Pyhalov 			    st->_gstate = G1;
164*16d86563SAlexander Pyhalov 			    **outbuf = ESC;
165*16d86563SAlexander Pyhalov 			    *(*outbuf+1) = '$';
166*16d86563SAlexander Pyhalov 			    *(*outbuf+2) = ')';
167*16d86563SAlexander Pyhalov 			    *(*outbuf+3) = 'A';
168*16d86563SAlexander Pyhalov 			    (*outbuf) += 4, (*outbytesleft) -= 4;
169*16d86563SAlexander Pyhalov 			    if (*outbytesleft <= 0) {
170*16d86563SAlexander Pyhalov 				errno = E2BIG;
171*16d86563SAlexander Pyhalov 				return ((size_t)-1);
172*16d86563SAlexander Pyhalov 			    }
173*16d86563SAlexander Pyhalov 			}
174*16d86563SAlexander Pyhalov 			st->_istate = OUT;
175*16d86563SAlexander Pyhalov 			**outbuf = SO;
176*16d86563SAlexander Pyhalov 			(*outbuf)++, (*outbytesleft)--;
177*16d86563SAlexander Pyhalov 		    }
178*16d86563SAlexander Pyhalov 		}
179*16d86563SAlexander Pyhalov 		break;
180*16d86563SAlexander Pyhalov 	    case U1:
181*16d86563SAlexander Pyhalov 		if ((**inbuf & 0xc0) == MSB) {	/* two-byte UTF */
182*16d86563SAlexander Pyhalov 		    c1 = (st->_cbuf[0]&0x1c)>>2;
183*16d86563SAlexander Pyhalov 		    c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f);
184*16d86563SAlexander Pyhalov 		    n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft);
185*16d86563SAlexander Pyhalov 		    if (n > 0) {
186*16d86563SAlexander Pyhalov 			(*outbuf) += n, (*outbytesleft) -= n;
187*16d86563SAlexander Pyhalov 		    } else {
188*16d86563SAlexander Pyhalov 			errno = E2BIG;
189*16d86563SAlexander Pyhalov 			return ((size_t)-1);
190*16d86563SAlexander Pyhalov 		    }
191*16d86563SAlexander Pyhalov 		    st->_ustate = U0;
192*16d86563SAlexander Pyhalov 		} else {
193*16d86563SAlexander Pyhalov 		    errno = EILSEQ;
194*16d86563SAlexander Pyhalov 		}
195*16d86563SAlexander Pyhalov 		break;
196*16d86563SAlexander Pyhalov 	    case U2:
197*16d86563SAlexander Pyhalov 
198*16d86563SAlexander Pyhalov 	        first_byte = st->_cbuf[0];
199*16d86563SAlexander Pyhalov 
200*16d86563SAlexander Pyhalov 	        /* if the first byte is 0xed, it is illegal sequence if the second
201*16d86563SAlexander Pyhalov 		 * one is one between 0xa0 and 0xbf because surrogate section is ill-formed
202*16d86563SAlexander Pyhalov 		 */
203*16d86563SAlexander Pyhalov 	        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
204*16d86563SAlexander Pyhalov 		    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
205*16d86563SAlexander Pyhalov 		     errno = EILSEQ;
206*16d86563SAlexander Pyhalov 	        else {
207*16d86563SAlexander Pyhalov 		     st->_ustate = U3;
208*16d86563SAlexander Pyhalov 		     st->_cbuf[1] = **inbuf;
209*16d86563SAlexander Pyhalov 		}
210*16d86563SAlexander Pyhalov 		break;
211*16d86563SAlexander Pyhalov 	    case U3:
212*16d86563SAlexander Pyhalov 		if ((**inbuf & 0xc0) == MSB) {	/* three-byte UTF */
213*16d86563SAlexander Pyhalov 		    c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2);
214*16d86563SAlexander Pyhalov 		    c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f);
215*16d86563SAlexander Pyhalov 		    n = unicode_to_iso(c1, c2, *outbuf, *outbytesleft);
216*16d86563SAlexander Pyhalov 		    if (n > 0) {
217*16d86563SAlexander Pyhalov 			(*outbuf) += n, (*outbytesleft) -= n;
218*16d86563SAlexander Pyhalov 		    } else if ( n == -1 ) {
219*16d86563SAlexander Pyhalov 		        errno = EILSEQ; /* unicode is either 0xfffe or 0xffff */
220*16d86563SAlexander Pyhalov 		    } else {
221*16d86563SAlexander Pyhalov 			errno = E2BIG;
222*16d86563SAlexander Pyhalov 			return ((size_t)-1);
223*16d86563SAlexander Pyhalov 		    }
224*16d86563SAlexander Pyhalov 		    st->_ustate = U0;
225*16d86563SAlexander Pyhalov 		} else {
226*16d86563SAlexander Pyhalov 		    errno = EILSEQ;
227*16d86563SAlexander Pyhalov 		}
228*16d86563SAlexander Pyhalov 		break;
229*16d86563SAlexander Pyhalov 	     case U4:
230*16d86563SAlexander Pyhalov 	        first_byte = st->_cbuf[0];
231*16d86563SAlexander Pyhalov 
232*16d86563SAlexander Pyhalov 	        /* if the first byte is 0xf0, it is illegal sequence if
233*16d86563SAlexander Pyhalov 		 * the second one is between 0x80 and 0x8f
234*16d86563SAlexander Pyhalov 		 * for Four-Byte UTF: U+10000..U+10FFFF
235*16d86563SAlexander Pyhalov 		 */
236*16d86563SAlexander Pyhalov 	        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
237*16d86563SAlexander Pyhalov 		    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
238*16d86563SAlexander Pyhalov 		     errno = EILSEQ;
239*16d86563SAlexander Pyhalov 		else {
240*16d86563SAlexander Pyhalov 		     st->_ustate = U5;
241*16d86563SAlexander Pyhalov 		     st->_cbuf[1] = **inbuf;
242*16d86563SAlexander Pyhalov 		}
243*16d86563SAlexander Pyhalov 	        break;
244*16d86563SAlexander Pyhalov 	     case U5:
245*16d86563SAlexander Pyhalov 		if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
246*16d86563SAlexander Pyhalov 		  {
247*16d86563SAlexander Pyhalov 		     st->_ustate = U6;
248*16d86563SAlexander Pyhalov 		     st->_cbuf[2] = **inbuf;
249*16d86563SAlexander Pyhalov 	          }
250*16d86563SAlexander Pyhalov 		else
251*16d86563SAlexander Pyhalov 		     errno = EILSEQ;
252*16d86563SAlexander Pyhalov 	        break;
253*16d86563SAlexander Pyhalov 	     case U6:
254*16d86563SAlexander Pyhalov 	        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
255*16d86563SAlexander Pyhalov 		   {
256*16d86563SAlexander Pyhalov 		      /* replace with double NON_ID_CHARs */
257*16d86563SAlexander Pyhalov 		      if ( *outbytesleft < 2 )
258*16d86563SAlexander Pyhalov 			 errno = E2BIG;
259*16d86563SAlexander Pyhalov 		      else
260*16d86563SAlexander Pyhalov 			{
261*16d86563SAlexander Pyhalov 			   **outbuf = NON_ID_CHAR;
262*16d86563SAlexander Pyhalov 			   *(*outbuf+1) = NON_ID_CHAR;
263*16d86563SAlexander Pyhalov 			   (*outbytesleft) -= 2;
264*16d86563SAlexander Pyhalov 
265*16d86563SAlexander Pyhalov 			   st->_ustate = U0;
266*16d86563SAlexander Pyhalov 			}
267*16d86563SAlexander Pyhalov 	           }
268*16d86563SAlexander Pyhalov 		else
269*16d86563SAlexander Pyhalov 		     errno = EILSEQ;
270*16d86563SAlexander Pyhalov 	        break;
271*16d86563SAlexander Pyhalov 	    }
272*16d86563SAlexander Pyhalov 
273*16d86563SAlexander Pyhalov 	    if (errno)
274*16d86563SAlexander Pyhalov 		return ((size_t)-1);
275*16d86563SAlexander Pyhalov 
276*16d86563SAlexander Pyhalov 	    (*inbuf)++; (*inbytesleft)--;
277*16d86563SAlexander Pyhalov 	}
278*16d86563SAlexander Pyhalov 
279*16d86563SAlexander Pyhalov 	if (*inbytesleft == 0 && st->_ustate != U0) {
280*16d86563SAlexander Pyhalov 	        errno = EINVAL;
281*16d86563SAlexander Pyhalov 	        return ((size_t) -1);
282*16d86563SAlexander Pyhalov 	}
283*16d86563SAlexander Pyhalov 
284*16d86563SAlexander Pyhalov 	if (*inbytesleft > 0 && *outbytesleft == 0) {
285*16d86563SAlexander Pyhalov 		errno = E2BIG;
286*16d86563SAlexander Pyhalov 		return ((size_t)-1);
287*16d86563SAlexander Pyhalov 	}
288*16d86563SAlexander Pyhalov 	return ((size_t)(*inbytesleft));
289*16d86563SAlexander Pyhalov }
290*16d86563SAlexander Pyhalov 
291*16d86563SAlexander Pyhalov 
unicode_to_iso(in_byte1,in_byte2,buf,buflen)292*16d86563SAlexander Pyhalov int unicode_to_iso(in_byte1, in_byte2, buf, buflen)
293*16d86563SAlexander Pyhalov char	in_byte1, in_byte2;
294*16d86563SAlexander Pyhalov char	*buf;
295*16d86563SAlexander Pyhalov int	buflen;
296*16d86563SAlexander Pyhalov {
297*16d86563SAlexander Pyhalov 	int	gb, unicode;
298*16d86563SAlexander Pyhalov 	int	i, l, h;
299*16d86563SAlexander Pyhalov 
300*16d86563SAlexander Pyhalov 	if (buflen < 2)
301*16d86563SAlexander Pyhalov 		return 0;
302*16d86563SAlexander Pyhalov 	unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff);
303*16d86563SAlexander Pyhalov 
304*16d86563SAlexander Pyhalov         /* 0xfffe and 0xffff should not be allowed */
305*16d86563SAlexander Pyhalov         if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
306*16d86563SAlexander Pyhalov 
307*16d86563SAlexander Pyhalov 	for (l = 0, h = UNICODEMAX; l < h; ) {
308*16d86563SAlexander Pyhalov 		if (unicode_gb_tab[l].key == unicode) {
309*16d86563SAlexander Pyhalov 			i = l;
310*16d86563SAlexander Pyhalov 			break;
311*16d86563SAlexander Pyhalov 		}
312*16d86563SAlexander Pyhalov 		if (unicode_gb_tab[h].key == unicode) {
313*16d86563SAlexander Pyhalov 			i = h;
314*16d86563SAlexander Pyhalov 			break;
315*16d86563SAlexander Pyhalov 		}
316*16d86563SAlexander Pyhalov 		i = (l + h) / 2;
317*16d86563SAlexander Pyhalov 		if (unicode_gb_tab[i].key == unicode)
318*16d86563SAlexander Pyhalov 			break;
319*16d86563SAlexander Pyhalov 		if (unicode_gb_tab[i].key < unicode)
320*16d86563SAlexander Pyhalov 			l = i + 1;
321*16d86563SAlexander Pyhalov 		else	h = i - 1;
322*16d86563SAlexander Pyhalov 	}
323*16d86563SAlexander Pyhalov 	if (unicode == unicode_gb_tab[i].key) {
324*16d86563SAlexander Pyhalov 		gb = unicode_gb_tab[i].value;
325*16d86563SAlexander Pyhalov 		*buf = (gb & 0xff00) >> 8;
326*16d86563SAlexander Pyhalov 		*(buf+1) = gb & 0xff;
327*16d86563SAlexander Pyhalov 	} else {
328*16d86563SAlexander Pyhalov 		*buf = *(buf+1) = NON_ID_CHAR;
329*16d86563SAlexander Pyhalov 	}
330*16d86563SAlexander Pyhalov 	return 2;
331*16d86563SAlexander Pyhalov }
332