xref: /titanic_50/usr/src/lib/iconv_modules/zh/common/UTF-8%HZ-GB-2312.c (revision 880d797826457b77414b37d531cc3e1aa166ecbe)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright(c) 1998 Sun Microsystems, Inc.
 * All rights reserved.
 */
25 
26 #include <stdio.h>
27 #include <errno.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include "unicode_gb2312.h"
31 #include "common_defs.h"
32 
#define	MSB		0x80	/* high bit: set on every non-ASCII UTF-8 byte */
#define	NON_ID_CHAR	'?'	/* written (twice) for characters with no mapping */
35 
/*
 * Per-descriptor conversion state, allocated by _icv_open().
 */
typedef struct _icv_state {
	/* Position inside the UTF-8 sequence being read (enum _USTATE). */
	short	_ustate;
	/*
	 * State recorded by the previous step.  _icv_iconv() compares it
	 * with U0 to decide whether a "~{" or "~}" shift sequence has to be
	 * written before the next character.
	 */
	short	saved_ustate;
	/* Leading bytes of a UTF-8 sequence that is not complete yet. */
	char	_cbuf[3];
} _iconv_st;
41 
/*
 * UTF-8 reader states:
 *   U0 - between characters
 *   U1 - lead byte of a two-byte sequence has been read
 *   U2 - lead byte of a three-byte sequence has been read
 *   U3 - first two bytes of a three-byte sequence have been read
 *   U4 - lead byte of a four-byte sequence has been read
 *   U5 - first two bytes of a four-byte sequence have been read
 *   U6 - first three bytes of a four-byte sequence have been read
 */
enum	_USTATE	{ U0, U1, U2, U3, U4, U5, U6 };
43 
/*
 * Convert the 16-bit code point (in_byte1 << 8 | in_byte2) to two 7-bit
 * HZ bytes stored in buf.  Returns 2 on success, 0 when buflen is smaller
 * than 2, and -1 for the code points 0xFFFE and 0xFFFF.
 */
int unicode_to_gb_to_hz(char in_byte1, char in_byte2, char *buf, int buflen);
45 
46 /*
47  * Open; called from iconv_open()
48  */
49 void *
_icv_open()50 _icv_open()
51 {
52 	_iconv_st *st;
53 
54 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
55 		errno = ENOMEM;
56 		return ((void *) -1);
57 	}
58 
59 	st->_ustate = U0;
60 	st->saved_ustate = U0;
61 	return ((void *)st);
62 }
63 
64 
65 /*
66  * Close; called from iconv_close()
67  */
68 void
_icv_close(_iconv_st * st)69 _icv_close(_iconv_st *st)
70 {
71 	if (st == NULL)
72 		errno = EBADF;
73 	else
74 		free(st);
75 }
76 
77 
78 /*
79  * Actual conversion; called from iconv()
80  */
81 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)82 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
83 			char **outbuf, size_t*outbytesleft)
84 {
85 	char	c1, c2;
86 	int	n;
87 
88 	if (st == NULL) {
89 		errno = EBADF;
90 		return ((size_t)-1);
91 	}
92 
93 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
94 		st->_ustate = U0;
95 		return ((size_t)0);
96 	}
97 
98 	errno = 0;
99 	while (*inbytesleft > 0 && *outbytesleft > 0) {
100 
101 	    uchar_t  first_byte;
102 
103 	    switch (st->_ustate) {
104 	    case U0:
105 		if (**inbuf & MSB && st->saved_ustate ==U0) {
106 			if(*outbytesleft >=2) {
107 			**outbuf = '~';
108 			*(*outbuf+1) = '{';
109 			(*outbuf) += 2, (*outbytesleft) -= 2;
110 			} else {
111                         errno = E2BIG;
112                         return (size_t)-1;
113 			}
114 		}
115 		if ((**inbuf & MSB) == 0) {	/* ASCII */
116 		    if (st->saved_ustate == U1 || st->saved_ustate == U3)
117 		    {
118 		    if(*outbytesleft >=2) {
119                         **outbuf = '~';
120                         *(*outbuf+1) = '}';
121                         (*outbuf) += 2, (*outbytesleft) -= 2;
122                         }else {
123                             errno = E2BIG;
124                             return (size_t)-1;
125                         }
126                     }
127 		    st->saved_ustate = U0;
128 		    if(*outbytesleft >=1) {
129 		    **outbuf = **inbuf;
130 		    (*outbuf)++; (*outbytesleft)--;
131                     }else {
132                         errno = E2BIG;
133                         return (size_t)-1;
134                     }
135 		    if (**inbuf == '~') {
136 		        if(*outbytesleft >=1) {
137                         **outbuf = '~';
138                         (*outbuf)++, (*outbytesleft)--;
139                         }else {
140                             errno = E2BIG;
141                             return (size_t)-1;
142                         }
143                     }
144 	        } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xbf */
145 
146 		    /* invalid sequence if the first char is either 0xc0 or 0xc1 */
147 		    if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
148 		         errno = EILSEQ;
149 		    else {
150 		         st->_ustate = U1;
151 		         st->_cbuf[0] = **inbuf;
152 		    }
153 		} else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */
154 		    st->_ustate = U2;
155 		    st->_cbuf[0] = **inbuf;
156 		} else {
157 		    /* four bytes of UTF-8 sequences */
158 		    if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
159 		         errno = EILSEQ;
160 		    else
161 		     {
162 			st->_ustate = U4;
163 			st->_cbuf[0] = **inbuf;
164 		     }
165 		}
166 		break;
167 	    case U1:
168 		if ((**inbuf & 0xc0) == MSB) {	/* Two-byte UTF */
169 		    c1 = (st->_cbuf[0]&0x1c)>>2;
170 		    c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f);
171 		    n = unicode_to_gb_to_hz(c1, c2, *outbuf, *outbytesleft);
172 		    if (n > 0) {
173 			(*outbuf) += n, (*outbytesleft) -= n;
174 		    } else {
175 			errno = E2BIG;
176 			return ((size_t) -1);
177 		    }
178 		    st->saved_ustate = U1;
179 		    st->_ustate = U0;
180 		} else {
181 		    errno = EILSEQ;
182 		}
183 		break;
184 	    case U2:
185 		st->saved_ustate = U2;
186 
187 	        first_byte = st->_cbuf[0];
188 
189 	        /* if the first byte is 0xed, it is illegal sequence if the second
190 		 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
191 		 */
192 	        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
193 		    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
194 		    errno = EILSEQ;
195 	        else {
196 		    st->_ustate = U3;
197 		    st->_cbuf[1] = **inbuf;
198 		}
199 		break;
200 	    case U3:
201 		if ((**inbuf & 0xc0) == MSB) {	/* Three-byte UTF */
202 		    c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2);
203 		    c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f);
204 		    n = unicode_to_gb_to_hz(c1, c2, *outbuf, *outbytesleft);
205 		    if (n > 0) {
206 			(*outbuf) += n, (*outbytesleft) -= n;
207 		    } else if ( n == -1 ) { /* unicode is either 0xFFFE or 0xFFFF */
208 		        errno = EILSEQ;
209 		    } else {
210 			errno = E2BIG;
211 			return ((size_t)-1);
212 		    }
213 		    st->saved_ustate = U3;
214 		    st->_ustate = U0;
215 		} else {
216 		    errno = EILSEQ;
217 		    break;
218 		}
219 		break;
220 	    case U4:
221 
222 	       first_byte = st->_cbuf[0];
223 
224 	       /* if the first byte is 0xf0, it is illegal sequence if
225 		* the second one is between 0x80 and 0x8f
226 		* for Four-Byte UTF: U+10000..U+10FFFF
227 		*/
228 	       if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
229 		   ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
230 		    errno = EILSEQ;
231 	       else
232 		 {
233 		    st->_ustate = U5;
234 		    st->_cbuf[1] = **inbuf;
235 		    st->saved_ustate = U4;
236 		 }
237 	       break;
238 	    case U5:
239 	       if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
240 		 {
241 		    st->_ustate = U6;
242 		    st->_cbuf[2] = **inbuf;
243 		    st->saved_ustate = U5;
244 		 }
245 	       else
246 		 errno = EILSEQ;
247 	       break;
248 	    case U6:
249 	       if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
250 		 {
251 		    /* replace with double NON_ID_CHARs */
252 		    if ( *outbytesleft < 2 )
253 		       errno = E2BIG;
254 		    else
255 		      {
256 			 **outbuf = NON_ID_CHAR;
257 			 *(*outbuf+1) = NON_ID_CHAR;
258 			 (*outbytesleft) -= 2;
259 
260 			 st->_ustate = U0;
261 			 st->saved_ustate = U6;
262 		      }
263 		 }
264 	       else
265 		 errno = EILSEQ;
266 	       break;
267 	    }
268 
269 	    if (errno)
270 			return ((size_t)-1);
271 	    (*inbuf)++; (*inbytesleft)--;
272 	}
273 
274         if (*inbytesleft == 0 && st->_ustate != U0)
275           {
276 	     errno = EINVAL;
277 	     return ((size_t) -1);
278           }
279 
280 	if (*inbytesleft > 0 && *outbytesleft == 0) {
281 		errno = E2BIG;
282 		return ((size_t)-1);
283 	}
284 	return ((size_t)(*inbytesleft));
285 }
286 
287 /* return value: 0 - no enough space to hold the HZ-GB-2312 code
288  *              -1 - illegal sequence
289  *              >0 - buffer length
290  */
unicode_to_gb_to_hz(in_byte1,in_byte2,buf,buflen)291 int unicode_to_gb_to_hz(in_byte1, in_byte2, buf, buflen)
292 char	in_byte1, in_byte2;
293 char	*buf;
294 int	buflen;
295 {
296 	int	gb, unicode;
297 	int	i, l, h;
298 
299 	if (buflen < 2)
300 		return 0;
301 	unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff);
302 
303         /* 0xfffe and 0xffff should not be allowed */
304         if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
305 
306 	for (l = 0, h = UNICODEMAX; l < h; ) {
307 		if (unicode_gb_tab[l].key == unicode) {
308 			i = l;
309 			break;
310 		}
311 		if (unicode_gb_tab[h].key == unicode) {
312 			i = h;
313 			break;
314 		}
315 		i = (l + h) / 2;
316 		if (unicode_gb_tab[i].key == unicode)
317 			break;
318 		if (unicode_gb_tab[i].key < unicode)
319 			l = i + 1;
320 		else	h = i - 1;
321 	}
322 	if (unicode == unicode_gb_tab[i].key) {
323 		gb = unicode_gb_tab[i].value;
324 		*buf = ((gb & 0xff00) >> 8) & 0x7f;
325 		*(buf+1) = (gb & 0xff) & 0x7f;
326 	} else {
327 		*buf = NON_ID_CHAR;
328 		*(buf+1) = NON_ID_CHAR;
329 	}
330 	return 2;
331 }
332