xref: /illumos-gate/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_CN.euc.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright(c) 1998 Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 #include <stdio.h>
26 #include <errno.h>
27 #include <stdlib.h>
28 #include <sys/types.h>
29 #include <sys/isa_defs.h>
30 #include <unicode_gb2312.h>
31 #include "common_defs.h"
32 
/* High bit of a byte: clear for ASCII, set on both bytes of emitted EUC output. */
#define	MSB		0x80
/* Replacement byte written (twice) for characters that have no mapping. */
#define	NON_ID_CHAR	'?'
35 
36 typedef struct _icv_state {
37 	short	_ustate;
38 	char	_cbuf[3];
39         boolean little_endian;
40         boolean bom_written;
41 } _iconv_st;
42 
/*
 * Decoder states used by _icv_iconv().
 */
enum	_USTATE	{
	U0,	/* idle: waiting for the first byte of a character */
	U1,	/* first byte of a two-byte UTF-8 char (or of a UCS-2LE pair) seen */
	U2,	/* first byte of a three-byte UTF-8 char seen */
	U3,	/* first two bytes of a three-byte UTF-8 char seen */
	U4,	/* first byte of a four-byte UTF-8 char seen */
	U5,	/* first two bytes of a four-byte UTF-8 char seen */
	U6	/* first three bytes of a four-byte UTF-8 char seen */
};
44 
/*
 * Converts one UCS-2 code unit (high byte, low byte) into two output bytes;
 * see the definition at the end of this file for the return values.
 */
int unicode_to_gb(char in_byte1, char in_byte2, char *buf, int buflen,
    int *uconv_num);
46 
47 /*
48  * Open; called from iconv_open()
49  */
50 void *
_icv_open()51 _icv_open()
52 {
53 	_iconv_st *st;
54 
55 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
56 		errno = ENOMEM;
57 		return ((void *) -1);
58 	}
59 
60 	st->_ustate = U0;
61 	st->little_endian = false;
62 	st->bom_written = false;
63 #if defined(UCS_2LE)
64 	st->little_endian = true;
65 	st->bom_written = true;
66 #endif
67 	return ((void *)st);
68 }
69 
70 
71 /*
72  * Close; called from iconv_close()
73  */
74 void
_icv_close(_iconv_st * st)75 _icv_close(_iconv_st *st)
76 {
77 	if (st == NULL)
78 		errno = EBADF;
79 	else
80 		free(st);
81 }
82 
83 
84 /*
85  * Actual conversion; called from iconv()
86  */
87 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)88 _icv_iconv(_iconv_st *st, char **inbuf, size_t*inbytesleft,
89 			char **outbuf, size_t*outbytesleft)
90 {
91 	char	c1, c2;
92 	int	n;
93 	int     uconv_num = 0;
94 
95 	if (st == NULL) {
96 		errno = EBADF;
97 		return ((size_t)-1);
98 	}
99 
100 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
101 		st->_ustate = U0;
102 		return ((size_t)0);
103 	}
104 
105 	errno = 0;
106 
107 	while (*inbytesleft > 0 && *outbytesleft > 0) {
108 
109 	    uchar_t first_byte;
110 
111 	    switch (st->_ustate) {
112 	    case U0:
113 	        /*
114 		 * Code converion for UCS-2LE to support Samba
115 		 */
116 	        if (st->little_endian) {
117 		    st->_ustate = U1;
118 		    st->_cbuf[0] = **inbuf;
119 		}
120 		else if ((**inbuf & MSB) == 0) {	/* ASCII */
121 		    **outbuf = **inbuf;
122 		    (*outbuf)++; (*outbytesleft)--;
123 	        } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */
124 
125 		    /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
126 		    if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
127 			errno = EILSEQ;
128 		    else {
129 		        st->_ustate = U1;
130 		        st->_cbuf[0] = **inbuf;
131 		    }
132 		} else if ((**inbuf & 0xf0) == 0xe0) { /* 0xe0..0xef */
133 		    st->_ustate = U2;
134 		    st->_cbuf[0] = **inbuf;
135 		} else {
136 		    /* four bytes of UTF-8 sequence */
137 		    if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
138 		        errno = EILSEQ;
139 		    else
140 		     {
141 			st->_ustate = U4;
142 			st->_cbuf[0] = **inbuf;
143 		     }
144 		}
145 		break;
146 	    case U1:
147 		if ((**inbuf & 0xc0) == MSB || st->little_endian) {	/* Two-byte UTF */
148 		    int uconv_num_internal = 0;
149 
150 		    /*
151 		     * Code conversion for UCS-2LE to support Samba
152 		     */
153 		    if (st->little_endian) {
154 		        c1 = **inbuf;
155 			c2 = st->_cbuf[0];
156 
157 			/*
158 			 * It's ASCII
159 			 */
160 			if (c1 == 0 && (c2 & MSB) == 0) {
161 			  *(*outbuf)++ = c2;
162 			  (*outbytesleft) --;
163 			  st->_ustate = U0;
164 			  break;
165 			}
166 		    } else {
167 		        c1 = (st->_cbuf[0]&0x1c)>>2;
168 		        c2 = ((st->_cbuf[0]&0x03)<<6) | ((**inbuf)&0x3f);
169 		    }
170 		    n = unicode_to_gb(c1, c2, *outbuf, *outbytesleft, &uconv_num_internal);
171 		    if (n > 0) {
172 			(*outbuf) += n, (*outbytesleft) -= n;
173 
174 		        uconv_num += uconv_num_internal;
175 
176 		        st->_ustate = U0;
177 		    } else if (n == 0) {
178 			errno = E2BIG;
179 		    } else { /* n == -1 if unicode is either FFFE or 0xFFFF */
180 		        errno = EILSEQ;
181 		    }
182 		} else {
183 		    errno = EILSEQ;
184 		}
185 		break;
186 	    case U2:
187 
188 	        first_byte = st->_cbuf[0];
189 
190 	        /* if the first byte is 0xed, it is illegal sequence if the second
191 		 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
192 		 */
193 		if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
194 		     ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
195 		    errno = EILSEQ;
196 		else
197 		  {
198 		    st->_ustate = U3;
199 		    st->_cbuf[1] = **inbuf;
200 		  }
201 		break;
202 	    case U3:
203 		if ((**inbuf & 0xc0) == MSB) {	/* Three-byte UTF */
204 		    int uconv_num_internal = 0;
205 
206 		    c1 = ((st->_cbuf[0]&0x0f)<<4) | ((st->_cbuf[1]&0x3c)>>2);
207 		    c2 = ((st->_cbuf[1]&0x03)<<6) | ((**inbuf)&0x3f);
208 		    n = unicode_to_gb(c1, c2, *outbuf, *outbytesleft, &uconv_num_internal);
209 		    if (n > 0) {
210 			(*outbuf) += n, (*outbytesleft) -= n;
211 
212 		        uconv_num += uconv_num_internal;
213 
214 			st->_ustate = U0;
215 		    } else if ( n == 0 ) {
216 			errno = E2BIG;
217 		    } else { /* n == -1 if unicode is either 0xFFFE or 0xFFFF */
218 		        errno = EILSEQ;
219 		    }
220 		} else {
221 		    errno = EILSEQ;
222 		}
223 		break;
224 	     case U4:
225 
226 	        first_byte = st->_cbuf[0];
227 
228 		/* if the first byte is 0xf0, it is illegal sequence if
229 		 * the second one is between 0x80 and 0x8f
230 		 * for Four-Byte UTF: U+10000..U+10FFFF
231 		 */
232 		if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
233 		     ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
234 		    errno = EILSEQ;
235 		else
236 	            {
237 		      st->_ustate = U5;
238 		      st->_cbuf[1] = **inbuf;
239 		    }
240 		break;
241 	     case U5:
242 	        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
243 		  {
244 		     st->_ustate = U6;
245 		     st->_cbuf[2] = **inbuf;
246 		  }
247 		else
248 		  errno = EILSEQ;
249 		break;
250 	     case U6:
251 	        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
252 		  {
253 		    /* all gb2312 characters are in Unicode Plane 0
254 		     * so replace these other 16 planes with 0x3f3f
255 		     */
256 		    if ( *outbytesleft < 2 )
257 		       errno = E2BIG;
258 		    else
259 		       {
260 			  **outbuf = NON_ID_CHAR;
261 			  *(*outbuf+1) = NON_ID_CHAR;
262 			  (*outbytesleft) -= 2;
263 
264 			  uconv_num++;
265 
266 			  st->_ustate = U0;
267 		       }
268 		  }
269 		else
270 		  errno = EILSEQ;
271 		break;
272 	    }
273 
274 	    if (errno) break;
275 
276 	    (*inbuf)++; (*inbytesleft)--;
277 	}
278 
279 	if (*inbytesleft == 0 && st->_ustate != U0)
280 		errno = EINVAL;
281 
282 	if (*inbytesleft > 0 && *outbytesleft == 0)
283 		errno = E2BIG;
284 
285         if (errno) {
286 	   int num_reversed_bytes = 0;
287 
288 	   switch (st->_ustate)
289 	     {
290 	      case U1:
291 		num_reversed_bytes = 1;
292 		break;
293 	      case U2:
294 		num_reversed_bytes = 1;
295 		break;
296 	      case U3:
297 		num_reversed_bytes = 2;
298 		break;
299 	      case U4:
300 		num_reversed_bytes = 1;
301 		break;
302 	      case U5:
303 		num_reversed_bytes = 2;
304 		break;
305 	      case U6:
306 		num_reversed_bytes = 3;
307 		break;
308 	     }
309 
310 	   /*
311 	    * if error, *inbuf points to the byte following the last byte
312 	    * successfully used in conversion.
313 	    */
314 	   *inbuf -= num_reversed_bytes;
315 	   *inbytesleft += num_reversed_bytes;
316 	   st->_ustate = U0;
317 
318 	   return ((size_t)-1);
319 	}
320 
321 	return uconv_num;
322 }
323 
324 /* return values: 0 - no enough space to hold the GB2312 code
325  *               -1 - illegal sequence
326  *               >0 - buffer length
327  */
unicode_to_gb(char in_byte1,char in_byte2,char * buf,int buflen,int * uconv_num)328 int unicode_to_gb(char in_byte1, char in_byte2, char *buf, int buflen, int *uconv_num)
329 {
330 	int	gb, unicode;
331 	int	i, l, h;
332 
333 	if (buflen < 2)
334 		return 0;
335 	unicode = ((in_byte1 & 0xff) << 8) + (in_byte2 & 0xff);
336 	/* 0xfffe and 0xffff should not be allowed */
337         if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
338 
339 	for (l = 0, h = UNICODEMAX; l < h; ) {
340 		if (unicode_gb_tab[l].key == unicode) {
341 			i = l;
342 			break;
343 		}
344 		if (unicode_gb_tab[h].key == unicode) {
345 			i = h;
346 			break;
347 		}
348 		i = (l + h) / 2;
349 		if (unicode_gb_tab[i].key == unicode)
350 			break;
351 		if (unicode_gb_tab[i].key < unicode)
352 			l = i + 1;
353 		else	h = i - 1;
354 	}
355 	if (unicode == unicode_gb_tab[i].key) {
356 		gb = unicode_gb_tab[i].value;
357 		*buf = ((gb & 0xff00) >> 8) | MSB;
358 		*(buf+1) = (gb & 0xff) | MSB;
359 	} else {
360 		*buf = NON_ID_CHAR;
361 		*(buf+1) = NON_ID_CHAR;
362 
363 	        /* non-identical conversion */
364 		*uconv_num = 1;
365 	}
366 
367 	return 2;
368 }
369