xref: /titanic_51/usr/src/lib/iconv_modules/zh/common/zh_CN.gbk%UTF-8.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright(c) 2001 Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 
26 #include <stdio.h>
27 #include <errno.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/isa_defs.h>
31 #include <gb18030_unicode.h>	/* GBK to Unicode mapping table */
32 #include "common_defs.h"
33 
/* Bit masks and size limit for GBK input bytes. */
#define	MSB		0x80	/* high bit; clear on plain ASCII bytes */
#define	ONEBYTE		0xff	/* mask selecting the low-order byte */
#define	GBK_LEN_MAX	4	/* longest GBK character, in bytes */

/* 0x80 and 0xff have the high bit set but are rejected as a first byte. */
#define	INVALID_BYTE(v)		((v) == 0x80 || (v) == 0xff)

/* Range checks for bytes 2, 3 and 4 of a four-byte (GBK4) character. */
#define	gbk4_2nd_byte(v)	((v) >= 0x30 && (v) <= 0x39)
#define	gbk4_3rd_byte(v)	((v) >= 0x81 && (v) <= 0xfe)
#define	gbk4_4th_byte(v)	gbk4_2nd_byte(v)

/* UTF-8 byte sequence EF BF BD, standing for a non-identified character. */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD

/*
 * Pick the encoder that matches the target encoding chosen at compile
 * time; UTF-8 is used when no UCS_* symbol is defined.
 */
#if defined(UCS_2LE)
#define	output_char	unichr_to_ucs_2le
#elif defined(UCS_2BE)
#define	output_char	unichr_to_ucs_2be
#elif defined(UCS_4LE)
#define	output_char	unichr_to_ucs_4le
#elif defined(UCS_4BE)
#define	output_char	unichr_to_ucs_4be
#else
#define	output_char	unichr_to_utf8
#endif
58 
59 typedef struct _icv_state {
60 	char	keepc[GBK_LEN_MAX];	/* maximum # byte of GBK2K code */
61 	short	cstate;		/* state machine id */
62 	int	_errno;		/* internal errno */
63         boolean bom_written;
64 } _iconv_st;
65 
/*
 * State machine ids.  Each state's numeric value equals the number of
 * bytes of the current character already stored in keepc[]; _icv_iconv()
 * relies on this when it moves the input pointer back after an error.
 */
enum _CSTATE {
	C0,	/* between characters */
	C1,	/* first byte stored */
	C2,	/* two bytes of a four-byte character stored */
	C3	/* three bytes of a four-byte character stored */
};
67 
68 static unsigned long gbk_to_unicode (_iconv_st *);
69 
70 static int binsearch(unsigned long x, table_t v[], int n);
71 static int gbk_2nd_byte(char inbuf);
72 
73 #include "uni_common.c"
74 
75 /*
76  * Open; called from iconv_open()
77  */
78 void *
79 _icv_open()
80 {
81 	_iconv_st *st;
82 
83 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
84 		errno = ENOMEM;
85 		return ((void *) -1);
86 	}
87 
88 	st->cstate = C0;
89 	st->_errno = 0;
90 #if defined(UCS_2LE) || defined(UCS_2BE) || defined(UCS_4LE) || defined(UCS_4BE)
91 	st->bom_written = true;
92 #else
93         st->bom_written = false;
94 #endif
95 	return ((void *) st);
96 }
97 
98 
99 /*
100  * Close; called from iconv_close()
101  */
102 void
103 _icv_close(_iconv_st *st)
104 {
105 	if (!st)
106 		errno = EBADF;
107 	else
108 		free(st);
109 }
110 
111 
112 /*
113  * Actual conversion; called from iconv()
114  */
115 /*=======================================================
116  *
117  *   State Machine for interpreting GBK code
118  *
119  *=======================================================
120  *
121  * 		                    3rd C
122  *                              C2--------> C3
123  *		                ^            |
124  *                        2nd C |      4th C |
125  *                     1st C    |            |
126  *    +--------> C0 ----------> C1           |
127  *    |    ascii |        2nd C |            |
128  *    ^          v              v	     V
129  *    +----<-----+-----<--------+-----<------+
130  *
131  *=======================================================*/
132 /*
133  * GBK2 encoding range (2 byte area):
134  *	High byte: 0x81 - 0xFE			(  126 encoding space)
135  *	Low byte:  0x40 - 0x7E, 0x80 - 0xFE	(  190 encoding space)
136  *	Total:	   126 * 190 = 23,940		(23940 encoding space)
137  *
138  * GBK4 encoding range (4 byte area):
139  *	The First byte:  0x81 - 0xFE
140  *	The Second byte: 0x30 - 0x39
141  *	The Third byte:  0x81 - 0xFE
142  *	The fourth byte: 0x30 - 0x39
143  */
144 
145 size_t
146 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
147 				char **outbuf, size_t *outbytesleft)
148 {
149 	int	n;
150         int	uconv_num = 0;
151 
152 	if (st == NULL) {
153 		errno = EBADF;
154 		return ((size_t) -1);
155 	}
156 
157 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
158 		st->cstate = C0;
159 		st->_errno = 0;
160 		return ((size_t) 0);
161 	}
162 
163 	st->_errno = 0;         /* reset internal errno */
164 	errno = 0;		/* reset external errno */
165 
166 	/* a state machine for interpreting GBK code */
167 	while (*inbytesleft > 0 && *outbytesleft > 0) {
168 		switch (st->cstate) {
169 		case C0:		/* assuming ASCII in the beginning */
170 			if (**inbuf & MSB) {
171 				if ( INVALID_BYTE((unsigned char)**inbuf) ) {
172 					st->_errno = errno = EILSEQ;
173 				} else {
174 					st->keepc[0] = (**inbuf);
175 					st->cstate = C1;
176 				}
177 			} else {	/* real ASCII */
178                                 int uconv_num_internal = 0;
179                                 n = output_char (st, **inbuf, *outbuf,
180                                                  *outbytesleft, &uconv_num_internal);
181 				if (n > 0) {
182 					(*outbuf) += n;
183 					(*outbytesleft) -= n;
184                                 }
185 			}
186 			break;
187 		case C1:		/* GBK2 characters: 2nd byte */
188 			if (gbk_2nd_byte(**inbuf) == 0) {
189 			        int uconv_num_internal = 0;
190 
191 				st->keepc[1] = (**inbuf);
192 				st->keepc[2] = st->keepc[3] = 0;
193 
194 				n = output_char (st, gbk_to_unicode (st), *outbuf,
195 						 *outbytesleft, &uconv_num_internal);
196 				if (n > 0) {
197 					(*outbuf) += n;
198 					(*outbytesleft) -= n;
199 
200 					uconv_num += uconv_num_internal;
201 
202 					st->cstate = C0;
203 				} else {	/* don't reset state */
204 					st->_errno = errno = E2BIG;
205 				}
206 
207 			} else  if ( gbk4_2nd_byte((unsigned char)**inbuf) ) {
208 				st->keepc[1] = **inbuf;
209 				st->cstate = C2;
210 			} else {	/* input char doesn't belong
211 					 * to the input code set
212 					 */
213 				st->_errno = errno = EILSEQ;
214 			}
215 			break;
216 		case C2:
217 			if ( gbk4_3rd_byte((unsigned char)**inbuf) ) {
218 				st->keepc[2] = **inbuf;
219 				st->cstate = C3;
220 			} else {
221 				st->_errno = errno = EILSEQ;
222 			}
223 			break;
224 		case C3:
225 			if ( gbk4_4th_byte((unsigned char)**inbuf) ) {
226 			        int uconv_num_internal = 0;
227 
228 				st->keepc[3] = **inbuf;
229 
230 				n = output_char (st, gbk_to_unicode (st), *outbuf,
231                                                  *outbytesleft, &uconv_num_internal);
232 
233 				if ( n > 0 ) {
234 					(*outbuf) += n;
235 					(*outbytesleft) -= n;
236 
237 				        uconv_num += uconv_num_internal;
238 
239 					st->cstate = C0;
240 				} else {
241 					st->_errno = errno = E2BIG;
242 				}
243 			} else {
244 				st->_errno = errno = EILSEQ;
245 			}
246 			break;
247 		default:			/* should never come here */
248 			st->_errno = errno = EILSEQ;
249 			st->cstate = C0;	/* reset state */
250 			break;
251 		}
252 
253 		if (st->_errno) {
254 			break;
255 		}
256 
257 		(*inbuf)++;
258 		(*inbytesleft)--;
259 	}
260 
261         if (*inbytesleft == 0 && st->cstate != C0)
262                 errno = EINVAL;
263 
264 	if (*inbytesleft > 0 && *outbytesleft == 0)
265 		errno = E2BIG;
266 
267         if (errno) {
268                 /*
269 		 * if error, *inbuf points to the byte following the last byte
270 		 * successfully used in the conversion.
271 		 */
272 		*inbuf -= (st->cstate - C0);
273 		*inbytesleft += (st->cstate - C0);
274 	        st->cstate = C0;
275 		return ((size_t) -1);
276 	}
277 
278 	return uconv_num;
279 }
280 
281 
/*
 * Check whether a byte may be the second byte of a two-byte GBK
 * character, i.e. whether it lies in 0x40-0x7E or 0x80-0xFE.
 * Return: = 0 - valid GBK2 2nd byte
 *         = 1 - invalid GBK2 2nd byte
 */
static int gbk_2nd_byte(char inbuf)
{
	/* go through unsigned char so bytes above 0x7f compare as 0x80-0xff */
	unsigned int	byte = (unsigned char) inbuf;
	int		in_low_range = (byte >= 0x40 && byte <= 0x7E);
	int		in_high_range = (byte >= 0x80 && byte <= 0xFE);

	return ((in_low_range || in_high_range) ? 0 : 1);
}
297 
298 static unsigned long gbk_to_unicode (st)
299 _iconv_st *st;
300 {
301 	unsigned long	gbk_val;	        /* GBK value */
302 	int		unidx;		        /* Unicode index */
303 	unsigned long	uni_val = 0xffffffff;	/* Unicode */
304 	int		isgbk4 = 1;
305 	char            *keepc = st->keepc;
306 
307 	if ( keepc[2] == 0 && keepc[3] == 0 )
308 		isgbk4 = 0;
309 
310 	if ( ! isgbk4 ) {
311 		gbk_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
312         } else {
313 		int  i;
314 
315 		gbk_val = keepc[0] & ONEBYTE;
316 		for ( i = 1; i < GBK_LEN_MAX; ++i )
317 			gbk_val = (gbk_val << 8) + (keepc[i] & ONEBYTE);
318 	}
319 
320 	if  ( isgbk4 ) {
321 		unidx = binsearch(gbk_val, gbk4_unicode_tab, GBK4MAX);
322 		if ( unidx >= 0 ) uni_val = gbk4_unicode_tab[unidx].value;
323 	} else {
324 		unidx = binsearch(gbk_val, gbk_unicode_tab, GBKMAX);
325 		if ( unidx >= 0 ) uni_val = gbk_unicode_tab[unidx].value;
326 	}
327 
328         return uni_val;
329 }
330 
331 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
332 static int binsearch(unsigned long x, table_t v[], int n)
333 {
334 	int low, high, mid;
335 
336 	low = 0;
337 	high = n - 1;
338 	while (low <= high) {
339 		mid = (high - low) / 2 + low;
340 		if (x < v[mid].key)
341 			high = mid - 1;
342 		else if (x > v[mid].key)
343 			low = mid + 1;
344 		else	/* found match */
345 			return mid;
346 	}
347 	return (-1);	/* no match */
348 }
349 
350 /*
351 vi:ts=8:ai:expandtab
352 */
353