1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright(c) 2001 Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25
26 #include <stdio.h>
27 #include <errno.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/isa_defs.h>
31 #include <gb18030_unicode.h> /* GBK to Unicode mapping table */
32 #include "common_defs.h"
33
/*
 * Byte-level constants and classifiers used by the GBK state machine.
 */
#define MSB 0x80 /* most significant bit; set on the lead byte of a multi-byte character */
#define ONEBYTE 0xff /* right most byte; mask that strips sign extension from a char */
#define GBK_LEN_MAX 4 /* longest input character in bytes; size of _iconv_st.keepc */

/* Lead bytes rejected with EILSEQ in state C0. */
#define INVALID_BYTE(v) ( (v) == 0x80 || (v) == 0xff )
/* Range checks for bytes 2..4 of a 4-byte code; callers pass an unsigned char. */
#define gbk4_2nd_byte(v) ( (v) >= 0x30 && (v) <= 0x39 )
#define gbk4_3rd_byte(v) ( (v) >= 0x81 && (v) <= 0xfe )
#define gbk4_4th_byte(v) gbk4_2nd_byte(v)

/*
 * EF BF BD is the UTF-8 encoding of U+FFFD.  These three macros are not
 * referenced in this file's own code -- presumably consumed by the included
 * uni_common.c; confirm before removing.
 */
#define UTF8_NON_ID_CHAR1 0xEF /* non-identified character */
#define UTF8_NON_ID_CHAR2 0xBF
#define UTF8_NON_ID_CHAR3 0xBD
46
/*
 * Pick the output routine at compile time: one of the UCS_2LE, UCS_2BE,
 * UCS_4LE or UCS_4BE build macros selects the matching unichr_to_ucs_*
 * routine; with none defined the module emits UTF-8.  The unichr_to_*
 * routines are not defined in this file -- presumably they come from the
 * included uni_common.c.
 */
#if defined UCS_2LE
#define output_char unichr_to_ucs_2le
#elif defined UCS_2BE
#define output_char unichr_to_ucs_2be
#elif defined UCS_4LE
#define output_char unichr_to_ucs_4le
#elif defined UCS_4BE
#define output_char unichr_to_ucs_4be
#else
#define output_char unichr_to_utf8
#endif
58
/*
 * Per-descriptor conversion state; allocated in _icv_open() and released in
 * _icv_close().
 */
typedef struct _icv_state {
    char keepc[GBK_LEN_MAX]; /* maximum # byte of GBK2K code; bytes of the character being assembled */
    short cstate; /* state machine id (one of enum _CSTATE) */
    int _errno; /* internal errno; non-zero stops the conversion loop */
    boolean bom_written; /* set in _icv_open(); not read in this file -- presumably used by the output routines, confirm */
} _iconv_st;

/*
 * State machine ids.  C0: expecting a new character; C1, C2, C3: expecting
 * the 2nd, 3rd and 4th byte of a multi-byte character.  The numeric distance
 * from C0 is used by _icv_iconv() as the number of bytes already consumed.
 */
enum _CSTATE { C0, C1, C2, C3 };
67
/* Maps the byte sequence saved in st->keepc to a Unicode value (0xffffffff if unmapped). */
static unsigned long gbk_to_unicode (_iconv_st *);

/* Binary search of a key-ordered table; returns the matching index or -1. */
static int binsearch(unsigned long x, table_t v[], int n);
/* Returns 0 when inbuf is a valid 2nd byte of a 2-byte GBK code, 1 otherwise. */
static int gbk_2nd_byte(char inbuf);
72
73 #include "uni_common.c"
74
75 /*
76 * Open; called from iconv_open()
77 */
78 void *
_icv_open()79 _icv_open()
80 {
81 _iconv_st *st;
82
83 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
84 errno = ENOMEM;
85 return ((void *) -1);
86 }
87
88 st->cstate = C0;
89 st->_errno = 0;
90 #if defined(UCS_2LE) || defined(UCS_2BE) || defined(UCS_4LE) || defined(UCS_4BE)
91 st->bom_written = true;
92 #else
93 st->bom_written = false;
94 #endif
95 return ((void *) st);
96 }
97
98
99 /*
100 * Close; called from iconv_close()
101 */
102 void
_icv_close(_iconv_st * st)103 _icv_close(_iconv_st *st)
104 {
105 if (!st)
106 errno = EBADF;
107 else
108 free(st);
109 }
110
111
112 /*
113 * Actual conversion; called from iconv()
114 */
/*=======================================================
 *
 *   State Machine for interpreting GBK code
 *
 *=======================================================
 *
 *                                  3rd C
 *                              C2--------> C3
 *                              ^            |
 *                        2nd C |      4th C |
 *                     1st C    |            |
 *    +--------> C0 ----------> C1           |
 *    |    ascii |        2nd C |            |
 *    ^          v              v            V
 *    +----<-----+-----<--------+-----<------+
 *
 *=======================================================*/
132 /*
133 * GBK2 encoding range (2 byte area):
134 * High byte: 0x81 - 0xFE ( 126 encoding space)
135 * Low byte: 0x40 - 0x7E, 0x80 - 0xFE ( 190 encoding space)
136 * Total: 126 * 190 = 23,940 (23940 encoding space)
137 *
138 * GBK4 encoding range (4 byte area):
139 * The First byte: 0x81 - 0xFE
140 * The Second byte: 0x30 - 0x39
141 * The Third byte: 0x81 - 0xFE
142 * The fourth byte: 0x30 - 0x39
143 */
144
145 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)146 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
147 char **outbuf, size_t *outbytesleft)
148 {
149 int n;
150 int uconv_num = 0;
151
152 if (st == NULL) {
153 errno = EBADF;
154 return ((size_t) -1);
155 }
156
157 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
158 st->cstate = C0;
159 st->_errno = 0;
160 return ((size_t) 0);
161 }
162
163 st->_errno = 0; /* reset internal errno */
164 errno = 0; /* reset external errno */
165
166 /* a state machine for interpreting GBK code */
167 while (*inbytesleft > 0 && *outbytesleft > 0) {
168 switch (st->cstate) {
169 case C0: /* assuming ASCII in the beginning */
170 if (**inbuf & MSB) {
171 if ( INVALID_BYTE((unsigned char)**inbuf) ) {
172 st->_errno = errno = EILSEQ;
173 } else {
174 st->keepc[0] = (**inbuf);
175 st->cstate = C1;
176 }
177 } else { /* real ASCII */
178 int uconv_num_internal = 0;
179 n = output_char (st, **inbuf, *outbuf,
180 *outbytesleft, &uconv_num_internal);
181 if (n > 0) {
182 (*outbuf) += n;
183 (*outbytesleft) -= n;
184 }
185 }
186 break;
187 case C1: /* GBK2 characters: 2nd byte */
188 if (gbk_2nd_byte(**inbuf) == 0) {
189 int uconv_num_internal = 0;
190
191 st->keepc[1] = (**inbuf);
192 st->keepc[2] = st->keepc[3] = 0;
193
194 n = output_char (st, gbk_to_unicode (st), *outbuf,
195 *outbytesleft, &uconv_num_internal);
196 if (n > 0) {
197 (*outbuf) += n;
198 (*outbytesleft) -= n;
199
200 uconv_num += uconv_num_internal;
201
202 st->cstate = C0;
203 } else { /* don't reset state */
204 st->_errno = errno = E2BIG;
205 }
206
207 } else if ( gbk4_2nd_byte((unsigned char)**inbuf) ) {
208 st->keepc[1] = **inbuf;
209 st->cstate = C2;
210 } else { /* input char doesn't belong
211 * to the input code set
212 */
213 st->_errno = errno = EILSEQ;
214 }
215 break;
216 case C2:
217 if ( gbk4_3rd_byte((unsigned char)**inbuf) ) {
218 st->keepc[2] = **inbuf;
219 st->cstate = C3;
220 } else {
221 st->_errno = errno = EILSEQ;
222 }
223 break;
224 case C3:
225 if ( gbk4_4th_byte((unsigned char)**inbuf) ) {
226 int uconv_num_internal = 0;
227
228 st->keepc[3] = **inbuf;
229
230 n = output_char (st, gbk_to_unicode (st), *outbuf,
231 *outbytesleft, &uconv_num_internal);
232
233 if ( n > 0 ) {
234 (*outbuf) += n;
235 (*outbytesleft) -= n;
236
237 uconv_num += uconv_num_internal;
238
239 st->cstate = C0;
240 } else {
241 st->_errno = errno = E2BIG;
242 }
243 } else {
244 st->_errno = errno = EILSEQ;
245 }
246 break;
247 default: /* should never come here */
248 st->_errno = errno = EILSEQ;
249 st->cstate = C0; /* reset state */
250 break;
251 }
252
253 if (st->_errno) {
254 break;
255 }
256
257 (*inbuf)++;
258 (*inbytesleft)--;
259 }
260
261 if (*inbytesleft == 0 && st->cstate != C0)
262 errno = EINVAL;
263
264 if (*inbytesleft > 0 && *outbytesleft == 0)
265 errno = E2BIG;
266
267 if (errno) {
268 /*
269 * if error, *inbuf points to the byte following the last byte
270 * successfully used in the conversion.
271 */
272 *inbuf -= (st->cstate - C0);
273 *inbytesleft += (st->cstate - C0);
274 st->cstate = C0;
275 return ((size_t) -1);
276 }
277
278 return uconv_num;
279 }
280
281
282 /*
283 * Test whether inbuf is a valid character for 2nd byte GBK code
284 * Return: = 0 - valid GBK2 2nd byte
285 * = 1 - invalid GBK2 2nd byte
286 */
gbk_2nd_byte(char inbuf)287 static int gbk_2nd_byte(char inbuf)
288 {
289 unsigned int buf = (unsigned int) (inbuf & ONEBYTE);
290
291 if ((buf >= 0x40) && (buf <= 0x7E))
292 return (0);
293 if ((buf >= 0x80) && (buf <= 0xFE))
294 return (0);
295 return(1);
296 }
297
gbk_to_unicode(st)298 static unsigned long gbk_to_unicode (st)
299 _iconv_st *st;
300 {
301 unsigned long gbk_val; /* GBK value */
302 int unidx; /* Unicode index */
303 unsigned long uni_val = 0xffffffff; /* Unicode */
304 int isgbk4 = 1;
305 char *keepc = st->keepc;
306
307 if ( keepc[2] == 0 && keepc[3] == 0 )
308 isgbk4 = 0;
309
310 if ( ! isgbk4 ) {
311 gbk_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
312 } else {
313 int i;
314
315 gbk_val = keepc[0] & ONEBYTE;
316 for ( i = 1; i < GBK_LEN_MAX; ++i )
317 gbk_val = (gbk_val << 8) + (keepc[i] & ONEBYTE);
318 }
319
320 if ( isgbk4 ) {
321 unidx = binsearch(gbk_val, gbk4_unicode_tab, GBK4MAX);
322 if ( unidx >= 0 ) uni_val = gbk4_unicode_tab[unidx].value;
323 } else {
324 unidx = binsearch(gbk_val, gbk_unicode_tab, GBKMAX);
325 if ( unidx >= 0 ) uni_val = gbk_unicode_tab[unidx].value;
326 }
327
328 return uni_val;
329 }
330
331 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,table_t v[],int n)332 static int binsearch(unsigned long x, table_t v[], int n)
333 {
334 int low, high, mid;
335
336 low = 0;
337 high = n - 1;
338 while (low <= high) {
339 mid = (high - low) / 2 + low;
340 if (x < v[mid].key)
341 high = mid - 1;
342 else if (x > v[mid].key)
343 low = mid + 1;
344 else /* found match */
345 return mid;
346 }
347 return (-1); /* no match */
348 }
349
350 /*
351 vi:ts=8:ai:expandtab
352 */
353