/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright(c) 2001 Sun Microsystems, Inc.
 * All rights reserved.
 */
25*16d86563SAlexander Pyhalov
26*16d86563SAlexander Pyhalov #include <stdio.h>
27*16d86563SAlexander Pyhalov #include <errno.h>
28*16d86563SAlexander Pyhalov #include <stdlib.h>
29*16d86563SAlexander Pyhalov #include <sys/types.h>
30*16d86563SAlexander Pyhalov #include <sys/isa_defs.h>
31*16d86563SAlexander Pyhalov #include <gb18030_unicode.h> /* GBK to Unicode mapping table */
32*16d86563SAlexander Pyhalov #include "common_defs.h"
33*16d86563SAlexander Pyhalov
#define MSB             0x80    /* most significant bit */
#define ONEBYTE         0xff    /* right most byte */
#define GBK_LEN_MAX     4       /* longest encoded character, in bytes */

/*
 * Byte classification helpers.  Callers must pass an unsigned byte value
 * (0x00 - 0xff); a sign-extended char would never match the upper ranges.
 */
/* 0x80 and 0xff are rejected as the first byte of a multi-byte character */
#define INVALID_BYTE(v)     ((v) == 0x80 || (v) == 0xff)
/* 2nd and 4th byte of a four-byte character: 0x30 - 0x39 */
#define gbk4_2nd_byte(v)    ((v) >= 0x30 && (v) <= 0x39)
/* 3rd byte of a four-byte character: 0x81 - 0xfe */
#define gbk4_3rd_byte(v)    ((v) >= 0x81 && (v) <= 0xfe)
#define gbk4_4th_byte(v)    gbk4_2nd_byte(v)

/* UTF-8 byte sequence EF BF BD (U+FFFD), the non-identified character */
#define UTF8_NON_ID_CHAR1   0xEF
#define UTF8_NON_ID_CHAR2   0xBF
#define UTF8_NON_ID_CHAR3   0xBD

/*
 * Select the output encoder at compile time.  The build defines at most one
 * of UCS_2LE / UCS_2BE / UCS_4LE / UCS_4BE; with none defined the module
 * emits UTF-8.
 */
#if defined UCS_2LE
#define output_char     unichr_to_ucs_2le
#elif defined UCS_2BE
#define output_char     unichr_to_ucs_2be
#elif defined UCS_4LE
#define output_char     unichr_to_ucs_4le
#elif defined UCS_4BE
#define output_char     unichr_to_ucs_4be
#else
#define output_char     unichr_to_utf8
#endif
58*16d86563SAlexander Pyhalov
59*16d86563SAlexander Pyhalov typedef struct _icv_state {
60*16d86563SAlexander Pyhalov char keepc[GBK_LEN_MAX]; /* maximum # byte of GBK2K code */
61*16d86563SAlexander Pyhalov short cstate; /* state machine id */
62*16d86563SAlexander Pyhalov int _errno; /* internal errno */
63*16d86563SAlexander Pyhalov boolean bom_written;
64*16d86563SAlexander Pyhalov } _iconv_st;
65*16d86563SAlexander Pyhalov
/*
 * State machine ids.  Cn means "n bytes of the current character are held
 * in keepc"; _icv_iconv() relies on (cstate - C0) being that byte count
 * when it rewinds the input pointer after an error, so C0 must stay 0 and
 * the values must stay consecutive.
 */
enum _CSTATE { C0 = 0, C1, C2, C3 };
67*16d86563SAlexander Pyhalov
68*16d86563SAlexander Pyhalov static unsigned long gbk_to_unicode (_iconv_st *);
69*16d86563SAlexander Pyhalov
70*16d86563SAlexander Pyhalov static int binsearch(unsigned long x, table_t v[], int n);
71*16d86563SAlexander Pyhalov static int gbk_2nd_byte(char inbuf);
72*16d86563SAlexander Pyhalov
73*16d86563SAlexander Pyhalov #include "uni_common.c"
74*16d86563SAlexander Pyhalov
75*16d86563SAlexander Pyhalov /*
76*16d86563SAlexander Pyhalov * Open; called from iconv_open()
77*16d86563SAlexander Pyhalov */
78*16d86563SAlexander Pyhalov void *
_icv_open()79*16d86563SAlexander Pyhalov _icv_open()
80*16d86563SAlexander Pyhalov {
81*16d86563SAlexander Pyhalov _iconv_st *st;
82*16d86563SAlexander Pyhalov
83*16d86563SAlexander Pyhalov if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
84*16d86563SAlexander Pyhalov errno = ENOMEM;
85*16d86563SAlexander Pyhalov return ((void *) -1);
86*16d86563SAlexander Pyhalov }
87*16d86563SAlexander Pyhalov
88*16d86563SAlexander Pyhalov st->cstate = C0;
89*16d86563SAlexander Pyhalov st->_errno = 0;
90*16d86563SAlexander Pyhalov #if defined(UCS_2LE) || defined(UCS_2BE) || defined(UCS_4LE) || defined(UCS_4BE)
91*16d86563SAlexander Pyhalov st->bom_written = true;
92*16d86563SAlexander Pyhalov #else
93*16d86563SAlexander Pyhalov st->bom_written = false;
94*16d86563SAlexander Pyhalov #endif
95*16d86563SAlexander Pyhalov return ((void *) st);
96*16d86563SAlexander Pyhalov }
97*16d86563SAlexander Pyhalov
98*16d86563SAlexander Pyhalov
99*16d86563SAlexander Pyhalov /*
100*16d86563SAlexander Pyhalov * Close; called from iconv_close()
101*16d86563SAlexander Pyhalov */
102*16d86563SAlexander Pyhalov void
_icv_close(_iconv_st * st)103*16d86563SAlexander Pyhalov _icv_close(_iconv_st *st)
104*16d86563SAlexander Pyhalov {
105*16d86563SAlexander Pyhalov if (!st)
106*16d86563SAlexander Pyhalov errno = EBADF;
107*16d86563SAlexander Pyhalov else
108*16d86563SAlexander Pyhalov free(st);
109*16d86563SAlexander Pyhalov }
110*16d86563SAlexander Pyhalov
111*16d86563SAlexander Pyhalov
112*16d86563SAlexander Pyhalov /*
113*16d86563SAlexander Pyhalov * Actual conversion; called from iconv()
114*16d86563SAlexander Pyhalov */
115*16d86563SAlexander Pyhalov /*=======================================================
116*16d86563SAlexander Pyhalov *
117*16d86563SAlexander Pyhalov * State Machine for interpreting GBK code
118*16d86563SAlexander Pyhalov *
119*16d86563SAlexander Pyhalov *=======================================================
120*16d86563SAlexander Pyhalov *
121*16d86563SAlexander Pyhalov * 3rd C
122*16d86563SAlexander Pyhalov * C2--------> C3
123*16d86563SAlexander Pyhalov * ^ |
124*16d86563SAlexander Pyhalov * 2nd C | 4th C |
125*16d86563SAlexander Pyhalov * 1st C | |
126*16d86563SAlexander Pyhalov * +--------> C0 ----------> C1 |
127*16d86563SAlexander Pyhalov * | ascii | 2nd C | |
128*16d86563SAlexander Pyhalov * ^ v v V
129*16d86563SAlexander Pyhalov * +----<-----+-----<--------+-----<------+
130*16d86563SAlexander Pyhalov *
131*16d86563SAlexander Pyhalov *=======================================================*/
132*16d86563SAlexander Pyhalov /*
133*16d86563SAlexander Pyhalov * GBK2 encoding range (2 byte area):
134*16d86563SAlexander Pyhalov * High byte: 0x81 - 0xFE ( 126 encoding space)
135*16d86563SAlexander Pyhalov * Low byte: 0x40 - 0x7E, 0x80 - 0xFE ( 190 encoding space)
136*16d86563SAlexander Pyhalov * Total: 126 * 190 = 23,940 (23940 encoding space)
137*16d86563SAlexander Pyhalov *
138*16d86563SAlexander Pyhalov * GBK4 encoding range (4 byte area):
139*16d86563SAlexander Pyhalov * The First byte: 0x81 - 0xFE
140*16d86563SAlexander Pyhalov * The Second byte: 0x30 - 0x39
141*16d86563SAlexander Pyhalov * The Third byte: 0x81 - 0xFE
142*16d86563SAlexander Pyhalov * The fourth byte: 0x30 - 0x39
143*16d86563SAlexander Pyhalov */
144*16d86563SAlexander Pyhalov
145*16d86563SAlexander Pyhalov size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)146*16d86563SAlexander Pyhalov _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
147*16d86563SAlexander Pyhalov char **outbuf, size_t *outbytesleft)
148*16d86563SAlexander Pyhalov {
149*16d86563SAlexander Pyhalov int n;
150*16d86563SAlexander Pyhalov int uconv_num = 0;
151*16d86563SAlexander Pyhalov
152*16d86563SAlexander Pyhalov if (st == NULL) {
153*16d86563SAlexander Pyhalov errno = EBADF;
154*16d86563SAlexander Pyhalov return ((size_t) -1);
155*16d86563SAlexander Pyhalov }
156*16d86563SAlexander Pyhalov
157*16d86563SAlexander Pyhalov if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
158*16d86563SAlexander Pyhalov st->cstate = C0;
159*16d86563SAlexander Pyhalov st->_errno = 0;
160*16d86563SAlexander Pyhalov return ((size_t) 0);
161*16d86563SAlexander Pyhalov }
162*16d86563SAlexander Pyhalov
163*16d86563SAlexander Pyhalov st->_errno = 0; /* reset internal errno */
164*16d86563SAlexander Pyhalov errno = 0; /* reset external errno */
165*16d86563SAlexander Pyhalov
166*16d86563SAlexander Pyhalov /* a state machine for interpreting GBK code */
167*16d86563SAlexander Pyhalov while (*inbytesleft > 0 && *outbytesleft > 0) {
168*16d86563SAlexander Pyhalov switch (st->cstate) {
169*16d86563SAlexander Pyhalov case C0: /* assuming ASCII in the beginning */
170*16d86563SAlexander Pyhalov if (**inbuf & MSB) {
171*16d86563SAlexander Pyhalov if ( INVALID_BYTE((unsigned char)**inbuf) ) {
172*16d86563SAlexander Pyhalov st->_errno = errno = EILSEQ;
173*16d86563SAlexander Pyhalov } else {
174*16d86563SAlexander Pyhalov st->keepc[0] = (**inbuf);
175*16d86563SAlexander Pyhalov st->cstate = C1;
176*16d86563SAlexander Pyhalov }
177*16d86563SAlexander Pyhalov } else { /* real ASCII */
178*16d86563SAlexander Pyhalov int uconv_num_internal = 0;
179*16d86563SAlexander Pyhalov n = output_char (st, **inbuf, *outbuf,
180*16d86563SAlexander Pyhalov *outbytesleft, &uconv_num_internal);
181*16d86563SAlexander Pyhalov if (n > 0) {
182*16d86563SAlexander Pyhalov (*outbuf) += n;
183*16d86563SAlexander Pyhalov (*outbytesleft) -= n;
184*16d86563SAlexander Pyhalov }
185*16d86563SAlexander Pyhalov }
186*16d86563SAlexander Pyhalov break;
187*16d86563SAlexander Pyhalov case C1: /* GBK2 characters: 2nd byte */
188*16d86563SAlexander Pyhalov if (gbk_2nd_byte(**inbuf) == 0) {
189*16d86563SAlexander Pyhalov int uconv_num_internal = 0;
190*16d86563SAlexander Pyhalov
191*16d86563SAlexander Pyhalov st->keepc[1] = (**inbuf);
192*16d86563SAlexander Pyhalov st->keepc[2] = st->keepc[3] = 0;
193*16d86563SAlexander Pyhalov
194*16d86563SAlexander Pyhalov n = output_char (st, gbk_to_unicode (st), *outbuf,
195*16d86563SAlexander Pyhalov *outbytesleft, &uconv_num_internal);
196*16d86563SAlexander Pyhalov if (n > 0) {
197*16d86563SAlexander Pyhalov (*outbuf) += n;
198*16d86563SAlexander Pyhalov (*outbytesleft) -= n;
199*16d86563SAlexander Pyhalov
200*16d86563SAlexander Pyhalov uconv_num += uconv_num_internal;
201*16d86563SAlexander Pyhalov
202*16d86563SAlexander Pyhalov st->cstate = C0;
203*16d86563SAlexander Pyhalov } else { /* don't reset state */
204*16d86563SAlexander Pyhalov st->_errno = errno = E2BIG;
205*16d86563SAlexander Pyhalov }
206*16d86563SAlexander Pyhalov
207*16d86563SAlexander Pyhalov } else if ( gbk4_2nd_byte((unsigned char)**inbuf) ) {
208*16d86563SAlexander Pyhalov st->keepc[1] = **inbuf;
209*16d86563SAlexander Pyhalov st->cstate = C2;
210*16d86563SAlexander Pyhalov } else { /* input char doesn't belong
211*16d86563SAlexander Pyhalov * to the input code set
212*16d86563SAlexander Pyhalov */
213*16d86563SAlexander Pyhalov st->_errno = errno = EILSEQ;
214*16d86563SAlexander Pyhalov }
215*16d86563SAlexander Pyhalov break;
216*16d86563SAlexander Pyhalov case C2:
217*16d86563SAlexander Pyhalov if ( gbk4_3rd_byte((unsigned char)**inbuf) ) {
218*16d86563SAlexander Pyhalov st->keepc[2] = **inbuf;
219*16d86563SAlexander Pyhalov st->cstate = C3;
220*16d86563SAlexander Pyhalov } else {
221*16d86563SAlexander Pyhalov st->_errno = errno = EILSEQ;
222*16d86563SAlexander Pyhalov }
223*16d86563SAlexander Pyhalov break;
224*16d86563SAlexander Pyhalov case C3:
225*16d86563SAlexander Pyhalov if ( gbk4_4th_byte((unsigned char)**inbuf) ) {
226*16d86563SAlexander Pyhalov int uconv_num_internal = 0;
227*16d86563SAlexander Pyhalov
228*16d86563SAlexander Pyhalov st->keepc[3] = **inbuf;
229*16d86563SAlexander Pyhalov
230*16d86563SAlexander Pyhalov n = output_char (st, gbk_to_unicode (st), *outbuf,
231*16d86563SAlexander Pyhalov *outbytesleft, &uconv_num_internal);
232*16d86563SAlexander Pyhalov
233*16d86563SAlexander Pyhalov if ( n > 0 ) {
234*16d86563SAlexander Pyhalov (*outbuf) += n;
235*16d86563SAlexander Pyhalov (*outbytesleft) -= n;
236*16d86563SAlexander Pyhalov
237*16d86563SAlexander Pyhalov uconv_num += uconv_num_internal;
238*16d86563SAlexander Pyhalov
239*16d86563SAlexander Pyhalov st->cstate = C0;
240*16d86563SAlexander Pyhalov } else {
241*16d86563SAlexander Pyhalov st->_errno = errno = E2BIG;
242*16d86563SAlexander Pyhalov }
243*16d86563SAlexander Pyhalov } else {
244*16d86563SAlexander Pyhalov st->_errno = errno = EILSEQ;
245*16d86563SAlexander Pyhalov }
246*16d86563SAlexander Pyhalov break;
247*16d86563SAlexander Pyhalov default: /* should never come here */
248*16d86563SAlexander Pyhalov st->_errno = errno = EILSEQ;
249*16d86563SAlexander Pyhalov st->cstate = C0; /* reset state */
250*16d86563SAlexander Pyhalov break;
251*16d86563SAlexander Pyhalov }
252*16d86563SAlexander Pyhalov
253*16d86563SAlexander Pyhalov if (st->_errno) {
254*16d86563SAlexander Pyhalov break;
255*16d86563SAlexander Pyhalov }
256*16d86563SAlexander Pyhalov
257*16d86563SAlexander Pyhalov (*inbuf)++;
258*16d86563SAlexander Pyhalov (*inbytesleft)--;
259*16d86563SAlexander Pyhalov }
260*16d86563SAlexander Pyhalov
261*16d86563SAlexander Pyhalov if (*inbytesleft == 0 && st->cstate != C0)
262*16d86563SAlexander Pyhalov errno = EINVAL;
263*16d86563SAlexander Pyhalov
264*16d86563SAlexander Pyhalov if (*inbytesleft > 0 && *outbytesleft == 0)
265*16d86563SAlexander Pyhalov errno = E2BIG;
266*16d86563SAlexander Pyhalov
267*16d86563SAlexander Pyhalov if (errno) {
268*16d86563SAlexander Pyhalov /*
269*16d86563SAlexander Pyhalov * if error, *inbuf points to the byte following the last byte
270*16d86563SAlexander Pyhalov * successfully used in the conversion.
271*16d86563SAlexander Pyhalov */
272*16d86563SAlexander Pyhalov *inbuf -= (st->cstate - C0);
273*16d86563SAlexander Pyhalov *inbytesleft += (st->cstate - C0);
274*16d86563SAlexander Pyhalov st->cstate = C0;
275*16d86563SAlexander Pyhalov return ((size_t) -1);
276*16d86563SAlexander Pyhalov }
277*16d86563SAlexander Pyhalov
278*16d86563SAlexander Pyhalov return uconv_num;
279*16d86563SAlexander Pyhalov }
280*16d86563SAlexander Pyhalov
281*16d86563SAlexander Pyhalov
/*
 * Test whether inbuf is a valid 2nd byte of a two-byte GBK character,
 * i.e. lies in 0x40 - 0x7E or 0x80 - 0xFE.
 * Return: = 0 - valid GBK2 2nd byte
 *         = 1 - invalid GBK2 2nd byte
 */
static int
gbk_2nd_byte(char inbuf)
{
    /* look at the raw byte value so a signed char cannot go negative */
    unsigned int byte = (unsigned char)inbuf;

    if (byte >= 0x40 && byte <= 0x7E)
        return (0);
    if (byte >= 0x80 && byte <= 0xFE)
        return (0);
    return (1);
}
297*16d86563SAlexander Pyhalov
gbk_to_unicode(st)298*16d86563SAlexander Pyhalov static unsigned long gbk_to_unicode (st)
299*16d86563SAlexander Pyhalov _iconv_st *st;
300*16d86563SAlexander Pyhalov {
301*16d86563SAlexander Pyhalov unsigned long gbk_val; /* GBK value */
302*16d86563SAlexander Pyhalov int unidx; /* Unicode index */
303*16d86563SAlexander Pyhalov unsigned long uni_val = 0xffffffff; /* Unicode */
304*16d86563SAlexander Pyhalov int isgbk4 = 1;
305*16d86563SAlexander Pyhalov char *keepc = st->keepc;
306*16d86563SAlexander Pyhalov
307*16d86563SAlexander Pyhalov if ( keepc[2] == 0 && keepc[3] == 0 )
308*16d86563SAlexander Pyhalov isgbk4 = 0;
309*16d86563SAlexander Pyhalov
310*16d86563SAlexander Pyhalov if ( ! isgbk4 ) {
311*16d86563SAlexander Pyhalov gbk_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
312*16d86563SAlexander Pyhalov } else {
313*16d86563SAlexander Pyhalov int i;
314*16d86563SAlexander Pyhalov
315*16d86563SAlexander Pyhalov gbk_val = keepc[0] & ONEBYTE;
316*16d86563SAlexander Pyhalov for ( i = 1; i < GBK_LEN_MAX; ++i )
317*16d86563SAlexander Pyhalov gbk_val = (gbk_val << 8) + (keepc[i] & ONEBYTE);
318*16d86563SAlexander Pyhalov }
319*16d86563SAlexander Pyhalov
320*16d86563SAlexander Pyhalov if ( isgbk4 ) {
321*16d86563SAlexander Pyhalov unidx = binsearch(gbk_val, gbk4_unicode_tab, GBK4MAX);
322*16d86563SAlexander Pyhalov if ( unidx >= 0 ) uni_val = gbk4_unicode_tab[unidx].value;
323*16d86563SAlexander Pyhalov } else {
324*16d86563SAlexander Pyhalov unidx = binsearch(gbk_val, gbk_unicode_tab, GBKMAX);
325*16d86563SAlexander Pyhalov if ( unidx >= 0 ) uni_val = gbk_unicode_tab[unidx].value;
326*16d86563SAlexander Pyhalov }
327*16d86563SAlexander Pyhalov
328*16d86563SAlexander Pyhalov return uni_val;
329*16d86563SAlexander Pyhalov }
330*16d86563SAlexander Pyhalov
331*16d86563SAlexander Pyhalov /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,table_t v[],int n)332*16d86563SAlexander Pyhalov static int binsearch(unsigned long x, table_t v[], int n)
333*16d86563SAlexander Pyhalov {
334*16d86563SAlexander Pyhalov int low, high, mid;
335*16d86563SAlexander Pyhalov
336*16d86563SAlexander Pyhalov low = 0;
337*16d86563SAlexander Pyhalov high = n - 1;
338*16d86563SAlexander Pyhalov while (low <= high) {
339*16d86563SAlexander Pyhalov mid = (high - low) / 2 + low;
340*16d86563SAlexander Pyhalov if (x < v[mid].key)
341*16d86563SAlexander Pyhalov high = mid - 1;
342*16d86563SAlexander Pyhalov else if (x > v[mid].key)
343*16d86563SAlexander Pyhalov low = mid + 1;
344*16d86563SAlexander Pyhalov else /* found match */
345*16d86563SAlexander Pyhalov return mid;
346*16d86563SAlexander Pyhalov }
347*16d86563SAlexander Pyhalov return (-1); /* no match */
348*16d86563SAlexander Pyhalov }
349*16d86563SAlexander Pyhalov
/*
 * vi:ts=8:ai:expandtab
 */
353