1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1995, by Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25
26 #include <stdio.h>
27 #include <errno.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <unicode_gb18030.h> /* Unicode to GBK mapping table */
31 #include "common_defs.h"
32 #include "ucs4.h"
33
/*
 * Byte-level helpers shared by the converters in this file.
 */
#define MSB 0x80 /* most significant bit of a byte; clear for ASCII */
#define ONEBYTE 0xff /* mask selecting the right most (low-order) byte */

/* Substitution byte; written twice for a code point with no mapping. */
#define NON_ID_CHAR '?' /* non-identified character */

/*
 * Non-zero when any of bits 16..31 of v is set.  unicode_to_gbk() uses
 * this to choose between the two-byte and the four-byte output form.
 */
#define IS_GBK4BYTES(v) ( (v) & 0xffff0000 )
/* Size of the staging buffer for one output character in unicode_to_gbk(). */
#define GBK_LEN_MAX 4
41
42
/*
 * Per-descriptor conversion state; allocated in _icv_open() and freed in
 * _icv_close().
 */
typedef struct _icv_state {
    char keepc[6]; /* maximum # byte of UTF8 code; bytes of the sequence being decoded */
    short ustate; /* current decoder state, one of enum _USTATE */
    int _errno; /* internal errno */
} _iconv_st;

/*
 * States of the UTF-8 decoder in _icv_iconv():
 *   U0      - between characters
 *   U1      - lead byte of a 2-byte sequence seen, expecting the 2nd byte
 *   U2, U3  - 3-byte sequence, expecting the 2nd / 3rd byte
 *   U4      - code point assembled, ready to be converted and written
 *   U5..U7  - 4-byte sequence, expecting the 2nd / 3rd / 4th byte
 */
enum _USTATE { U0, U1, U2, U3, U4, U5, U6, U7 };

/* Look up a code point in unicode_gbk_tab; returns 0, 1 (no mapping) or -1. */
int get_gbk_by_unicode(unsigned long, int*, unsigned long*);
/* Binary search over a table_t array sorted by key; returns index or -1. */
static int binsearch(unsigned long x, table_t v[], int n);
/* Write one output character into buf; returns bytes written, 0 if buf is too small. */
int unicode_to_gbk(int unidx, unsigned long gbkcode, char* buf, size_t buflen, int *uconv_num);
54
55 /*
56 * Open; called from iconv_open()
57 */
58 void *
_icv_open()59 _icv_open()
60 {
61 _iconv_st *st;
62
63 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
64 errno = ENOMEM;
65 return ((void *) -1);
66 }
67
68 st->ustate = U0;
69 st->_errno = 0;
70 return ((void *) st);
71 }
72
73
74 /*
75 * Close; called from iconv_close()
76 */
77 void
_icv_close(_iconv_st * st)78 _icv_close(_iconv_st *st)
79 {
80 if (!st)
81 errno = EBADF;
82 else
83 free(st);
84 }
85
86 #if defined(UCS_2LE) || defined (UCS_2BE) || defined (UCS_4LE) || defined (UCS_4BE)
87 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)88 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
89 char **outbuf, size_t *outbytesleft)
90 {
91 unsigned char c1, c2;
92 #if defined(UCS_4LE) || defined (UCS_4BE)
93 unsigned char c3, c4;
94 #endif
95 int n, unidx;
96 unsigned long unichr;
97 unsigned long gbkcode;
98 int uconv_num = 0;
99
100 if (st == NULL) {
101 errno = EBADF;
102 return ((size_t) -1);
103 }
104
105 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
106 st->ustate = U0;
107 st->_errno = 0;
108 return ((size_t) 0);
109 }
110
111 st->_errno = 0; /* reset internal errno */
112 errno = 0; /* reset external errno */
113
114 while (*inbytesleft > ICV_FETCH_UCS_SIZE-1 && *outbytesleft > 0) {
115
116 int size = 0;
117 int uconv_num_internal = 0;
118
119 c1 = *(*inbuf + size++);
120 c2 = *(*inbuf + size++);
121 #if defined(UCS_4LE) || defined (UCS_4BE)
122 c3 = *(*inbuf + size++);
123 c4 = *(*inbuf + size++);
124 #endif
125
126 #if defined(UCS_2LE)
127 unichr = (unsigned long) (c1 | (c2<<8));
128 #elif defined(UCS_2BE)
129 unichr = (unsigned long) ((c1<<8) | c2);
130 #elif defined(UCS_4LE)
131 unichr = (unsigned long) (c1 | (c2<<8) | (c3)<<16 | (c4<<24));
132 #else
133 unichr = (unsigned long) ((c1<<24) | (c2<<16) | (c3<<8) | c4);
134 #endif
135
136 if (unichr < MSB) { /* ASCII */
137 **outbuf = (char) unichr;
138 (*outbuf)++;
139 (*outbytesleft)--;
140 } else {
141 n = get_gbk_by_unicode(unichr, &unidx, &gbkcode);
142 if ( n == -1 ) { /* invalid unicode codepoint */
143 st->_errno = errno = EILSEQ;
144 return ((size_t)-1);
145 }
146
147 n = unicode_to_gbk(unidx, gbkcode, *outbuf, *outbytesleft, &uconv_num_internal);
148 if (n > 0) {
149 (*outbuf) += n;
150 (*outbytesleft) -= n;
151
152 uconv_num += uconv_num_internal;
153 } else {
154 return ((size_t)-1);
155 }
156 }
157
158 (*inbuf) += size;
159 (*inbytesleft) -= size;
160 }
161
162 if ( *inbytesleft >0 ) {
163 errno = *outbytesleft? EINVAL: E2BIG;
164 return ((size_t)-1);
165 }
166
167 return uconv_num;
168 }
169 #else
170 /*
171 * Actual conversion; called from iconv()
172 */
173 /*=========================================================
174 *
175 * State Machine for interpreting UTF8 code
176 *
177 *=========================================================
178 * 4 byte unicode
179 * +----->------->------------> U5 -----> U6-------> U7---+
180 * | |
181 * | 3 byte unicode |
182 * +----->------->-------+ |
183 * | | |
184 * ^ v |
185 * | 2 byte U2 ---> U3 |
186 * | unicode v |
187 * +------> U0 -------> U1 +-------->U4---+ |
188 * ^ ascii | | ^ | |
189 * | | +-------->--------->--------+ | |
190 * | v v V
191 * +----<---+-----<------------<------------<------------+---------+
192 *
193 *=========================================================*/
194 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)195 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
196 char **outbuf, size_t *outbytesleft)
197 {
198 char c1 = 0, c2 = 0;
199 int n, unidx;
200 unsigned long unichr;
201 unsigned long gbkcode;
202 int uconv_num = 0;
203 int utf8_len = 0;
204
205 #ifdef DEBUG
206 fprintf(stderr, "========== iconv(): UTF2 --> GBK2K ==========\n");
207 #endif
208 if (st == NULL) {
209 errno = EBADF;
210 return ((size_t) -1);
211 }
212
213 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
214 st->ustate = U0;
215 st->_errno = 0;
216 return ((size_t) 0);
217 }
218
219 st->_errno = 0; /* reset internal errno */
220 errno = 0; /* reset external errno */
221
222 /* a state machine for interpreting UTF8 code */
223 while (*inbytesleft > 0 && *outbytesleft > 0) {
224
225 uchar_t first_byte;
226 int uconv_num_internal = 0;
227
228 switch (st->ustate) {
229 case U0: /* assuming ASCII in the beginning */
230 if ((**inbuf & MSB) == 0) { /* ASCII */
231 **outbuf = **inbuf;
232 (*outbuf)++;
233 (*outbytesleft)--;
234 } else {
235 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc0..0xdf */
236 /* invalid sequence if the first char is either 0xc0 or 0xc1 */
237 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
238 st->_errno = errno = EILSEQ;
239 else {
240 st->ustate = U1;
241 st->keepc[0] = **inbuf;
242 }
243 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */
244 st->ustate = U2;
245 st->keepc[0] = **inbuf;
246 } else {
247 /* four bytes of UTF-8 sequences */
248 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
249 st->_errno = errno = EILSEQ;
250 else {
251 st->ustate = U5;
252 st->keepc[0] = **inbuf;
253 }
254 }
255 }
256 break;
257 case U1: /* 2 byte unicode */
258 if ((**inbuf & 0xc0) == MSB) {
259 utf8_len = 2;
260 st->keepc[1] = **inbuf;
261
262 c1 = (st->keepc[0]&0x1c)>>2;
263 c2 = ((st->keepc[0]&0x03)<<6) | ((st->keepc[1])&0x3f);
264
265 st->ustate = U4;
266 #ifdef DEBUG
267 fprintf(stderr, "UTF8: %02x%02x --> ",
268 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
269 #endif
270 continue; /* should not advance *inbuf */
271 } else {
272 st->_errno = errno = EILSEQ;
273 }
274 break;
275 case U2: /* 3 byte unicode - 2nd byte */
276 first_byte = (uchar_t)st->keepc[0];
277
278 /* if the first byte is 0xed, it is illegal sequence if the second
279 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
280 */
281 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
282 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
283 st->_errno = errno = EILSEQ;
284 else
285 {
286 st->ustate = U3;
287 st->keepc[1] = **inbuf;
288 }
289 break;
290 case U3: /* 3 byte unicode - 3rd byte */
291 if ((**inbuf & 0xc0) == MSB) {
292 st->ustate = U4;
293 utf8_len = 3;
294 st->keepc[2] = **inbuf;
295 c1 = ((st->keepc[0]&0x0f)<<4) |
296 ((st->keepc[1]&0x3c)>>2);
297 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
298 #ifdef DEBUG
299 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
300 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
301 #endif
302 continue; /* should not advance *inbuf */
303 } else {
304 st->_errno = errno = EILSEQ;
305 }
306 break;
307 case U4:
308 unichr = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
309 n = get_gbk_by_unicode(unichr, &unidx, &gbkcode);
310 if ( n == -1 ) { /* unicode is either 0xFFFE or 0xFFFF */
311 st->_errno = errno = EILSEQ;
312 break;
313 }
314 /* comment the following lines so that converter can ignore the non-GBK characters
315 if (n != 0) { * legal unicode;illegal GBK *
316 st->_errno = errno = EILSEQ;
317 break;
318 }
319 */
320 n = unicode_to_gbk(unidx, gbkcode, *outbuf, *outbytesleft, &uconv_num_internal);
321 if (n > 0) {
322 (*outbuf) += n;
323 (*outbytesleft) -= n;
324
325 uconv_num += uconv_num_internal;
326
327 st->ustate = U0;
328 } else {
329 st->_errno = errno;
330 }
331 break;
332 case U5:
333 first_byte = st->keepc[0];
334
335 /* if the first byte is 0xf0, it is illegal sequence if
336 * the second one is between 0x80 and 0x8f
337 * for Four-Byte UTF: U+10000..U+10FFFF
338 */
339 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
340 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
341 st->_errno = errno = EILSEQ;
342 else {
343 st->ustate = U6;
344 st->keepc[1] = **inbuf;
345 }
346 break;
347 case U6:
348 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
349 {
350 st->ustate = U7;
351 st->keepc[2] = **inbuf;
352 }
353 else
354 st->_errno = errno = EILSEQ;
355 break;
356 case U7:
357 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
358 {
359 /* replace with double NON_ID_CHARs */
360 if ( *outbytesleft < 2 ) st->_errno = errno = E2BIG;
361 else
362 {
363 **outbuf = NON_ID_CHAR;
364 *(*outbuf+1) = NON_ID_CHAR;
365 (*outbytesleft) -= 2;
366
367 uconv_num++;
368
369 st->ustate = U0;
370 }
371 }
372 else
373 st->_errno = errno = EILSEQ;
374 break;
375 default: /* should never come here */
376 st->_errno = errno = EILSEQ;
377 st->ustate = U0; /* reset state */
378 break;
379 }
380
381 if (st->_errno) {
382 #ifdef DEBUG
383 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
384 st->_errno, st->ustate);
385 #endif
386 break;
387 }
388
389 (*inbuf)++;
390 (*inbytesleft)--;
391 }
392
393 if (*inbytesleft == 0 && st->ustate != U0)
394 errno = EINVAL;
395
396 if (*inbytesleft > 0 && *outbytesleft == 0)
397 errno = E2BIG;
398
399 if (errno) {
400 int num_reversed_bytes = 0;
401
402 switch (st->ustate)
403 {
404 case U1:
405 num_reversed_bytes = 1;
406 break;
407 case U2:
408 num_reversed_bytes = 1;
409 break;
410 case U3:
411 num_reversed_bytes = 2;
412 break;
413 case U4:
414 num_reversed_bytes = utf8_len - 1;
415 break;
416 case U5:
417 num_reversed_bytes = 1;
418 break;
419 case U6:
420 num_reversed_bytes = 2;
421 break;
422 case U7:
423 num_reversed_bytes = 3;
424 break;
425 }
426
427 /*
428 * if error, *inbuf points to the byte following the last byte
429 * successfully used in conversion.
430 */
431 *inbuf -= num_reversed_bytes;
432 *inbytesleft += num_reversed_bytes;
433 st->ustate = U0;
434
435 return ((size_t) -1);
436 }
437
438 return uconv_num;
439 }
440 #endif /* UCS_2LE || UCS_2BE || UCS_4LE || UCS_4BE */
441
442
443 /*
444 * Match GBK code by UTF8 code;
445 * Return: = 0 - match from Unicode to GBK found
446 * = 1 - match from Unicode to GBK NOT found
447 * = -1- illegal sequence
448 *
449 * Since binary search of the UTF8 to GBK table is necessary, might as well
450 * return index and GBK code matching to the unicode.
451 */
get_gbk_by_unicode(unsigned long unicode,int * unidx,unsigned long * gbkcode)452 int get_gbk_by_unicode(unsigned long unicode, int* unidx, unsigned long* gbkcode)
453 {
454 if ( unicode > UCS4_MAXVAL || ext_ucs4_lsw(unicode) > UCS4_PPRC_MAXVAL ) return -1;
455
456 *unidx = binsearch(unicode, unicode_gbk_tab, UNICODEMAX);
457 if ((*unidx) >= 0)
458 *gbkcode = unicode_gbk_tab[*unidx].value;
459 else
460 return(1); /* match from unicode to GBK not found */
461 #ifdef DEBUG
462 fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5=%x ", unicode, *unidx, *gbkcode);
463 #endif
464
465 return(0);
466 }
467
468
469 /*
470 * ISO/IEC 10646-2000 (Unicode) --> GBK2K
471 * Unicode --> UTF8 (FSS-UTF)
472 * (File System Safe Universal Character Set Transformation Format)
473 * Return: > 0 - converted with enough space in output buffer
474 * = 0 - no space in outbuf
475 */
unicode_to_gbk(int unidx,unsigned long gbkcode,char * buf,size_t buflen,int * uconv_num)476 int unicode_to_gbk(int unidx, unsigned long gbkcode, char* buf, size_t buflen, int *uconv_num)
477 {
478 unsigned long val; /* GBK value */
479 char c[GBK_LEN_MAX];
480 int i, length;
481
482 if (unidx < 0) { /* no match from Unicode to GBK */
483 c[0] = c[1] = NON_ID_CHAR;
484
485 *uconv_num = 1;
486
487 length = 2;
488 } else {
489 if ( ! IS_GBK4BYTES( gbkcode ) ) { /* character within two bytes area */
490 val = gbkcode & 0xffff;
491 c[0] = (char) ((val & 0xff00) >> 8);
492 c[1] = (char) (val & 0xff);
493 length = 2;
494 } else { /* character within four bytes area */
495 val = gbkcode & 0xffffffff;
496 c[0] = (char) ( val >> 24 );
497 c[1] = (char) ( val >> 16 );
498 c[2] = (char) ( val >> 8 );
499 c[3] = (char) val;
500 length = 4;
501 }
502 }
503
504 #ifdef DEBUG
505 fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
506 #endif
507
508 if (buflen < length) {
509 errno = E2BIG;
510 return(0);
511 }
512
513 for ( i = 0; i < length; ++i )
514 *buf++ = c[i];
515
516 return length;
517 }
518
519
520 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,table_t v[],int n)521 static int binsearch(unsigned long x, table_t v[], int n)
522 {
523 int low, high, mid;
524
525 low = 0;
526 high = n - 1;
527 while (low <= high) {
528 mid = (high - low) / 2 + low;
529 if (x < v[mid].key)
530 high = mid - 1;
531 else if (x > v[mid].key)
532 low = mid + 1;
533 else /* found match */
534 return mid;
535 }
536 return (-1); /* no match */
537 }
538
539 /*
540 vi:ts=8:ai:expandtab
541 */
542