1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright(c) 1995 Sun Microsystems, Inc.
23 * All rights reserved.
24 */
25
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <errno.h>
#ifdef DEBUG
#include <unistd.h>
#include <sys/fcntl.h>
#include <sys/stat.h>
#endif
#include <public_struc.h>
#include <gb2312_unicode.h>
#include <cns11643_unicode_CN.h>
37
#define	MSB	0x80	/* most significant bit */
#define	MBYTE	0x8e	/* lead byte of a 4-byte (multi-byte) character */
#define	PMASK	0xa0	/* base value added to the plane number */
#define	ONEBYTE	0xff	/* mask for the right-most byte */
#define	MSB_OFF	0x7f	/* mask that clears the MSB */

#define	SI	0x0f	/* shift in */
#define	SO	0x0e	/* shift out */
#define	ESC	0x1b	/* escape */
#define	SS2	0x4e	/* byte that follows ESC in the SS2 sequence */
#define	SS3	0x4f	/* byte that follows ESC in the SS3 sequence */

/* Three UTF-8 bytes emitted for a character that cannot be identified. */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD
52
/* Conversion state carried between calls for one conversion descriptor. */
typedef struct _icv_state {
	char	_buf[10];	/* escape-sequence bytes held back so far */
	size_t	_bufcont;	/* number of bytes currently held in _buf */
	char	_keepc[4];	/* maximum # byte of CNS11643 code */
	short	_gstate;	/* state machine id */
	short	_istate;	/* state for shift in/out */
	int	_plane;		/* plane number for Chinese character */
	int	_last_plane;	/* last character's plane # */
	int	_errno;		/* internal errno */
} _iconv_st;
63
/* Identifiers for the states of the escape-sequence state machine. */
enum _GSTATE {
	G0, G1, G2, G3, G4,
	G5, G6, G7, G8, G9,
	G10, G11, G12, G13, G14,
	G15, G16, G17, G18, G19,
	G20, G21, G22, G23, G24,
	G25, G26, G27, G28, G29
};
67
/* Shift state: IN after a shift-in, OUT after a shift-out. */
enum _ISTATE {
	IN,
	OUT
};
69
70
71 int iso_to_utf8(_iconv_st * st, char* buf, size_t buflen);
72 int binsearch(unsigned long x, table_t v[], int n);
73 int flush_buf(_iconv_st * st, char ** outbuf, size_t * outbytesleft);
74
/*
 * Copy the bytes held back in st->_buf to the output buffer and mark the
 * holding buffer empty.
 *
 * st           - conversion state owning the held-back bytes
 * outbuf       - in/out: output cursor, advanced past the copied bytes
 * outbytesleft - in/out: room left at *outbuf, reduced by the bytes copied
 *
 * Returns 0 on success, including when nothing is pending.  Returns -1 and
 * sets st->_errno to E2BIG when the pending bytes do not fit; in that case
 * nothing is written and the pending bytes are kept.
 */
int
flush_buf(_iconv_st *st, char **outbuf, size_t *outbytesleft)
{
	size_t pending = st->_bufcont;

	if (pending == 0)
		return (0);

	if (pending > *outbytesleft) {
		st->_errno = E2BIG;
		return (-1);
	}

	/*
	 * The held-back bytes are the source and the output buffer is the
	 * destination.  memcpy is used because the data is a counted byte
	 * run, not a NUL-terminated string.
	 */
	(void) memcpy(*outbuf, st->_buf, pending);
	*outbuf += pending;
	*outbytesleft -= pending;
	st->_bufcont = 0;

	return (0);
}
89
90 /*
91 * Open; called from iconv_open()
92 */
93 void *
_icv_open()94 _icv_open()
95 {
96 _iconv_st *st;
97
98 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
99 errno = ENOMEM;
100 return ((void *) -1);
101 }
102
103 st->_gstate = G0;
104 st->_istate = IN;
105 st->_last_plane = st->_plane = -1;
106 st->_errno = 0;
107 st->_bufcont = 0;
108
109 return ((void *) st);
110 }
111
112 /*
113 * Close; called from iconv_close()
114 */
115 void
_icv_close(_iconv_st * st)116 _icv_close(_iconv_st *st)
117 {
118 if (st == NULL)
119 errno = EBADF;
120 else
121 free(st);
122 }
123
124 /*
125 * Actual conversion; called from iconv()
126 */
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)127 size_t _icv_iconv(_iconv_st *st, \
128 char **inbuf, size_t *inbytesleft, \
129 char **outbuf, size_t *outbytesleft) {
130 int n;
131 char c;
132
133 if (st == NULL) {
134 errno = EBADF;
135 return ((size_t) -1);
136 }
137
138 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
139 st->_errno = 0;
140 return ((size_t) 0);
141 }
142
143 errno = st->_errno = 0; /* reset internal and external errno */
144
145 /* a state machine for interpreting ISO 2022-7 code */
146 while (*inbytesleft > 0 && *outbytesleft > 0) {
147 switch (st->_gstate) {
148 case G0: /* assuming ASCII in the beginning */
149 if (**inbuf == ESC) {
150 st->_gstate = G1;
151 st->_buf[st->_bufcont++] = ESC;
152 } else { /* real ASCII */
153 **outbuf = **inbuf;
154 (*outbuf)++;
155 (*outbytesleft)--;
156 }
157 break;
158 case G1: /* got ESC, expecting $ */
159 if (**inbuf == '$') {
160 st->_gstate = G2;
161 st->_buf[st->_bufcont++] = '$';
162 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
163 errno = st->_errno;
164 return (size_t)-1;
165 } else {
166 st->_gstate = G0;
167 st->_errno = 0;
168 st->_istate = IN;
169 continue; /* don't advance inbuf */
170 }
171 break;
172 case G2: /* got $, expecting ) * or + */
173 if (**inbuf == ')') {
174 st->_gstate = G3;
175 } else if (**inbuf == '*') {
176 st->_gstate = G12;
177 st->_plane = 2;
178 } else if (**inbuf == '+') {
179 st->_gstate = G19;
180 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
181 errno = st->_errno;
182 return (size_t)-1;
183 } else {
184 st->_gstate = G0;
185 st->_errno = 0;
186 st->_istate = IN;
187 continue; /* don't advance inbuf */
188 }
189 st->_buf[st->_bufcont++] = **inbuf;
190 break;
191 case G3: /* got ) expecting A,G,H */
192 /* H is for the bug of and zh_TW.BIG5 */
193 if (**inbuf == 'A') {
194 st->_plane = 0;
195 st->_gstate = G4;
196 } else if (**inbuf == 'G') {
197 st->_plane = 1;
198 st->_gstate = G8;
199 } else if (**inbuf == 'H') {
200 st->_plane = 2;
201 st->_gstate = G8;
202 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
203 errno = st->_errno;
204 return (size_t)-1;
205 } else {
206 st->_gstate = G0;
207 st->_errno = 0;
208 st->_istate = IN;
209 continue;
210 }
211 st->_buf[st->_bufcont++] = **inbuf;
212 break;
213 case G4: /* ESC $ ) A got, and SO is expected */
214 if (**inbuf == SO) {
215 st->_gstate = G5;
216 st->_istate = OUT;
217 st->_bufcont = 0;
218 st->_last_plane = st->_plane;
219 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
220 errno = st->_errno;
221 return (size_t)-1;
222 } else {
223 st->_gstate = G0;
224 st->_errno = 0;
225 st->_istate = IN;
226 st->_plane = st->_last_plane;
227 continue;
228 }
229 break;
230 case G5: /* SO (Shift Out) */
231 if (**inbuf == SI) {
232 st->_istate = IN;
233 st->_gstate = G7;
234 st->_last_plane = st->_plane;
235 } else if (**inbuf == ESC) {
236 st->_bufcont = 0;
237 st->_gstate = G0;
238 continue;
239 } else { /* Chinese Charactors */
240 st->_keepc[0] = **inbuf;
241 st->_gstate = G6;
242 }
243 break;
244 case G6: /* GB2312: 2nd Chinese character */
245 st->_keepc[1] = **inbuf;
246 n = iso_to_utf8(st, *outbuf, *outbytesleft);
247 if (n > 0) {
248 (*outbuf) += n;
249 (*outbytesleft) -= n;
250 } else {
251 errno = st->_errno;
252 return (size_t)-1;
253 }
254 st->_gstate = G5;
255 break;
256 case G7: /* Shift in */
257 if (**inbuf == SO) {
258 st->_gstate = G5;
259 st->_istate = OUT;
260 st->_last_plane = st->_plane;
261 st->_bufcont = 0;
262 } else if (**inbuf == ESC) {
263 st->_gstate = G0;
264 continue;
265 } else {
266 **outbuf = **inbuf;
267 (*outbuf)++;
268 (*outbytesleft) --;
269 }
270 break;
271 case G8: /* BIG5: Chinese character */
272 if (**inbuf == SO) {
273 st->_istate = OUT;
274 st->_gstate = G9;
275 st->_bufcont = 0;
276 st->_last_plane = st->_plane;
277 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
278 errno = st->_errno;
279 return (size_t)-1;
280 } else {
281 st->_gstate = G0;
282 st->_errno = 0;
283 st->_plane = st->_last_plane;
284 st->_istate = IN;
285 continue;
286 }
287 break;
288 case G9:
289 if (**inbuf == SI) {
290 st->_istate = IN;
291 st->_gstate = G11;
292 st->_last_plane = st->_plane;
293 } else if (**inbuf == ESC) {
294 if (flush_buf(st, outbuf, outbytesleft) == -1) {
295 errno = st->_errno;
296 return (size_t)-1;
297 }
298 st->_gstate = G0;
299 continue;
300 } else { /* Chinese Charactor */
301 st->_keepc[0] = **inbuf;
302 st->_gstate = G10;
303 }
304 break;
305 case G10:
306 st->_keepc[1] = **inbuf;
307 n = iso_to_utf8(st, *outbuf, *outbytesleft);
308 if (n > 0) {
309 (*outbuf) += n;
310 (*outbytesleft) -= n;
311 } else {
312 errno = st->_errno;
313 return (size_t)-1;
314 }
315 st->_gstate = G9;
316 break;
317 case G11:
318 st->_bufcont = 0;
319 if (**inbuf == SO) {
320 st->_istate = OUT;
321 st->_gstate = G9;
322 } else if (**inbuf == ESC) {
323 st->_gstate = G0;
324 continue;
325 } else {
326 **outbuf = **inbuf;
327 (*outbuf)++;
328 (*outbytesleft)--;
329 }
330 break;
331 case G12:
332 if (**inbuf == 'H') {
333 st->_buf[st->_bufcont++] = 'H';
334 st->_gstate = G13;
335 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
336 errno = st->_errno;
337 return (size_t)-1;
338 } else {
339 st->_istate = IN;
340 st->_plane = st->_last_plane;
341 st->_gstate = G0;
342 continue;
343 }
344 break;
345 case G13:
346 if (**inbuf == ESC) {
347 st->_buf[st->_bufcont++] = **inbuf;
348 st->_gstate = G14;
349 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
350 errno = st->_errno;
351 return (size_t)-1;
352 } else {
353 st->_gstate = G0;
354 st->_istate = IN;
355 st->_plane = st->_last_plane;
356 continue;
357 }
358 break;
359 case G14:
360 if (**inbuf == SS2) {
361 st->_istate = OUT;
362 st->_gstate = G15;
363 st->_bufcont = 0;
364 st->_last_plane = st->_plane = 2;
365 } else if (**inbuf == '$') {
366 st->_bufcont --;
367 if (flush_buf(st, outbuf, outbytesleft) == -1) {
368 errno = st->_errno;
369 return (size_t)-1;
370 } else {
371 st->_gstate = G1;
372 st->_plane = st->_last_plane;
373 st->_istate = IN;
374 continue;
375 }
376 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
377 errno = st->_errno;
378 return (size_t)-1;
379 } else {
380 st->_gstate = G0;
381 st->_istate = IN;
382 st->_plane = st->_last_plane;
383 continue;
384 }
385 break;
386 case G15:
387 if (**inbuf == SI) {
388 st->_gstate = G16;
389 st->_istate = IN;
390 st->_last_plane = st->_plane;
391 } else if (**inbuf == ESC) {
392 st->_bufcont = 0;
393 st->_gstate = G0;
394 continue;
395 } else {
396 st->_keepc[0] = **inbuf;
397 st->_gstate = G18;
398 }
399 break;
400 case G16:
401 if (**inbuf == ESC) {
402 st->_gstate = G17;
403 st->_buf[st->_bufcont++] = ESC;
404 } else {
405 **outbuf = **inbuf;
406 (*outbuf) ++;
407 (*outbytesleft) --;
408 st->_bufcont = 0;
409 }
410 break;
411 case G17:
412 if (**inbuf == '$') {
413 st->_gstate = G1;
414 st->_buf[st->_bufcont++] = '$';
415 continue;
416 } else if (**inbuf == SS2) {
417 st->_bufcont = 0;
418 st->_gstate = G15;
419 st->_istate = OUT;
420 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
421 errno = st->_errno;
422 return (size_t)-1;
423 } else {
424 st->_gstate = G16;
425 st->_istate = IN;
426 }
427 break;
428 case G18:
429 st->_keepc[1] = **inbuf;
430 st->_gstate = G15;
431 if ((n = iso_to_utf8(st, \
432 *outbuf, \
433 *outbytesleft)) > 0) {
434 (*outbuf)+=n;
435 (*outbytesleft)-=n;
436 } else {
437 errno = st->_errno;
438 return (size_t)-1;
439 }
440 break;
441 case G19: /* Plane #: 3 - 16 */
442 c = **inbuf;
443 if (c == 'I' || \
444 c == 'J' || \
445 c == 'K' || \
446 c == 'L' || \
447 c == 'M' || \
448 c == 'N' || \
449 c == 'O' || \
450 c == 'P' || \
451 c == 'Q' || \
452 c == 'R' || \
453 c == 'S' || \
454 c == 'T' || \
455 c == 'U' || \
456 c == 'V') {
457 st->_plane = c - 'I' + 3;
458 st->_gstate = G20;
459 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
460 errno = st->_errno;
461 return (size_t)-1;
462 } else {
463 st->_gstate = G0;
464 st->_errno = 0;
465 st->_istate = IN;
466 st->_plane = st->_last_plane;
467 continue;
468 }
469 st->_buf[st->_bufcont++] = c;
470 break;
471 case G20:
472 if (**inbuf == ESC) {
473 st->_buf[st->_bufcont++] = **inbuf;
474 st->_gstate = G21;
475 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
476 errno = st->_errno;
477 return (size_t)-1;
478 } else {
479 st->_gstate = G0;
480 st->_istate = IN;
481 st->_last_plane = st->_plane;
482 continue;
483 }
484 break;
485 case G21:
486 if (**inbuf == SS3) {
487 st->_istate = OUT;
488 st->_gstate = G22;
489 st->_bufcont = 0;
490 } else if (**inbuf == '$') {
491 st->_bufcont --;
492 if (flush_buf(st, outbuf, outbytesleft) == -1) {
493 errno = st->_errno;
494 return (size_t)-1;
495 } else {
496 st->_istate = IN;
497 st->_last_plane = st->_plane;
498 st->_gstate = G1;
499 continue;
500 }
501 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
502 errno = st->_errno;
503 return (size_t)-1;
504 } else {
505 st->_gstate = G0;
506 st->_istate = IN;
507 st->_last_plane = st->_plane;
508 continue;
509 }
510 break;
511 case G22:
512 if (**inbuf == SI) {
513 st->_istate = IN;
514 st->_gstate = G24;
515 st->_last_plane = st->_plane;
516 } else {
517 st->_keepc[0] = (char)MBYTE;
518 st->_keepc[1] = (char)(PMASK + st->_plane);
519 st->_keepc[2] = **inbuf;
520 st->_gstate = G23;
521 }
522 break;
523 case G23:
524 st->_keepc[3] = **inbuf;
525 if ((n = iso_to_utf8(st, \
526 *outbuf, \
527 *outbytesleft)) > 0) {
528 (*outbuf)+=n;
529 (*outbytesleft-=n);
530 } else {
531 errno = st->_errno;
532 return (size_t)-1;
533 }
534 st->_gstate = G22;
535 break;
536 case G24:
537 if (**inbuf == ESC) {
538 st->_gstate = G25;
539 st->_buf[st->_bufcont++] = ESC;
540 } else {
541 **outbuf = **inbuf;
542 (*outbuf)++;
543 (*outbytesleft)--;
544 st->_bufcont = 0;
545 }
546 break;
547 case G25:
548 if (**inbuf == '$') {
549 st->_gstate = G1;
550 continue;
551 } else if (**inbuf == SS3) {
552 st->_gstate = G22;
553 st->_bufcont = 0;
554 st->_istate = OUT;
555 } else if (flush_buf(st, outbuf, outbytesleft) == -1) {
556 errno = st->_errno;
557 return (size_t)-1;
558 } else {
559 st->_gstate = G24;
560 st->_istate = IN;
561 }
562 break;
563 default: /* should never come here */
564 st->_errno = errno = EILSEQ;
565 st->_gstate = G0; /* reset state */
566 break;
567 } /* end of switch */
568
569 (*inbuf)++;
570 (*inbytesleft)--;
571
572 if (st->_errno) {
573 break;
574 }
575 if (errno)
576 return((size_t)(-1));
577 }
578
579 if (*inbytesleft > 0 && *outbytesleft == 0) {
580 errno = E2BIG;
581 return((size_t)(-1));
582 }
583 return (size_t)((*inbytesleft));
584 }
585
586 /*
587 * ISO 2022-7 code --> UTF-8 code
588 * Return: > 0 - converted with enough space in output buffer
589 * = 0 - no space in outbuf
590 */
iso_to_utf8(_iconv_st * st,char * buf,size_t buflen)591 int iso_to_utf8(_iconv_st * st, char* buf, size_t buflen) {
592 unsigned long iso_val;
593 int unidx;
594 unsigned long unicode;
595 switch (st->_plane) {
596 case 0:
597 unidx = (((st->_keepc[0] & 0xff) - 0x21) * 94) + \
598 (st->_keepc[1] & 0xff) - 0x21;
599 if (unidx < 0 || unidx >= GBMAX) {
600 st->_errno = EILSEQ;
601 return (0);
602 }
603 unicode = Unicode[unidx];
604 break;
605
606 case 1:
607 iso_val = ((st->_keepc[0] & MSB_OFF) << 8) + \
608 (st->_keepc[1] & MSB_OFF);
609 unidx = binsearch(iso_val, cns1_utf_tab, MAX_CNS1_NUM);
610 if (unidx >= 0) {
611 unicode = cns1_utf_tab[unidx].value;
612 break;
613 }
614 st->_errno = EILSEQ;
615 return (0);
616 case 2:
617 iso_val = ((st->_keepc[0] & MSB_OFF) << 8) + \
618 (st->_keepc[1] & MSB_OFF);
619 unidx = binsearch(iso_val, cns2_utf_tab, MAX_CNS2_NUM);
620 if (unidx >= 0) {
621 unicode = cns2_utf_tab[unidx].value;
622 break;
623 }
624 st->_errno = EILSEQ;
625 return (0);
626 default:
627 st->_errno = EILSEQ;
628 return (0);
629 }
630 if (unidx >=0) {
631 if (unicode > 0x0080 && unicode <= 0x07ff) {
632 if (buflen < 2) {
633 st->_errno = E2BIG;
634 return (0);
635 }
636 *buf = (char)((unicode >> 6) & 0x1f) | 0xc0;
637 *(buf+1) = (char)(unicode & 0x3f) | 0x80;
638 return 2;
639 }
640 if (unicode > 0x0800 && unicode <= 0xffff) {
641 if (buflen < 3) {
642 st->_errno = E2BIG;
643 return (0);
644 }
645 *buf = (char)((unicode >> 12) & 0xf) | 0xe0;
646 *(buf+1) = (char)((unicode >>6) & 0x3f) | 0x80;
647 *(buf+2) = (char)(unicode & 0x3f) | 0x80;
648 return 3;
649 }
650 }
651 if (buflen < 3) {
652 st->_errno = E2BIG;
653 return (0);
654 }
655
656 *buf = (char)UTF8_NON_ID_CHAR1;
657 *(buf + 2) = (char)UTF8_NON_ID_CHAR2;
658 *(buf + 2) = (char)UTF8_NON_ID_CHAR3;
659 return (3);
660 }
661
662 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,table_t v[],int n)663 int binsearch(unsigned long x, table_t v[], int n)
664 {
665 int low, high, mid;
666
667 low = 0;
668 high = n - 1;
669 while (low <= high) {
670 mid = (low + high) / 2;
671 if (x < v[mid].key)
672 high = mid - 1;
673 else if (x > v[mid].key)
674 low = mid + 1;
675 else /* found match */
676 return mid;
677 }
678 return (-1); /* no match */
679 }
680
681 #ifdef DEBUG
main(int argc,char ** argv)682 main(int argc, char ** argv) {
683 char *inbuf, *outbuf, *in_tmp, *out_tmp;
684 size_t inbytesleft, outbytesleft;
685 int fd;
686 int i;
687 struct stat s;
688 _iconv_st * st;
689 if (argc < 2) {
690 fprintf(stderr, "Usage: %s input\n", argv[0]);
691 exit(-1);
692 }
693 if ((fd = open(argv[1], O_RDONLY)) == -1) {
694 perror("open");
695 exit(-2);
696 }
697 if (fstat(fd, &s) == -1) {
698 perror("stat");
699 exit(-3);
700 }
701 inbytesleft = outbytesleft = s.st_size;
702 in_tmp = inbuf = (char *)malloc(inbytesleft);
703 out_tmp = outbuf = (char *)malloc(outbytesleft);
704 if (!inbuf || !outbuf) {
705 perror("malloc");
706 exit(-1);
707 }
708 if (read(fd, inbuf, inbytesleft) != inbytesleft) {
709 perror("read");
710 exit(-4);
711 }
712 for (i = 0; i < inbytesleft; i++)
713 fprintf(stderr, "%x\t", *(inbuf+i));
714 fprintf(stderr, "\n");
715 st = (_iconv_st *)_icv_open();
716 if (st == (_iconv_st *) -1) {
717 perror("_icv_open");
718 exit(-1);
719 }
720 if (_icv_iconv(st, \
721 &inbuf, &inbytesleft, \
722 &outbuf, &outbytesleft) == -1) {
723 perror("icv_iconv");
724 fprintf(stderr, "\ninbytesleft = %d\n", inbytesleft);
725 exit(-2);
726 }
727 if (write(1, out_tmp, s.st_size - outbytesleft) == -1) {
728 perror("write");
729 exit(-1);
730 }
731 free(in_tmp);
732 free(out_tmp);
733 close(fd);
734 _icv_close(st);
735 }
736 #endif
737