xref: /illumos-gate/usr/src/lib/iconv_modules/zh/common/zh_CN.iso2022-CN%UTF-8.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1995, by Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <errno.h>
#ifdef DEBUG
#include <sys/fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <gb2312_unicode.h>
#include <cns11643_unicode_CN.h>	/* CNS 11643 to Unicode mapping table */
36 
/*
 * UTF-8 byte sequence EF BF BD (U+FFFD), written to the output in place of
 * a character that has no Unicode mapping.
 */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD

/* Bit masks and marker bytes used while assembling CNS 11643 codes. */
#define	MSB		0x80	/* most significant bit */
#define	MBYTE		0x8e	/* lead byte marking a 4-byte character */
#define	PMASK		0xa0	/* base value added to the plane number */
#define	ONEBYTE		0xff	/* right-most byte */
#define	MSB_OFF		0x7f	/* mask that clears the MSB */

/* ISO 2022 control and shift bytes. */
#define	SI		0x0f	/* shift in */
#define	SO		0x0e	/* shift out */
#define	ESC		0x1b	/* escape */
#define	SS2		0x4e	/* final byte of the SS2 single shift (ESC N) */
#define	SS3		0x4f	/* final byte of the SS3 single shift (ESC O) */

/* Two-byte code for a non-identified character. */
#define	NON_ID_CHAR_BYTE1	0xA1
#define	NON_ID_CHAR_BYTE2	0xF5
54 
/*
 * Conversion state kept for one iconv descriptor.  It is allocated by
 * _icv_open(), updated by every _icv_iconv() call and released by
 * _icv_close().
 */
typedef struct _icv_state {
	/* escape-sequence bytes held back until they are accepted or flushed */
	char	_buf[10];
	/* number of bytes currently held in _buf */
	size_t	_bufcont;
	/* bytes of the character being assembled (max size of a CNS11643 code) */
	char	_keepc[4];
	/* state machine id (an enum _GSTATE value) */
	short	_gstate;
	/* shift in/out state (an enum _ISTATE value) */
	short	_istate;
	/* plane number for the Chinese character being read */
	int	_plane;
	/* plane number of the last character */
	int	_last_plane;
	/* internal errno */
	int	_errno;
} _iconv_st;
65 
/* State identifiers of the escape-sequence state machine (stored in _gstate). */
enum _GSTATE {
	G0, G1, G2, G3, G4, G5, G6, G7, G8, G9,
	G10, G11, G12, G13, G14, G15, G16, G17, G18, G19,
	G20, G21, G22, G23, G24, G25, G26, G27, G28, G29
};
69 
/* Shift state (stored in _istate): IN after shift in, OUT after shift out. */
enum _ISTATE {
	IN,
	OUT
};
71 
72 
73 int iso_gb_to_utf(_iconv_st * st, char* buf, size_t buflen);
74 int iso_cns_to_utf(_iconv_st * st, char* buf, size_t buflen);
75 int iso_cns_to_utf(_iconv_st * st, char* buf, size_t buflen);
76 int binsearch(unsigned long x, table_t v[], int n);
77 int flush_buf(_iconv_st * st, char ** outbuf, size_t * outbytesleft);
78 
/*
 * Emit the escape-sequence bytes held in st->_buf to the output buffer.
 *
 * The held bytes are written unchanged at *outbuf; *outbuf and
 * *outbytesleft are advanced past them and the hold buffer is emptied.
 *
 * Returns 0 on success (including when nothing is held).  Returns -1 and
 * sets st->_errno to E2BIG when the output buffer is too small, or to
 * EILSEQ when the converter is not in the shifted-in state.  Nothing is
 * written and the held bytes are kept on failure.
 */
int
flush_buf(_iconv_st *st, char **outbuf, size_t *outbytesleft)
{
	if (st->_bufcont == 0)
		return (0);

	if (st->_bufcont > *outbytesleft) {
		st->_errno = E2BIG;
		return (-1);
	}
	if (st->_istate != IN) {
		st->_errno = EILSEQ;
		return (-1);
	}

	/*
	 * Copy from the hold buffer to the output.  The original call had
	 * the arguments the other way round, overwriting the held bytes with
	 * whatever was in the output buffer.  _buf holds raw bytes rather
	 * than a C string, hence memcpy().
	 */
	(void) memcpy(*outbuf, st->_buf, st->_bufcont);
	*outbuf += st->_bufcont;
	*outbytesleft -= st->_bufcont;
	st->_bufcont = 0;

	return (0);
}
96 
97 /*
98  * Open; called from iconv_open()
99  */
100 void *
_icv_open()101 _icv_open()
102 {
103 	_iconv_st *st;
104 
105 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
106 		errno = ENOMEM;
107 		return ((void *) -1);
108 	}
109 
110 	st->_gstate = G0;
111 	st->_istate = IN;
112 	st->_last_plane = st->_plane = -1;
113 	st->_errno = 0;
114 	st->_bufcont = 0;
115 
116 	return ((void *) st);
117 }
118 
119 /*
120  * Close; called from iconv_close()
121  */
122 void
_icv_close(_iconv_st * st)123 _icv_close(_iconv_st *st)
124 {
125 	if (st == NULL)
126 		errno = EBADF;
127 	else
128 		free(st);
129 }
130 
131 /*
132  * Actual conversion; called from iconv()
133  */
134 /*=========================================================================
135  *
136  *             State Machine for interpreting ISO 2022-7 code
137  *
138  *=========================================================================
139  *
140  *                                                        plane 2 - 16
141  *                                                    +---------->-------+
142  *                                    plane           ^                  |
143  *            ESC      $       )      number     SO   | plane 1          v
144  *    +-> G0 ----> G1 ---> G2 ---> G3 ------> G4 --> G5 -------> G6     G7
145  *    |   | ascii  | ascii | ascii |    ascii |   SI | |          |      |
146  *    +----------------------------+    <-----+------+ +------<---+------+
147  *    ^                                 |
148  *    |              ascii              v
149  *    +---------<-------------<---------+
150  *
151  *=========================================================================*/
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)152 size_t _icv_iconv(_iconv_st *st, \
153 					char **inbuf, size_t *inbytesleft, \
154 					char **outbuf, size_t *outbytesleft) {
155 	int		n;
156 	char	c;
157 
158 	if (st == NULL) {
159 		errno = EBADF;
160 		return ((size_t) -1);
161 	}
162 
163 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
164 		st->_gstate = G0;
165 		st->_istate = IN;
166 		st->_errno = 0;
167 		st->_plane = st->_last_plane = -1;
168 		return ((size_t) 0);
169 	}
170 
171 	errno = st->_errno = 0;	/* reset internal and external errno */
172 
173 	/* a state machine for interpreting ISO 2022-7 code */
174 	while (*inbytesleft > 0 && *outbytesleft > 0) {
175 		switch (st->_gstate) {
176 			case G0:		/* assuming ASCII in the beginning */
177 				if (**inbuf == ESC) {
178 					st->_gstate = G1;
179 					st->_buf[st->_bufcont++] = ESC;
180 				} else {	/* real ASCII */
181 					**outbuf = **inbuf;
182 					(*outbuf)++;
183 					(*outbytesleft)--;
184 				}
185 				break;
186 			case G1:		/* got ESC, expecting $ */
187 				if (**inbuf == '$') {
188 					st->_gstate = G2;
189 					st->_buf[st->_bufcont++] = '$';
190 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
191 					errno = st->_errno;
192 					return (size_t)-1;
193 				} else {
194 					st->_gstate = G0;
195 					st->_errno = 0;
196 					st->_istate = IN;
197 					continue;	/* don't advance inbuf */
198 				}
199 				break;
200 			case G2:		/* got $, expecting ) * or + */
201 				if (**inbuf == ')') {
202 					st->_gstate = G3;
203 				} else if (**inbuf == '*') {
204 					st->_gstate = G12;
205 					st->_plane = 2;
206 				} else if (**inbuf == '+') {
207 					st->_gstate = G19;
208 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
209 					errno = st->_errno;
210 					return (size_t)-1;
211 				} else {
212 					st->_gstate = G0;
213 					st->_errno = 0;
214 					st->_istate = IN;
215 					continue;	/* don't advance inbuf */
216 				}
217 				st->_buf[st->_bufcont++] = **inbuf;
218 				break;
219 			case G3:	/* got ) expecting A,G,H */
220 						/* H is for the bug of and zh_TW.BIG5 */
221 				if (**inbuf == 'A') {
222 					st->_plane = 0;
223 					st->_gstate = G4;
224 				} else if (**inbuf == 'G') {
225 					st->_plane = 1;
226 					st->_gstate = G8;
227 				} else if (**inbuf == 'H') {
228 					st->_plane = 2;
229 					st->_gstate = G8;
230 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
231 					errno = st->_errno;
232 					return (size_t)-1;
233 				} else {
234 					st->_gstate = G0;
235 					st->_errno = 0;
236 					st->_istate = IN;
237 					continue;
238 				}
239 				st->_buf[st->_bufcont++] = **inbuf;
240 				break;
241 		case G4:	/* ESC $ ) A got, and SO is expected */
242 				if (**inbuf == SO) {
243 					st->_gstate = G5;
244 					st->_istate = OUT;
245 					st->_bufcont = 0;
246 					st->_last_plane = st->_plane;
247 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
248 					errno = st->_errno;
249 					return (size_t)-1;
250 				} else {
251 					st->_gstate = G0;
252 					st->_errno = 0;
253 					st->_istate = IN;
254 					st->_plane = st->_last_plane;
255 					continue;
256 				}
257 				break;
258 		case G5:	/* SO (Shift Out) */
259 				if (**inbuf == SI) {
260 					st->_istate = IN;
261 				st->_gstate = G7;
262 					st->_last_plane = st->_plane;
263 				} else if (**inbuf == ESC) {
264 /*
265 				&& *((*inbuf) + 1) == '$') {
266 					if (flush_buf(st, outbuf, outbytesleft) == -1) {
267 						errno = st->_errno;
268 						return (size_t)-1;
269 					}
270  */
271 					st->_bufcont = 0;
272 					st->_gstate = G0;
273 					continue;
274 				} else {	/* Chinese Charactors */
275 					st->_keepc[0] = **inbuf;
276 					st->_gstate = G6;
277 				}
278 				break;
279 		case G6:	/* GB2312: 2nd Chinese character */
280 				st->_keepc[1] = **inbuf;
281 				n = iso_gb_to_utf(st, *outbuf, *outbytesleft);
282 				if (n > 0) {
283 					(*outbuf) += n;
284 					(*outbytesleft) -= n;
285 				} else {
286 					errno = st->_errno;
287 					return (size_t)-1;
288 				}
289 				st->_gstate = G5;
290 				break;
291 			case G7:	/* Shift in */
292 				if (**inbuf == SO) {
293 					st->_gstate = G5;
294 					st->_istate = OUT;
295 					st->_last_plane = st->_plane;
296 					st->_bufcont = 0;
297 				} else if (**inbuf == ESC) {
298 				/*
299 				&& *((*inbuf) + 1) == '$') {
300 				 */
301 					st->_gstate = G0;
302 					continue;
303 				} else {
304 					**outbuf = **inbuf;
305 					(*outbuf)++;
306 					(*outbytesleft) --;
307 				}
308 				break;
309 		case G8:	/* CNS: Chinese character */
310 				if (**inbuf == SO) {
311 					st->_istate = OUT;
312 					st->_gstate = G9;
313 					st->_bufcont = 0;
314 					st->_last_plane = st->_plane;
315 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
316 					errno = st->_errno;
317 					return (size_t)-1;
318 				} else {
319 					st->_gstate = G0;
320 					st->_errno = 0;
321 					st->_plane = st->_last_plane;
322 					st->_istate = IN;
323 					continue;
324 				}
325 				break;
326 		case G9:
327 				if (**inbuf == SI) {
328 					st->_istate = IN;
329 					st->_gstate = G11;
330 					st->_last_plane = st->_plane;
331 				} else if (**inbuf == ESC) {
332 				/*
333 				&& *((*inbuf) + 1) == '$') {
334 				 */
335 					if (flush_buf(st, outbuf, outbytesleft) == -1) {
336 						errno = st->_errno;
337 						return (size_t)-1;
338 					}
339 					st->_gstate = G0;
340 					continue;
341 				} else {	/* Chinese Charactor */
342 					st->_keepc[0] = **inbuf;
343 					st->_gstate = G10;
344 				}
345 				break;
346 			case G10:
347 				st->_keepc[1] = **inbuf;
348 				n = iso_cns_to_utf(st, *outbuf, *outbytesleft);
349 				if (n > 0) {
350 					(*outbuf) += n;
351 					(*outbytesleft) -= n;
352 				} else {
353 					errno = st->_errno;
354 					return (size_t)-1;
355 				}
356 				st->_gstate = G9;
357 				break;
358 			case G11:
359 				st->_bufcont = 0;
360 				if (**inbuf == SO) {
361 					st->_istate = OUT;
362 					st->_gstate = G9;
363 				} else if (**inbuf == ESC) {
364 				/*
365 				&& *((*inbuf) + 1) == '$') {
366 				 */
367 					st->_gstate = G0;
368 					continue;
369 				} else {
370 					**outbuf = **inbuf;
371 					(*outbuf)++;
372 					(*outbytesleft)--;
373 				}
374 				break;
375 			case G12:
376 				if (**inbuf == 'H') {
377 					st->_buf[st->_bufcont++] = 'H';
378 					st->_gstate = G13;
379 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
380 					errno = st->_errno;
381 					return (size_t)-1;
382 				} else {
383 					st->_istate = IN;
384 					st->_plane = st->_last_plane;
385 					st->_gstate = G0;
386 					continue;
387 				}
388 				break;
389 			case G13:
390 				if (**inbuf == ESC) {
391 					st->_buf[st->_bufcont++] = **inbuf;
392 					st->_gstate = G14;
393 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
394 					errno = st->_errno;
395 					return (size_t)-1;
396 				} else {
397 					st->_gstate = G0;
398 					st->_istate = IN;
399 					st->_plane = st->_last_plane;
400 					continue;
401 				}
402 				break;
403 			case G14:
404 				if (**inbuf == SS2) {
405 					st->_istate = OUT;
406 					st->_gstate = G15;
407 					st->_bufcont = 0;
408 					st->_last_plane = st->_plane = 2;
409 				} else if (**inbuf == '$') {
410 					st->_bufcont --;
411 					if (flush_buf(st, outbuf, outbytesleft) == -1) {
412 						errno = st->_errno;
413 						return (size_t)-1;
414 					} else {
415 						st->_gstate = G1;
416 						st->_plane = st->_last_plane;
417 						st->_istate = IN;
418 						continue;
419 					}
420 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
421 					errno = st->_errno;
422 					return (size_t)-1;
423 				} else {
424 					st->_gstate = G0;
425 					st->_istate = IN;
426 					st->_plane = st->_last_plane;
427 					continue;
428 				}
429 				break;
430 			case G15:
431 				if (**inbuf == SI) {
432 					st->_gstate = G16;
433 					st->_istate = IN;
434 					st->_last_plane = st->_plane;
435 				} else if (**inbuf == ESC) {
436 				/*
437 				&& *((*inbuf) + 1) == '$') {
438 				 */
439 					st->_bufcont = 0;
440 					st->_gstate = G0;
441 					continue;
442 				} else {
443 					st->_keepc[0] = **inbuf;
444 					st->_gstate = G18;
445 				}
446 				break;
447 			case G16:
448 				if (**inbuf == ESC) {
449 					st->_gstate = G17;
450 					st->_buf[st->_bufcont++] = ESC;
451 				} else {
452 					**outbuf = **inbuf;
453 					(*outbuf) ++;
454 					(*outbytesleft) --;
455 					st->_bufcont = 0;
456 				}
457 				break;
458 			case G17:
459 				if (**inbuf == '$') {
460 					st->_gstate = G1;
461 					st->_buf[st->_bufcont++] = '$';
462 					continue;
463 				} else if (**inbuf == SS2) {
464 					st->_bufcont = 0;
465 					st->_gstate = G15;
466 					st->_istate = OUT;
467 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
468 					errno = st->_errno;
469 					return (size_t)-1;
470 				} else {
471 					st->_gstate = G16;
472 					st->_istate = IN;
473 				}
474 				break;
475 			case G18:
476 				st->_keepc[1] = **inbuf;
477 				st->_gstate = G15;
478 				if ((n = iso_cns_to_utf(st, \
479 											*outbuf, \
480 											*outbytesleft)) > 0) {
481 					(*outbuf)+=n;
482 					(*outbytesleft)-=n;
483 				} else {
484 					errno = st->_errno;
485 					return (size_t)-1;
486 				}
487 				break;
488 			case G19:	/* Plane #: 3 - 16 */
489 				c = **inbuf;
490 				if				(c == 'I' || \
491 								c == 'J' || \
492 								c == 'K' || \
493 								c == 'L' || \
494 								c == 'M' || \
495 								c == 'N' || \
496 								c == 'O' || \
497 								c == 'P' || \
498 								c == 'Q' || \
499 								c == 'R' || \
500 								c == 'S' || \
501 								c == 'T' || \
502 								c == 'U' || \
503 								c == 'V') {
504 					st->_plane = c - 'I' + 3;
505 					st->_gstate = G20;
506 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
507 					errno = st->_errno;
508 					return (size_t)-1;
509 				} else {
510 					st->_gstate = G0;
511 					st->_errno = 0;
512 					st->_istate = IN;
513 					st->_plane = st->_last_plane;
514 					continue;
515 				}
516 				st->_buf[st->_bufcont++] = c;
517 				break;
518 			case G20:
519 				if (**inbuf == ESC) {
520 					st->_buf[st->_bufcont++] = **inbuf;
521 					st->_gstate = G21;
522 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
523 					errno = st->_errno;
524 					return (size_t)-1;
525 				} else {
526 					st->_gstate = G0;
527 					st->_istate = IN;
528 					st->_last_plane = st->_plane;
529 					continue;
530 				}
531 				break;
532 			case G21:
533 				if (**inbuf == SS3) {
534 					st->_istate = OUT;
535 					st->_gstate = G22;
536 					st->_bufcont = 0;
537 				} else if (**inbuf == '$') {
538 					st->_bufcont --;
539 					if (flush_buf(st, outbuf, outbytesleft) == -1) {
540 						errno = st->_errno;
541 						return (size_t)-1;
542 					} else {
543 						st->_istate = IN;
544 						st->_last_plane = st->_plane;
545 						st->_gstate = G1;
546 						continue;
547 					}
548 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
549 					errno = st->_errno;
550 					return (size_t)-1;
551 				} else {
552 					st->_gstate = G0;
553 					st->_istate = IN;
554 					st->_last_plane = st->_plane;
555 					continue;
556 				}
557 				break;
558 			case G22:
559 				if (**inbuf == SI) {
560 					st->_istate = IN;
561 					st->_gstate = G24;
562 					st->_last_plane = st->_plane;
563 				} else {
564 					st->_keepc[0] = (char)MBYTE;
565 					st->_keepc[1] = (char)(PMASK + st->_plane);
566 					st->_keepc[2] = **inbuf;
567 					st->_gstate = G23;
568 				}
569 				break;
570 			case G23:
571 				st->_keepc[3] = **inbuf;
572 				if ((n = iso_cns_to_utf(st, \
573 											*outbuf, \
574 											*outbytesleft)) > 0) {
575 					(*outbuf)+=n;
576 					(*outbytesleft-=n);
577 				} else {
578 					st->_errno = errno;
579 					return (size_t)-1;
580 				}
581 				st->_gstate = G22;
582 				break;
583 			case G24:
584 				if (**inbuf == ESC) {
585 					st->_gstate = G25;
586 					st->_buf[st->_bufcont++] = ESC;
587 				} else {
588 					**outbuf = **inbuf;
589 					(*outbuf)++;
590 					(*outbytesleft)--;
591 					st->_bufcont = 0;
592 				}
593 				break;
594 			case G25:
595 				if (**inbuf == '$') {
596 					st->_gstate = G1;
597 					continue;
598 				} else if (**inbuf == SS3) {
599 					st->_gstate = G22;
600 					st->_bufcont = 0;
601 					st->_istate = OUT;
602 				} else if (flush_buf(st, outbuf, outbytesleft) == -1) {
603 					errno = st->_errno;
604 					return (size_t)-1;
605 				} else {
606 					st->_gstate = G24;
607 					st->_istate = IN;
608 				}
609 				break;
610 			default:			/* should never come here */
611 				st->_errno = errno = EILSEQ;
612 				st->_gstate = G0;	/* reset state */
613 				break;
614 		}	/* end of switch */
615 
616 		(*inbuf)++;
617 		(*inbytesleft)--;
618 
619 		if (st->_errno) {
620 			break;
621 		}
622 		if (errno)
623 {
624 			return((size_t)(-1));
625 }
626 	}
627 
628 	if (*inbytesleft > 0 && *outbytesleft == 0) {
629 		errno = E2BIG;
630 		return((size_t)(-1));
631 	}
632 	return (size_t)(*inbytesleft);
633 }
634 
/*
 * Convert the two-byte GB2312 code held in st->_keepc[0..1] to UTF-8.
 *
 * The UTF-8 bytes are stored at buf, which has room for buflen bytes.
 *
 * Return: > 0 - number of bytes stored in buf
 *         = 0 - the character needs three bytes but buflen is smaller
 *         < 0 - buflen is below two bytes, or the code is outside the
 *               GB2312 table
 *
 * On every failure the reason (E2BIG or EILSEQ) is stored in both errno
 * and st->_errno, so the caller gets it whichever of the two it reads.
 * A code whose table entry is outside 0x0080 - 0xffff is converted to the
 * UTF-8 replacement sequence.
 */
int
iso_gb_to_utf(_iconv_st *st, char *buf, size_t buflen)
{
	int	idx;
	int	unicode;

	if (buflen < 2) {
		errno = st->_errno = E2BIG;
		return (-1);
	}

	/* 94 columns per row; rows and columns both start at 0x21 */
	idx = (((st->_keepc[0] & 0xff) - 0x21) * 94) +
	    (st->_keepc[1] & 0xff) - 0x21;
	if (idx < 0 || idx > GBMAX - 1) {
		errno = st->_errno = EILSEQ;
		return (-1);
	}

	unicode = Unicode[idx];
	if (unicode >= 0x0080 && unicode <= 0x07ff) {
		/* two-byte UTF-8; buflen >= 2 was checked above */
		*buf = ((unicode >> 6) & 0x1f) | 0xc0;
		*(buf + 1) = (unicode & 0x3f) | MSB;
		return (2);
	}

	/* everything below stores three bytes */
	if (buflen < 3) {
		errno = st->_errno = E2BIG;
		return (0);
	}

	if (unicode >= 0x0800 && unicode <= 0xffff) {
		*buf = ((unicode >> 12) & 0x0f) | 0xe0;
		*(buf + 1) = ((unicode >> 6) & 0x3f) | MSB;
		*(buf + 2) = (unicode & 0x3f) | MSB;
		return (3);
	}

	*buf = UTF8_NON_ID_CHAR1;
	*(buf + 1) = UTF8_NON_ID_CHAR2;
	*(buf + 2) = UTF8_NON_ID_CHAR3;
	return (3);
}
684 
685 /*
686  * Return: > 0 - converted with enough space in output buffer
687  *         = 0 - no space in outbuf
688  */
iso_cns_to_utf(_iconv_st * st,char * buf,size_t buflen)689 int iso_cns_to_utf(_iconv_st * st, char* buf, size_t buflen) {
690 	char		cns_str[3];
691 	unsigned long	cns_val;	/* MSB mask off CNS 11643 value */
692 	int		unidx;		/* binary search index */
693 	unsigned long	utf_val;	/* unicode code */
694 
695 	if (st->_plane == 1) {
696 		cns_str[0] = st->_keepc[0] & MSB_OFF;
697 		cns_str[1] = st->_keepc[1] & MSB_OFF;
698 	} else {
699 		cns_str[0] = st->_keepc[0] & MSB_OFF;
700 		cns_str[1] = st->_keepc[1] & MSB_OFF;
701 	}
702 	cns_val = (cns_str[0] << 8) + cns_str[1];
703 	if (buflen < 2) {
704 		errno = E2BIG;
705 		return(0);
706 	}
707 
708 	switch (st->_plane) {
709 		case 1:
710 			unidx = binsearch(cns_val, cns1_utf_tab, MAX_CNS1_NUM);
711 			if (unidx >= 0)
712 				utf_val = cns1_utf_tab[unidx].value;
713 			break;
714 		case 2:
715 			unidx = binsearch(cns_val, cns2_utf_tab, MAX_CNS2_NUM);
716 			if (unidx >= 0)
717 				utf_val = cns2_utf_tab[unidx].value;
718 			break;
719 		case 3:
720 			unidx = binsearch(cns_val, cns3_utf_tab, MAX_CNS3_NUM);
721 			if (unidx >= 0)
722 				utf_val = cns3_utf_tab[unidx].value;
723 			break;
724 		default:
725 			unidx = -1;	/* no mapping from CNS to Unicode out of plane 1,2&3 */
726 			break;
727 	}
728 
729 
730 	if (unidx < 0) {	/* no match from CNS to Unicode */
731 		*buf     = UTF8_NON_ID_CHAR1;
732 		*(buf+1) = UTF8_NON_ID_CHAR2;
733 		*(buf+2) = UTF8_NON_ID_CHAR3;
734 	return 3;
735 	} else {
736 	if (utf_val >= 0x0080 && utf_val <= 0x07ff) {
737 	    if ( buflen < 2 ) {
738 		errno = E2BIG;
739 		return 0;
740 	    }
741 	    *buf = ((utf_val >> 6) & 0x1f) | 0xc0;
742 	    *(buf+1) = (utf_val & 0x3f) | MSB;
743 	    return 2;
744 	}
745 	if (utf_val >= 0x0800 && utf_val <= 0xffff) {
746 	    if ( buflen < 3 ) {
747 		errno = E2BIG;
748 		return 0;
749 	    }
750 	    *buf = ((utf_val >> 12) & 0x0f) | 0xe0;
751 	    *(buf+1) = ((utf_val >> 6) & 0x3f) | MSB;
752 	    *(buf+2) = (utf_val & 0x3f) | MSB;
753 	    return 3;
754 	}
755 	if ( buflen < 3 ) {
756 	    errno = E2BIG;
757 	    return 0;
758 	}
759 
760 	*buf     = UTF8_NON_ID_CHAR1;
761 	*(buf+1) = UTF8_NON_ID_CHAR2;
762 	*(buf+2) = UTF8_NON_ID_CHAR3;
763 	return 3;
764 	}
765 
766 }
767 
768 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,table_t v[],int n)769 int binsearch(unsigned long x, table_t v[], int n)
770 {
771 	int low, high, mid;
772 
773 	low = 0;
774 	high = n - 1;
775 	while (low <= high) {
776 		mid = (low + high) / 2;
777 		if (x < v[mid].key)
778 			high = mid - 1;
779 		else if (x > v[mid].key)
780 			low = mid + 1;
781 		else	/* found match */
782 			return mid;
783 	}
784 	return (-1);	/* no match */
785 }
786 
787 
788 #ifdef DEBUG
main(int argc,char ** argv)789 main(int argc, char ** argv) {
790 	char *inbuf, *outbuf, *in_tmp, *out_tmp;
791 	size_t inbytesleft, outbytesleft;
792 	int fd;
793 	int i;
794 	struct stat s;
795 	_iconv_st * st;
796 	if (argc < 2) {
797 		fprintf(stderr, "Usage: %s input\n", argv[0]);
798 		exit(-1);
799 	}
800 	if ((fd = open(argv[1], O_RDONLY)) == -1) {
801 		perror("open");
802 		exit(-2);
803 	}
804 	if (fstat(fd, &s) == -1) {
805 		perror("stat");
806 		exit(-3);
807 	}
808 	inbytesleft = outbytesleft = s.st_size;
809 	in_tmp = inbuf = (char *)malloc(inbytesleft);
810 	out_tmp = outbuf = (char *)malloc(outbytesleft);
811 	if (!inbuf || !outbuf) {
812 		perror("malloc");
813 		exit(-1);
814 	}
815 	if (read(fd, inbuf, inbytesleft) != inbytesleft) {
816 		perror("read");
817 		exit(-4);
818 	}
819 	for (i = 0; i < inbytesleft; i++)
820 		fprintf(stderr, "%x\t", *(inbuf+i));
821 	fprintf(stderr, "\n");
822 	st = (_iconv_st *)_icv_open();
823 	if (st == (_iconv_st *) -1) {
824 		perror("_icv_open");
825 		exit(-1);
826 	}
827 	if (_icv_iconv(st, \
828 				&inbuf, &inbytesleft, \
829 				&outbuf, &outbytesleft) == -1) {
830 		perror("icv_iconv");
831 		fprintf(stderr, "\ninbytesleft = %d\n", inbytesleft);
832 		exit(-2);
833 	}
834 	if (write(1, out_tmp, s.st_size - outbytesleft) == -1) {
835 		perror("write");
836 		exit(-1);
837 	}
838 	free(in_tmp);
839 	free(out_tmp);
840 	close(fd);
841 	_icv_close(st);
842 }
843 #endif
844