xref: /titanic_51/usr/src/lib/iconv_modules/zh/common/zh_TW-big5%zh_TW-iso2022-CN-EXT.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1997, by Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 
26 
27 /*
28    Converts From:	Taiwanese BIG5 encoding
29    Converts To:		ISO2022-CN-EXT encoding.
30 
31    NOTE: This file was created using vi editor with tabstop set to 4.
32 		 To view this file correctly set tabstop appropriately.
33 		 e.g. for vi use command	ESC:se ts=4
34  */
35 
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <errno.h>
39 #include "big5_cns11643.h"	/* Big5 to CNS 11643 mapping table */
40 
#define MSB			0x80	/* The most significant bit of a byte */
#define ONEBYTE		0xff	/* Mask selecting the rightmost (low) byte */

#define SI		0x0f	/* Shift In control character */
#define SO		0x0e	/* Shift Out control character */
#define SS2		0x4e	/* SS2 low byte. High byte is ESC */
#define SS3		0x4f	/* SS3 low byte. High byte is ESC */
#define ESC		0x1b	/* The Escape character */
#define NON_ID_CHAR	'_' /* Substituted for every unidentified character */

/*
 * GET_PLANEC() - Gets the ISO assigned final character for a CNS11643
 *                plane number. Index 1 yields 'G', index 2 'H', and so on
 *                up to index 16 ('V'). Index 0 holds the placeholder '0'.
 *                The argument is not range checked.
 */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
#define GET_PLANEC(i)	(plane_char[(i)])
55 
/*
 * Per-conversion state. Allocated by _icv_open(), released by _icv_close().
 * The three plane fields hold -1 while no designation is known for the
 * current output line.
 */
typedef struct _icv_state {
	char	keepc[2];	/* Save the received bytes here */
	short	cstate;		/* Current state the state machine is in.
				   These states are C0 or C1 */
	char	ishiftfunc;	/* The currently active shift function SI or SO
				   in the output ISO buffer */
	int	iSOplane;	/* The current CNS11643 plane which is
				   assigned to the SOdesignation in the output
				   ISO buffer. Only CNS11643 plane 1 can be
				   assigned to SOdesignation */
	int	iSS2plane;	/* The current CNS11643 plane which is
				   assigned to the SS2designation in the output
				   ISO buffer. Only CNS11643 plane 2 can be
				   assigned to SS2designation */
	int	iSS3plane; 	/* The current CNS11643 plane which is
				   assigned to the SS3designation in the output
				   ISO buffer. All CNS11643 planes >= 3 are
				   assigned to SS3designation */
	size_t	nonidcount; /* Keeps track of skipped (unidentified) input
				   bytes in conversion */
	int	_errno;		/* Internal error number */
} _iconv_st;
77 
/*
 * Input state machine states:
 *   C0 - waiting for an ASCII byte or the first byte of a Big5 character
 *   C1 - first Big5 byte is saved in keepc[0]; waiting for the second byte
 */
enum _CSTATE	{ C0, C1 };

/* Helpers private to this module; each is documented at its definition. */
static int isbig5(unsigned char*);
static int hascns(char*);
static int ascii_to_iso(char, _iconv_st*, char**, size_t*);
static int big5_to_iso(int, _iconv_st*, char**, size_t*);
static int getcnsbytes(int, char*, int*);
static int binsearch(unsigned long, table_t[], int);
86 
87 
88 /*
89  * _icv_open: Called from iconv_open. Allocates and initializes _iconv_st
90  *            structure. Returns pointer to the structure as (void *).
91  */
92 
93 
94 void *
95 _icv_open()
96 {
97 	_iconv_st  *st;
98 
99 #ifdef DEBUG
100 	fprintf(stderr, "_icv_open(): Come into!\n");
101 #endif
102 	/* Allocate */
103 	if ((st = (_iconv_st *) malloc(sizeof(_iconv_st))) == NULL){
104 		errno = ENOMEM;
105 #ifdef DEBUG
106 	fprintf(stderr, "Error\n");
107 #endif
108 		return ((void *) -1);
109 	}
110 
111 	/* Initialize */
112 	st->cstate = C0;
113 	st->ishiftfunc = SI;
114 	st->iSOplane = -1;
115 	st->iSS2plane = -1;
116 	st->iSS3plane = -1;
117 	st->nonidcount = 0;
118 	st->_errno = 0;
119 
120 #ifdef DEBUG
121 	fprintf(stderr, "====== _icv_open(): Big5 --> ISO2022-CN-EXT =====\n");
122 #endif
123 
124 	/* Return struct */
125 	return ((void *) st);
126 }
127 
128 
129 
130 /*
131  * _icv_close: Called from iconv_close(). Frees the _iconv_st structure as
132  *	       pointed by the argument.
133  */
134 
135 void
136 _icv_close(_iconv_st *st)
137 {
138 	if (st == NULL)
139 		errno = EBADF;
140 	else
141 		free(st);
142 }
143 
144 /*
145  * _icv_iconv: Called from iconv(). Does the convertion from BIG5 to
146  *	       ISO2022-CN-EXT.
147  */
148 /*=======================================================
149  *
150  *   State Machine for interpreting Big-5 code
151  *
152  *=======================================================
153  *
154  *                     1st C
155  *    +--------> C0 ----------> C1
156  *    |    ascii |        2nd C |
157  *    ^          v              v
158  *    +----<-----+-----<--------+
159  *
160  *=======================================================*/
161 size_t
162 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
163 				char **outbuf, size_t *outbytesleft)
164 {
165 
166 	int n, idx;
167 
168 #ifdef DEBUG
169     fprintf(stderr, "=== _icv_iconv(): Big5 --> ISO2022-CN-EXT =====\n");
170 #endif
171 
172 	if (st == NULL) {
173 	    errno = EBADF;
174 	    return ((size_t) -1);
175 	}
176 
177 	if (inbuf == NULL || *inbuf == NULL ||
178 	    inbytesleft == NULL || *inbytesleft == 0) { /* Reset request */
179 	    if (st->ishiftfunc == SO) {
180 		if (outbytesleft && *outbytesleft >= 1  && outbuf && *outbuf) {
181 		    **outbuf = SI;
182 		    (*outbuf)++;
183 		    (*outbytesleft)--;
184 		} else {
185 		    errno = E2BIG;
186 		    return((size_t) -1);
187 		}
188 	    }
189 	    st->cstate = C0;
190 	    st->ishiftfunc = SI;
191 	    st->iSOplane = -1;
192 	    st->iSS2plane = -1;
193 	    st->iSS3plane = -1;
194 	    st->nonidcount = 0;
195 	    st->_errno = 0;
196 	    return ((size_t) 0);
197 	}
198 
199 	st->_errno = 0;
200 	errno = 0;
201 
202 	/* Before we use *inbytesleft or *outbytesleft we should confirm that
203 	inbytesleft and outbytesleft are non-NULL. I am considering inbytesleft
204 	or *inbytesleft having 0 value as a reset request. I am considering
205 	outbytesleft having 0 value as no space in output buffer. Also, here
206 	itself I am verifying that outbuf and *outbuf should be non-NULL pointers
207 	so I do not have to worry about them being NULL below in the conversion
208 	sub-routines. I also confirm here that *outbytesleft should be > 0 before
209 	we can continue further */
210 
211 	if (outbytesleft == NULL || *outbytesleft == 0 ||
212 		outbuf == NULL || *outbuf == NULL){
213 	    errno = E2BIG;
214 	    return ((size_t)-1);
215 	}
216 
217 	/* A state machine for interpreting Big-5 code */
218 	while (*inbytesleft > 0 && *outbytesleft > 0) {
219 	    switch (st->cstate) {
220 	    case C0:
221 		if (**inbuf & MSB) { /* May have got the first byte ofa BIG5 code */
222 
223 		    st->keepc[0] = **inbuf;		/*Save byte */
224 		    st->cstate = C1;	/* Go to the next state where
225 					   the next BIG5 byte is recieved */
226 		    st->nonidcount += 1;/* Until we have verified that this and
227 					   the next byte make a valid BIG5 code
228 					   we shall consider this as an
229 					   unidentified byte */
230 		} else if (**inbuf == ESC || **inbuf == SI || **inbuf == SO){
231 
232 		    /* We should not process these ASCII control codes as these
233 		       have special significance in the output ISO encoding.
234 		       Instead we will output NON_ID_CHAR and continue processing */
235 
236 		    n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
237 		    if (n < 0) /* Insufficient space in the outbuf */
238 			    return ((size_t)-1); /* The errno etc. are set in ascii_to_iso */
239 		    st->nonidcount += 1;
240 		} else { /* Got ASCII code */
241 		    n = ascii_to_iso(**inbuf, st, outbuf, outbytesleft);
242 		    if (n < 0) /* Insufficient space in the outbuf */
243 			return ((size_t)-1);
244 		}
245 		break;
246 
247 	    case C1:
248 		st->keepc[1] = (**inbuf);
249 		if (isbig5((unsigned char*) st->keepc) == 0) {
250 		    if ((idx = hascns(st->keepc)) >= 0){
251 			n = big5_to_iso(idx, st, outbuf, outbytesleft);
252 			if (n < 0) /* Insufficient space in the outbuf */
253 			    return ((size_t)-1);
254 			st->nonidcount -= 1; /* The first byte of this big5 saved in
255 						state C0 is confirmed valid BIG5 High
256 						byte and is processed correctly */
257 
258 		    } else { /* Valid BIG5 but has no CNS encoding */
259 			/* We will output the NON_ID_CHAR character */
260 			n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
261 			if (n < 0) /* Insufficient space in the outbuf */
262 			    return ((size_t)-1);
263 			n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
264 			if (n < 0) /* Insufficient space in the outbuf */
265 			    return ((size_t)-1);
266 			st->nonidcount -= 1; /* Include the 2nd byte also as
267 						    unidentified byte */
268 		    }
269 		} else { /* Input character is not BIG5 encoding */
270 		    st->nonidcount += 1;
271 		    st->_errno = errno = EILSEQ; /* This will cause the code to
272 						    break out of while loop below
273 						    to return to the caller */
274 
275 		}
276 		st->cstate = C0; /* Go to the initial state */
277 		break;
278 
279 	    default:		/* Should never come here */
280 		fprintf(stderr,
281 	 "_icv_iconv():Big5-->ISO2022-CN-EXT: Should not have come here\n");
282 		st->_errno = errno = EILSEQ;
283 		st->cstate = C0;
284 		break;
285 
286 	    } /* end switch */
287 
288 	    (*inbuf)++;
289 	    (*inbytesleft)--;
290 
291 	    if (st->_errno)
292 		    break; /* Break out of while loop */
293 
294 	    if (errno) /* We set st->_errno before we set errno. If errno is set
295 				      somewhere else we handle that here */
296 		return ((size_t)-1);
297 
298 	} /* end while */
299 
300 /* We now have to handle the case where we have successfully processed the
301    previous input character which exhausted the output buffer. This is handled
302    by the while loop. However, since there are more input characters that
303    haven't been processed yet, we need to set the errno appropriately and
304    return -1. */
305 	if (*inbytesleft > 0 && *outbytesleft == 0) {
306 		errno = E2BIG;
307 		return ((size_t)-1);
308 	}
309 
310 	return (*inbytesleft + st->nonidcount);
311 
312 }
313 
314 
/*
 * isbig5() : tells whether twobytes[0..1] lie inside the Big-5 code space.
 *            Returns 0 when they do and -1 otherwise. The second byte is
 *            only examined when the first one is acceptable.
 *
 * Big-5 encoding range:
 *	High byte: 0xA1 - 0xFE				(94 encoding space)
 *	Low byte:  0x40 - 0x7E, 0xA1 - 0xFE	(157 encoding space)
 *	Plane #1:  0xA140 - 0xC8FE			(6280 encoding space)
 *	Plane #2:  0xC940 - 0xFEFE			(8478 encoding space)
 *	Total:	   94 * 157 = 14,758		(14758 encoding space)
 */
static int isbig5(unsigned char *twobytes)
{
	unsigned char lead;
	unsigned char trail;

	lead = twobytes[0];
	if (lead < 0xa1 || lead > 0xfe)
	    return (-1);

	trail = twobytes[1];
	if (trail >= 0x40 && trail <= 0x7e)	/* lower trail-byte window */
	    return (0);
	if (trail >= 0xa1 && trail <= 0xfe)	/* upper trail-byte window */
	    return (0);

	return (-1);
}
331 
332 
333 /*
334  * hascns() : checks whether we have a CNS 11643 code for the big5 character
335  *			  code. If exists returns the index of the big5 character in the
336  *			  big5 to CNS table else returns -1.
337  */
338 static int hascns(char* big5mbchar)
339 {
340 
341 	int idx;
342 	unsigned long big5code;
343 
344 	big5code = (unsigned long) ((big5mbchar[0] & ONEBYTE) << 8) +
345 										(big5mbchar[1] & ONEBYTE);
346 
347 	idx = binsearch(big5code, big5_cns_tab, MAX_BIG5_NUM);
348 
349 	return (idx); /* binsearch returns -1 if not found, else index */
350 }
351 
352 
353 /* ascii_to_iso() : If required, outputs the SI shift function. Outputs the
354  *					character. If there is insufficient space in the output
355  *					buffer, it flags the error and returns -1. On success it
356  *					returns 0.
357  */
358 static int ascii_to_iso(char c, _iconv_st *st, char **outbuf,
359 							size_t *outbytesleft)
360 {
361 	if (st->ishiftfunc != SI){
362 	    **outbuf = SI;
363 	    (*outbuf)++;
364 	    (*outbytesleft)--;
365 	    st->ishiftfunc = SI;
366 
367 	    if (*outbytesleft < 1){ /* Do we now have space for ASCII character?*/
368 		    st->_errno = errno = E2BIG;
369 		    return (-1);
370 	    }
371 	}
372 
373 	**outbuf = c;
374 	(*outbuf)++;
375 	(*outbytesleft)--;
376 
377 	/* Each line in ISO is expected to have the character set information
378 	   for the Chinese characters in that line. This facilitates text
379 	   scrollling. Hence, on encountering newline reset designations to
380 	   unknown */
381 	if (c == '\n'){
382 	    st->iSOplane = -1;
383 	    st->iSS2plane = -1;
384 	    st->iSS3plane = -1;
385 	}
386 
387 	return (0);
388 
389 }
390 
391 
392 
393 /* big5_to_iso() : Converts the Big5 code, for which the index idx in
394  *				   the big5 to cns table is provided as an argument, to
395  *				   its corresponding ISO2022-CN-EXT code. This may
396  *				   require outputting of SO shift function and/or
397  *				   the designations. In case we do not have sufficient
398  *				   space in the outbuf to to do the convertion we flag error
399  *				   and return -1
400  */
401 static int big5_to_iso(int idx, _iconv_st *st, char **outbuf,
402 							size_t *outbytesleft)
403 {
404 
405 	char cnsbytes[2];
406 	int cnsplane;
407 	int ret;
408 
409 	ret = getcnsbytes(idx, cnsbytes, &cnsplane);
410 	if (ret < 0){
411 	    /* This means that the cnscode is invalid. Should have been taken
412 	       care of in function hascns() and thus this code should never come
413 	       here. We catch this by the error message below */
414 	    fprintf(stderr,
415 	      "big5_to_iso():Big5->ISO2022-CN-EXT:gencnsbyte() rejected cnscode\n");
416 	    st->_errno = errno = EILSEQ;
417 	    return (0);
418 	}
419 
420 	switch (cnsplane) {
421 	case 1:
422 	    if (st->iSOplane != cnsplane){ /* Is SODESIGNATION set to this plane?*/
423 		/* Output Escape sequence to set the SODESIGNATION to plane 1 */
424 		/* Before that check that we have space in outbuf for it */
425 		if (*outbytesleft < 4){
426 			st->_errno = errno = E2BIG;
427 			return (-1);
428 		}
429 
430 		**outbuf = ESC;
431 		*(*outbuf+1) = '$';
432 		*(*outbuf+2) = ')';
433 		*(*outbuf+3) = GET_PLANEC(cnsplane);
434 		(*outbuf) += 4;
435 		(*outbytesleft) -= 4;
436 		st->iSOplane = cnsplane;
437 	    }
438 
439 	    /* Check the current shift function whether it is SO. If not
440 	       set the SO shift function after confirming that you have
441 	       space for it. */
442 	    if (st->ishiftfunc != SO){
443 		if (*outbytesleft < 1){
444 		    st->_errno = errno = E2BIG;
445 		    return (-1);
446 		}
447 
448 		**outbuf = SO;
449 		(*outbuf)++;
450 		(*outbytesleft)--;
451 		st->ishiftfunc = SO;
452 	    }
453 	    break;
454 
455 	case 2:
456 	    if (st->iSS2plane != cnsplane){ /* Is SS2DESIGNATION set tothis plane ? */
457 		/* Output escape sequence to set SS2DESIGNATION to plane 2 */
458 		/* Before that check that we have space in outbuf for it */
459 		if (*outbytesleft < 4){
460 			st->_errno = errno = E2BIG;
461 			return (-1);
462 		}
463 
464 		**outbuf = ESC;
465 		*(*outbuf+1) = '$';
466 		*(*outbuf+2) = '*';
467 		*(*outbuf+3) = GET_PLANEC(cnsplane);
468 		(*outbuf) += 4;
469 		(*outbytesleft) -= 4;
470 		st->iSS2plane = cnsplane;
471 	    }
472 
473 	    /* Output the SS2 shift function only when we have sufficient space
474 	       for the 2 cns code bytes also */
475 	    if (*outbytesleft < 4){
476 		st->_errno = errno = E2BIG;
477 		return (-1);
478 	    }
479 
480 	    **outbuf = ESC;
481 	    *(*outbuf+1) = SS2;
482 	    (*outbuf) += 2;
483 	    (*outbytesleft) -= 2;
484 
485 	    break;
486 
487 	case 3:
488 	case 4:
489 	case 5:
490 	case 6:
491 	case 7:
492 	case 12:
493 	case 14:
494 	case 15:
495 	case 16:
496 	    if (st->iSS3plane != cnsplane){ /* Is SS3DESIGNATION set tothis plane? */
497 		/* Output escape sequence to set SS3DESIGNATION to cnsplane */
498 		/* Before that check that we have space in outbuf for it */
499 		if (*outbytesleft < 4){
500 			st->_errno = errno = E2BIG;
501 			return (-1);
502 		}
503 
504 		**outbuf = ESC;
505 		*(*outbuf+1) = '$';
506 		*(*outbuf+2) = '+';
507 		*(*outbuf+3) = GET_PLANEC(cnsplane);
508 		(*outbuf) += 4;
509 		(*outbytesleft) -= 4;
510 		st->iSS3plane = cnsplane;
511 
512 	    }
513 
514 	    /* Output the SS3 shift function only when we have sufficient space
515 	       for the 2 cns code bytes also */
516 	    if (*outbytesleft < 4){
517 		st->_errno = errno = E2BIG;
518 		return (-1);
519 	    }
520 
521 	    **outbuf = ESC;
522 	    *(*outbuf+1) = SS3;
523 	    (*outbuf) += 2;
524 	    (*outbytesleft) -= 2;
525 
526 	    break;
527 
528 	default: /* Should have been taken care of in caller of this funcion */
529 
530 	    /* This means that the cnscode is invalid. Should have been taken
531 	       care of in function hascns() and thus this code should never
532 	       come here. We catch this by the error message below */
533 	    fprintf(stderr, "big5_to_iso():Big5->ISO2022-CN-EXT:Rejecting cnscode\n");
534 	    st->_errno = errno = EILSEQ;
535 	    return (0);
536 
537 	    break;
538 
539 	} /* end switch */
540 
541 	/* Output the cns code */
542 	if (*outbytesleft < 2){
543 	    st->_errno = errno = E2BIG;
544 	    return (-1);
545 	}
546 
547 	**outbuf = cnsbytes[0];
548 	*(*outbuf+1) = cnsbytes[1];
549 	(*outbuf) += 2;
550 	(*outbytesleft) -= 2;
551 
552 
553 	return (0);
554 
555 }
556 
557 
558 static int getcnsbytes(int idx, char *cnsbytes, int *cnsplane)
559 {
560 
561 	unsigned long cnscode;
562 	unsigned long val;
563 	int plane;
564 
565 	cnscode = big5_cns_tab[idx].value;
566 
567 	plane = (int) (cnscode >> 16);
568 	switch (plane) {
569 	case 0x21:	/* 0x8EA1 - G */
570 	case 0x22:	/* 0x8EA2 - H */
571 	case 0x23:	/* 0x8EA3 - I */
572 	case 0x24:	/* 0x8EA4 - J */
573 	case 0x25:	/* 0x8EA5 - K */
574 	case 0x26:	/* 0x8EA6 - L */
575 	case 0x27:	/* 0x8EA7 - M */
576 	case 0x28:	/* 0x8EA8 - N */
577 	case 0x29:	/* 0x8EA9 - O */
578 	case 0x2a:	/* 0x8EAA - P */
579 	case 0x2b:	/* 0x8EAB - Q */
580 	case 0x2c:	/* 0x8EAC - R */
581 	case 0x2d:	/* 0x8EAD - S */
582 	case 0x2f:	/* 0x8EAF - U */
583 	case 0x30:	/* 0x8EB0 - V */
584 	    *cnsplane = plane - 0x20;	/* so that we can use GET_PLANEC() */
585 	    break;
586 
587 	case 0x2e:	/* 0x8EAE - T */
588 	    *cnsplane = 3;		/* CNS 11643-1992. Why is this returning 3?  */
589 	    break;
590 
591 	default:
592 	    return (-1); /* Should not have happened */
593 	    break;
594 	}
595 
596 	val = cnscode & 0xffff;
597 	cnsbytes[0] = (val & 0xff00) >> 8;
598 	cnsbytes[1] = val & 0xff;
599 
600 	return (0);
601 
602 }
603 
604 
605 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
606 static int binsearch(unsigned long x, table_t v[], int n)
607 {
608 	int low, high, mid;
609 
610 	low = 0;
611 	high = n - 1;
612 	while (low <= high) {
613 	    mid = (low + high) / 2;
614 	    if (x < v[mid].key)
615 		high = mid - 1;
616 	    else if (x > v[mid].key)
617 		low = mid + 1;
618 	    else	/* found match */
619 		return mid;
620 	}
621 	return (-1);	/* no match */
622 }
623