xref: /titanic_51/usr/src/lib/iconv_modules/zh/common/zh_TW-big5%zh_TW-iso2022-7.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include "big5_cns11643.h"	/* Big-5 to CNS 11643 mapping table */
31 
#define	MSB	0x80	/* most significant bit; set on the 1st byte of a Big-5 character */
#define	MBYTE	0x8e	/* lead byte of the 4-byte form built for DEBUG output */
#define	PMASK	0xa0	/* base added to the plane number (DEBUG output only) */
#define ONEBYTE	0xff	/* mask selecting the low-order 8 bits */
#define MSB_OFF 0x7f    /* mask off MSB */

#define SI      0x0f    /* shift in */
#define SO      0x0e    /* shift out */
#define ESC     0x1b    /* escape */

/* static const char plane_char[] = "0GH23456789:;<=>?"; */
/*
 * Final byte of the "ESC $ )" sequence, indexed by the plane number
 * returned by get_plane_no_by_big5() (0 through 16).
 */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";

#define GET_PLANEC(i)   (plane_char[i])

#define NON_ID_CHAR '_'	/* written for both output bytes when no table entry exists */
48 
49 typedef struct _icv_state {
50 	char	keepc[2];	/* maximum # byte of Big-5 code */
51 	short	cstate;		/* state machine id (Big-5) */
52 	short	istate;		/* state machine id (ISO) */
53 	int	_errno;		/* internal errno */
54 } _iconv_st;
55 
56 enum _CSTATE	{ C0, C1 };
57 enum _ISTATE    { IN, OUT };
58 
59 
60 static int big5_2nd_byte(char);
61 static int get_plane_no_by_big5(const char, const char, int*, unsigned long*);
62 static int big5_to_iso(int, int, unsigned long, char*, size_t);
63 static int binsearch(unsigned long, table_t[], int);
64 
65 
66 /*
67  * Open; called from iconv_open()
68  */
69 void *
70 _icv_open()
71 {
72 	_iconv_st *st;
73 
74 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
75 		errno = ENOMEM;
76 		return ((void *) -1);
77 	}
78 
79 	st->cstate = C0;
80 	st->istate = IN;
81 	st->_errno = 0;
82 
83 #ifdef DEBUG
84     fprintf(stderr, "==========     iconv(): Big-5 --> ISO 2022-7     ==========\n");
85 #endif
86 	return ((void *) st);
87 }
88 
89 
90 /*
91  * Close; called from iconv_close()
92  */
93 void
94 _icv_close(_iconv_st *st)
95 {
96 	if (!st)
97 		errno = EBADF;
98 	else
99 		free(st);
100 }
101 
102 
103 /*
104  * Actual conversion; called from iconv()
105  */
106 /*=======================================================
107  *
108  *   State Machine for interpreting Big-5 code
109  *
110  *=======================================================
111  *
112  *                     1st C
113  *    +--------> C0 ----------> C1
114  *    |    ascii |        2nd C |
115  *    ^          v              v
116  *    +----<-----+-----<--------+
117  *
118  *=======================================================*/
119 /*
120  * Big-5 encoding range:
121  *	High byte: 0xA1 - 0xFE			(   94 encoding space)
122  *	Low byte:  0x40 - 0x7E, 0xA1 - 0xFE	(  157 encoding space)
123  *	Plane #1:  0xA140 - 0xC8FE		( 6280 encoding space)
124  *	Plane #2:  0xC940 - 0xFEFE		( 8478 encoding space)
125  *	Total:	   94 * 157 = 14,758		(14758 encoding space)
126  */
127 size_t
128 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
129 				char **outbuf, size_t *outbytesleft)
130 {
131 	int		plane_no, n, unidx;
132 	unsigned long	cnscode;
133 	/* pre_plane_no: need to be static when re-entry occurs on errno set */
134 	static int      pre_plane_no = -1;      /* previous plane number */
135 
136 	if (st == NULL) {
137 		errno = EBADF;
138 		return ((size_t) -1);
139 	}
140 
141 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
142 		st->cstate = C0;
143 		st->istate = IN;
144 		st->_errno = 0;
145 		return ((size_t) 0);
146 	}
147 
148 #ifdef DEBUG
149     fprintf(stderr, "=== (Re-entry)   iconv(): Big-5 --> ISO 2022-7   ===\n");
150     fprintf(stderr, "st->cstate=%d\tst->istate=%d\tst->_errno=%d\tplane_no=%d\n",
151 	    st->cstate, st->istate, st->_errno, plane_no);
152 #endif
153 	st->_errno = 0;         /* reset internal errno */
154 	errno = 0;		/* reset external errno */
155 
156 	/* a state machine for interpreting Big-5 code */
157 	while (*inbytesleft > 0 && *outbytesleft > 0) {
158 		switch (st->cstate) {
159 		case C0:		/* assuming ASCII in the beginning */
160 			if (**inbuf & MSB) {
161 				st->keepc[0] = (**inbuf);
162 				st->cstate = C1;
163 			} else {	/* real ASCII */
164 				if (st->istate == OUT) {
165 					st->cstate = C0;
166 					st->istate = IN;
167 					**outbuf = SI;
168 					(*outbuf)++;
169 					(*outbytesleft)--;
170 					if (*outbytesleft <= 0) {
171 						errno = E2BIG;
172 						return((size_t)-1);
173 					}
174 				}
175 				**outbuf = **inbuf;
176 				(*outbuf)++;
177 				(*outbytesleft)--;
178 			}
179 			break;
180 		case C1:		/* Chinese characters: 2nd byte */
181 			if (big5_2nd_byte(**inbuf) != 0) {	/* illegal Big-5 */
182 				st->cstate = C0;
183 				st->istate = IN;
184 				st->_errno = errno = EILSEQ;
185 				break;
186 			}
187 			st->keepc[1] = (**inbuf);
188 			plane_no = get_plane_no_by_big5(st->keepc[0],
189 					st->keepc[1], &unidx, &cnscode);
190 			if (plane_no < 0) {     /* legal Big-5; illegal CNS */
191 				st->cstate = C0;
192 				st->istate = IN;
193 				st->_errno = errno = EILSEQ;
194 				break;
195 			}
196 
197 			if ((st->istate == IN) || (pre_plane_no != plane_no)) {
198 				/* change plane # in Chinese mode */
199 				if (st->istate == OUT) {
200 					**outbuf = SI;
201 					(*outbuf)++;
202 					(*outbytesleft)--;
203 #ifdef DEBUG
204 fprintf(stderr, "(plane #=%d\tpre_plane #=%d)\t", plane_no, pre_plane_no);
205 #endif
206 				}
207 				if (*outbytesleft < 4) {
208 					st->_errno = errno = E2BIG;
209 					return((size_t)-1);
210 				}
211 				pre_plane_no = plane_no;
212 				st->istate = OUT;	/* shift out */
213 				**outbuf = ESC;
214 				*(*outbuf+1) = '$';
215                                 *(*outbuf+2) = ')';
216 				*(*outbuf+3) = GET_PLANEC(plane_no);
217 #ifdef DEBUG
218 fprintf(stderr, "ESC $ ) %c  ", *(*outbuf+3));
219 #endif
220 				(*outbuf) += 4;
221 				(*outbytesleft) -= 4;
222 				if (*outbytesleft <= 0) {
223 					st->_errno = errno = E2BIG;
224 					return((size_t)-1);
225 				}
226 				st->istate = OUT;
227 				**outbuf = SO;
228 				(*outbuf)++;
229 				(*outbytesleft)--;
230 			}
231 			n = big5_to_iso(plane_no, unidx, cnscode,
232 					*outbuf, *outbytesleft);
233 			if (n > 0) {
234 				(*outbuf) += n;
235 				(*outbytesleft) -= n;
236 			} else {
237 				st->_errno = errno;
238 				return((size_t)-1);
239 			}
240 			st->cstate = C0;
241 			break;
242 		default:			/* should never come here */
243 			st->_errno = errno = EILSEQ;
244 			st->cstate = C0;	/* reset state */
245 			break;
246 		}
247 
248 		(*inbuf)++;
249 		(*inbytesleft)--;
250 
251 		if (st->_errno) {
252 #ifdef DEBUG
253     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
254 		st->_errno, st->cstate);
255 #endif
256 			break;
257 		}
258 		if (errno)
259 			return((size_t)-1);
260 	}
261 
262 	if (*inbytesleft > 0 && *outbytesleft == 0) {
263 		errno = E2BIG;
264 		return((size_t)-1);
265 	}
266 	return (*inbytesleft);
267 }
268 
269 
270 /*
271  * Test whether inbuf is a valid character for 2nd byte Big-5 code
272  * Return: = 0 - valid Big-5 2nd byte
273  *         = 1 - invalid Big-5 2nd byte
274  */
275 static int big5_2nd_byte(char inbuf)
276 {
277 	unsigned int	buf = (unsigned int) (inbuf & ONEBYTE);
278 
279 	if ((buf >= 0x40) && (buf <= 0x7E))
280 		return (0);
281 	if ((buf >= 0xA1) && (buf <= 0xFE))
282 		return (0);
283 	return(1);
284 }
285 
286 
287 /*
288  * Get plane number by Big-5 code; i.e. plane #1 returns 1, #2 returns 2, etc.
289  * Returns -1 on error conditions
290  *
291  * Since binary search of the Big-5 to CNS table is necessary, might as well
292  * return index and CNS code matching to the unicode.
293  */
294 static int get_plane_no_by_big5(const char c1, const char c2,
295 			int *unidx, unsigned long *cnscode)
296 {
297 	int 		ret;
298 	unsigned long	big5code;
299 
300 	big5code = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
301 	*unidx = binsearch(big5code, big5_cns_tab, MAX_BIG5_NUM);
302 	if ((*unidx) >= 0)
303 		*cnscode = big5_cns_tab[*unidx].value;
304 	else
305 		return(0);	/* match from Big-5 to CNS not found */
306 #ifdef DEBUG
307     fprintf(stderr, "Big-5=%04x, idx=%5d, CNS=%06x ", big5code, *unidx, *cnscode);
308 #endif
309 
310 	ret = (int) (*cnscode >> 16);
311 	switch (ret) {
312 	case 0x21:	/* 0x8EA1 - G */
313 	case 0x22:	/* 0x8EA2 - H */
314 	case 0x23:	/* 0x8EA3 - I */
315 	case 0x24:	/* 0x8EA4 - J */
316 	case 0x25:	/* 0x8EA5 - K */
317 	case 0x26:	/* 0x8EA6 - L */
318 	case 0x27:	/* 0x8EA7 - M */
319 	case 0x28:	/* 0x8EA8 - N */
320 	case 0x29:	/* 0x8EA9 - O */
321 	case 0x2a:	/* 0x8EAA - P */
322 	case 0x2b:	/* 0x8EAB - Q */
323 	case 0x2c:	/* 0x8EAC - R */
324 	case 0x2d:	/* 0x8EAD - S */
325 	case 0x2f:	/* 0x8EAF - U */
326 	case 0x30:	/* 0x8EB0 - V */
327 		return (ret - 0x20);	/* so that we can use GET_PLANEC() */
328 	case 0x2e:	/* 0x8EAE - T */
329 		return (3);		/* CNS 11643-1992 */
330 	default:
331 		return (-1);
332 	}
333 }
334 
335 
336 /*
337  * Big-5 code --> ISO 2022-7
338  * Return: > 0 - converted with enough space in output buffer
339  *         = 0 - no space in outbuf
340  */
341 static int big5_to_iso(int plane_no, int unidx, unsigned long cnscode,
342 						char *buf, size_t buflen)
343 {
344 	unsigned long	val;		/* CNS 11643 value */
345 #ifdef DEBUG
346 	char		cns_str[5];
347 #endif
348 
349         if (buflen < 2) {
350                 errno = E2BIG;
351                 return(0);
352         }
353 
354 	if (unidx < 0) {	/* no match from UTF8 to CNS 11643 */
355 		*buf = *(buf+1) = NON_ID_CHAR;
356 	} else {
357 		val = cnscode & 0xffff;
358 		*buf = (val & 0xff00) >> 8;
359 		*(buf+1) = val & 0xff;
360 	}
361 
362 #ifdef DEBUG
363     fprintf(stderr, "->%02x %02x<-\t->%c %c<-\t", *buf, *(buf+1), *buf, *(buf+1));
364 #endif
365 
366 #ifdef DEBUG
367 	switch (plane_no) {
368 	case 1:
369 		cns_str[0] = *buf | MSB;
370 		cns_str[1] = *(buf+1) | MSB;
371 		cns_str[2] = cns_str[3] = cns_str[4] = NULL;
372 		break;
373 	case 2:
374 	case 3:
375 	case 4:
376 	case 5:
377 	case 6:
378 	case 7:
379 	case 8:
380 	case 9:
381 	case 10:
382 	case 11:
383 	case 12:
384 	case 13:
385 	case 14:
386 	case 15:
387 	case 16:
388 		cns_str[0] = MBYTE;
389 		cns_str[1] = (char) PMASK + plane_no;
390 		cns_str[2] = (char) *buf | MSB;
391 		cns_str[3] = (char) *(buf+1) | MSB;
392 		cns_str[4] = NULL;
393 		break;
394 	}
395 
396     fprintf(stderr, "#%d ->%s<-\n", plane_no, cns_str);
397 #endif
398 
399 	return(2);
400 }
401 
402 
403 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
404 static int binsearch(unsigned long x, table_t v[], int n)
405 {
406 	int low, high, mid;
407 
408 	low = 0;
409 	high = n - 1;
410 	while (low <= high) {
411 		mid = (low + high) / 2;
412 		if (x < v[mid].key)
413 			high = mid - 1;
414 		else if (x > v[mid].key)
415 			low = mid + 1;
416 		else	/* found match */
417 			return mid;
418 	}
419 	return (-1);	/* no match */
420 }
421