xref: /titanic_51/usr/src/lib/iconv_modules/zh/common/zh_TW-iso2022-7%UTF-8.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include "cns11643_unicode_TW.h"	/* CNS 11643 to UTF8 mapping table */
31 
32 #define	MSB	0x80	/* most significant bit */
33 #define	MBYTE	0x8e	/* multi-byte (4 byte character) */
34 #define	PMASK	0xa0	/* plane number mask */
35 #define ONEBYTE	0xff	/* right most byte */
36 #define MSB_OFF	0x7f	/* mask off MSB */
37 
38 #define SI	0x0f	/* shift in */
39 #define SO	0x0e	/* shift out */
40 #define ESC	0x1b	/* escape */
41 
42 /*
43  * static const char plane_char[] = "0GH23456789:;<=>?";
44  * static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
45  * #define	GET_PLANEC(i)	(plane_char[i])
46  */
47 
48 /* non-identified character */
49 #define UTF8_NON_ID_CHAR1 0xEF
50 #define UTF8_NON_ID_CHAR2 0xBF
51 #define UTF8_NON_ID_CHAR3 0xBD
52 
/*
 * Conversion state for one iconv descriptor.  It is allocated by
 * _icv_open(), updated by every _icv_iconv() call and released by
 * _icv_close().
 */
typedef struct _icv_state {
	char	keepc[4];	/* bytes of the CNS 11643 character being built */
	short	cstate;		/* current parser state, one of enum _CSTATE */
	int	plane_no;	/* plane number taken from the last designation */
	int	_errno;		/* error code remembered by the converter */
} _iconv_st;

/* Parser states of the ISO 2022-7 state machine, numbered from 0 upwards. */
enum _CSTATE {
	C0,
	C1,
	C2,
	C3,
	C4,
	C5,
	C6,
	C7
};
61 
62 
63 static int get_plane_no_by_iso(const char);
64 static int iso_to_utf8(int, char[], char*, size_t);
65 static int binsearch(unsigned long, cns_utf[], int);
66 
67 
68 /*
69  * Open; called from iconv_open()
70  */
71 void *
72 _icv_open()
73 {
74 	_iconv_st *st;
75 
76 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
77 		errno = ENOMEM;
78 		return ((void *) -1);
79 	}
80 
81 	st->cstate = C0;
82 	st->plane_no = 0;
83 	st->_errno = 0;
84 
85 	return ((void *) st);
86 }
87 
88 
89 /*
90  * Close; called from iconv_close()
91  */
92 void
93 _icv_close(_iconv_st *st)
94 {
95 	if (!st)
96 		errno = EBADF;
97 	else
98 		free(st);
99 }
100 
101 
102 /*
103  * Actual conversion; called from iconv()
104  */
105 /*=========================================================================
106  *
107  *             State Machine for interpreting ISO 2022-7 code
108  *
109  *=========================================================================
110  *
111  *                                                        plane 2 - 16
112  *                                                    +---------->-------+
113  *                                    plane           ^                  |
114  *            ESC      $       )      number     SO   | plane 1          v
115  *    +-> C0 ----> C1 ---> C2 ---> C3 ------> C4 --> C5 -------> C6     C7
116  *    |   | ascii  | ascii | ascii |    ascii |   SI | |          |      |
117  *    +----------------------------+    <-----+------+ +------<---+------+
118  *    ^                                 |
119  *    |              ascii              v
120  *    +---------<-------------<---------+
121  *
122  *=========================================================================*/
123 size_t
124 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
125 				char **outbuf, size_t *outbytesleft)
126 {
127 	int		n;
128 
129 #ifdef DEBUG
130     fprintf(stderr, "==========     iconv(): ISO2022-7 --> UTF2     ==========\n");
131 #endif
132 	if (st == NULL) {
133 		errno = EBADF;
134 		return ((size_t) -1);
135 	}
136 
137 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
138 		st->cstate = C0;
139 		st->_errno = 0;
140 		return ((size_t) 0);
141 	}
142 
143 	st->_errno = 0;         /* reset internal errno */
144 	errno = 0;		/* reset external errno */
145 
146 	/* a state machine for interpreting ISO 2022-7 code */
147 	while (*inbytesleft > 0 && *outbytesleft > 0) {
148 		switch (st->cstate) {
149 		case C0:		/* assuming ASCII in the beginning */
150 			if (**inbuf == ESC) {
151 				st->cstate = C1;
152 			} else {	/* real ASCII */
153 				**outbuf = **inbuf;
154 				(*outbuf)++;
155 				(*outbytesleft)--;
156 			}
157 			break;
158 		case C1:		/* got ESC, expecting $ */
159 			if (**inbuf == '$') {
160 				st->cstate = C2;
161 			} else {
162 				**outbuf = ESC;
163 				(*outbuf)++;
164 				(*outbytesleft)--;
165 				st->cstate = C0;
166 				st->_errno = 0;
167 				continue;	/* don't advance inbuf */
168 			}
169 			break;
170 		case C2:		/* got $, expecting ) */
171 			if (**inbuf == ')') {
172 				st->cstate = C3;
173 			} else {
174 				if (*outbytesleft < 2) {
175 					st->_errno = errno = E2BIG;
176 					return((size_t)-1);
177 				}
178 				**outbuf = ESC;
179 				*(*outbuf+1) = '$';
180 				(*outbuf) += 2;
181 				(*outbytesleft) -= 2;
182 				st->cstate = C0;
183 				st->_errno = 0;
184 				continue;	/* don't advance inbuf */
185 			}
186 			break;
187 		case C3:		/* got ) expecting G,H,I,...,V */
188 			st->plane_no = get_plane_no_by_iso(**inbuf);
189 			if (st->plane_no > 0 ) {	/* plane #1 - #16 */
190 				st->cstate = C4;
191 			} else {
192 				if (*outbytesleft < 3) {
193 					st->_errno = errno = E2BIG;
194 					return((size_t)-1);
195 				}
196 				**outbuf = ESC;
197 				*(*outbuf+1) = '$';
198 				*(*outbuf+2) = ')';
199 				(*outbuf) += 3;
200 				(*outbytesleft) -= 3;
201 				st->cstate = C0;
202 				st->_errno = 0;
203 				continue;	/* don't advance inbuf */
204 			}
205 			break;
206 		case C4:		/* SI (Shift In) */
207 			if (**inbuf == ESC) {
208 				st->cstate = C1;
209 				break;
210 			}
211 			if (**inbuf == SO) {
212 #ifdef DEBUG
213     fprintf(stderr, "<--------------  SO  -------------->\n");
214 #endif
215 				st->cstate = C5;
216 			} else {	/* ASCII */
217 				**outbuf = **inbuf;
218 				(*outbuf)++;
219 				(*outbytesleft)--;
220 				st->cstate = C0;
221 				st->_errno = 0;
222 			}
223 			break;
224 		case C5:		/* SO (Shift Out) */
225 			if (**inbuf == SI) {
226 #ifdef DEBUG
227     fprintf(stderr, ">--------------  SI  --------------<\n");
228 #endif
229 				st->cstate = C4;
230 			} else {	/* 1st Chinese character */
231 				if (st->plane_no == 1) {
232 					st->keepc[0] = (char) (**inbuf | MSB);
233 					st->cstate = C6;
234 				} else {	/* plane #1 - #16 */
235 					st->keepc[0] = (char) MBYTE;
236 					st->keepc[1] = (char) (PMASK +
237 								st->plane_no);
238 					st->keepc[2] = (char) (**inbuf | MSB);
239 					st->cstate = C7;
240 				}
241 			}
242 			break;
243 		case C6:		/* plane #1: 2nd Chinese character */
244 			st->keepc[1] = (char) (**inbuf | MSB);
245 			st->keepc[2] = st->keepc[3] = NULL;
246 			n = iso_to_utf8(1, st->keepc, *outbuf,
247 						*outbytesleft);
248 			if (n > 0) {
249 				(*outbuf) += n;
250 				(*outbytesleft) -= n;
251 			} else {
252 				st->_errno = errno;
253 				return((size_t)-1);
254 			}
255 			st->cstate = C5;
256 			break;
257 		case C7:		/* 4th Chinese character */
258 			st->keepc[3] = (char) (**inbuf | MSB);
259 			n = iso_to_utf8(st->plane_no, st->keepc, *outbuf,
260 					*outbytesleft);
261 			if (n > 0) {
262 				(*outbuf) += n;
263 				(*outbytesleft) -= n;
264 			} else {
265 				st->_errno = errno;
266 				return((size_t)-1);
267 			}
268 			st->cstate = C5;
269 			break;
270 		default:			/* should never come here */
271 			st->_errno = errno = EILSEQ;
272 			st->cstate = C0;	/* reset state */
273 			break;
274 		}
275 
276 		(*inbuf)++;
277 		(*inbytesleft)--;
278 
279 		if (st->_errno) {
280 #ifdef DEBUG
281     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\tinbuf=%x\n",
282 		st->_errno, st->cstate, **inbuf);
283 #endif
284 			break;
285 		}
286 		if (errno)
287 			return((size_t)-1);
288 	}
289 
290 	if (*inbytesleft > 0 && *outbytesleft == 0) {
291 		errno = E2BIG;
292 		return((size_t)-1);
293 	}
294 	return (*inbytesleft);
295 }
296 
297 
/*
 * Translate the final byte of an "ESC $ )" designation into a plane number.
 *
 * '0' yields plane 0.  The letters 'G' through 'V' yield planes 1 through
 * 16 ('G' is 1, 'H' is 2, ..., 'V' is 16).
 * Return: the plane number, or -1 for any other byte.
 */
static int get_plane_no_by_iso(const char inbuf)
{
	const int designator = (unsigned char) inbuf;
	const int plane = designator - 'F';	/* 'G' becomes 1 */

	if (designator == '0')	/* plane #0 */
		return (0);

	if (plane < 1 || plane > 16)
		return (-1);

	return (plane);
}
333 
334 
335 /*
336  * ISO 2022-7 code --> ISO/IEC 10646 (Unicode)
337  * Unicode --> UTF8 (FSS-UTF)
338  *             (File System Safe Universal Character Set Transformation Format)
339  * Return: > 0 - converted with enough space in output buffer
340  *         = 0 - no space in outbuf
341  */
342 static int iso_to_utf8(int plane_no, char keepc[], char *buf, size_t buflen)
343 {
344 	char		iso_str[3];
345 	unsigned long	iso_val;	/* ISO 2022-7 value */
346 	int		unidx;		/* Unicode index */
347 	unsigned long	uni_val;	/* Unicode */
348 
349 #ifdef DEBUG
350     fprintf(stderr, "%s %d ", keepc, plane_no);
351 #endif
352 	if (plane_no == 1) {
353 		iso_str[0] = keepc[0] & MSB_OFF;
354 		iso_str[1] = keepc[1] & MSB_OFF;
355 	} else {
356 		iso_str[0] = keepc[2] & MSB_OFF;
357 		iso_str[1] = keepc[3] & MSB_OFF;
358 	}
359 	iso_val = (iso_str[0] << 8) + iso_str[1];
360 #ifdef DEBUG
361     fprintf(stderr, "%x\t", iso_val);
362 #endif
363 
364 	switch (plane_no) {
365 	case 1:
366 		unidx = binsearch(iso_val, cns1_utf_tab, MAX_CNS1_NUM);
367 		if (unidx >= 0)
368 			uni_val = cns1_utf_tab[unidx].unicode;
369 		break;
370 	case 2:
371 		unidx = binsearch(iso_val, cns2_utf_tab, MAX_CNS2_NUM);
372 		if (unidx >= 0)
373 			uni_val = cns2_utf_tab[unidx].unicode;
374 		break;
375 	case 3:
376 	case 14:
377 		unidx = binsearch(iso_val, cns3_utf_tab, MAX_CNS3_NUM);
378 		if (unidx >= 0)
379 			uni_val = cns3_utf_tab[unidx].unicode;
380 		break;
381 	default:
382 		unidx = -1;	/* no mapping from CNS to UTF8 */
383 		break;
384 	}
385 
386 #ifdef DEBUG
387     fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
388 #endif
389 
390 	if (unidx >= 0) {	/* do Unicode to UTF8 conversion */
391 		if (uni_val > 0x0080 && uni_val <= 0x07ff) {
392 			if (buflen < 2) {
393 				errno = E2BIG;
394 				return(0);
395 			}
396 			*buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
397 			*(buf+1) = (char)(uni_val & 0x3f) | 0x80;
398 #ifdef DEBUG
399     fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
400 #endif
401 			return(2);
402 		}
403 		if (uni_val > 0x0800 && uni_val <= 0xffff) {
404 			if (buflen < 3) {
405 				errno = E2BIG;
406 				return(0);
407 			}
408 			*buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
409 			*(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
410 			*(buf+2) = (char)(uni_val & 0x3f) | 0x80;
411 #ifdef DEBUG
412     fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
413 #endif
414 			return(3);
415 		}
416 	}
417 
418 	/* can't find a match in CNS --> UTF8 table or illegal UTF8 code */
419 	if (buflen < 3) {
420 		errno = E2BIG;
421 		return(0);
422 	}
423 
424         *(unsigned char*) buf     = UTF8_NON_ID_CHAR1;
425         *(unsigned char*) (buf+1) = UTF8_NON_ID_CHAR2;
426         *(unsigned char*) (buf+2) = UTF8_NON_ID_CHAR3;
427 
428 #ifdef DEBUG
429     fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
430 #endif
431 	return(3);
432 }
433 
434 
435 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
436 static int binsearch(unsigned long x, cns_utf v[], int n)
437 {
438 	int low, high, mid;
439 
440 	low = 0;
441 	high = n - 1;
442 	while (low <= high) {
443 		mid = (low + high) / 2;
444 		if (x < v[mid].cnscode)
445 			high = mid - 1;
446 		else if (x > v[mid].cnscode)
447 			low = mid + 1;
448 		else	/* found match */
449 			return mid;
450 	}
451 	return (-1);	/* no match */
452 }
453