xref: /illumos-gate/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_TW-iso2022-7.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <errno.h>
31 #include "unicode_cns11643_TW.h"	/* UTF8 to CNS 11643 mapping table */
32 #include "common_defs.h"
33 
/*
 * Byte-level constants used by the converter.  MBYTE and PMASK are only
 * referenced by the DEBUG trace in utf8_to_iso().
 */
34 #define	MSB	0x80	/* most significant bit; clear in a UTF-8 ASCII byte */
35 #define	MBYTE	0x8e	/* lead byte of the 4 byte form built for the DEBUG trace */
36 #define	PMASK	0xa0	/* added to the plane number for the DEBUG trace */
37 #define ONEBYTE	0xff	/* mask keeping the right most byte */
38 
39 #define SI	0x0f	/* shift in: written before ASCII output */
40 #define SO	0x0e	/* shift out: written before CNS 11643 output */
41 #define ESC	0x1b	/* escape: starts the "ESC $ ) F" plane designation */
42 
/*
 * Final byte F of the "ESC $ ) F" designation, indexed by the plane number
 * returned by get_plane_no_by_utf() (1..16).  Index 0 is a placeholder.
 * The commented-out table below is an earlier set of final bytes.
 */
43 /* static const char plane_char[] = "0GH23456789:;<=>?"; */
44 static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
45 
/* GET_PLANEC(i): designation final byte for plane i; i is not range checked */
46 #define	GET_PLANEC(i)	(plane_char[i])
47 
48 #define NON_ID_CHAR '?'	/* written twice for a code with no CNS mapping */
49 
/*
 * Conversion state allocated by _icv_open() and carried between calls to
 * _icv_iconv().
 */
50 typedef struct _icv_state {
51 	char	keepc[6];	/* bytes of the UTF-8 sequence being assembled */
52 	short	cstate;		/* C0 while no CNS plane is designated */
53 	short	istate;		/* IN after SI (ASCII), OUT after SO (CNS) */
54 	short	ustate;		/* UTF-8 decoder state, U0..U7 */
55 	int	_errno;		/* internal errno */
56 } _iconv_st;
57 
/* C0: no plane designated yet; non-C0: an "ESC $ ) F" has been emitted */
58 enum _CSTATE	{ C0, C1 };
/* IN: output is shifted in (ASCII); OUT: output is shifted out (CNS) */
59 enum _ISTATE	{ IN, OUT };
/*
 * U0: expecting a lead byte; U1: 2nd byte of a 2 byte sequence;
 * U2/U3: 2nd/3rd byte of a 3 byte sequence; U4: sequence complete, convert;
 * U5/U6/U7: 2nd/3rd/4th byte of a 4 byte sequence.
 */
60 enum _USTATE	{ U0, U1, U2, U3, U4, U5, U6, U7 };
61 
62 
63 static int get_plane_no_by_utf(const char, const char, int *, unsigned long *);
64 static int utf8_to_iso(int, int, unsigned long, char *, size_t);
65 static int binsearch(unsigned long, utf_cns[], int);
66 
67 /*
68  * Open; called from iconv_open()
69  */
70 void *
_icv_open()71 _icv_open()
72 {
73 	_iconv_st *st;
74 
75 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
76 		errno = ENOMEM;
77 		return ((void *) -1);
78 	}
79 
80 	st->cstate = C0;
81 	st->istate = IN;
82 	st->ustate = U0;
83 	st->_errno = 0;
84 
85 #ifdef DEBUG
86     fprintf(stderr, "==========     iconv(): UTF2 --> ISO2022-7     ==========\n");
87 #endif
88 
89 	return ((void *) st);
90 }
91 
92 
93 /*
94  * Close; called from iconv_close()
95  */
96 void
_icv_close(_iconv_st * st)97 _icv_close(_iconv_st *st)
98 {
99 	if (!st)
100 		errno = EBADF;
101 	else
102 		free(st);
103 }
104 
105 
106 /*
107  * Actual conversion; called from iconv()
108  */
109 /*=========================================================
110  *
111  *       State Machine for interpreting UTF8 code
112  *
113  *=========================================================
114  *                         2nd byte   3rd byte 4th byte
115  *          +----->------->------->U5------>U6--------->U7
116  *          |                                            |
117  *          |     3 byte unicode                         |
118  *          +----->------->-------+                      |
119  *          |                     |                      |
120  *          ^                     v                      |
121  *          |  2 byte             U2 ---> U3             |
122  *          |  unicode                    v              |
123  * +------> U0 -------> U1                +-------->U4---+
124  * ^  ascii |           |                           ^    |
125  * |        |           +-------->--------->--------+    |
126  * |        v                                            v
127  * +----<---+-----<------------<------------<------------+
128  *
129  *=========================================================*/
130 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)131 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
132 				char **outbuf, size_t *outbytesleft)
133 {
134 	char c1 = '\0', c2 = '\0';
135 	int		plane_no, n, unidx;
136 	/* pre_plane_no: need to be static when re-entry occurs on errno set */
137 	static int	pre_plane_no = -1;	/* previous plane number */
138 	unsigned long	cnscode;
139 
140 	if (st == NULL) {
141 		errno = EBADF;
142 		return ((size_t) -1);
143 	}
144 
145 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
146 		st->cstate = C0;
147 		st->istate = IN;
148 		st->ustate = U0;
149 		st->_errno = 0;
150 		return ((size_t) 0);
151 	}
152 
153 #ifdef DEBUG
154     fprintf(stderr, "=== (Re-entry)     iconv(): UTF-8 --> ISO 2022-7 ===\n");
155     fprintf(stderr, "st->cstate=%d\tst->istate=%d\tst->_errno=%d\tplane_no=%d\n",
156 	    st->cstate, st->istate, st->_errno, plane_no);
157 #endif
158 	st->_errno = 0;		/* reset internal errno */
159 	errno = 0;		/* reset external errno */
160 
161 	/* a state machine for interpreting UTF8 code */
162 	while (*inbytesleft > 0 && *outbytesleft > 0) {
163 
164 	        uchar_t  first_byte;
165 
166 		switch (st->ustate) {
167 		case U0:		/* assuming ASCII in the beginning */
168 			if ((**inbuf & MSB) == 0) {	/* ASCII */
169 				if (st->istate == OUT) {
170 					st->cstate = C0;
171 					st->istate = IN;
172 					**outbuf = SI;
173 					(*outbuf)++;
174 					(*outbytesleft)--;
175 					if (*outbytesleft <= 0) {
176 						errno = E2BIG;
177 						return((size_t) -1);
178 					}
179 				}
180 				**outbuf = **inbuf;
181 				(*outbuf)++;
182 				(*outbytesleft)--;
183 			} else {	/* Chinese character */
184 				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode 0xc2..0xdf */
185 
186 				        /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
187 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
188 				             st->_errno = errno = EILSEQ;
189 				        else {
190 					     st->ustate = U1;
191 					     st->keepc[0] = **inbuf;
192 					}
193 				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
194 					st->ustate = U2;
195 					st->keepc[0] = **inbuf;
196 				} else {
197 				        /* four bytes of UTF-8 sequences */
198 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
199 					     st->_errno = errno = EILSEQ;
200 				        else
201 				         {
202 					     st->ustate = U5;
203 					     st->keepc[0] = **inbuf;
204 					 }
205 				}
206 			}
207 			break;
208 		case U1:		/* 2 byte unicode */
209 			if ((**inbuf & 0xc0) == 0x80) {
210 				st->ustate = U4;
211 				st->keepc[1] = **inbuf;
212 				c1 = (st->keepc[0]&0x1c)>>2;
213 				c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
214 #ifdef DEBUG
215     fprintf(stderr, "UTF8: %02x%02x   --> ",
216 	st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
217 #endif
218 				continue;	/* should not advance *inbuf */
219 			} else {
220 				st->_errno = errno = EILSEQ;
221 			}
222 			break;
223 		case U2:		/* 3 byte unicode - 2nd byte */
224 
225 		        first_byte = st->keepc[0];
226 
227 		        /* if the first byte is 0xed, it is illegal sequence if the second
228 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
229 			 */
230 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
231 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
232 				st->_errno = errno = EILSEQ;
233 			else {
234 				st->ustate = U3;
235 				st->keepc[1] = **inbuf;
236 			}
237 			break;
238 		case U3:		/* 3 byte unicode - 3rd byte */
239 			if ((**inbuf & 0xc0) == 0x80) {
240 				st->ustate = U4;
241 				st->keepc[2] = **inbuf;
242 				c1 = ((st->keepc[0]&0x0f)<<4) |
243 					((st->keepc[1]&0x3c)>>2);
244 				c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
245 #ifdef DEBUG
246     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
247 		st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
248 #endif
249 				continue;	/* should not advance *inbuf */
250 			} else {
251 				st->_errno = errno = EILSEQ;
252 			}
253 			break;
254 		case U4:
255 			plane_no = get_plane_no_by_utf(c1, c2, &unidx, &cnscode);
256 		        if (plane_no == -2)
257 		         {  /* unicode is either 0xFFFE or 0xFFFF */
258 			    st->_errno = errno = EILSEQ;
259 			    break;
260 		         }
261 
262 			if (plane_no > 0) {	/* legal unicode; illegal CNS */
263 			if ((st->istate == IN) || (pre_plane_no != plane_no)) {
264 				if ((st->cstate == C0) ||
265 					(pre_plane_no != plane_no)) {
266 					/* change plane # in Chinese mode */
267 					if (st->cstate == C1) {
268 						**outbuf = SI;
269 						(*outbuf)++;
270 						(*outbytesleft)--;
271 					}
272 					if (*outbytesleft < 4) {
273 						st->_errno = errno = E2BIG;
274 						return((size_t) -1);
275 					}
276 					pre_plane_no = plane_no;
277 					st->cstate = C1;
278 					**outbuf = ESC;
279 					*(*outbuf+1) = '$';
280 					*(*outbuf+2) = ')';
281 					*(*outbuf+3) = GET_PLANEC(plane_no);
282 #ifdef DEBUG
283     fprintf(stderr, "\n\t\t\t\tESC $ ) %c\t", *(*outbuf+3));
284 #endif
285 					(*outbuf) += 4;
286 					(*outbytesleft) -= 4;
287 					if (*outbytesleft <= 0) {
288 						st->_errno = errno = E2BIG;
289 						return((size_t) -1);
290 					}
291 				}
292 				st->istate = OUT;
293 				**outbuf = SO;
294 				(*outbuf)++;
295 				(*outbytesleft)--;
296 			}
297 			}/* get_plane_no OK */
298 
299 			n = utf8_to_iso(plane_no, unidx, cnscode,
300 					*outbuf, *outbytesleft);
301 			if (n > 0) {
302 				(*outbuf) += n;
303 				(*outbytesleft) -= n;
304 			} else {
305 				st->_errno = errno;
306 				return((size_t) -1);
307 			}
308 			st->ustate = U0;
309 			st->_errno = 0;
310 			break;
311 	        case U5:
312 
313 		        first_byte = st->keepc[0];
314 
315 		        /* if the first byte is 0xed, it is illegal sequence if the second
316 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
317 			 */
318 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
319 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
320 				st->_errno = errno = EILSEQ;
321 			else {
322 				st->ustate = U6;
323 				st->keepc[1] = **inbuf;
324 			}
325 		        break;
326 	        case U6:
327 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
328 		          {
329 			     st->ustate = U7;
330 			     st->keepc[2] = **inbuf;
331 			  }
332 		        else
333 		             st->_errno = errno = EILSEQ;
334 		        break;
335 		case U7:
336 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
337 		          {  /* skip it to simplify */
338 			     st->ustate = U0;
339 			     st->_errno = 0;
340 			  }
341 		        else
342 		             st->_errno = errno = EILSEQ;
343 		        break;
344 		default:			/* should never come here */
345 			st->_errno = errno = EILSEQ;
346 			st->ustate = U0;	/* reset state */
347 			break;
348 		}
349 
350 		if (st->_errno) {
351 #ifdef DEBUG
352     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
353 		st->_errno, st->ustate);
354 #endif
355 			break;
356 		}
357 		(*inbuf)++;
358 		(*inbytesleft)--;
359 	}
360 
361 	if (errno)
362 		return((size_t) -1);
363 
364         if (*inbytesleft == 0 && st->ustate != U0) {
365 	        errno = EINVAL;
366 	        return ((size_t) -1);
367 	}
368 
369 	if (*inbytesleft > 0 && *outbytesleft == 0) {
370 		errno = E2BIG;
371 		return((size_t) -1);
372 	}
373 	return (*inbytesleft);
374 }
375 
376 
377 /*
378  * Get plane number by UTF8 code; i.e. plane #1 returns 1, #2 returns 2, etc.
379  * Returns -1 on error conditions and return -2 due to illegal sequence
380  *
381  * Since binary search of the UTF8 to CNS table is necessary, might as well
382  * return index and CNS code matching to the unicode.
383  */
get_plane_no_by_utf(const char c1,const char c2,int * unidx,unsigned long * cnscode)384 static int get_plane_no_by_utf(const char c1, const char c2,
385 			int *unidx, unsigned long *cnscode)
386 {
387 	int 		ret;
388 	unsigned long	unicode;
389 
390 	unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
391         /* the 0xfffe and 0xffff should not be allowed */
392 	if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -2;
393 
394 	*unidx = binsearch(unicode, utf_cns_tab, MAX_UTF_NUM);
395 	if ((*unidx) >= 0)
396 		*cnscode = utf_cns_tab[*unidx].cnscode;
397 	else
398 		return(0);	/* match from UTF8 to CNS not found */
399 #ifdef DEBUG
400     fprintf(stderr, "Unicode=%04x, idx=%5d, CNS=%x ", unicode, *unidx, *cnscode);
401 #endif
402 
403 	ret = (int) (*cnscode >> 16);
404 	switch (ret) {
405 	case 0x21:	/* 0x8EA1 - G */
406 	case 0x22:	/* 0x8EA2 - H */
407 	case 0x23:	/* 0x8EA3 - I */
408 	case 0x24:	/* 0x8EA4 - J */
409 	case 0x25:	/* 0x8EA5 - K */
410 	case 0x26:	/* 0x8EA6 - L */
411 	case 0x27:	/* 0x8EA7 - M */
412 	case 0x28:	/* 0x8EA8 - N */
413 	case 0x29:	/* 0x8EA9 - O */
414 	case 0x2a:	/* 0x8EAA - P */
415 	case 0x2b:	/* 0x8EAB - Q */
416 	case 0x2c:	/* 0x8EAC - R */
417 	case 0x2d:	/* 0x8EAD - S */
418 	case 0x2f:	/* 0x8EAF - U */
419 	case 0x30:	/* 0x8EB0 - V */
420 		return (ret - 0x20);	/* so that we can use GET_PLANEC() */
421 	case 0x2e:	/* 0x8EAE - T */
422 		return (3);		/* CNS 11643-1992 */
423 	default:
424 		return (-1);
425 	}
426 }
427 
428 
429 /*
430  * ISO/IEC 10646 (Unicode) --> ISO 2022-7
431  * Unicode --> UTF8 (FSS-UTF)
432  *             (File System Safe Universal Character Set Transformation Format)
433  * Return: > 0 - converted with enough space in output buffer
434  *         = 0 - no space in outbuf
435  */
utf8_to_iso(int plane_no,int unidx,unsigned long cnscode,char * buf,size_t buflen)436 static int utf8_to_iso(int plane_no, int unidx, unsigned long cnscode,
437 						    char *buf, size_t buflen)
438 {
439 	unsigned long	val;		/* CNS 11643 value */
440 #ifdef DEBUG
441     char	cns_str[5];
442 #endif
443 
444 	if (buflen < 2) {
445 		errno = E2BIG;
446 		return(0);
447 	}
448 
449 
450 	if (unidx < 0) {	/* no match from UTF8 to CNS 11643 */
451 	    *buf = *(buf+1) = NON_ID_CHAR;
452 	    return(2);
453 	} else {
454 		val = cnscode & 0xffff;
455 		*buf = (val & 0xff00) >> 8;
456 		*(buf+1) = val & 0xff;
457 	}
458 #ifdef DEBUG
459     fprintf(stderr, "\t%02x%02x\t", *buf, *(buf+1));
460 #endif
461 
462 #ifdef DEBUG
463     switch (plane_no) {
464     case 1:
465 	cns_str[0] = *buf | MSB;
466 	cns_str[1] = *(buf+1) | MSB;
467 	cns_str[2] = cns_str[3] = cns_str[4] = NULL;
468 	break;
469     case 2:
470     case 3:
471     case 4:
472     case 5:
473     case 6:
474     case 7:
475     case 8:
476     case 9:
477     case 10:
478     case 11:
479     case 12:
480     case 13:
481     case 14:
482     case 15:
483     case 16:
484 	cns_str[0] = MBYTE;
485 	cns_str[1] = (char) PMASK + plane_no;
486 	cns_str[2] = (char) *buf | MSB;
487 	cns_str[3] = (char) *(buf+1) | MSB;
488 	cns_str[4] = NULL;
489 	break;
490     }
491 
492     fprintf(stderr, "#%d ->%s<-\n", plane_no, cns_str);
493 #endif
494 	return(2);
495 }
496 
497 
498 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,utf_cns v[],int n)499 static int binsearch(unsigned long x, utf_cns v[], int n)
500 {
501 	int low, high, mid;
502 
503 	low = 0;
504 	high = n - 1;
505 	while (low <= high) {
506 		mid = (low + high) / 2;
507 		if (x < v[mid].unicode)
508 			high = mid - 1;
509 		else if (x > v[mid].unicode)
510 			low = mid + 1;
511 		else	/* found match */
512 			return mid;
513 	}
514 	return (-1);	/* no match */
515 }
516