xref: /illumos-gate/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_TW-euc.c (revision f642269fe771b10890afea92038f4531cd50cfd9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/isa_defs.h>
31 #include <errno.h>
32 #include "unicode_cns11643_TW.h"
33 #include "common_defs.h"
34 
/* Byte-level constants used when building zh_TW-euc (CNS 11643) output. */
#define	MSB	0x80	/* most significant bit; OR-ed into every CNS byte */
#define	MBYTE	0x8e	/* lead byte of a 4-byte (plane 2..16) character */
#define	PMASK	0xa0	/* base added to the plane number (2nd output byte) */
#define	ONEBYTE	0xff	/* keeps the rightmost byte of a value */

#define	NON_ID_CHAR '?'	/* non-identified character */

/* Inclusive Unicode range that is mapped arithmetically, without the table. */
#define	Low_UDA_In_Unicode 0xF0000
#define	High_UDA_In_Unicode 0xF8A10
44 
45 typedef struct _icv_state {
46 	char	keepc[6];	/* maximum # byte of UTF8 code */
47 	short	ustate;
48 	int	_errno;		/* internal errno */
49         boolean little_endian;
50         boolean bom_written;
51 } _iconv_st;
52 
/*
 * Input parser states used by _icv_iconv():
 *   U0      - between characters
 *   U1      - expecting the 2nd byte of a 2-byte character
 *   U2, U3  - expecting the 2nd / 3rd byte of a 3-byte UTF-8 character
 *   U4      - a complete character has been decoded and must be emitted
 *   U5..U7  - expecting the 2nd / 3rd / 4th byte of a 4-byte UTF-8 character
 */
enum _USTATE	{ U0, U1, U2, U3, U4, U5, U6, U7 };
54 
55 static int get_plane_no_by_utf(uint_t, int *, unsigned long *);
56 static int utf8_to_cns(int, int, unsigned long, char *, size_t, int *);
57 static int binsearch(unsigned long, utf_cns[], int);
58 
59 /*
60  * Open; called from iconv_open()
61  */
62 void *
_icv_open()63 _icv_open()
64 {
65 	_iconv_st *st;
66 
67 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
68 		errno = ENOMEM;
69 		return ((void *) -1);
70 	}
71 
72 	st->ustate = U0;
73 	st->_errno = 0;
74         st->little_endian = false;
75         st->bom_written = false;
76 #if defined(UCS_2LE)
77         st->little_endian = true;
78         st->bom_written = true;
79 #endif
80 	return ((void *) st);
81 }
82 
83 
84 /*
85  * Close; called from iconv_close()
86  */
87 void
_icv_close(_iconv_st * st)88 _icv_close(_iconv_st *st)
89 {
90 	if (!st)
91 		errno = EBADF;
92 	else
93 		free(st);
94 }
95 
96 
97 /*
98  * Actual conversion; called from iconv()
99  */
100 /*=========================================================
101  *
102  *       State Machine for interpreting UTF8 code
103  *
104  *=========================================================
105  *                          2nd byte  3rd byte 4th byte
106  *          +----->------->------->U5----->U6----------->U7
107  *          |                                            |
108  *          |    3 byte unicode                          |
109  *          +----->------->-------+                      |
110  *          |                     |                      |
111  *          ^                     v                      |
112  *          |  2 byte             U2 ---> U3             |
113  *          |  unicode                    v              |
114  * +------> U0 -------> U1                +-------->U4---+
115  * ^  ascii |           |                           ^    |
116  * |        |           +-------->--------->--------+    |
117  * |        v                                            v
118  * +----<---+-----<------------<------------<------------+
119  *
120  *=========================================================*/
121 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)122 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
123 				char **outbuf, size_t *outbytesleft)
124 {
125 	int		plane_no, n, unidx;
126 	unsigned long	cnscode;
127         uint_t		ucs;
128 	int		uconv_num = 0;
129 	int		utf8_len = 0;
130 
131 #ifdef DEBUG
132     fprintf(stderr, "==========     iconv(): UTF2 --> CNS11643     ==========\n");
133 #endif
134 	if (st == NULL) {
135 		errno = EBADF;
136 		return ((size_t) -1);
137 	}
138 
139 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
140 		st->ustate = U0;
141 		st->_errno = 0;
142 		return ((size_t) 0);
143 	}
144 
145 	st->_errno = 0;		/* reset internal errno */
146 	errno = 0;		/* reset external errno */
147 
148 	/* a state machine for interpreting UTF8 code */
149 	while (*inbytesleft > 0 && *outbytesleft > 0) {
150 
151 	        uchar_t  first_byte;
152 		int	 uconv_num_internal = 0;
153 
154 		switch (st->ustate) {
155 		case U0:		/* assuming ASCII in the beginning */
156 	               /*
157 			* Code converion for UCS-2LE to support Samba
158 			*/
159 		        if (st->little_endian) {
160 			  st->ustate = U1;
161 			  st->keepc[0] = **inbuf;
162 			}
163 			else if ((**inbuf & MSB) == 0) {	/* ASCII */
164 				**outbuf = **inbuf;
165 				(*outbuf)++;
166 				(*outbytesleft)--;
167 			} else {	/* Chinese character 0xc2..0xdf */
168 				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode */
169 
170 				        /* invalid sequence if the first char is either 0xc0 or 0xc1 */
171 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
172 				            st->_errno = errno = EILSEQ;
173 				        else {
174 					    st->ustate = U1;
175 					    st->keepc[0] = **inbuf;
176 					}
177 				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
178 					st->ustate = U2;
179 					st->keepc[0] = **inbuf;
180 				} else {
181 				        /* four bytes of UTF-8 sequences */
182 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
183 					    st->_errno = errno = EILSEQ;
184 				        else {
185 					    st->ustate = U5;
186 					    st->keepc[0] = **inbuf;
187 					}
188 				}
189 			}
190 			break;
191 		case U1:		/* 2 byte unicode */
192 			if ((**inbuf & 0xc0) == 0x80 || st->little_endian) {
193 				utf8_len = 2;
194 				st->keepc[1] = **inbuf;
195 
196 				/*
197 				 * Code conversion for UCS-2LE to support Samba
198 				 */
199 			        if  (st->little_endian) {
200 				  /*
201 				   * It's ASCII
202 				   */
203 				  if (st->keepc[1] == 0 && (st->keepc[0] & 0x80) == 0) {
204 				    *(*outbuf)++ = st->keepc[0];
205 				    (*outbytesleft)--;
206 				    st->ustate = U0;
207 				    break;
208 				  }
209 
210 				  ucs = ((st->keepc[1] & 0xff)<< 8) | (st->keepc[0] & 0xff);
211 
212 				} else
213 				  convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
214 
215 				st->ustate = U4;
216 #ifdef DEBUG
217     fprintf(stderr, "UTF8: %02x%02x   --> ",
218 	st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
219 #endif
220 				continue;	/* should not advance *inbuf */
221 			} else {
222 				st->_errno = errno = EILSEQ;
223 			}
224 			break;
225 		case U2:		/* 3 byte unicode - 2nd byte */
226 
227 		        first_byte = st->keepc[0];
228 
229 		        /* if the first byte is 0xed, it is illegal sequence if the second
230 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
231 			 */
232 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
233 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
234 				st->_errno = errno = EILSEQ;
235 			else {
236 				st->ustate = U3;
237 				st->keepc[1] = **inbuf;
238 			}
239 			break;
240 		case U3:		/* 3 byte unicode - 3rd byte */
241 			if ((**inbuf & 0xc0) == 0x80) {
242 				st->ustate = U4;
243 				utf8_len = 3;
244 				st->keepc[2] = **inbuf;
245 
246 			        convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
247 #ifdef DEBUG
248     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
249 		st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
250 #endif
251 				continue;	/* should not advance *inbuf */
252 			} else {
253 				st->_errno = errno = EILSEQ;
254 			}
255 			break;
256 		case U4:
257 
258 		       /* 0xfffe and 0xffff should not be allowed */
259 		       if ( ucs == 0xFFFE || ucs == 0xFFFF ) {
260 			       st->_errno = errno = EILSEQ;
261 			       break;
262 			}
263 
264 			plane_no = get_plane_no_by_utf(ucs, &unidx, &cnscode);
265 
266 /* comment these lines to ignore the invalid CNS
267 			if (plane_no < 0) {
268 				st->_errno = errno = EILSEQ;
269 				break;
270 			}
271 */
272 
273 			n = utf8_to_cns(plane_no, unidx, cnscode,
274 					*outbuf, *outbytesleft, &uconv_num_internal);
275 			if (n > 0) {
276 				(*outbuf) += n;
277 				(*outbytesleft) -= n;
278 
279 				uconv_num += uconv_num_internal;
280 
281 				st->ustate = U0;
282 			} else {
283 				st->_errno = errno = E2BIG;
284 			}
285 			break;
286 		 case U5:
287 
288 		        first_byte = st->keepc[0];
289 
290 		        /* if the first byte is 0xf0, it is illegal sequence if
291 			 * the second one is between 0x80 and 0x8f
292 			 * for Four-Byte UTF: U+10000..U+10FFFF
293 			 */
294 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
295 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
296 		            st->_errno = errno = EILSEQ;
297 		        else {
298 			    st->ustate = U6;
299 			    st->keepc[1] = **inbuf;
300 			}
301 		        break;
302 		 case U6:
303 		        if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */
304 		          {
305 			     st->ustate = U7;
306 			     st->keepc[2] = **inbuf;
307 			  }
308 		        else
309 		          st->_errno = errno = EILSEQ;
310 		        break;
311 		 case U7:
312 		        if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */
313 		          {
314 			     st->keepc[3] = **inbuf;
315 			     utf8_len = 4;
316 
317 			     convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
318 			     st->ustate = U4;
319 
320 			     continue;
321 			  }
322 		        else
323 		          st->_errno = errno = EILSEQ;
324 		        break;
325 		default:			/* should never come here */
326 			st->_errno = errno = EILSEQ;
327 			st->ustate = U0;	/* reset state */
328 			break;
329 		}
330 
331 		if (st->_errno) {
332 #ifdef DEBUG
333     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
334 		st->_errno, st->ustate);
335 #endif
336 			break;
337 		}
338 
339 		(*inbuf)++;
340 		(*inbytesleft)--;
341 	}
342 
343         if (*inbytesleft == 0 && st->ustate != U0)
344                 errno = EINVAL;
345 
346 
347 	if (*inbytesleft > 0 && *outbytesleft == 0)
348 		errno = E2BIG;
349 
350 	if (errno) {
351 		int num_reversed_bytes = 0;
352 
353 		switch (st->ustate) {
354 		 case U1:
355 		   num_reversed_bytes = 1;
356 		   break;
357 		 case U2:
358 		   num_reversed_bytes = 1;
359 		   break;
360 		 case U3:
361 		   num_reversed_bytes = 2;
362 		   break;
363 		 case U4:
364 		   num_reversed_bytes = utf8_len - 1;
365 		   break;
366 		 case U5:
367 		   num_reversed_bytes = 1;
368 		   break;
369 		 case U6:
370 		   num_reversed_bytes = 2;
371 		   break;
372 		 case U7:
373 		   num_reversed_bytes = 3;
374 		   break;
375 		}
376 
377 		/*
378 		 * if error, *inbuf points to the byte following the last byte
379 		 * successfully used in the conversion.
380 		 */
381 		*inbuf -= num_reversed_bytes;
382 		*inbytesleft += num_reversed_bytes;
383 		st->ustate = U0;
384 		return ((size_t) -1);
385 	}
386 
387 	return uconv_num;
388 }
389 
390 /*
391  * Get plane number by UTF8 code; i.e. plane #1 returns 1, #2 returns 2, etc.
392  * Returns -1 on error conditions
393  *
394  * Since binary search of the UTF8 to CNS table is necessary, might as well
395  * return index and CNS code matching to the unicode.
396  */
get_plane_no_by_utf(uint_t unicode,int * unidx,unsigned long * cnscode)397 static int get_plane_no_by_utf(uint_t unicode,
398 			int *unidx, unsigned long *cnscode)
399 {
400 	int		ret;
401 
402         /* test whether it belongs to private Unicode plane 15 */
403         if (unicode >= Low_UDA_In_Unicode && unicode <= High_UDA_In_Unicode)
404            {
405 	       uint_t  internIdx = (uint_t)(unicode - Low_UDA_In_Unicode);
406 	       uchar_t byte1, byte2;
407 
408 	       byte1 = 0xa1 + (internIdx % 8836) / 94;
409 	       byte2 = 0xa1 + internIdx % 94;
410 	       *cnscode = ((byte1 << 8) & 0xff00) | (byte2 & 0xff);
411 
412 	       *unidx = 1; /* deceit the utf8_to_cns() */
413 
414 	       ret = 12 + internIdx / 8836;
415 	       /* actually it belongs to CNS plane 16, so change it */
416 	       if ( ret == 15 ) ++ret;
417 
418 	       return ret;
419            }
420 
421 
422 	*unidx = binsearch(unicode, utf_cns_tab, MAX_UTF_NUM);
423 	if ((*unidx) >= 0)
424 		*cnscode = utf_cns_tab[*unidx].cnscode;
425 	else
426 		return(0);	/* match from UTF8 to CNS not found */
427 #ifdef DEBUG
428     fprintf(stderr, "Unicode=%04x, idx=%5d, CNS=%x ", unicode, *unidx, *cnscode);
429 #endif
430 
431 	ret = (int) (*cnscode >> 16);
432 	switch (ret) {
433 	case 0x21:	/* 0x8EA1 - G */
434 	case 0x22:	/* 0x8EA2 - H */
435 	case 0x23:	/* 0x8EA3 - I */
436 	case 0x24:	/* 0x8EA4 - J */
437 	case 0x25:	/* 0x8EA5 - K */
438 	case 0x26:	/* 0x8EA6 - L */
439 	case 0x27:	/* 0x8EA7 - M */
440 	case 0x28:	/* 0x8EA8 - N */
441 	case 0x29:	/* 0x8EA9 - O */
442 	case 0x2a:	/* 0x8EAA - P */
443 	case 0x2b:	/* 0x8EAB - Q */
444 	case 0x2c:	/* 0x8EAC - R */
445 	case 0x2d:	/* 0x8EAD - S */
446 	case 0x2e:	/* 0x8EAE - T */
447 	case 0x2f:	/* 0x8EAF - U */
448 	case 0x30:	/* 0x8EB0 - V */
449 		return (ret - 0x20);	/* so that we can use GET_PLANEC() */
450 	default:
451 		return (-1);
452 	}
453 }
454 
455 
456 /*
457  * ISO/IEC 10646 (Unicode) --> ISO 2022-7
458  * Unicode --> UTF8 (FSS-UTF)
459  *             (File System Safe Universal Character Set Transformation Format)
460  * Return: > 0 - converted with enough space in output buffer
461  *         = 0 - no space in outbuf
462  */
utf8_to_cns(int plane_no,int unidx,unsigned long cnscode,char * buf,size_t buflen,int * uconv_num)463 static int utf8_to_cns(int plane_no, int unidx, unsigned long cnscode,
464 						    char *buf, size_t buflen, int *uconv_num)
465 {
466 	unsigned long	val;		/* CNS 11643 value */
467 	unsigned char	c1 = 0, c2 = 0, cns_str[5];
468 	int		ret_size;
469 
470 	if (unidx < 0) {	/* no match from UTF8 to CNS 11643 */
471 		if ( buflen < 2 ) goto err;
472 		*buf = *(buf+1) = NON_ID_CHAR;
473 
474 		/* non-identical conversion */
475 		*uconv_num = 1;
476 
477 		ret_size = 2;
478 	} else {
479 		val = cnscode & 0xffff;
480 		c1 = ((val & 0xff00) >> 8) | MSB;
481 		c2 = (val & 0xff) | MSB;
482 	}
483 
484 	switch (plane_no) {
485 	case 1:
486 		if ( buflen < 2) goto err;
487 		*buf = cns_str[0] = c1;
488 		*(buf+1) = cns_str[1] = c2;
489 		cns_str[2] = cns_str[3] = cns_str[4] = '\0';
490 		ret_size = 2;
491 		break;
492 	case 2:
493 	case 3:
494 	case 4:
495 	case 5:
496 	case 6:
497 	case 7:
498 	case 8:
499 	case 9:
500 	case 10:
501 	case 11:
502 	case 12:
503 	case 13:
504 	case 14:
505 	case 15:
506 	case 16:
507 		if ( buflen < 4) goto err;
508 		*(unsigned char*) buf = cns_str[0] = MBYTE;
509 		*(buf+1) = cns_str[1] = PMASK + plane_no;
510 		*(buf+2) = cns_str[2] = c1;
511 		*(buf+3) = cns_str[3] = c2;
512 		cns_str[4] = '\0';
513 		ret_size = 4;
514 		break;
515 	}
516 
517 #ifdef DEBUG
518     fprintf(stderr, "\t#%d ->%s<-\n", plane_no, cns_str);
519 #endif
520 
521 	return(ret_size);
522 
523 err:
524 	errno = E2BIG;
525 	return 0;
526 }
527 
528 
529 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,utf_cns v[],int n)530 static int binsearch(unsigned long x, utf_cns v[], int n)
531 {
532 	int low, high, mid;
533 
534 	low = 0;
535 	high = n - 1;
536 	while (low <= high) {
537 		mid = (low + high) / 2;
538 		if (x < v[mid].unicode)
539 			high = mid - 1;
540 		else if (x > v[mid].unicode)
541 			low = mid + 1;
542 		else	/* found match */
543 			return mid;
544 	}
545 	return (-1);	/* no match */
546 }
547