xref: /illumos-gate/usr/src/lib/iconv_modules/zh/common/zh_TW-euc%UTF-8.c (revision f642269fe771b10890afea92038f4531cd50cfd9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1995, by Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <sys/types.h>
29 #include <sys/isa_defs.h>
30 #include <errno.h>
31 #include "common_defs.h"
32 #include "cns11643_unicode_TW.h"	/* CNS 11643 to UTF8 mapping table */
33 
#define	MSB	0x80	/* most significant bit */
#define	MBYTE	0x8e	/* multi-byte (4 byte character) lead byte */
#define	PMASK	0xa0	/* plane number mask */
#define	ONEBYTE	0xff	/* right most byte */
#define	MSB_OFF	0x7f	/* mask off MSB */
/*
 * True when v, viewed as an unsigned byte, lies in 0xA1..0xFE.
 * The argument is parenthesized so that the cast applies to the whole
 * expression; note that v is still evaluated twice.
 */
#define	VALID_EUC_BYTE(v) \
	(((uchar_t)(v)) >= 0xA1 && ((uchar_t)(v)) <= 0xFE)

/*
 * Non-identified character: the three bytes emitted when no mapping is
 * found (0xEF 0xBF 0xBD is the UTF-8 encoding of U+FFFD).
 */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD
45 
46 
47 typedef struct _icv_state {
48 	char	keepc[4];	/* maximum # byte of CNS11643 code */
49 	short	cstate;		/* state machine id */
50 	int	_errno;		/* internal errno */
51         boolean little_endian;
52         boolean bom_written;
53 } _iconv_st;
54 
/*
 * State machine ids.  The numeric distance from C0 equals the number of
 * input bytes already stored in keepc for the character in progress.
 */
enum _CSTATE {
	C0 = 0,		/* nothing pending */
	C1,		/* 1st byte stored */
	C2,		/* 1st and 2nd bytes stored */
	C3		/* 1st, 2nd and 3rd bytes stored */
};
56 
57 static int get_plane_no_by_char(const char);
58 static int cns_to_utf8(int, _iconv_st *, char*, size_t, int *);
59 static int binsearch(unsigned long, cns_utf[], int);
60 static uint_t getUnicodeFromUDA(int, uchar_t, uchar_t);
61 
62 
63 /*
64  * Open; called from iconv_open()
65  */
66 void *
_icv_open()67 _icv_open()
68 {
69 	_iconv_st *st;
70 
71 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
72 		errno = ENOMEM;
73 		return ((void *) -1);
74 	}
75 
76 	st->cstate = C0;
77 	st->_errno = 0;
78 	st->little_endian = false;
79 	st->bom_written = false;
80 #if defined(UCS_2LE)
81 	st->little_endian = true;
82 	st->bom_written = true;
83 #endif
84 	return ((void *) st);
85 }
86 
87 
88 /*
89  * Close; called from iconv_close()
90  */
91 void
_icv_close(_iconv_st * st)92 _icv_close(_iconv_st *st)
93 {
94 	if (!st)
95 		errno = EBADF;
96 	else
97 		free(st);
98 }
99 
100 
101 /*
102  * Actual conversion; called from iconv()
103  */
104 /*=======================================================
105  *
106  *   State Machine for interpreting CNS 11643 code
107  *
108  *=======================================================
109  *
110  *                          plane 2 - 16
111  *                1st C         2nd C       3rd C
112  *    +------> C0 -----> C1 -----------> C2 -----> C3
113  *    |  ascii |  plane 1 |                   4th C |
114  *    ^        v  2nd C   v                         v
115  *    +----<---+-----<----+-------<---------<-------+
116  *
117  *=======================================================*/
118 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)119 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
120 				char **outbuf, size_t *outbytesleft)
121 {
122 	int		plane_no = 0, n;
123 	int		uconv_num = 0;
124 
125 #ifdef DEBUG
126     fprintf(stderr, "==========     iconv(): CNS11643 --> UTF2     ==========\n");
127 #endif
128 	if (st == NULL) {
129 		errno = EBADF;
130 		return ((size_t) -1);
131 	}
132 
133 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
134 		st->cstate = C0;
135 		st->_errno = 0;
136 		return ((size_t) 0);
137 	}
138 
139 	st->_errno = 0;         /* reset internal errno */
140 	errno = 0;		/* reset external errno */
141 
142 	/* a state machine for interpreting CNS 11643 code */
143 	while (*inbytesleft > 0 && *outbytesleft > 0) {
144 		switch (st->cstate) {
145 		case C0:		/* assuming ASCII in the beginning */
146 			if (**inbuf & MSB) {
147 			   if (((uchar_t)**inbuf) == MBYTE || VALID_EUC_BYTE(**inbuf)) {
148 				st->keepc[0] = (**inbuf);
149 				st->cstate = C1;
150 			   } else
151 			        st->_errno = errno = EILSEQ;
152 			} else {	/* real ASCII */
153 			  /*
154 			   * Code conversion for UCS-2LE to support Samba
155 			   */
156 			  if (st->little_endian) {
157 			    if (!st->bom_written) {
158 			      if (*outbytesleft < 4)
159 				errno = E2BIG;
160 			      else {
161 				*(*outbuf)++ = (uchar_t)0xff;
162 				*(*outbuf)++ = (uchar_t)0xfe;
163 				*outbytesleft -= 2;
164 
165 				st->bom_written = true;
166 			      }
167 			    }
168 
169 			    if (*outbytesleft < 2)
170 			      errno = E2BIG;
171 			    else {
172 			      *(*outbuf)++ = **inbuf;
173 			      *(*outbuf)++ = (uchar_t)0x0;
174 			      *outbytesleft -= 2;
175 			    }
176 			  } else {
177 				**outbuf = **inbuf;
178 				(*outbuf)++;
179 				(*outbytesleft)--;
180 			  }
181 			}
182 			break;
183 		case C1:		/* Chinese characters: 2nd byte */
184 			if (((uchar_t)st->keepc[0]) == MBYTE) {
185 				plane_no = get_plane_no_by_char(**inbuf);
186 				if (plane_no == -1) {	/* illegal plane */
187 					st->_errno = errno = EILSEQ;
188 				} else {
189 					st->keepc[1] = (**inbuf);
190 					st->cstate = C2;
191 				}
192 			} else {
193 				if (VALID_EUC_BYTE(**inbuf)) {	/* plane #1 */
194 					int uconv_num_internal = 0;
195 
196 					st->keepc[1] = (**inbuf);
197 					st->keepc[2] = st->keepc[3] = '\0';
198 					n = cns_to_utf8(1, st, *outbuf,
199 							*outbytesleft, &uconv_num_internal);
200 					if (n > 0) {
201 						(*outbuf) += n;
202 						(*outbytesleft) -= n;
203 
204 						uconv_num += uconv_num_internal;
205 
206 						st->cstate = C0;
207 					} else {	/* don't reset state */
208 						st->_errno = errno = E2BIG;
209 					}
210 				} else {	/* input char doesn't belong
211 						 * to the input code set
212 						 */
213 					st->_errno = errno = EILSEQ;
214 				}
215 			}
216 			break;
217 		case C2:	/* plane #2 - #16 (4 bytes): get 3nd byte */
218 			if (VALID_EUC_BYTE(**inbuf)) {	/* 3rd byte */
219 				st->keepc[2] = (**inbuf);
220 				st->cstate = C3;
221 			} else {
222 				st->_errno = errno = EILSEQ;
223 			}
224 			break;
225 		case C3:	/* plane #2 - #16 (4 bytes): get 4th byte */
226 			if (VALID_EUC_BYTE(**inbuf)) {	/* 4th byte */
227 				int uconv_num_internal = 0;
228 
229 				st->keepc[3] = (**inbuf);
230 				n = cns_to_utf8(plane_no, st, *outbuf,
231 						*outbytesleft, &uconv_num_internal);
232 				if (n > 0) {
233 					(*outbuf) += n;
234 					(*outbytesleft) -= n;
235 
236 					uconv_num += uconv_num_internal;
237 
238 					st->cstate = C0;	/* reset state */
239 				} else {	/* don't reset state */
240 					st->_errno = errno = E2BIG;
241 				}
242 			} else {
243 				st->_errno = errno = EILSEQ;
244 			}
245 			break;
246 		default:			/* should never come here */
247 			st->_errno = errno = EILSEQ;
248 			st->cstate = C0;	/* reset state */
249 			break;
250 		}
251 
252 		if (st->_errno) {
253 #ifdef DEBUG
254     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
255 		st->_errno, st->cstate);
256 #endif
257 			break;
258 		}
259 
260 		(*inbuf)++;
261 		(*inbytesleft)--;
262 	}
263 
264         if (*inbytesleft == 0 && st->cstate != C0)
265                 errno = EINVAL;
266 
267 	if (*inbytesleft > 0 && *outbytesleft == 0)
268 		errno = E2BIG;
269 
270 	if (errno) {
271 		/*
272 		 * if error, *inbuf points to the byte following the last byte
273 		 * successfully used in the conversion.
274 		 */
275 		*inbuf -= (st->cstate - C0);
276 		*inbytesleft += (st->cstate - C0);
277 		st->cstate = C0;
278 		return ((size_t) -1);
279 	}
280 
281 	return  uconv_num;
282 }
283 
284 
285 /*
286  * Get plane number by char; i.e. 0xa2 returns 2, 0xae returns 14, etc.
287  * Returns -1 on error conditions
288  */
get_plane_no_by_char(const char inbuf)289 static int get_plane_no_by_char(const char inbuf)
290 {
291 	int ret;
292 	unsigned char uc = (unsigned char) inbuf;
293 
294 	ret = uc - PMASK;
295 	switch (ret) {
296 	case 1:		/* 0x8EA1 */
297 	case 2:		/* 0x8EA2 */
298 	case 3:		/* 0x8EA3 */
299 	case 4:		/* 0x8EA4 */
300 	case 5:		/* 0x8EA5 */
301 	case 6:		/* 0x8EA6 */
302 	case 7:		/* 0x8EA7 */
303 	case 12:	/* 0x8EAC */
304         case 13:	/* 0x8EAD */
305 	case 14:	/* 0x8EAE */
306 	case 15:	/* 0x8EAF */
307 	case 16:	/* 0x8EB0 */
308 		return (ret);
309 	default:
310 		return (-1);
311 	}
312 }
313 
314 
315 /*
316  * CNS 11643 code --> ISO/IEC 10646 (Unicode)
317  * Unicode --> UTF8 (FSS-UTF)
318  *             (File System Safe Universal Character Set Transformation Format)
319  * Return: > 0 - converted with enough space in output buffer
320  *         = 0 - no space in outbuf
321  */
cns_to_utf8(int plane_no,_iconv_st * st,char * buf,size_t buflen,int * uconv_num)322 static int cns_to_utf8(int plane_no, _iconv_st *st, char *buf, size_t buflen, int *uconv_num)
323 {
324 	char		cns_str[3];
325 	unsigned long	cns_val;	/* MSB mask off CNS 11643 value */
326 	int		unidx;		/* Unicode index */
327 	unsigned long	uni_val = 0;	/* Unicode */
328 	char            *keepc = st->keepc;
329 
330 #ifdef DEBUG
331     fprintf(stderr, "%s %d ", keepc, plane_no);
332 #endif
333 	if (plane_no == 1) {
334 		cns_str[0] = keepc[0] & MSB_OFF;
335 		cns_str[1] = keepc[1] & MSB_OFF;
336 	} else {
337 		cns_str[0] = keepc[2] & MSB_OFF;
338 		cns_str[1] = keepc[3] & MSB_OFF;
339 	}
340 	cns_val = (cns_str[0] << 8) + cns_str[1];
341 #ifdef DEBUG
342     fprintf(stderr, "%x\t", cns_val);
343 #endif
344 
345 	switch (plane_no) {
346 	case 1:
347 		unidx = binsearch(cns_val, cns1_utf_tab, MAX_CNS1_NUM);
348 		if (unidx >= 0)
349 			uni_val = cns1_utf_tab[unidx].unicode;
350 		break;
351 	case 2:
352 		unidx = binsearch(cns_val, cns2_utf_tab, MAX_CNS2_NUM);
353 		if (unidx >= 0)
354 			uni_val = cns2_utf_tab[unidx].unicode;
355 		break;
356 	case 3:
357 		unidx = binsearch(cns_val, cns3_utf_tab, MAX_CNS3_NUM);
358 		if (unidx >= 0)
359 			uni_val = cns3_utf_tab[unidx].unicode;
360 		break;
361 	case 4:
362 		unidx = binsearch(cns_val, cns4_utf_tab, MAX_CNS4_NUM);
363 		if (unidx >= 0)
364 			uni_val = cns4_utf_tab[unidx].unicode;
365 		break;
366 	case 5:
367 		unidx = binsearch(cns_val, cns5_utf_tab, MAX_CNS5_NUM);
368 		if (unidx >= 0)
369 			uni_val = cns5_utf_tab[unidx].unicode;
370 		break;
371 	case 6:
372 		unidx = binsearch(cns_val, cns6_utf_tab, MAX_CNS6_NUM);
373 		if (unidx >= 0)
374 			uni_val = cns6_utf_tab[unidx].unicode;
375 		break;
376 	case 7:
377 		unidx = binsearch(cns_val, cns7_utf_tab, MAX_CNS7_NUM);
378 		if (unidx >= 0)
379 			uni_val = cns7_utf_tab[unidx].unicode;
380 		break;
381 	case 12:
382 	case 13:
383 	case 14:
384 	case 16:
385 	        uni_val = getUnicodeFromUDA(plane_no, (uchar_t)keepc[2], (uchar_t)keepc[3]);
386 	        unidx = 1; /* deceit the following if statement */
387 		break;
388 	case 15:
389 		unidx = binsearch(cns_val, cns15_utf_tab, MAX_CNS15_NUM);
390 		if (unidx >= 0)
391 			uni_val = cns15_utf_tab[unidx].unicode;
392 		break;
393 	default:
394 		unidx = -1;	/* no mapping from CNS to UTF8 */
395 		break;
396 	}
397 
398 #ifdef DEBUG
399     fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
400 #endif
401 
402         /*
403 	 * Code version for UCS-2LE to support Samba
404 	 */
405         if (st->little_endian) {
406 	  int size = 0;
407 
408 	  if (unidx < 0 || uni_val > 0x00ffff ) {
409 	    uni_val = ICV_CHAR_UCS2_REPLACEMENT;
410 	    *uconv_num = 1;
411 	  }
412 
413 	  if (!st->bom_written) {
414 	    if (buflen < 4)
415 	      return 0;
416 
417 	    *(buf + size++) = (uchar_t)0xff;
418 	    *(buf + size++) = (uchar_t)0xfe;
419 	    st->bom_written = true;
420 	  }
421 
422 	  if (buflen < 2)
423 	    return 0;
424 
425 	  *(buf + size++) = (uchar_t)(uni_val & 0xff);
426 	  *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff);
427 
428 	  return size;
429 	}
430 
431 	if (unidx >= 0) {	/* do Unicode to UTF8 conversion */
432 		if (uni_val >= 0x0080 && uni_val <= 0x07ff) {
433 			if (buflen < 2) {
434 #ifdef DEBUG
435     fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n");
436 #endif
437 				errno = E2BIG;
438 				return(0);
439 			}
440 			*buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
441 			*(buf+1) = (char)(uni_val & 0x3f) | 0x80;
442 #ifdef DEBUG
443     fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
444 #endif
445 			return(2);
446 		}
447 		if (uni_val >= 0x0800 && uni_val <= 0xffff) {
448 			if (buflen < 3) {
449 #ifdef DEBUG
450     fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n");
451 #endif
452 				errno = E2BIG;
453 				return(0);
454 			}
455 			*buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
456 			*(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
457 			*(buf+2) = (char)(uni_val & 0x3f) | 0x80;
458 #ifdef DEBUG
459     fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
460 #endif
461 			return(3);
462 		}
463 	        if (uni_val >= 0x10000 && uni_val <= 0x10ffff) {
464 		        if (buflen < 4) {
465 			     errno = E2BIG;
466 			     return(0);
467 			}
468 
469 		        *buf = (char)((uni_val >> 18) & 0x7) | 0xf0;
470 		        *(buf+1) = (char)((uni_val >> 12) & 0x3f) | 0x80;
471 		        *(buf+2) = (char)((uni_val >>6) & 0x3f) | 0x80;
472 		        *(buf+3) = (char)(uni_val & 0x3f) | 0x80;
473 		        return(4);
474 		}
475 	}
476 
477 	/* can't find a match in CNS --> UTF8 table or illegal UTF8 code */
478 	if (buflen < 3) {
479 #ifdef DEBUG
480     fprintf(stderr, "outbuf overflow in cns_to_utf8()!!\n");
481 #endif
482 		errno = E2BIG;
483 		return(0);
484 	}
485 
486         *(unsigned char*) buf     = UTF8_NON_ID_CHAR1;
487         *(unsigned char*) (buf+1) = UTF8_NON_ID_CHAR2;
488         *(unsigned char*) (buf+2) = UTF8_NON_ID_CHAR3;
489 
490 	/* non-identical conversion */
491 	*uconv_num = 1;
492 
493 #ifdef DEBUG
494     fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
495 #endif
496 	return(3);
497 }
498 
499 static uint_t
getUnicodeFromUDA(int plane_no,uchar_t byte1,uchar_t byte2)500 getUnicodeFromUDA(int plane_no, uchar_t byte1, uchar_t byte2)
501 {
502         uint_t ucs4, disp;
503 
504         /* compact into consecutive Unicode value for CNS plane 16 */
505         if ( plane_no == 16 ) --plane_no;
506 
507         disp = (plane_no - 12) * 8836 + (byte1 - 0xA1) * 94 + ( byte2 - 0xA1);
508         return (ucs4 = (0xf << 16) | (disp & 0xffff));
509 }
510 
511 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,cns_utf v[],int n)512 static int binsearch(unsigned long x, cns_utf v[], int n)
513 {
514 	int low, high, mid;
515 
516 	low = 0;
517 	high = n - 1;
518 	while (low <= high) {
519 		mid = (low + high) / 2;
520 		if (x < v[mid].cnscode)
521 			high = mid - 1;
522 		else if (x > v[mid].cnscode)
523 			low = mid + 1;
524 		else	/* found match */
525 			return mid;
526 	}
527 	return (-1);	/* no match */
528 }
529