xref: /titanic_51/usr/src/lib/iconv_modules/zh/common/zh_TW-big5%UTF-8.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/isa_defs.h>
31 #include <errno.h>
32 #include "common_defs.h"
33 #include "big5_unicode.h"	/* Big-5 to Unicode mapping table */
34 
35 #define	MSB	0x80	/* most significant bit */
36 #define	MBYTE	0x8e	/* multi-byte (4 byte character) */
37 #define	PMASK	0xa0	/* plane number mask */
38 #define ONEBYTE	0xff	/* right most byte */
39 
40 /* non-identified character */
41 #define UTF8_NON_ID_CHAR1 0xEF
42 #define UTF8_NON_ID_CHAR2 0xBF
43 #define UTF8_NON_ID_CHAR3 0xBD
44 
45 
46 typedef struct  _icv_state {
47 	char	keepc[2];	/* maximum # byte of Big-5 code */
48 	short	cstate;		/* state machine id */
49 	int	_errno;		/* internal errno */
50         boolean little_endian;
51         boolean bom_written;
52 }_iconv_st;
53 
/*
 * Decoder states.  _icv_iconv() uses (cstate - C0) as the number of input
 * bytes to give back on error, so C1 must stay exactly one above C0.
 */
enum _CSTATE {
	C0,	/* between characters: next byte is ASCII or a Big-5 lead byte */
	C1	/* lead byte saved: next byte must be a Big-5 second byte */
};
55 
56 static int big5_2nd_byte(char);
57 static int big5_to_utf8(_iconv_st *, char*, size_t, int *);
58 static int binsearch(unsigned long, big5_utf[], int);
59 
60 
61 /*
62  * Open; called from iconv_open()
63  */
64 void *
65 _icv_open()
66 {
67 	_iconv_st *st;
68 
69 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
70 		errno = ENOMEM;
71 		return ((void *) -1);
72 	}
73 
74 	st->cstate = C0;
75 	st->_errno = 0;
76 	st->little_endian = false;
77 	st->bom_written = false;
78 #if defined(UCS_2LE)
79 	st->little_endian = true;
80 	st->bom_written = true;
81 #endif
82 	return ((void *) st);
83 }
84 
85 
86 /*
87  * Close; called from iconv_close()
88  */
89 void
90 _icv_close(_iconv_st *st)
91 {
92 	if (!st)
93 		errno = EBADF;
94 	else
95 		free(st);
96 }
97 
98 
99 /*
100  * Actual conversion; called from iconv()
101  */
102 /*=======================================================
103  *
104  *   State Machine for interpreting Big-5 code
105  *
106  *=======================================================
107  *
108  *                     1st C
109  *    +--------> C0 ----------> C1
110  *    |    ascii |        2nd C |
111  *    ^          v              v
112  *    +----<-----+-----<--------+
113  *
114  *=======================================================*/
115 /*
116  * Big-5 encoding range:
117  *	High byte: 0xA1 - 0xFE			(   94 encoding space)
118  *	Low byte:  0x40 - 0x7E, 0xA1 - 0xFE	(  157 encoding space)
119  *	Plane #1:  0xA140 - 0xC8FE		( 6280 encoding space)
120  *	Plane #2:  0xC940 - 0xFEFE		( 8478 encoding space)
121  *	Total:	   94 * 157 = 14,758		(14758 encoding space)
122  */
123 size_t
124 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
125 				char **outbuf, size_t *outbytesleft)
126 {
127 	int		n;
128 	int		uconv_num = 0;
129 
130 #ifdef DEBUG
131     fprintf(stderr, "==========     iconv(): Big-5 --> UTF2     ==========\n");
132 #endif
133 	if (st == NULL) {
134 		errno = EBADF;
135 		return ((size_t) -1);
136 	}
137 
138 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
139 		st->cstate = C0;
140 		st->_errno = 0;
141 		return ((size_t) 0);
142 	}
143 
144 	st->_errno = 0;         /* reset internal errno */
145 	errno = 0;		/* reset external errno */
146 
147 	/* a state machine for interpreting CNS 11643 code */
148 	while (*inbytesleft > 0 && *outbytesleft > 0) {
149 		switch (st->cstate) {
150 		case C0:		/* assuming ASCII in the beginning */
151 			if (**inbuf & MSB) {
152 				st->keepc[0] = (**inbuf);
153 				st->cstate = C1;
154 			} else {	/* real ASCII */
155 			  if (st->little_endian) {
156 			    if (!st->bom_written) {
157 			      if (*outbytesleft < 4)
158 				errno = E2BIG;
159 			      else {
160 				*(*outbuf)++ = (uchar_t)0xff;
161 				*(*outbuf)++ = (uchar_t)0xfe;
162 				*outbytesleft -= 2;
163 
164 				st->bom_written = true;
165 			      }
166 			    }
167 
168 			    if (*outbytesleft < 2)
169 			      return E2BIG;
170 			    else {
171 			      *(*outbuf)++ = **inbuf;
172 			      *(*outbuf)++ = (uchar_t)0x0;
173 			      *outbytesleft -= 2;
174 			    }
175 			  } else {
176 				**outbuf = **inbuf;
177 				(*outbuf)++;
178 				(*outbytesleft)--;
179 			  }
180 			}
181 			break;
182 		case C1:		/* Chinese characters: 2nd byte */
183 			if (big5_2nd_byte(**inbuf) == 0) {
184 				int uconv_num_internal = 0;
185 
186 				st->keepc[1] = (**inbuf);
187 				n = big5_to_utf8(st, *outbuf,
188 						*outbytesleft, &uconv_num_internal);
189 				if (n > 0) {
190 					(*outbuf) += n;
191 					(*outbytesleft) -= n;
192 
193 					uconv_num += uconv_num_internal;
194 
195 					st->cstate = C0;
196 				} else {	/* don't reset state */
197 					st->_errno = errno = E2BIG;
198 				}
199 			} else {	/* input char doesn't belong
200 					 * to the input code set
201 					 */
202 				st->_errno = errno = EILSEQ;
203 			}
204 			break;
205 		default:			/* should never come here */
206 			st->_errno = errno = EILSEQ;
207 			st->cstate = C0;	/* reset state */
208 			break;
209 		}
210 
211 		if (st->_errno) {
212 #ifdef DEBUG
213     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
214 		st->_errno, st->cstate);
215 #endif
216 			break;
217 		}
218 
219 		(*inbuf)++;
220 		(*inbytesleft)--;
221 	}
222 
223         if (*inbytesleft == 0 && st->cstate != C0)
224                 errno = EINVAL;
225 
226 	if (*inbytesleft > 0 && *outbytesleft == 0)
227 		errno = E2BIG;
228 
229 	if (errno) {
230 		/*
231 		 * if error, *inbuf points to the byte following the last byte
232 		 * successfully used in the conversion.
233 		 */
234 		*inbuf -= (st->cstate - C0);
235 		*inbytesleft += (st->cstate - C0);
236 		st->cstate = C0;
237 		return ((size_t) -1);
238 	}
239 
240 	return uconv_num;
241 }
242 
243 
/*
 * Classify a byte as a possible second (trail) byte of a Big-5 character.
 * The byte is looked at as an unsigned value.
 * Return: = 0 - valid Big-5 2nd byte (0x40 - 0x7E or 0xA1 - 0xFE)
 *         = 1 - invalid Big-5 2nd byte
 */
static int big5_2nd_byte(char inbuf)
{
	unsigned char	b = (unsigned char) inbuf;
	int		in_low_range = (b >= 0x40 && b <= 0x7E);
	int		in_high_range = (b >= 0xA1 && b <= 0xFE);

	return ((in_low_range || in_high_range) ? 0 : 1);
}
259 
260 #ifdef UDC_SUPPORT
/*
 * One contiguous range of Big-5 codes that is mapped to Unicode by
 * arithmetic instead of through the lookup table.
 */
typedef struct _udc_sect {
	unsigned int start;	/* first Big-5 code of the range */
	unsigned int end;	/* last Big-5 code of the range */
	unsigned int count;	/* code points the range occupies in the output */
} UDC;

UDC udc[] = {
  { 0xFA40, 0xFEFE, 0x311 }
};

/*
 * Number of rows in udc[].  Computed at file scope, where "udc" still names
 * the array (inside ifUDC() the parameter of the same name hides it).
 */
enum { UDC_SECTIONS = sizeof (udc) / sizeof (udc[0]) };

#define UDC_START_UNICODE 0xF0000

/*
 * Map a Big-5 code that lies in one of the UDC ranges to a Unicode value.
 *
 * udc  - range table; the only caller in this file passes the udc[] array
 *        above, and UDC_SECTIONS rows are examined
 * code - 16-bit Big-5 value (lead byte in bits 8-15, second byte in bits 0-7)
 *
 * Return: the Unicode value, starting at UDC_START_UNICODE, or 0 when the
 * code is in none of the ranges.  Within a range the codes are numbered row
 * by row with 157 second-byte positions per lead byte (0x40-0x7E followed by
 * 0xA1-0xFE); each range starts after the counts of all ranges before it.
 */
static int
ifUDC(UDC *udc, unsigned int code)
{
   unsigned int base = 0;	/* sum of count over the ranges already passed */
   unsigned int i;

   for (i = 0; i < UDC_SECTIONS; ++i)
     {
       if (code >= udc[i].start && code <= udc[i].end)
	{
	  unsigned char c1, c2, leading_c1;
	  int column;

	  c1 = (unsigned char)(code >> 8);
	  c2 = (unsigned char)code;
	  leading_c1 = (unsigned char) (udc[i].start >> 8);

	  /* close the 0x7F-0xA0 gap between the two second-byte ranges */
	  if (c2 <= 0x7E)
	    column = c2 - 0x40;
	  else
	    column = (c2 - 0x40) - (0xA1 - 0x7F);

	  return UDC_START_UNICODE + base + (c1 - leading_c1) * 157 + column;
	}

       base += udc[i].count;
     }

   return 0;
}
291 #endif
292 
293 /*
294  * Big-5 code --> ISO/IEC 10646 (Unicode)
295  * Unicode --> UTF8 (FSS-UTF)
296  *             (File System Safe Universal Character Set Transformation Format)
297  * Return: > 0 - converted with enough space in output buffer
298  *         = 0 - no space in outbuf
299  */
300 static int big5_to_utf8(_iconv_st *st, char *buf, size_t buflen, int *uconv_num)
301 {
302 	unsigned long	big5_val;	/* Big-5 value */
303 	int		unidx = 0;		/* Unicode index */
304 	unsigned long	uni_val = 0;	/* Unicode */
305 	char            *keepc = st->keepc;
306 
307 	big5_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
308 #ifdef DEBUG
309     fprintf(stderr, "%x\t", big5_val);
310 #endif
311 
312 #ifdef UDC_SUPPORT
313       if ((uni_val = ifUDC(udc, big5_val)) == 0) {
314 #endif
315 	unidx = binsearch(big5_val, big5_utf_tab, MAX_BIG5_NUM);
316 	if (unidx >= 0)
317 
318 	   uni_val = big5_utf_tab[unidx].unicode;
319 #ifdef UDC_SUPPORT
320       }
321 #endif
322 #ifdef DEBUG
323     fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
324 #endif
325 
326         /*
327 	 * Code conversion for UCS-2LE to support Samba
328 	 */
329         if (st->little_endian) {
330 	  int size = 0;
331 
332 	  if (unidx < 0 || uni_val > 0x00ffff ) {
333 	    uni_val = ICV_CHAR_UCS2_REPLACEMENT;
334 	    *uconv_num = 1;
335 	  }
336 
337 	  if (!st->bom_written) {
338 	    if (buflen < 4)
339 	      return 0;
340 
341 	    *(buf + size++) = (uchar_t)0xff;
342 	    *(buf + size++) = (uchar_t)0xfe;
343 	    st->bom_written = true;
344 	  }
345 
346 	  if (buflen < 2)
347 	    return 0;
348 
349 	  *(buf + size++) = (uchar_t)(uni_val & 0xff);
350 	  *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff);
351 
352 	  return size;
353 	}
354 
355 	if (unidx >= 0) {	/* do Unicode to UTF8 conversion */
356 		if (uni_val >= 0x0080 && uni_val <= 0x07ff) {
357 			if (buflen < 2) {
358 #ifdef DEBUG
359     fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n");
360 #endif
361 				errno = E2BIG;
362 				return(0);
363 			}
364 			*buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
365 			*(buf+1) = (char)(uni_val & 0x3f) | 0x80;
366 #ifdef DEBUG
367     fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
368 #endif
369 			return(2);
370 		}
371 		if (uni_val >= 0x0800 && uni_val <= 0xffff) {
372 			if (buflen < 3) {
373 #ifdef DEBUG
374     fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n");
375 #endif
376 				errno = E2BIG;
377 				return(0);
378 			}
379 			*buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
380 			*(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
381 			*(buf+2) = (char)(uni_val & 0x3f) | 0x80;
382 #ifdef DEBUG
383     fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
384 #endif
385 			return(3);
386 		}
387 		if (uni_val >= 0x10000 && uni_val <= 0x10ffff) {
388 		        if (buflen < 4) {
389 			   errno = E2BIG;
390 			   return 0;
391 			}
392 
393 			*buf = (char) ((uni_val >> 18 ) & 0x7) | 0xf0;
394 			*(buf+1) = (char) ((uni_val >> 12) & 0x3f) | 0x80;
395 			*(buf+2) = (char) ((uni_val >> 6) & 0x3f) | 0x80;
396 			*(buf+3) = (char) (uni_val & 0x3f) | 0x80;
397 
398 			return 4;
399 		}
400 	}
401 
402 	/* can't find a match in Big-5 --> UTF8 table or illegal UTF8 code */
403 	if (buflen < 3) {
404 #ifdef DEBUG
405     fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n");
406 #endif
407 		errno = E2BIG;
408 		return(0);
409 	}
410 
411         *(unsigned char*) buf     = UTF8_NON_ID_CHAR1;
412         *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2;
413         *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3;
414 
415 	/* non-identical conversion */
416 	*uconv_num = 1;
417 
418 #ifdef DEBUG
419     fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
420 #endif
421 	return(3);
422 }
423 
424 
425 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
426 static int binsearch(unsigned long x, big5_utf v[], int n)
427 {
428 	int low, high, mid;
429 
430 	low = 0;
431 	high = n - 1;
432 	while (low <= high) {
433 		mid = (low + high) / 2;
434 		if (x < v[mid].big5code)
435 			high = mid - 1;
436 		else if (x > v[mid].big5code)
437 			low = mid + 1;
438 		else	/* found match */
439 			return mid;
440 	}
441 	return (-1);	/* no match */
442 }
443