xref: /titanic_50/usr/src/lib/iconv_modules/zh/common/zh_TW-big5p%UTF-8.c (revision 880d797826457b77414b37d531cc3e1aa166ecbe)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include "big5p_unicode.h"	/* Big-5 Plus to Unicode mapping table */
31 
32 #define	MSB	0x80	/* most significant bit */
33 #define ONEBYTE	0xff	/* right most byte */
34 
35 /* non-identified character */
36 #define UTF8_NON_ID_CHAR1 0xEF
37 #define UTF8_NON_ID_CHAR2 0xBF
38 #define UTF8_NON_ID_CHAR3 0xBD
39 
40 
/*
 * Conversion descriptor handed out by _icv_open() and threaded through
 * every _icv_iconv() call.
 */
typedef struct _icv_state {
	char	keepc[2];	/* [0] = saved lead byte, [1] = trail byte */
	short	cstate;		/* current state-machine state (C0 or C1) */
	int	_errno;		/* error recorded by the current _icv_iconv() */
} _iconv_st;
46 
/* States of the Big-5 Plus decoding state machine. */
enum _CSTATE {
	C0,	/* expecting an ASCII byte or a lead byte */
	C1	/* lead byte saved; expecting the trail byte */
};
48 
49 static int big5p_2nd_byte(char);
50 static int big5p_to_utf8(char[], char*, size_t);
51 static int binsearch(unsigned long, big5p_utf[], int);
52 
53 
54 /*
55  * Open; called from iconv_open()
56  */
57 void *
_icv_open()58 _icv_open()
59 {
60 	_iconv_st *st;
61 
62 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
63 		errno = ENOMEM;
64 		return ((void *) -1);
65 	}
66 
67 	st->cstate = C0;
68 	st->_errno = 0;
69 
70 	return ((void *) st);
71 }
72 
73 
74 /*
75  * Close; called from iconv_close()
76  */
77 void
_icv_close(_iconv_st * st)78 _icv_close(_iconv_st *st)
79 {
80 	if (!st)
81 		errno = EBADF;
82 	else
83 		free(st);
84 }
85 
86 
87 /*
88  * Actual conversion; called from iconv()
89  */
90 /*=======================================================
91  *
92  *   State Machine for interpreting Big-5 code
93  *
94  *=======================================================
95  *
96  *                     1st C
97  *    +--------> C0 ----------> C1
98  *    |    ascii |        2nd C |
99  *    ^          v              v
100  *    +----<-----+-----<--------+
101  *
102  *=======================================================*/
103 /*
104  * Big-5 Plus encoding range:
105  *	High byte: 0x81 - 0xFE
106  *	Low byte:  0x40 - 0xFE
107  */
108 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)109 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
110 				char **outbuf, size_t *outbytesleft)
111 {
112 	int		n;
113 
114 #ifdef DEBUG
115     fprintf(stderr, "==========     iconv(): Big-5 --> UTF2     ==========\n");
116 #endif
117 	if (st == NULL) {
118 		errno = EBADF;
119 		return ((size_t) -1);
120 	}
121 
122 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
123 		st->cstate = C0;
124 		st->_errno = 0;
125 		return ((size_t) 0);
126 	}
127 
128 	st->_errno = 0;         /* reset internal errno */
129 	errno = 0;		/* reset external errno */
130 
131 	/* a state machine for interpreting CNS 11643 code */
132 	while (*inbytesleft > 0 && *outbytesleft > 0) {
133 		switch (st->cstate) {
134 		case C0:		/* assuming ASCII in the beginning */
135 			if (**inbuf & MSB) {
136 				st->keepc[0] = (**inbuf);
137 				st->cstate = C1;
138 			} else {	/* real ASCII */
139 				**outbuf = **inbuf;
140 				(*outbuf)++;
141 				(*outbytesleft)--;
142 			}
143 			break;
144 		case C1:		/* Chinese characters: 2nd byte */
145 			if (big5p_2nd_byte(**inbuf) == 0) {
146 				st->keepc[1] = (**inbuf);
147 				n = big5p_to_utf8(st->keepc, *outbuf,
148 							*outbytesleft);
149 				if (n > 0) {
150 					(*outbuf) += n;
151 					(*outbytesleft) -= n;
152 
153 					st->cstate = C0;
154 				} else {	/* don't reset state */
155 					st->_errno = errno = E2BIG;
156 				}
157 			} else {	/* input char doesn't belong
158 					 * to the input code set
159 					 */
160 				st->_errno = errno = EILSEQ;
161 			}
162 			break;
163 		default:			/* should never come here */
164 			st->_errno = errno = EILSEQ;
165 			st->cstate = C0;	/* reset state */
166 			break;
167 		}
168 
169 		if (st->_errno) {
170 #ifdef DEBUG
171     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
172 		st->_errno, st->cstate);
173 #endif
174 			break;
175 		}
176 
177 		(*inbuf)++;
178 		(*inbytesleft)--;
179 	}
180 
181         if (errno) return ((size_t) -1);
182 
183         if (*inbytesleft == 0 && st->cstate != C0) {
184                 errno = EINVAL;
185                 return ((size_t) -1);
186         }
187 
188 	if (*inbytesleft > 0 && *outbytesleft == 0) {
189 		errno = E2BIG;
190 		return((size_t) -1);
191 	}
192 	return (*inbytesleft);
193 }
194 
195 
/*
 * Classify a byte as a candidate second (trail) byte of a Big-5 Plus
 * character.
 * Return: = 0 - byte lies in 0x40 .. 0xFE (valid trail byte)
 *         = 1 - anything else (invalid trail byte)
 */
static int big5p_2nd_byte(char inbuf)
{
	/* view the byte as 0..255 regardless of plain char's signedness */
	unsigned int	octet = (unsigned char) inbuf;

	return ((octet < 0x40 || octet > 0xFE) ? 1 : 0);
}
209 
210 
211 /*
212  * Big-5 code --> ISO/IEC 10646 (Unicode)
213  * Unicode --> UTF8 (FSS-UTF)
214  *             (File System Safe Universal Character Set Transformation Format)
215  * Return: > 0 - converted with enough space in output buffer
216  *         = 0 - no space in outbuf
217  */
big5p_to_utf8(char keepc[],char * buf,size_t buflen)218 static int big5p_to_utf8(char keepc[], char *buf, size_t buflen)
219 {
220 	unsigned long	big5p_val;	/* Big-5 value */
221 	int		unidx;		/* Unicode index */
222 	unsigned long	uni_val;	/* Unicode */
223 
224 	big5p_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
225 #ifdef DEBUG
226     fprintf(stderr, "%x\t", big5p_val);
227 #endif
228 
229 	unidx = binsearch(big5p_val, big5p_utf_tab, MAX_BIG5P_NUM);
230 	if (unidx >= 0)
231 		uni_val = big5p_utf_tab[unidx].unicode;
232 #ifdef DEBUG
233     fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
234 #endif
235 
236 	if (unidx >= 0) {	/* do Unicode to UTF8 conversion */
237 		if (uni_val > 0x0080 && uni_val <= 0x07ff) {
238 			if (buflen < 2) {
239 #ifdef DEBUG
240     fprintf(stderr, "outbuf overflow in big5p_to_utf8()!!\n");
241 #endif
242 				errno = E2BIG;
243 				return(0);
244 			}
245 			*buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
246 			*(buf+1) = (char)(uni_val & 0x3f) | 0x80;
247 #ifdef DEBUG
248     fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
249 #endif
250 			return(2);
251 		}
252 		if (uni_val > 0x0800 && uni_val <= 0xffff) {
253 			if (buflen < 3) {
254 #ifdef DEBUG
255     fprintf(stderr, "outbuf overflow in big5p_to_utf8()!!\n");
256 #endif
257 				errno = E2BIG;
258 				return(0);
259 			}
260 			*buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
261 			*(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
262 			*(buf+2) = (char)(uni_val & 0x3f) | 0x80;
263 #ifdef DEBUG
264     fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
265 #endif
266 			return(3);
267 		}
268 	}
269 
270 	/* can't find a match in Big-5 --> UTF8 table or illegal UTF8 code */
271 	if (buflen < 3) {
272 #ifdef DEBUG
273     fprintf(stderr, "outbuf overflow in big5p_to_utf8()!!\n");
274 #endif
275 		errno = E2BIG;
276 		return(0);
277 	}
278 
279         *(unsigned char*) buf     = UTF8_NON_ID_CHAR1;
280         *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2;
281         *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3;
282 
283 #ifdef DEBUG
284     fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
285 #endif
286 	return(3);
287 }
288 
289 
290 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,big5p_utf v[],int n)291 static int binsearch(unsigned long x, big5p_utf v[], int n)
292 {
293 	int low, high, mid;
294 
295 	low = 0;
296 	high = n - 1;
297 	while (low <= high) {
298 		mid = (low + high) / 2;
299 		if (x < v[mid].big5pcode)
300 			high = mid - 1;
301 		else if (x > v[mid].big5pcode)
302 			low = mid + 1;
303 		else	/* found match */
304 			return mid;
305 	}
306 	return (-1);	/* no match */
307 }
308