xref: /titanic_52/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_TW-big5p.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1995, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <errno.h>
31 #include "unicode_big5p.h"	/* UTF8 to Big-5 Plus mapping table */
32 #include "common_defs.h"
33 
34 #define	MSB	0x80	/* most significant bit */
35 #define ONEBYTE	0xff	/* right most byte */
36 
37 #define NON_ID_CHAR   '?' /* non-identified character */
38 
/*
 * Per-descriptor conversion state.  It is allocated by _icv_open(), released
 * by _icv_close(), and carried between calls of _icv_iconv() so that a UTF-8
 * sequence split across calls can be continued.
 */
39 typedef struct _icv_state {
40 	char	keepc[6];	/* bytes of the UTF8 sequence being decoded; sized for the maximum # byte of UTF8 code */
41 	short	ustate;	/* current decoder state, one of enum _USTATE */
42 	int	_errno;		/* internal errno */
43 } _iconv_st;
44 
/*
 * Decoder states, see the state machine diagram above _icv_iconv():
 * U0 is the initial state, U1 expects the 2nd byte of a 2 byte sequence,
 * U2/U3 expect the 2nd/3rd byte of a 3 byte sequence, U4 emits the converted
 * character, U5/U6/U7 expect the 2nd/3rd/4th byte of a 4 byte sequence.
 */
45 enum _USTATE	{ U0, U1, U2, U3, U4, U5, U6, U7 };
46 
/* Helpers private to this file; each is documented at its definition. */
47 static int get_big5p_by_utf(char, char, int *, unsigned long *);
48 static int utf8_to_big5p(int, unsigned long, char *, size_t);
49 static int binsearch(unsigned long, utf_big5p[], int);
50 
51 
52 /*
53  * Open; called from iconv_open()
54  */
55 void *
56 _icv_open()
57 {
58 	_iconv_st *st;
59 
60 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
61 		errno = ENOMEM;
62 		return ((void *) -1);
63 	}
64 
65 	st->ustate = U0;
66 	st->_errno = 0;
67 
68 	return ((void *) st);
69 }
70 
71 
72 /*
73  * Close; called from iconv_close()
74  */
75 void
76 _icv_close(_iconv_st *st)
77 {
78 	if (!st)
79 		errno = EBADF;
80 	else
81 		free(st);
82 }
83 
84 
85 /*
86  * Actual conversion; called from iconv()
87  */
88 /*=========================================================
89  *
90  *       State Machine for interpreting UTF8 code
91  *
92  *=========================================================
93  *
94  *                         2nd byte  3rd byte  4th byte
95  *          +----->------->------->U5------>U6--------->U7
96  *          |                                            |
97  *          |     3 byte unicode                         |
98  *          +----->------->-------+                      |
99  *          |                     |                      |
100  *          ^                     v                      |
101  *          |  2 byte             U2 ---> U3             |
102  *          |  unicode                    v              |
103  * +------> U0 -------> U1                +-------->U4---+
104  * ^  ascii |           |                           ^    |
105  * |        |           +-------->--------->--------+    |
106  * |        v                                            v
107  * +----<---+-----<------------<------------<------------+
108  *
109  *=========================================================*/
110 size_t
111 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
112 				char **outbuf, size_t *outbytesleft)
113 {
114 	char		c1 = '\0', c2 = '\0';
115 	int		n, unidx;
116 	unsigned long	big5pcode;
117 
118 #ifdef DEBUG
119     fprintf(stderr, "==========     iconv(): UTF2 --> Big-5 Plus     ==========\n");
120 #endif
121 	if (st == NULL) {
122 		errno = EBADF;
123 		return ((size_t) -1);
124 	}
125 
126 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
127 		st->ustate = U0;
128 		st->_errno = 0;
129 		return ((size_t) 0);
130 	}
131 
132 	st->_errno = 0;		/* reset internal errno */
133 	errno = 0;		/* reset external errno */
134 
135 	/* a state machine for interpreting UTF8 code */
136 	while (*inbytesleft > 0 && *outbytesleft > 0) {
137 
138 	        uchar_t  first_byte;
139 
140 		switch (st->ustate) {
141 		case U0:		/* assuming ASCII in the beginning */
142 			if ((**inbuf & MSB) == 0) {	/* ASCII */
143 				**outbuf = **inbuf;
144 				(*outbuf)++;
145 				(*outbytesleft)--;
146 			} else {	/* Chinese character */
147 				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode 0xc2..0xdf */
148 
149 				        /* invalid sequence if the first char is either 0xc0 or 0xc1 */
150 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
151 				             st->_errno = errno = EILSEQ;
152 				        else {
153 					     st->ustate = U1;
154 					     st->keepc[0] = **inbuf;
155 					}
156 				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
157 					st->ustate = U2;
158 					st->keepc[0] = **inbuf;
159 				} else {
160 				        /* currently the 16 planes are supported */
161 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
162 					     st->_errno = errno = EILSEQ;
163 					else
164 					     {
165 						st->ustate = U5;
166 						st->keepc[0] = **inbuf;
167 					     }
168 				}
169 			}
170 			break;
171 		case U1:		/* 2 byte unicode */
172 			if ((**inbuf & 0xc0) == MSB) {
173 				st->ustate = U4;
174 				st->keepc[1] = **inbuf;
175 				c1 = (st->keepc[0]&0x1c)>>2;
176 				c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
177 #ifdef DEBUG
178     fprintf(stderr, "UTF8: %02x%02x   --> ",
179 	st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
180 #endif
181 				continue;	/* should not advance *inbuf */
182 			} else {
183 				st->_errno = errno = EILSEQ;
184 			}
185 			break;
186 		case U2:		/* 3 byte unicode - 2nd byte */
187 
188 			first_byte = st->keepc[0];
189 
190 			/* if the first byte is 0xed, it is illegal sequence if the second
191 			 * one is between 0xa0 and 0xbf because the surrogate section is ill-formed
192 			 */
193 			if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
194 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
195 				st->_errno = errno = EILSEQ;
196 			else {
197 				st->ustate = U3;
198 				st->keepc[1] = **inbuf;
199 			}
200 			break;
201 		case U3:		/* 3 byte unicode - 3rd byte */
202 			if ((**inbuf & 0xc0) == MSB) {
203 				st->ustate = U4;
204 				st->keepc[2] = **inbuf;
205 				c1 = ((st->keepc[0]&0x0f)<<4) |
206 					((st->keepc[1]&0x3c)>>2);
207 				c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
208 #ifdef DEBUG
209     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
210 		st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
211 #endif
212 				continue;	/* should not advance *inbuf */
213 			} else {
214 				st->_errno = errno = EILSEQ;
215 			}
216 			break;
217 		case U4:
218 			n = get_big5p_by_utf(c1, c2, &unidx, &big5pcode);
219 			if ( n == -1 ) { /* unicode is either 0xfffe or 0xffff */
220 			   st->_errno = errno = EILSEQ;
221 			   break;
222 			}
223 
224 /* comment the following lines to ignore no Big5 plus characters
225 			if (n != 0) {
226 				st->_errno = errno = EILSEQ;
227 				break;
228 			}
229 */
230 
231 			n = utf8_to_big5p(unidx, big5pcode,
232 					*outbuf, *outbytesleft);
233 			if (n > 0) {
234 				(*outbuf) += n;
235 				(*outbytesleft) -= n;
236 
237 				st->ustate = U0;
238 			} else {
239 				st->_errno = errno = E2BIG;
240 			}
241 			break;
242 		case U5:
243 		        first_byte = st->keepc[0];
244 
245 			/* if the first byte is 0xf0, it is illegal sequence if
246 			 * the second one is between 0x80 and 0x8f
247 			 * for Four-Byte UTF: U+10000..U+10FFFF
248 			 */
249 			if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
250 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
251 			     st->_errno = errno = EILSEQ;
252 			else
253 			     {
254 				st->ustate = U6;
255 				st->keepc[1] = **inbuf;
256 			     }
257 			break;
258 		case U6:
259 			if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
260 			  {
261 			     st->ustate = U7;
262 			     st->keepc[2] = **inbuf;
263 			  }
264 			else
265 			     st->_errno = errno = EILSEQ;
266 			break;
267 		case U7:
268 			if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
269 			  { /* skip it */
270 			     st->ustate = U0;
271 			  }
272 			else
273 			  st->_errno = errno = EILSEQ;
274 		        break;
275 		default:			/* should never come here */
276 			st->_errno = errno = EILSEQ;
277 			st->ustate = U0;	/* reset state */
278 			break;
279 		}
280 
281 		if (st->_errno) {
282 #ifdef DEBUG
283     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
284 		st->_errno, st->ustate);
285 #endif
286 			break;
287 		}
288 
289 		(*inbuf)++;
290 		(*inbytesleft)--;
291 	}
292 
293         if (errno) return ((size_t) -1);
294 
295         if (*inbytesleft == 0 && st->ustate != U0) {
296                 errno = EINVAL;
297                 return ((size_t) -1);
298         }
299 
300 	if (*inbytesleft > 0 && *outbytesleft == 0) {
301 		errno = E2BIG;
302 		return((size_t) -1);
303 	}
304 	return (*inbytesleft);
305 }
306 
307 
308 /*
309  * Match Big-5 Plus code by UTF8 code;
310  * Return: = 0 - match from Unicode to Big-5 Plus found
311  *         = 1 - match from Unicode to Big-5 Plus NOT found
312  *         =-1 - illegal sequence
313  *
314  * Since binary search of the UTF8 to Big-5 Plus table is necessary, might as well
315  * return index and Big-5 Plus code matching to the unicode.
316  */
317 static int get_big5p_by_utf(char c1, char c2, int *unidx, unsigned long *big5pcode)
318 {
319 	unsigned long	unicode;
320 
321 	unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
322         /* 0xfffe and 0xffff should not be allowed */
323         if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
324 
325 	*unidx = binsearch(unicode, utf_big5p_tab, MAX_BIG5P_NUM);
326 	if ((*unidx) >= 0)
327 		*big5pcode = utf_big5p_tab[*unidx].big5pcode;
328 	else
329 		return(1);	/* match from UTF8 to Big-5 Plus not found */
330 #ifdef DEBUG
331     fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5 Plus=%x ", unicode, *unidx, *big5pcode);
332 #endif
333 
334 	return(0);
335 }
336 
337 
338 /*
339  * ISO/IEC 10646 (Unicode) --> Big-5 Plus
340  * Unicode --> UTF8 (FSS-UTF)
341  *             (File System Safe Universal Character Set Transformation Format)
342  * Return: > 0 - converted with enough space in output buffer
343  *         = 0 - no space in outbuf
344  */
345 static int utf8_to_big5p(int unidx, unsigned long big5pcode, char *buf, size_t buflen)
346 {
347 	unsigned long	val;		/* Big-5 Plus value */
348 	char		c1, c2, big5p_str[3];
349 
350 	if (buflen < 2) {
351 		errno = E2BIG;
352 		return(0);
353 	}
354 
355 	if (unidx < 0) {	/* no match from UTF8 to Big-5 Plus */
356 		*buf = *(buf+1) = NON_ID_CHAR;
357 	} else {
358 		val = big5pcode & 0xffff;
359 		c1 = (char) ((val & 0xff00) >> 8);
360 		c2 = (char) (val & 0xff);
361 
362 	*buf = big5p_str[0] = c1;
363 	*(buf+1) = big5p_str[1] = c2;
364 	big5p_str[2] = NULL;
365 	}
366 
367 #ifdef DEBUG
368     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
369 #endif
370 
371 	return(2);
372 }
373 
374 
375 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
376 static int binsearch(unsigned long x, utf_big5p v[], int n)
377 {
378 	int low, high, mid;
379 
380 	low = 0;
381 	high = n - 1;
382 	while (low <= high) {
383 		mid = (low + high) / 2;
384 		if (x < v[mid].unicode)
385 			high = mid - 1;
386 		else if (x > v[mid].unicode)
387 			low = mid + 1;
388 		else	/* found match */
389 			return mid;
390 	}
391 	return (-1);	/* no match */
392 }
393