xref: /titanic_53/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_HK.hkscs.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2000, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include <sys/types.h>
31 #include <sys/isa_defs.h>
32 #include "unicode_big5hk.h"	/* UTF8 to HKSCS mapping table */
33 #include "common_defs.h"
34 
35 #define	MSB	0x80	/* most significant bit */
36 #define ONEBYTE	0xff	/* right most byte */
37 
38 #define NON_ID_CHAR   '?' /* non-identified character */
39 
40 typedef struct _icv_state {
41 	char	keepc[6];	/* maximum # byte of UTF8 code */
42 	short	ustate;
43 	int	_errno;		/* internal errno */
44         boolean little_endian;
45         boolean bom_written;
46 } _iconv_st;
47 
48 enum _USTATE	{ U0, U1, U2, U3, U4, U5, U6, U7 };
49 
50 static int get_hkscs_by_utf(uint_t, int *, unsigned long *);
51 static int utf8_to_hkscs(int, unsigned long, char *, size_t, int *);
52 static int binsearch(unsigned long, utf_hkscs[], int);
53 
54 /*
55  * Open; called from iconv_open()
56  */
57 void *
_icv_open()58 _icv_open()
59 {
60 	_iconv_st *st;
61 
62 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
63 		errno = ENOMEM;
64 		return ((void *) -1);
65 	}
66 
67 	st->ustate = U0;
68 	st->_errno = 0;
69         st->little_endian = false;
70         st->bom_written = false;
71 #if defined(UCS_2LE)
72         st->little_endian = true;
73         st->bom_written = true;
74 #endif
75 	return ((void *) st);
76 }
77 
78 
79 /*
80  * Close; called from iconv_close()
81  */
82 void
_icv_close(_iconv_st * st)83 _icv_close(_iconv_st *st)
84 {
85 	if (!st)
86 		errno = EBADF;
87 	else
88 		free(st);
89 }
90 
91 
92 /*
93  * Actual conversion; called from iconv()
94  */
95 /*=========================================================
96  *
97  *       State Machine for interpreting UTF8 code
98  *
99  *=========================================================
100  *                          2nd byte 3rd byte 4th byte
101  *          +----->------->------->U5---->U6------>U7
102  *          |                                      |
103  *          |     3 byte unicode                   |
104  *          +----->------->-------+                |
105  *          |                     |                |
106  *          ^                     v                |
107  *          |  2 byte             U2 ---> U3       |
108  *          |  unicode                    v        v
109  * +------> U0 -------> U1                +-------->U4---+
110  * ^  ascii |           |                           ^    |
111  * |        |           +-------->--------->--------+    |
112  * |        v                                            v
113  * +----<---+-----<------------<------------<------------+
114  *
115  *=========================================================*/
116 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)117 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
118 				char **outbuf, size_t *outbytesleft)
119 {
120         int             utf8_len = 0;
121 	int		n, unidx;
122 	unsigned long	hkscscode;
123 	int		uconv_num = 0;
124 	uint_t          ucs;
125 
126 #ifdef DEBUG
127     fprintf(stderr, "==========     iconv(): UTF2 --> HKSCS     ==========\n");
128 #endif
129 	if (st == NULL) {
130 		errno = EBADF;
131 		return ((size_t) -1);
132 	}
133 
134 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
135 		st->ustate = U0;
136 		st->_errno = 0;
137 		return ((size_t) 0);
138 	}
139 
140 	st->_errno = 0;		/* reset internal errno */
141 	errno = 0;		/* reset external errno */
142 
143 	/* a state machine for interpreting UTF8 code */
144 	while (*inbytesleft > 0 && *outbytesleft > 0) {
145 
146 	        uchar_t  first_byte;
147 		int	 uconv_num_internal = 0;
148 
149 		switch (st->ustate) {
150 		case U0:		/* assuming ASCII in the beginning */
151                        /*
152                         * Code converion for UCS-2LE to support Samba
153                         */
154                         if (st->little_endian) {
155                           st->ustate = U1;
156                           st->keepc[0] = **inbuf;
157                         }
158 			else if ((**inbuf & MSB) == 0) {	/* ASCII */
159 				**outbuf = **inbuf;
160 				(*outbuf)++;
161 				(*outbytesleft)--;
162 			} else {	/* Chinese character */
163 				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode 0xc2..0xdf */
164 
165 				        /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
166 				   if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
167 				        st->_errno = errno = EILSEQ;
168 				   else {
169 					st->ustate = U1;
170 					st->keepc[0] = **inbuf;
171 				   }
172 				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
173 					st->ustate = U2;
174 					st->keepc[0] = **inbuf;
175 				} else {
176 				        /* four bytes of UTF-8 sequences */
177 				        if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
178 					    st->_errno = errno = EILSEQ;
179 				        else
180 				         {
181 					    st->ustate = U5;
182 					    st->keepc[0] = **inbuf;
183 					 }
184 				}
185 			}
186 			break;
187 		case U1:		/* 2 byte unicode */
188 			if ((**inbuf & 0xc0) == MSB || st->little_endian) {
189 				st->keepc[1] = **inbuf;
190 			        utf8_len = 2;
191 
192 				/*
193 				 * Code conversion for UCS-2LE to support Samba
194 				 */
195 				if  (st->little_endian) {
196 				  /*
197 				   * It's ASCII
198                                    */
199                                   if (st->keepc[1] == 0 && (st->keepc[0] & 0x80) == 0) {
200                                     *(*outbuf)++ = st->keepc[0];
201 				    (*outbytesleft)--;
202                                     st->ustate = U0;
203                                     break;
204                                   }
205 
206 				  ucs = ((st->keepc[1] & 0xff) << 8) | ( st->keepc[0] & 0xff);
207 
208                                 } else
209                                   convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
210 
211 				st->ustate = U4;
212 #ifdef DEBUG
213     fprintf(stderr, "UTF8: %02x%02x   --> ",
214 	st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
215 #endif
216 				continue;	/* should not advance *inbuf */
217 			} else {
218 				st->_errno = errno = EILSEQ;
219 			}
220 			break;
221 		case U2:		/* 3 byte unicode - 2nd byte */
222 
223 		        first_byte = st->keepc[0];
224 
225 		        /* if the first byte is 0xed, it is illegal sequence if the second
226 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
227 			 */
228 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
229 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
230 				st->_errno = errno = EILSEQ;
231 			else {
232 				st->ustate = U3;
233 				st->keepc[1] = **inbuf;
234 			}
235 			break;
236 		case U3:		/* 3 byte unicode - 3rd byte */
237 			if ((**inbuf & 0xc0) == MSB) {
238 				st->ustate = U4;
239 				st->keepc[2] = **inbuf;
240 			        utf8_len = 3;
241 
242                                 convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
243 #ifdef DEBUG
244     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
245 		st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
246 #endif
247 				continue;	/* should not advance *inbuf */
248 			} else {
249 				st->_errno = errno = EILSEQ;
250 			}
251 			break;
252 		case U4:
253 			n = get_hkscs_by_utf(ucs, &unidx, &hkscscode);
254 		        if ( n == -1 ) { /* unicode is either 0xfffe or 0xffff */
255 			        st->_errno = errno = EILSEQ;
256 			        break;
257 			}
258 
259 /* comment the following lines out to ignore the non-Big5 characters
260 g			if (n != 0) {
261 				st->_errno = errno = EILSEQ;
262 				break;
263 			}
264 */
265 
266 			n = utf8_to_hkscs(unidx, hkscscode,
267 					*outbuf, *outbytesleft, &uconv_num_internal);
268 			if (n > 0) {
269 				(*outbuf) += n;
270 				(*outbytesleft) -= n;
271 
272 				uconv_num += uconv_num_internal;
273 
274 				st->ustate = U0;
275 			} else {
276 				st->_errno = errno;
277 			}
278 			break;
279 		case U5:
280 
281 		        first_byte = st->keepc[0];
282 
283 		        /* if the first byte is 0xf0, it is illegal sequence if
284 			 * the second one is between 0x80 and 0x8f
285 			 * for Four-Byte UTF: U+10000..U+10FFFF
286 			 */
287 		        if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
288 			    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
289 				st->_errno = errno = EILSEQ;
290 		        else
291 		          {
292 			     st->ustate = U6;
293 			     st->keepc[1] = **inbuf;
294 		          }
295 		        break;
296 		case U6:
297 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
298 		          {
299 			     st->ustate = U7;
300 			     st->keepc[2] = **inbuf;
301 			  }
302 		        else
303 		             st->_errno = errno = EILSEQ;
304                         break;
305 		case U7:
306 		        if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
307 		          {
308 			     utf8_len = 4;
309 			     st->keepc[3] = **inbuf;
310 
311                              convert_utf8_to_ucs4((uchar_t*)(&st->keepc[0]), utf8_len, &ucs);
312 
313 			     st->ustate = U4;
314 			     continue;	/* should not advance *inbuf */
315 			  }
316 		        else
317 		             st->_errno = errno = EILSEQ;
318 		        break;
319 		default:			/* should never come here */
320 			st->_errno = errno = EILSEQ;
321 			st->ustate = U0;	/* reset state */
322 			break;
323 		}
324 
325 		if (st->_errno) {
326 #ifdef DEBUG
327     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
328 		st->_errno, st->ustate);
329 #endif
330 			break;
331 		}
332 
333 		(*inbuf)++;
334 		(*inbytesleft)--;
335 	}
336 
337         if (errno) return ((size_t) -1);
338 
339         if (*inbytesleft == 0 && st->ustate != U0)
340                 errno = EINVAL;
341 
342 	if (*inbytesleft > 0 && *outbytesleft == 0)
343 		errno = E2BIG;
344 
345 	if (errno) {
346 		int num_reversed_bytes = 0;
347 
348 		switch (st->ustate)
349 	        {
350 		 case U1:
351 		   num_reversed_bytes = 1;
352 		   break;
353 		 case U2:
354 		   num_reversed_bytes = 1;
355 		   break;
356 		 case U3:
357 		   num_reversed_bytes = 2;
358 		   break;
359 		 case U4:
360 		   num_reversed_bytes = utf8_len - 1;
361 		   break;
362 		 case U5:
363 		   num_reversed_bytes = 1;
364 		   break;
365 		 case U6:
366 		   num_reversed_bytes = 2;
367 		   break;
368 		 case U7:
369 		   num_reversed_bytes = 3;
370 		   break;
371 	        }
372 
373 		/*
374 		 * if error, *inbuf points to the byte following the last byte
375 		 * successfully used in the conversion.
376 		 */
377 		*inbuf -= num_reversed_bytes;
378 		*inbytesleft += num_reversed_bytes;
379 		st->ustate = U0;
380 		return ((size_t) -1);
381 	}
382 
383 	return uconv_num;
384 }
385 
386 /*
387  * Match HKSCS code by UTF8 code;
388  * Return: = 0 - match from Unicode to HKSCS found
389  *         = 1 - match from Unicode to HKSCS NOT found
390  *         =-1 - illegal sequence
391  *
392  * Since binary search of the UTF8 to HKSCS table is necessary, might as well
393  * return index and HKSCS code matching to the unicode.
394  */
get_hkscs_by_utf(uint_t unicode,int * unidx,unsigned long * hkscscode)395 static int get_hkscs_by_utf(uint_t unicode, int *unidx, unsigned long *hkscscode)
396 {
397         /* the 0xFFFE and 0xFFFF should not be allowed */
398         if (unicode == 0xFFFE || unicode == 0xFFFF ) return -1;
399 
400 	*unidx = binsearch(unicode, utf_hkscs_tab, MAX_HKSCS_NUM);
401 	if ((*unidx) >= 0)
402 		*hkscscode = utf_hkscs_tab[*unidx].hkscscode;
403 	else
404 		return(1);	/* match from UTF8 to HKSCS not found */
405 #ifdef DEBUG
406     fprintf(stderr, "Unicode=%04x, idx=%5d, HKSCS=%x ", unicode, *unidx, *hkscscode);
407 #endif
408 
409 	return(0);
410 }
411 
412 
413 /*
414  * ISO/IEC 10646 (Unicode) --> HKSCS
415  * Unicode --> UTF8 (FSS-UTF)
416  *             (File System Safe Universal Character Set Transformation Format)
417  * Return: > 0 - converted with enough space in output buffer
418  *         = 0 - no space in outbuf
419  */
utf8_to_hkscs(int unidx,unsigned long hkscscode,char * buf,size_t buflen,int * uconv_num)420 static int utf8_to_hkscs(int unidx, unsigned long hkscscode, char *buf, size_t buflen, int *uconv_num)
421 {
422 	unsigned long	val;		/* HKSCS value */
423 	char		c1, c2, hkscs_str[3];
424 
425 	if (buflen < 2) {
426 		errno = E2BIG;
427 		return(0);
428 	}
429 
430 	if (unidx < 0) {	/* no match from UTF8 to HKSCS */
431 		*buf = *(buf+1) = NON_ID_CHAR;
432 
433 		/* non-identical conversion */
434 		*uconv_num = 1;
435 	} else {
436 		val = hkscscode & 0xffff;
437 		c1 = (char) ((val & 0xff00) >> 8);
438 		c2 = (char) (val & 0xff);
439 
440 	*buf = hkscs_str[0] = c1;
441 	*(buf+1) = hkscs_str[1] = c2;
442 	hkscs_str[2] = NULL;
443 	}
444 
445 #ifdef DEBUG
446     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
447 #endif
448 
449 	return(2);
450 }
451 
452 
453 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,utf_hkscs v[],int n)454 static int binsearch(unsigned long x, utf_hkscs v[], int n)
455 {
456 	int low, high, mid;
457 
458 	low = 0;
459 	high = n - 1;
460 	while (low <= high) {
461 		mid = (low + high) / 2;
462 		if (x < v[mid].unicode)
463 			high = mid - 1;
464 		else if (x > v[mid].unicode)
465 			low = mid + 1;
466 		else	/* found match */
467 			return mid;
468 	}
469 	return (-1);	/* no match */
470 }
471