xref: /titanic_53/usr/src/lib/iconv_modules/zh/common/UTF-8%zh_CN.gbk.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1995, by Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 
26 #include <stdio.h>
27 #include <errno.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <unicode_gb18030.h>	/* Unicode to GBK mapping table */
31 #include "common_defs.h"
32 #include "ucs4.h"
33 
#define	MSB		0x80	/* most significant bit of a byte */
#define	ONEBYTE		0xff	/* mask selecting the right most byte */

#define	NON_ID_CHAR	'?'	/* stands in for a non-identified character */

/* Non-zero when GBK code v has bits set above its low 16 bits. */
#define	IS_GBK4BYTES(v)	((v) & 0xffff0000)
#define	GBK_LEN_MAX	4	/* size of the GBK staging buffer, in bytes */
41 
42 
/*
 * Per-descriptor conversion state: allocated by _icv_open(), released by
 * _icv_close().
 */
typedef struct _icv_state {
	/* bytes of the UTF-8 sequence in progress (max # byte of UTF8 code) */
	char	keepc[6];
	/* current state of the UTF-8 decoder; one of enum _USTATE */
	short	ustate;
	/* internal errno */
	int	_errno;
} _iconv_st;
48 
/* States of the UTF-8 decoding state machine used by _icv_iconv(). */
enum _USTATE {
	U0,	/* between characters; expecting ASCII or a lead byte */
	U1,	/* lead byte of a 2 byte sequence seen */
	U2,	/* lead byte of a 3 byte sequence seen */
	U3,	/* 2nd byte of a 3 byte sequence seen */
	U4,	/* 2 or 3 byte sequence complete; ready to emit GBK */
	U5,	/* lead byte of a 4 byte sequence seen */
	U6,	/* 2nd byte of a 4 byte sequence seen */
	U7	/* 3rd byte of a 4 byte sequence seen */
};
50 
51 int get_gbk_by_unicode(unsigned long, int*, unsigned long*);
52 static int binsearch(unsigned long x, table_t v[], int n);
53 int unicode_to_gbk(int unidx, unsigned long gbkcode, char* buf, size_t buflen, int *uconv_num);
54 
55 /*
56  * Open; called from iconv_open()
57  */
58 void *
_icv_open()59 _icv_open()
60 {
61 	_iconv_st *st;
62 
63 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
64 		errno = ENOMEM;
65 		return ((void *) -1);
66 	}
67 
68 	st->ustate = U0;
69 	st->_errno = 0;
70 	return ((void *) st);
71 }
72 
73 
74 /*
75  * Close; called from iconv_close()
76  */
77 void
_icv_close(_iconv_st * st)78 _icv_close(_iconv_st *st)
79 {
80 	if (!st)
81 		errno = EBADF;
82 	else
83 		free(st);
84 }
85 
86 #if defined(UCS_2LE) || defined (UCS_2BE) || defined (UCS_4LE) || defined (UCS_4BE)
87 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)88 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
89 				char **outbuf, size_t *outbytesleft)
90 {
91 	unsigned char   c1, c2;
92 #if defined(UCS_4LE) || defined (UCS_4BE)
93 	unsigned char	c3, c4;
94 #endif
95 	int		n, unidx;
96         unsigned long   unichr;
97 	unsigned long	gbkcode;
98         int		uconv_num = 0;
99 
100 	if (st == NULL) {
101 		errno = EBADF;
102 		return ((size_t) -1);
103 	}
104 
105 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
106 		st->ustate = U0;
107 		st->_errno = 0;
108 		return ((size_t) 0);
109 	}
110 
111 	st->_errno = 0;		/* reset internal errno */
112 	errno = 0;		/* reset external errno */
113 
114 	while (*inbytesleft > ICV_FETCH_UCS_SIZE-1 && *outbytesleft > 0) {
115 
116                 int     size = 0;
117 		int	uconv_num_internal = 0;
118 
119                 c1 = *(*inbuf + size++);
120                 c2 = *(*inbuf + size++);
121 #if defined(UCS_4LE) || defined (UCS_4BE)
122                 c3 = *(*inbuf + size++);
123                 c4 = *(*inbuf + size++);
124 #endif
125 
126 #if defined(UCS_2LE)
127                 unichr = (unsigned long) (c1 | (c2<<8));
128 #elif defined(UCS_2BE)
129                 unichr = (unsigned long) ((c1<<8) | c2);
130 #elif defined(UCS_4LE)
131                 unichr = (unsigned long) (c1 | (c2<<8) | (c3)<<16 | (c4<<24));
132 #else
133                 unichr = (unsigned long) ((c1<<24) | (c2<<16) | (c3<<8) | c4);
134 #endif
135 
136                 if (unichr < MSB) { /* ASCII */
137                         **outbuf = (char) unichr;
138 		        (*outbuf)++;
139 			(*outbytesleft)--;
140                 } else {
141 			n = get_gbk_by_unicode(unichr, &unidx, &gbkcode);
142 			if ( n == -1 ) { /* invalid unicode codepoint */
143 			        st->_errno = errno = EILSEQ;
144 			        return ((size_t)-1);
145 			}
146 
147 			n = unicode_to_gbk(unidx, gbkcode, *outbuf, *outbytesleft, &uconv_num_internal);
148 			if (n > 0) {
149 				(*outbuf) += n;
150 				(*outbytesleft) -= n;
151 
152 				uconv_num += uconv_num_internal;
153                         } else {
154                                 return ((size_t)-1);
155                         }
156                 }
157 
158                 (*inbuf) += size;
159                 (*inbytesleft) -= size;
160         }
161 
162         if ( *inbytesleft >0 ) {
163                 errno =  *outbytesleft? EINVAL: E2BIG;
164                 return ((size_t)-1);
165         }
166 
167         return uconv_num;
168 }
169 #else
170 /*
171  * Actual conversion; called from iconv()
172  */
173 /*=========================================================
174  *
175  *       State Machine for interpreting UTF8 code
176  *
177  *=========================================================
178  *               4 byte unicode
179  *          +----->------->------------> U5 -----> U6-------> U7---+
180  *          |                                                      |
181  *          |    3 byte unicode                                    |
182  *          +----->------->-------+                                |
183  *          |                     |                                |
184  *          ^                     v                                |
185  *          |  2 byte             U2 ---> U3                       |
186  *          |  unicode                    v                        |
187  * +------> U0 -------> U1                +-------->U4---+         |
188  * ^  ascii |           |                           ^    |         |
189  * |        |           +-------->--------->--------+    |         |
190  * |        v                                            v         V
191  * +----<---+-----<------------<------------<------------+---------+
192  *
193  *=========================================================*/
194 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)195 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
196 				char **outbuf, size_t *outbytesleft)
197 {
198 	char		c1 = 0, c2 = 0;
199 	int		n, unidx;
200         unsigned long   unichr;
201 	unsigned long	gbkcode;
202         int		uconv_num = 0;
203 	int		utf8_len = 0;
204 
205 #ifdef DEBUG
206     fprintf(stderr, "==========     iconv(): UTF2 --> GBK2K     ==========\n");
207 #endif
208 	if (st == NULL) {
209 		errno = EBADF;
210 		return ((size_t) -1);
211 	}
212 
213 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
214 		st->ustate = U0;
215 		st->_errno = 0;
216 		return ((size_t) 0);
217 	}
218 
219 	st->_errno = 0;		/* reset internal errno */
220 	errno = 0;		/* reset external errno */
221 
222 	/* a state machine for interpreting UTF8 code */
223 	while (*inbytesleft > 0 && *outbytesleft > 0) {
224 
225 	        uchar_t  first_byte;
226 		int	 uconv_num_internal = 0;
227 
228 		switch (st->ustate) {
229 		case U0:		/* assuming ASCII in the beginning */
230 			if ((**inbuf & MSB) == 0) {	/* ASCII */
231 				**outbuf = **inbuf;
232 				(*outbuf)++;
233 				(*outbytesleft)--;
234 			} else {
235 				if ((**inbuf & 0xe0) == 0xc0) {	/* 2 byte unicode 0xc0..0xdf */
236 				    /* invalid sequence if the first char is either 0xc0 or 0xc1 */
237 				    if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
238 				        st->_errno = errno = EILSEQ;
239 				    else {
240 					st->ustate = U1;
241 					st->keepc[0] = **inbuf;
242 				    }
243 				} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3 byte 0xe0..0xef */
244 					st->ustate = U2;
245 					st->keepc[0] = **inbuf;
246 				} else {
247 				     /* four bytes of UTF-8 sequences */
248 				     if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
249 					st->_errno = errno = EILSEQ;
250 				     else {
251 					st->ustate = U5;
252 					st->keepc[0] = **inbuf;
253 				     }
254 				}
255 			}
256 			break;
257 		case U1:		/* 2 byte unicode */
258 			if ((**inbuf & 0xc0) == MSB) {
259 				utf8_len = 2;
260 				st->keepc[1] = **inbuf;
261 
262 				c1 = (st->keepc[0]&0x1c)>>2;
263 				c2 = ((st->keepc[0]&0x03)<<6) | ((st->keepc[1])&0x3f);
264 
265 				st->ustate = U4;
266 #ifdef DEBUG
267     fprintf(stderr, "UTF8: %02x%02x   --> ",
268 	st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
269 #endif
270 				continue;	/* should not advance *inbuf */
271 			} else {
272 				st->_errno = errno = EILSEQ;
273 			}
274 			break;
275 		case U2:		/* 3 byte unicode - 2nd byte */
276 		        first_byte = (uchar_t)st->keepc[0];
277 
278 		        /* if the first byte is 0xed, it is illegal sequence if the second
279 			 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
280 			 */
281 			if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
282 			     ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
283 		                st->_errno = errno = EILSEQ;
284 		        else
285 		           {
286 				st->ustate = U3;
287 				st->keepc[1] = **inbuf;
288 			   }
289 			break;
290 		case U3:		/* 3 byte unicode - 3rd byte */
291 			if ((**inbuf & 0xc0) == MSB) {
292 				st->ustate = U4;
293 				utf8_len = 3;
294 				st->keepc[2] = **inbuf;
295 				c1 = ((st->keepc[0]&0x0f)<<4) |
296 					((st->keepc[1]&0x3c)>>2);
297 				c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
298 #ifdef DEBUG
299     fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
300 		st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
301 #endif
302 				continue;	/* should not advance *inbuf */
303 			} else {
304 				st->_errno = errno = EILSEQ;
305 			}
306 			break;
307 		case U4:
308 	                unichr = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
309 			n = get_gbk_by_unicode(unichr, &unidx, &gbkcode);
310 		        if ( n == -1 ) { /* unicode is either 0xFFFE or 0xFFFF */
311 			     st->_errno = errno = EILSEQ;
312 			     break;
313 			}
314 /* comment the following lines so that converter can ignore the non-GBK characters
315 			if (n != 0) {	* legal unicode;illegal GBK *
316 				st->_errno = errno = EILSEQ;
317 				break;
318 			}
319 */
320 			n = unicode_to_gbk(unidx, gbkcode, *outbuf, *outbytesleft, &uconv_num_internal);
321 			if (n > 0) {
322 				(*outbuf) += n;
323 				(*outbytesleft) -= n;
324 
325 				uconv_num += uconv_num_internal;
326 
327 				st->ustate = U0;
328 			} else {
329 				st->_errno = errno;
330 			}
331 			break;
332 		 case U5:
333 		       first_byte = st->keepc[0];
334 
335 		       /* if the first byte is 0xf0, it is illegal sequence if
336 			* the second one is between 0x80 and 0x8f
337 			* for Four-Byte UTF: U+10000..U+10FFFF
338 			*/
339 		       if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
340 			   ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
341 		           st->_errno = errno = EILSEQ;
342 		       else {
343 			   st->ustate = U6;
344 			   st->keepc[1] = **inbuf;
345 		       }
346 		       break;
347 		 case U6:
348 		      if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
349 		       {
350 			  st->ustate = U7;
351 			  st->keepc[2] = **inbuf;
352 		       }
353 		      else
354 		          st->_errno = errno = EILSEQ;
355 		      break;
356 		 case U7:
357 		      if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
358 		       {
359 			  /* replace with double NON_ID_CHARs */
360 			  if ( *outbytesleft < 2 ) st->_errno = errno = E2BIG;
361 			  else
362 			    {
363 			       **outbuf = NON_ID_CHAR;
364 			       *(*outbuf+1) = NON_ID_CHAR;
365 			       (*outbytesleft) -= 2;
366 
367 			       uconv_num++;
368 
369 			       st->ustate = U0;
370 			    }
371 		       }
372 		      else
373 		          st->_errno = errno = EILSEQ;
374 		      break;
375 		 default:			/* should never come here */
376 			st->_errno = errno = EILSEQ;
377 			st->ustate = U0;	/* reset state */
378 			break;
379 		}
380 
381 		if (st->_errno) {
382 #ifdef DEBUG
383     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
384 		st->_errno, st->ustate);
385 #endif
386 			break;
387 		}
388 
389 		(*inbuf)++;
390 		(*inbytesleft)--;
391 	}
392 
393         if (*inbytesleft == 0 && st->ustate != U0)
394                 errno = EINVAL;
395 
396 	if (*inbytesleft > 0 && *outbytesleft == 0)
397 		errno = E2BIG;
398 
399 	if (errno) {
400 		int num_reversed_bytes = 0;
401 
402 		switch (st->ustate)
403 	        {
404 		 case U1:
405 		   num_reversed_bytes = 1;
406 		   break;
407 		 case U2:
408 		   num_reversed_bytes = 1;
409 		   break;
410 		 case U3:
411 		   num_reversed_bytes = 2;
412 		   break;
413 		 case U4:
414 		   num_reversed_bytes = utf8_len - 1;
415 		   break;
416 		 case U5:
417 		   num_reversed_bytes = 1;
418 		   break;
419 		 case U6:
420 		   num_reversed_bytes = 2;
421 		   break;
422 		 case U7:
423 		   num_reversed_bytes = 3;
424 		   break;
425 	         }
426 
427 		/*
428 		 * if error, *inbuf points to the byte following the last byte
429 		 * successfully used in conversion.
430 		 */
431 		*inbuf -= num_reversed_bytes;
432 		*inbytesleft += num_reversed_bytes;
433 	        st->ustate = U0;
434 
435 		return ((size_t) -1);
436 	}
437 
438 	return uconv_num;
439 }
440 #endif /* UCS_2LE || UCS_2BE || UCS_4LE || UCS_4BE */
441 
442 
443 /*
444  * Match GBK code by UTF8 code;
445  * Return: = 0 - match from Unicode to GBK found
446  *         = 1 - match from Unicode to GBK NOT found
447  *         = -1- illegal sequence
448  *
449  * Since binary search of the UTF8 to GBK table is necessary, might as well
450  * return index and GBK code matching to the unicode.
451  */
get_gbk_by_unicode(unsigned long unicode,int * unidx,unsigned long * gbkcode)452 int get_gbk_by_unicode(unsigned long unicode, int* unidx, unsigned long* gbkcode)
453 {
454         if ( unicode > UCS4_MAXVAL || ext_ucs4_lsw(unicode) > UCS4_PPRC_MAXVAL ) return -1;
455 
456 	*unidx = binsearch(unicode, unicode_gbk_tab, UNICODEMAX);
457 	if ((*unidx) >= 0)
458 		*gbkcode = unicode_gbk_tab[*unidx].value;
459 	else
460 		return(1);	/* match from unicode to GBK not found */
461 #ifdef DEBUG
462     fprintf(stderr, "Unicode=%04x, idx=%5d, Big-5=%x ", unicode, *unidx, *gbkcode);
463 #endif
464 
465 	return(0);
466 }
467 
468 
469 /*
470  * ISO/IEC 10646-2000 (Unicode) --> GBK2K
471  * Unicode --> UTF8 (FSS-UTF)
472  *             (File System Safe Universal Character Set Transformation Format)
473  * Return: > 0 - converted with enough space in output buffer
474  *         = 0 - no space in outbuf
475  */
unicode_to_gbk(int unidx,unsigned long gbkcode,char * buf,size_t buflen,int * uconv_num)476 int unicode_to_gbk(int unidx, unsigned long gbkcode, char* buf, size_t buflen, int *uconv_num)
477 {
478 	unsigned long	val;		/* GBK value */
479 	char		c[GBK_LEN_MAX];
480 	int		i, length;
481 
482 	if (unidx < 0) {	/* no match from Unicode to GBK */
483 		c[0] = c[1] = NON_ID_CHAR;
484 
485 	        *uconv_num = 1;
486 
487 		length = 2;
488 	} else {
489 		if (  ! IS_GBK4BYTES( gbkcode ) ) { /* character within two bytes area */
490 			val = gbkcode & 0xffff;
491 			c[0] = (char) ((val & 0xff00) >> 8);
492 			c[1] = (char) (val & 0xff);
493 			length = 2;
494 		} else { /* character within four bytes area */
495 			val = gbkcode & 0xffffffff;
496 			c[0] = (char) ( val >> 24 );
497 			c[1] = (char) ( val >> 16 );
498 			c[2] = (char) ( val >> 8 );
499 			c[3] = (char) val;
500 			length = 4;
501 		}
502 	}
503 
504 #ifdef DEBUG
505     fprintf(stderr, "\t->%x %x<-\n", *buf, *(buf+1));
506 #endif
507 
508 	if (buflen < length) {
509 		errno = E2BIG;
510 		return(0);
511 	}
512 
513 	for ( i = 0; i < length; ++i )
514 		*buf++ = c[i];
515 
516 	return length;
517 }
518 
519 
520 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,table_t v[],int n)521 static int binsearch(unsigned long x, table_t v[], int n)
522 {
523 	int low, high, mid;
524 
525 	low = 0;
526 	high = n - 1;
527 	while (low <= high) {
528 		mid = (high - low) / 2 + low;
529 		if (x < v[mid].key)
530 			high = mid - 1;
531 		else if (x > v[mid].key)
532 			low = mid + 1;
533 		else	/* found match */
534 			return mid;
535 	}
536 	return (-1);	/* no match */
537 }
538 
539 /*
540 vi:ts=8:ai:expandtab
541 */
542