/* xref: /titanic_51/usr/src/lib/iconv_modules/zh/common/zh_HK.hkscs%UTF-8.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8) */
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2000, by Sun Microsystems, Inc.
24  * All rights reserved.
25  */
26 
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/isa_defs.h>
31 #include <errno.h>
32 #include "common_defs.h"
33 #include "big5hk_unicode.h"	/* HKSCS to Unicode mapping table */
34 
/* Byte-level constants used while decoding HKSCS input. */
#define	MSB		0x80	/* high bit: set on the first byte of a 2-byte char */
#define	MBYTE		0x8e	/* multi-byte (4 byte character) */
#define	PMASK		0xa0	/* plane number mask */
#define	ONEBYTE		0xff	/* mask keeping only the low 8 bits */

/*
 * UTF-8 byte sequence (EF BF BD) written for a character that cannot
 * be identified.
 */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD
44 
45 
46 typedef struct  _icv_state {
47 	char	keepc[2];	/* maximum # byte of HKSCS code */
48 	short	cstate;		/* state machine id */
49 	int	_errno;		/* internal errno */
50         boolean little_endian;
51         boolean bom_written;
52 }_iconv_st;
53 
/*
 * States of the HKSCS decoder.  The numeric distance from C0 is also used
 * as the number of input bytes already consumed for the current character.
 */
enum _CSTATE {
	C0,	/* expecting ASCII or the first byte of a 2-byte character */
	C1	/* first byte kept; expecting the second byte */
};
55 
56 static int hkscs_2nd_byte(char);
57 static int hkscs_to_utf8(_iconv_st *, char*, size_t, int *);
58 static int binsearch(unsigned long, hkscs_utf[], int);
59 
60 
61 /*
62  * Open; called from iconv_open()
63  */
64 void *
65 _icv_open()
66 {
67 	_iconv_st *st;
68 
69 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
70 		errno = ENOMEM;
71 		return ((void *) -1);
72 	}
73 
74 	st->cstate = C0;
75 	st->_errno = 0;
76         st->little_endian = false;
77         st->bom_written = false;
78 #if defined(UCS_2LE)
79         st->little_endian = true;
80         st->bom_written = true;
81 #endif
82 	return ((void *) st);
83 }
84 
85 
86 /*
87  * Close; called from iconv_close()
88  */
89 void
90 _icv_close(_iconv_st *st)
91 {
92 	if (!st)
93 		errno = EBADF;
94 	else
95 		free(st);
96 }
97 
98 
99 /*
100  * Actual conversion; called from iconv()
101  */
102 /*=======================================================
103  *
104  *   State Machine for interpreting HKSCS code
105  *
106  *=======================================================
107  *
108  *                     1st C
109  *    +--------> C0 ----------> C1
110  *    |    ascii |        2nd C |
111  *    ^          v              v
112  *    +----<-----+-----<--------+
113  *
114  *=======================================================*/
115 /*
116  * HKSCS encoding range:
117  *	High byte: 0x81 - 0xFE
118  *      Low  byte: 0x40 - 0x7E, 0xA1 - 0xFE
119  *
120  *      For HKSCS:
121  *		   0x8140 - 0x8DFE		( 641 encoding space)
122  *	      	   0x8E40 - 0xA0FE		( 2898 encoding space)
123  *		   0xC6A1 - 0xC8FE		( 359 encoding space)
124  *		   0xF9D6 - 0xF9FE		( 41 encoding space)
125  *		   0xFA40 - 0xFEFE		( 763 encoding space)
126  * 	Total:    4702
127  *	For BIG5
128  *		   0xA140 - 0xC8FE
129  *		   0xC940 - 0xFEFE
130  */
131 size_t
132 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
133 				char **outbuf, size_t *outbytesleft)
134 {
135 	int		n;
136         int		uconv_num = 0;
137 
138 #ifdef DEBUG
139     fprintf(stderr, "==========     iconv(): HKSCS --> UTF2     ==========\n");
140 #endif
141 	if (st == NULL) {
142 		errno = EBADF;
143 		return ((size_t) -1);
144 	}
145 
146 	if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
147 		st->cstate = C0;
148 		st->_errno = 0;
149 		return ((size_t) 0);
150 	}
151 
152 	st->_errno = 0;         /* reset internal errno */
153 	errno = 0;		/* reset external errno */
154 
155 	/* a state machine for interpreting CNS 11643 code */
156 	while (*inbytesleft > 0 && *outbytesleft > 0) {
157 		switch (st->cstate) {
158 		case C0:		/* assuming ASCII in the beginning */
159 			if (**inbuf & MSB) {
160 					st->keepc[0] = (**inbuf);
161 					st->cstate = C1;
162 			} else {	/* real ASCII */
163 			    /*
164 			     * code conversion for UCS-2LE to support Samba
165 			     */
166 			    if (st->little_endian) {
167 			      if (!st->bom_written) {
168 				if (*outbytesleft < 4)
169 				  errno = E2BIG;
170 				else {
171 				  *(*outbuf)++ = (uchar_t)0xff;
172 				  *(*outbuf)++ = (uchar_t)0xfe;
173 				  *outbytesleft -= 2;
174 
175 				  st->bom_written = true;
176 				}
177 			      }
178 
179 			      if (*outbytesleft < 2)
180 				errno = E2BIG;
181 			      else {
182 				*(*outbuf)++ = **inbuf;
183 				*(*outbuf)++ = (uchar_t)0x0;
184 				*outbytesleft -= 2;
185 			      }
186 			    } else {
187 			      **outbuf = **inbuf;
188 			      (*outbuf)++;
189 			      (*outbytesleft)--;
190 			    }
191 			}
192 			break;
193 		case C1:		/* Chinese characters: 2nd byte */
194 			if (hkscs_2nd_byte(**inbuf) == 0) {
195 				int uconv_num_internal = 0;
196 
197 				st->keepc[1] = (**inbuf);
198 				n = hkscs_to_utf8(st, *outbuf,
199 						*outbytesleft, &uconv_num_internal);
200 				if (n > 0) {
201 					(*outbuf) += n;
202 					(*outbytesleft) -= n;
203 
204 					uconv_num += uconv_num_internal;
205 
206 					st->cstate = C0;
207 				} else {	/* don't reset state */
208 					st->_errno = errno = E2BIG;
209 				}
210 			} else {	/* input char doesn't belong
211 					 * to the input code set
212 					 */
213 				st->_errno = errno = EILSEQ;
214 			}
215 			break;
216 		default:			/* should never come here */
217 			st->_errno = errno = EILSEQ;
218 			st->cstate = C0;	/* reset state */
219 			break;
220 		}
221 
222 		if (st->_errno) {
223 #ifdef DEBUG
224     fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
225 		st->_errno, st->cstate);
226 #endif
227 			break;
228 		}
229 
230 		(*inbuf)++;
231 		(*inbytesleft)--;
232 	}
233 
234 	if (*inbytesleft == 0 && st->cstate != C0)
235                 errno = EINVAL;
236 
237 	if (*inbytesleft > 0 && *outbytesleft == 0)
238 		errno = E2BIG;
239 
240 	if (errno) {
241 		/*
242 		 * if error, *inbuf points to the byte following the last byte
243 		 * successfully used in the conversion.
244 		 */
245 		*inbuf -= (st->cstate - C0);
246 		*inbytesleft += (st->cstate - C0);
247 		st->cstate = C0;
248 		return ((size_t) -1);
249 	}
250 
251 	return uconv_num;
252 }
253 
254 
255 /*
256  * Test whether inbuf is a valid character for 2nd byte HKSCS code
257  * Return: = 0 - valid HKSCS 2nd byte
258  *         = 1 - invalid HKSCS 2nd byte
259  */
260 static int hkscs_2nd_byte(char inbuf)
261 {
262 	unsigned int	buf = (unsigned int) (inbuf & ONEBYTE);
263 
264         if ((buf >= 0x40) && (buf <= 0x7E))
265                 return (0);
266         if ((buf >= 0xA1) && (buf <= 0xFE))
267                 return (0);
268 
269 	return(1);
270 }
271 
272 #ifdef UDC_SUPPORT
/* One range of HKSCS codes handled by ifUDC(). */
typedef struct _udc_sect {
	unsigned int	start;	/* first HKSCS code of the range */
	unsigned int	end;	/* last HKSCS code of the range (inclusive) */
	unsigned int	count;	/* presumably the number of codes in the range */
} UDC;

UDC udc[] = {
	{ 0x8140, 0x84FE, 0x274 }
};

/* value returned for the first code of the first range */
#define START_UNICODE 0xF0000

/*
 * Map an HKSCS code lying inside a UDC range to a value counted up from
 * START_UNICODE.  Each lead byte contributes 157 positions: 63 for trail
 * bytes 0x40 - 0x7E followed by 94 for trail bytes 0xA1 - 0xFE.
 * Return: the mapped value, or 0 when code is in no range.
 */
static int
ifUDC(UDC *udc, unsigned int code)
{
	int sect;

	/* only one range is examined */
	for (sect = 0; sect < 1; ++sect) {
		unsigned char	lead, trail, first_lead;
		unsigned int	prior;
		int		row, cell;

		if (code < udc[sect].start || code > udc[sect].end)
			continue;

		lead = (unsigned char)(code >> 8);
		trail = (unsigned char)code;
		first_lead = (unsigned char)(udc[sect].start >> 8);

		prior = sect ? udc[sect - 1].count : 0;
		row = lead - first_lead;
		if (trail <= 0x7E)
			cell = trail - 0x40;
		else	/* skip the unused gap 0x7F - 0xA0 */
			cell = (trail - 0x40) - (0xA1 - 0x7F);

		return START_UNICODE + prior + row * 157 + cell;
	}

	return 0;
}
303 #endif
304 
305 /*
306  * HKSCS code --> ISO/IEC 10646 (Unicode)
307  * Unicode --> UTF8 (FSS-UTF)
308  *             (File System Safe Universal Character Set Transformation Format)
309  * Return: > 0 - converted with enough space in output buffer
310  *         = 0 - no space in outbuf
311  */
312 static int hkscs_to_utf8(_iconv_st *st, char *buf, size_t buflen, int *uconv_num)
313 {
314 	unsigned long	hkscs_val;	/* HKSCS value */
315 	int		unidx = 0;		/* Unicode index */
316 	unsigned long	uni_val = 0;	/* Unicode */
317 	char            *keepc = st->keepc;
318 
319 	hkscs_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
320 #ifdef DEBUG
321     fprintf(stderr, "%x\t", hkscs_val);
322 #endif
323 
324 #ifdef UDC_SUPPORT
325       if ((uni_val = ifUDC(udc, hkscs_val)) == 0) {
326 #endif
327 	unidx = binsearch(hkscs_val, hkscs_utf_tab, MAX_HKSCS_NUM);
328 	if (unidx >= 0)
329 		uni_val = hkscs_utf_tab[unidx].unicode;
330 #ifdef UDC_SUPPORT
331       }
332 #endif
333 #ifdef DEBUG
334     fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
335 #endif
336 
337         /*
338          * Code version for UCS-2LE to support Samba
339          */
340         if (st->little_endian) {
341           int size = 0;
342 
343           if (unidx < 0 || uni_val > 0x00ffff ) {
344             uni_val = ICV_CHAR_UCS2_REPLACEMENT;
345             *uconv_num = 1;
346           }
347 
348           if (!st->bom_written) {
349             if (buflen < 4)
350               return 0;
351 
352             *(buf + size++) = (uchar_t)0xff;
353             *(buf + size++) = (uchar_t)0xfe;
354             st->bom_written = true;
355           }
356 
357           if (buflen < 2)
358             return 0;
359 
360           *(buf + size++) = (uchar_t)(uni_val & 0xff);
361           *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff);
362 
363           return size;
364         }
365 
366 	if (unidx >= 0) {	/* do Unicode to UTF8 conversion */
367 		if (uni_val >= 0x0080 && uni_val <= 0x07ff) {
368 			if (buflen < 2) {
369 #ifdef DEBUG
370     fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n");
371 #endif
372 				errno = E2BIG;
373 				return(0);
374 			}
375 			*buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
376 			*(buf+1) = (char)(uni_val & 0x3f) | 0x80;
377 #ifdef DEBUG
378     fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
379 #endif
380 			return(2);
381 		}
382 		if (uni_val >= 0x0800 && uni_val <= 0xffff) {
383 			if (buflen < 3) {
384 #ifdef DEBUG
385     fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n");
386 #endif
387 				errno = E2BIG;
388 				return(0);
389 			}
390 			*buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
391 			*(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
392 			*(buf+2) = (char)(uni_val & 0x3f) | 0x80;
393 #ifdef DEBUG
394     fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
395 #endif
396 			return(3);
397 		}
398 	        if (uni_val >= 0x10000 && uni_val <= 0x10ffff) {
399 		        if (buflen < 4)
400 		          {
401 			      errno = E2BIG;
402 			      return 0;
403 			  }
404 		        *buf = (char)((uni_val >> 18) & 0x7) | 0xf0;
405 			*(buf+1) = (char)((uni_val >> 12) & 0x3f) | 0x80;
406 			*(buf+2) = (char)((uni_val >>6) & 0x3f) | 0x80;
407 			*(buf+3) = (char)(uni_val & 0x3f) | 0x80;
408 		        return(4);
409 		}
410 	}
411 
412 	/* can't find a match in HKSCS --> UTF8 table or illegal UTF8 code */
413 	if (buflen < 3) {
414 #ifdef DEBUG
415     fprintf(stderr, "outbuf overflow in hkscs_to_utf8()!!\n");
416 #endif
417 		errno = E2BIG;
418 		return(0);
419 	}
420 
421         *(unsigned char*) buf    = UTF8_NON_ID_CHAR1;
422         *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2;
423         *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3;
424 
425 	/* non-identical conversion */
426 	*uconv_num = 1;
427 
428 #ifdef DEBUG
429     fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
430 #endif
431 	return(3);
432 }
433 
434 
435 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
436 static int binsearch(unsigned long x, hkscs_utf v[], int n)
437 {
438 	int low, high, mid;
439 
440 	low = 0;
441 	high = n - 1;
442 	while (low <= high) {
443 		mid = (low + high) / 2;
444 		if (x < v[mid].hkscscode)
445 			high = mid - 1;
446 		else if (x > v[mid].hkscscode)
447 			low = mid + 1;
448 		else	/* found match */
449 			return mid;
450 	}
451 	return (-1);	/* no match */
452 }
453