1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <sys/isa_defs.h>
31 #include <errno.h>
32 #include "common_defs.h"
33 #include "big5_unicode.h" /* Big-5 to Unicode mapping table */
34
/* Byte-level masks and markers */
#define	MSB		0x80	/* most significant bit of a byte */
#define	MBYTE		0x8e	/* multi-byte (4 byte character) */
#define	PMASK		0xa0	/* plane number mask */
#define	ONEBYTE		0xff	/* right most byte */

/*
 * Three-byte UTF-8 sequence written for a non-identified character
 * (EF BF BD is the UTF-8 form of U+FFFD).
 */
#define	UTF8_NON_ID_CHAR1	0xEF
#define	UTF8_NON_ID_CHAR2	0xBF
#define	UTF8_NON_ID_CHAR3	0xBD
44
45
46 typedef struct _icv_state {
47 char keepc[2]; /* maximum # byte of Big-5 code */
48 short cstate; /* state machine id */
49 int _errno; /* internal errno */
50 boolean little_endian;
51 boolean bom_written;
52 }_iconv_st;
53
54 enum _CSTATE { C0, C1 };
55
56 static int big5_2nd_byte(char);
57 static int big5_to_utf8(_iconv_st *, char*, size_t, int *);
58 static int binsearch(unsigned long, big5_utf[], int);
59
60
61 /*
62 * Open; called from iconv_open()
63 */
64 void *
_icv_open()65 _icv_open()
66 {
67 _iconv_st *st;
68
69 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
70 errno = ENOMEM;
71 return ((void *) -1);
72 }
73
74 st->cstate = C0;
75 st->_errno = 0;
76 st->little_endian = false;
77 st->bom_written = false;
78 #if defined(UCS_2LE)
79 st->little_endian = true;
80 st->bom_written = true;
81 #endif
82 return ((void *) st);
83 }
84
85
86 /*
87 * Close; called from iconv_close()
88 */
89 void
_icv_close(_iconv_st * st)90 _icv_close(_iconv_st *st)
91 {
92 if (!st)
93 errno = EBADF;
94 else
95 free(st);
96 }
97
98
99 /*
100 * Actual conversion; called from iconv()
101 */
102 /*=======================================================
103 *
104 * State Machine for interpreting Big-5 code
105 *
106 *=======================================================
107 *
108 * 1st C
109 * +--------> C0 ----------> C1
110 * | ascii | 2nd C |
111 * ^ v v
112 * +----<-----+-----<--------+
113 *
114 *=======================================================*/
115 /*
116 * Big-5 encoding range:
117 * High byte: 0xA1 - 0xFE ( 94 encoding space)
118 * Low byte: 0x40 - 0x7E, 0xA1 - 0xFE ( 157 encoding space)
119 * Plane #1: 0xA140 - 0xC8FE ( 6280 encoding space)
120 * Plane #2: 0xC940 - 0xFEFE ( 8478 encoding space)
121 * Total: 94 * 157 = 14,758 (14758 encoding space)
122 */
123 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)124 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
125 char **outbuf, size_t *outbytesleft)
126 {
127 int n;
128 int uconv_num = 0;
129
130 #ifdef DEBUG
131 fprintf(stderr, "========== iconv(): Big-5 --> UTF2 ==========\n");
132 #endif
133 if (st == NULL) {
134 errno = EBADF;
135 return ((size_t) -1);
136 }
137
138 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
139 st->cstate = C0;
140 st->_errno = 0;
141 return ((size_t) 0);
142 }
143
144 st->_errno = 0; /* reset internal errno */
145 errno = 0; /* reset external errno */
146
147 /* a state machine for interpreting CNS 11643 code */
148 while (*inbytesleft > 0 && *outbytesleft > 0) {
149 switch (st->cstate) {
150 case C0: /* assuming ASCII in the beginning */
151 if (**inbuf & MSB) {
152 st->keepc[0] = (**inbuf);
153 st->cstate = C1;
154 } else { /* real ASCII */
155 if (st->little_endian) {
156 if (!st->bom_written) {
157 if (*outbytesleft < 4)
158 errno = E2BIG;
159 else {
160 *(*outbuf)++ = (uchar_t)0xff;
161 *(*outbuf)++ = (uchar_t)0xfe;
162 *outbytesleft -= 2;
163
164 st->bom_written = true;
165 }
166 }
167
168 if (*outbytesleft < 2)
169 return E2BIG;
170 else {
171 *(*outbuf)++ = **inbuf;
172 *(*outbuf)++ = (uchar_t)0x0;
173 *outbytesleft -= 2;
174 }
175 } else {
176 **outbuf = **inbuf;
177 (*outbuf)++;
178 (*outbytesleft)--;
179 }
180 }
181 break;
182 case C1: /* Chinese characters: 2nd byte */
183 if (big5_2nd_byte(**inbuf) == 0) {
184 int uconv_num_internal = 0;
185
186 st->keepc[1] = (**inbuf);
187 n = big5_to_utf8(st, *outbuf,
188 *outbytesleft, &uconv_num_internal);
189 if (n > 0) {
190 (*outbuf) += n;
191 (*outbytesleft) -= n;
192
193 uconv_num += uconv_num_internal;
194
195 st->cstate = C0;
196 } else { /* don't reset state */
197 st->_errno = errno = E2BIG;
198 }
199 } else { /* input char doesn't belong
200 * to the input code set
201 */
202 st->_errno = errno = EILSEQ;
203 }
204 break;
205 default: /* should never come here */
206 st->_errno = errno = EILSEQ;
207 st->cstate = C0; /* reset state */
208 break;
209 }
210
211 if (st->_errno) {
212 #ifdef DEBUG
213 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
214 st->_errno, st->cstate);
215 #endif
216 break;
217 }
218
219 (*inbuf)++;
220 (*inbytesleft)--;
221 }
222
223 if (*inbytesleft == 0 && st->cstate != C0)
224 errno = EINVAL;
225
226 if (*inbytesleft > 0 && *outbytesleft == 0)
227 errno = E2BIG;
228
229 if (errno) {
230 /*
231 * if error, *inbuf points to the byte following the last byte
232 * successfully used in the conversion.
233 */
234 *inbuf -= (st->cstate - C0);
235 *inbytesleft += (st->cstate - C0);
236 st->cstate = C0;
237 return ((size_t) -1);
238 }
239
240 return uconv_num;
241 }
242
243
/*
 * Test whether inbuf is a valid character for 2nd byte Big-5 code.
 * Valid second bytes are 0x40 - 0x7E and 0xA1 - 0xFE.
 * Return:	= 0 - valid Big-5 2nd byte
 *		= 1 - invalid Big-5 2nd byte
 */
static int big5_2nd_byte(char inbuf)
{
	/* look at the byte as 0..255 whatever the signedness of char */
	unsigned int	byte = (unsigned char)inbuf;
	int		in_low_range = (byte >= 0x40 && byte <= 0x7E);
	int		in_high_range = (byte >= 0xA1 && byte <= 0xFE);

	return ((in_low_range || in_high_range) ? 0 : 1);
}
259
260 #ifdef UDC_SUPPORT
/* One user-defined character (UDC) section of the Big-5 code space. */
typedef struct _udc_sect {
	unsigned int	start, end, count;
} UDC;

UDC udc[] = {
	{ 0xFA40, 0xFEFE, 0x311 }
};

/* Unicode value given to the first character of the first UDC section */
#define	UDC_START_UNICODE	0xF0000

/*
 * Map a Big-5 code inside a UDC section to its Unicode value.
 *
 * Each first byte contributes a row of 157 characters; within a row the
 * second bytes 0x40 - 0x7E are followed directly by 0xA1 - 0xFE.
 *
 * Return: the Unicode value, or 0 if code is not in a UDC section.
 */
static int
ifUDC(UDC *udc, unsigned int code)
{
	int	i;

	for (i = 0; i < 1; ++i) {
		unsigned char	row, col, first_row;
		int		offset;

		if (code < udc[i].start || code > udc[i].end)
			continue;

		row = (unsigned char)(code >> 8);
		col = (unsigned char)code;
		first_row = (unsigned char)(udc[i].start >> 8);

		/* position in the row, skipping the 0x7F - 0xA0 gap */
		if (col <= 0x7E)
			offset = col - 0x40;
		else
			offset = (col - 0x40) - (0xA1 - 0x7F);

		return UDC_START_UNICODE + (i ? udc[i-1].count : 0) +
		    (row - first_row) * 157 + offset;
	}

	return 0;
}
291 #endif
292
293 /*
294 * Big-5 code --> ISO/IEC 10646 (Unicode)
295 * Unicode --> UTF8 (FSS-UTF)
296 * (File System Safe Universal Character Set Transformation Format)
297 * Return: > 0 - converted with enough space in output buffer
298 * = 0 - no space in outbuf
299 */
big5_to_utf8(_iconv_st * st,char * buf,size_t buflen,int * uconv_num)300 static int big5_to_utf8(_iconv_st *st, char *buf, size_t buflen, int *uconv_num)
301 {
302 unsigned long big5_val; /* Big-5 value */
303 int unidx = 0; /* Unicode index */
304 unsigned long uni_val = 0; /* Unicode */
305 char *keepc = st->keepc;
306
307 big5_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
308 #ifdef DEBUG
309 fprintf(stderr, "%x\t", big5_val);
310 #endif
311
312 #ifdef UDC_SUPPORT
313 if ((uni_val = ifUDC(udc, big5_val)) == 0) {
314 #endif
315 unidx = binsearch(big5_val, big5_utf_tab, MAX_BIG5_NUM);
316 if (unidx >= 0)
317
318 uni_val = big5_utf_tab[unidx].unicode;
319 #ifdef UDC_SUPPORT
320 }
321 #endif
322 #ifdef DEBUG
323 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
324 #endif
325
326 /*
327 * Code conversion for UCS-2LE to support Samba
328 */
329 if (st->little_endian) {
330 int size = 0;
331
332 if (unidx < 0 || uni_val > 0x00ffff ) {
333 uni_val = ICV_CHAR_UCS2_REPLACEMENT;
334 *uconv_num = 1;
335 }
336
337 if (!st->bom_written) {
338 if (buflen < 4)
339 return 0;
340
341 *(buf + size++) = (uchar_t)0xff;
342 *(buf + size++) = (uchar_t)0xfe;
343 st->bom_written = true;
344 }
345
346 if (buflen < 2)
347 return 0;
348
349 *(buf + size++) = (uchar_t)(uni_val & 0xff);
350 *(buf + size++) = (uchar_t)((uni_val >> 8) & 0xff);
351
352 return size;
353 }
354
355 if (unidx >= 0) { /* do Unicode to UTF8 conversion */
356 if (uni_val >= 0x0080 && uni_val <= 0x07ff) {
357 if (buflen < 2) {
358 #ifdef DEBUG
359 fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n");
360 #endif
361 errno = E2BIG;
362 return(0);
363 }
364 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
365 *(buf+1) = (char)(uni_val & 0x3f) | 0x80;
366 #ifdef DEBUG
367 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
368 #endif
369 return(2);
370 }
371 if (uni_val >= 0x0800 && uni_val <= 0xffff) {
372 if (buflen < 3) {
373 #ifdef DEBUG
374 fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n");
375 #endif
376 errno = E2BIG;
377 return(0);
378 }
379 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
380 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
381 *(buf+2) = (char)(uni_val & 0x3f) | 0x80;
382 #ifdef DEBUG
383 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
384 #endif
385 return(3);
386 }
387 if (uni_val >= 0x10000 && uni_val <= 0x10ffff) {
388 if (buflen < 4) {
389 errno = E2BIG;
390 return 0;
391 }
392
393 *buf = (char) ((uni_val >> 18 ) & 0x7) | 0xf0;
394 *(buf+1) = (char) ((uni_val >> 12) & 0x3f) | 0x80;
395 *(buf+2) = (char) ((uni_val >> 6) & 0x3f) | 0x80;
396 *(buf+3) = (char) (uni_val & 0x3f) | 0x80;
397
398 return 4;
399 }
400 }
401
402 /* can't find a match in Big-5 --> UTF8 table or illegal UTF8 code */
403 if (buflen < 3) {
404 #ifdef DEBUG
405 fprintf(stderr, "outbuf overflow in big5_to_utf8()!!\n");
406 #endif
407 errno = E2BIG;
408 return(0);
409 }
410
411 *(unsigned char*) buf = UTF8_NON_ID_CHAR1;
412 *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2;
413 *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3;
414
415 /* non-identical conversion */
416 *uconv_num = 1;
417
418 #ifdef DEBUG
419 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
420 #endif
421 return(3);
422 }
423
424
425 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,big5_utf v[],int n)426 static int binsearch(unsigned long x, big5_utf v[], int n)
427 {
428 int low, high, mid;
429
430 low = 0;
431 high = n - 1;
432 while (low <= high) {
433 mid = (low + high) / 2;
434 if (x < v[mid].big5code)
435 high = mid - 1;
436 else if (x > v[mid].big5code)
437 low = mid + 1;
438 else /* found match */
439 return mid;
440 }
441 return (-1); /* no match */
442 }
443