1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <sys/types.h>
30 #include <errno.h>
31 #include "unicode_cns11643_TW.h" /* UTF8 to CNS 11643 mapping table */
32 #include "common_defs.h"
33
/* Bit masks and lead bytes used while handling multi-byte codes. */
#define MSB         0x80    /* most significant bit */
#define MBYTE       0x8e    /* multi-byte (4 byte character) */
#define PMASK       0xa0    /* plane number mask */
#define ONEBYTE     0xff    /* right most byte */

/* Control characters written to the 7-bit output stream. */
#define SI          0x0f    /* shift in */
#define SO          0x0e    /* shift out */
#define ESC         0x1b    /* escape */

/*
 * Final byte of the "ESC $ )" designation sequence, indexed by the plane
 * number returned from get_plane_no_by_utf() (1..16).  Index 0 is never
 * used by the code in this file.
 */
/* disabled alternative table: "0GH23456789:;<=>?" */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";

#define GET_PLANEC(i)   (plane_char[(i)])

#define NON_ID_CHAR     '?'     /* non-identified character */
49
/*
 * Conversion descriptor: allocated by _icv_open(), updated by every
 * _icv_iconv() call and released by _icv_close().
 */
typedef struct _icv_state {
    char    keepc[6];   /* bytes of the UTF8 code being read (max # of bytes) */
    short   cstate;     /* holds an enum _CSTATE value */
    short   istate;     /* holds an enum _ISTATE value */
    short   ustate;     /* holds an enum _USTATE value */
    int     _errno;     /* internal errno */
} _iconv_st;
57
/*
 * Designation state: C0 means the next CNS character must be preceded by
 * an "ESC $ ) x" sequence, C1 means such a sequence has been written.
 */
enum _CSTATE {
    C0,
    C1
};

/* Shift state of the output: IN after SI (ASCII), OUT after SO. */
enum _ISTATE {
    IN,
    OUT
};

/*
 * UTF-8 parser state:
 *   U0      expecting the first byte of a character
 *   U1      expecting the 2nd byte of a 2-byte sequence
 *   U2, U3  expecting the 2nd / 3rd byte of a 3-byte sequence
 *   U4      sequence complete, CNS code still to be written
 *   U5..U7  expecting the 2nd / 3rd / 4th byte of a 4-byte sequence
 */
enum _USTATE {
    U0,
    U1,
    U2,
    U3,
    U4,
    U5,
    U6,
    U7
};
61
62
63 static int get_plane_no_by_utf(const char, const char, int *, unsigned long *);
64 static int utf8_to_iso(int, int, unsigned long, char *, size_t);
65 static int binsearch(unsigned long, utf_cns[], int);
66
67 /*
68 * Open; called from iconv_open()
69 */
70 void *
_icv_open()71 _icv_open()
72 {
73 _iconv_st *st;
74
75 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
76 errno = ENOMEM;
77 return ((void *) -1);
78 }
79
80 st->cstate = C0;
81 st->istate = IN;
82 st->ustate = U0;
83 st->_errno = 0;
84
85 #ifdef DEBUG
86 fprintf(stderr, "========== iconv(): UTF2 --> ISO2022-7 ==========\n");
87 #endif
88
89 return ((void *) st);
90 }
91
92
93 /*
94 * Close; called from iconv_close()
95 */
96 void
_icv_close(_iconv_st * st)97 _icv_close(_iconv_st *st)
98 {
99 if (!st)
100 errno = EBADF;
101 else
102 free(st);
103 }
104
105
106 /*
107 * Actual conversion; called from iconv()
108 */
109 /*=========================================================
110 *
111 * State Machine for interpreting UTF8 code
112 *
113 *=========================================================
114 * 2nd byte 3rd byte 4th byte
115 * +----->------->------->U5------>U6--------->U7
116 * | |
117 * | 3 byte unicode |
118 * +----->------->-------+ |
119 * | | |
120 * ^ v |
121 * | 2 byte U2 ---> U3 |
122 * | unicode v |
123 * +------> U0 -------> U1 +-------->U4---+
124 * ^ ascii | | ^ |
125 * | | +-------->--------->--------+ |
126 * | v v
127 * +----<---+-----<------------<------------<------------+
128 *
129 *=========================================================*/
130 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)131 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
132 char **outbuf, size_t *outbytesleft)
133 {
134 char c1 = '\0', c2 = '\0';
135 int plane_no, n, unidx;
136 /* pre_plane_no: need to be static when re-entry occurs on errno set */
137 static int pre_plane_no = -1; /* previous plane number */
138 unsigned long cnscode;
139
140 if (st == NULL) {
141 errno = EBADF;
142 return ((size_t) -1);
143 }
144
145 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
146 st->cstate = C0;
147 st->istate = IN;
148 st->ustate = U0;
149 st->_errno = 0;
150 return ((size_t) 0);
151 }
152
153 #ifdef DEBUG
154 fprintf(stderr, "=== (Re-entry) iconv(): UTF-8 --> ISO 2022-7 ===\n");
155 fprintf(stderr, "st->cstate=%d\tst->istate=%d\tst->_errno=%d\tplane_no=%d\n",
156 st->cstate, st->istate, st->_errno, plane_no);
157 #endif
158 st->_errno = 0; /* reset internal errno */
159 errno = 0; /* reset external errno */
160
161 /* a state machine for interpreting UTF8 code */
162 while (*inbytesleft > 0 && *outbytesleft > 0) {
163
164 uchar_t first_byte;
165
166 switch (st->ustate) {
167 case U0: /* assuming ASCII in the beginning */
168 if ((**inbuf & MSB) == 0) { /* ASCII */
169 if (st->istate == OUT) {
170 st->cstate = C0;
171 st->istate = IN;
172 **outbuf = SI;
173 (*outbuf)++;
174 (*outbytesleft)--;
175 if (*outbytesleft <= 0) {
176 errno = E2BIG;
177 return((size_t) -1);
178 }
179 }
180 **outbuf = **inbuf;
181 (*outbuf)++;
182 (*outbytesleft)--;
183 } else { /* Chinese character */
184 if ((**inbuf & 0xe0) == 0xc0) { /* 2 byte unicode 0xc2..0xdf */
185
186 /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
187 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
188 st->_errno = errno = EILSEQ;
189 else {
190 st->ustate = U1;
191 st->keepc[0] = **inbuf;
192 }
193 } else if ((**inbuf & 0xf0) == 0xe0) { /* 3 byte 0xe0..0xef */
194 st->ustate = U2;
195 st->keepc[0] = **inbuf;
196 } else {
197 /* four bytes of UTF-8 sequences */
198 if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
199 st->_errno = errno = EILSEQ;
200 else
201 {
202 st->ustate = U5;
203 st->keepc[0] = **inbuf;
204 }
205 }
206 }
207 break;
208 case U1: /* 2 byte unicode */
209 if ((**inbuf & 0xc0) == 0x80) {
210 st->ustate = U4;
211 st->keepc[1] = **inbuf;
212 c1 = (st->keepc[0]&0x1c)>>2;
213 c2 = ((st->keepc[0]&0x03)<<6) | ((**inbuf)&0x3f);
214 #ifdef DEBUG
215 fprintf(stderr, "UTF8: %02x%02x --> ",
216 st->keepc[0]&ONEBYTE, st->keepc[1]&ONEBYTE);
217 #endif
218 continue; /* should not advance *inbuf */
219 } else {
220 st->_errno = errno = EILSEQ;
221 }
222 break;
223 case U2: /* 3 byte unicode - 2nd byte */
224
225 first_byte = st->keepc[0];
226
227 /* if the first byte is 0xed, it is illegal sequence if the second
228 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
229 */
230 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
231 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
232 st->_errno = errno = EILSEQ;
233 else {
234 st->ustate = U3;
235 st->keepc[1] = **inbuf;
236 }
237 break;
238 case U3: /* 3 byte unicode - 3rd byte */
239 if ((**inbuf & 0xc0) == 0x80) {
240 st->ustate = U4;
241 st->keepc[2] = **inbuf;
242 c1 = ((st->keepc[0]&0x0f)<<4) |
243 ((st->keepc[1]&0x3c)>>2);
244 c2 = ((st->keepc[1]&0x03)<<6) | ((**inbuf)&0x3f);
245 #ifdef DEBUG
246 fprintf(stderr, "UTF8: %02x%02x%02x --> ", st->keepc[0]&ONEBYTE,
247 st->keepc[1]&ONEBYTE, **inbuf&ONEBYTE);
248 #endif
249 continue; /* should not advance *inbuf */
250 } else {
251 st->_errno = errno = EILSEQ;
252 }
253 break;
254 case U4:
255 plane_no = get_plane_no_by_utf(c1, c2, &unidx, &cnscode);
256 if (plane_no == -2)
257 { /* unicode is either 0xFFFE or 0xFFFF */
258 st->_errno = errno = EILSEQ;
259 break;
260 }
261
262 if (plane_no > 0) { /* legal unicode; illegal CNS */
263 if ((st->istate == IN) || (pre_plane_no != plane_no)) {
264 if ((st->cstate == C0) ||
265 (pre_plane_no != plane_no)) {
266 /* change plane # in Chinese mode */
267 if (st->cstate == C1) {
268 **outbuf = SI;
269 (*outbuf)++;
270 (*outbytesleft)--;
271 }
272 if (*outbytesleft < 4) {
273 st->_errno = errno = E2BIG;
274 return((size_t) -1);
275 }
276 pre_plane_no = plane_no;
277 st->cstate = C1;
278 **outbuf = ESC;
279 *(*outbuf+1) = '$';
280 *(*outbuf+2) = ')';
281 *(*outbuf+3) = GET_PLANEC(plane_no);
282 #ifdef DEBUG
283 fprintf(stderr, "\n\t\t\t\tESC $ ) %c\t", *(*outbuf+3));
284 #endif
285 (*outbuf) += 4;
286 (*outbytesleft) -= 4;
287 if (*outbytesleft <= 0) {
288 st->_errno = errno = E2BIG;
289 return((size_t) -1);
290 }
291 }
292 st->istate = OUT;
293 **outbuf = SO;
294 (*outbuf)++;
295 (*outbytesleft)--;
296 }
297 }/* get_plane_no OK */
298
299 n = utf8_to_iso(plane_no, unidx, cnscode,
300 *outbuf, *outbytesleft);
301 if (n > 0) {
302 (*outbuf) += n;
303 (*outbytesleft) -= n;
304 } else {
305 st->_errno = errno;
306 return((size_t) -1);
307 }
308 st->ustate = U0;
309 st->_errno = 0;
310 break;
311 case U5:
312
313 first_byte = st->keepc[0];
314
315 /* if the first byte is 0xed, it is illegal sequence if the second
316 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
317 */
318 if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
319 ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
320 st->_errno = errno = EILSEQ;
321 else {
322 st->ustate = U6;
323 st->keepc[1] = **inbuf;
324 }
325 break;
326 case U6:
327 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
328 {
329 st->ustate = U7;
330 st->keepc[2] = **inbuf;
331 }
332 else
333 st->_errno = errno = EILSEQ;
334 break;
335 case U7:
336 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
337 { /* skip it to simplify */
338 st->ustate = U0;
339 st->_errno = 0;
340 }
341 else
342 st->_errno = errno = EILSEQ;
343 break;
344 default: /* should never come here */
345 st->_errno = errno = EILSEQ;
346 st->ustate = U0; /* reset state */
347 break;
348 }
349
350 if (st->_errno) {
351 #ifdef DEBUG
352 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->ustate = %d\n",
353 st->_errno, st->ustate);
354 #endif
355 break;
356 }
357 (*inbuf)++;
358 (*inbytesleft)--;
359 }
360
361 if (errno)
362 return((size_t) -1);
363
364 if (*inbytesleft == 0 && st->ustate != U0) {
365 errno = EINVAL;
366 return ((size_t) -1);
367 }
368
369 if (*inbytesleft > 0 && *outbytesleft == 0) {
370 errno = E2BIG;
371 return((size_t) -1);
372 }
373 return (*inbytesleft);
374 }
375
376
377 /*
378 * Get plane number by UTF8 code; i.e. plane #1 returns 1, #2 returns 2, etc.
379 * Returns -1 on error conditions and return -2 due to illegal sequence
380 *
381 * Since binary search of the UTF8 to CNS table is necessary, might as well
382 * return index and CNS code matching to the unicode.
383 */
get_plane_no_by_utf(const char c1,const char c2,int * unidx,unsigned long * cnscode)384 static int get_plane_no_by_utf(const char c1, const char c2,
385 int *unidx, unsigned long *cnscode)
386 {
387 int ret;
388 unsigned long unicode;
389
390 unicode = (unsigned long) ((c1 & ONEBYTE) << 8) + (c2 & ONEBYTE);
391 /* the 0xfffe and 0xffff should not be allowed */
392 if ( unicode == 0xFFFE || unicode == 0xFFFF ) return -2;
393
394 *unidx = binsearch(unicode, utf_cns_tab, MAX_UTF_NUM);
395 if ((*unidx) >= 0)
396 *cnscode = utf_cns_tab[*unidx].cnscode;
397 else
398 return(0); /* match from UTF8 to CNS not found */
399 #ifdef DEBUG
400 fprintf(stderr, "Unicode=%04x, idx=%5d, CNS=%x ", unicode, *unidx, *cnscode);
401 #endif
402
403 ret = (int) (*cnscode >> 16);
404 switch (ret) {
405 case 0x21: /* 0x8EA1 - G */
406 case 0x22: /* 0x8EA2 - H */
407 case 0x23: /* 0x8EA3 - I */
408 case 0x24: /* 0x8EA4 - J */
409 case 0x25: /* 0x8EA5 - K */
410 case 0x26: /* 0x8EA6 - L */
411 case 0x27: /* 0x8EA7 - M */
412 case 0x28: /* 0x8EA8 - N */
413 case 0x29: /* 0x8EA9 - O */
414 case 0x2a: /* 0x8EAA - P */
415 case 0x2b: /* 0x8EAB - Q */
416 case 0x2c: /* 0x8EAC - R */
417 case 0x2d: /* 0x8EAD - S */
418 case 0x2f: /* 0x8EAF - U */
419 case 0x30: /* 0x8EB0 - V */
420 return (ret - 0x20); /* so that we can use GET_PLANEC() */
421 case 0x2e: /* 0x8EAE - T */
422 return (3); /* CNS 11643-1992 */
423 default:
424 return (-1);
425 }
426 }
427
428
429 /*
430 * ISO/IEC 10646 (Unicode) --> ISO 2022-7
431 * Unicode --> UTF8 (FSS-UTF)
432 * (File System Safe Universal Character Set Transformation Format)
433 * Return: > 0 - converted with enough space in output buffer
434 * = 0 - no space in outbuf
435 */
utf8_to_iso(int plane_no,int unidx,unsigned long cnscode,char * buf,size_t buflen)436 static int utf8_to_iso(int plane_no, int unidx, unsigned long cnscode,
437 char *buf, size_t buflen)
438 {
439 unsigned long val; /* CNS 11643 value */
440 #ifdef DEBUG
441 char cns_str[5];
442 #endif
443
444 if (buflen < 2) {
445 errno = E2BIG;
446 return(0);
447 }
448
449
450 if (unidx < 0) { /* no match from UTF8 to CNS 11643 */
451 *buf = *(buf+1) = NON_ID_CHAR;
452 return(2);
453 } else {
454 val = cnscode & 0xffff;
455 *buf = (val & 0xff00) >> 8;
456 *(buf+1) = val & 0xff;
457 }
458 #ifdef DEBUG
459 fprintf(stderr, "\t%02x%02x\t", *buf, *(buf+1));
460 #endif
461
462 #ifdef DEBUG
463 switch (plane_no) {
464 case 1:
465 cns_str[0] = *buf | MSB;
466 cns_str[1] = *(buf+1) | MSB;
467 cns_str[2] = cns_str[3] = cns_str[4] = NULL;
468 break;
469 case 2:
470 case 3:
471 case 4:
472 case 5:
473 case 6:
474 case 7:
475 case 8:
476 case 9:
477 case 10:
478 case 11:
479 case 12:
480 case 13:
481 case 14:
482 case 15:
483 case 16:
484 cns_str[0] = MBYTE;
485 cns_str[1] = (char) PMASK + plane_no;
486 cns_str[2] = (char) *buf | MSB;
487 cns_str[3] = (char) *(buf+1) | MSB;
488 cns_str[4] = NULL;
489 break;
490 }
491
492 fprintf(stderr, "#%d ->%s<-\n", plane_no, cns_str);
493 #endif
494 return(2);
495 }
496
497
498 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,utf_cns v[],int n)499 static int binsearch(unsigned long x, utf_cns v[], int n)
500 {
501 int low, high, mid;
502
503 low = 0;
504 high = n - 1;
505 while (low <= high) {
506 mid = (low + high) / 2;
507 if (x < v[mid].unicode)
508 high = mid - 1;
509 else if (x > v[mid].unicode)
510 low = mid + 1;
511 else /* found match */
512 return mid;
513 }
514 return (-1); /* no match */
515 }
516