1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, by Sun Microsystems, Inc.
24 * All rights reserved.
25 */
26
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <errno.h>
30 #include "big5p_unicode.h" /* Big-5 Plus to Unicode mapping table */
31
#define MSB 0x80 /* most significant bit of a byte; set on non-ASCII input bytes */
#define ONEBYTE 0xff /* mask that keeps only the low-order 8 bits of a value */

/*
 * Non-identified character: the three bytes EF BF BD are the UTF-8
 * encoding of U+FFFD.  They are written to the output whenever a
 * Big-5 Plus code cannot be converted.
 */
#define UTF8_NON_ID_CHAR1 0xEF
#define UTF8_NON_ID_CHAR2 0xBF
#define UTF8_NON_ID_CHAR3 0xBD
39
40
/*
 * Conversion state; allocated by _icv_open(), carried across calls to
 * _icv_iconv() and released by _icv_close().
 */
typedef struct _icv_state {
	char keepc[2]; /* bytes of the Big-5 Plus character being assembled: */
			/* [0] = first byte, [1] = second byte */
	short cstate; /* state machine id (C0 or C1) */
	int _errno; /* internal errno; cleared at the start of each */
			/* conversion call and set together with errno on error */
}_iconv_st;
46
/*
 * State machine ids:
 *	C0 - expecting an ASCII byte or the first byte of a two-byte character
 *	C1 - first byte saved in keepc[0]; expecting the second byte
 */
enum _CSTATE { C0, C1 };
48
/* Forward declarations of the file-local helpers defined below. */
static int big5p_2nd_byte(char);		/* 0 if valid 2nd byte, else 1 */
static int big5p_to_utf8(char[], char*, size_t);	/* # of UTF-8 bytes written, 0 if no room */
static int binsearch(unsigned long, big5p_utf[], int);	/* table index, or -1 */
52
53
54 /*
55 * Open; called from iconv_open()
56 */
57 void *
_icv_open()58 _icv_open()
59 {
60 _iconv_st *st;
61
62 if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
63 errno = ENOMEM;
64 return ((void *) -1);
65 }
66
67 st->cstate = C0;
68 st->_errno = 0;
69
70 return ((void *) st);
71 }
72
73
74 /*
75 * Close; called from iconv_close()
76 */
77 void
_icv_close(_iconv_st * st)78 _icv_close(_iconv_st *st)
79 {
80 if (!st)
81 errno = EBADF;
82 else
83 free(st);
84 }
85
86
87 /*
88 * Actual conversion; called from iconv()
89 */
90 /*=======================================================
91 *
92 * State Machine for interpreting Big-5 code
93 *
94 *=======================================================
95 *
96 * 1st C
97 * +--------> C0 ----------> C1
98 * | ascii | 2nd C |
99 * ^ v v
100 * +----<-----+-----<--------+
101 *
102 *=======================================================*/
103 /*
104 * Big-5 Plus encoding range:
105 * High byte: 0x81 - 0xFE
106 * Low byte: 0x40 - 0xFE
107 */
108 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)109 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
110 char **outbuf, size_t *outbytesleft)
111 {
112 int n;
113
114 #ifdef DEBUG
115 fprintf(stderr, "========== iconv(): Big-5 --> UTF2 ==========\n");
116 #endif
117 if (st == NULL) {
118 errno = EBADF;
119 return ((size_t) -1);
120 }
121
122 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
123 st->cstate = C0;
124 st->_errno = 0;
125 return ((size_t) 0);
126 }
127
128 st->_errno = 0; /* reset internal errno */
129 errno = 0; /* reset external errno */
130
131 /* a state machine for interpreting CNS 11643 code */
132 while (*inbytesleft > 0 && *outbytesleft > 0) {
133 switch (st->cstate) {
134 case C0: /* assuming ASCII in the beginning */
135 if (**inbuf & MSB) {
136 st->keepc[0] = (**inbuf);
137 st->cstate = C1;
138 } else { /* real ASCII */
139 **outbuf = **inbuf;
140 (*outbuf)++;
141 (*outbytesleft)--;
142 }
143 break;
144 case C1: /* Chinese characters: 2nd byte */
145 if (big5p_2nd_byte(**inbuf) == 0) {
146 st->keepc[1] = (**inbuf);
147 n = big5p_to_utf8(st->keepc, *outbuf,
148 *outbytesleft);
149 if (n > 0) {
150 (*outbuf) += n;
151 (*outbytesleft) -= n;
152
153 st->cstate = C0;
154 } else { /* don't reset state */
155 st->_errno = errno = E2BIG;
156 }
157 } else { /* input char doesn't belong
158 * to the input code set
159 */
160 st->_errno = errno = EILSEQ;
161 }
162 break;
163 default: /* should never come here */
164 st->_errno = errno = EILSEQ;
165 st->cstate = C0; /* reset state */
166 break;
167 }
168
169 if (st->_errno) {
170 #ifdef DEBUG
171 fprintf(stderr, "!!!!!\tst->_errno = %d\tst->cstate = %d\n",
172 st->_errno, st->cstate);
173 #endif
174 break;
175 }
176
177 (*inbuf)++;
178 (*inbytesleft)--;
179 }
180
181 if (errno) return ((size_t) -1);
182
183 if (*inbytesleft == 0 && st->cstate != C0) {
184 errno = EINVAL;
185 return ((size_t) -1);
186 }
187
188 if (*inbytesleft > 0 && *outbytesleft == 0) {
189 errno = E2BIG;
190 return((size_t) -1);
191 }
192 return (*inbytesleft);
193 }
194
195
/*
 * Check whether a byte may appear as the second byte of a Big-5 Plus
 * character, i.e. whether its unsigned value lies in 0x40 - 0xFE.
 * Return: = 0 - valid Big-5 2nd byte
 *	   = 1 - invalid Big-5 2nd byte
 */
static int big5p_2nd_byte(char inbuf)
{
	/* compare as an unsigned byte so values >= 0x80 are not negative */
	unsigned char second = (unsigned char)inbuf;

	if (second < 0x40 || second > 0xFE)
		return (1);

	return (0);
}
209
210
211 /*
212 * Big-5 code --> ISO/IEC 10646 (Unicode)
213 * Unicode --> UTF8 (FSS-UTF)
214 * (File System Safe Universal Character Set Transformation Format)
215 * Return: > 0 - converted with enough space in output buffer
216 * = 0 - no space in outbuf
217 */
big5p_to_utf8(char keepc[],char * buf,size_t buflen)218 static int big5p_to_utf8(char keepc[], char *buf, size_t buflen)
219 {
220 unsigned long big5p_val; /* Big-5 value */
221 int unidx; /* Unicode index */
222 unsigned long uni_val; /* Unicode */
223
224 big5p_val = ((keepc[0]&ONEBYTE) << 8) + (keepc[1]&ONEBYTE);
225 #ifdef DEBUG
226 fprintf(stderr, "%x\t", big5p_val);
227 #endif
228
229 unidx = binsearch(big5p_val, big5p_utf_tab, MAX_BIG5P_NUM);
230 if (unidx >= 0)
231 uni_val = big5p_utf_tab[unidx].unicode;
232 #ifdef DEBUG
233 fprintf(stderr, "unidx = %d, unicode = %x\t", unidx, uni_val);
234 #endif
235
236 if (unidx >= 0) { /* do Unicode to UTF8 conversion */
237 if (uni_val > 0x0080 && uni_val <= 0x07ff) {
238 if (buflen < 2) {
239 #ifdef DEBUG
240 fprintf(stderr, "outbuf overflow in big5p_to_utf8()!!\n");
241 #endif
242 errno = E2BIG;
243 return(0);
244 }
245 *buf = (char)((uni_val >> 6) & 0x1f) | 0xc0;
246 *(buf+1) = (char)(uni_val & 0x3f) | 0x80;
247 #ifdef DEBUG
248 fprintf(stderr, "%x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE);
249 #endif
250 return(2);
251 }
252 if (uni_val > 0x0800 && uni_val <= 0xffff) {
253 if (buflen < 3) {
254 #ifdef DEBUG
255 fprintf(stderr, "outbuf overflow in big5p_to_utf8()!!\n");
256 #endif
257 errno = E2BIG;
258 return(0);
259 }
260 *buf = (char)((uni_val >> 12) & 0xf) | 0xe0;
261 *(buf+1) = (char)((uni_val >>6) & 0x3f) | 0x80;
262 *(buf+2) = (char)(uni_val & 0x3f) | 0x80;
263 #ifdef DEBUG
264 fprintf(stderr, "%x %x %x\n", *buf&ONEBYTE, *(buf+1)&ONEBYTE, *(buf+2)&ONEBYTE);
265 #endif
266 return(3);
267 }
268 }
269
270 /* can't find a match in Big-5 --> UTF8 table or illegal UTF8 code */
271 if (buflen < 3) {
272 #ifdef DEBUG
273 fprintf(stderr, "outbuf overflow in big5p_to_utf8()!!\n");
274 #endif
275 errno = E2BIG;
276 return(0);
277 }
278
279 *(unsigned char*) buf = UTF8_NON_ID_CHAR1;
280 *(unsigned char*)(buf+1) = UTF8_NON_ID_CHAR2;
281 *(unsigned char*)(buf+2) = UTF8_NON_ID_CHAR3;
282
283 #ifdef DEBUG
284 fprintf(stderr, "%c %c %c\n", *buf, *(buf+1), *(buf+2));
285 #endif
286 return(3);
287 }
288
289
290 /* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1] */
binsearch(unsigned long x,big5p_utf v[],int n)291 static int binsearch(unsigned long x, big5p_utf v[], int n)
292 {
293 int low, high, mid;
294
295 low = 0;
296 high = n - 1;
297 while (low <= high) {
298 mid = (low + high) / 2;
299 if (x < v[mid].big5pcode)
300 high = mid - 1;
301 else if (x > v[mid].big5pcode)
302 low = mid + 1;
303 else /* found match */
304 return mid;
305 }
306 return (-1); /* no match */
307 }
308