/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright(c) 2001 Sun Microsystems, Inc.
 * All rights reserved.
 */
25 #include <stdio.h>
26 #include <errno.h>
27 #include <stdlib.h>
28 #include <strings.h>
29 #include <sys/types.h>
30 #include "iscii.h"
31 #include "common_defs.h"
32
#define	MSB		0x80	/* most significant bit */
#define	ONEBYTE		0xff	/* right most byte */

/* Byte written to the output when a character has no ISCII mapping. */
#define	REPLACE_CHAR	'?'

/*
 * Length (1..6) of the UTF-8 sequence introduced by lead byte Ch, decided
 * purely by numeric thresholds.  No validation is done: values in
 * 0x80..0xbf yield 2.  Ch is evaluated several times, so it must be free
 * of side effects.
 */
#define	utf8_len(Ch) \
	((Ch) < 0x80 ? 1 : ((Ch) < 0xe0 ? 2 : ((Ch) < 0xf0 ? 3 : \
	((Ch) < 0xf8 ? 4 : ((Ch) < 0xfc ? 5 : 6)))))

/*
 * Classify UTF-8 lead byte Ch: store the sequence length in nBytes and the
 * bit mask that extracts the lead byte's payload bits in Mask.  When Ch is
 * not a valid lead-byte pattern nBytes is set to -1 and Mask is left
 * untouched.  Ch is evaluated several times.
 *
 * Wrapped in do { } while (0) so that the macro is a single statement and
 * can be used safely as the body of an unbraced if/else.
 */
#define	analyze_utf8(Ch, Mask, nBytes) \
	do { \
		if ((Ch) < 128) { \
			(nBytes) = 1; \
			(Mask) = 0x7f; \
		} else if (((Ch) & 0xe0) == 0xc0) { \
			(nBytes) = 2; \
			(Mask) = 0x1f; \
		} else if (((Ch) & 0xf0) == 0xe0) { \
			(nBytes) = 3; \
			(Mask) = 0x0f; \
		} else if (((Ch) & 0xf8) == 0xf0) { \
			(nBytes) = 4; \
			(Mask) = 0x07; \
		} else if (((Ch) & 0xfc) == 0xf8) { \
			(nBytes) = 5; \
			(Mask) = 0x03; \
		} else if (((Ch) & 0xfe) == 0xfc) { \
			(nBytes) = 6; \
			(Mask) = 0x01; \
		} else { \
			(nBytes) = -1; \
		} \
	} while (0)

/*
 * Decode the Len-byte UTF-8 sequence at Ch into mUCS.  Mask selects the
 * payload bits of the lead byte; every following byte must have the form
 * 10xxxxxx and contributes its low six bits.  If a following byte does not
 * have that form, mUCS is set to -1 and decoding stops.  Ct is a caller
 * supplied scratch index.
 *
 * Wrapped in do { } while (0) for the same single-statement reason as
 * analyze_utf8; the inner break only leaves the for loop.
 */
#define	ucs2_from_utf8(mUCS, Ch, Ct, Mask, Len) \
	do { \
		(mUCS) = (Ch)[0] & (Mask); \
		for ((Ct) = 1; (Ct) < (Len); ++(Ct)) { \
			if (((Ch)[(Ct)] & 0xc0) != 0x80) { \
				(mUCS) = -1; \
				break; \
			} \
			(mUCS) <<= 6; \
			(mUCS) |= ((Ch)[(Ct)] & 0x3f); \
		} \
	} while (0)

73
74 typedef struct _icv_state {
75 char aATR;
76 uchar_t keepc[4];
77 int halant_context; /* preceded by the Halant character or not */
78 int _ustate;
79 int _errno;
80 } _iconv_st;
81
/*
 * UTF-8 decoder states stored in _ustate.
 */
enum _CSTATE {
	U0,	/* between characters; next byte is a lead byte */
	U1,	/* 2-byte sequence: waiting for the 2nd byte */
	U2,	/* 3-byte sequence: waiting for the 2nd byte */
	U3,	/* 3-byte sequence: waiting for the 3rd byte */
	U4,	/* 4-byte sequence: waiting for the 2nd byte */
	U5,	/* 4-byte sequence: waiting for the 3rd byte */
	U6	/* 4-byte sequence: waiting for the 4th byte */
};
83
84 /*
85 * Open; called from iconv_open()
86 */
87 void *
_icv_open()88 _icv_open()
89 {
90 _iconv_st *st;
91
92 if ((st = (_iconv_st*)malloc(sizeof(_iconv_st))) == NULL) {
93 errno = ENOMEM;
94 return ((void*)-1);
95 }
96
97 bzero(st, sizeof(_iconv_st));
98 st->aATR = 0x42; /* Devanagiri */
99
100 return ((void*)st);
101 }
102
/*
 * Kind of table entry matched by traverse_table(); it selects how
 * ucs_to_iscii() lays out the output bytes.
 */
typedef enum {
	t_NONE,		/* plain mapping or no special handling */
	t_NUKTA,	/* ISCII byte followed by the nukta byte */
	t_EXT,		/* extension byte followed by the ISCII byte */
	t_HALANT,	/* ZWJ/ZWNJ, meaningful only after a halant */
	t_DOUBLE_DANDA	/* danda byte followed by the ISCII byte */
} Type;
104
105 static int
traverse_table(Entry * entry,int num,ucs_t ucs,Type * type)106 traverse_table(Entry *entry, int num, ucs_t ucs, Type *type)
107 {
108 int i=0;
109 int retc=0;
110
111 *type = t_NONE;
112
113 for ( ; i < num; ++i ) {
114 Entry en = entry[i];
115
116 if (en.count == NUKTA || en.count == EXT || en.count == HALANT || en.count == DOUBLE_DANDA) {
117 if ( ucs < en.ucs ) break;
118 if ( ucs == en.ucs ) { /* found */
119 if ( en.count == NUKTA ) *type = t_NUKTA;
120 if ( en.count == EXT ) *type = t_EXT;
121 if ( en.count == HALANT ) *type = t_HALANT;
122 if ( en.count == DOUBLE_DANDA ) *type = t_DOUBLE_DANDA;
123 retc = en.iscii;
124 break;
125 }
126 } else {
127 if ( ucs < en.ucs ) break;
128 if ( ucs >= en.ucs && ucs < en.ucs + en.count ) {
129 retc = en.iscii + ( ucs - en.ucs );
130 break;
131 }
132 }
133 }
134
135 return retc;
136 }
137
138 static int
ucs_to_iscii(ucs_t uiid,char ** outbuf,size_t * outbytesleft,int isc_type,int * halant_context)139 ucs_to_iscii(ucs_t uiid, char **outbuf, size_t *outbytesleft, int isc_type, int *halant_context)
140 {
141 int nBytesRet = 0 ;
142 Type type = t_NONE;
143 int iscii;
144 Entries en = unicode_table[isc_type];
145
146 if ( *outbytesleft == 0 ) {
147 errno = E2BIG;
148 return 0;
149 }
150
151 iscii = traverse_table(en.entry, en.items, uiid, &type);
152 if ( iscii == 0 ) {
153 **outbuf = REPLACE_CHAR;
154 nBytesRet ++;
155 } else {
156 if ( type != t_NONE ) {
157
158 /* buggy code */
159 if ( *outbytesleft < 2 ) {
160 errno = E2BIG;
161 return 0;
162 }
163
164 switch (type)
165 {
166 case t_NUKTA:
167 **outbuf = (uchar_t) iscii;
168 *(*outbuf+1) = ISC_nukta;
169 nBytesRet = 2;
170
171 break;
172 case t_EXT:
173 **outbuf = ISC_ext;
174 *(*outbuf+1) = (uchar_t) iscii;
175 nBytesRet = 2;
176
177 break;
178 case t_HALANT:
179 if ( (uiid == UNI_ZWJ || uiid == UNI_ZWNJ) && *halant_context )
180 {
181 if ( uiid == UNI_ZWJ ) **outbuf = ISC_nukta; /* soft halant */
182 else **outbuf = ISC_halant; /* explicit halant */
183
184 nBytesRet = 1;
185 } /* consume the UNI_ZWNJ or UNI_ZWJ if *halant_context is 0 */
186
187 break;
188 case t_DOUBLE_DANDA:
189 **outbuf = ISC_danda;
190 *(*outbuf+1) = (uchar_t) iscii;
191 nBytesRet = 2;
192 break;
193 case t_NONE:
194 /* Not reached */
195 break;
196 }
197 } else {
198 **outbuf = (uchar_t) iscii;
199 nBytesRet = 1;
200 }
201 }
202
203 /* if iscii == ISC_halant but type == t_HALANT, set *halant_context to 0 */
204 if ( iscii == ISC_halant && type == t_NONE ) *halant_context = 1;
205 else *halant_context = 0;
206
207 return nBytesRet;
208 }
209
210 /*
211 * Close; called from iconv_close()
212 */
213 void
_icv_close(_iconv_st * st)214 _icv_close(_iconv_st *st)
215 {
216 if (!st)
217 errno = EBADF;
218 else
219 free(st);
220 }
221
222 /*
223 * Conversion routine; called from iconv()
224 */
225 size_t
_icv_iconv(_iconv_st * st,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft)226 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
227 char **outbuf, size_t *outbytesleft)
228 {
229 int n=0;
230
231 if (st == NULL) {
232 errno = EBADF;
233 return ((size_t) -1);
234 }
235
236
237 if (inbuf == NULL || *inbuf == NULL) { /* Reset request. */
238 st->aATR = 0x42; /* Devangiri */
239 st->_ustate = U0;
240 st->_errno = 0;
241 return ((size_t) 0);
242 }
243
244 st->_errno = errno = 0;
245
246 while (*inbytesleft > 0 && *outbytesleft > 0) {
247
248 uchar_t first_byte;
249
250 switch ( st->_ustate ) {
251 case U0:
252 if ((**inbuf & MSB) == 0) { /* ASCII */
253 **outbuf = **inbuf;
254 (*outbuf)++; (*outbytesleft)--;
255 } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */
256
257 /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
258 if ( number_of_bytes_in_utf8_char[((uchar_t) **inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
259 errno = EILSEQ;
260 else {
261 st->_ustate = U1;
262 st->keepc[0] = **inbuf;
263 }
264 } else if ((**inbuf & 0xf0) == 0xe0) {
265 st->_ustate = U2;
266 st->keepc[0] = **inbuf;
267 } else {
268 /* four bytes of UTF-8 sequences */
269 if ( number_of_bytes_in_utf8_char[((uchar_t) **inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
270 errno = EILSEQ;
271 else {
272 st->_ustate = U4;
273 st->keepc[0] = **inbuf;
274 }
275 }
276 break;
277 case U1:
278 if ((**inbuf & 0xc0) == MSB) { /* U+0080 -- U+07FF */
279 **outbuf = REPLACE_CHAR;
280 (*outbuf)++;
281 (*outbytesleft)--;
282 st->_ustate = U0;
283 } else {
284 errno = EILSEQ;
285 }
286 break;
287 case U2:
288
289 first_byte = st->keepc[0];
290
291 /* if the first byte is 0xed, it is illegal sequence if the second
292 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
293 */
294 if (((uchar_t) **inbuf) < valid_min_2nd_byte[first_byte] ||
295 ((uchar_t) **inbuf) > valid_max_2nd_byte[first_byte] )
296 errno = EILSEQ;
297 else {
298 st->_ustate = U3;
299 st->keepc[1] = **inbuf;
300 }
301 break;
302 case U3:
303 if ((**inbuf & 0xc0) == MSB) {
304 unsigned char mChar = st->keepc[0];
305 ucs_t ucsid = 0;
306 int i=0, mask=0, len=0;
307 ISCII isc_type;
308
309 st->keepc[2] = **inbuf;
310
311 analyze_utf8(mChar, mask, len);
312
313 ucs2_from_utf8(ucsid, (char *)&st->keepc[0], i, mask, len);
314
315 /* 0xfffe and 0xffff should not be allowed */
316 if ( ucsid == 0xFFFE || ucsid == 0xFFFF )
317 {
318 errno = EILSEQ;
319 break;
320 }
321
322 get_script_types(ucsid, isc_type);
323 if ( isc_type != NUM_ISCII && st->aATR != aTRs[isc_type] ) {
324 if ( *outbytesleft < 2 ) {
325 errno = E2BIG;
326 return (size_t)-1;
327 }
328
329 **outbuf = (uchar_t)ISC_atr;
330 (*outbuf)++;
331 **outbuf = aTRs[isc_type];
332 (*outbuf)++;
333 (*outbytesleft)-=2;
334 st->aATR = aTRs[isc_type];
335 }
336
337 /* UNI_INV, UNI_ZWJ, UNI_ZWNJ would occur within any India Script as
338 Consonant invisible, explicit halant and soft halant */
339 if ( ucsid == UNI_INV || ucsid == UNI_ZWNJ || ucsid == UNI_ZWJ )
340 isc_type = isc_TYPE[ st->aATR - 0x42 ];
341
342 if ( isc_type == NUM_ISCII ) {
343 if ( *outbytesleft < 1 ) {
344 errno = E2BIG;
345 return (size_t)-1;
346 }
347
348 **outbuf = REPLACE_CHAR;
349 (*outbuf)++;
350 (*outbytesleft)--;
351 } else {
352 n = ucs_to_iscii(ucsid, outbuf, outbytesleft, isc_type, &st->halant_context);
353 if ( n > 0 ) {
354 (*outbuf) += n;
355 (*outbytesleft) -= n;
356 } else if ( errno == E2BIG ) {
357 /* n == 0 if the ZWJ or ZWNJ has been consumed without error */
358 st->_errno = errno;
359 errno = E2BIG;
360 return (size_t)-1;
361 }
362 }
363 } else {
364 errno = EILSEQ;
365 return (size_t)-1;
366 }
367 st->_ustate = U0;
368 break;
369 case U4:
370
371 first_byte = st->keepc[0];
372
373 /* if the first byte is 0xf0, it is illegal sequence if
374 * the second one is between 0x80 and 0x8f
375 * for Four-Byte UTF: U+10000..U+10FFFF
376 */
377 if (((uchar_t) **inbuf) < valid_min_2nd_byte[first_byte] ||
378 ((uchar_t) **inbuf) > valid_max_2nd_byte[first_byte] )
379 errno = EILSEQ;
380 else {
381 st->_ustate = U5;
382 st->keepc[1] = **inbuf;
383 }
384 break;
385 case U5:
386 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
387 {
388 st->_ustate = U6;
389 st->keepc[2] = **inbuf;
390 }
391 else
392 errno = EILSEQ;
393 break;
394 case U6:
395 if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
396 {
397 st->keepc[3] = **inbuf;
398 st->_ustate = U0;
399
400 /* replace with REPLACE_CHAR */
401 **outbuf = REPLACE_CHAR;
402 (*outbuf)++;
403 (*outbytesleft)--;
404 }
405 else
406 errno = EILSEQ;
407 break;
408 }
409
410 if (errno)
411 break;
412
413 (*inbuf)++;
414 (*inbytesleft)--;
415 } /* end of while loop */
416
417 if (errno) return (size_t) -1;
418
419 if (*inbytesleft == 0 && st->_ustate != U0) {
420 errno = EINVAL;
421 return (size_t)-1;
422 }
423
424 if (*inbytesleft > 0 && *outbytesleft == 0) {
425 errno = E2BIG;
426 return((size_t)-1);
427 }
428
429 return (size_t)(*inbytesleft);
430 }
431