xref: /titanic_51/usr/src/lib/iconv_modules/hi_IN/UTF-8%iscii91.c (revision 91e1e26ac6a73ce959289cf7d3d96c4baedbe0b8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright(c) 2001 Sun Microsystems, Inc.
23  * All rights reserved.
24  */
25 #include <stdio.h>
26 #include <errno.h>
27 #include <stdlib.h>
28 #include <strings.h>
29 #include <sys/types.h>
30 #include "iscii.h"
31 #include "common_defs.h"
32 
33 #define MSB          0x80    /* most significant bit */
34 #define ONEBYTE      0xff    /* right most byte */
35 
36 #define REPLACE_CHAR '?'
37 
/*
 * Length in bytes of the UTF-8 sequence introduced by lead byte Ch, decided
 * purely by comparing Ch against 0x80/0xe0/0xf0/0xf8/0xfc.  No validation is
 * done: values in 0x80..0xbf (continuation bytes) also yield 2.
 *
 * Ch is fully parenthesised so that an expression argument such as
 * (a | b) is compared as a whole.  Ch is evaluated several times; do not
 * pass an expression with side effects.
 */
#define utf8_len(Ch) ((Ch) < 0x80 ? 1 : ((Ch) < 0xe0 ? 2 : ((Ch) < 0xf0 ? 3 : ((Ch) < 0xf8 ? 4 : ((Ch) < 0xfc ? 5 : 6)))))
39 
/*
 * Classify the UTF-8 lead byte Ch.
 *
 *   Ch     - lead byte value; should be an unsigned quantity, because any
 *            value below 128 (including a negative one) is treated as a
 *            one-byte sequence.
 *   Mask   - lvalue; receives the mask that extracts the payload bits of
 *            the lead byte (0x7f, 0x1f, 0x0f, 0x07, 0x03 or 0x01).
 *   nBytes - lvalue; receives the sequence length 1..6, or -1 when Ch does
 *            not match any lead-byte pattern (Mask is then set to 0).
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in an unbraced if/else), and every argument is
 * parenthesised so expression arguments are evaluated as a unit.
 */
#define analyze_utf8(Ch, Mask, nBytes) \
    do { \
        if ((Ch) < 128) { \
            (nBytes) = 1; \
            (Mask) = 0x7f; \
        } else if (((Ch) & 0xe0) == 0xc0) { \
            (nBytes) = 2; \
            (Mask) = 0x1f; \
        } else if (((Ch) & 0xf0) == 0xe0) { \
            (nBytes) = 3; \
            (Mask) = 0x0f; \
        } else if (((Ch) & 0xf8) == 0xf0) { \
            (nBytes) = 4; \
            (Mask) = 0x07; \
        } else if (((Ch) & 0xfc) == 0xf8) { \
            (nBytes) = 5; \
            (Mask) = 0x03; \
        } else if (((Ch) & 0xfe) == 0xfc) { \
            (nBytes) = 6; \
            (Mask) = 0x01; \
        } else { \
            (nBytes) = -1; \
            (Mask) = 0; \
        } \
    } while (0)
61 
/*
 * Assemble a code point from a UTF-8 byte sequence.
 *
 *   mUCS - lvalue; receives the decoded value, or -1 (converted to the
 *          type of mUCS) as soon as a trailing byte is not of the form
 *          10xxxxxx.
 *   Ch   - pointer/array giving the bytes of the sequence.
 *   Ct   - integer lvalue used as the loop counter; clobbered.
 *   Mask - payload mask for the lead byte (as produced by analyze_utf8).
 *   Len  - total number of bytes in the sequence; when Len <= 1 only the
 *          masked lead byte is stored.
 *
 * Wrapped in do { } while (0) so it expands to exactly one statement, and
 * the definition no longer ends in a line continuation.
 */
#define ucs2_from_utf8(mUCS, Ch, Ct, Mask, Len) \
    do { \
        (mUCS) = (Ch)[0] & (Mask); \
        for ((Ct) = 1; (Ct) < (Len); ++(Ct)) { \
            if (((Ch)[(Ct)] & 0xc0) != 0x80) { \
                (mUCS) = -1; \
                break; \
            } \
            (mUCS) <<= 6; \
            (mUCS) |= ((Ch)[(Ct)] & 0x3f); \
        } \
    } while (0)
73 
74 typedef struct _icv_state {
75     char    aATR;
76     uchar_t   keepc[4];
77     int     halant_context; /* preceded by the Halant character or not */
78     int     _ustate;
79     int     _errno;
80 } _iconv_st;
81 
/*
 * Decoder states kept in _iconv_st._ustate while _icv_iconv() walks the
 * input one byte at a time.
 */
enum _CSTATE {
    U0,     /* at a character boundary; next byte starts a new character */
    U1,     /* two-byte sequence: waiting for the 2nd byte */
    U2,     /* three-byte sequence: waiting for the 2nd byte */
    U3,     /* three-byte sequence: waiting for the 3rd byte */
    U4,     /* four-byte sequence: waiting for the 2nd byte */
    U5,     /* four-byte sequence: waiting for the 3rd byte */
    U6      /* four-byte sequence: waiting for the 4th byte */
};
83 
84 /*
85  * Open; called from iconv_open()
86  */
87 void *
88 _icv_open()
89 {
90     _iconv_st *st;
91 
92     if ((st = (_iconv_st*)malloc(sizeof(_iconv_st))) == NULL) {
93         errno = ENOMEM;
94         return ((void*)-1);
95     }
96 
97     bzero(st, sizeof(_iconv_st));
98     st->aATR = 0x42; /* Devanagiri */
99 
100     return ((void*)st);
101 }
102 
/*
 * Kind of table entry matched by traverse_table(); ucs_to_iscii() uses it
 * to decide which ISCII byte(s) to write.
 */
typedef enum {
    t_NONE,         /* plain entry: the ISCII byte is written on its own */
    t_NUKTA,        /* ISCII byte followed by ISC_nukta */
    t_EXT,          /* ISC_ext followed by the ISCII byte */
    t_HALANT,       /* ZWJ/ZWNJ handling; depends on the halant context */
    t_DOUBLE_DANDA  /* ISC_danda followed by the ISCII byte */
} Type;
104 
105 static int
106 traverse_table(Entry *entry, int num,  ucs_t ucs, Type *type)
107 {
108     int i=0;
109     int retc=0;
110 
111     *type = t_NONE;
112 
113     for ( ; i < num; ++i ) {
114         Entry en = entry[i];
115 
116         if (en.count == NUKTA || en.count == EXT || en.count == HALANT || en.count == DOUBLE_DANDA) {
117             if ( ucs < en.ucs ) break;
118             if ( ucs == en.ucs ) { /* found */
119 	        if ( en.count == NUKTA ) *type = t_NUKTA;
120 	        if ( en.count == EXT ) *type = t_EXT;
121 	        if ( en.count == HALANT ) *type = t_HALANT;
122 	        if ( en.count == DOUBLE_DANDA ) *type = t_DOUBLE_DANDA;
123 		retc = en.iscii;
124                 break;
125             }
126         } else {
127            if ( ucs < en.ucs ) break;
128            if ( ucs >= en.ucs && ucs < en.ucs + en.count ) {
129                retc = en.iscii + ( ucs - en.ucs );
130                break;
131            }
132         }
133     }
134 
135     return retc;
136 }
137 
138 static int
139 ucs_to_iscii(ucs_t uiid, char **outbuf, size_t *outbytesleft, int isc_type, int *halant_context)
140 {
141     int nBytesRet = 0 ;
142     Type type = t_NONE;
143     int iscii;
144     Entries en = unicode_table[isc_type];
145 
146     if ( *outbytesleft == 0 ) {
147         errno = E2BIG;
148         return 0;
149     }
150 
151     iscii = traverse_table(en.entry, en.items,  uiid, &type);
152     if ( iscii == 0 ) {
153         **outbuf = REPLACE_CHAR;
154         nBytesRet ++;
155     } else {
156         if ( type != t_NONE ) {
157 
158             /* buggy code */
159             if ( *outbytesleft < 2 ) {
160                 errno = E2BIG;
161                 return 0;
162             }
163 
164             switch (type)
165             {
166               case t_NUKTA:
167 		**outbuf = (uchar_t) iscii;
168 		*(*outbuf+1) = ISC_nukta;
169                 nBytesRet = 2;
170 
171 		break;
172               case t_EXT:
173                 **outbuf =  ISC_ext;
174                 *(*outbuf+1) = (uchar_t) iscii;
175                 nBytesRet = 2;
176 
177                 break;
178               case t_HALANT:
179                 if ( (uiid == UNI_ZWJ || uiid == UNI_ZWNJ) && *halant_context )
180                  {
181                    if ( uiid == UNI_ZWJ ) **outbuf = ISC_nukta; /* soft halant */
182 		   else **outbuf = ISC_halant; /* explicit halant */
183 
184 		   nBytesRet = 1;
185                  } /* consume the UNI_ZWNJ or UNI_ZWJ if *halant_context is 0 */
186 
187                 break;
188               case t_DOUBLE_DANDA:
189                 **outbuf =  ISC_danda;
190                 *(*outbuf+1) = (uchar_t) iscii;
191                 nBytesRet = 2;
192                 break;
193               case t_NONE:
194                 /* Not reached */
195                 break;
196             }
197         } else {
198             **outbuf = (uchar_t) iscii;
199             nBytesRet = 1;
200         }
201     }
202 
203     /* if iscii == ISC_halant but type == t_HALANT, set *halant_context to 0 */
204     if ( iscii == ISC_halant && type == t_NONE ) *halant_context = 1;
205     else *halant_context = 0;
206 
207     return nBytesRet;
208 }
209 
210 /*
211  * Close; called from iconv_close()
212  */
213 void
214 _icv_close(_iconv_st *st)
215 {
216     if (!st)
217         errno = EBADF;
218     else
219         free(st);
220 }
221 
222 /*
223  * Conversion routine; called from iconv()
224  */
225 size_t
226 _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
227        char **outbuf, size_t *outbytesleft)
228 {
229     int n=0;
230 
231     if (st == NULL)    {
232         errno = EBADF;
233         return ((size_t) -1);
234     }
235 
236 
237     if (inbuf == NULL || *inbuf == NULL) {  /* Reset request. */
238         st->aATR = 0x42; /* Devangiri */
239         st->_ustate = U0;
240         st->_errno = 0;
241         return ((size_t) 0);
242     }
243 
244     st->_errno = errno = 0;
245 
246     while (*inbytesleft > 0 && *outbytesleft > 0) {
247 
248         uchar_t first_byte;
249 
250         switch ( st->_ustate ) {
251         case U0:
252             if ((**inbuf & MSB) == 0) {     /* ASCII */
253                 **outbuf = **inbuf;
254                 (*outbuf)++; (*outbytesleft)--;
255             } else if ((**inbuf & 0xe0) == 0xc0) { /* 0xc2..0xdf */
256 
257 	        /* invalid sequence if the first byte is either 0xc0 or 0xc1 */
258 	        if ( number_of_bytes_in_utf8_char[((uchar_t) **inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
259 		   errno = EILSEQ;
260 	        else {
261                    st->_ustate = U1;
262                    st->keepc[0] = **inbuf;
263 		}
264             } else if ((**inbuf & 0xf0) == 0xe0) {
265                 st->_ustate = U2;
266                 st->keepc[0] = **inbuf;
267             } else {
268 	        /* four bytes of UTF-8 sequences */
269 	        if ( number_of_bytes_in_utf8_char[((uchar_t) **inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
270                    errno = EILSEQ;
271 	        else {
272 		   st->_ustate = U4;
273 		   st->keepc[0] = **inbuf;
274 		}
275             }
276             break;
277         case U1:
278             if ((**inbuf & 0xc0) == MSB) { /* U+0080 -- U+07FF */
279                 **outbuf = REPLACE_CHAR;
280                 (*outbuf)++;
281                 (*outbytesleft)--;
282                 st->_ustate = U0;
283             } else {
284                 errno = EILSEQ;
285             }
286             break;
287         case U2:
288 
289 	    first_byte = st->keepc[0];
290 
291 	    /* if the first byte is 0xed, it is illegal sequence if the second
292 	     * one is between 0xa0 and 0xbf because surrogate section is ill-formed
293 	     */
294 	    if (((uchar_t) **inbuf) < valid_min_2nd_byte[first_byte] ||
295 		((uchar_t) **inbuf) > valid_max_2nd_byte[first_byte] )
296 	        errno = EILSEQ;
297             else {
298                 st->_ustate = U3;
299                 st->keepc[1] = **inbuf;
300             }
301 	    break;
302         case U3:
303             if ((**inbuf & 0xc0) == MSB) {
304                 unsigned char    mChar = st->keepc[0];
305                 ucs_t    ucsid = 0;
306                 int     i=0, mask=0, len=0;
307                 ISCII   isc_type;
308 
309                 st->keepc[2] = **inbuf;
310 
311                 analyze_utf8(mChar, mask, len);
312 
313                 ucs2_from_utf8(ucsid, (char *)&st->keepc[0], i, mask, len);
314 
315 	        /* 0xfffe and 0xffff should not be allowed */
316 	        if ( ucsid == 0xFFFE || ucsid == 0xFFFF )
317 		  {
318 		     errno = EILSEQ;
319 		     break;
320 		  }
321 
322                 get_script_types(ucsid, isc_type);
323                 if ( isc_type != NUM_ISCII && st->aATR != aTRs[isc_type] ) {
324                     if ( *outbytesleft < 2 ) {
325                         errno = E2BIG;
326                         return (size_t)-1;
327                     }
328 
329                     **outbuf = (uchar_t)ISC_atr;
330                     (*outbuf)++;
331                     **outbuf = aTRs[isc_type];
332                     (*outbuf)++;
333                     (*outbytesleft)-=2;
334                     st->aATR = aTRs[isc_type];
335                 }
336 
337                 /* UNI_INV, UNI_ZWJ, UNI_ZWNJ would occur within any India Script as
338                    Consonant invisible, explicit halant and soft halant */
339                 if ( ucsid == UNI_INV || ucsid == UNI_ZWNJ || ucsid == UNI_ZWJ )
340                    isc_type = isc_TYPE[ st->aATR - 0x42 ];
341 
342                 if ( isc_type == NUM_ISCII ) {
343                     if ( *outbytesleft < 1 ) {
344                         errno = E2BIG;
345                         return (size_t)-1;
346                     }
347 
348                     **outbuf = REPLACE_CHAR;
349                     (*outbuf)++;
350                     (*outbytesleft)--;
351                 } else {
352                     n = ucs_to_iscii(ucsid, outbuf, outbytesleft, isc_type, &st->halant_context);
353                     if ( n > 0 ) {
354                         (*outbuf) += n;
355                         (*outbytesleft) -= n;
356                     } else if ( errno == E2BIG ) {
357 		        /* n == 0 if the ZWJ or ZWNJ has been consumed without error */
358                         st->_errno = errno;
359                         errno = E2BIG;
360                         return (size_t)-1;
361                     }
362                 }
363             } else {
364                 errno = EILSEQ;
365                 return (size_t)-1;
366             }
367             st->_ustate = U0;
368             break;
369 	case U4:
370 
371 	    first_byte = st->keepc[0];
372 
373 	    /* if the first byte is 0xf0, it is illegal sequence if
374 	     * the second one is between 0x80 and 0x8f
375 	     * for Four-Byte UTF: U+10000..U+10FFFF
376 	     */
377 	    if (((uchar_t) **inbuf) < valid_min_2nd_byte[first_byte] ||
378 		((uchar_t) **inbuf) > valid_max_2nd_byte[first_byte] )
379 	        errno = EILSEQ;
380 	    else {
381 	        st->_ustate = U5;
382 	        st->keepc[1] = **inbuf;
383 	    }
384 	    break;
385 	case U5:
386 	    if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
387 	     {
388 		st->_ustate = U6;
389 		st->keepc[2] = **inbuf;
390 	     }
391 	    else
392 	        errno = EILSEQ;
393 	    break;
394 	case U6:
395 	    if ((**inbuf & 0xc0) == MSB) /* 0x80..0xbf */
396 	     {
397 		st->keepc[3] = **inbuf;
398 		st->_ustate = U0;
399 
400 		/* replace with REPLACE_CHAR */
401 		**outbuf = REPLACE_CHAR;
402                 (*outbuf)++;
403                 (*outbytesleft)--;
404 	     }
405 	    else
406 	        errno = EILSEQ;
407 	    break;
408         }
409 
410         if (errno)
411             break;
412 
413         (*inbuf)++;
414         (*inbytesleft)--;
415        }    /* end of while loop */
416 
417     if (errno) return (size_t) -1;
418 
419     if (*inbytesleft == 0 && st->_ustate != U0) {
420         errno = EINVAL;
421         return (size_t)-1;
422     }
423 
424     if (*inbytesleft > 0 && *outbytesleft == 0) {
425         errno = E2BIG;
426         return((size_t)-1);
427     }
428 
429     return (size_t)(*inbytesleft);
430 }
431