xref: /illumos-gate/usr/src/lib/iconv_modules/ko/common/ucs_to_unihan.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1998-1999, 2001 by Sun Microsystems, Inc.
23  * All rights reserved.
24  *
25  * Following is how we process BOM and subsequent bytes in this program:
26  * - UCS-2BE, UTF-16BE, UCS-4BE, UTF-32BE, UCS-2LE, UTF-16LE, UCS-4LE, and
27  *   UTF-32LE don't care about BOM. From the beginning, they are properly
28  *   serialized without the BOM character.
29  * - In other encodings, UCS-2, UCS-4, UTF-16, and UTF-32, the initial byte
30  *   ordering is of the current processor's byte ordering. During the first
31  *   iconv() call, if BOM appears as the first character of the entire
32  *   iconv input stream, the byte order will be changed accordingly.
33  *   We will use 'bom_written' data field of the conversion descriptor to
34  *   save this particular information, in other words, whether we have
35  *   already checked the first character of the input stream for the BOM.
36  */
37 
38 
39 #include <stdlib.h>
40 #include <errno.h>
41 #include <sys/types.h>
42 #include <sys/isa_defs.h>
43 #include "ucs_to_unihan.h"
44 #include "common_def.h"
45 #include "common_han.h"
46 
47 typedef struct {
48   int         _magic;
49   boolean     _need_byte_swap;
50   boolean     _bom_written;
51   boolean     _is_little_endian;
52 
53 } _icv_state_t;
54 
55 static hcode_type ucs_to_unihan (uint_t ucs_char);
56 extern hcode_type _utf8_to_unified_hangul (hcode_type);
57 
58 void *
_icv_open()59 _icv_open()
60 {
61   _icv_state_t *cd = (_icv_state_t *)calloc(1, sizeof(_icv_state_t));
62 
63   if (cd == (_icv_state_t *)NULL) {
64     errno = ENOMEM;
65     return((void *)-1);
66   }
67 
68   cd->_magic = MAGIC_NUMBER;
69 
70 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
71 	defined(UTF_32BE)
72   cd->_is_little_endian = false;
73   cd->_bom_written = true;
74 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
75 	defined(UTF_32LE)
76   cd->_is_little_endian = true;
77   cd->_bom_written = true;
78 #elif defined(__IS_LITTLE_ENDIAN)
79   cd->_is_little_endian = true;
80 #endif
81 
82   cd->_need_byte_swap = false;
83 
84   return((void *)cd);
85 }
86 
87 
88 void
_icv_close(_icv_state_t * cd)89 _icv_close(_icv_state_t *cd)
90 {
91   if (! cd)
92     errno = EBADF;
93   else
94     free((void *)cd);
95 }
96 
97 
98 size_t
_icv_iconv(_icv_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)99 _icv_iconv(_icv_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
100 	   size_t *outbufleft)
101 {
102   size_t ret_val = 0;
103   uchar_t *ib;
104   uchar_t *ob;
105   uchar_t *ibtail;
106   uchar_t *obtail;
107   uint_t u4;
108   uint_t u4_2;
109   register int i;
110 
111   hcode_type unihan;
112   unihan.code = 0x00;
113 
114   if (! cd) {
115     errno = EBADF;
116     return((size_t)-1);
117   }
118 
119   if (!inbuf || !(*inbuf))
120     return((size_t)0);
121 
122   ib = (uchar_t *)*inbuf;
123   ob = (uchar_t *)*outbuf;
124   ibtail = ib + *inbufleft;
125   obtail = ob + *outbufleft;
126 
127 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
128   if (! cd->_bom_written) {
129     if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
130       errno = EINVAL;
131       ret_val = (size_t)-1;
132       goto need_more_input_err;
133     }
134 
135     for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
136       u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
137 
138     /* Big endian, Little endian, or, not specified?? */
139     if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
140       ib += ICV_FETCH_UCS_SIZE;
141       cd->_is_little_endian = false;
142     } else if (u4 == ICV_BOM_IN__IS_LITTLE_ENDIAN) {
143       ib += ICV_FETCH_UCS_SIZE;
144       cd->_is_little_endian = true;
145     }
146   }
147   /*
148    * Once BOM checking is done, regardless of whether we had the BOM or
149    * not, we treat the BOM sequence as a ZWNBSP character from now on.
150    */
151   cd->_bom_written = true;
152 #endif
153 
154   while (ib < ibtail) {
155     if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
156       errno = EINVAL;
157       ret_val = (size_t)-1;
158       break;
159     }
160 
161     u4 = u4_2 = 0;
162     if (cd->_is_little_endian) {
163       for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
164 	u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
165     } else {
166       for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
167 	u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
168     }
169 
170 #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
171     if (u4 >= 0x00fffe || (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
172       errno = EILSEQ;
173       ret_val = (size_t)-1;
174       break;
175     }
176 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
177     if ((u4 >= 0x00dc00 && u4 <= 0x00dfff) || u4 >= 0x00fffe) {
178       errno = EILSEQ;
179       ret_val = (size_t)-1;
180       break;
181     }
182 
183     if (u4 >= 0x00d800 && u4 <= 0x00dbff) {
184       if ((ibtail - ib) < ICV_FETCH_UCS_SIZE_TWO) {
185 	errno = EINVAL;
186 	ret_val = (size_t)-1;
187 	break;
188       }
189 
190       if (cd->_is_little_endian) {
191 	for (i = ICV_FETCH_UCS_SIZE_TWO - 1;
192 	     i >= ICV_FETCH_UCS_SIZE;
193 	     i--)
194 	  u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
195       } else {
196 	for (i = ICV_FETCH_UCS_SIZE;
197 	     i < ICV_FETCH_UCS_SIZE_TWO;
198 	     i++)
199 	  u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
200       }
201 
202       if (u4_2 < 0x00dc00 || u4_2 > 0x00dfff) {
203 	errno = EILSEQ;
204 	ret_val = (size_t)-1;
205 	break;
206       }
207 
208       u4 = ((((u4 - 0x00d800) * 0x400) +
209 	     (u4_2 - 0x00dc00)) & 0x0fffff) + 0x010000;
210     }
211 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
212     if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x10ffff ||
213 	(u4 >= 0x00d800 && u4 <= 0x00dfff)) {
214       errno = EILSEQ;
215       ret_val = (size_t)-1;
216       break;
217     }
218 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
219     if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x7fffffff ||
220 	(u4 >= 0x00d800 && u4 <= 0x00dfff)) {
221       errno = EILSEQ;
222       ret_val = (size_t)-1;
223       break;
224     }
225 #else
226 #error	"Fatal: one of the UCS macros need to be defined."
227 #endif
228 
229     /*
230      * Once we reach here, the "u4" contains a valid character
231      * and thus we don't do any other error checking in
232      * the below.
233      */
234 
235     unihan = ucs_to_unihan (u4);
236     if(unihan.byte.byte1 == '\0' && unihan.byte.byte2 == '\0' && unihan.byte.byte3 == '\0')
237     {
238 	*ob++ = unihan.byte.byte4;
239 	ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
240 	continue;
241     }
242     if (cd->_need_byte_swap){
243       *ob++ = (uchar_t) unihan.byte.byte4;
244       *ob++ = (uchar_t) unihan.byte.byte3;
245     } else {
246       *ob++ = (uchar_t) unihan.byte.byte3;
247       *ob++ = (uchar_t) unihan.byte.byte4;
248     }
249 
250     ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
251   }
252 
253 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
254  need_more_input_err:
255 #endif
256   *inbuf = (char *)ib;
257   *inbufleft = ibtail - ib;
258   *outbuf = (char *)ob;
259   *outbufleft = obtail - ob;
260 
261   return(ret_val);
262 }
263 
264 static hcode_type
ucs_to_unihan(uint_t ucs_char)265 ucs_to_unihan (uint_t ucs_char)
266 {
267   hcode_type unihan_char;
268   hcode_type utf8_char;
269   unihan_char.code = 0x00;
270 
271   if (ucs_char <= 0x7f) {
272     utf8_char.code = ucs_char;
273 
274   } else if (ucs_char <= 0x7ff) {
275     utf8_char.byte.byte3 = (uchar_t)(0xc0 | ((ucs_char & 0x07c0) >> 6));
276     utf8_char.byte.byte4 = (uchar_t)(0x80 |  (ucs_char & 0x003f));
277 
278   } else if (ucs_char <= 0x00ffff) {
279     utf8_char.byte.byte2 = (uchar_t)(0xe0 | ((ucs_char & 0x0f000) >> 12));
280     utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x00fc0) >> 6));
281     utf8_char.byte.byte4 = (uchar_t)(0x80 |  (ucs_char & 0x0003f));
282   } else if (ucs_char <= 0x1fffff) {
283     utf8_char.byte.byte1 = (uchar_t)(0xf0 | ((ucs_char & 0x01c0000) >> 18));
284     utf8_char.byte.byte2 = (uchar_t)(0x80 | ((ucs_char & 0x003f000) >> 12));
285     utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x0000fc0) >> 6));
286     utf8_char.byte.byte4 = (uchar_t)(0x80 |  (ucs_char & 0x000003f));
287   } else
288     utf8_char.code = 0x00;
289 
290   unihan_char = _utf8_to_unified_hangul (utf8_char);
291   return unihan_char;
292 }
293