1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1998-1999, 2001 by Sun Microsystems, Inc.
23 * All rights reserved.
24 *
25 * Following is how we process BOM and subsequent bytes in this program:
26 * - UCS-2BE, UTF-16BE, UCS-4BE, UTF-32BE, UCS-2LE, UTF-16LE, UCS-4LE, and
27 * UTF-32LE don't care about BOM. From the beginning, they are properly
 *   serialized without the BOM character.
29 * - In other encodings, UCS-2, UCS-4, UTF-16, and UTF-32, the initial byte
30 * ordering is of the current processor's byte ordering. During the first
 *   iconv() call, if BOM appears as the first character of the entire
32 * iconv input stream, the byte order will be changed accordingly.
 *   We will use the '_bom_written' data field of the conversion descriptor
 *   to save this particular information, in other words, whether we have
 *   already examined the first character of the stream for the BOM.
36 */
37
38
39 #include <stdlib.h>
40 #include <errno.h>
41 #include <sys/types.h>
42 #include <sys/isa_defs.h>
43 #include "ucs_to_unihan.h"
44 #include "common_def.h"
45 #include "common_han.h"
46
/*
 * Conversion descriptor state. One instance is allocated (zero-filled) by
 * _icv_open(), passed to every _icv_iconv() call, and freed by _icv_close().
 */
typedef struct {
	/* Set to MAGIC_NUMBER by _icv_open(); not read elsewhere in this file. */
	int _magic;
	/*
	 * When true, _icv_iconv() emits the two output bytes in swapped
	 * order (byte4 then byte3). _icv_open() always sets it to false.
	 */
	boolean _need_byte_swap;
	/*
	 * True once the start of the input stream has been examined for a
	 * BOM, or immediately for the explicit-endian (xxBE/xxLE) variants.
	 */
	boolean _bom_written;
	/* True when input code units are read least-significant byte first. */
	boolean _is_little_endian;

} _icv_state_t;
54
/* Converts one UCS code point; defined later in this file. */
static hcode_type ucs_to_unihan (uint_t ucs_char);
/*
 * Defined outside this file. ucs_to_unihan() passes it a UTF-8 sequence
 * packed into an hcode_type and returns its result unchanged.
 */
extern hcode_type _utf8_to_unified_hangul (hcode_type);
57
58 void *
_icv_open()59 _icv_open()
60 {
61 _icv_state_t *cd = (_icv_state_t *)calloc(1, sizeof(_icv_state_t));
62
63 if (cd == (_icv_state_t *)NULL) {
64 errno = ENOMEM;
65 return((void *)-1);
66 }
67
68 cd->_magic = MAGIC_NUMBER;
69
70 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
71 defined(UTF_32BE)
72 cd->_is_little_endian = false;
73 cd->_bom_written = true;
74 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
75 defined(UTF_32LE)
76 cd->_is_little_endian = true;
77 cd->_bom_written = true;
78 #elif defined(__IS_LITTLE_ENDIAN)
79 cd->_is_little_endian = true;
80 #endif
81
82 cd->_need_byte_swap = false;
83
84 return((void *)cd);
85 }
86
87
88 void
_icv_close(_icv_state_t * cd)89 _icv_close(_icv_state_t *cd)
90 {
91 if (! cd)
92 errno = EBADF;
93 else
94 free((void *)cd);
95 }
96
97
98 size_t
_icv_iconv(_icv_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)99 _icv_iconv(_icv_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
100 size_t *outbufleft)
101 {
102 size_t ret_val = 0;
103 uchar_t *ib;
104 uchar_t *ob;
105 uchar_t *ibtail;
106 uchar_t *obtail;
107 uint_t u4;
108 uint_t u4_2;
109 register int i;
110
111 hcode_type unihan;
112 unihan.code = 0x00;
113
114 if (! cd) {
115 errno = EBADF;
116 return((size_t)-1);
117 }
118
119 if (!inbuf || !(*inbuf))
120 return((size_t)0);
121
122 ib = (uchar_t *)*inbuf;
123 ob = (uchar_t *)*outbuf;
124 ibtail = ib + *inbufleft;
125 obtail = ob + *outbufleft;
126
127 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
128 if (! cd->_bom_written) {
129 if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
130 errno = EINVAL;
131 ret_val = (size_t)-1;
132 goto need_more_input_err;
133 }
134
135 for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
136 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
137
138 /* Big endian, Little endian, or, not specified?? */
139 if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
140 ib += ICV_FETCH_UCS_SIZE;
141 cd->_is_little_endian = false;
142 } else if (u4 == ICV_BOM_IN__IS_LITTLE_ENDIAN) {
143 ib += ICV_FETCH_UCS_SIZE;
144 cd->_is_little_endian = true;
145 }
146 }
147 /*
148 * Once BOM checking is done, regardless of whether we had the BOM or
149 * not, we treat the BOM sequence as a ZWNBSP character from now on.
150 */
151 cd->_bom_written = true;
152 #endif
153
154 while (ib < ibtail) {
155 if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
156 errno = EINVAL;
157 ret_val = (size_t)-1;
158 break;
159 }
160
161 u4 = u4_2 = 0;
162 if (cd->_is_little_endian) {
163 for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
164 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
165 } else {
166 for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
167 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
168 }
169
170 #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
171 if (u4 >= 0x00fffe || (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
172 errno = EILSEQ;
173 ret_val = (size_t)-1;
174 break;
175 }
176 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
177 if ((u4 >= 0x00dc00 && u4 <= 0x00dfff) || u4 >= 0x00fffe) {
178 errno = EILSEQ;
179 ret_val = (size_t)-1;
180 break;
181 }
182
183 if (u4 >= 0x00d800 && u4 <= 0x00dbff) {
184 if ((ibtail - ib) < ICV_FETCH_UCS_SIZE_TWO) {
185 errno = EINVAL;
186 ret_val = (size_t)-1;
187 break;
188 }
189
190 if (cd->_is_little_endian) {
191 for (i = ICV_FETCH_UCS_SIZE_TWO - 1;
192 i >= ICV_FETCH_UCS_SIZE;
193 i--)
194 u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
195 } else {
196 for (i = ICV_FETCH_UCS_SIZE;
197 i < ICV_FETCH_UCS_SIZE_TWO;
198 i++)
199 u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
200 }
201
202 if (u4_2 < 0x00dc00 || u4_2 > 0x00dfff) {
203 errno = EILSEQ;
204 ret_val = (size_t)-1;
205 break;
206 }
207
208 u4 = ((((u4 - 0x00d800) * 0x400) +
209 (u4_2 - 0x00dc00)) & 0x0fffff) + 0x010000;
210 }
211 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
212 if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x10ffff ||
213 (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
214 errno = EILSEQ;
215 ret_val = (size_t)-1;
216 break;
217 }
218 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
219 if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x7fffffff ||
220 (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
221 errno = EILSEQ;
222 ret_val = (size_t)-1;
223 break;
224 }
225 #else
226 #error "Fatal: one of the UCS macros need to be defined."
227 #endif
228
229 /*
230 * Once we reach here, the "u4" contains a valid character
231 * and thus we don't do any other error checking in
232 * the below.
233 */
234
235 unihan = ucs_to_unihan (u4);
236 if(unihan.byte.byte1 == '\0' && unihan.byte.byte2 == '\0' && unihan.byte.byte3 == '\0')
237 {
238 *ob++ = unihan.byte.byte4;
239 ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
240 continue;
241 }
242 if (cd->_need_byte_swap){
243 *ob++ = (uchar_t) unihan.byte.byte4;
244 *ob++ = (uchar_t) unihan.byte.byte3;
245 } else {
246 *ob++ = (uchar_t) unihan.byte.byte3;
247 *ob++ = (uchar_t) unihan.byte.byte4;
248 }
249
250 ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
251 }
252
253 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
254 need_more_input_err:
255 #endif
256 *inbuf = (char *)ib;
257 *inbufleft = ibtail - ib;
258 *outbuf = (char *)ob;
259 *outbufleft = obtail - ob;
260
261 return(ret_val);
262 }
263
264 static hcode_type
ucs_to_unihan(uint_t ucs_char)265 ucs_to_unihan (uint_t ucs_char)
266 {
267 hcode_type unihan_char;
268 hcode_type utf8_char;
269 unihan_char.code = 0x00;
270
271 if (ucs_char <= 0x7f) {
272 utf8_char.code = ucs_char;
273
274 } else if (ucs_char <= 0x7ff) {
275 utf8_char.byte.byte3 = (uchar_t)(0xc0 | ((ucs_char & 0x07c0) >> 6));
276 utf8_char.byte.byte4 = (uchar_t)(0x80 | (ucs_char & 0x003f));
277
278 } else if (ucs_char <= 0x00ffff) {
279 utf8_char.byte.byte2 = (uchar_t)(0xe0 | ((ucs_char & 0x0f000) >> 12));
280 utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x00fc0) >> 6));
281 utf8_char.byte.byte4 = (uchar_t)(0x80 | (ucs_char & 0x0003f));
282 } else if (ucs_char <= 0x1fffff) {
283 utf8_char.byte.byte1 = (uchar_t)(0xf0 | ((ucs_char & 0x01c0000) >> 18));
284 utf8_char.byte.byte2 = (uchar_t)(0x80 | ((ucs_char & 0x003f000) >> 12));
285 utf8_char.byte.byte3 = (uchar_t)(0x80 | ((ucs_char & 0x0000fc0) >> 6));
286 utf8_char.byte.byte4 = (uchar_t)(0x80 | (ucs_char & 0x000003f));
287 } else
288 utf8_char.code = 0x00;
289
290 unihan_char = _utf8_to_unified_hangul (utf8_char);
291 return unihan_char;
292 }
293