1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Following is how we process BOM and subsequent bytes in this program:
 * - UCS-2BE, UTF-16BE, UCS-4BE, UTF-32BE, UCS-2LE, UTF-16LE, UCS-4LE, and
 *   UTF-32LE don't care about BOM. From the beginning, they are properly
 *   serialized without the BOM character; any BOM is treated as ZWNBSP.
 * - In other encodings, UCS-2, UCS-4, UTF-16, and UTF-32, the initial byte
 *   ordering is the current processor's byte ordering. During the first
 *   iconv() call, if a BOM appears as the first character of the entire
 *   iconv input stream, the byte order will be changed accordingly.
 *   We use the 'bom_written' data field of the conversion descriptor to
 *   save this particular information, in other words, whether we have
 *   already examined the first character of the stream for a BOM.
 */
37
38
39 #include <stdlib.h>
40 #include <errno.h>
41 #include <sys/types.h>
42 #include <sys/isa_defs.h>
43 #include "ucs_to_utf8.h"
44
45
46 void *
_icv_open()47 _icv_open()
48 {
49 ucs_state_t *cd = (ucs_state_t *)calloc(1, sizeof(ucs_state_t));
50
51 if (cd == (ucs_state_t *)NULL) {
52 errno = ENOMEM;
53 return((void *)-1);
54 }
55
56 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
57 defined(UTF_32BE)
58 cd->little_endian = false;
59 cd->bom_written = true;
60 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
61 defined(UTF_32LE)
62 cd->little_endian = true;
63 cd->bom_written = true;
64 #elif defined(_LITTLE_ENDIAN)
65 cd->little_endian = true;
66 #endif
67
68 return((void *)cd);
69 }
70
71
72 void
_icv_close(ucs_state_t * cd)73 _icv_close(ucs_state_t *cd)
74 {
75 if (! cd)
76 errno = EBADF;
77 else
78 free((void *)cd);
79 }
80
81
82 size_t
_icv_iconv(ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)83 _icv_iconv(ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
84 size_t *outbufleft)
85 {
86 size_t ret_val = 0;
87 uchar_t *ib;
88 uchar_t *ob;
89 uchar_t *ibtail;
90 uchar_t *obtail;
91 uint_t u4;
92 uint_t u4_2;
93 register int i;
94
95 if (! cd) {
96 errno = EBADF;
97 return((size_t)-1);
98 }
99
100 if (!inbuf || !(*inbuf)) {
101 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
102 cd->bom_written = false;
103 #endif
104 return((size_t)0);
105 }
106
107 ib = (uchar_t *)*inbuf;
108 ob = (uchar_t *)*outbuf;
109 ibtail = ib + *inbufleft;
110 obtail = ob + *outbufleft;
111
112 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
113 if (! cd->bom_written) {
114 if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
115 errno = EINVAL;
116 ret_val = (size_t)-1;
117 goto need_more_input_err;
118 }
119
120 for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
121 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
122
123 /* Big endian, Little endian, or, not specified?? */
124 if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
125 ib += ICV_FETCH_UCS_SIZE;
126 cd->little_endian = false;
127 } else if (u4 == ICV_BOM_IN_LITTLE_ENDIAN) {
128 ib += ICV_FETCH_UCS_SIZE;
129 cd->little_endian = true;
130 }
131 }
132 /*
133 * Once BOM checking is done, regardless of whether we had the BOM or
134 * not, we treat the BOM sequence as a ZWNBSP character from now on.
135 */
136 cd->bom_written = true;
137 #endif
138
139 while (ib < ibtail) {
140 if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
141 errno = EINVAL;
142 ret_val = (size_t)-1;
143 break;
144 }
145
146 u4 = u4_2 = 0;
147 if (cd->little_endian) {
148 for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
149 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
150 } else {
151 for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
152 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
153 }
154
155 #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
156 if (u4 >= 0x00fffe || (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
157 errno = EILSEQ;
158 ret_val = (size_t)-1;
159 break;
160 }
161 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
162 if ((u4 >= 0x00dc00 && u4 <= 0x00dfff) || u4 >= 0x00fffe) {
163 errno = EILSEQ;
164 ret_val = (size_t)-1;
165 break;
166 }
167
168 if (u4 >= 0x00d800 && u4 <= 0x00dbff) {
169 if ((ibtail - ib) < ICV_FETCH_UCS_SIZE_TWO) {
170 errno = EINVAL;
171 ret_val = (size_t)-1;
172 break;
173 }
174
175 if (cd->little_endian) {
176 for (i = ICV_FETCH_UCS_SIZE_TWO - 1;
177 i >= ICV_FETCH_UCS_SIZE;
178 i--)
179 u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
180 } else {
181 for (i = ICV_FETCH_UCS_SIZE;
182 i < ICV_FETCH_UCS_SIZE_TWO;
183 i++)
184 u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
185 }
186
187 if (u4_2 < 0x00dc00 || u4_2 > 0x00dfff) {
188 errno = EILSEQ;
189 ret_val = (size_t)-1;
190 break;
191 }
192
193 u4 = ((((u4 - 0x00d800) * 0x400) +
194 (u4_2 - 0x00dc00)) & 0x0fffff) + 0x010000;
195 }
196 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
197 if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x10ffff ||
198 (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
199 errno = EILSEQ;
200 ret_val = (size_t)-1;
201 break;
202 }
203 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
204 if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x7fffffff ||
205 (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
206 errno = EILSEQ;
207 ret_val = (size_t)-1;
208 break;
209 }
210 #else
211 #error "Fatal: one of the UCS macros need to be defined."
212 #endif
213
214 /*
215 * Once we reach here, the "u4" contains a valid character
216 * and thus we don't do any other error checking in
217 * the below.
218 */
219 if (u4 <= 0x7f) {
220 OUTBUF_SIZE_CHECK(1);
221 *ob++ = (uchar_t)u4;
222 } else if (u4 <= 0x7ff) {
223 OUTBUF_SIZE_CHECK(2);
224 *ob++ = (uchar_t)(0xc0 | ((u4 & 0x07c0) >> 6));
225 *ob++ = (uchar_t)(0x80 | (u4 & 0x003f));
226 } else if (u4 <= 0x00ffff) {
227 OUTBUF_SIZE_CHECK(3);
228 *ob++ = (uchar_t)(0xe0 | ((u4 & 0x0f000) >> 12));
229 *ob++ = (uchar_t)(0x80 | ((u4 & 0x00fc0) >> 6));
230 *ob++ = (uchar_t)(0x80 | (u4 & 0x0003f));
231 } else if (u4 <= 0x1fffff) {
232 OUTBUF_SIZE_CHECK(4);
233 *ob++ = (uchar_t)(0xf0 | ((u4 & 0x01c0000) >> 18));
234 *ob++ = (uchar_t)(0x80 | ((u4 & 0x003f000) >> 12));
235 *ob++ = (uchar_t)(0x80 | ((u4 & 0x0000fc0) >> 6));
236 *ob++ = (uchar_t)(0x80 | (u4 & 0x000003f));
237 } else if (u4 <= 0x3ffffff) {
238 OUTBUF_SIZE_CHECK(5);
239 *ob++ = (uchar_t)(0xf8 | ((u4 & 0x03000000) >> 24));
240 *ob++ = (uchar_t)(0x80 | ((u4 & 0x00fc0000) >> 18));
241 *ob++ = (uchar_t)(0x80 | ((u4 & 0x0003f000) >> 12));
242 *ob++ = (uchar_t)(0x80 | ((u4 & 0x00000fc0) >> 6));
243 *ob++ = (uchar_t)(0x80 | (u4 & 0x0000003f));
244 } else {
245 OUTBUF_SIZE_CHECK(6);
246 *ob++ = (uchar_t)(0xfc | ((u4 & 0x40000000) >> 30));
247 *ob++ = (uchar_t)(0x80 | ((u4 & 0x3f000000) >> 24));
248 *ob++ = (uchar_t)(0x80 | ((u4 & 0x00fc0000) >> 18));
249 *ob++ = (uchar_t)(0x80 | ((u4 & 0x0003f000) >> 12));
250 *ob++ = (uchar_t)(0x80 | ((u4 & 0x00000fc0) >> 6));
251 *ob++ = (uchar_t)(0x80 | (u4 & 0x0000003f));
252 }
253 ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
254 }
255
256 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
257 need_more_input_err:
258 #endif
259 *inbuf = (char *)ib;
260 *inbufleft = ibtail - ib;
261 *outbuf = (char *)ob;
262 *outbufleft = obtail - ob;
263
264 return(ret_val);
265 }
266