1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * This file implements conversions from UTF-8 to the various UCS forms:
 * UCS-2, UCS-2BE, UCS-2LE, UTF-16, UTF-16BE, UTF-16LE, UCS-4, UCS-4BE,
 * UCS-4LE, UTF-32, UTF-32BE, and UTF-32LE.
 */
29
30
31 #include <stdlib.h>
32 #include <errno.h>
33 #include <sys/types.h>
34 #include <sys/isa_defs.h>
35 #include "utf8_to_ucs.h"
36
37
38 void *
_icv_open()39 _icv_open()
40 {
41 ucs_state_t *cd = (ucs_state_t *)calloc(1, sizeof(ucs_state_t));
42
43 if (cd == (ucs_state_t *)NULL) {
44 errno = ENOMEM;
45 return((void *)-1);
46 }
47
48 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
49 defined(UTF_32BE)
50 cd->little_endian = false;
51 cd->bom_written = true;
52 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
53 defined(UTF_32LE)
54 cd->little_endian = true;
55 cd->bom_written = true;
56 #elif defined(_LITTLE_ENDIAN)
57 cd->little_endian = true;
58 #endif
59
60 return((void *)cd);
61 }
62
63
64 void
_icv_close(ucs_state_t * cd)65 _icv_close(ucs_state_t *cd)
66 {
67 if (! cd)
68 errno = EBADF;
69 else
70 free((void *)cd);
71 }
72
73
74 size_t
_icv_iconv(ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)75 _icv_iconv(ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
76 size_t *outbufleft)
77 {
78 size_t ret_val = 0;
79 uchar_t *ib;
80 uchar_t *ob;
81 uchar_t *ibtail;
82 uchar_t *obtail;
83
84 if (! cd) {
85 errno = EBADF;
86 return((size_t)-1);
87 }
88
89 if (!inbuf || !(*inbuf)) {
90 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
91 cd->bom_written = false;
92 #endif
93 return((size_t)0);
94 }
95
96 ib = (uchar_t *)*inbuf;
97 ob = (uchar_t *)*outbuf;
98 ibtail = ib + *inbufleft;
99 obtail = ob + *outbufleft;
100
101 while (ib < ibtail) {
102 uchar_t *ib_org;
103 uint_t u4;
104 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
105 uint_t u4_2;
106 #endif
107 uint_t first_byte;
108 signed char sz;
109 signed char obsz;
110
111 sz = number_of_bytes_in_utf8_char[*ib];
112 if (sz == ICV_TYPE_ILLEGAL_CHAR) {
113 errno = EILSEQ;
114 ret_val = (size_t)-1;
115 break;
116 }
117
118 if ((ibtail - ib) < sz) {
119 errno = EINVAL;
120 ret_val = (size_t)-1;
121 break;
122 }
123
124 ib_org = ib;
125 first_byte = *ib;
126 u4 = (uint_t)(*ib++ & masks_tbl[sz]);
127 for (; sz > 1; sz--) {
128 if (first_byte) {
129 if (((uchar_t)*ib) <
130 valid_min_2nd_byte[first_byte] ||
131 ((uchar_t)*ib) >
132 valid_max_2nd_byte[first_byte]) {
133 ib = ib_org;
134 errno = EILSEQ;
135 ret_val = (size_t)-1;
136 goto ILLEGAL_CHAR_ERR;
137 }
138 first_byte = 0;
139 } else if (((uint_t)*ib) < 0x80 ||
140 ((uint_t)*ib) > 0xbf) {
141 ib = ib_org;
142 errno = EILSEQ;
143 ret_val = (size_t)-1;
144 goto ILLEGAL_CHAR_ERR;
145 }
146 u4 = (u4 << ICV_UTF8_BIT_SHIFT) |
147 (((uint_t)*ib) & ICV_UTF8_BIT_MASK);
148 ib++;
149 }
150
151 /* Check against known non-characters. */
152 if ((u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_fffe ||
153 (u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_ffff ||
154 u4 > ICV_UTF32_LAST_VALID_CHAR ||
155 (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
156 u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
157 (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
158 u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
159 ib = ib_org;
160 errno = EILSEQ;
161 ret_val = (size_t)-1;
162 goto ILLEGAL_CHAR_ERR;
163 }
164
165 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
166 u4_2 = 0;
167 #endif
168
169 if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
170 cd->bom_written = true;
171 }
172
173 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
174 obsz = (cd->bom_written) ? 4 : 8;
175 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
176 obsz = (cd->bom_written) ? 4 : 8;
177 if (u4 > 0x10ffff) {
178 u4 = ICV_CHAR_UCS2_REPLACEMENT;
179 ret_val++;
180 }
181 #elif defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
182 obsz = (cd->bom_written) ? 2 : 4;
183 if (u4 > 0x00ffff) {
184 u4 = ICV_CHAR_UCS2_REPLACEMENT;
185 ret_val++;
186 }
187 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
188 obsz = (cd->bom_written) ? 2 : 4;
189 if (u4 > 0x10ffff) {
190 u4 = ICV_CHAR_UCS2_REPLACEMENT;
191 ret_val++;
192 } else if (u4 > 0x00ffff) {
193 u4_2 = ((u4 - 0x010000) % 0x400) + 0x00dc00;
194 u4 = ((u4 - 0x010000) / 0x400) + 0x00d800;
195 obsz += 2;
196 }
197 #else
198 #error "Fatal: one of the UCS macros need to be defined."
199 #endif
200 if ((obtail - ob) < obsz) {
201 ib = ib_org;
202 errno = E2BIG;
203 ret_val = (size_t)-1;
204 break;
205 }
206
207 if (cd->little_endian) {
208 if (! cd->bom_written) {
209 *ob++ = (uchar_t)0xff;
210 *ob++ = (uchar_t)0xfe;
211 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
212 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
213 *(ushort_t *)ob = (ushort_t)0;
214 ob += 2;
215 #endif
216 cd->bom_written = true;
217 }
218 *ob++ = (uchar_t)(u4 & 0xff);
219 *ob++ = (uchar_t)((u4 >> 8) & 0xff);
220 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
221 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
222 *ob++ = (uchar_t)((u4 >> 16) & 0xff);
223 *ob++ = (uchar_t)((u4 >> 24) & 0xff);
224 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
225 if (u4_2) {
226 *ob++ = (uchar_t)(u4_2 & 0xff);
227 *ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
228 }
229 #endif
230 } else {
231 if (! cd->bom_written) {
232 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
233 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
234 *(ushort_t *)ob = (ushort_t)0;
235 ob += 2;
236 #endif
237 *ob++ = (uchar_t)0xfe;
238 *ob++ = (uchar_t)0xff;
239 cd->bom_written = true;
240 }
241 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
242 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
243 *ob++ = (uchar_t)((u4 >> 24) & 0xff);
244 *ob++ = (uchar_t)((u4 >> 16) & 0xff);
245 #endif
246 *ob++ = (uchar_t)((u4 >> 8) & 0xff);
247 *ob++ = (uchar_t)(u4 & 0xff);
248 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
249 if (u4_2) {
250 *ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
251 *ob++ = (uchar_t)(u4_2 & 0xff);
252 }
253 #endif
254 }
255 }
256
257 ILLEGAL_CHAR_ERR:
258 *inbuf = (char *)ib;
259 *inbufleft = ibtail - ib;
260 *outbuf = (char *)ob;
261 *outbufleft = obtail - ob;
262
263 return(ret_val);
264 }
265