1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 *
25 * This program covers conversion from UTF-8, UCS-2, and, UCS-4 to UTF-7.
26 * UTF-7 is described in RFC 2152.
27 * We only support conversions between UCS-2/UCS-4/UTF-8 and UTF-7. No
28 * other UCS formats are going to be supported unless there is a significant
29 * reason.
30 */
31
32
33 #include <stdlib.h>
34 #include <errno.h>
35 #include <sys/types.h>
36 #include <sys/isa_defs.h>
37 #include "ucs_to_utf7.h"
38
39
40 void *
_icv_open()41 _icv_open()
42 {
43 utf7_state_t *cd = (utf7_state_t *)calloc(1, sizeof(utf7_state_t));
44
45 if (cd == (utf7_state_t *)NULL) {
46 errno = ENOMEM;
47 return((void *)-1);
48 }
49 #if defined(_LITTLE_ENDIAN)
50 cd->little_endian = true;
51 #endif
52
53 return((void *)cd);
54 }
55
56
57 void
_icv_close(utf7_state_t * cd)58 _icv_close(utf7_state_t *cd)
59 {
60 if (! cd)
61 errno = EBADF;
62 else
63 free((void *)cd);
64 }
65
66
67 size_t
_icv_iconv(utf7_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)68 _icv_iconv(utf7_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
69 size_t *outbufleft)
70 {
71 size_t ret_val = 0;
72 uchar_t *ib;
73 uchar_t *ob;
74 uchar_t *ibtail;
75 uchar_t *obtail;
76 uchar_t *ib_org;
77 uint_t u4;
78 uint_t u7;
79 signed char sz;
80 signed char new_bits_count;
81 signed char new_remnant_count;
82 #if defined(UCS_2) || defined(UCS_4)
83 register int i;
84 #endif
85
86 if (! cd) {
87 errno = EBADF;
88 return((size_t)-1);
89 }
90
91 if (!inbuf || !(*inbuf)) {
92 if (cd->in_the_middle_of_utf7_sequence) {
93 sz = (cd->remnant_count > 0) ? 2 : 1;
94
95 if ((! outbufleft) || *outbufleft < sz) {
96 errno = E2BIG;
97 return((size_t)-1);
98 }
99
100 if (cd->remnant_count > 0) {
101 /* Masking is needed. */
102 **outbuf = mb64[((cd->remnant <<
103 (6 - cd->remnant_count)) & 0x003f)];
104 (*outbuf)++;
105 }
106
107 **outbuf = '-';
108 (*outbuf)++;
109 *outbufleft -= sz;
110 }
111
112 cd->remnant = 0;
113 cd->remnant_count = 0;
114 cd->in_the_middle_of_utf7_sequence = false;
115 #if defined(UCS_2) || defined(UCS_4)
116 cd->bom_written = false;
117 #endif
118
119 return((size_t)0);
120 }
121
122 ib = (uchar_t *)*inbuf;
123 ob = (uchar_t *)*outbuf;
124 ibtail = ib + *inbufleft;
125 obtail = ob + *outbufleft;
126
127 #if defined(UCS_2) || defined(UCS_4)
128 if (! cd->bom_written) {
129 if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
130 errno = EINVAL;
131 return((size_t)-1);
132 }
133
134 for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
135 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
136
137 if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
138 ib += ICV_FETCH_UCS_SIZE;
139 cd->little_endian = false;
140 } else if (u4 == ICV_BOM_IN_LITTLE_ENDIAN) {
141 ib += ICV_FETCH_UCS_SIZE;
142 cd->little_endian = true;
143 }
144 }
145 cd->bom_written = true;
146 #endif
147
148 while (ib < ibtail) {
149 #if defined(UTF_8)
150 sz = number_of_bytes_in_utf8_char[*ib];
151 if (sz == ICV_TYPE_ILLEGAL_CHAR) {
152 errno = EILSEQ;
153 ret_val = (size_t)-1;
154 break;
155 }
156 #elif defined(UCS_2) || defined(UCS_4)
157 sz = ICV_FETCH_UCS_SIZE;
158 #else
159 #error "Fatal: One of UTF_8, UCS_2, or, UCS_4 is needed."
160 #endif
161
162 if ((ibtail - ib) < sz) {
163 errno = EINVAL;
164 ret_val = (size_t)-1;
165 break;
166 }
167
168 ib_org = ib;
169 #if defined(UTF_8)
170 u4 = *ib++ & masks_tbl[sz];
171 for (; sz > 1; sz--) {
172 if (((uint_t)*ib) < 0x80) {
173 ib = ib_org;
174 errno = EILSEQ;
175 ret_val = (size_t)-1;
176 goto illegal_char_err;
177 }
178 u4 = (u4 << ICV_UTF8_BIT_SHIFT) |
179 (((uint_t)*ib) & ICV_UTF8_BIT_MASK);
180 ib++;
181 }
182 #elif defined(UCS_2) || defined(UCS_4)
183 u4 = 0;
184 if (cd->little_endian) {
185 for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
186 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
187 } else {
188 for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
189 u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
190 }
191 ib += ICV_FETCH_UCS_SIZE;
192 #endif
193
194 /* Check against known non-characters. */
195 #if defined(UTF_8)
196 if ((u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_fffe ||
197 (u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_ffff ||
198 u4 > ICV_UTF32_LAST_VALID_CHAR ||
199 (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
200 u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
201 (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
202 u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
203 #elif defined(UCS_2)
204 if (u4 >= ICV_UTF32_NONCHAR_fffe ||
205 (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
206 u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
207 (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
208 u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
209 #else
210 if ((u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_fffe ||
211 (u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_ffff ||
212 u4 > ICV_UCS4_LAST_VALID_CHAR ||
213 (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
214 u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
215 (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
216 u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
217 #endif
218 ib = ib_org;
219 errno = EILSEQ;
220 ret_val = (size_t)-1;
221 goto illegal_char_err;
222 }
223
224 #if defined(UCS_4) || defined(UTF_8)
225 if (u4 > 0x00ffff) {
226 u4 = ICV_CHAR_UCS2_REPLACEMENT;
227 ret_val++;
228 }
229 #endif
230
231 /* Set D or Rule 3? */
232 if ((u4 >= (uint_t)'A' && u4 <= (uint_t)'Z') ||
233 (u4 >= (uint_t)'a' && u4 <= (uint_t)'z') ||
234 (u4 >= (uint_t)'0' && u4 <= (uint_t)'9') ||
235 u4 == (uint_t)'\'' || u4 == (uint_t)'(' ||
236 u4 == (uint_t)')' ||
237 (u4 >= (uint_t)',' && u4 <= (uint_t)'/') || /* , - . / */
238 u4 == (uint_t)':' || u4 == (uint_t)'?' ||
239 u4 == (uint_t)' ' || u4 == (uint_t)'\t' ||
240 u4 == (uint_t)'\r' || u4 == (uint_t)'\n') {
241
242 u7 = 0;
243 sz = 1;
244 if (cd->in_the_middle_of_utf7_sequence) {
245 if (cd->remnant_count > 0) {
246 sz++;
247 u7 = cd->remnant <<
248 (6 - cd->remnant_count);
249 }
250 if (u4 == (uint_t)'-' ||
251 ICV_INRANGE_OF_MBASE64_ALPHABET(u4))
252 sz++;
253 }
254
255 if ((obtail - ob) < sz) {
256 ib = ib_org;
257 errno = E2BIG;
258 ret_val = (size_t)-1;
259 break;
260 }
261
262 if (cd->in_the_middle_of_utf7_sequence) {
263 /* Masking is needed. */
264 if (cd->remnant_count > 0)
265 *ob++ = mb64[u7 & 0x003f];
266 if (u4 == (uint_t)'-' ||
267 ICV_INRANGE_OF_MBASE64_ALPHABET(u4))
268 *ob++ = '-';
269
270 cd->in_the_middle_of_utf7_sequence = false;
271 cd->remnant_count = 0;
272 }
273
274 *ob++ = (uchar_t)(u4 & 0x007f);
275
276 } else {
277 /*
278 * Any UCS-2 character sequences will yield:
279 *
280 * +-16 bits (UCS-2)-+ +-16 bits (UCS-2)-+ +-16 bits (UCS-2)-+
281 * | | | | | |
282 * xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
283 * | || | | || | | || | | || |
284 * +-----++-----+ +------++-----+ +-----++------+ +-----++-----+ MBase64 chars
285 * ^ ^
286 * initially, | |
287 * four remnant bits, |
288 * two remnant bits,
289 *
290 * and, then no remnant bit for three sequential UCS-2 characters,
291 * respectively, and repeat these three UCS-2 character sequences. For the
292 * first UCS-2 character in this sequence, there will be two MBase64
293 * characters, and for the second and the third UCS-2 characters, there will be
294 * three MBase64 characters.
295 */
296 sz = (cd->remnant_count) ? 3 : 2;
297 if (! cd->in_the_middle_of_utf7_sequence)
298 sz++;
299
300 if ((obtail - ob) < sz) {
301 ib = ib_org;
302 errno = E2BIG;
303 ret_val = (size_t)-1;
304 break;
305 }
306
307 if (! cd->in_the_middle_of_utf7_sequence) {
308 *ob++ = '+';
309 cd->in_the_middle_of_utf7_sequence = true;
310 }
311
312 if (cd->remnant_count) {
313 new_bits_count = 18 - cd->remnant_count;
314 new_remnant_count = 16 - new_bits_count;
315 u7 = (cd->remnant << new_bits_count) |
316 (u4 >> new_remnant_count);
317 cd->remnant = u4 & 0x0003;
318 cd->remnant_count = new_remnant_count;
319
320 /* Masking is needed. */
321 *ob++ = mb64[(u7 >> 12) & 0x003f];
322 *ob++ = mb64[(u7 >> 6) & 0x003f];
323 *ob++ = mb64[u7 & 0x003f];
324 } else {
325 cd->remnant = u4 & 0x000f;
326 cd->remnant_count = 4;
327
328 /* Masking is needed. */
329 *ob++ = mb64[(u4 >> 10) & 0x003f];
330 *ob++ = mb64[(u4 >> 4) & 0x003f];
331 }
332 }
333 }
334
335 illegal_char_err:
336 *inbuf = (char *)ib;
337 *inbufleft = ibtail - ib;
338 *outbuf = (char *)ob;
339 *outbufleft = obtail - ob;
340
341 return(ret_val);
342 }
343