xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/ucs_to_utf7.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * This program covers conversion from UTF-8, UCS-2, and UCS-4 to UTF-7.
26  * UTF-7 is described in RFC 2152.
27  * We only support conversions between UCS-2/UCS-4/UTF-8 and UTF-7. No
28  * other UCS formats are going to be supported unless there is a significant
29  * reason.
30  */
31 
32 
33 #include <stdlib.h>
34 #include <errno.h>
35 #include <sys/types.h>
36 #include <sys/isa_defs.h>
37 #include "ucs_to_utf7.h"
38 
39 
40 void *
_icv_open()41 _icv_open()
42 {
43 	utf7_state_t *cd = (utf7_state_t *)calloc(1, sizeof(utf7_state_t));
44 
45 	if (cd == (utf7_state_t *)NULL) {
46 		errno = ENOMEM;
47 		return((void *)-1);
48 	}
49 #if defined(_LITTLE_ENDIAN)
50 	cd->little_endian = true;
51 #endif
52 
53 	return((void *)cd);
54 }
55 
56 
57 void
_icv_close(utf7_state_t * cd)58 _icv_close(utf7_state_t *cd)
59 {
60 	if (! cd)
61 		errno = EBADF;
62 	else
63 		free((void *)cd);
64 }
65 
66 
67 size_t
_icv_iconv(utf7_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)68 _icv_iconv(utf7_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
69                 size_t *outbufleft)
70 {
71 	size_t ret_val = 0;
72 	uchar_t *ib;
73 	uchar_t *ob;
74 	uchar_t *ibtail;
75 	uchar_t *obtail;
76 	uchar_t *ib_org;
77 	uint_t u4;
78 	uint_t u7;
79 	signed char sz;
80 	signed char new_bits_count;
81 	signed char new_remnant_count;
82 #if defined(UCS_2) || defined(UCS_4)
83 	register int i;
84 #endif
85 
86 	if (! cd) {
87 		errno = EBADF;
88 		return((size_t)-1);
89 	}
90 
91 	if (!inbuf || !(*inbuf)) {
92 		if (cd->in_the_middle_of_utf7_sequence) {
93 			sz = (cd->remnant_count > 0) ? 2 : 1;
94 
95 			if ((! outbufleft) || *outbufleft < sz) {
96 				errno = E2BIG;
97 				return((size_t)-1);
98 			}
99 
100 			if (cd->remnant_count > 0) {
101 				/* Masking is needed. */
102 				**outbuf = mb64[((cd->remnant <<
103 					(6 - cd->remnant_count)) & 0x003f)];
104 				(*outbuf)++;
105 			}
106 
107 			**outbuf = '-';
108 			(*outbuf)++;
109 			*outbufleft -= sz;
110 		}
111 
112 		cd->remnant = 0;
113 		cd->remnant_count = 0;
114 		cd->in_the_middle_of_utf7_sequence = false;
115 #if defined(UCS_2) || defined(UCS_4)
116 		cd->bom_written = false;
117 #endif
118 
119 		return((size_t)0);
120 	}
121 
122 	ib = (uchar_t *)*inbuf;
123 	ob = (uchar_t *)*outbuf;
124 	ibtail = ib + *inbufleft;
125 	obtail = ob + *outbufleft;
126 
127 #if defined(UCS_2) || defined(UCS_4)
128 	if (! cd->bom_written) {
129 		if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
130 			errno = EINVAL;
131 			return((size_t)-1);
132 		}
133 
134 		for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
135 			u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
136 
137 		if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
138 			ib += ICV_FETCH_UCS_SIZE;
139 			cd->little_endian = false;
140 		} else if (u4 == ICV_BOM_IN_LITTLE_ENDIAN) {
141 			ib += ICV_FETCH_UCS_SIZE;
142 			cd->little_endian = true;
143 		}
144 	}
145 	cd->bom_written = true;
146 #endif
147 
148 	while (ib < ibtail) {
149 #if defined(UTF_8)
150 		sz = number_of_bytes_in_utf8_char[*ib];
151 		if (sz == ICV_TYPE_ILLEGAL_CHAR) {
152 			errno = EILSEQ;
153 			ret_val = (size_t)-1;
154 			break;
155 		}
156 #elif defined(UCS_2) || defined(UCS_4)
157 		sz = ICV_FETCH_UCS_SIZE;
158 #else
159 #error	"Fatal: One of UTF_8, UCS_2, or, UCS_4 is needed."
160 #endif
161 
162 		if ((ibtail - ib) < sz) {
163 			errno = EINVAL;
164 			ret_val = (size_t)-1;
165 			break;
166 		}
167 
168 		ib_org = ib;
169 #if defined(UTF_8)
170 		u4 = *ib++ & masks_tbl[sz];
171 		for (; sz > 1; sz--) {
172 			if (((uint_t)*ib) < 0x80) {
173 				ib = ib_org;
174 				errno = EILSEQ;
175 				ret_val = (size_t)-1;
176 				goto illegal_char_err;
177 			}
178 			u4 = (u4 << ICV_UTF8_BIT_SHIFT) |
179 				(((uint_t)*ib) & ICV_UTF8_BIT_MASK);
180 			ib++;
181 		}
182 #elif defined(UCS_2) || defined(UCS_4)
183 		u4 = 0;
184 		if (cd->little_endian) {
185 			for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
186 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
187 		} else {
188 			for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
189 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
190 		}
191 		ib += ICV_FETCH_UCS_SIZE;
192 #endif
193 
194 		/* Check against known non-characters. */
195 #if defined(UTF_8)
196 		if ((u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_fffe ||
197 		    (u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_ffff ||
198 		    u4 > ICV_UTF32_LAST_VALID_CHAR ||
199 		    (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
200 		    u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
201 		    (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
202 		    u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
203 #elif defined(UCS_2)
204 		if (u4 >= ICV_UTF32_NONCHAR_fffe ||
205 		    (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
206 		    u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
207 		    (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
208 		    u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
209 #else
210 		if ((u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_fffe ||
211 		    (u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_ffff ||
212 		    u4 > ICV_UCS4_LAST_VALID_CHAR ||
213 		    (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
214 		    u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
215 		    (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
216 		    u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
217 #endif
218 			ib = ib_org;
219 			errno = EILSEQ;
220 			ret_val = (size_t)-1;
221 			goto illegal_char_err;
222 		}
223 
224 #if defined(UCS_4) || defined(UTF_8)
225 		if (u4 > 0x00ffff) {
226 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
227 			ret_val++;
228 		}
229 #endif
230 
231 		/* Set D or Rule 3? */
232 		if ((u4 >= (uint_t)'A' && u4 <= (uint_t)'Z') ||
233 		    (u4 >= (uint_t)'a' && u4 <= (uint_t)'z') ||
234 		    (u4 >= (uint_t)'0' && u4 <= (uint_t)'9') ||
235 		    u4 == (uint_t)'\'' || u4 == (uint_t)'(' ||
236 		    u4 == (uint_t)')' ||
237 		    (u4 >= (uint_t)',' && u4 <= (uint_t)'/') || /* , - . / */
238 		    u4 == (uint_t)':' || u4 == (uint_t)'?' ||
239 		    u4 == (uint_t)' ' || u4 == (uint_t)'\t' ||
240 		    u4 == (uint_t)'\r' || u4 == (uint_t)'\n') {
241 
242 			u7 = 0;
243 			sz = 1;
244 			if (cd->in_the_middle_of_utf7_sequence) {
245 				if (cd->remnant_count > 0) {
246 					sz++;
247 					u7 = cd->remnant <<
248 						(6 - cd->remnant_count);
249 				}
250 				if (u4 == (uint_t)'-' ||
251 				    ICV_INRANGE_OF_MBASE64_ALPHABET(u4))
252 					sz++;
253 			}
254 
255 			if ((obtail - ob) < sz) {
256 				ib = ib_org;
257 				errno = E2BIG;
258 				ret_val = (size_t)-1;
259 				break;
260 			}
261 
262 			if (cd->in_the_middle_of_utf7_sequence) {
263 				/* Masking is needed. */
264 				if (cd->remnant_count > 0)
265 					*ob++ = mb64[u7 & 0x003f];
266 				if (u4 == (uint_t)'-' ||
267 				    ICV_INRANGE_OF_MBASE64_ALPHABET(u4))
268 					*ob++ = '-';
269 
270 				cd->in_the_middle_of_utf7_sequence = false;
271 				cd->remnant_count = 0;
272 			}
273 
274 			*ob++ = (uchar_t)(u4 & 0x007f);
275 
276 		} else {
277 /*
278  * Any UCS-2 character sequences will yield:
279  *
280  * +-16 bits (UCS-2)-+  +-16 bits (UCS-2)-+  +-16 bits (UCS-2)-+
281  * |                 |  |                 |  |                 |
282  * xxxx xxxx xxxx xxxx  xxxx xxxx xxxx xxxx  xxxx xxxx xxxx xxxx
283  * |     ||     | |      ||     | |     ||      | |     ||     |
284  * +-----++-----+ +------++-----+ +-----++------+ +-----++-----+ MBase64 chars
285  *                ^                      ^
286  * initially,     |                      |
287  *                four remnant bits,     |
288  *                                       two remnant bits,
289  *
290  * and, then no remnant bit for three sequential UCS-2 characters,
291  * respectively, and repeat these three UCS-2 character sequences. For the
292  * first UCS-2 character in this sequence, there will be two MBase64
293  * characters, and for the second and the third UCS-2 characters, there will be
294  * three MBase64 characters.
295  */
296 			sz = (cd->remnant_count) ? 3 : 2;
297 			if (! cd->in_the_middle_of_utf7_sequence)
298 				sz++;
299 
300 			if ((obtail - ob) < sz) {
301 				ib = ib_org;
302 				errno = E2BIG;
303 				ret_val = (size_t)-1;
304 				break;
305 			}
306 
307 			if (! cd->in_the_middle_of_utf7_sequence) {
308 				*ob++ = '+';
309 				cd->in_the_middle_of_utf7_sequence = true;
310 			}
311 
312 			if (cd->remnant_count) {
313 				new_bits_count = 18 - cd->remnant_count;
314 				new_remnant_count = 16 - new_bits_count;
315 				u7 = (cd->remnant << new_bits_count) |
316 					(u4 >> new_remnant_count);
317 				cd->remnant = u4 & 0x0003;
318 				cd->remnant_count = new_remnant_count;
319 
320 				/* Masking is needed. */
321 				*ob++ = mb64[(u7 >> 12) & 0x003f];
322 				*ob++ = mb64[(u7 >> 6) & 0x003f];
323 				*ob++ = mb64[u7 & 0x003f];
324 			} else {
325 				cd->remnant = u4 & 0x000f;
326 				cd->remnant_count = 4;
327 
328 				/* Masking is needed. */
329 				*ob++ = mb64[(u4 >> 10) & 0x003f];
330 				*ob++ = mb64[(u4 >> 4) & 0x003f];
331 			}
332 		}
333 	}
334 
335 illegal_char_err:
336 	*inbuf = (char *)ib;
337 	*inbufleft = ibtail - ib;
338 	*outbuf = (char *)ob;
339 	*outbufleft = obtail - ob;
340 
341 	return(ret_val);
342 }
343