xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/ucs_to_utf8.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * Following is how we process BOM and subsequent bytes in this program:
26  * - UCS-2BE, UTF-16BE, UCS-4BE, UTF-32BE, UCS-2LE, UTF-16LE, UCS-4LE, and
27  *   UTF-32LE don't care about BOM. From the beginning, they are properly
28  *   serialized without the BOM character; any BOM is treated as ZWNBSP.
29  * - In other encodings, UCS-2, UCS-4, UTF-16, and UTF-32, the initial byte
30  *   ordering is of the current processor's byte ordering. During the first
31  *   iconv() call, if BOM appears as the first character of the entire
32  *   iconv input stream, the byte order will be changed accordingly.
33  *   We will use 'bom_written' data field of the conversion descriptor to
34  *   save this particular information, in other words, whether we have
35  *   already processed the first character as a possible BOM.
36  */
37 
38 
39 #include <stdlib.h>
40 #include <errno.h>
41 #include <sys/types.h>
42 #include <sys/isa_defs.h>
43 #include "ucs_to_utf8.h"
44 
45 
46 void *
_icv_open()47 _icv_open()
48 {
49 	ucs_state_t *cd = (ucs_state_t *)calloc(1, sizeof(ucs_state_t));
50 
51 	if (cd == (ucs_state_t *)NULL) {
52 		errno = ENOMEM;
53 		return((void *)-1);
54 	}
55 
56 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
57 	defined(UTF_32BE)
58 	cd->little_endian = false;
59 	cd->bom_written = true;
60 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
61 	defined(UTF_32LE)
62 	cd->little_endian = true;
63 	cd->bom_written = true;
64 #elif defined(_LITTLE_ENDIAN)
65 	cd->little_endian = true;
66 #endif
67 
68 	return((void *)cd);
69 }
70 
71 
72 void
_icv_close(ucs_state_t * cd)73 _icv_close(ucs_state_t *cd)
74 {
75 	if (! cd)
76 		errno = EBADF;
77 	else
78 		free((void *)cd);
79 }
80 
81 
82 size_t
_icv_iconv(ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)83 _icv_iconv(ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
84                 size_t *outbufleft)
85 {
86 	size_t ret_val = 0;
87 	uchar_t *ib;
88 	uchar_t *ob;
89 	uchar_t *ibtail;
90 	uchar_t *obtail;
91 	uint_t u4;
92 	uint_t u4_2;
93 	register int i;
94 
95 	if (! cd) {
96 		errno = EBADF;
97 		return((size_t)-1);
98 	}
99 
100 	if (!inbuf || !(*inbuf)) {
101 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
102 		cd->bom_written = false;
103 #endif
104 		return((size_t)0);
105 	}
106 
107 	ib = (uchar_t *)*inbuf;
108 	ob = (uchar_t *)*outbuf;
109 	ibtail = ib + *inbufleft;
110 	obtail = ob + *outbufleft;
111 
112 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
113 	if (! cd->bom_written) {
114 		if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
115 			errno = EINVAL;
116 			ret_val = (size_t)-1;
117 			goto need_more_input_err;
118 		}
119 
120 		for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
121 			u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
122 
123 		/* Big endian, Little endian, or, not specified?? */
124 		if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
125 			ib += ICV_FETCH_UCS_SIZE;
126 			cd->little_endian = false;
127 		} else if (u4 == ICV_BOM_IN_LITTLE_ENDIAN) {
128 			ib += ICV_FETCH_UCS_SIZE;
129 			cd->little_endian = true;
130 		}
131 	}
132 	/*
133 	 * Once BOM checking is done, regardless of whether we had the BOM or
134 	 * not, we treat the BOM sequence as a ZWNBSP character from now on.
135 	 */
136 	cd->bom_written = true;
137 #endif
138 
139 	while (ib < ibtail) {
140 		if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
141 			errno = EINVAL;
142 			ret_val = (size_t)-1;
143 			break;
144 		}
145 
146 		u4 = u4_2 = 0;
147 		if (cd->little_endian) {
148 			for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
149 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
150 		} else {
151 			for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
152 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
153 		}
154 
155 #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
156 		if (u4 >= 0x00fffe || (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
157 			errno = EILSEQ;
158 			ret_val = (size_t)-1;
159 			break;
160 		}
161 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
162 		if ((u4 >= 0x00dc00 && u4 <= 0x00dfff) || u4 >= 0x00fffe) {
163 			errno = EILSEQ;
164 			ret_val = (size_t)-1;
165 			break;
166 		}
167 
168 		if (u4 >= 0x00d800 && u4 <= 0x00dbff) {
169 			if ((ibtail - ib) < ICV_FETCH_UCS_SIZE_TWO) {
170 				errno = EINVAL;
171 				ret_val = (size_t)-1;
172 				break;
173 			}
174 
175 			if (cd->little_endian) {
176 				for (i = ICV_FETCH_UCS_SIZE_TWO - 1;
177 					i >= ICV_FETCH_UCS_SIZE;
178 						i--)
179 					u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
180 			} else {
181 				for (i = ICV_FETCH_UCS_SIZE;
182 					i < ICV_FETCH_UCS_SIZE_TWO;
183 						i++)
184 					u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
185 			}
186 
187 			if (u4_2 < 0x00dc00 || u4_2 > 0x00dfff) {
188 				errno = EILSEQ;
189 				ret_val = (size_t)-1;
190 				break;
191 			}
192 
193 			u4 = ((((u4 - 0x00d800) * 0x400) +
194 				(u4_2 - 0x00dc00)) & 0x0fffff) + 0x010000;
195 		}
196 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
197 		if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x10ffff ||
198 		    (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
199 			errno = EILSEQ;
200 			ret_val = (size_t)-1;
201 			break;
202 		}
203 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
204 		if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x7fffffff ||
205 		    (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
206 			errno = EILSEQ;
207 			ret_val = (size_t)-1;
208 			break;
209 		}
210 #else
211 #error	"Fatal: one of the UCS macros need to be defined."
212 #endif
213 
214 		/*
215 		 * Once we reach here, the "u4" contains a valid character
216 		 * and thus we don't do any other error checking in
217 		 * the below.
218 		 */
219 		if (u4 <= 0x7f) {
220 			OUTBUF_SIZE_CHECK(1);
221 			*ob++ = (uchar_t)u4;
222 		} else if (u4 <= 0x7ff) {
223 			OUTBUF_SIZE_CHECK(2);
224 			*ob++ = (uchar_t)(0xc0 | ((u4 & 0x07c0) >> 6));
225 			*ob++ = (uchar_t)(0x80 |  (u4 & 0x003f));
226 		} else if (u4 <= 0x00ffff) {
227 			OUTBUF_SIZE_CHECK(3);
228 			*ob++ = (uchar_t)(0xe0 | ((u4 & 0x0f000) >> 12));
229 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x00fc0) >> 6));
230 			*ob++ = (uchar_t)(0x80 |  (u4 & 0x0003f));
231 		} else if (u4 <= 0x1fffff) {
232 			OUTBUF_SIZE_CHECK(4);
233 			*ob++ = (uchar_t)(0xf0 | ((u4 & 0x01c0000) >> 18));
234 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x003f000) >> 12));
235 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x0000fc0) >> 6));
236 			*ob++ = (uchar_t)(0x80 |  (u4 & 0x000003f));
237 		} else if (u4 <= 0x3ffffff) {
238 			OUTBUF_SIZE_CHECK(5);
239 			*ob++ = (uchar_t)(0xf8 | ((u4 & 0x03000000) >> 24));
240 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x00fc0000) >> 18));
241 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x0003f000) >> 12));
242 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x00000fc0) >> 6));
243 			*ob++ = (uchar_t)(0x80 |  (u4 & 0x0000003f));
244 		} else {
245 			OUTBUF_SIZE_CHECK(6);
246 			*ob++ = (uchar_t)(0xfc | ((u4 & 0x40000000) >> 30));
247 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x3f000000) >> 24));
248 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x00fc0000) >> 18));
249 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x0003f000) >> 12));
250 			*ob++ = (uchar_t)(0x80 | ((u4 & 0x00000fc0) >> 6));
251 			*ob++ = (uchar_t)(0x80 |  (u4 & 0x0000003f));
252 		}
253 		ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
254 	}
255 
256 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
257 need_more_input_err:
258 #endif
259 	*inbuf = (char *)ib;
260 	*inbufleft = ibtail - ib;
261 	*outbuf = (char *)ob;
262 	*outbufleft = obtail - ob;
263 
264 	return(ret_val);
265 }
266