xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/ucs_to_ucs4.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * This file covers conversions from the UCS formats UCS-2, UCS-2BE,
26  * UCS-2LE, UTF-16, UTF-16BE, and UTF-16LE to the UCS formats UCS-4,
27  * UCS-4BE, UCS-4LE, UTF-32, UTF-32BE, and UTF-32LE. The concrete
28  * input and output formats are selected at compile time by macros.
29  */
30 
31 
32 #include <stdlib.h>
33 #include <errno.h>
34 #include <sys/types.h>
35 #include <sys/isa_defs.h>
36 #include "ucs_to_ucs4.h"
37 
38 
39 void *
_icv_open()40 _icv_open()
41 {
42 	ucs_ucs_state_t *cd;
43 
44 	cd = (ucs_ucs_state_t *)calloc(1, sizeof(ucs_ucs_state_t));
45 	if (cd == (ucs_ucs_state_t *)NULL) {
46 		errno = ENOMEM;
47 		return((void *)-1);
48 	}
49 
50 #if defined(UTF_16BE) || defined(UCS_2BE)
51 	cd->input.little_endian = false;
52 	cd->input.bom_written = true;
53 #elif defined(UTF_16LE) || defined(UCS_2LE)
54 	cd->input.little_endian = true;
55 	cd->input.bom_written = true;
56 #elif defined(_LITTLE_ENDIAN)
57 	cd->input.little_endian = true;
58 #endif
59 
60 #if defined(UCS_4BE) || defined(UTF_32BE)
61 	cd->output.little_endian = false;
62 	cd->output.bom_written = true;
63 #elif defined(UCS_4LE) || defined(UTF_32LE)
64 	cd->output.little_endian = true;
65 	cd->output.bom_written = true;
66 #elif defined(_LITTLE_ENDIAN)
67 	cd->output.little_endian = true;
68 #endif
69 
70 	return((void *)cd);
71 }
72 
73 
74 void
_icv_close(ucs_ucs_state_t * cd)75 _icv_close(ucs_ucs_state_t *cd)
76 {
77 	if (! cd)
78 		errno = EBADF;
79 	else
80 		free((void *)cd);
81 }
82 
83 
84 size_t
_icv_iconv(ucs_ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)85 _icv_iconv(ucs_ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
86                 size_t *outbufleft)
87 {
88 	size_t ret_val = 0;
89 	uchar_t *ib;
90 	uchar_t *ob;
91 	uchar_t *ibtail;
92 	uchar_t *obtail;
93 	uint_t u4;
94 	uint_t u4_2;
95 	register int i;
96 
97 	if (! cd) {
98 		errno = EBADF;
99 		return((size_t)-1);
100 	}
101 
102 	if (!inbuf || !(*inbuf)) {
103 #if defined(UCS_2) || defined(UTF_16)
104 		cd->input.bom_written = false;
105 #endif
106 #if defined(UCS_4) || defined(UTF_32)
107 		cd->output.bom_written = false;
108 #endif
109 		return((size_t)0);
110 	}
111 
112 	ib = (uchar_t *)*inbuf;
113 	ob = (uchar_t *)*outbuf;
114 	ibtail = ib + *inbufleft;
115 	obtail = ob + *outbufleft;
116 
117 #if defined(UCS_2) || defined(UTF_16)
118 	if (! cd->input.bom_written) {
119 		if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
120 			errno = EINVAL;
121 			ret_val = (size_t)-1;
122 			goto need_more_input_err;
123 		}
124 
125 		for (u4 = 0, i = 0; i < ICV_FETCH_UCS_SIZE; i++)
126 			u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
127 
128 		if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
129 			ib += ICV_FETCH_UCS_SIZE;
130 			cd->input.little_endian = false;
131 		} else if (u4 == ICV_BOM_IN_LITTLE_ENDIAN) {
132 			ib += ICV_FETCH_UCS_SIZE;
133 			cd->input.little_endian = true;
134 		}
135 	}
136 	cd->input.bom_written = true;
137 #endif
138 
139 	while (ib < ibtail) {
140 		if ((ibtail - ib) < ICV_FETCH_UCS_SIZE) {
141 			errno = EINVAL;
142 			ret_val = (size_t)-1;
143 			break;
144 		}
145 
146 		u4 = u4_2 = 0;
147 		if (cd->input.little_endian) {
148 			for (i = ICV_FETCH_UCS_SIZE - 1; i >= 0; i--)
149 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
150 		} else {
151 			for (i = 0; i < ICV_FETCH_UCS_SIZE; i++)
152 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
153 		}
154 
155 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
156 		if ((u4 >= 0x00dc00 && u4 <= 0x00dfff) || u4 >= 0x00fffe) {
157 			errno = EILSEQ;
158 			ret_val = (size_t)-1;
159 			break;
160 		}
161 
162 		if (u4 >= 0x00d800 && u4 <= 0x00dbff) {
163 			if ((ibtail - ib) < ICV_FETCH_UCS_SIZE_TWO) {
164 				errno = EINVAL;
165 				ret_val = (size_t)-1;
166 				break;
167 			}
168 
169 			if (cd->input.little_endian) {
170 				for (i = ICV_FETCH_UCS_SIZE_TWO - 1;
171 					i >= ICV_FETCH_UCS_SIZE;
172 						i--)
173 					u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
174 			} else {
175 				for (i = ICV_FETCH_UCS_SIZE;
176 					i < ICV_FETCH_UCS_SIZE_TWO;
177 						i++)
178 					u4_2 = (u4_2<<8)|((uint_t)(*(ib + i)));
179 			}
180 
181 			if (u4_2 < 0x00dc00 || u4_2 > 0x00dfff) {
182 				errno = EILSEQ;
183 				ret_val = (size_t)-1;
184 				break;
185 			}
186 
187 			u4 = ((((u4 - 0x00d800) * 0x400) +
188 				(u4_2 - 0x00dc00)) & 0x0fffff) + 0x010000;
189 		}
190 #elif defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
191 		if (u4 >= 0x00fffe || (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
192 			errno = EILSEQ;
193 			ret_val = (size_t)-1;
194 			break;
195 		}
196 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
197 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
198 		/*
199 		 * We do nothing here since these if expressions are
200 		 * only for input characters particularly of
201 		 * UCS-2, UCS-2BE, UCS-2LE, UTF-16, UTF-16BE, and
202 		 * UTF-16LE.
203 		 */
204 #else
205 #error	"Fatal: one of the UCS macros need to be defined."
206 #endif
207 
208 		if ((obtail - ob) < ((cd->output.bom_written) ? 4 : 8)) {
209 			errno = E2BIG;
210 			ret_val = (size_t)-1;
211 			break;
212 		}
213 
214 		if (cd->output.little_endian) {
215 			if (! cd->output.bom_written) {
216 				*ob++ = (uchar_t)0xff;
217 				*ob++ = (uchar_t)0xfe;
218 				*(ushort_t *)ob = (ushort_t)0;
219 				ob += 2;
220 				cd->output.bom_written = true;
221 			}
222 			*ob++ = (uchar_t)(u4 & 0xff);
223 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
224 			*ob++ = (uchar_t)((u4 >> 16) & 0xff);
225 			*ob++ = (uchar_t)((u4 >> 24) & 0xff);
226 		} else {
227 			if (! cd->output.bom_written) {
228 				*(ushort_t *)ob = (ushort_t)0;
229 				ob += 2;
230 				*ob++ = (uchar_t)0xfe;
231 				*ob++ = (uchar_t)0xff;
232 				cd->output.bom_written = true;
233 			}
234 			*ob++ = (uchar_t)((u4 >> 24) & 0xff);
235 			*ob++ = (uchar_t)((u4 >> 16) & 0xff);
236 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
237 			*ob++ = (uchar_t)(u4 & 0xff);
238 		}
239 		ib += ((u4_2) ? ICV_FETCH_UCS_SIZE_TWO : ICV_FETCH_UCS_SIZE);
240 	}
241 
242 #if defined(UCS_2) || defined(UTF_16)
243 need_more_input_err:
244 #endif
245 	*inbuf = (char *)ib;
246 	*inbufleft = ibtail - ib;
247 	*outbuf = (char *)ob;
248 	*outbufleft = obtail - ob;
249 
250 	return(ret_val);
251 }
252