xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/ucs4_to_ucs.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * This particular file is to cover conversions from UCS-4, UCS-4BE, UCS-4LE,
26  * UTF-32, UTF-32BE, and UTF-32LE to various other UCS formats, especially,
27  * UCS-2, UCS-2BE, UCS-2LE, UTF-16, UTF-16BE, and UTF-16LE.
28  */
29 
30 
31 #include <stdlib.h>
32 #include <errno.h>
33 #include <sys/types.h>
34 #include <sys/isa_defs.h>
35 #include "ucs4_to_ucs.h"
36 
37 
38 void *
_icv_open()39 _icv_open()
40 {
41 	ucs_ucs_state_t *cd;
42 
43 	cd = (ucs_ucs_state_t *)calloc(1, sizeof(ucs_ucs_state_t));
44 	if (cd == (ucs_ucs_state_t *)NULL) {
45 		errno = ENOMEM;
46 		return((void *)-1);
47 	}
48 
49 #if defined(UCS_4BE) || defined(UTF_32BE)
50 	cd->input.little_endian = false;
51 	cd->input.bom_written = true;
52 #elif defined(UCS_4LE) || defined(UTF_32LE)
53 	cd->input.little_endian = true;
54 	cd->input.bom_written = true;
55 #elif defined(_LITTLE_ENDIAN)
56 	cd->input.little_endian = true;
57 #endif
58 
59 #if defined(UTF_16BE) || defined(UCS_2BE)
60 	cd->output.little_endian = false;
61 	cd->output.bom_written = true;
62 #elif defined(UTF_16LE) || defined(UCS_2LE)
63 	cd->output.little_endian = true;
64 	cd->output.bom_written = true;
65 #elif defined(_LITTLE_ENDIAN)
66 	cd->output.little_endian = true;
67 #endif
68 
69 	return((void *)cd);
70 }
71 
72 
73 void
_icv_close(ucs_ucs_state_t * cd)74 _icv_close(ucs_ucs_state_t *cd)
75 {
76 	if (! cd)
77 		errno = EBADF;
78 	else
79 		free((void *)cd);
80 }
81 
82 
83 size_t
_icv_iconv(ucs_ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)84 _icv_iconv(ucs_ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
85                 size_t *outbufleft)
86 {
87 	size_t ret_val = 0;
88 	uchar_t *ib;
89 	uchar_t *ob;
90 	uchar_t *ibtail;
91 	uchar_t *obtail;
92 	uint_t u4;
93 	uint_t u4_2;
94 	signed char obsz;
95 	int i;
96 
97 
98 	if (! cd) {
99 		errno = EBADF;
100 		return((size_t)-1);
101 	}
102 
103 	if (!inbuf || !(*inbuf)) {
104 #if defined(UCS_4) || defined(UTF_32)
105 		cd->input.bom_written = false;
106 #endif
107 #if defined(UCS_2) || defined(UTF_16)
108 		cd->output.bom_written = false;
109 #endif
110 		return((size_t)0);
111 	}
112 
113 	ib = (uchar_t *)*inbuf;
114 	ob = (uchar_t *)*outbuf;
115 	ibtail = ib + *inbufleft;
116 	obtail = ob + *outbufleft;
117 
118 #if defined(UCS_4) || defined(UTF_32)
119 	if (! cd->input.bom_written) {
120 		if ((ibtail - ib) < ICV_FETCH_UCS4_SIZE) {
121 			errno = EINVAL;
122 			ret_val = (size_t)-1;
123 			goto need_more_input_err;
124 		}
125 
126 		for (u4 = 0, i = 0; i < ICV_FETCH_UCS4_SIZE; i++)
127 			u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
128 
129 		if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
130 			ib += ICV_FETCH_UCS4_SIZE;
131 			cd->input.little_endian = false;
132 		} else if (u4 == ICV_BOM_IN_LITTLE_ENDIAN_UCS4) {
133 			ib += ICV_FETCH_UCS4_SIZE;
134 			cd->input.little_endian = true;
135 		}
136 	}
137 	cd->input.bom_written = true;
138 #endif
139 
140 
141 	while (ib < ibtail) {
142 		if ((ibtail - ib) < ICV_FETCH_UCS4_SIZE) {
143 			errno = EINVAL;
144 			ret_val = (size_t)-1;
145 			break;
146 		}
147 
148 		u4 = u4_2 = 0;
149 		if (cd->input.little_endian) {
150 			for (i = ICV_FETCH_UCS4_SIZE - 1; i >= 0; i--)
151 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
152 		} else {
153 			for (i = 0; i < ICV_FETCH_UCS4_SIZE; i++)
154 				u4 = (u4 << 8) | ((uint_t)(*(ib + i)));
155 		}
156 
157 		if (u4 == 0x00fffe || u4 == 0x00ffff ||
158 #if defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
159 		    u4 > 0x10ffff ||
160 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
161 		    u4 > 0x7fffffff ||
162 #endif
163 		    (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
164 			errno = EILSEQ;
165 			ret_val = (size_t)-1;
166 			goto illegal_char_err;
167 		}
168 
169 		obsz = (cd->output.bom_written) ? 2 : 4;
170 #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
171 		if (u4 > 0x00ffff) {
172 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
173 			ret_val++;
174 		}
175 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
176 		if (u4 > 0x10ffff) {
177 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
178 			ret_val++;
179 		} else if (u4 > 0x00ffff) {
180 			u4_2 = ((u4 - 0x010000) % 0x400) + 0x00dc00;
181 			u4   = ((u4 - 0x010000) / 0x400) + 0x00d800;
182 			obsz += 2;
183 		}
184 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
185 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
186 		/*
187 		 * We do nothing here since these if expressions
188 		 * are only for preparing for output buffer;
189 		 * macros such as UCS_4/UCS_4BE/UCS_4LE and
190 		 * UTF_32/UTF_32BE/UTF_32LE are only for input.
191 		 */
192 #else
193 #error	"Fatal: one of the UCS macros need to be defined."
194 #endif
195 		if ((obtail - ob) < obsz) {
196 			errno = E2BIG;
197 			ret_val = (size_t)-1;
198 			break;
199 		}
200 
201 		if (cd->output.little_endian) {
202 			if (! cd->output.bom_written) {
203 				*ob++ = (uchar_t)0xff;
204 				*ob++ = (uchar_t)0xfe;
205 				cd->output.bom_written = true;
206 			}
207 			*ob++ = (uchar_t)(u4 & 0xff);
208 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
209 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
210 			if (u4_2) {
211 				*ob++ = (uchar_t)(u4_2 & 0xff);
212 				*ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
213 			}
214 #endif
215 		} else {
216 			if (! cd->output.bom_written) {
217 				*ob++ = (uchar_t)0xfe;
218 				*ob++ = (uchar_t)0xff;
219 				cd->output.bom_written = true;
220 			}
221 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
222 			*ob++ = (uchar_t)(u4 & 0xff);
223 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
224 			if (u4_2) {
225 				*ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
226 				*ob++ = (uchar_t)(u4_2 & 0xff);
227 			}
228 #endif
229 		}
230 		ib += ICV_FETCH_UCS4_SIZE;
231 	}
232 
233 #if defined(UCS_4) || defined(UTF_32)
234 need_more_input_err:
235 #endif
236 illegal_char_err:
237 	*inbuf = (char *)ib;
238 	*inbufleft = ibtail - ib;
239 	*outbuf = (char *)ob;
240 	*outbufleft = obtail - ob;
241 
242 	return(ret_val);
243 }
244