xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/utf8_to_ucs.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * This is for conversions from UTF-8 to various UCS forms, esp.,
26  * UCS-2, UCS-2BE, UCS-2LE, UTF-16, UTF-16BE, UTF-16LE, UCS-4, UCS-4BE,
27  * UCS-4LE, UTF-32, UTF-32BE, and UTF-32LE.
28  */
29 
30 
31 #include <stdlib.h>
32 #include <errno.h>
33 #include <sys/types.h>
34 #include <sys/isa_defs.h>
35 #include "utf8_to_ucs.h"
36 
37 
38 void *
_icv_open()39 _icv_open()
40 {
41 	ucs_state_t *cd = (ucs_state_t *)calloc(1, sizeof(ucs_state_t));
42 
43 	if (cd == (ucs_state_t *)NULL) {
44 		errno = ENOMEM;
45 		return((void *)-1);
46 	}
47 
48 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
49 	defined(UTF_32BE)
50 	cd->little_endian = false;
51 	cd->bom_written = true;
52 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
53 	defined(UTF_32LE)
54 	cd->little_endian = true;
55 	cd->bom_written = true;
56 #elif defined(_LITTLE_ENDIAN)
57 	cd->little_endian = true;
58 #endif
59 
60 	return((void *)cd);
61 }
62 
63 
64 void
_icv_close(ucs_state_t * cd)65 _icv_close(ucs_state_t *cd)
66 {
67 	if (! cd)
68 		errno = EBADF;
69 	else
70 		free((void *)cd);
71 }
72 
73 
74 size_t
_icv_iconv(ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)75 _icv_iconv(ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
76                 size_t *outbufleft)
77 {
78 	size_t ret_val = 0;
79 	uchar_t *ib;
80 	uchar_t *ob;
81 	uchar_t *ibtail;
82 	uchar_t *obtail;
83 
84 	if (! cd) {
85 		errno = EBADF;
86 		return((size_t)-1);
87 	}
88 
89 	if (!inbuf || !(*inbuf)) {
90 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
91 		cd->bom_written = false;
92 #endif
93 		return((size_t)0);
94 	}
95 
96 	ib = (uchar_t *)*inbuf;
97 	ob = (uchar_t *)*outbuf;
98 	ibtail = ib + *inbufleft;
99 	obtail = ob + *outbufleft;
100 
101 	while (ib < ibtail) {
102 		uchar_t *ib_org;
103 		uint_t u4;
104 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
105 		uint_t u4_2;
106 #endif
107 		uint_t first_byte;
108 		signed char sz;
109 		signed char obsz;
110 
111 		sz = number_of_bytes_in_utf8_char[*ib];
112 		if (sz == ICV_TYPE_ILLEGAL_CHAR) {
113 			errno = EILSEQ;
114 			ret_val = (size_t)-1;
115 			break;
116 		}
117 
118 		if ((ibtail - ib) < sz) {
119 			errno = EINVAL;
120 			ret_val = (size_t)-1;
121 			break;
122 		}
123 
124 		ib_org = ib;
125 		first_byte = *ib;
126 		u4 = (uint_t)(*ib++ & masks_tbl[sz]);
127 		for (; sz > 1; sz--) {
128 			if (first_byte) {
129 				if (((uchar_t)*ib) <
130 					valid_min_2nd_byte[first_byte] ||
131 				    ((uchar_t)*ib) >
132 					valid_max_2nd_byte[first_byte]) {
133 					ib = ib_org;
134 					errno = EILSEQ;
135 					ret_val = (size_t)-1;
136 					goto ILLEGAL_CHAR_ERR;
137 				}
138 				first_byte = 0;
139 			} else if (((uint_t)*ib) < 0x80 ||
140 				   ((uint_t)*ib) > 0xbf) {
141 				ib = ib_org;
142 				errno = EILSEQ;
143 				ret_val = (size_t)-1;
144 				goto ILLEGAL_CHAR_ERR;
145 			}
146 			u4 = (u4 << ICV_UTF8_BIT_SHIFT) |
147 				(((uint_t)*ib) & ICV_UTF8_BIT_MASK);
148 			ib++;
149 		}
150 
151 		/* Check against known non-characters. */
152 		if ((u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_fffe ||
153 		    (u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_ffff ||
154 		    u4 > ICV_UTF32_LAST_VALID_CHAR ||
155 		    (u4 >= ICV_UTF32_SURROGATE_START_d800 &&
156 		    u4 <= ICV_UTF32_SURROGATE_END_dfff) ||
157 		    (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 &&
158 		    u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) {
159 			ib = ib_org;
160 			errno = EILSEQ;
161 			ret_val = (size_t)-1;
162 			goto ILLEGAL_CHAR_ERR;
163 		}
164 
165 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
166 		u4_2 = 0;
167 #endif
168 
169 		if (u4 == ICV_BOM_IN_BIG_ENDIAN) {
170 			cd->bom_written = true;
171 		}
172 
173 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
174 		obsz = (cd->bom_written) ? 4 : 8;
175 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
176 		obsz = (cd->bom_written) ? 4 : 8;
177 		if (u4 > 0x10ffff) {
178 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
179 			ret_val++;
180 		}
181 #elif defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
182 		obsz = (cd->bom_written) ? 2 : 4;
183 		if (u4 > 0x00ffff) {
184 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
185 			ret_val++;
186 		}
187 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
188 		obsz = (cd->bom_written) ? 2 : 4;
189 		if (u4 > 0x10ffff) {
190 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
191 			ret_val++;
192 		} else if (u4 > 0x00ffff) {
193 			u4_2 = ((u4 - 0x010000) % 0x400) + 0x00dc00;
194 			u4   = ((u4 - 0x010000) / 0x400) + 0x00d800;
195 			obsz += 2;
196 		}
197 #else
198 #error	"Fatal: one of the UCS macros need to be defined."
199 #endif
200 		if ((obtail - ob) < obsz) {
201 			ib = ib_org;
202 			errno = E2BIG;
203 			ret_val = (size_t)-1;
204 			break;
205 		}
206 
207 		if (cd->little_endian) {
208 			if (! cd->bom_written) {
209 				*ob++ = (uchar_t)0xff;
210 				*ob++ = (uchar_t)0xfe;
211 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
212 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
213 				*(ushort_t *)ob = (ushort_t)0;
214 				ob += 2;
215 #endif
216 				cd->bom_written = true;
217 			}
218 			*ob++ = (uchar_t)(u4 & 0xff);
219 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
220 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
221 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
222 			*ob++ = (uchar_t)((u4 >> 16) & 0xff);
223 			*ob++ = (uchar_t)((u4 >> 24) & 0xff);
224 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
225 			if (u4_2) {
226 				*ob++ = (uchar_t)(u4_2 & 0xff);
227 				*ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
228 			}
229 #endif
230 		} else {
231 			if (! cd->bom_written) {
232 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
233 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
234 				*(ushort_t *)ob = (ushort_t)0;
235 				ob += 2;
236 #endif
237 				*ob++ = (uchar_t)0xfe;
238 				*ob++ = (uchar_t)0xff;
239 				cd->bom_written = true;
240 			}
241 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
242 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
243 			*ob++ = (uchar_t)((u4 >> 24) & 0xff);
244 			*ob++ = (uchar_t)((u4 >> 16) & 0xff);
245 #endif
246 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
247 			*ob++ = (uchar_t)(u4 & 0xff);
248 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
249 			if (u4_2) {
250 				*ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
251 				*ob++ = (uchar_t)(u4_2 & 0xff);
252 			}
253 #endif
254 		}
255 	}
256 
257 ILLEGAL_CHAR_ERR:
258 	*inbuf = (char *)ib;
259 	*inbufleft = ibtail - ib;
260 	*outbuf = (char *)ob;
261 	*outbufleft = obtail - ob;
262 
263 	return(ret_val);
264 }
265