xref: /illumos-gate/usr/src/lib/iconv_modules/utf-8/common/sb_to_ucs.c (revision 16d8656330ae5622ec32e5007f62145ebafdc50f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  *
25  * In this program, we assume that each table entry provided will contain
26  * a valid UCS character, an illegal character, or, a replacement character.
27  * In other words, it is table provider's responsibility to provide
28  * an appropriate mapping for each single byte character in the table since
29  * the program in this file will not do any special checking on the table
30  * component values.
31  *
32  * This particular file is to cover conversions from various single byte
33  * codesets to UCS-2, UCS-2BE, UCS-2LE, UCS-4, UCS-4BE, UCS-4LE, UTF-16,
34  * UTF-16BE, UTF-16LE, UTF-32, UTF-32BE, and UTF-32LE.
35  */
36 
37 
38 #include <stdlib.h>
39 #include <errno.h>
40 #include <sys/types.h>
41 #include <sys/isa_defs.h>
42 #include "sb_to_ucs.h"
43 
44 
45 void *
_icv_open()46 _icv_open()
47 {
48 	ucs_state_t *cd = (ucs_state_t *)calloc(1, sizeof(ucs_state_t));
49 
50 	if (cd == (ucs_state_t *)NULL) {
51 		errno = ENOMEM;
52 		return((void *)-1);
53 	}
54 
55 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \
56 	defined(UTF_32BE)
57 	cd->little_endian = false;
58 	cd->bom_written = true;
59 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \
60 	defined(UTF_32LE)
61 	cd->little_endian = true;
62 	cd->bom_written = true;
63 #elif defined(_LITTLE_ENDIAN)
64 	cd->little_endian = true;
65 #endif
66 
67 	return((void *)cd);
68 }
69 
70 
71 void
_icv_close(ucs_state_t * cd)72 _icv_close(ucs_state_t *cd)
73 {
74 	if (! cd)
75 		errno = EBADF;
76 	else
77 		free((void *)cd);
78 }
79 
80 
81 size_t
_icv_iconv(ucs_state_t * cd,char ** inbuf,size_t * inbufleft,char ** outbuf,size_t * outbufleft)82 _icv_iconv(ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf,
83                 size_t *outbufleft)
84 {
85 	size_t ret_val = 0;
86 	unsigned char *ib;
87 	unsigned char *ob;
88 	unsigned char *ibtail;
89 	unsigned char *obtail;
90 	unsigned int u4;
91 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
92 	unsigned int u4_2;
93 #endif
94 	signed char obsz;
95 
96 
97 	if (! cd) {
98 		errno = EBADF;
99 		return((size_t)-1);
100 	}
101 
102 	if (!inbuf || !(*inbuf)) {
103 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32)
104 		cd->bom_written = false;
105 #endif
106 		return((size_t)0);
107 	}
108 
109 	ib = (unsigned char *)*inbuf;
110 	ob = (unsigned char *)*outbuf;
111 	ibtail = ib + *inbufleft;
112 	obtail = ob + *outbufleft;
113 
114 	while (ib < ibtail) {
115 		u4 = sb_u4_tbl[*ib].u8;
116 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
117 		u4_2 = 0;
118 #endif
119 
120 		if (sb_u4_tbl[*ib].size == ICV_TYPE_ILLEGAL_CHAR) {
121 			errno = EILSEQ;
122 			ret_val = (size_t)-1;
123 			break;
124 		}
125 
126 		obsz = (cd->bom_written) ? ICV_FETCH_UCS_SIZE :
127 			ICV_FETCH_UCS_SIZE_TWO;
128 #if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE)
129 		if (u4 > 0x00ffff) {
130 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
131 			ret_val++;
132 		}
133 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
134 		if (u4 > 0x00ffff && u4 < 0x110000) {
135 			u4_2 = ((u4 - 0x010000) % 0x400) + 0x00dc00;
136 			u4   = ((u4 - 0x010000) / 0x400) + 0x00d800;
137 			obsz += 2;
138 		} else if (u4 > 0x10ffff) {
139 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
140 			ret_val++;
141 		}
142 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
143 		if (u4 > 0x10ffff) {
144 			u4 = ICV_CHAR_UCS2_REPLACEMENT;
145 			ret_val++;
146 		}
147 #elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE)
148 		/* do nothing */
149 #else
150 #error	"Fatal: one of the UCS macros need to be defined."
151 #endif
152 
153 		/*
154 		 * The target values in the conversion tables are in UCS-4
155 		 * without BOM and so the max target value possible would be
156 		 * U+7FFFFFFF.
157 		 */
158 		if (u4 == 0x00fffe || u4 == 0x00ffff || u4 > 0x7fffffff ||
159 		    (u4 >= 0x00d800 && u4 <= 0x00dfff)) {
160 			/*
161 			 * if conversion table is right, this should not
162 			 * happen.
163 			 */
164 			errno = EILSEQ;
165 			ret_val = (size_t)-1;
166 			break;
167 		}
168 
169 		if ((obtail - ob) < obsz) {
170 			errno = E2BIG;
171 			ret_val = (size_t)-1;
172 			break;
173 		}
174 
175 		if (cd->little_endian) {
176 			if (! cd->bom_written) {
177 				*ob++ = (uchar_t)0xff;
178 				*ob++ = (uchar_t)0xfe;
179 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
180 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
181 				*(ushort_t *)ob = (ushort_t)0;
182 				ob += 2;
183 #endif
184 				cd->bom_written = true;
185 			}
186 			*ob++ = (uchar_t)(u4 & 0xff);
187 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
188 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
189 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
190 			*ob++ = (uchar_t)((u4 >> 16) & 0xff);
191 			*ob++ = (uchar_t)((u4 >> 24) & 0xff);
192 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
193 			if (u4_2) {
194 				*ob++ = (uchar_t)(u4_2 & 0xff);
195 				*ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
196 			}
197 #endif
198 		} else {
199 			if (! cd->bom_written) {
200 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
201 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
202 				*(ushort_t *)ob = (ushort_t)0;
203 				ob += 2;
204 #endif
205 				*ob++ = (uchar_t)0xfe;
206 				*ob++ = (uchar_t)0xff;
207 				cd->bom_written = true;
208 			}
209 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
210 	defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
211 			*ob++ = (uchar_t)((u4 >> 24) & 0xff);
212 			*ob++ = (uchar_t)((u4 >> 16) & 0xff);
213 #endif
214 			*ob++ = (uchar_t)((u4 >> 8) & 0xff);
215 			*ob++ = (uchar_t)(u4 & 0xff);
216 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
217 			if (u4_2) {
218 				*ob++ = (uchar_t)((u4_2 >> 8) & 0xff);
219 				*ob++ = (uchar_t)(u4_2 & 0xff);
220 			}
221 #endif
222 		}
223 		ib++;
224 	}
225 
226 	*inbuf = (char *)ib;
227 	*inbufleft = ibtail - ib;
228 	*outbuf = (char *)ob;
229 	*outbufleft = obtail - ob;
230 
231 	return(ret_val);
232 }
233