xref: /illumos-gate/usr/src/uts/common/kiconv/kiconv_sc/kiconv_cck_common.c (revision c65ebfc7045424bd04a6c7719a27b0ad3399ad54)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/sunddi.h>
35 #include <sys/byteorder.h>
36 #include <sys/errno.h>
37 #include <sys/u8_textprep.h>
38 #include <sys/kiconv.h>
39 #include <sys/kiconv_cck_common.h>
40 
41 /*LINTLIBRARY*/
42 
43 /*
44  * Common kiconv_open method for UTF-8 -> CCK conversion.
45  */
46 void *
47 kiconv_open_to_cck()
48 {
49 	kiconv_state_t st;
50 
51 	st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
52 
53 	st->bom_processed = 0;
54 
55 	return ((void *)st);
56 }
57 
58 /*
59  * Common kiconv_close method for UTF-8 -> CCK conversion.
60  */
61 int
62 kiconv_close_to_cck(void *kcd)
63 {
64 	if (! kcd || kcd == (void *)-1)
65 		return (EBADF);
66 
67 	kmem_free(kcd, sizeof (kiconv_state_data_t));
68 
69 	return (0);
70 }
71 
/*
 * Common routine to convert UTF-8 sequence to CCK legal character sequence.
 *
 * Arguments:
 *	kcd		conversion descriptor; NULL and (void *)-1 are rejected
 *			with EBADF.
 *	inbuf		address of the input pointer.  A NULL inbuf or a NULL
 *			*inbuf is a state reset request: bom_processed is
 *			cleared and 0 is returned without touching any other
 *			argument.
 *	inbytesleft	number of input bytes available at *inbuf.
 *	outbuf		address of the output pointer.
 *	outbytesleft	number of bytes that may be written at *outbuf.
 *	errno		where the error code is stored on failure.
 *	ptr_utf8tocck	callback that emits the target encoding for one
 *			validated UTF-8 character.
 *
 * On every exit past the two early checks, *inbuf, *inbytesleft, *outbuf and
 * *outbytesleft are updated to describe the unconsumed input and the unused
 * output space.  When a character cannot be converted, the input pointer is
 * left at the first byte of that character.
 *
 * Returns ret_val, which starts at 0, is set to (size_t)-1 on an illegal
 * sequence, and is also handed by address to ptr_utf8tocck.
 *
 * NOTE(review): KICONV_CHECK_UTF8_BOM, KICONV_SET_ERRNO_AND_BREAK and
 * KICONV_IS_INVALID_UTF8_SECOND_BYTE are defined outside this file.  They
 * presumably refer to the locals kcd, errno and ret_val by name, and
 * KICONV_SET_ERRNO_AND_BREAK presumably records the error, sets ret_val to
 * (size_t)-1 and breaks out of the while loop -- confirm in the header
 * before renaming any local or moving a macro call out of the loop.
 */
size_t
kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft,
	char **outbuf, size_t *outbytesleft, int *errno,
	kiconv_utf8tocck_t ptr_utf8tocck)
{
	uchar_t		*ib;		/* current input position */
	uchar_t		*ob;		/* current output position */
	uchar_t		*ibtail;	/* one past the end of the input */
	uchar_t		*obtail;	/* one past the end of the output */
	uchar_t		*oldib;		/* start of the character in progress */
	size_t		ret_val;
	size_t		i;		/* temp variable in for loop */
	uint32_t	u8;		/* raw UTF-8 bytes packed into one word */
	int8_t		sz;

	/* Check on the kiconv code conversion descriptor. */
	if (! kcd || kcd == (void *)-1) {
		*errno = EBADF;
		return ((size_t)-1);
	}

	/* If this is a state reset request, process and return. */
	if (! inbuf || !(*inbuf)) {
		((kiconv_state_t)kcd)->bom_processed = 0;
		return (0);
	}

	ret_val = 0;
	ib = (uchar_t *)*inbuf;
	ob = (uchar_t *)*outbuf;
	ibtail = ib + *inbytesleft;
	obtail = ob + *outbytesleft;

	/* Presumably skips a leading UTF-8 BOM once per descriptor. */
	KICONV_CHECK_UTF8_BOM(ib, ibtail);

	while (ib < ibtail) {
		/* Sequence length, looked up by leading byte. */
		sz = u8_number_of_bytes[*ib];

		/*
		 * If it is a 7-bit ASCII character, we don't need to
		 * process further and we just copy the character over.
		 *
		 * If not, we pack the character bytes (up to four) into
		 * u8, validate them, and hand the packed value to
		 * ptr_utf8tocck, which is expected to look it up in the
		 * corresponding mapping table and write either the mapped
		 * character or a replacement character (a non-identical
		 * conversion) into the output buffer.
		 */
		if (sz == 1) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Issue EILSEQ error if the first byte is an
		 * invalid UTF-8 character leading byte.
		 */
		if (sz <= 0) {
			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
		}

		/*
		 * Issue EINVAL error if input buffer has an incomplete
		 * character at the end of the buffer.
		 */
		if (ibtail - ib < sz) {
			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		/*
		 * We collect UTF-8 character bytes and also check if this
		 * is a valid UTF-8 character without any bogus bytes based
		 * on the latest UTF-8 binary representation.
		 *
		 * From here on sz is at least 2 and sz bytes are known to be
		 * available, so reading the second byte is within bounds.
		 */
		oldib = ib;
		u8 = *ib++;

		/* u8 still holds only the leading byte at this point. */
		if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
			goto ILLEGAL_CHAR_PROCESS;
		u8 = (u8 << 8) | *ib++;

		/* Third and fourth bytes must be plain continuation bytes. */
		for (i = 2; i < sz; i++) {
			if (*ib < 0x80 || *ib > 0xbf) {
				/*
				 * Shared failure path, also entered by the
				 * goto above: rewind to the start of the
				 * bad character and leave the loop.
				 */
ILLEGAL_CHAR_PROCESS:
				*errno = EILSEQ;
				ret_val = (size_t)-1;
				ib = oldib;
				goto ILLEGAL_CHAR_ERR;
			}

			u8 = (u8 << 8) | *ib++;
		}

		/*
		 * Now we have a valid UTF-8 character.  The callback gets
		 * the addresses of ib and ret_val and so may update both;
		 * a negative return is treated as "output buffer too small"
		 * and any other return is the amount to advance ob by.
		 */
		sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
		if (sz < 0) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ob += sz;
	}

	/* Common exit: report how far input and output advanced. */
ILLEGAL_CHAR_ERR:
	*inbuf = (char *)ib;
	*inbytesleft = ibtail - ib;
	*outbuf = (char *)ob;
	*outbytesleft = obtail - ob;

	return (ret_val);
}
190 
/*
 * Common routine to convert a UTF-8 buffer to a CCK character sequence in a
 * single call, without a conversion descriptor.
 *
 * Arguments:
 *	ib		start of the input.
 *	inlen		in: number of input bytes; out: bytes left unconsumed.
 *	ob		start of the output buffer.
 *	outlen		in: output buffer size; out: bytes left unused.
 *	flag		bit flags.  Unless KICONV_IGNORE_NULL is set, the
 *			conversion stops at the first NUL byte in the input.
 *			If KICONV_REPLACE_INVALID is set, an invalid or
 *			truncated character is replaced by one
 *			KICONV_ASCII_REPLACEMENT_CHAR instead of failing.
 *	errno		where the error code is stored on failure.
 *	ptr_utf8tocck	callback that emits the target encoding for one
 *			validated UTF-8 character.
 *
 * ib and ob are passed by value, so progress is reported to the caller only
 * through *inlen and *outlen.
 *
 * Returns ret_val, which starts at 0, is incremented once for every
 * replacement character written here, is set to (size_t)-1 on an illegal
 * sequence, and is also handed by address to ptr_utf8tocck.
 *
 * NOTE(review): KICONV_CHECK_UTF8_BOM_WITHOUT_STATE,
 * KICONV_SET_ERRNO_AND_BREAK, KICONV_SET_ERRNO_WITH_FLAG and
 * KICONV_IS_INVALID_UTF8_SECOND_BYTE are defined outside this file.  They
 * presumably refer to the locals flag, errno, ret_val and ib by name, and
 * presumably break out of the while loop or jump to REPLACE_INVALID --
 * confirm in the header before renaming any local or label.
 */
size_t
kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen,
	int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck)
{
	uchar_t		*ibtail;	/* one past the end of the input */
	uchar_t		*obtail;	/* one past the end of the output */
	uchar_t		*oldib;		/* start of the character in progress */
	size_t		ret_val;
	size_t		i;		/* temp variable in for loop */
	uint32_t	u8;		/* raw UTF-8 bytes packed into one word */
	int8_t		sz;
	boolean_t	do_not_ignore_null;

	ret_val = 0;
	ibtail = ib + *inlen;
	obtail = ob + *outlen;
	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);

	/* Presumably skips a leading UTF-8 BOM, if there is one. */
	KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail);

	while (ib < ibtail) {
		/* A NUL byte ends the conversion unless told otherwise. */
		if (*ib == '\0' && do_not_ignore_null)
			break;

		/* Sequence length, looked up by leading byte. */
		sz = u8_number_of_bytes[*ib];

		/* Single-byte characters are copied through unchanged. */
		if (sz == 1) {
			if (ob >= obtail) {
				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
			}

			*ob++ = *ib++;
			continue;
		}

		/*
		 * Remember where this character starts before any failure
		 * path can run: REPLACE_INVALID and ILLEGAL_CHAR_PROCESS
		 * both read oldib.
		 */
		oldib = ib;

		/*
		 * Invalid leading byte.  Presumably the macro either skips
		 * the one byte and replaces it, or fails with EILSEQ,
		 * depending on flag.
		 */
		if (sz <= 0) {
			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
		}

		/*
		 * Incomplete character at the end of the input: in replace
		 * mode the whole remainder is consumed and yields a single
		 * replacement character; otherwise this is EINVAL.
		 */
		if (ibtail - ib < sz) {
			if (flag & KICONV_REPLACE_INVALID) {
				ib = ibtail;
				goto REPLACE_INVALID;
			}

			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
		}

		u8 = *ib++;

		/* u8 still holds only the leading byte at this point. */
		if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
			goto ILLEGAL_CHAR_PROCESS;
		u8 = (u8 << 8) | *ib++;

		/* Third and fourth bytes must be plain continuation bytes. */
		for (i = 2; i < sz; i++) {
			if (*ib < 0x80 || *ib > 0xbf) {
				/*
				 * Shared failure path, also entered by the
				 * goto above.  In replace mode all sz bytes
				 * of the bad character are skipped;
				 * otherwise ib is rewound to its first byte
				 * and the conversion fails with EILSEQ.
				 */
ILLEGAL_CHAR_PROCESS:
				if (flag & KICONV_REPLACE_INVALID) {
					ib = oldib + sz;
					goto REPLACE_INVALID;
				}

				*errno = EILSEQ;
				ret_val = (size_t)-1;
				ib = oldib;
				goto ILLEGAL_CHAR_ERR;
			}

			u8 = (u8 << 8) | *ib++;
		}

		/*
		 * Now we get a valid character encoded in UTF-8.  The
		 * callback gets the addresses of ib and ret_val and so may
		 * update both; a negative return is treated as "output
		 * buffer too small" and any other return is the amount to
		 * advance ob by.
		 */
		sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
		if (sz < 0) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		ob += sz;
		continue;

		/*
		 * Reached only by goto.  Emit one replacement character and
		 * count it; if there is no room, rewind to the start of the
		 * bad character so the caller can retry.
		 */
REPLACE_INVALID:
		if (ob >= obtail) {
			ib = oldib;
			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
		}

		*ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
		ret_val++;
	}

	/* Common exit: report how much input and output space remain. */
ILLEGAL_CHAR_ERR:
	*inlen = ibtail - ib;
	*outlen = obtail - ob;

	return (ret_val);
}
290 
291 /*
292  * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1].  Return 0 if not found.
293  * tbl[0] is a special element for non-identical conversion.
294  */
295 size_t
296 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems)
297 {
298 	size_t low, high, mid;
299 	kiconv_table_t *table;
300 
301 	low = 1;
302 	high = nitems - 1;
303 	table = (kiconv_table_t *)tbl;
304 
305 	while (low <= high) {
306 		mid = (low + high) / 2;
307 
308 		if (key < table[mid].key)
309 			high = mid - 1;
310 		else if (key > table[mid].key)
311 			low = mid + 1;
312 		else
313 			return (mid);
314 	}
315 
316 	return (0);
317 }
318