1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/sysmacros.h>
29 #include <sys/systm.h>
30 #include <sys/debug.h>
31 #include <sys/kmem.h>
32 #include <sys/sunddi.h>
33 #include <sys/byteorder.h>
34 #include <sys/errno.h>
35 #include <sys/u8_textprep.h>
36 #include <sys/kiconv.h>
37 #include <sys/kiconv_cck_common.h>
38
39 /*
40 * Common kiconv_open method for UTF-8 -> CCK conversion.
41 */
42 void *
kiconv_open_to_cck()43 kiconv_open_to_cck()
44 {
45 kiconv_state_t st;
46
47 st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
48
49 st->bom_processed = 0;
50
51 return ((void *)st);
52 }
53
54 /*
55 * Common kiconv_close method for UTF-8 -> CCK conversion.
56 */
57 int
kiconv_close_to_cck(void * kcd)58 kiconv_close_to_cck(void *kcd)
59 {
60 if (! kcd || kcd == (void *)-1)
61 return (EBADF);
62
63 kmem_free(kcd, sizeof (kiconv_state_data_t));
64
65 return (0);
66 }
67
68 /*
69 * Common routine to convert UTF-8 sequence to CCK legal character sequence.
70 */
71 size_t
kiconv_utf8_to_cck(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)72 kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft,
73 char **outbuf, size_t *outbytesleft, int *errno,
74 kiconv_utf8tocck_t ptr_utf8tocck)
75 {
76 uchar_t *ib;
77 uchar_t *ob;
78 uchar_t *ibtail;
79 uchar_t *obtail;
80 uchar_t *oldib;
81 size_t ret_val;
82 size_t i; /* temp variable in for loop */
83 uint32_t u8;
84 int8_t sz;
85
86 /* Check on the kiconv code conversion descriptor. */
87 if (! kcd || kcd == (void *)-1) {
88 *errno = EBADF;
89 return ((size_t)-1);
90 }
91
92 /* If this is a state reset request, process and return. */
93 if (! inbuf || !(*inbuf)) {
94 ((kiconv_state_t)kcd)->bom_processed = 0;
95 return (0);
96 }
97
98 ret_val = 0;
99 ib = (uchar_t *)*inbuf;
100 ob = (uchar_t *)*outbuf;
101 ibtail = ib + *inbytesleft;
102 obtail = ob + *outbytesleft;
103
104 KICONV_CHECK_UTF8_BOM(ib, ibtail);
105
106 while (ib < ibtail) {
107 sz = u8_number_of_bytes[*ib];
108
109 /*
110 * If it is a 7-bit ASCII character, we don't need to
111 * process further and we just copy the character over.
112 *
113 * If not, we connect the chracter bytes up to four bytes,
114 * validate the bytes, and binary search for the corresponding
115 * table. If we find it from the mapping table, we put that
116 * into the output buffer; otherwise, we put a replacement
117 * character instead as a non-identical conversion.
118 */
119 if (sz == 1) {
120 if (ob >= obtail) {
121 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
122 }
123
124 *ob++ = *ib++;
125 continue;
126 }
127
128 /*
129 * Issue EILSEQ error if the first byte is a
130 * invalid UTF-8 character leading byte.
131 */
132 if (sz <= 0) {
133 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
134 }
135
136 /*
137 * Issue EINVAL error if input buffer has an incomplete
138 * character at the end of the buffer.
139 */
140 if (ibtail - ib < sz) {
141 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
142 }
143
144 /*
145 * We collect UTF-8 character bytes and also check if this
146 * is a valid UTF-8 character without any bogus bytes based
147 * on the latest UTF-8 binary representation.
148 */
149 oldib = ib;
150 u8 = *ib++;
151
152 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
153 goto ILLEGAL_CHAR_PROCESS;
154 u8 = (u8 << 8) | *ib++;
155
156 for (i = 2; i < sz; i++) {
157 if (*ib < 0x80 || *ib > 0xbf) {
158 ILLEGAL_CHAR_PROCESS:
159 *errno = EILSEQ;
160 ret_val = (size_t)-1;
161 ib = oldib;
162 goto ILLEGAL_CHAR_ERR;
163 }
164
165 u8 = (u8 << 8) | *ib++;
166 }
167
168 /* Now we have a valid UTF-8 character. */
169 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
170 if (sz < 0) {
171 ib = oldib;
172 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
173 }
174
175 ob += sz;
176 }
177
178 ILLEGAL_CHAR_ERR:
179 *inbuf = (char *)ib;
180 *inbytesleft = ibtail - ib;
181 *outbuf = (char *)ob;
182 *outbytesleft = obtail - ob;
183
184 return (ret_val);
185 }
186
187 size_t
kiconvstr_utf8_to_cck(uchar_t * ib,size_t * inlen,uchar_t * ob,size_t * outlen,int flag,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)188 kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen,
189 int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck)
190 {
191 uchar_t *ibtail;
192 uchar_t *obtail;
193 uchar_t *oldib;
194 size_t ret_val;
195 size_t i; /* temp variable in for loop */
196 uint32_t u8;
197 int8_t sz;
198 boolean_t do_not_ignore_null;
199
200 ret_val = 0;
201 ibtail = ib + *inlen;
202 obtail = ob + *outlen;
203 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
204
205 KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail);
206
207 while (ib < ibtail) {
208 if (*ib == '\0' && do_not_ignore_null)
209 break;
210
211 sz = u8_number_of_bytes[*ib];
212
213 if (sz == 1) {
214 if (ob >= obtail) {
215 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
216 }
217
218 *ob++ = *ib++;
219 continue;
220 }
221
222 oldib = ib;
223
224 if (sz <= 0) {
225 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
226 }
227
228 if (ibtail - ib < sz) {
229 if (flag & KICONV_REPLACE_INVALID) {
230 ib = ibtail;
231 goto REPLACE_INVALID;
232 }
233
234 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
235 }
236
237 u8 = *ib++;
238
239 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
240 goto ILLEGAL_CHAR_PROCESS;
241 u8 = (u8 << 8) | *ib++;
242
243 for (i = 2; i < sz; i++) {
244 if (*ib < 0x80 || *ib > 0xbf) {
245 ILLEGAL_CHAR_PROCESS:
246 if (flag & KICONV_REPLACE_INVALID) {
247 ib = oldib + sz;
248 goto REPLACE_INVALID;
249 }
250
251 *errno = EILSEQ;
252 ret_val = (size_t)-1;
253 ib = oldib;
254 goto ILLEGAL_CHAR_ERR;
255 }
256
257 u8 = (u8 << 8) | *ib++;
258 }
259
260 /* Now we get a valid character encoded in UTF-8. */
261 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
262 if (sz < 0) {
263 ib = oldib;
264 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
265 }
266
267 ob += sz;
268 continue;
269
270 REPLACE_INVALID:
271 if (ob >= obtail) {
272 ib = oldib;
273 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
274 }
275
276 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
277 ret_val++;
278 }
279
280 ILLEGAL_CHAR_ERR:
281 *inlen = ibtail - ib;
282 *outlen = obtail - ob;
283
284 return (ret_val);
285 }
286
287 /*
288 * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1]. Return 0 if not found.
289 * tbl[0] is a special element for non-identical conversion.
290 */
291 size_t
kiconv_binsearch(uint32_t key,void * tbl,size_t nitems)292 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems)
293 {
294 size_t low, high, mid;
295 kiconv_table_t *table;
296
297 low = 1;
298 high = nitems - 1;
299 table = (kiconv_table_t *)tbl;
300
301 while (low <= high) {
302 mid = (low + high) / 2;
303
304 if (key < table[mid].key)
305 high = mid - 1;
306 else if (key > table[mid].key)
307 low = mid + 1;
308 else
309 return (mid);
310 }
311
312 return (0);
313 }
314