1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 #pragma ident "%Z%%M% %I% %E% SMI"
27
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/sysmacros.h>
31 #include <sys/systm.h>
32 #include <sys/debug.h>
33 #include <sys/kmem.h>
34 #include <sys/sunddi.h>
35 #include <sys/byteorder.h>
36 #include <sys/errno.h>
37 #include <sys/u8_textprep.h>
38 #include <sys/kiconv.h>
39 #include <sys/kiconv_cck_common.h>
40
41 /*LINTLIBRARY*/
42
43 /*
44 * Common kiconv_open method for UTF-8 -> CCK conversion.
45 */
46 void *
kiconv_open_to_cck()47 kiconv_open_to_cck()
48 {
49 kiconv_state_t st;
50
51 st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
52
53 st->bom_processed = 0;
54
55 return ((void *)st);
56 }
57
58 /*
59 * Common kiconv_close method for UTF-8 -> CCK conversion.
60 */
61 int
kiconv_close_to_cck(void * kcd)62 kiconv_close_to_cck(void *kcd)
63 {
64 if (! kcd || kcd == (void *)-1)
65 return (EBADF);
66
67 kmem_free(kcd, sizeof (kiconv_state_data_t));
68
69 return (0);
70 }
71
72 /*
73 * Common routine to convert UTF-8 sequence to CCK legal character sequence.
74 */
75 size_t
kiconv_utf8_to_cck(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)76 kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft,
77 char **outbuf, size_t *outbytesleft, int *errno,
78 kiconv_utf8tocck_t ptr_utf8tocck)
79 {
80 uchar_t *ib;
81 uchar_t *ob;
82 uchar_t *ibtail;
83 uchar_t *obtail;
84 uchar_t *oldib;
85 size_t ret_val;
86 size_t i; /* temp variable in for loop */
87 uint32_t u8;
88 int8_t sz;
89
90 /* Check on the kiconv code conversion descriptor. */
91 if (! kcd || kcd == (void *)-1) {
92 *errno = EBADF;
93 return ((size_t)-1);
94 }
95
96 /* If this is a state reset request, process and return. */
97 if (! inbuf || !(*inbuf)) {
98 ((kiconv_state_t)kcd)->bom_processed = 0;
99 return (0);
100 }
101
102 ret_val = 0;
103 ib = (uchar_t *)*inbuf;
104 ob = (uchar_t *)*outbuf;
105 ibtail = ib + *inbytesleft;
106 obtail = ob + *outbytesleft;
107
108 KICONV_CHECK_UTF8_BOM(ib, ibtail);
109
110 while (ib < ibtail) {
111 sz = u8_number_of_bytes[*ib];
112
113 /*
114 * If it is a 7-bit ASCII character, we don't need to
115 * process further and we just copy the character over.
116 *
117 * If not, we connect the chracter bytes up to four bytes,
118 * validate the bytes, and binary search for the corresponding
119 * table. If we find it from the mapping table, we put that
120 * into the output buffer; otherwise, we put a replacement
121 * character instead as a non-identical conversion.
122 */
123 if (sz == 1) {
124 if (ob >= obtail) {
125 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
126 }
127
128 *ob++ = *ib++;
129 continue;
130 }
131
132 /*
133 * Issue EILSEQ error if the first byte is a
134 * invalid UTF-8 character leading byte.
135 */
136 if (sz <= 0) {
137 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
138 }
139
140 /*
141 * Issue EINVAL error if input buffer has an incomplete
142 * character at the end of the buffer.
143 */
144 if (ibtail - ib < sz) {
145 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
146 }
147
148 /*
149 * We collect UTF-8 character bytes and also check if this
150 * is a valid UTF-8 character without any bogus bytes based
151 * on the latest UTF-8 binary representation.
152 */
153 oldib = ib;
154 u8 = *ib++;
155
156 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
157 goto ILLEGAL_CHAR_PROCESS;
158 u8 = (u8 << 8) | *ib++;
159
160 for (i = 2; i < sz; i++) {
161 if (*ib < 0x80 || *ib > 0xbf) {
162 ILLEGAL_CHAR_PROCESS:
163 *errno = EILSEQ;
164 ret_val = (size_t)-1;
165 ib = oldib;
166 goto ILLEGAL_CHAR_ERR;
167 }
168
169 u8 = (u8 << 8) | *ib++;
170 }
171
172 /* Now we have a valid UTF-8 character. */
173 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
174 if (sz < 0) {
175 ib = oldib;
176 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
177 }
178
179 ob += sz;
180 }
181
182 ILLEGAL_CHAR_ERR:
183 *inbuf = (char *)ib;
184 *inbytesleft = ibtail - ib;
185 *outbuf = (char *)ob;
186 *outbytesleft = obtail - ob;
187
188 return (ret_val);
189 }
190
191 size_t
kiconvstr_utf8_to_cck(uchar_t * ib,size_t * inlen,uchar_t * ob,size_t * outlen,int flag,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)192 kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen,
193 int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck)
194 {
195 uchar_t *ibtail;
196 uchar_t *obtail;
197 uchar_t *oldib;
198 size_t ret_val;
199 size_t i; /* temp variable in for loop */
200 uint32_t u8;
201 int8_t sz;
202 boolean_t do_not_ignore_null;
203
204 ret_val = 0;
205 ibtail = ib + *inlen;
206 obtail = ob + *outlen;
207 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
208
209 KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail);
210
211 while (ib < ibtail) {
212 if (*ib == '\0' && do_not_ignore_null)
213 break;
214
215 sz = u8_number_of_bytes[*ib];
216
217 if (sz == 1) {
218 if (ob >= obtail) {
219 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
220 }
221
222 *ob++ = *ib++;
223 continue;
224 }
225
226 oldib = ib;
227
228 if (sz <= 0) {
229 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
230 }
231
232 if (ibtail - ib < sz) {
233 if (flag & KICONV_REPLACE_INVALID) {
234 ib = ibtail;
235 goto REPLACE_INVALID;
236 }
237
238 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
239 }
240
241 u8 = *ib++;
242
243 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
244 goto ILLEGAL_CHAR_PROCESS;
245 u8 = (u8 << 8) | *ib++;
246
247 for (i = 2; i < sz; i++) {
248 if (*ib < 0x80 || *ib > 0xbf) {
249 ILLEGAL_CHAR_PROCESS:
250 if (flag & KICONV_REPLACE_INVALID) {
251 ib = oldib + sz;
252 goto REPLACE_INVALID;
253 }
254
255 *errno = EILSEQ;
256 ret_val = (size_t)-1;
257 ib = oldib;
258 goto ILLEGAL_CHAR_ERR;
259 }
260
261 u8 = (u8 << 8) | *ib++;
262 }
263
264 /* Now we get a valid character encoded in UTF-8. */
265 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
266 if (sz < 0) {
267 ib = oldib;
268 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
269 }
270
271 ob += sz;
272 continue;
273
274 REPLACE_INVALID:
275 if (ob >= obtail) {
276 ib = oldib;
277 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
278 }
279
280 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
281 ret_val++;
282 }
283
284 ILLEGAL_CHAR_ERR:
285 *inlen = ibtail - ib;
286 *outlen = obtail - ob;
287
288 return (ret_val);
289 }
290
291 /*
292 * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1]. Return 0 if not found.
293 * tbl[0] is a special element for non-identical conversion.
294 */
295 size_t
kiconv_binsearch(uint32_t key,void * tbl,size_t nitems)296 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems)
297 {
298 size_t low, high, mid;
299 kiconv_table_t *table;
300
301 low = 1;
302 high = nitems - 1;
303 table = (kiconv_table_t *)tbl;
304
305 while (low <= high) {
306 mid = (low + high) / 2;
307
308 if (key < table[mid].key)
309 high = mid - 1;
310 else if (key > table[mid].key)
311 low = mid + 1;
312 else
313 return (mid);
314 }
315
316 return (0);
317 }
318