1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/types.h> 29 #include <sys/param.h> 30 #include <sys/sysmacros.h> 31 #include <sys/systm.h> 32 #include <sys/debug.h> 33 #include <sys/kmem.h> 34 #include <sys/sunddi.h> 35 #include <sys/byteorder.h> 36 #include <sys/errno.h> 37 #include <sys/u8_textprep.h> 38 #include <sys/kiconv.h> 39 #include <sys/kiconv_cck_common.h> 40 41 /*LINTLIBRARY*/ 42 43 /* 44 * Common kiconv_open method for UTF-8 -> CCK conversion. 45 */ 46 void * 47 kiconv_open_to_cck() 48 { 49 kiconv_state_t st; 50 51 st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP); 52 53 st->bom_processed = 0; 54 55 return ((void *)st); 56 } 57 58 /* 59 * Common kiconv_close method for UTF-8 -> CCK conversion. 60 */ 61 int 62 kiconv_close_to_cck(void *kcd) 63 { 64 if (! kcd || kcd == (void *)-1) 65 return (EBADF); 66 67 kmem_free(kcd, sizeof (kiconv_state_data_t)); 68 69 return (0); 70 } 71 72 /* 73 * Common routine to convert UTF-8 sequence to CCK legal character sequence. 74 */ 75 size_t 76 kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft, 77 char **outbuf, size_t *outbytesleft, int *errno, 78 kiconv_utf8tocck_t ptr_utf8tocck) 79 { 80 uchar_t *ib; 81 uchar_t *ob; 82 uchar_t *ibtail; 83 uchar_t *obtail; 84 uchar_t *oldib; 85 size_t ret_val; 86 size_t i; /* temp variable in for loop */ 87 uint32_t u8; 88 int8_t sz; 89 90 /* Check on the kiconv code conversion descriptor. */ 91 if (! kcd || kcd == (void *)-1) { 92 *errno = EBADF; 93 return ((size_t)-1); 94 } 95 96 /* If this is a state reset request, process and return. */ 97 if (! inbuf || !(*inbuf)) { 98 ((kiconv_state_t)kcd)->bom_processed = 0; 99 return (0); 100 } 101 102 ret_val = 0; 103 ib = (uchar_t *)*inbuf; 104 ob = (uchar_t *)*outbuf; 105 ibtail = ib + *inbytesleft; 106 obtail = ob + *outbytesleft; 107 108 KICONV_CHECK_UTF8_BOM(ib, ibtail); 109 110 while (ib < ibtail) { 111 sz = u8_number_of_bytes[*ib]; 112 113 /* 114 * If it is a 7-bit ASCII character, we don't need to 115 * process further and we just copy the character over. 116 * 117 * If not, we connect the chracter bytes up to four bytes, 118 * validate the bytes, and binary search for the corresponding 119 * table. If we find it from the mapping table, we put that 120 * into the output buffer; otherwise, we put a replacement 121 * character instead as a non-identical conversion. 122 */ 123 if (sz == 1) { 124 if (ob >= obtail) { 125 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 126 } 127 128 *ob++ = *ib++; 129 continue; 130 } 131 132 /* 133 * Issue EILSEQ error if the first byte is a 134 * invalid UTF-8 character leading byte. 135 */ 136 if (sz <= 0) { 137 KICONV_SET_ERRNO_AND_BREAK(EILSEQ); 138 } 139 140 /* 141 * Issue EINVAL error if input buffer has an incomplete 142 * character at the end of the buffer. 143 */ 144 if (ibtail - ib < sz) { 145 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 146 } 147 148 /* 149 * We collect UTF-8 character bytes and also check if this 150 * is a valid UTF-8 character without any bogus bytes based 151 * on the latest UTF-8 binary representation. 152 */ 153 oldib = ib; 154 u8 = *ib++; 155 156 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8)) 157 goto ILLEGAL_CHAR_PROCESS; 158 u8 = (u8 << 8) | *ib++; 159 160 for (i = 2; i < sz; i++) { 161 if (*ib < 0x80 || *ib > 0xbf) { 162 ILLEGAL_CHAR_PROCESS: 163 *errno = EILSEQ; 164 ret_val = (size_t)-1; 165 ib = oldib; 166 goto ILLEGAL_CHAR_ERR; 167 } 168 169 u8 = (u8 << 8) | *ib++; 170 } 171 172 /* Now we have a valid UTF-8 character. */ 173 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val); 174 if (sz < 0) { 175 ib = oldib; 176 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 177 } 178 179 ob += sz; 180 } 181 182 ILLEGAL_CHAR_ERR: 183 *inbuf = (char *)ib; 184 *inbytesleft = ibtail - ib; 185 *outbuf = (char *)ob; 186 *outbytesleft = obtail - ob; 187 188 return (ret_val); 189 } 190 191 size_t 192 kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen, 193 int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck) 194 { 195 uchar_t *ibtail; 196 uchar_t *obtail; 197 uchar_t *oldib; 198 size_t ret_val; 199 size_t i; /* temp variable in for loop */ 200 uint32_t u8; 201 int8_t sz; 202 boolean_t do_not_ignore_null; 203 204 ret_val = 0; 205 ibtail = ib + *inlen; 206 obtail = ob + *outlen; 207 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0); 208 209 KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail); 210 211 while (ib < ibtail) { 212 if (*ib == '\0' && do_not_ignore_null) 213 break; 214 215 sz = u8_number_of_bytes[*ib]; 216 217 if (sz == 1) { 218 if (ob >= obtail) { 219 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 220 } 221 222 *ob++ = *ib++; 223 continue; 224 } 225 226 oldib = ib; 227 228 if (sz <= 0) { 229 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ); 230 } 231 232 if (ibtail - ib < sz) { 233 if (flag & KICONV_REPLACE_INVALID) { 234 ib = ibtail; 235 goto REPLACE_INVALID; 236 } 237 238 KICONV_SET_ERRNO_AND_BREAK(EINVAL); 239 } 240 241 u8 = *ib++; 242 243 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8)) 244 goto ILLEGAL_CHAR_PROCESS; 245 u8 = (u8 << 8) | *ib++; 246 247 for (i = 2; i < sz; i++) { 248 if (*ib < 0x80 || *ib > 0xbf) { 249 ILLEGAL_CHAR_PROCESS: 250 if (flag & KICONV_REPLACE_INVALID) { 251 ib = oldib + sz; 252 goto REPLACE_INVALID; 253 } 254 255 *errno = EILSEQ; 256 ret_val = (size_t)-1; 257 ib = oldib; 258 goto ILLEGAL_CHAR_ERR; 259 } 260 261 u8 = (u8 << 8) | *ib++; 262 } 263 264 /* Now we get a valid character encoded in UTF-8. */ 265 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val); 266 if (sz < 0) { 267 ib = oldib; 268 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 269 } 270 271 ob += sz; 272 continue; 273 274 REPLACE_INVALID: 275 if (ob >= obtail) { 276 ib = oldib; 277 KICONV_SET_ERRNO_AND_BREAK(E2BIG); 278 } 279 280 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR; 281 ret_val++; 282 } 283 284 ILLEGAL_CHAR_ERR: 285 *inlen = ibtail - ib; 286 *outlen = obtail - ob; 287 288 return (ret_val); 289 } 290 291 /* 292 * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1]. Return 0 if not found. 293 * tbl[0] is a special element for non-identical conversion. 294 */ 295 size_t 296 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems) 297 { 298 size_t low, high, mid; 299 kiconv_table_t *table; 300 301 low = 1; 302 high = nitems - 1; 303 table = (kiconv_table_t *)tbl; 304 305 while (low <= high) { 306 mid = (low + high) / 2; 307 308 if (key < table[mid].key) 309 high = mid - 1; 310 else if (key > table[mid].key) 311 low = mid + 1; 312 else 313 return (mid); 314 } 315 316 return (0); 317 } 318