1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 * 25 * This is for conversions from UTF-8 to various UCS forms, esp., 26 * UCS-2, UCS-2BE, UCS-2LE, UTF-16, UTF-16BE, UTF-16LE, UCS-4, UCS-4BE, 27 * UCS-4LE, UTF-32, UTF-32BE, and UTF-32LE. 28 */ 29 30 31 #include <stdlib.h> 32 #include <errno.h> 33 #include <sys/types.h> 34 #include <sys/isa_defs.h> 35 #include "utf8_to_ucs.h" 36 37 38 void * 39 _icv_open() 40 { 41 ucs_state_t *cd = (ucs_state_t *)calloc(1, sizeof(ucs_state_t)); 42 43 if (cd == (ucs_state_t *)NULL) { 44 errno = ENOMEM; 45 return((void *)-1); 46 } 47 48 #if defined(UTF_16BE) || defined(UCS_2BE) || defined(UCS_4BE) || \ 49 defined(UTF_32BE) 50 cd->little_endian = false; 51 cd->bom_written = true; 52 #elif defined(UTF_16LE) || defined(UCS_2LE) || defined(UCS_4LE) || \ 53 defined(UTF_32LE) 54 cd->little_endian = true; 55 cd->bom_written = true; 56 #elif defined(_LITTLE_ENDIAN) 57 cd->little_endian = true; 58 #endif 59 60 return((void *)cd); 61 } 62 63 64 void 65 _icv_close(ucs_state_t *cd) 66 { 67 if (! cd) 68 errno = EBADF; 69 else 70 free((void *)cd); 71 } 72 73 74 size_t 75 _icv_iconv(ucs_state_t *cd, char **inbuf, size_t *inbufleft, char **outbuf, 76 size_t *outbufleft) 77 { 78 size_t ret_val = 0; 79 uchar_t *ib; 80 uchar_t *ob; 81 uchar_t *ibtail; 82 uchar_t *obtail; 83 84 if (! cd) { 85 errno = EBADF; 86 return((size_t)-1); 87 } 88 89 if (!inbuf || !(*inbuf)) { 90 #if defined(UCS_2) || defined(UCS_4) || defined(UTF_16) || defined(UTF_32) 91 cd->bom_written = false; 92 #endif 93 return((size_t)0); 94 } 95 96 ib = (uchar_t *)*inbuf; 97 ob = (uchar_t *)*outbuf; 98 ibtail = ib + *inbufleft; 99 obtail = ob + *outbufleft; 100 101 while (ib < ibtail) { 102 uchar_t *ib_org; 103 uint_t u4; 104 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE) 105 uint_t u4_2; 106 #endif 107 uint_t first_byte; 108 signed char sz; 109 signed char obsz; 110 111 sz = number_of_bytes_in_utf8_char[*ib]; 112 if (sz == ICV_TYPE_ILLEGAL_CHAR) { 113 errno = EILSEQ; 114 ret_val = (size_t)-1; 115 break; 116 } 117 118 if ((ibtail - ib) < sz) { 119 errno = EINVAL; 120 ret_val = (size_t)-1; 121 break; 122 } 123 124 ib_org = ib; 125 first_byte = *ib; 126 u4 = (uint_t)(*ib++ & masks_tbl[sz]); 127 for (; sz > 1; sz--) { 128 if (first_byte) { 129 if (((uchar_t)*ib) < 130 valid_min_2nd_byte[first_byte] || 131 ((uchar_t)*ib) > 132 valid_max_2nd_byte[first_byte]) { 133 ib = ib_org; 134 errno = EILSEQ; 135 ret_val = (size_t)-1; 136 goto ILLEGAL_CHAR_ERR; 137 } 138 first_byte = 0; 139 } else if (((uint_t)*ib) < 0x80 || 140 ((uint_t)*ib) > 0xbf) { 141 ib = ib_org; 142 errno = EILSEQ; 143 ret_val = (size_t)-1; 144 goto ILLEGAL_CHAR_ERR; 145 } 146 u4 = (u4 << ICV_UTF8_BIT_SHIFT) | 147 (((uint_t)*ib) & ICV_UTF8_BIT_MASK); 148 ib++; 149 } 150 151 /* Check against known non-characters. */ 152 if ((u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_fffe || 153 (u4 & ICV_UTF32_NONCHAR_mask) == ICV_UTF32_NONCHAR_ffff || 154 u4 > ICV_UTF32_LAST_VALID_CHAR || 155 (u4 >= ICV_UTF32_SURROGATE_START_d800 && 156 u4 <= ICV_UTF32_SURROGATE_END_dfff) || 157 (u4 >= ICV_UTF32_ARABIC_NONCHAR_START_fdd0 && 158 u4 <= ICV_UTF32_ARABIC_NONCHAR_END_fdef)) { 159 ib = ib_org; 160 errno = EILSEQ; 161 ret_val = (size_t)-1; 162 goto ILLEGAL_CHAR_ERR; 163 } 164 165 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE) 166 u4_2 = 0; 167 #endif 168 169 if (u4 == ICV_BOM_IN_BIG_ENDIAN) { 170 cd->bom_written = true; 171 } 172 173 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) 174 obsz = (cd->bom_written) ? 4 : 8; 175 #elif defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE) 176 obsz = (cd->bom_written) ? 4 : 8; 177 if (u4 > 0x10ffff) { 178 u4 = ICV_CHAR_UCS2_REPLACEMENT; 179 ret_val++; 180 } 181 #elif defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) 182 obsz = (cd->bom_written) ? 2 : 4; 183 if (u4 > 0x00ffff) { 184 u4 = ICV_CHAR_UCS2_REPLACEMENT; 185 ret_val++; 186 } 187 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE) 188 obsz = (cd->bom_written) ? 2 : 4; 189 if (u4 > 0x10ffff) { 190 u4 = ICV_CHAR_UCS2_REPLACEMENT; 191 ret_val++; 192 } else if (u4 > 0x00ffff) { 193 u4_2 = ((u4 - 0x010000) % 0x400) + 0x00dc00; 194 u4 = ((u4 - 0x010000) / 0x400) + 0x00d800; 195 obsz += 2; 196 } 197 #else 198 #error "Fatal: one of the UCS macros need to be defined." 199 #endif 200 if ((obtail - ob) < obsz) { 201 ib = ib_org; 202 errno = E2BIG; 203 ret_val = (size_t)-1; 204 break; 205 } 206 207 if (cd->little_endian) { 208 if (! cd->bom_written) { 209 *ob++ = (uchar_t)0xff; 210 *ob++ = (uchar_t)0xfe; 211 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \ 212 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE) 213 *(ushort_t *)ob = (ushort_t)0; 214 ob += 2; 215 #endif 216 cd->bom_written = true; 217 } 218 *ob++ = (uchar_t)(u4 & 0xff); 219 *ob++ = (uchar_t)((u4 >> 8) & 0xff); 220 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \ 221 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE) 222 *ob++ = (uchar_t)((u4 >> 16) & 0xff); 223 *ob++ = (uchar_t)((u4 >> 24) & 0xff); 224 #elif defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE) 225 if (u4_2) { 226 *ob++ = (uchar_t)(u4_2 & 0xff); 227 *ob++ = (uchar_t)((u4_2 >> 8) & 0xff); 228 } 229 #endif 230 } else { 231 if (! cd->bom_written) { 232 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \ 233 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE) 234 *(ushort_t *)ob = (ushort_t)0; 235 ob += 2; 236 #endif 237 *ob++ = (uchar_t)0xfe; 238 *ob++ = (uchar_t)0xff; 239 cd->bom_written = true; 240 } 241 #if defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \ 242 defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE) 243 *ob++ = (uchar_t)((u4 >> 24) & 0xff); 244 *ob++ = (uchar_t)((u4 >> 16) & 0xff); 245 #endif 246 *ob++ = (uchar_t)((u4 >> 8) & 0xff); 247 *ob++ = (uchar_t)(u4 & 0xff); 248 #if defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE) 249 if (u4_2) { 250 *ob++ = (uchar_t)((u4_2 >> 8) & 0xff); 251 *ob++ = (uchar_t)(u4_2 & 0xff); 252 } 253 #endif 254 } 255 } 256 257 ILLEGAL_CHAR_ERR: 258 *inbuf = (char *)ib; 259 *inbufleft = ibtail - ib; 260 *outbuf = (char *)ob; 261 *outbufleft = obtail - ob; 262 263 return(ret_val); 264 } 265