1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * COPYRIGHT AND PERMISSION NOTICE 23 * 24 * Copyright (c) 1991-2005 Unicode, Inc. All rights reserved. Distributed 25 * under the Terms of Use in http://www.unicode.org/copyright.html. 26 * 27 * This file has been modified by Sun Microsystems, Inc. 28 */ 29 /* 30 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 31 * Use is subject to license terms. 32 */ 33 34 35 #include <sys/types.h> 36 37 #if defined(JFP_ICONV_FROMCODE_UTF32BE)||defined(JFP_ICONV_FROMCODE_UTF32LE) 38 #define JFP_ICONV_FROMCODE_UTF32 39 #endif 40 41 #if defined(JFP_ICONV_FROMCODE_UTF16BE)||defined(JFP_ICONV_FROMCODE_UTF16LE) 42 #define JFP_ICONV_FROMCODE_UTF16 43 #endif 44 45 #if defined(JFP_ICONV_FROMCODE_UCS2BE)||defined(JFP_ICONV_FROMCODE_UCS2LE) 46 #define JFP_ICONV_FROMCODE_UCS2 47 #endif 48 49 #if defined(JFP_ICONV_TOCODE_UTF32BE)||defined(JFP_ICONV_TOCODE_UTF32LE) 50 #define JFP_ICONV_TOCODE_UTF32 51 #endif 52 53 #if defined(JFP_ICONV_TOCODE_UTF16BE)||defined(JFP_ICONV_TOCODE_UTF16LE) 54 #define JFP_ICONV_TOCODE_UTF16 55 #endif 56 57 #if defined(JFP_ICONV_TOCODE_UCS2BE)||defined(JFP_ICONV_TOCODE_UCS2LE) 58 #define JFP_ICONV_TOCODE_UCS2 59 #endif 60 61 62 #define BOM 0xfeff 63 #define BSBOM16 0xfffe 64 #define BSBOM32 0xfffe0000 65 #define REPLACE 0xfffd 66 #define IFHISUR(x) ((0xd800 <= (x)) && ((x) <= 0xdbff)) 67 #define IFLOSUR(x) ((0xdc00 <= (x)) && ((x) <= 0xdfff)) 68 69 typedef struct { 70 boolean_t bom_written; 71 boolean_t little_endian; 72 } ucs_state_t; 73 74 75 #if defined(JFP_ICONV_FROMCODE_UTF32) 76 77 static size_t /* return #bytes read, or -1 */ 78 read_unicode( 79 unsigned int *p, /* point variable to store UTF-32 */ 80 unsigned char **pip, /* point pointer to input buf */ 81 size_t *pileft, /* point #bytes left in input buf */ 82 ucs_state_t *state) /* BOM state and endian */ 83 { 84 unsigned char *ip = *pip; 85 size_t ileft = *pileft; 86 size_t rv = (size_t)0; /* return value */ 87 unsigned char ic1, ic2, ic3, ic4; /* bytes read */ 88 unsigned int u32; /* resulted UTF-32 */ 89 90 NGET(ic1, "UTF32-1"); 91 NGET(ic2, "UTF32-2"); 92 NGET(ic3, "UTF32-3"); 93 NGET(ic4, "UTF32-4"); 94 95 if (state->bom_written == B_FALSE) { 96 u32 = 0U; 97 u32 |= (unsigned int)ic1 << 24; 98 u32 |= (unsigned int)ic2 << 16; 99 u32 |= (unsigned int)ic3 << 8; 100 u32 |= (unsigned int)ic4 << 0; 101 if (u32 == BOM) { 102 state->bom_written = B_TRUE; 103 state->little_endian = B_FALSE; 104 *p = BOM; 105 rv = (size_t)0; 106 goto ret; 107 } else if (u32 == BSBOM32) { 108 state->bom_written = B_TRUE; 109 state->little_endian = B_TRUE; 110 *p = BOM; 111 rv = (size_t)0; 112 goto ret; 113 } else { 114 state->bom_written = B_TRUE; 115 } 116 } 117 118 if (state->little_endian == B_TRUE) { 119 u32 = 0U; 120 u32 |= (unsigned int)ic1 << 0; 121 u32 |= (unsigned int)ic2 << 8; 122 u32 |= (unsigned int)ic3 << 16; 123 u32 |= (unsigned int)ic4 << 24; 124 } else { 125 u32 = 0U; 126 u32 |= (unsigned int)ic1 << 24; 127 u32 |= (unsigned int)ic2 << 16; 128 u32 |= (unsigned int)ic3 << 8; 129 u32 |= (unsigned int)ic4 << 0; 130 } 131 132 if (u32 == BSBOM32) { 133 RETERROR(EILSEQ, "byte-swapped BOM detected") 134 } 135 136 if ((u32 == 0xfffe) || (u32 == 0xffff) || (u32 > 0x10ffff) 137 || IFHISUR(u32) || IFLOSUR(u32)) { 138 RETERROR(EILSEQ, "illegal in UTF-32") 139 } 140 141 *p = u32; 142 rv = *pileft - ileft; 143 144 ret: 145 if (rv != (size_t)-1) { 146 /* update *pip and *pileft only on successful return */ 147 *pip = ip; 148 *pileft = ileft; 149 } 150 151 return (rv); 152 } 153 154 #elif defined(JFP_ICONV_FROMCODE_UTF16) || defined(JFP_ICONV_FROMCODE_UCS2) 155 156 static size_t /* return #bytes read, or -1 */ 157 read_unicode( 158 unsigned int *p, /* point variable to store UTF-32 */ 159 unsigned char **pip, /* point pointer to input buf */ 160 size_t *pileft, /* point #bytes left in input buf */ 161 ucs_state_t *state) /* BOM state and endian */ 162 { 163 unsigned char *ip = *pip; 164 size_t ileft = *pileft; 165 size_t rv = (size_t)0; /* return value */ 166 unsigned char ic1, ic2; /* bytes read */ 167 unsigned int u32; /* resulted UTF-32 */ 168 #ifndef JFP_ICONV_FROMCODE_UCS2 169 unsigned int losur; /* low surrogate */ 170 #endif 171 172 NGET(ic1, "UTF16-1"); /* read 1st byte */ 173 NGET(ic2, "UTF16-2"); /* read 2nd byte */ 174 175 if (state->bom_written == B_FALSE) { 176 u32 = 0U; 177 u32 |= (unsigned int)ic1 << 8; 178 u32 |= (unsigned int)ic2 << 0; 179 if (u32 == BOM) { 180 state->bom_written = B_TRUE; 181 state->little_endian = B_FALSE; 182 *p = BOM; 183 rv = (size_t)0; 184 goto ret; 185 } else if (u32 == BSBOM16) { 186 state->bom_written = B_TRUE; 187 state->little_endian = B_TRUE; 188 *p = BOM; 189 rv = (size_t)0; 190 goto ret; 191 } else { 192 state->bom_written = B_TRUE; 193 } 194 } 195 196 if (state->little_endian == B_TRUE) { 197 u32 = (((unsigned int)ic2) << 8) | ic1; 198 } else { 199 u32 = (((unsigned int)ic1) << 8) | ic2; 200 } 201 202 if (u32 == BSBOM16) { 203 RETERROR(EILSEQ, "byte-swapped BOM detected") 204 } 205 206 if ((u32 == 0xfffe) || (u32 == 0xffff) || (u32 > 0x10ffff) 207 || (IFLOSUR(u32))) { 208 RETERROR(EILSEQ, "illegal in UTF16") 209 } 210 211 if (IFHISUR(u32)) { 212 #if defined(JFP_ICONV_FROMCODE_UCS2) 213 RETERROR(EILSEQ, "surrogate is illegal in UCS2") 214 #else /* !defined(JFP_ICONV_FROMCODE_UCS2) */ 215 NGET(ic1, "LOSUR-1"); 216 NGET(ic2, "LOSUR-2"); 217 218 if (state->little_endian == B_TRUE) { 219 losur = (((unsigned int)ic2) << 8) | ic1; 220 } else { 221 losur = (((unsigned int)ic1) << 8) | ic2; 222 } 223 224 if (IFLOSUR(losur)) { 225 u32 = ((u32 - 0xd800) * 0x400) 226 + (losur - 0xdc00) + 0x10000; 227 } else { 228 RETERROR(EILSEQ, "low-surrogate expected") 229 } 230 #endif /* defined(JFP_ICONV_FROMCODE_UCS2) */ 231 } 232 233 *p = u32; 234 rv = *pileft - ileft; 235 236 ret: 237 if (rv != (size_t)-1) { 238 /* update *pip and *pileft only on successful return */ 239 *pip = ip; 240 *pileft = ileft; 241 } 242 243 return (rv); 244 } 245 246 #else /* JFP_ICONV_FROMCODE_UTF8 (default) */ 247 248 /* 249 * The following vector shows remaining bytes in a UTF-8 character. 250 * Index will be the first byte of the character. 251 */ 252 static const char remaining_bytes_tbl[0x100] = { 253 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 254 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 256 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 258 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 260 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 262 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 263 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 264 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 265 266 /* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ 267 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 268 269 /* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ 270 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 271 272 /* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ 273 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 274 275 /* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ 276 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 277 }; 278 279 280 /* 281 * The following is a vector of bit-masks to get used bits in 282 * the first byte of a UTF-8 character. Index is remaining bytes at above of 283 * the character. 284 */ 285 static const char masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 286 287 288 /* 289 * The following two vectors are to provide valid minimum and 290 * maximum values for the 2'nd byte of a multibyte UTF-8 character for 291 * better illegal sequence checking. The index value must be the value of 292 * the first byte of the UTF-8 character. 293 */ 294 static const unsigned char valid_min_2nd_byte[0x100] = { 295 0, 0, 0, 0, 0, 0, 0, 0, 296 0, 0, 0, 0, 0, 0, 0, 0, 297 0, 0, 0, 0, 0, 0, 0, 0, 298 0, 0, 0, 0, 0, 0, 0, 0, 299 0, 0, 0, 0, 0, 0, 0, 0, 300 0, 0, 0, 0, 0, 0, 0, 0, 301 0, 0, 0, 0, 0, 0, 0, 0, 302 0, 0, 0, 0, 0, 0, 0, 0, 303 0, 0, 0, 0, 0, 0, 0, 0, 304 0, 0, 0, 0, 0, 0, 0, 0, 305 0, 0, 0, 0, 0, 0, 0, 0, 306 0, 0, 0, 0, 0, 0, 0, 0, 307 0, 0, 0, 0, 0, 0, 0, 0, 308 0, 0, 0, 0, 0, 0, 0, 0, 309 0, 0, 0, 0, 0, 0, 0, 0, 310 0, 0, 0, 0, 0, 0, 0, 0, 311 0, 0, 0, 0, 0, 0, 0, 0, 312 0, 0, 0, 0, 0, 0, 0, 0, 313 0, 0, 0, 0, 0, 0, 0, 0, 314 0, 0, 0, 0, 0, 0, 0, 0, 315 0, 0, 0, 0, 0, 0, 0, 0, 316 0, 0, 0, 0, 0, 0, 0, 0, 317 0, 0, 0, 0, 0, 0, 0, 0, 318 0, 0, 0, 0, 0, 0, 0, 0, 319 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 320 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 321 /* C8 C9 CA CB CC CD CE CF */ 322 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 323 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 324 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 325 /* D8 D9 DA DB DC DD DE DF */ 326 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 327 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 328 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 329 /* E8 E9 EA EB EC ED EE EF */ 330 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 331 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 332 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, 333 0, 0, 0, 0, 0, 0, 0, 0, 334 }; 335 336 static const unsigned char valid_max_2nd_byte[0x100] = { 337 0, 0, 0, 0, 0, 0, 0, 0, 338 0, 0, 0, 0, 0, 0, 0, 0, 339 0, 0, 0, 0, 0, 0, 0, 0, 340 0, 0, 0, 0, 0, 0, 0, 0, 341 0, 0, 0, 0, 0, 0, 0, 0, 342 0, 0, 0, 0, 0, 0, 0, 0, 343 0, 0, 0, 0, 0, 0, 0, 0, 344 0, 0, 0, 0, 0, 0, 0, 0, 345 0, 0, 0, 0, 0, 0, 0, 0, 346 0, 0, 0, 0, 0, 0, 0, 0, 347 0, 0, 0, 0, 0, 0, 0, 0, 348 0, 0, 0, 0, 0, 0, 0, 0, 349 0, 0, 0, 0, 0, 0, 0, 0, 350 0, 0, 0, 0, 0, 0, 0, 0, 351 0, 0, 0, 0, 0, 0, 0, 0, 352 0, 0, 0, 0, 0, 0, 0, 0, 353 0, 0, 0, 0, 0, 0, 0, 0, 354 0, 0, 0, 0, 0, 0, 0, 0, 355 0, 0, 0, 0, 0, 0, 0, 0, 356 0, 0, 0, 0, 0, 0, 0, 0, 357 0, 0, 0, 0, 0, 0, 0, 0, 358 0, 0, 0, 0, 0, 0, 0, 0, 359 0, 0, 0, 0, 0, 0, 0, 0, 360 0, 0, 0, 0, 0, 0, 0, 0, 361 /* C0 C1 C2 C3 C4 C5 C6 C7 */ 362 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 363 /* C8 C9 CA CB CC CD CE CF */ 364 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 365 /* D0 D1 D2 D3 D4 D5 D6 D7 */ 366 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 367 /* D8 D9 DA DB DC DD DE DF */ 368 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 369 /* E0 E1 E2 E3 E4 E5 E6 E7 */ 370 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 371 /* E8 E9 EA EB EC ED EE EF */ 372 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, 373 /* F0 F1 F2 F3 F4 F5 F6 F7 */ 374 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, 375 0, 0, 0, 0, 0, 0, 0, 0, 376 }; 377 378 static size_t 379 utf8_ucs(unsigned int *p, unsigned char **pip, size_t *pileft) 380 { 381 unsigned int l; /* to be copied to *p on successful return */ 382 unsigned char ic; /* current byte */ 383 unsigned char ic1; /* 1st byte */ 384 unsigned char *ip = *pip; /* next byte to read */ 385 size_t ileft = *pileft; /* number of bytes available */ 386 size_t rv = (size_t)0; /* return value of this function */ 387 int remaining_bytes; 388 389 NGET(ic, "no bytes available"); /* read 1st byte */ 390 ic1 = ic; 391 l = ic1; /* get bits from 1st byte to UCS value */ 392 393 if (ic1 < 0x80) { 394 /* successfully converted */ 395 *p = l; 396 rv = *pileft - ileft; 397 goto ret; 398 } 399 400 remaining_bytes = remaining_bytes_tbl[ic1]; 401 402 if (remaining_bytes != 0) { 403 l &= masks_tbl[remaining_bytes]; 404 405 for (; remaining_bytes > 0; remaining_bytes--) { 406 if (ic1 != 0U) { 407 NGET(ic, "2nd byte of UTF-8"); 408 if ((ic < valid_min_2nd_byte[ic1]) || 409 (ic > valid_max_2nd_byte[ic1])) { 410 RETERROR(EILSEQ, "2nd byte is invalid") 411 } 412 ic1 = 0U; /* 2nd byte check done */ 413 } else { 414 NGET(ic, "3rd or later byte of UTF-8"); 415 if ((ic < 0x80) || (ic > 0xbf)) { 416 RETERROR(EILSEQ, "3rd or later byte is invalid") 417 } 418 } 419 l = (l << 6) | (ic & 0x3f); 420 } 421 422 /* successfully converted */ 423 *p = l; 424 rv = *pileft - ileft; 425 goto ret; 426 } else { 427 RETERROR(EILSEQ, "1st byte is invalid") 428 } 429 430 ret: 431 if (rv != (size_t)-1) { 432 /* 433 * update *pip and *pileft on successful return 434 */ 435 *pip = ip; 436 *pileft = ileft; 437 } 438 439 return (rv); 440 } 441 442 /* for UTF-8 */ 443 static size_t /* return #bytes read, or -1 */ 444 read_unicode( 445 unsigned int *p, /* point variable to store UTF-32 */ 446 unsigned char **pip, /* point pointer to input buf */ 447 size_t *pileft, /* point #bytes left in input buf */ 448 ucs_state_t *state) /* BOM state and endian - unused */ 449 { 450 return (utf8_ucs(p, pip, pileft)); 451 } 452 453 #endif 454 455 #if defined(JFP_ICONV_TOCODE_UTF32) 456 457 static size_t 458 write_unicode( 459 unsigned int u32, /* UTF-32 to write */ 460 char **pop, /* point pointer to output buf */ 461 size_t *poleft, /* point #bytes left in output buf */ 462 ucs_state_t *state, /* BOM state and endian */ 463 const char *msg) /* debug message */ 464 { 465 char *op = *pop; 466 size_t oleft = *poleft; 467 size_t rv = (size_t)0; /* return value */ 468 unsigned char ic1, ic2, ic3, ic4; /* bytes to be written */ 469 470 if (state->bom_written == B_FALSE) { 471 if (state->little_endian == B_TRUE) { 472 ic1 = (unsigned char)((BOM >> 0) & 0xff); 473 ic2 = (unsigned char)((BOM >> 8) & 0xff); 474 ic3 = (unsigned char)((BOM >> 16) & 0xff); 475 ic4 = (unsigned char)((BOM >> 24) & 0xff); 476 } else { 477 ic1 = (unsigned char)((BOM >> 24) & 0xff); 478 ic2 = (unsigned char)((BOM >> 16) & 0xff); 479 ic3 = (unsigned char)((BOM >> 8) & 0xff); 480 ic4 = (unsigned char)((BOM >> 0) & 0xff); 481 } 482 rv += 4; 483 NPUT(ic1, "BOM32-1") 484 NPUT(ic2, "BOM32-2") 485 NPUT(ic3, "BOM32-3") 486 NPUT(ic4, "BOM32-4") 487 } 488 489 if (state->little_endian == B_TRUE) { 490 ic1 = (unsigned char)((u32 >> 0) & 0xff); 491 ic2 = (unsigned char)((u32 >> 8) & 0xff); 492 ic3 = (unsigned char)((u32 >> 16) & 0xff); 493 ic4 = (unsigned char)((u32 >> 24) & 0xff); 494 rv += 4; 495 } else { 496 ic1 = (unsigned char)((u32 >> 24) & 0xff); 497 ic2 = (unsigned char)((u32 >> 16) & 0xff); 498 ic3 = (unsigned char)((u32 >> 8) & 0xff); 499 ic4 = (unsigned char)((u32 >> 0) & 0xff); 500 rv += 4; 501 } 502 503 NPUT(ic1, "UTF32-1") 504 NPUT(ic2, "UTF32-2") 505 NPUT(ic3, "UTF32-3") 506 NPUT(ic4, "UTF32-4") 507 508 ret: 509 if (rv != (size_t)-1) { 510 /* update *pop and *poleft only on successful return */ 511 *pop = op; 512 *poleft = oleft; 513 if (state->bom_written == B_FALSE) 514 state->bom_written = B_TRUE; 515 } 516 517 return (rv); 518 } 519 520 #elif defined(JFP_ICONV_TOCODE_UTF16) || defined(JFP_ICONV_TOCODE_UCS2) 521 522 static size_t 523 write_unicode( 524 unsigned int u32, /* UTF-32 to write */ 525 char **pop, /* point pointer to output buf */ 526 size_t *poleft, /* point #bytes left in output buf */ 527 ucs_state_t *state, /* BOM state and endian */ 528 const char *msg) /* debug message */ 529 { 530 char *op = *pop; 531 size_t oleft = *poleft; 532 size_t rv = (size_t)0; /* return value */ 533 unsigned char ic1, ic2; /* bytes to be written */ 534 unsigned int losur = 0U; /* Hi/Lo surrogates */ 535 536 if (state->bom_written == B_FALSE) { 537 if (state->little_endian == B_TRUE) { 538 ic1 = (unsigned char)((BOM >> 0) & 0xff); 539 ic2 = (unsigned char)((BOM >> 8) & 0xff); 540 } else { 541 ic1 = (unsigned char)((BOM >> 8) & 0xff); 542 ic2 = (unsigned char)((BOM >> 0) & 0xff); 543 } 544 rv += 2; 545 NPUT(ic1, "BOM16-1") 546 NPUT(ic2, "BOM16-2") 547 } 548 549 if (u32 > 0xffff) { 550 #if defined(JFP_ICONV_TOCODE_UCS2) 551 u32 = REPLACE; 552 #else /* !defined(JFP_ICONV_TOCODE_UCS2) */ 553 losur = ((u32 - 0x10000) % 0x400) + 0xdc00; 554 u32 = ((u32 - 0x10000) / 0x400) + 0xd800; 555 #endif /* defined(JFP_ICONV_TOCODE_UCS2) */ 556 } 557 558 if (state->little_endian == B_TRUE) { 559 ic1 = (unsigned char)(u32 & 0xff); 560 ic2 = (unsigned char)((u32 >> 8) & 0xff); 561 rv += 2; 562 } else { 563 ic1 = (unsigned char)((u32 >> 8) & 0xff); 564 ic2 = (unsigned char)(u32 & 0xff); 565 rv += 2; 566 } 567 568 NPUT(ic1, "UTF16-1") 569 NPUT(ic2, "UTF16-2") 570 571 if (losur != 0U) { 572 if (state->little_endian == B_TRUE) { 573 ic1 = (unsigned char)(losur & 0xff); 574 ic2 = (unsigned char)((losur >> 8) & 0xff); 575 rv += 2; 576 } else { 577 ic1 = (unsigned char)((losur >> 8) & 0xff); 578 ic2 = (unsigned char)(losur & 0xff); 579 rv += 2; 580 } 581 582 NPUT(ic1, "LOSUR-1") 583 NPUT(ic2, "LOSUR-2") 584 } 585 586 587 ret: 588 if (rv != (size_t)-1) { 589 /* update *pop and *poleft only on successful return */ 590 *pop = op; 591 *poleft = oleft; 592 if (state->bom_written == B_FALSE) 593 state->bom_written = B_TRUE; 594 } 595 596 return (rv); 597 } 598 599 #else /* JFP_ICONV_TOCODE_UTF8 (default) */ 600 601 static size_t 602 write_unicode( 603 unsigned int u32, /* UTF-32 to write */ 604 char **pop, /* point pointer to output buf */ 605 size_t *poleft, /* point #bytes left in output buf */ 606 ucs_state_t *state, /* BOM state and endian - unused */ 607 const char *msg) /* debug message */ 608 { 609 char *op = *pop; 610 size_t oleft = *poleft; 611 size_t rv = 0; /* return value */ 612 613 if (u32 <= 0x7f) { 614 NPUT((unsigned char)(u32), msg); 615 rv = 1; 616 } else if (u32 <= 0x7ff) { 617 NPUT((unsigned char)((((u32)>>6) & 0x1f) | 0xc0), msg); 618 NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg); 619 rv = 2; 620 } else if ((u32 >= 0xd800) && (u32 <= 0xdfff)) { 621 RETERROR(EILSEQ, "surrogate in UTF-8") 622 } else if (u32 <= 0xffff) { 623 NPUT((unsigned char)((((u32)>>12) & 0x0f) | 0xe0), msg); 624 NPUT((unsigned char)((((u32)>>6) & 0x3f) | 0x80), msg); 625 NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg); 626 rv = 3; 627 } else if (u32 <= 0x10ffff) { 628 NPUT((unsigned char)((((u32)>>18) & 0x07) | 0xf0), msg); 629 NPUT((unsigned char)((((u32)>>12) & 0x3f) | 0x80), msg); 630 NPUT((unsigned char)((((u32)>>6) & 0x3f) | 0x80), msg); 631 NPUT((unsigned char)((((u32)>>0) & 0x3f) | 0x80), msg); 632 rv = 4; 633 } else { 634 RETERROR(EILSEQ, "beyond range of UTF-8") 635 } 636 637 ret: 638 if (rv != (size_t)-1) { 639 /* update *pop and *poleft only on successful return */ 640 *pop = op; 641 *poleft = oleft; 642 } 643 644 return (rv); 645 } 646 647 #endif 648 649 #define GETU(pu32) \ 650 switch (read_unicode(pu32, &ip, &ileft, (ucs_state_t *)cd)) { \ 651 case (size_t)-1: \ 652 /* errno has been set in read_unicode() */ \ 653 rv = (size_t)-1; \ 654 goto ret; \ 655 case (size_t)0: \ 656 /* character read was handled in the read_unicode() */ \ 657 /* no further evaluation needed in caller side */ \ 658 rv = (size_t)0; \ 659 goto next; \ 660 default: \ 661 break; \ 662 } 663 664 665 #define PUTU(u32, msg) \ 666 if (write_unicode(u32, &op, &oleft, (ucs_state_t *)cd, msg) \ 667 == (size_t)-1) { \ 668 rv = ((size_t)-1);\ 669 goto ret; \ 670 } 671 672 #include <stdlib.h> 673 674 static void 675 _icv_reset_unicode(void *cd) 676 { 677 ucs_state_t *state = (ucs_state_t *)cd; 678 679 #if defined(JFP_ICONV_FROMCODE_UTF32BE) || \ 680 defined(JFP_ICONV_TOCODE_UTF32BE) || \ 681 defined(JFP_ICONV_FROMCODE_UTF16BE) || \ 682 defined(JFP_ICONV_TOCODE_UTF16BE) || \ 683 defined(JFP_ICONV_FROMCODE_UCS2BE) || \ 684 defined(JFP_ICONV_TOCODE_UCS2BE) 685 state->little_endian = B_FALSE; 686 state->bom_written = B_TRUE; 687 #elif defined(JFP_ICONV_FROMCODE_UTF32LE) || \ 688 defined(JFP_ICONV_TOCODE_UTF32LE) || \ 689 defined(JFP_ICONV_FROMCODE_UTF16LE) || \ 690 defined(JFP_ICONV_TOCODE_UTF16LE) || \ 691 defined(JFP_ICONV_FROMCODE_UCS2LE) || \ 692 defined(JFP_ICONV_TOCODE_UCS2LE) 693 state->little_endian = B_TRUE; 694 state->bom_written = B_TRUE; 695 #elif defined(_LITTLE_ENDIAN) 696 state->little_endian = B_TRUE; 697 state->bom_written = B_FALSE; 698 #endif 699 700 return; 701 } 702 703 static void * 704 _icv_open_unicode(size_t extsize) 705 { 706 ucs_state_t *cd; 707 708 if ((cd = (ucs_state_t *)calloc(1, 709 sizeof (ucs_state_t) + extsize)) == NULL) { 710 errno = ENOMEM; 711 return ((void *)-1); 712 } 713 714 _icv_reset_unicode((void *)cd); 715 716 return ((void *)cd); 717 } 718 719 static void 720 _icv_close_unicode(void *cd) 721 { 722 if (cd == NULL) { 723 errno = EBADF; 724 } else { 725 free(cd); 726 } 727 return; 728 } 729 730 static void * 731 _icv_get_ext(void *cd) 732 { 733 return ((void *)((unsigned char *)cd + sizeof (ucs_state_t))); 734 } 735