/***********************************************************************
*                                                                      *
*               This software is part of the ast package               *
*           Copyright (c) 1985-2007 AT&T Knowledge Ventures            *
*                      and is licensed under the                       *
*                  Common Public License, Version 1.0                  *
*                      by AT&T Knowledge Ventures                      *
*                                                                      *
*                A copy of the License is available at                 *
*            http://www.opensource.org/licenses/cpl1.0.txt             *
*         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
*                                                                      *
*              Information and Software Systems Research               *
*                            AT&T Research                             *
*                           Florham Park NJ                            *
*                                                                      *
*                 Glenn Fowler <gsf@research.att.com>                  *
*                  David Korn <dgk@research.att.com>                   *
*                   Phong Vo <kpv@research.att.com>                    *
*                                                                      *
***********************************************************************/
#pragma prototyped

/*
 * Glenn Fowler
 * AT&T Research
 *
 * iconv intercept
 * minimally provides { utf*<=>bin ascii<=>ebcdic* }
 */

#include <ast.h>
#include <dirent.h>

#define DEBUG_TRACE		0
#define _ICONV_LIST_PRIVATE_

#include <ccode.h>
#include <ctype.h>
#include <iconv.h>

#include "lclib.h"

#if !_lib_iconv_open

/*
 * no native iconv_open: the _ast_ entry points defined in this
 * file are compiled directly under the standard iconv names
 */

#define _ast_iconv_t		iconv_t
#define _ast_iconv_f		iconv_f
#define _ast_iconv_list_t	iconv_list_t
#define _ast_iconv_open		iconv_open
#define _ast_iconv		iconv
#define _ast_iconv_close	iconv_close
#define _ast_iconv_list		iconv_list
#define _ast_iconv_move		iconv_move
#define _ast_iconv_name		iconv_name
#define _ast_iconv_write	iconv_write

#endif

/*
 * substitute errno values when the system does not define
 * the ones the converters report
 */

#ifndef E2BIG
#define E2BIG			ENOMEM
#endif
#ifndef EILSEQ
#define EILSEQ			EIO
#endif

/*
 * common converter exit
 *	e	pending errno value, 0 if none
 *	n	value returned on success
 *	fn	pointer to the remaining input byte count
 * input left over with no other error is reported as E2BIG
 * any error sets errno and returns (size_t)(-1)
 * expands to bare statements ending in return so it must be
 * the last statement of a function returning size_t
 */

#define RETURN(e,n,fn) \
	if (*fn && !e) e = E2BIG; \
	if (e) { errno = e; return (size_t)(-1); } \
	return n;

/*
 * one side (from or to) of a conversion
 */

typedef struct Map_s
{
	char*			name;	/* canonical code set name	*/
	const unsigned char*	map;	/* 8 bit ccmap() table or 0	*/
	_ast_iconv_f		fun;	/* converter function or 0	*/
	int			index;	/* _UWIN code page index	*/
} Map_t;

/*
 * conversion descriptor handed out as _ast_iconv_t
 */

typedef struct Conv_s
{
	iconv_t			cvt;	/* native descriptor or (iconv_t)(-1)	*/
	char*			buf;	/* intermediate buffer, allocated on first use in _ast_iconv()	*/
	size_t			size;	/* size of buf in bytes		*/
	Map_t			from;	/* source side			*/
	Map_t			to;	/* target side			*/
} Conv_t;

/*
 * small cache of closed descriptors
 * _ast_iconv_close() stores here and _ast_iconv_open() reuses
 * an entry whose from/to names match
 * freeindex is the slot most recently filled
 */

static Conv_t*			freelist[4];
static int			freeindex;

/*
 * _ast_iconv_open() points t and f at name_native for empty, "-",
 * "local" and "native" requests and then compares the pointers
 */

static const char		name_local[] = "local";
static const char		name_native[] = "native";

/*
 * code sets handled here in addition to the ccmaplist() entries
 * each entry is initialized in the order
 *	name, match pattern, description, canonical name format,
 *	default format argument, <ccode.h> index
 * NOTE(review): field order inferred from how _ast_iconv_name()
 * uses name/match/canon/index/ccode -- confirm against <iconv.h>
 * the list ends at the entry with a 0 name
 */

static const _ast_iconv_list_t	codes[] =
{
	{
	"utf",
	"un|unicode|utf",
	"multibyte 8-bit unicode",
	"UTF-%s",
	"8",
	CC_UTF,
	},

	{
	"ume",
	"um|ume|utf?(-)7",
	"multibyte 7-bit unicode",
	"UTF-7",
	0,
	CC_UME,
	},

	{
	"euc",
	"(big|euc)*",
	"euc family",
	0,
	0,
	CC_ICONV,
	},

	{
	"dos",
	"dos?(-)?(855)",
	"dos code page",
	"DOS855",
	0,
	CC_ICONV,
	},

	{
	"ucs",
	"ucs?(-)?(2)?(be)|utf-16?(be)",
	"unicode runes",
	"UCS-%s",
	"2",
	CC_UCS,
	},

	{
	"ucs-le",
	"ucs?(-)?(2)le|utf-16le",
	"little endian unicode runes",
	"UCS-%sLE",
	"2",
	CC_SCU,
	},

	{ 0 },
};

#if _UWIN

#include <ast_windows.h>

#ifndef CP_UCS2
#define CP_UCS2	0x0000
#endif

/* directory searched by _win_codeset() and listed by _ast_iconv_list() */

static char	_win_maps[] = "/reg/local_machine/SOFTWARE/Classes/MIME/Database/Charset";

/*
 * return the codeset index given its name or alias
 * the map is in the what?
oh, the registry 166 */ 167 168 static int 169 _win_codeset(const char* name) 170 { 171 register char* s; 172 char* e; 173 int n; 174 Sfio_t* sp; 175 char aka[128]; 176 char tmp[128]; 177 178 #if DEBUG_TRACE 179 error(DEBUG_TRACE, "AHA#%d _win_codeset name=%s", __LINE__, name); 180 #endif 181 if (name == name_native) 182 return CP_ACP; 183 if (!strcasecmp(name, "utf") || !strcasecmp(name, "utf8") || !strcasecmp(name, "utf-8")) 184 return CP_UTF8; 185 if (!strcasecmp(name, "ucs") || !strcasecmp(name, "ucs2") || !strcasecmp(name, "ucs-2")) 186 return CP_UCS2; 187 if (name[0] == '0' && name[1] == 'x' && (n = strtol(name, &e, 0)) > 0 && !*e) 188 return n; 189 for (;;) 190 { 191 sfsprintf(tmp, sizeof(tmp), "%s/%s", _win_maps, name); 192 if (!(sp = sfopen(0, tmp, "r"))) 193 { 194 s = (char*)name; 195 if ((s[0] == 'c' || s[0] == 'C') && (s[1] == 'p' || s[1] == 'P')) 196 s += 2; 197 if (!isdigit(s[0])) 198 break; 199 sfsprintf(tmp, sizeof(tmp), "%s/windows-%s", _win_maps, s); 200 if (!(sp = sfopen(0, tmp, "r"))) 201 break; 202 } 203 for (;;) 204 { 205 if (!(s = sfgetr(sp, '\n', 0))) 206 { 207 sfclose(sp); 208 return -1; 209 } 210 if (!strncasecmp(s, "AliasForCharSet=", 16)) 211 { 212 n = sfvalue(sp) - 17; 213 s += 16; 214 if (n >= sizeof(aka)) 215 n = sizeof(aka) - 1; 216 memcpy(aka, s, n); 217 aka[n] = 0; 218 sfclose(sp); 219 name = (const char*)aka; 220 break; 221 } 222 if (!strncasecmp(s, "CodePage=", 9)) 223 { 224 s += 9; 225 n = strtol(s, 0, 0); 226 sfclose(sp); 227 return n; 228 } 229 } 230 } 231 return -1; 232 } 233 234 /* 235 * get and check the codeset indices 236 */ 237 238 static _ast_iconv_t 239 _win_iconv_open(register Conv_t* cc, const char* t, const char* f) 240 { 241 #if DEBUG_TRACE 242 error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=%s t=%s\n", __LINE__, f, t); 243 #endif 244 if ((cc->from.index = _win_codeset(f)) < 0) 245 return (_ast_iconv_t)(-1); 246 if ((cc->to.index = _win_codeset(t)) < 0) 247 return (_ast_iconv_t)(-1); 248 #if DEBUG_TRACE 249 
error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=0x%04x t=0x%04x\n", __LINE__, cc->from.index, cc->to.index); 250 #endif 251 return (_ast_iconv_t)cc; 252 } 253 254 /* 255 * even though the indices already check out 256 * they could still be rejected 257 */ 258 259 static size_t 260 _win_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 261 { 262 Conv_t* cc = (Conv_t*)cd; 263 size_t un; 264 size_t tz; 265 size_t fz; 266 size_t bz; 267 size_t pz; 268 size_t oz; 269 LPWSTR ub; 270 271 #if DEBUG_TRACE 272 error(DEBUG_TRACE, "AHA#%d _win_iconv from=0x%04x to=0x%04x\n", __LINE__, cc->from.index, cc->to.index); 273 #endif 274 if (cc->from.index == cc->to.index) 275 { 276 /* 277 * easy 278 */ 279 280 fz = tz = (*fn < *tn) ? *fn : *tn; 281 memcpy(*tb, *fb, fz); 282 } 283 else 284 { 285 ub = 0; 286 un = *fn; 287 288 /* 289 * from => ucs-2 290 */ 291 292 if (cc->to.index == CP_UCS2) 293 { 294 if ((tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, *tn)) && tz <= *tn) 295 { 296 fz = *fn; 297 tz *= sizeof(WCHAR); 298 } 299 else 300 { 301 /* 302 * target too small 303 * binary search on input size to make it fit 304 */ 305 306 oz = 0; 307 pz = *fn / 2; 308 fz = *fn - pz; 309 for (;;) 310 { 311 while (!(tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)fz, (LPWSTR)*tb, 0))) 312 if (++fz >= *fn) 313 goto nope; 314 tz *= sizeof(WCHAR); 315 if (tz == *tn) 316 break; 317 if (!(pz /= 2)) 318 { 319 if (!(fz = oz)) 320 goto nope; 321 break; 322 } 323 if (tz > *tn) 324 fz -= pz; 325 else 326 { 327 oz = fz; 328 fz += pz; 329 } 330 } 331 } 332 } 333 else 334 { 335 if (cc->from.index == CP_UCS2) 336 { 337 un = *fn / sizeof(WCHAR); 338 ub = (LPWSTR)*fb; 339 } 340 else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, 0))) 341 goto nope; 342 else if (!(ub = (LPWSTR)malloc(un * sizeof(WCHAR)))) 343 goto nope; 344 else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)ub, 
un))) 345 goto nope; 346 347 /* 348 * ucs-2 => to 349 */ 350 351 if (tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, un, *tb, *tn, 0, 0)) 352 fz = *fn; 353 else 354 { 355 /* 356 * target too small 357 * binary search on input size to make it fit 358 */ 359 360 oz = 0; 361 pz = *fn / 2; 362 bz = *fn - pz; 363 for (;;) 364 { 365 while (!(fz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)bz, (LPWSTR)ub, un))) 366 if (++bz > *fn) 367 goto nope; 368 if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, 0, 0, 0))) 369 goto nope; 370 if (tz == *tn) 371 break; 372 if (!(pz /= 2)) 373 { 374 if (!(fz = oz)) 375 goto nope; 376 break; 377 } 378 if (tz > *tn) 379 bz -= pz; 380 else 381 { 382 oz = bz; 383 bz += pz; 384 } 385 } 386 if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, tz, 0, 0))) 387 goto nope; 388 #if DEBUG_TRACE 389 error(DEBUG_TRACE, "AHA#%d _win_iconv *fn=%u fz=%u[%u] *tn=%u tz=%u\n", __LINE__, *fn, fz, fz * sizeof(WCHAR), *tn, tz); 390 #endif 391 #if 0 392 fz *= sizeof(WCHAR); 393 #endif 394 } 395 if (ub != (LPWSTR)*fb) 396 free(ub); 397 } 398 } 399 *fb += fz; 400 *fn -= fz; 401 *tb += tz; 402 *tn -= tz; 403 return fz; 404 nope: 405 if (ub && ub != (LPWSTR)*fb) 406 free(ub); 407 errno = EINVAL; 408 return (size_t)(-1); 409 } 410 411 #endif 412 413 /* 414 * return canonical character code set name for m 415 * if b!=0 then canonical name placed in b of size n 416 * <ccode.h> index returned 417 */ 418 419 int 420 _ast_iconv_name(register const char* m, register char* b, size_t n) 421 { 422 register const _ast_iconv_list_t* cp; 423 const _ast_iconv_list_t* bp; 424 register int c; 425 register char* e; 426 int sub[2]; 427 char buf[16]; 428 #if DEBUG_TRACE 429 char* o; 430 #endif 431 432 if (!b) 433 { 434 b = buf; 435 n = sizeof(buf); 436 } 437 #if DEBUG_TRACE 438 o = b; 439 #endif 440 e = b + n - 1; 441 bp = 0; 442 n = 0; 443 cp = ccmaplist(NiL); 444 #if DEBUG_TRACE 445 if (error_info.trace < DEBUG_TRACE) 
sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name m=\"%s\"\n", error_info.id, error_info.trace, __LINE__, m); 446 #endif 447 for (;;) 448 { 449 #if DEBUG_TRACE 450 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name n=%d bp=%p cp=%p ccode=%d name=\"%s\"\n", error_info.id, error_info.trace, __LINE__, n, bp, cp, cp->ccode, cp->name); 451 #endif 452 if (strgrpmatch(m, cp->match, sub, elementsof(sub) / 2, STR_MAXIMAL|STR_LEFT|STR_ICASE)) 453 { 454 if (!(c = m[sub[1]])) 455 { 456 bp = cp; 457 break; 458 } 459 if (sub[1] > n && !isalpha(c)) 460 { 461 bp = cp; 462 n = sub[1]; 463 } 464 } 465 if (cp->ccode < 0) 466 { 467 if (!(++cp)->name) 468 break; 469 } 470 else if (!(cp = (const _ast_iconv_list_t*)ccmaplist((_ast_iconv_list_t*)cp))) 471 cp = codes; 472 } 473 if (cp = bp) 474 { 475 if (cp->canon) 476 { 477 if (cp->index) 478 { 479 for (m += sub[1]; *m && !isalnum(*m); m++); 480 if (!isdigit(*m)) 481 m = cp->index; 482 } 483 else 484 m = "1"; 485 b += sfsprintf(b, e - b, cp->canon, m); 486 } 487 else if (cp->ccode == CC_NATIVE) 488 { 489 if ((locales[AST_LC_CTYPE]->flags & LC_default) || !locales[AST_LC_CTYPE]->charset || !(m = locales[AST_LC_CTYPE]->charset->code) || streq(m, "iso8859-1")) 490 switch (CC_NATIVE) 491 { 492 case CC_EBCDIC: 493 m = (const char*)"EBCDIC"; 494 break; 495 case CC_EBCDIC_I: 496 m = (const char*)"EBCDIC-I"; 497 break; 498 case CC_EBCDIC_O: 499 m = (const char*)"EBCDIC-O"; 500 break; 501 default: 502 m = (const char*)"ISO-8859-1"; 503 break; 504 } 505 b += sfsprintf(b, e - b, "%s", m); 506 } 507 *b = 0; 508 #if DEBUG_TRACE 509 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, cp->ccode, o); 510 #endif 511 return cp->ccode; 512 } 513 while (b < e && (c = *m++)) 514 { 515 if (islower(c)) 516 c = toupper(c); 517 *b++ = c; 518 } 519 *b = 0; 520 #if DEBUG_TRACE 521 if (error_info.trace < 
DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, CC_ICONV, o); 522 #endif 523 return CC_ICONV; 524 } 525 526 /* 527 * convert utf-8 to bin 528 */ 529 530 static size_t 531 utf2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 532 { 533 register unsigned char* f; 534 register unsigned char* fe; 535 register unsigned char* t; 536 register unsigned char* te; 537 register unsigned char* p; 538 register int c; 539 register int w; 540 size_t n; 541 int e; 542 543 e = 0; 544 f = (unsigned char*)(*fb); 545 fe = f + (*fn); 546 t = (unsigned char*)(*tb); 547 te = t + (*tn); 548 while (t < te && f < fe) 549 { 550 p = f; 551 c = *f++; 552 if (c & 0x80) 553 { 554 if (!(c & 0x40)) 555 { 556 f = p; 557 e = EILSEQ; 558 break; 559 } 560 if (c & 0x20) 561 { 562 w = (c & 0x0F) << 12; 563 if (f >= fe) 564 { 565 f = p; 566 e = EINVAL; 567 break; 568 } 569 c = *f++; 570 if (c & 0x40) 571 { 572 f = p; 573 e = EILSEQ; 574 break; 575 } 576 w |= (c & 0x3F) << 6; 577 } 578 else 579 w = (c & 0x1F) << 6; 580 if (f >= fe) 581 { 582 f = p; 583 e = EINVAL; 584 break; 585 } 586 c = *f++; 587 w |= (c & 0x3F); 588 } 589 else 590 w = c; 591 *t++ = w; 592 } 593 *fn -= (char*)f - (*fb); 594 *fb = (char*)f; 595 *tn -= (n = (char*)t - (*tb)); 596 *tb = (char*)t; 597 RETURN(e, n, fn); 598 } 599 600 /* 601 * convert bin to utf-8 602 */ 603 604 static size_t 605 bin2utf(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 606 { 607 register unsigned char* f; 608 register unsigned char* fe; 609 register unsigned char* t; 610 register unsigned char* te; 611 register int c; 612 wchar_t w; 613 size_t n; 614 int e; 615 616 e = 0; 617 f = (unsigned char*)(*fb); 618 fe = f + (*fn); 619 t = (unsigned char*)(*tb); 620 te = t + (*tn); 621 while (f < fe && t < te) 622 { 623 if (!mbwide()) 624 { 625 c = 1; 626 w = *f; 627 } 628 else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) 629 { 630 e = 
EINVAL; 631 break; 632 } 633 else if (!c) 634 c = 1; 635 if (!(w & ~0x7F)) 636 *t++ = w; 637 else 638 { 639 if (!(w & ~0x7FF)) 640 { 641 if (t >= (te - 2)) 642 { 643 e = E2BIG; 644 break; 645 } 646 *t++ = 0xC0 + (w >> 6); 647 } 648 else if (!(w & ~0xffff)) 649 { 650 if (t >= (te - 3)) 651 { 652 e = E2BIG; 653 break; 654 } 655 *t++ = 0xE0 + (w >> 12); 656 *t++ = 0x80 + ((w >> 6 ) & 0x3F); 657 } 658 else 659 { 660 e = EILSEQ; 661 break; 662 } 663 *t++ = 0x80 + (w & 0x3F); 664 } 665 f += c; 666 } 667 *fn -= (n = (char*)f - (*fb)); 668 *fb = (char*)f; 669 *tn -= (char*)t - (*tb); 670 *tb = (char*)t; 671 RETURN(e, n, fn); 672 } 673 674 static const unsigned char ume_D[] = 675 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?!\"#$%&*;<=>@[]^_`{|} \t\n"; 676 677 static const unsigned char ume_M[] = 678 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 679 680 static unsigned char ume_d[UCHAR_MAX+1]; 681 682 static unsigned char ume_m[UCHAR_MAX+1]; 683 684 #define NOE 0xFF 685 #define UMEINIT() (ume_d[ume_D[0]]?0:umeinit()) 686 687 /* 688 * initialize the ume tables 689 */ 690 691 static int 692 umeinit(void) 693 { 694 register const unsigned char* s; 695 register int i; 696 register int c; 697 698 if (!ume_d[ume_D[0]]) 699 { 700 s = ume_D; 701 while (c = *s++) 702 ume_d[c] = 1; 703 memset(ume_m, NOE, sizeof(ume_m)); 704 for (i = 0; c = ume_M[i]; i++) 705 ume_m[c] = i; 706 } 707 return 0; 708 } 709 710 /* 711 * convert utf-7 to bin 712 */ 713 714 static size_t 715 ume2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 716 { 717 register unsigned char* f; 718 register unsigned char* fe; 719 register unsigned char* t; 720 register unsigned char* te; 721 register unsigned char* p; 722 register int s; 723 register int c; 724 register int w; 725 size_t n; 726 int e; 727 728 e = 0; 729 UMEINIT(); 730 f = (unsigned char*)(*fb); 731 fe = f + (*fn); 732 t = (unsigned char*)(*tb); 733 te = t + (*tn); 734 s = 0; 735 while (f < 
fe && t < te) 736 { 737 p = f; 738 c = *f++; 739 if (s) 740 { 741 if (c == '-' && s > 1) 742 s = 0; 743 else if ((w = ume_m[c]) == NOE) 744 { 745 s = 0; 746 *t++ = c; 747 } 748 else if (f >= (fe - 2)) 749 { 750 f = p; 751 e = EINVAL; 752 break; 753 } 754 else 755 { 756 s = 2; 757 w = (w << 6) | ume_m[*f++]; 758 w = (w << 6) | ume_m[*f++]; 759 if (!(w & ~0xFF)) 760 *t++ = w; 761 else if (t >= (te - 1)) 762 { 763 f = p; 764 e = E2BIG; 765 break; 766 } 767 else 768 { 769 *t++ = (w >> 8) & 0xFF; 770 *t++ = w & 0xFF; 771 } 772 } 773 } 774 else if (c == '+') 775 s = 1; 776 else 777 *t++ = c; 778 } 779 *fn -= (char*)f - (*fb); 780 *fb = (char*)f; 781 *tn -= (n = (char*)t - (*tb)); 782 *tb = (char*)t; 783 RETURN(e, n, fn); 784 } 785 786 /* 787 * convert bin to utf-7 788 */ 789 790 static size_t 791 bin2ume(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 792 { 793 register unsigned char* f; 794 register unsigned char* fe; 795 register unsigned char* t; 796 register unsigned char* te; 797 register int c; 798 register int s; 799 wchar_t w; 800 size_t n; 801 int e; 802 803 e = 0; 804 UMEINIT(); 805 f = (unsigned char*)(*fb); 806 fe = f + (*fn); 807 t = (unsigned char*)(*tb); 808 te = t + (*tn); 809 s = 0; 810 while (f < fe && t < (te - s)) 811 { 812 if (!mbwide()) 813 { 814 c = 1; 815 w = *f; 816 } 817 else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) 818 { 819 e = EINVAL; 820 break; 821 } 822 else if (!c) 823 c = 1; 824 if (!(w & ~0x7F) && ume_d[w]) 825 { 826 if (s) 827 { 828 s = 0; 829 *t++ = '-'; 830 } 831 *t++ = w; 832 } 833 else if (t >= (te - (4 + s))) 834 { 835 e = E2BIG; 836 break; 837 } 838 else 839 { 840 if (!s) 841 { 842 s = 1; 843 *t++ = '+'; 844 } 845 *t++ = ume_M[(w >> 12) & 0x3F]; 846 *t++ = ume_M[(w >> 6) & 0x3F]; 847 *t++ = ume_M[w & 0x3F]; 848 } 849 f += c; 850 } 851 if (s) 852 *t++ = '-'; 853 *fn -= (n = (char*)f - (*fb)); 854 *fb = (char*)f; 855 *tn -= (char*)t - (*tb); 856 *tb = (char*)t; 857 RETURN(e, n, fn); 858 } 859 860 /* 
861 * convert ucs-2 to bin with no byte swap 862 */ 863 864 static size_t 865 ucs2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 866 { 867 register unsigned char* f; 868 register unsigned char* fe; 869 register unsigned char* t; 870 register unsigned char* te; 871 register int w; 872 size_t n; 873 int e; 874 875 e = 0; 876 f = (unsigned char*)(*fb); 877 fe = f + (*fn); 878 t = (unsigned char*)(*tb); 879 te = t + (*tn); 880 while (f < (fe - 1) && t < te) 881 { 882 w = *f++; 883 w = (w << 8) | *f++; 884 if (!(w & ~0xFF)) 885 *t++ = w; 886 else if (t >= (te - 1)) 887 { 888 f -= 2; 889 e = E2BIG; 890 break; 891 } 892 else 893 { 894 *t++ = (w >> 8) & 0xFF; 895 *t++ = w & 0xFF; 896 } 897 } 898 *fn -= (char*)f - (*fb); 899 *fb = (char*)f; 900 *tn -= (n = (char*)t - (*tb)); 901 *tb = (char*)t; 902 RETURN(e, n, fn); 903 } 904 905 /* 906 * convert bin to ucs-2 with no byte swap 907 */ 908 909 static size_t 910 bin2ucs(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 911 { 912 register unsigned char* f; 913 register unsigned char* fe; 914 register unsigned char* t; 915 register unsigned char* te; 916 register int c; 917 wchar_t w; 918 size_t n; 919 int e; 920 921 e = 0; 922 f = (unsigned char*)(*fb); 923 fe = f + (*fn); 924 t = (unsigned char*)(*tb); 925 te = t + (*tn); 926 while (f < fe && t < (te - 1)) 927 { 928 if (!mbwide()) 929 { 930 c = 1; 931 w = *f; 932 } 933 if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) 934 { 935 e = EINVAL; 936 break; 937 } 938 else if (!c) 939 c = 1; 940 *t++ = (w >> 8) & 0xFF; 941 *t++ = w & 0xFF; 942 f += c; 943 } 944 *fn -= (n = (char*)f - (*fb)); 945 *fb = (char*)f; 946 *tn -= (char*)t - (*tb); 947 *tb = (char*)t; 948 RETURN(e, n, fn); 949 } 950 951 /* 952 * convert ucs-2 to bin with byte swap 953 */ 954 955 static size_t 956 scu2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 957 { 958 register unsigned char* f; 959 register unsigned char* fe; 960 register unsigned char* t; 961 
register unsigned char* te; 962 register int w; 963 size_t n; 964 int e; 965 966 e = 0; 967 f = (unsigned char*)(*fb); 968 fe = f + (*fn); 969 t = (unsigned char*)(*tb); 970 te = t + (*tn); 971 while (f < (fe - 1) && t < te) 972 { 973 w = *f++; 974 w = w | (*f++ << 8); 975 if (!(w & ~0xFF)) 976 *t++ = w; 977 else if (t >= (te - 1)) 978 { 979 f -= 2; 980 e = E2BIG; 981 break; 982 } 983 else 984 { 985 *t++ = (w >> 8) & 0xFF; 986 *t++ = w & 0xFF; 987 } 988 } 989 *fn -= (char*)f - (*fb); 990 *fb = (char*)f; 991 *tn -= (n = (char*)t - (*tb)); 992 *tb = (char*)t; 993 RETURN(e, n, fn); 994 } 995 996 /* 997 * convert bin to ucs-2 with byte swap 998 */ 999 1000 static size_t 1001 bin2scu(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 1002 { 1003 register unsigned char* f; 1004 register unsigned char* fe; 1005 register unsigned char* t; 1006 register unsigned char* te; 1007 register int c; 1008 wchar_t w; 1009 size_t n; 1010 int e; 1011 1012 e = 0; 1013 f = (unsigned char*)(*fb); 1014 fe = f + (*fn); 1015 t = (unsigned char*)(*tb); 1016 te = t + (*tn); 1017 while (f < fe && t < (te - 1)) 1018 { 1019 if (!mbwide()) 1020 { 1021 c = 1; 1022 w = *f; 1023 } 1024 else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) 1025 { 1026 e = EINVAL; 1027 break; 1028 } 1029 else if (!c) 1030 c = 1; 1031 *t++ = w & 0xFF; 1032 *t++ = (w >> 8) & 0xFF; 1033 f += c; 1034 } 1035 *fn -= (n = (char*)f - (*fb)); 1036 *fb = (char*)f; 1037 *tn -= (char*)t - (*tb); 1038 *tb = (char*)t; 1039 RETURN(e, n, fn); 1040 } 1041 1042 /* 1043 * open a character code conversion map from f to t 1044 */ 1045 1046 _ast_iconv_t 1047 _ast_iconv_open(const char* t, const char* f) 1048 { 1049 register Conv_t* cc; 1050 int fc; 1051 int tc; 1052 int i; 1053 1054 char fr[64]; 1055 char to[64]; 1056 1057 #if DEBUG_TRACE 1058 error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s t=%s\n", __LINE__, f, t); 1059 #endif 1060 if (!t || !*t || *t == '-' && !*(t + 1) || !strcasecmp(t, name_local) || !strcasecmp(t, 
name_native)) 1061 t = name_native; 1062 if (!f || !*f || *f == '-' && !*(f + 1) || !strcasecmp(t, name_local) || !strcasecmp(f, name_native)) 1063 f = name_native; 1064 1065 /* 1066 * the ast identify is always (iconv_t)(0) 1067 */ 1068 1069 if (t == f) 1070 return (iconv_t)(0); 1071 fc = _ast_iconv_name(f, fr, sizeof(fr)); 1072 tc = _ast_iconv_name(t, to, sizeof(to)); 1073 #if DEBUG_TRACE 1074 error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s:%s:%d t=%s:%s:%d\n", __LINE__, f, fr, fc, t, to, tc); 1075 #endif 1076 if (fc != CC_ICONV && fc == tc || streq(fr, to)) 1077 return (iconv_t)(0); 1078 1079 /* 1080 * first check the free list 1081 */ 1082 1083 for (i = 0; i < elementsof(freelist); i++) 1084 if ((cc = freelist[i]) && streq(to, cc->to.name) && streq(fr, cc->from.name)) 1085 { 1086 freelist[i] = 0; 1087 #if _lib_iconv_open 1088 /* 1089 * reset the shift state if any 1090 */ 1091 1092 if (cc->cvt != (iconv_t)(-1)) 1093 iconv(cc->cvt, NiL, NiL, NiL, NiL); 1094 #endif 1095 return cc; 1096 } 1097 1098 /* 1099 * allocate a new one 1100 */ 1101 1102 if (!(cc = newof(0, Conv_t, 1, strlen(to) + strlen(fr) + 2))) 1103 return (iconv_t)(-1); 1104 cc->to.name = (char*)(cc + 1); 1105 cc->from.name = strcopy(cc->to.name, to) + 1; 1106 strcpy(cc->from.name, fr); 1107 cc->cvt = (iconv_t)(-1); 1108 1109 /* 1110 * 8 bit maps are the easiest 1111 */ 1112 1113 if (fc >= 0 && tc >= 0) 1114 cc->from.map = ccmap(fc, tc); 1115 #if _lib_iconv_open 1116 else if ((cc->cvt = iconv_open(to, fr)) != (iconv_t)(-1)) 1117 cc->from.fun = (_ast_iconv_f)iconv; 1118 #endif 1119 #if _UWIN 1120 else if ((cc->cvt = _win_iconv_open(cc, to, fr)) != (_ast_iconv_t)(-1)) 1121 cc->from.fun = (_ast_iconv_f)_win_iconv; 1122 #endif 1123 else 1124 { 1125 switch (fc) 1126 { 1127 case CC_UTF: 1128 cc->from.fun = utf2bin; 1129 break; 1130 case CC_UME: 1131 cc->from.fun = ume2bin; 1132 break; 1133 case CC_UCS: 1134 cc->from.fun = ucs2bin; 1135 break; 1136 case CC_SCU: 1137 cc->from.fun = scu2bin; 1138 break; 1139 
case CC_ASCII: 1140 break; 1141 default: 1142 if (fc < 0) 1143 goto nope; 1144 cc->from.map = ccmap(fc, CC_ASCII); 1145 break; 1146 } 1147 switch (tc) 1148 { 1149 case CC_UTF: 1150 cc->to.fun = bin2utf; 1151 break; 1152 case CC_UME: 1153 cc->to.fun = bin2ume; 1154 break; 1155 case CC_UCS: 1156 cc->to.fun = bin2ucs; 1157 break; 1158 case CC_SCU: 1159 cc->to.fun = bin2scu; 1160 break; 1161 case CC_ASCII: 1162 break; 1163 default: 1164 if (tc < 0) 1165 goto nope; 1166 cc->to.map = ccmap(CC_ASCII, tc); 1167 break; 1168 } 1169 } 1170 return (iconv_t)cc; 1171 nope: 1172 return (iconv_t)(-1); 1173 } 1174 1175 /* 1176 * close a character code conversion map 1177 */ 1178 1179 int 1180 _ast_iconv_close(_ast_iconv_t cd) 1181 { 1182 Conv_t* cc; 1183 Conv_t* oc; 1184 int i; 1185 int r = 0; 1186 1187 if (cd == (_ast_iconv_t)(-1)) 1188 return -1; 1189 if (!(cc = (Conv_t*)cd)) 1190 return 0; 1191 1192 /* 1193 * add to the free list 1194 */ 1195 1196 i = freeindex; 1197 for (;;) 1198 { 1199 if (++ i >= elementsof(freelist)) 1200 i = 0; 1201 if (!freelist[i]) 1202 break; 1203 if (i == freeindex) 1204 { 1205 if (++ i >= elementsof(freelist)) 1206 i = 0; 1207 1208 /* 1209 * close the oldest 1210 */ 1211 1212 if (oc = freelist[i]) 1213 { 1214 #if _lib_iconv_open 1215 if (oc->cvt != (iconv_t)(-1)) 1216 r = iconv_close(oc->cvt); 1217 #endif 1218 if (oc->buf) 1219 free(oc->buf); 1220 free(oc); 1221 } 1222 break; 1223 } 1224 } 1225 freelist[freeindex = i] = cc; 1226 return r; 1227 } 1228 1229 /* 1230 * copy *fb size *fn to *tb size *tn 1231 * fb,fn tb,tn updated on return 1232 */ 1233 1234 size_t 1235 _ast_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) 1236 { 1237 Conv_t* cc = (Conv_t*)cd; 1238 register unsigned char* f; 1239 register unsigned char* t; 1240 register unsigned char* e; 1241 register const unsigned char* m; 1242 register size_t n; 1243 char* b; 1244 char* tfb; 1245 size_t tfn; 1246 size_t i; 1247 1248 if (!fb || !*fb) 1249 { 1250 /* TODO: reset to the 
initial state */ 1251 if (!tb || !*tb) 1252 return 0; 1253 /* TODO: write the initial state shift sequence */ 1254 return 0; 1255 } 1256 n = *tn; 1257 if (cc) 1258 { 1259 if (cc->from.fun) 1260 { 1261 if (cc->to.fun) 1262 { 1263 if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0))) 1264 { 1265 errno = ENOMEM; 1266 return -1; 1267 } 1268 b = cc->buf; 1269 i = cc->size; 1270 tfb = *fb; 1271 tfn = *fn; 1272 if ((*cc->from.fun)(cc->cvt, &tfb, &tfn, &b, &i) == (size_t)(-1)) 1273 return -1; 1274 tfn = b - cc->buf; 1275 tfb = cc->buf; 1276 n = (*cc->to.fun)(cc->cvt, &tfb, &tfn, tb, tn); 1277 i = tfb - cc->buf; 1278 *fb += i; 1279 *fn -= i; 1280 return n; 1281 } 1282 if ((*cc->from.fun)(cc->cvt, fb, fn, tb, tn) == (size_t)(-1)) 1283 return -1; 1284 n -= *tn; 1285 if (m = cc->to.map) 1286 { 1287 e = (unsigned char*)(*tb); 1288 for (t = e - n; t < e; t++) 1289 *t = m[*t]; 1290 } 1291 return n; 1292 } 1293 else if (cc->to.fun) 1294 { 1295 if (!(m = cc->from.map)) 1296 return (*cc->to.fun)(cc->cvt, fb, fn, tb, tn); 1297 if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0))) 1298 { 1299 errno = ENOMEM; 1300 return -1; 1301 } 1302 if ((n = *fn) > cc->size) 1303 n = cc->size; 1304 f = (unsigned char*)(*fb); 1305 e = f + n; 1306 t = (unsigned char*)(b = cc->buf); 1307 while (f < e) 1308 *t++ = m[*f++]; 1309 n = (*cc->to.fun)(cc->cvt, &b, fn, tb, tn); 1310 *fb += b - cc->buf; 1311 return n; 1312 } 1313 } 1314 if (n > *fn) 1315 n = *fn; 1316 if (cc && (m = cc->from.map)) 1317 { 1318 f = (unsigned char*)(*fb); 1319 e = f + n; 1320 t = (unsigned char*)(*tb); 1321 while (f < e) 1322 *t++ = m[*f++]; 1323 } 1324 else 1325 memcpy(*tb, *fb, n); 1326 *fb += n; 1327 *fn -= n; 1328 *tb += n; 1329 *tn -= n; 1330 return n; 1331 } 1332 1333 /* 1334 * write *fb size *fn to op 1335 * fb,fn updated on return 1336 * total bytes written to op returned 1337 */ 1338 1339 ssize_t 1340 _ast_iconv_write(_ast_iconv_t cd, Sfio_t* op, char** fb, size_t* fn, size_t* e) 1341 { 
1342 char* tb; 1343 char* ts; 1344 size_t tn; 1345 size_t r; 1346 1347 r = 0; 1348 tn = 0; 1349 while (*fn > 0) 1350 { 1351 if (!(tb = (char*)sfreserve(op, -(tn + 1), SF_WRITE|SF_LOCKR))) 1352 return r ? r : -1; 1353 ts = tb; 1354 tn = sfvalue(op); 1355 #if DEBUG_TRACE 1356 error(DEBUG_TRACE, "AHA#%d iconv_write ts=%p tn=%d", __LINE__, ts, tn); 1357 for (;;) 1358 #else 1359 while (_ast_iconv(cd, fb, fn, &ts, &tn) == (size_t)(-1)) 1360 #endif 1361 { 1362 #if DEBUG_TRACE 1363 ssize_t _r; 1364 error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d `%-.*s'", __LINE__, *fn, tn, *fn, *fb); 1365 _r = _ast_iconv(cd, fb, fn, &ts, &tn); 1366 error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d [%d]", __LINE__, *fn, tn, _r); 1367 if (_r != (size_t)(-1)) 1368 break; 1369 #endif 1370 if (errno == E2BIG) 1371 break; 1372 if (e) 1373 (*e)++; 1374 if (!tn) 1375 break; 1376 *ts++ = *(*fb)++; 1377 tn--; 1378 (*fn)--; 1379 } 1380 #if DEBUG_TRACE 1381 error(DEBUG_TRACE, "AHA#%d iconv_write %d", __LINE__, ts - tb); 1382 #endif 1383 1384 sfwrite(op, tb, ts - tb); 1385 r += ts - tb; 1386 } 1387 return r; 1388 } 1389 1390 /* 1391 * move n bytes from ip to op 1392 */ 1393 1394 ssize_t 1395 _ast_iconv_move(_ast_iconv_t cd, Sfio_t* ip, Sfio_t* op, size_t n, size_t* e) 1396 { 1397 char* fb; 1398 char* fs; 1399 char* tb; 1400 char* ts; 1401 size_t fn; 1402 size_t fo; 1403 size_t tn; 1404 size_t i; 1405 ssize_t r = 0; 1406 int locked; 1407 1408 fn = n; 1409 for (;;) 1410 { 1411 if (fn != SF_UNBOUND) 1412 fn = -((ssize_t)(fn & (((size_t)(~0))>>1))); 1413 if (!(fb = (char*)sfreserve(ip, fn, locked = SF_LOCKR)) && 1414 !(fb = (char*)sfreserve(ip, fn, locked = 0))) 1415 break; 1416 fs = fb; 1417 fn = fo = sfvalue(ip); 1418 if (!(tb = (char*)sfreserve(op, SF_UNBOUND, SF_WRITE|SF_LOCKR))) 1419 { 1420 sfread(ip, fb, 0); 1421 return r ? 
r : -1; 1422 } 1423 ts = tb; 1424 tn = sfvalue(op); 1425 while (_ast_iconv(cd, &fs, &fn, &ts, &tn) != (size_t)(-1) && fn > 0) 1426 { 1427 if (tn > 0) 1428 { 1429 *ts++ = '_'; 1430 tn--; 1431 } 1432 if (e) 1433 (*e)++; 1434 fs++; 1435 fn--; 1436 } 1437 sfwrite(op, tb, ts - tb); 1438 r += ts - tb; 1439 if (locked) 1440 sfread(ip, fb, fs - fb); 1441 else 1442 for (i = fn; --i >= (fs - fb);) 1443 sfungetc(ip, fb[i]); 1444 if (n != SF_UNBOUND) 1445 { 1446 if (n <= (fs - fb)) 1447 break; 1448 n -= fs - fb; 1449 } 1450 if (fn == fo) 1451 fn++; 1452 } 1453 return r; 1454 } 1455 1456 /* 1457 * iconv_list_t iterator 1458 * call with arg 0 to start 1459 * prev return value is current arg 1460 */ 1461 1462 _ast_iconv_list_t* 1463 _ast_iconv_list(_ast_iconv_list_t* cp) 1464 { 1465 #if _UWIN 1466 struct dirent* ent; 1467 1468 if (!cp) 1469 { 1470 if (!(cp = newof(0, _ast_iconv_list_t, 1, 0))) 1471 return ccmaplist(NiL); 1472 if (!(cp->data = opendir(_win_maps))) 1473 { 1474 free(cp); 1475 return ccmaplist(NiL); 1476 } 1477 } 1478 if (cp->data) 1479 { 1480 if (ent = readdir((DIR*)cp->data)) 1481 { 1482 cp->name = cp->match = cp->desc = (const char*)ent->d_name; 1483 return cp; 1484 } 1485 closedir((DIR*)cp->data); 1486 free(cp); 1487 return ccmaplist(NiL); 1488 } 1489 #else 1490 if (!cp) 1491 return ccmaplist(NiL); 1492 #endif 1493 if (cp->ccode >= 0) 1494 return (cp = ccmaplist(cp)) ? cp : (_ast_iconv_list_t*)codes; 1495 return (++cp)->name ? cp : (_ast_iconv_list_t*)0; 1496 } 1497