1 /***********************************************************************
2 * *
3 * This software is part of the ast package *
4 * Copyright (c) 1985-2010 AT&T Intellectual Property *
5 * and is licensed under the *
6 * Common Public License, Version 1.0 *
7 * by AT&T Intellectual Property *
8 * *
9 * A copy of the License is available at *
10 * http://www.opensource.org/licenses/cpl1.0.txt *
11 * (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9) *
12 * *
13 * Information and Software Systems Research *
14 * AT&T Research *
15 * Florham Park NJ *
16 * *
17 * Glenn Fowler <gsf@research.att.com> *
18 * David Korn <dgk@research.att.com> *
19 * Phong Vo <kpv@research.att.com> *
20 * *
21 ***********************************************************************/
22 #pragma prototyped
23
24 /*
25 * Glenn Fowler
26 * AT&T Research
27 *
28 * iconv intercept
29 * minimally provides { utf*<=>bin ascii<=>ebcdic* }
30 */
31
32 #include <ast.h>
33 #include <dirent.h>
34
35 #define DEBUG_TRACE 0
36 #define _ICONV_LIST_PRIVATE_
37
38 #include <ccode.h>
39 #include <ctype.h>
40 #include <iconv.h>
41
42 #include "lclib.h"
43
/*
 * when _lib_iconv_open is not set the _ast_* names used in this
 * file are mapped onto the plain iconv names
 */

#if !_lib_iconv_open

#define _ast_iconv_t		iconv_t
#define _ast_iconv_f		iconv_f
#define _ast_iconv_list_t	iconv_list_t

#define _ast_iconv_open		iconv_open
#define _ast_iconv_close	iconv_close
#define _ast_iconv		iconv

#define _ast_iconv_list		iconv_list
#define _ast_iconv_move		iconv_move
#define _ast_iconv_name		iconv_name
#define _ast_iconv_write	iconv_write

#endif

/*
 * substitutes for errno values that this file sets
 * when they are not already defined
 */

#if !defined(E2BIG)
#define E2BIG			ENOMEM
#endif

#if !defined(EILSEQ)
#define EILSEQ			EIO
#endif
65
/*
 * common converter epilogue
 *
 *	e	errno value lvalue, 0 if no error was detected
 *	n	value to return on success
 *	fn	pointer to the remaining source byte count
 *
 * source bytes left over with no other error are reported as E2BIG
 * on error errno is set to e and (size_t)(-1) is returned
 * wrapped in do/while(0) so the macro is a single statement
 */

#define RETURN(e,n,fn) \
	do \
	{ \
		if (*(fn) && !(e)) \
			(e) = E2BIG; \
		if (e) \
		{ \
			errno = (e); \
			return (size_t)(-1); \
		} \
		return (n); \
	} while (0)
70
71 typedef struct Map_s
72 {
73 char* name;
74 const unsigned char* map;
75 _ast_iconv_f fun;
76 int index;
77 } Map_t;
78
79 typedef struct Conv_s
80 {
81 iconv_t cvt;
82 char* buf;
83 size_t size;
84 Map_t from;
85 Map_t to;
86 } Conv_t;
87
88 static Conv_t* freelist[4];
89 static int freeindex;
90
91 static const char name_local[] = "local";
92 static const char name_native[] = "native";
93
94 static const _ast_iconv_list_t codes[] =
95 {
96 {
97 "utf",
98 "un|unicode|utf",
99 "multibyte 8-bit unicode",
100 "UTF-%s",
101 "8",
102 CC_UTF,
103 },
104
105 {
106 "ume",
107 "um|ume|utf?(-)7",
108 "multibyte 7-bit unicode",
109 "UTF-7",
110 0,
111 CC_UME,
112 },
113
114 {
115 "euc",
116 "(big|euc)*",
117 "euc family",
118 0,
119 0,
120 CC_ICONV,
121 },
122
123 {
124 "dos",
125 "dos?(-)?(855)",
126 "dos code page",
127 "DOS855",
128 0,
129 CC_ICONV,
130 },
131
132 {
133 "ucs",
134 "ucs?(-)?(2)?(be)|utf-16?(be)",
135 "unicode runes",
136 "UCS-%s",
137 "2",
138 CC_UCS,
139 },
140
141 {
142 "ucs-le",
143 "ucs?(-)?(2)le|utf-16le",
144 "little endian unicode runes",
145 "UCS-%sLE",
146 "2",
147 CC_SCU,
148 },
149
150 { 0 },
151 };
152
153 #if _UWIN
154
155 #include <ast_windows.h>
156
/*
 * code set index used for ucs-2 when the windows headers
 * do not provide one
 * NOTE(review): 0 looks like it may coincide with CP_ACP -- confirm
 */

#if !defined(CP_UCS2)
#define CP_UCS2		0x0000
#endif

/*
 * directory of charset entries read by _win_codeset()
 * and listed by _ast_iconv_list()
 */

static char	_win_maps[] = "/reg/local_machine/SOFTWARE/Classes/MIME/Database/Charset";
162
163 /*
164 * return the codeset index given its name or alias
165 * the map is in the what? oh, the registry
166 */
167
168 static int
_win_codeset(const char * name)169 _win_codeset(const char* name)
170 {
171 register char* s;
172 char* e;
173 int n;
174 Sfio_t* sp;
175 char aka[128];
176 char tmp[128];
177
178 #if DEBUG_TRACE
179 error(DEBUG_TRACE, "AHA#%d _win_codeset name=%s", __LINE__, name);
180 #endif
181 if (name == name_native)
182 return CP_ACP;
183 if (!strcasecmp(name, "utf") || !strcasecmp(name, "utf8") || !strcasecmp(name, "utf-8"))
184 return CP_UTF8;
185 if (!strcasecmp(name, "ucs") || !strcasecmp(name, "ucs2") || !strcasecmp(name, "ucs-2"))
186 return CP_UCS2;
187 if (name[0] == '0' && name[1] == 'x' && (n = strtol(name, &e, 0)) > 0 && !*e)
188 return n;
189 for (;;)
190 {
191 sfsprintf(tmp, sizeof(tmp), "%s/%s", _win_maps, name);
192 if (!(sp = sfopen(0, tmp, "r")))
193 {
194 s = (char*)name;
195 if ((s[0] == 'c' || s[0] == 'C') && (s[1] == 'p' || s[1] == 'P'))
196 s += 2;
197 if (!isdigit(s[0]))
198 break;
199 sfsprintf(tmp, sizeof(tmp), "%s/windows-%s", _win_maps, s);
200 if (!(sp = sfopen(0, tmp, "r")))
201 break;
202 }
203 for (;;)
204 {
205 if (!(s = sfgetr(sp, '\n', 0)))
206 {
207 sfclose(sp);
208 return -1;
209 }
210 if (!strncasecmp(s, "AliasForCharSet=", 16))
211 {
212 n = sfvalue(sp) - 17;
213 s += 16;
214 if (n >= sizeof(aka))
215 n = sizeof(aka) - 1;
216 memcpy(aka, s, n);
217 aka[n] = 0;
218 sfclose(sp);
219 name = (const char*)aka;
220 break;
221 }
222 if (!strncasecmp(s, "CodePage=", 9))
223 {
224 s += 9;
225 n = strtol(s, 0, 0);
226 sfclose(sp);
227 return n;
228 }
229 }
230 }
231 return -1;
232 }
233
234 /*
235 * get and check the codeset indices
236 */
237
238 static _ast_iconv_t
_win_iconv_open(register Conv_t * cc,const char * t,const char * f)239 _win_iconv_open(register Conv_t* cc, const char* t, const char* f)
240 {
241 #if DEBUG_TRACE
242 error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=%s t=%s\n", __LINE__, f, t);
243 #endif
244 if ((cc->from.index = _win_codeset(f)) < 0)
245 return (_ast_iconv_t)(-1);
246 if ((cc->to.index = _win_codeset(t)) < 0)
247 return (_ast_iconv_t)(-1);
248 #if DEBUG_TRACE
249 error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=0x%04x t=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
250 #endif
251 return (_ast_iconv_t)cc;
252 }
253
254 /*
255 * even though the indices already check out
256 * they could still be rejected
257 */
258
259 static size_t
_win_iconv(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)260 _win_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
261 {
262 Conv_t* cc = (Conv_t*)cd;
263 size_t un;
264 size_t tz;
265 size_t fz;
266 size_t bz;
267 size_t pz;
268 size_t oz;
269 LPWSTR ub;
270
271 #if DEBUG_TRACE
272 error(DEBUG_TRACE, "AHA#%d _win_iconv from=0x%04x to=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
273 #endif
274 if (cc->from.index == cc->to.index)
275 {
276 /*
277 * easy
278 */
279
280 fz = tz = (*fn < *tn) ? *fn : *tn;
281 memcpy(*tb, *fb, fz);
282 }
283 else
284 {
285 ub = 0;
286 un = *fn;
287
288 /*
289 * from => ucs-2
290 */
291
292 if (cc->to.index == CP_UCS2)
293 {
294 if ((tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, *tn)) && tz <= *tn)
295 {
296 fz = *fn;
297 tz *= sizeof(WCHAR);
298 }
299 else
300 {
301 /*
302 * target too small
303 * binary search on input size to make it fit
304 */
305
306 oz = 0;
307 pz = *fn / 2;
308 fz = *fn - pz;
309 for (;;)
310 {
311 while (!(tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)fz, (LPWSTR)*tb, 0)))
312 if (++fz >= *fn)
313 goto nope;
314 tz *= sizeof(WCHAR);
315 if (tz == *tn)
316 break;
317 if (!(pz /= 2))
318 {
319 if (!(fz = oz))
320 goto nope;
321 break;
322 }
323 if (tz > *tn)
324 fz -= pz;
325 else
326 {
327 oz = fz;
328 fz += pz;
329 }
330 }
331 }
332 }
333 else
334 {
335 if (cc->from.index == CP_UCS2)
336 {
337 un = *fn / sizeof(WCHAR);
338 ub = (LPWSTR)*fb;
339 }
340 else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, 0)))
341 goto nope;
342 else if (!(ub = (LPWSTR)malloc(un * sizeof(WCHAR))))
343 goto nope;
344 else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)ub, un)))
345 goto nope;
346
347 /*
348 * ucs-2 => to
349 */
350
351 if (tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, un, *tb, *tn, 0, 0))
352 fz = *fn;
353 else
354 {
355 /*
356 * target too small
357 * binary search on input size to make it fit
358 */
359
360 oz = 0;
361 pz = *fn / 2;
362 bz = *fn - pz;
363 for (;;)
364 {
365 while (!(fz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)bz, (LPWSTR)ub, un)))
366 if (++bz > *fn)
367 goto nope;
368 if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, 0, 0, 0)))
369 goto nope;
370 if (tz == *tn)
371 break;
372 if (!(pz /= 2))
373 {
374 if (!(fz = oz))
375 goto nope;
376 break;
377 }
378 if (tz > *tn)
379 bz -= pz;
380 else
381 {
382 oz = bz;
383 bz += pz;
384 }
385 }
386 if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, tz, 0, 0)))
387 goto nope;
388 #if DEBUG_TRACE
389 error(DEBUG_TRACE, "AHA#%d _win_iconv *fn=%u fz=%u[%u] *tn=%u tz=%u\n", __LINE__, *fn, fz, fz * sizeof(WCHAR), *tn, tz);
390 #endif
391 #if 0
392 fz *= sizeof(WCHAR);
393 #endif
394 }
395 if (ub != (LPWSTR)*fb)
396 free(ub);
397 }
398 }
399 *fb += fz;
400 *fn -= fz;
401 *tb += tz;
402 *tn -= tz;
403 return fz;
404 nope:
405 if (ub && ub != (LPWSTR)*fb)
406 free(ub);
407 errno = EINVAL;
408 return (size_t)(-1);
409 }
410
411 #endif
412
413 /*
414 * return canonical character code set name for m
415 * if b!=0 then canonical name placed in b of size n
416 * <ccode.h> index returned
417 */
418
419 int
_ast_iconv_name(register const char * m,register char * b,size_t n)420 _ast_iconv_name(register const char* m, register char* b, size_t n)
421 {
422 register const _ast_iconv_list_t* cp;
423 const _ast_iconv_list_t* bp;
424 register int c;
425 register char* e;
426 int sub[2];
427 char buf[16];
428 #if DEBUG_TRACE
429 char* o;
430 #endif
431
432 if (!b)
433 {
434 b = buf;
435 n = sizeof(buf);
436 }
437 #if DEBUG_TRACE
438 o = b;
439 #endif
440 e = b + n - 1;
441 bp = 0;
442 n = 0;
443 cp = ccmaplist(NiL);
444 #if DEBUG_TRACE
445 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name m=\"%s\"\n", error_info.id, error_info.trace, __LINE__, m);
446 #endif
447 for (;;)
448 {
449 #if DEBUG_TRACE
450 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name n=%d bp=%p cp=%p ccode=%d name=\"%s\"\n", error_info.id, error_info.trace, __LINE__, n, bp, cp, cp->ccode, cp->name);
451 #endif
452 if (strgrpmatch(m, cp->match, sub, elementsof(sub) / 2, STR_MAXIMAL|STR_LEFT|STR_ICASE))
453 {
454 if (!(c = m[sub[1]]))
455 {
456 bp = cp;
457 break;
458 }
459 if (sub[1] > n && !isalpha(c))
460 {
461 bp = cp;
462 n = sub[1];
463 }
464 }
465 if (cp->ccode < 0)
466 {
467 if (!(++cp)->name)
468 break;
469 }
470 else if (!(cp = (const _ast_iconv_list_t*)ccmaplist((_ast_iconv_list_t*)cp)))
471 cp = codes;
472 }
473 if (cp = bp)
474 {
475 if (cp->canon)
476 {
477 if (cp->index)
478 {
479 for (m += sub[1]; *m && !isalnum(*m); m++);
480 if (!isdigit(*m))
481 m = cp->index;
482 }
483 else
484 m = "1";
485 b += sfsprintf(b, e - b, cp->canon, m);
486 }
487 else if (cp->ccode == CC_NATIVE)
488 {
489 if ((locales[AST_LC_CTYPE]->flags & LC_default) || !locales[AST_LC_CTYPE]->charset || !(m = locales[AST_LC_CTYPE]->charset->code) || streq(m, "iso8859-1"))
490 switch (CC_NATIVE)
491 {
492 case CC_EBCDIC:
493 m = (const char*)"EBCDIC";
494 break;
495 case CC_EBCDIC_I:
496 m = (const char*)"EBCDIC-I";
497 break;
498 case CC_EBCDIC_O:
499 m = (const char*)"EBCDIC-O";
500 break;
501 default:
502 m = (const char*)"ISO-8859-1";
503 break;
504 }
505 b += sfsprintf(b, e - b, "%s", m);
506 }
507 *b = 0;
508 #if DEBUG_TRACE
509 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, cp->ccode, o);
510 #endif
511 return cp->ccode;
512 }
513 while (b < e && (c = *m++))
514 {
515 if (islower(c))
516 c = toupper(c);
517 *b++ = c;
518 }
519 *b = 0;
520 #if DEBUG_TRACE
521 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, CC_ICONV, o);
522 #endif
523 return CC_ICONV;
524 }
525
526 /*
527 * convert utf-8 to bin
528 */
529
530 static size_t
utf2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)531 utf2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
532 {
533 register unsigned char* f;
534 register unsigned char* fe;
535 register unsigned char* t;
536 register unsigned char* te;
537 register unsigned char* p;
538 register int c;
539 register int w;
540 size_t n;
541 int e;
542
543 e = 0;
544 f = (unsigned char*)(*fb);
545 fe = f + (*fn);
546 t = (unsigned char*)(*tb);
547 te = t + (*tn);
548 while (t < te && f < fe)
549 {
550 p = f;
551 c = *f++;
552 if (c & 0x80)
553 {
554 if (!(c & 0x40))
555 {
556 f = p;
557 e = EILSEQ;
558 break;
559 }
560 if (c & 0x20)
561 {
562 w = (c & 0x0F) << 12;
563 if (f >= fe)
564 {
565 f = p;
566 e = EINVAL;
567 break;
568 }
569 c = *f++;
570 if (c & 0x40)
571 {
572 f = p;
573 e = EILSEQ;
574 break;
575 }
576 w |= (c & 0x3F) << 6;
577 }
578 else
579 w = (c & 0x1F) << 6;
580 if (f >= fe)
581 {
582 f = p;
583 e = EINVAL;
584 break;
585 }
586 c = *f++;
587 w |= (c & 0x3F);
588 }
589 else
590 w = c;
591 *t++ = w;
592 }
593 *fn -= (char*)f - (*fb);
594 *fb = (char*)f;
595 *tn -= (n = (char*)t - (*tb));
596 *tb = (char*)t;
597 RETURN(e, n, fn);
598 }
599
600 /*
601 * convert bin to utf-8
602 */
603
604 static size_t
bin2utf(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)605 bin2utf(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
606 {
607 register unsigned char* f;
608 register unsigned char* fe;
609 register unsigned char* t;
610 register unsigned char* te;
611 register int c;
612 wchar_t w;
613 size_t n;
614 int e;
615
616 e = 0;
617 f = (unsigned char*)(*fb);
618 fe = f + (*fn);
619 t = (unsigned char*)(*tb);
620 te = t + (*tn);
621 while (f < fe && t < te)
622 {
623 if (!mbwide())
624 {
625 c = 1;
626 w = *f;
627 }
628 else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
629 {
630 e = EINVAL;
631 break;
632 }
633 else if (!c)
634 c = 1;
635 if (!(w & ~0x7F))
636 *t++ = w;
637 else
638 {
639 if (!(w & ~0x7FF))
640 {
641 if (t >= (te - 2))
642 {
643 e = E2BIG;
644 break;
645 }
646 *t++ = 0xC0 + (w >> 6);
647 }
648 else if (!(w & ~0xffff))
649 {
650 if (t >= (te - 3))
651 {
652 e = E2BIG;
653 break;
654 }
655 *t++ = 0xE0 + (w >> 12);
656 *t++ = 0x80 + ((w >> 6 ) & 0x3F);
657 }
658 else
659 {
660 e = EILSEQ;
661 break;
662 }
663 *t++ = 0x80 + (w & 0x3F);
664 }
665 f += c;
666 }
667 *fn -= (n = (char*)f - (*fb));
668 *fb = (char*)f;
669 *tn -= (char*)t - (*tb);
670 *tb = (char*)t;
671 RETURN(e, n, fn);
672 }
673
/*
 * utf-7 (ume) tables
 *
 *	ume_D	characters that bin2ume() copies unencoded
 *	ume_M	the 64 character alphabet for encoded characters
 *	ume_d	ume_d[c]!=0 if c is in ume_D
 *	ume_m	ume_m[c] is the index of c in ume_M, NOE if not in ume_M
 *
 * ume_d and ume_m are filled in by umeinit()
 */

static const unsigned char	ume_D[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?!\"#$%&*;<=>@[]^_`{|} \t\n";

static const unsigned char	ume_M[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static unsigned char		ume_d[UCHAR_MAX+1];

static unsigned char		ume_m[UCHAR_MAX+1];

/*
 * NOE marks the ume_m slots of characters not in ume_M
 * UMEINIT() calls umeinit() unless ume_d is already filled in
 */

#define NOE			0xFF
#define UMEINIT()		(ume_d[ume_D[0]]?0:umeinit())
686
687 /*
688 * initialize the ume tables
689 */
690
691 static int
umeinit(void)692 umeinit(void)
693 {
694 register const unsigned char* s;
695 register int i;
696 register int c;
697
698 if (!ume_d[ume_D[0]])
699 {
700 s = ume_D;
701 while (c = *s++)
702 ume_d[c] = 1;
703 memset(ume_m, NOE, sizeof(ume_m));
704 for (i = 0; c = ume_M[i]; i++)
705 ume_m[c] = i;
706 }
707 return 0;
708 }
709
710 /*
711 * convert utf-7 to bin
712 */
713
714 static size_t
ume2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)715 ume2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
716 {
717 register unsigned char* f;
718 register unsigned char* fe;
719 register unsigned char* t;
720 register unsigned char* te;
721 register unsigned char* p;
722 register int s;
723 register int c;
724 register int w;
725 size_t n;
726 int e;
727
728 e = 0;
729 UMEINIT();
730 f = (unsigned char*)(*fb);
731 fe = f + (*fn);
732 t = (unsigned char*)(*tb);
733 te = t + (*tn);
734 s = 0;
735 while (f < fe && t < te)
736 {
737 p = f;
738 c = *f++;
739 if (s)
740 {
741 if (c == '-' && s > 1)
742 s = 0;
743 else if ((w = ume_m[c]) == NOE)
744 {
745 s = 0;
746 *t++ = c;
747 }
748 else if (f >= (fe - 2))
749 {
750 f = p;
751 e = EINVAL;
752 break;
753 }
754 else
755 {
756 s = 2;
757 w = (w << 6) | ume_m[*f++];
758 w = (w << 6) | ume_m[*f++];
759 if (!(w & ~0xFF))
760 *t++ = w;
761 else if (t >= (te - 1))
762 {
763 f = p;
764 e = E2BIG;
765 break;
766 }
767 else
768 {
769 *t++ = (w >> 8) & 0xFF;
770 *t++ = w & 0xFF;
771 }
772 }
773 }
774 else if (c == '+')
775 s = 1;
776 else
777 *t++ = c;
778 }
779 *fn -= (char*)f - (*fb);
780 *fb = (char*)f;
781 *tn -= (n = (char*)t - (*tb));
782 *tb = (char*)t;
783 RETURN(e, n, fn);
784 }
785
786 /*
787 * convert bin to utf-7
788 */
789
790 static size_t
bin2ume(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)791 bin2ume(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
792 {
793 register unsigned char* f;
794 register unsigned char* fe;
795 register unsigned char* t;
796 register unsigned char* te;
797 register int c;
798 register int s;
799 wchar_t w;
800 size_t n;
801 int e;
802
803 e = 0;
804 UMEINIT();
805 f = (unsigned char*)(*fb);
806 fe = f + (*fn);
807 t = (unsigned char*)(*tb);
808 te = t + (*tn);
809 s = 0;
810 while (f < fe && t < (te - s))
811 {
812 if (!mbwide())
813 {
814 c = 1;
815 w = *f;
816 }
817 else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
818 {
819 e = EINVAL;
820 break;
821 }
822 else if (!c)
823 c = 1;
824 if (!(w & ~0x7F) && ume_d[w])
825 {
826 if (s)
827 {
828 s = 0;
829 *t++ = '-';
830 }
831 *t++ = w;
832 }
833 else if (t >= (te - (4 + s)))
834 {
835 e = E2BIG;
836 break;
837 }
838 else
839 {
840 if (!s)
841 {
842 s = 1;
843 *t++ = '+';
844 }
845 *t++ = ume_M[(w >> 12) & 0x3F];
846 *t++ = ume_M[(w >> 6) & 0x3F];
847 *t++ = ume_M[w & 0x3F];
848 }
849 f += c;
850 }
851 if (s)
852 *t++ = '-';
853 *fn -= (n = (char*)f - (*fb));
854 *fb = (char*)f;
855 *tn -= (char*)t - (*tb);
856 *tb = (char*)t;
857 RETURN(e, n, fn);
858 }
859
860 /*
861 * convert ucs-2 to bin with no byte swap
862 */
863
864 static size_t
ucs2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)865 ucs2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
866 {
867 register unsigned char* f;
868 register unsigned char* fe;
869 register unsigned char* t;
870 register unsigned char* te;
871 register int w;
872 size_t n;
873 int e;
874
875 e = 0;
876 f = (unsigned char*)(*fb);
877 fe = f + (*fn);
878 t = (unsigned char*)(*tb);
879 te = t + (*tn);
880 while (f < (fe - 1) && t < te)
881 {
882 w = *f++;
883 w = (w << 8) | *f++;
884 if (!(w & ~0xFF))
885 *t++ = w;
886 else if (t >= (te - 1))
887 {
888 f -= 2;
889 e = E2BIG;
890 break;
891 }
892 else
893 {
894 *t++ = (w >> 8) & 0xFF;
895 *t++ = w & 0xFF;
896 }
897 }
898 *fn -= (char*)f - (*fb);
899 *fb = (char*)f;
900 *tn -= (n = (char*)t - (*tb));
901 *tb = (char*)t;
902 RETURN(e, n, fn);
903 }
904
905 /*
906 * convert bin to ucs-2 with no byte swap
907 */
908
909 static size_t
bin2ucs(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)910 bin2ucs(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
911 {
912 register unsigned char* f;
913 register unsigned char* fe;
914 register unsigned char* t;
915 register unsigned char* te;
916 register int c;
917 wchar_t w;
918 size_t n;
919 int e;
920
921 e = 0;
922 f = (unsigned char*)(*fb);
923 fe = f + (*fn);
924 t = (unsigned char*)(*tb);
925 te = t + (*tn);
926 while (f < fe && t < (te - 1))
927 {
928 if (!mbwide())
929 {
930 c = 1;
931 w = *f;
932 }
933 if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
934 {
935 e = EINVAL;
936 break;
937 }
938 else if (!c)
939 c = 1;
940 *t++ = (w >> 8) & 0xFF;
941 *t++ = w & 0xFF;
942 f += c;
943 }
944 *fn -= (n = (char*)f - (*fb));
945 *fb = (char*)f;
946 *tn -= (char*)t - (*tb);
947 *tb = (char*)t;
948 RETURN(e, n, fn);
949 }
950
951 /*
952 * convert ucs-2 to bin with byte swap
953 */
954
955 static size_t
scu2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)956 scu2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
957 {
958 register unsigned char* f;
959 register unsigned char* fe;
960 register unsigned char* t;
961 register unsigned char* te;
962 register int w;
963 size_t n;
964 int e;
965
966 e = 0;
967 f = (unsigned char*)(*fb);
968 fe = f + (*fn);
969 t = (unsigned char*)(*tb);
970 te = t + (*tn);
971 while (f < (fe - 1) && t < te)
972 {
973 w = *f++;
974 w = w | (*f++ << 8);
975 if (!(w & ~0xFF))
976 *t++ = w;
977 else if (t >= (te - 1))
978 {
979 f -= 2;
980 e = E2BIG;
981 break;
982 }
983 else
984 {
985 *t++ = (w >> 8) & 0xFF;
986 *t++ = w & 0xFF;
987 }
988 }
989 *fn -= (char*)f - (*fb);
990 *fb = (char*)f;
991 *tn -= (n = (char*)t - (*tb));
992 *tb = (char*)t;
993 RETURN(e, n, fn);
994 }
995
996 /*
997 * convert bin to ucs-2 with byte swap
998 */
999
1000 static size_t
bin2scu(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)1001 bin2scu(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1002 {
1003 register unsigned char* f;
1004 register unsigned char* fe;
1005 register unsigned char* t;
1006 register unsigned char* te;
1007 register int c;
1008 wchar_t w;
1009 size_t n;
1010 int e;
1011
1012 e = 0;
1013 f = (unsigned char*)(*fb);
1014 fe = f + (*fn);
1015 t = (unsigned char*)(*tb);
1016 te = t + (*tn);
1017 while (f < fe && t < (te - 1))
1018 {
1019 if (!mbwide())
1020 {
1021 c = 1;
1022 w = *f;
1023 }
1024 else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
1025 {
1026 e = EINVAL;
1027 break;
1028 }
1029 else if (!c)
1030 c = 1;
1031 *t++ = w & 0xFF;
1032 *t++ = (w >> 8) & 0xFF;
1033 f += c;
1034 }
1035 *fn -= (n = (char*)f - (*fb));
1036 *fb = (char*)f;
1037 *tn -= (char*)t - (*tb);
1038 *tb = (char*)t;
1039 RETURN(e, n, fn);
1040 }
1041
1042 /*
1043 * open a character code conversion map from f to t
1044 */
1045
1046 _ast_iconv_t
_ast_iconv_open(const char * t,const char * f)1047 _ast_iconv_open(const char* t, const char* f)
1048 {
1049 register Conv_t* cc;
1050 int fc;
1051 int tc;
1052 int i;
1053
1054 char fr[64];
1055 char to[64];
1056
1057 #if DEBUG_TRACE
1058 error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s t=%s\n", __LINE__, f, t);
1059 #endif
1060 if (!t || !*t || *t == '-' && !*(t + 1) || !strcasecmp(t, name_local) || !strcasecmp(t, name_native))
1061 t = name_native;
1062 if (!f || !*f || *f == '-' && !*(f + 1) || !strcasecmp(t, name_local) || !strcasecmp(f, name_native))
1063 f = name_native;
1064
1065 /*
1066 * the ast identify is always (iconv_t)(0)
1067 */
1068
1069 if (t == f)
1070 return (iconv_t)(0);
1071 fc = _ast_iconv_name(f, fr, sizeof(fr));
1072 tc = _ast_iconv_name(t, to, sizeof(to));
1073 #if DEBUG_TRACE
1074 error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s:%s:%d t=%s:%s:%d\n", __LINE__, f, fr, fc, t, to, tc);
1075 #endif
1076 if (fc != CC_ICONV && fc == tc || streq(fr, to))
1077 return (iconv_t)(0);
1078
1079 /*
1080 * first check the free list
1081 */
1082
1083 for (i = 0; i < elementsof(freelist); i++)
1084 if ((cc = freelist[i]) && streq(to, cc->to.name) && streq(fr, cc->from.name))
1085 {
1086 freelist[i] = 0;
1087 #if _lib_iconv_open
1088 /*
1089 * reset the shift state if any
1090 */
1091
1092 if (cc->cvt != (iconv_t)(-1))
1093 iconv(cc->cvt, NiL, NiL, NiL, NiL);
1094 #endif
1095 return cc;
1096 }
1097
1098 /*
1099 * allocate a new one
1100 */
1101
1102 if (!(cc = newof(0, Conv_t, 1, strlen(to) + strlen(fr) + 2)))
1103 return (iconv_t)(-1);
1104 cc->to.name = (char*)(cc + 1);
1105 cc->from.name = strcopy(cc->to.name, to) + 1;
1106 strcpy(cc->from.name, fr);
1107 cc->cvt = (iconv_t)(-1);
1108
1109 /*
1110 * 8 bit maps are the easiest
1111 */
1112
1113 if (fc >= 0 && tc >= 0)
1114 cc->from.map = ccmap(fc, tc);
1115 #if _lib_iconv_open
1116 else if ((cc->cvt = iconv_open(to, fr)) != (iconv_t)(-1))
1117 cc->from.fun = (_ast_iconv_f)iconv;
1118 #endif
1119 #if _UWIN
1120 else if ((cc->cvt = _win_iconv_open(cc, to, fr)) != (_ast_iconv_t)(-1))
1121 cc->from.fun = (_ast_iconv_f)_win_iconv;
1122 #endif
1123 else
1124 {
1125 switch (fc)
1126 {
1127 case CC_UTF:
1128 cc->from.fun = utf2bin;
1129 break;
1130 case CC_UME:
1131 cc->from.fun = ume2bin;
1132 break;
1133 case CC_UCS:
1134 cc->from.fun = ucs2bin;
1135 break;
1136 case CC_SCU:
1137 cc->from.fun = scu2bin;
1138 break;
1139 case CC_ASCII:
1140 break;
1141 default:
1142 if (fc < 0)
1143 goto nope;
1144 cc->from.map = ccmap(fc, CC_ASCII);
1145 break;
1146 }
1147 switch (tc)
1148 {
1149 case CC_UTF:
1150 cc->to.fun = bin2utf;
1151 break;
1152 case CC_UME:
1153 cc->to.fun = bin2ume;
1154 break;
1155 case CC_UCS:
1156 cc->to.fun = bin2ucs;
1157 break;
1158 case CC_SCU:
1159 cc->to.fun = bin2scu;
1160 break;
1161 case CC_ASCII:
1162 break;
1163 default:
1164 if (tc < 0)
1165 goto nope;
1166 cc->to.map = ccmap(CC_ASCII, tc);
1167 break;
1168 }
1169 }
1170 return (iconv_t)cc;
1171 nope:
1172 return (iconv_t)(-1);
1173 }
1174
1175 /*
1176 * close a character code conversion map
1177 */
1178
1179 int
_ast_iconv_close(_ast_iconv_t cd)1180 _ast_iconv_close(_ast_iconv_t cd)
1181 {
1182 Conv_t* cc;
1183 Conv_t* oc;
1184 int i;
1185 int r = 0;
1186
1187 if (cd == (_ast_iconv_t)(-1))
1188 return -1;
1189 if (!(cc = (Conv_t*)cd))
1190 return 0;
1191
1192 /*
1193 * add to the free list
1194 */
1195
1196 i = freeindex;
1197 for (;;)
1198 {
1199 if (++ i >= elementsof(freelist))
1200 i = 0;
1201 if (!freelist[i])
1202 break;
1203 if (i == freeindex)
1204 {
1205 if (++ i >= elementsof(freelist))
1206 i = 0;
1207
1208 /*
1209 * close the oldest
1210 */
1211
1212 if (oc = freelist[i])
1213 {
1214 #if _lib_iconv_open
1215 if (oc->cvt != (iconv_t)(-1))
1216 r = iconv_close(oc->cvt);
1217 #endif
1218 if (oc->buf)
1219 free(oc->buf);
1220 free(oc);
1221 }
1222 break;
1223 }
1224 }
1225 freelist[freeindex = i] = cc;
1226 return r;
1227 }
1228
1229 /*
1230 * copy *fb size *fn to *tb size *tn
1231 * fb,fn tb,tn updated on return
1232 */
1233
1234 size_t
_ast_iconv(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)1235 _ast_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1236 {
1237 Conv_t* cc = (Conv_t*)cd;
1238 register unsigned char* f;
1239 register unsigned char* t;
1240 register unsigned char* e;
1241 register const unsigned char* m;
1242 register size_t n;
1243 char* b;
1244 char* tfb;
1245 size_t tfn;
1246 size_t i;
1247
1248 if (!fb || !*fb)
1249 {
1250 /* TODO: reset to the initial state */
1251 if (!tb || !*tb)
1252 return 0;
1253 /* TODO: write the initial state shift sequence */
1254 return 0;
1255 }
1256 n = *tn;
1257 if (cc)
1258 {
1259 if (cc->from.fun)
1260 {
1261 if (cc->to.fun)
1262 {
1263 if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1264 {
1265 errno = ENOMEM;
1266 return -1;
1267 }
1268 b = cc->buf;
1269 i = cc->size;
1270 tfb = *fb;
1271 tfn = *fn;
1272 if ((*cc->from.fun)(cc->cvt, &tfb, &tfn, &b, &i) == (size_t)(-1))
1273 return -1;
1274 tfn = b - cc->buf;
1275 tfb = cc->buf;
1276 n = (*cc->to.fun)(cc->cvt, &tfb, &tfn, tb, tn);
1277 i = tfb - cc->buf;
1278 *fb += i;
1279 *fn -= i;
1280 return n;
1281 }
1282 if ((*cc->from.fun)(cc->cvt, fb, fn, tb, tn) == (size_t)(-1))
1283 return -1;
1284 n -= *tn;
1285 if (m = cc->to.map)
1286 {
1287 e = (unsigned char*)(*tb);
1288 for (t = e - n; t < e; t++)
1289 *t = m[*t];
1290 }
1291 return n;
1292 }
1293 else if (cc->to.fun)
1294 {
1295 if (!(m = cc->from.map))
1296 return (*cc->to.fun)(cc->cvt, fb, fn, tb, tn);
1297 if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1298 {
1299 errno = ENOMEM;
1300 return -1;
1301 }
1302 if ((n = *fn) > cc->size)
1303 n = cc->size;
1304 f = (unsigned char*)(*fb);
1305 e = f + n;
1306 t = (unsigned char*)(b = cc->buf);
1307 while (f < e)
1308 *t++ = m[*f++];
1309 n = (*cc->to.fun)(cc->cvt, &b, fn, tb, tn);
1310 *fb += b - cc->buf;
1311 return n;
1312 }
1313 }
1314 if (n > *fn)
1315 n = *fn;
1316 if (cc && (m = cc->from.map))
1317 {
1318 f = (unsigned char*)(*fb);
1319 e = f + n;
1320 t = (unsigned char*)(*tb);
1321 while (f < e)
1322 *t++ = m[*f++];
1323 }
1324 else
1325 memcpy(*tb, *fb, n);
1326 *fb += n;
1327 *fn -= n;
1328 *tb += n;
1329 *tn -= n;
1330 return n;
1331 }
1332
1333 /*
1334 * write *fb size *fn to op
1335 * fb,fn updated on return
1336 * total bytes written to op returned
1337 */
1338
1339 ssize_t
_ast_iconv_write(_ast_iconv_t cd,Sfio_t * op,char ** fb,size_t * fn,size_t * e)1340 _ast_iconv_write(_ast_iconv_t cd, Sfio_t* op, char** fb, size_t* fn, size_t* e)
1341 {
1342 char* tb;
1343 char* ts;
1344 size_t tn;
1345 size_t r;
1346
1347 r = 0;
1348 tn = 0;
1349 while (*fn > 0)
1350 {
1351 if (!(tb = (char*)sfreserve(op, -(tn + 1), SF_WRITE|SF_LOCKR)))
1352 return r ? r : -1;
1353 ts = tb;
1354 tn = sfvalue(op);
1355 #if DEBUG_TRACE
1356 error(DEBUG_TRACE, "AHA#%d iconv_write ts=%p tn=%d", __LINE__, ts, tn);
1357 for (;;)
1358 #else
1359 while (_ast_iconv(cd, fb, fn, &ts, &tn) == (size_t)(-1))
1360 #endif
1361 {
1362 #if DEBUG_TRACE
1363 ssize_t _r;
1364 error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d `%-.*s'", __LINE__, *fn, tn, *fn, *fb);
1365 _r = _ast_iconv(cd, fb, fn, &ts, &tn);
1366 error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d [%d]", __LINE__, *fn, tn, _r);
1367 if (_r != (size_t)(-1))
1368 break;
1369 #endif
1370 if (errno == E2BIG)
1371 break;
1372 if (e)
1373 (*e)++;
1374 if (!tn)
1375 break;
1376 *ts++ = *(*fb)++;
1377 tn--;
1378 (*fn)--;
1379 }
1380 #if DEBUG_TRACE
1381 error(DEBUG_TRACE, "AHA#%d iconv_write %d", __LINE__, ts - tb);
1382 #endif
1383
1384 sfwrite(op, tb, ts - tb);
1385 r += ts - tb;
1386 }
1387 return r;
1388 }
1389
1390 /*
1391 * move n bytes from ip to op
1392 */
1393
1394 ssize_t
_ast_iconv_move(_ast_iconv_t cd,Sfio_t * ip,Sfio_t * op,size_t n,size_t * e)1395 _ast_iconv_move(_ast_iconv_t cd, Sfio_t* ip, Sfio_t* op, size_t n, size_t* e)
1396 {
1397 char* fb;
1398 char* fs;
1399 char* tb;
1400 char* ts;
1401 size_t fn;
1402 size_t fo;
1403 size_t tn;
1404 size_t i;
1405 ssize_t r = 0;
1406 int locked;
1407
1408 fn = n;
1409 for (;;)
1410 {
1411 if (fn != SF_UNBOUND)
1412 fn = -((ssize_t)(fn & (((size_t)(~0))>>1)));
1413 if (!(fb = (char*)sfreserve(ip, fn, locked = SF_LOCKR)) &&
1414 !(fb = (char*)sfreserve(ip, fn, locked = 0)))
1415 break;
1416 fs = fb;
1417 fn = fo = sfvalue(ip);
1418 if (!(tb = (char*)sfreserve(op, SF_UNBOUND, SF_WRITE|SF_LOCKR)))
1419 {
1420 sfread(ip, fb, 0);
1421 return r ? r : -1;
1422 }
1423 ts = tb;
1424 tn = sfvalue(op);
1425 while (_ast_iconv(cd, &fs, &fn, &ts, &tn) != (size_t)(-1) && fn > 0)
1426 {
1427 if (tn > 0)
1428 {
1429 *ts++ = '_';
1430 tn--;
1431 }
1432 if (e)
1433 (*e)++;
1434 fs++;
1435 fn--;
1436 }
1437 sfwrite(op, tb, ts - tb);
1438 r += ts - tb;
1439 if (locked)
1440 sfread(ip, fb, fs - fb);
1441 else
1442 for (i = fn; --i >= (fs - fb);)
1443 sfungetc(ip, fb[i]);
1444 if (n != SF_UNBOUND)
1445 {
1446 if (n <= (fs - fb))
1447 break;
1448 n -= fs - fb;
1449 }
1450 if (fn == fo)
1451 fn++;
1452 }
1453 return r;
1454 }
1455
1456 /*
1457 * iconv_list_t iterator
1458 * call with arg 0 to start
1459 * prev return value is current arg
1460 */
1461
1462 _ast_iconv_list_t*
_ast_iconv_list(_ast_iconv_list_t * cp)1463 _ast_iconv_list(_ast_iconv_list_t* cp)
1464 {
1465 #if _UWIN
1466 struct dirent* ent;
1467
1468 if (!cp)
1469 {
1470 if (!(cp = newof(0, _ast_iconv_list_t, 1, 0)))
1471 return ccmaplist(NiL);
1472 if (!(cp->data = opendir(_win_maps)))
1473 {
1474 free(cp);
1475 return ccmaplist(NiL);
1476 }
1477 }
1478 if (cp->data)
1479 {
1480 if (ent = readdir((DIR*)cp->data))
1481 {
1482 cp->name = cp->match = cp->desc = (const char*)ent->d_name;
1483 return cp;
1484 }
1485 closedir((DIR*)cp->data);
1486 free(cp);
1487 return ccmaplist(NiL);
1488 }
1489 #else
1490 if (!cp)
1491 return ccmaplist(NiL);
1492 #endif
1493 if (cp->ccode >= 0)
1494 return (cp = ccmaplist(cp)) ? cp : (_ast_iconv_list_t*)codes;
1495 return (++cp)->name ? cp : (_ast_iconv_list_t*)0;
1496 }
1497