xref: /titanic_41/usr/src/lib/libast/common/comp/iconv.c (revision 3e14f97f673e8a630f076077de35afdd43dc1587)
1 /***********************************************************************
2 *                                                                      *
3 *               This software is part of the ast package               *
4 *          Copyright (c) 1985-2010 AT&T Intellectual Property          *
5 *                      and is licensed under the                       *
6 *                  Common Public License, Version 1.0                  *
7 *                    by AT&T Intellectual Property                     *
8 *                                                                      *
9 *                A copy of the License is available at                 *
10 *            http://www.opensource.org/licenses/cpl1.0.txt             *
11 *         (with md5 checksum 059e8cd6165cb4c31e351f2b69388fd9)         *
12 *                                                                      *
13 *              Information and Software Systems Research               *
14 *                            AT&T Research                             *
15 *                           Florham Park NJ                            *
16 *                                                                      *
17 *                 Glenn Fowler <gsf@research.att.com>                  *
18 *                  David Korn <dgk@research.att.com>                   *
19 *                   Phong Vo <kpv@research.att.com>                    *
20 *                                                                      *
21 ***********************************************************************/
22 #pragma prototyped
23 
24 /*
25  * Glenn Fowler
26  * AT&T Research
27  *
28  * iconv intercept
29  * minimally provides { utf*<=>bin ascii<=>ebcdic* }
30  */
31 
32 #include <ast.h>
33 #include <dirent.h>
34 
35 #define DEBUG_TRACE		0
36 #define _ICONV_LIST_PRIVATE_
37 
38 #include <ccode.h>
39 #include <ctype.h>
40 #include <iconv.h>
41 
42 #include "lclib.h"
43 
#if !_lib_iconv_open

/*
 * no native iconv_open(): the _ast_ prefixed names used in this file
 * are mapped directly onto the public iconv names
 */

#define _ast_iconv_t		iconv_t
#define _ast_iconv_f		iconv_f
#define _ast_iconv_list_t	iconv_list_t
#define _ast_iconv_open		iconv_open
#define _ast_iconv		iconv
#define _ast_iconv_close	iconv_close
#define _ast_iconv_list		iconv_list
#define _ast_iconv_move		iconv_move
#define _ast_iconv_name		iconv_name
#define _ast_iconv_write	iconv_write

#endif

/*
 * substitute errno values for systems that lack the iconv specific ones
 */

#ifndef E2BIG
#define E2BIG			ENOMEM
#endif
#ifndef EILSEQ
#define EILSEQ			EIO
#endif

/*
 * common exit for the converters in this file
 *
 *	e	error code lvalue, 0 if no error was detected
 *	n	value to return on success
 *	fn	pointer to the count of unconverted source bytes
 *
 * source bytes left over with no other error is reported as E2BIG
 * on error errno is set and (size_t)(-1) is returned
 * the arguments are parenthesized so that expressions may be passed
 * the expansion always returns from the calling function, so it must be
 * the last statement of the function and must not be the unbraced body
 * of an if/else
 */

#define RETURN(e,n,fn) \
	if (*(fn) && !(e)) (e) = E2BIG; \
	if (e) { errno = (e); return (size_t)(-1); } \
	return (n);
70 
71 typedef struct Map_s
72 {
73 	char*			name;
74 	const unsigned char*	map;
75 	_ast_iconv_f		fun;
76 	int			index;
77 } Map_t;
78 
79 typedef struct Conv_s
80 {
81 	iconv_t			cvt;
82 	char*			buf;
83 	size_t			size;
84 	Map_t			from;
85 	Map_t			to;
86 } Conv_t;
87 
88 static Conv_t*			freelist[4];
89 static int			freeindex;
90 
91 static const char		name_local[] = "local";
92 static const char		name_native[] = "native";
93 
94 static const _ast_iconv_list_t	codes[] =
95 {
96 	{
97 	"utf",
98 	"un|unicode|utf",
99 	"multibyte 8-bit unicode",
100 	"UTF-%s",
101 	"8",
102 	CC_UTF,
103 	},
104 
105 	{
106 	"ume",
107 	"um|ume|utf?(-)7",
108 	"multibyte 7-bit unicode",
109 	"UTF-7",
110 	0,
111 	CC_UME,
112 	},
113 
114 	{
115 	"euc",
116 	"(big|euc)*",
117 	"euc family",
118 	0,
119 	0,
120 	CC_ICONV,
121 	},
122 
123 	{
124 	"dos",
125 	"dos?(-)?(855)",
126 	"dos code page",
127 	"DOS855",
128 	0,
129 	CC_ICONV,
130 	},
131 
132 	{
133 	"ucs",
134 	"ucs?(-)?(2)?(be)|utf-16?(be)",
135 	"unicode runes",
136 	"UCS-%s",
137 	"2",
138 	CC_UCS,
139 	},
140 
141 	{
142 	"ucs-le",
143 	"ucs?(-)?(2)le|utf-16le",
144 	"little endian unicode runes",
145 	"UCS-%sLE",
146 	"2",
147 	CC_SCU,
148 	},
149 
150 	{ 0 },
151 };
152 
153 #if _UWIN
154 
155 #include <ast_windows.h>
156 
/*
 * code page index used here for ucs-2 when the headers do not supply one
 */

#ifndef CP_UCS2
#define CP_UCS2	0x0000
#endif

/*
 * directory with one entry per charset name or alias
 */

static char	_win_maps[] = "/reg/local_machine/SOFTWARE/Classes/MIME/Database/Charset";
162 
163 /*
164  * return the codeset index given its name or alias
165  * the map is in the what? oh, the registry
166  */
167 
168 static int
_win_codeset(const char * name)169 _win_codeset(const char* name)
170 {
171 	register char*	s;
172 	char*		e;
173 	int		n;
174 	Sfio_t*		sp;
175 	char		aka[128];
176 	char		tmp[128];
177 
178 #if DEBUG_TRACE
179 error(DEBUG_TRACE, "AHA#%d _win_codeset name=%s", __LINE__, name);
180 #endif
181 	if (name == name_native)
182 		return CP_ACP;
183 	if (!strcasecmp(name, "utf") || !strcasecmp(name, "utf8") || !strcasecmp(name, "utf-8"))
184 		return CP_UTF8;
185 	if (!strcasecmp(name, "ucs") || !strcasecmp(name, "ucs2") || !strcasecmp(name, "ucs-2"))
186 		return CP_UCS2;
187 	if (name[0] == '0' && name[1] == 'x' && (n = strtol(name, &e, 0)) > 0 && !*e)
188 		return n;
189 	for (;;)
190 	{
191 		sfsprintf(tmp, sizeof(tmp), "%s/%s", _win_maps, name);
192 		if (!(sp = sfopen(0, tmp, "r")))
193 		{
194 			s = (char*)name;
195 			if ((s[0] == 'c' || s[0] == 'C') && (s[1] == 'p' || s[1] == 'P'))
196 				s += 2;
197 			if (!isdigit(s[0]))
198 				break;
199 			sfsprintf(tmp, sizeof(tmp), "%s/windows-%s", _win_maps, s);
200 			if (!(sp = sfopen(0, tmp, "r")))
201 				break;
202 		}
203 		for (;;)
204 		{
205 			if (!(s = sfgetr(sp, '\n', 0)))
206 			{
207 				sfclose(sp);
208 				return -1;
209 			}
210 			if (!strncasecmp(s, "AliasForCharSet=", 16))
211 			{
212 				n = sfvalue(sp) - 17;
213 				s += 16;
214 				if (n >= sizeof(aka))
215 					n = sizeof(aka) - 1;
216 				memcpy(aka, s, n);
217 				aka[n] = 0;
218 				sfclose(sp);
219 				name = (const char*)aka;
220 				break;
221 			}
222 			if (!strncasecmp(s, "CodePage=", 9))
223 			{
224 				s += 9;
225 				n = strtol(s, 0, 0);
226 				sfclose(sp);
227 				return n;
228 			}
229 		}
230 	}
231 	return -1;
232 }
233 
234 /*
235  * get and check the codeset indices
236  */
237 
238 static _ast_iconv_t
_win_iconv_open(register Conv_t * cc,const char * t,const char * f)239 _win_iconv_open(register Conv_t* cc, const char* t, const char* f)
240 {
241 #if DEBUG_TRACE
242 error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=%s t=%s\n", __LINE__, f, t);
243 #endif
244 	if ((cc->from.index = _win_codeset(f)) < 0)
245 		return (_ast_iconv_t)(-1);
246 	if ((cc->to.index = _win_codeset(t)) < 0)
247 		return (_ast_iconv_t)(-1);
248 #if DEBUG_TRACE
249 error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=0x%04x t=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
250 #endif
251 	return (_ast_iconv_t)cc;
252 }
253 
254 /*
255  * even though the indices already check out
256  * they could still be rejected
257  */
258 
259 static size_t
_win_iconv(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)260 _win_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
261 {
262 	Conv_t*	cc = (Conv_t*)cd;
263 	size_t	un;
264 	size_t	tz;
265 	size_t	fz;
266 	size_t	bz;
267 	size_t	pz;
268 	size_t	oz;
269 	LPWSTR	ub;
270 
271 #if DEBUG_TRACE
272 error(DEBUG_TRACE, "AHA#%d _win_iconv from=0x%04x to=0x%04x\n", __LINE__, cc->from.index, cc->to.index);
273 #endif
274 	if (cc->from.index == cc->to.index)
275 	{
276 		/*
277 		 * easy
278 		 */
279 
280 		fz = tz = (*fn < *tn) ? *fn : *tn;
281 		memcpy(*tb, *fb, fz);
282 	}
283 	else
284 	{
285 		ub = 0;
286 		un = *fn;
287 
288 		/*
289 		 * from => ucs-2
290 		 */
291 
292 		if (cc->to.index == CP_UCS2)
293 		{
294 			if ((tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, *tn)) && tz <= *tn)
295 			{
296 				fz = *fn;
297 				tz *= sizeof(WCHAR);
298 			}
299 			else
300 			{
301 				/*
302 				 * target too small
303 				 * binary search on input size to make it fit
304 				 */
305 
306 				oz = 0;
307 				pz = *fn / 2;
308 				fz = *fn - pz;
309 				for (;;)
310 				{
311 					while (!(tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)fz, (LPWSTR)*tb, 0)))
312 						if (++fz >= *fn)
313 							goto nope;
314 					tz *= sizeof(WCHAR);
315 					if (tz == *tn)
316 						break;
317 					if (!(pz /= 2))
318 					{
319 						if (!(fz = oz))
320 							goto nope;
321 						break;
322 					}
323 					if (tz > *tn)
324 						fz -= pz;
325 					else
326 					{
327 						oz = fz;
328 						fz += pz;
329 					}
330 				}
331 			}
332 		}
333 		else
334 		{
335 			if (cc->from.index == CP_UCS2)
336 			{
337 				un = *fn / sizeof(WCHAR);
338 				ub = (LPWSTR)*fb;
339 			}
340 			else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, 0)))
341 				goto nope;
342 			else if (!(ub = (LPWSTR)malloc(un * sizeof(WCHAR))))
343 				goto nope;
344 			else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)ub, un)))
345 				goto nope;
346 
347 			/*
348 			 * ucs-2 => to
349 			 */
350 
351 			if (tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, un, *tb, *tn, 0, 0))
352 				fz = *fn;
353 			else
354 			{
355 				/*
356 				 * target too small
357 				 * binary search on input size to make it fit
358 				 */
359 
360 				oz = 0;
361 				pz = *fn / 2;
362 				bz = *fn - pz;
363 				for (;;)
364 				{
365 					while (!(fz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)bz, (LPWSTR)ub, un)))
366 						if (++bz > *fn)
367 							goto nope;
368 					if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, 0, 0, 0)))
369 						goto nope;
370 					if (tz == *tn)
371 						break;
372 					if (!(pz /= 2))
373 					{
374 						if (!(fz = oz))
375 							goto nope;
376 						break;
377 					}
378 					if (tz > *tn)
379 						bz -= pz;
380 					else
381 					{
382 						oz = bz;
383 						bz += pz;
384 					}
385 				}
386 				if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, tz, 0, 0)))
387 					goto nope;
388 #if DEBUG_TRACE
389 error(DEBUG_TRACE, "AHA#%d _win_iconv *fn=%u fz=%u[%u] *tn=%u tz=%u\n", __LINE__, *fn, fz, fz * sizeof(WCHAR), *tn, tz);
390 #endif
391 #if 0
392 				fz *= sizeof(WCHAR);
393 #endif
394 			}
395 			if (ub != (LPWSTR)*fb)
396 				free(ub);
397 		}
398 	}
399 	*fb += fz;
400 	*fn -= fz;
401 	*tb += tz;
402 	*tn -= tz;
403 	return fz;
404  nope:
405 	if (ub && ub != (LPWSTR)*fb)
406 		free(ub);
407 	errno = EINVAL;
408 	return (size_t)(-1);
409 }
410 
411 #endif
412 
413 /*
414  * return canonical character code set name for m
415  * if b!=0 then canonical name placed in b of size n
416  * <ccode.h> index returned
417  */
418 
419 int
_ast_iconv_name(register const char * m,register char * b,size_t n)420 _ast_iconv_name(register const char* m, register char* b, size_t n)
421 {
422 	register const _ast_iconv_list_t*	cp;
423 	const _ast_iconv_list_t*		bp;
424 	register int				c;
425 	register char*				e;
426 	int					sub[2];
427 	char					buf[16];
428 #if DEBUG_TRACE
429 	char*					o;
430 #endif
431 
432 	if (!b)
433 	{
434 		b = buf;
435 		n = sizeof(buf);
436 	}
437 #if DEBUG_TRACE
438 	o = b;
439 #endif
440 	e = b + n - 1;
441 	bp = 0;
442 	n = 0;
443 	cp = ccmaplist(NiL);
444 #if DEBUG_TRACE
445 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name m=\"%s\"\n", error_info.id, error_info.trace, __LINE__, m);
446 #endif
447 	for (;;)
448 	{
449 #if DEBUG_TRACE
450 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name n=%d bp=%p cp=%p ccode=%d name=\"%s\"\n", error_info.id, error_info.trace, __LINE__, n, bp, cp, cp->ccode, cp->name);
451 #endif
452 		if (strgrpmatch(m, cp->match, sub, elementsof(sub) / 2, STR_MAXIMAL|STR_LEFT|STR_ICASE))
453 		{
454 			if (!(c = m[sub[1]]))
455 			{
456 				bp = cp;
457 				break;
458 			}
459 			if (sub[1] > n && !isalpha(c))
460 			{
461 				bp = cp;
462 				n = sub[1];
463 			}
464 		}
465 		if (cp->ccode < 0)
466 		{
467 			if (!(++cp)->name)
468 				break;
469 		}
470 		else if (!(cp = (const _ast_iconv_list_t*)ccmaplist((_ast_iconv_list_t*)cp)))
471 			cp = codes;
472 	}
473 	if (cp = bp)
474 	{
475 		if (cp->canon)
476 		{
477 			if (cp->index)
478 			{
479 				for (m += sub[1]; *m && !isalnum(*m); m++);
480 				if (!isdigit(*m))
481 					m = cp->index;
482 			}
483 			else
484 				m = "1";
485 			b += sfsprintf(b, e - b, cp->canon, m);
486 		}
487 		else if (cp->ccode == CC_NATIVE)
488 		{
489 			if ((locales[AST_LC_CTYPE]->flags & LC_default) || !locales[AST_LC_CTYPE]->charset || !(m = locales[AST_LC_CTYPE]->charset->code) || streq(m, "iso8859-1"))
490 				switch (CC_NATIVE)
491 				{
492 				case CC_EBCDIC:
493 					m = (const char*)"EBCDIC";
494 					break;
495 				case CC_EBCDIC_I:
496 					m = (const char*)"EBCDIC-I";
497 					break;
498 				case CC_EBCDIC_O:
499 					m = (const char*)"EBCDIC-O";
500 					break;
501 				default:
502 					m = (const char*)"ISO-8859-1";
503 					break;
504 				}
505 			b += sfsprintf(b, e - b, "%s", m);
506 		}
507 		*b = 0;
508 #if DEBUG_TRACE
509 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, cp->ccode, o);
510 #endif
511 		return cp->ccode;
512 	}
513 	while (b < e && (c = *m++))
514 	{
515 		if (islower(c))
516 			c = toupper(c);
517 		*b++ = c;
518 	}
519 	*b = 0;
520 #if DEBUG_TRACE
521 if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, CC_ICONV, o);
522 #endif
523 	return CC_ICONV;
524 }
525 
526 /*
527  * convert utf-8 to bin
528  */
529 
530 static size_t
utf2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)531 utf2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
532 {
533 	register unsigned char*		f;
534 	register unsigned char*		fe;
535 	register unsigned char*		t;
536 	register unsigned char*		te;
537 	register unsigned char*		p;
538 	register int			c;
539 	register int			w;
540 	size_t				n;
541 	int				e;
542 
543 	e = 0;
544 	f = (unsigned char*)(*fb);
545 	fe = f + (*fn);
546 	t = (unsigned char*)(*tb);
547 	te = t + (*tn);
548 	while (t < te && f < fe)
549 	{
550 		p = f;
551 		c = *f++;
552 		if (c & 0x80)
553 		{
554 			if (!(c & 0x40))
555 			{
556 				f = p;
557 				e = EILSEQ;
558 				break;
559 			}
560 			if (c & 0x20)
561 			{
562 				w = (c & 0x0F) << 12;
563 				if (f >= fe)
564 				{
565 					f = p;
566 					e = EINVAL;
567 					break;
568 				}
569 				c = *f++;
570 				if (c & 0x40)
571 				{
572 					f = p;
573 					e = EILSEQ;
574 					break;
575 				}
576 				w |= (c & 0x3F) << 6;
577 			}
578 			else
579 				w = (c & 0x1F) << 6;
580 			if (f >= fe)
581 			{
582 				f = p;
583 				e = EINVAL;
584 				break;
585 			}
586 			c = *f++;
587 			w |= (c & 0x3F);
588 		}
589 		else
590 			w = c;
591 		*t++ = w;
592 	}
593 	*fn -= (char*)f - (*fb);
594 	*fb = (char*)f;
595 	*tn -= (n = (char*)t - (*tb));
596 	*tb = (char*)t;
597 	RETURN(e, n, fn);
598 }
599 
600 /*
601  * convert bin to utf-8
602  */
603 
604 static size_t
bin2utf(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)605 bin2utf(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
606 {
607 	register unsigned char*		f;
608 	register unsigned char*		fe;
609 	register unsigned char*		t;
610 	register unsigned char*		te;
611 	register int			c;
612 	wchar_t				w;
613 	size_t				n;
614 	int				e;
615 
616 	e = 0;
617 	f = (unsigned char*)(*fb);
618 	fe = f + (*fn);
619 	t = (unsigned char*)(*tb);
620 	te = t + (*tn);
621 	while (f < fe && t < te)
622 	{
623 		if (!mbwide())
624 		{
625 			c = 1;
626 			w = *f;
627 		}
628 		else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
629 		{
630 			e = EINVAL;
631 			break;
632 		}
633 		else if (!c)
634 			c = 1;
635 		if (!(w & ~0x7F))
636 			*t++ = w;
637 		else
638 		{
639 			if (!(w & ~0x7FF))
640 			{
641 				if (t >= (te - 2))
642 				{
643 					e = E2BIG;
644 					break;
645 				}
646 				*t++ = 0xC0 + (w >> 6);
647 			}
648 			else if (!(w & ~0xffff))
649 			{
650 				if (t >= (te - 3))
651 				{
652 					e = E2BIG;
653 					break;
654 				}
655 				*t++ = 0xE0 + (w >> 12);
656 				*t++ = 0x80 + ((w >> 6 ) & 0x3F);
657 			}
658 			else
659 			{
660 				e = EILSEQ;
661 				break;
662 			}
663 			*t++ = 0x80 + (w & 0x3F);
664 		}
665 		f += c;
666 	}
667 	*fn -= (n = (char*)f - (*fb));
668 	*fb = (char*)f;
669 	*tn -= (char*)t - (*tb);
670 	*tb = (char*)t;
671 	RETURN(e, n, fn);
672 }
673 
/*
 * utf-7 (ume) tables
 *
 *	ume_D	characters that bin2ume() stores as is
 *	ume_M	the 64 characters used for shifted 6 bit values
 *	ume_d	ume_D membership indexed by character
 *	ume_m	6 bit value indexed by ume_M character, NOE if none
 *
 * ume_d and ume_m are filled in by umeinit()
 */

static const unsigned char	ume_D[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?!\"#$%&*;<=>@[]^_`{|} \t\n";

static const unsigned char	ume_M[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static unsigned char		ume_d[UCHAR_MAX+1];

static unsigned char		ume_m[UCHAR_MAX+1];

/*
 * NOE marks the ume_m slots that are not ume_M characters
 * UMEINIT() calls umeinit() unless the tables are already set
 */

#define NOE			0xFF
#define UMEINIT()		(ume_d[ume_D[0]]?0:umeinit())
686 
687 /*
688  * initialize the ume tables
689  */
690 
691 static int
umeinit(void)692 umeinit(void)
693 {
694 	register const unsigned char*	s;
695 	register int			i;
696 	register int			c;
697 
698 	if (!ume_d[ume_D[0]])
699 	{
700 		s = ume_D;
701 		while (c = *s++)
702 			ume_d[c] = 1;
703 		memset(ume_m, NOE, sizeof(ume_m));
704 		for (i = 0; c = ume_M[i]; i++)
705 			ume_m[c] = i;
706 	}
707 	return 0;
708 }
709 
710 /*
711  * convert utf-7 to bin
712  */
713 
714 static size_t
ume2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)715 ume2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
716 {
717 	register unsigned char*		f;
718 	register unsigned char*		fe;
719 	register unsigned char*		t;
720 	register unsigned char*		te;
721 	register unsigned char*		p;
722 	register int			s;
723 	register int			c;
724 	register int			w;
725 	size_t				n;
726 	int				e;
727 
728 	e = 0;
729 	UMEINIT();
730 	f = (unsigned char*)(*fb);
731 	fe = f + (*fn);
732 	t = (unsigned char*)(*tb);
733 	te = t + (*tn);
734 	s = 0;
735 	while (f < fe && t < te)
736 	{
737 		p = f;
738 		c = *f++;
739 		if (s)
740 		{
741 			if (c == '-' && s > 1)
742 				s = 0;
743 			else if ((w = ume_m[c]) == NOE)
744 			{
745 				s = 0;
746 				*t++ = c;
747 			}
748 			else if (f >= (fe - 2))
749 			{
750 				f = p;
751 				e = EINVAL;
752 				break;
753 			}
754 			else
755 			{
756 				s = 2;
757 				w = (w << 6) | ume_m[*f++];
758 				w = (w << 6) | ume_m[*f++];
759 				if (!(w & ~0xFF))
760 					*t++ = w;
761 				else if (t >= (te - 1))
762 				{
763 					f = p;
764 					e = E2BIG;
765 					break;
766 				}
767 				else
768 				{
769 					*t++ = (w >> 8) & 0xFF;
770 					*t++ = w & 0xFF;
771 				}
772 			}
773 		}
774 		else if (c == '+')
775 			s = 1;
776 		else
777 			*t++ = c;
778 	}
779 	*fn -= (char*)f - (*fb);
780 	*fb = (char*)f;
781 	*tn -= (n = (char*)t - (*tb));
782 	*tb = (char*)t;
783 	RETURN(e, n, fn);
784 }
785 
786 /*
787  * convert bin to utf-7
788  */
789 
790 static size_t
bin2ume(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)791 bin2ume(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
792 {
793 	register unsigned char*		f;
794 	register unsigned char*		fe;
795 	register unsigned char*		t;
796 	register unsigned char*		te;
797 	register int			c;
798 	register int			s;
799 	wchar_t				w;
800 	size_t				n;
801 	int				e;
802 
803 	e = 0;
804 	UMEINIT();
805 	f = (unsigned char*)(*fb);
806 	fe = f + (*fn);
807 	t = (unsigned char*)(*tb);
808 	te = t + (*tn);
809 	s = 0;
810 	while (f < fe && t < (te - s))
811 	{
812 		if (!mbwide())
813 		{
814 			c = 1;
815 			w = *f;
816 		}
817 		else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
818 		{
819 			e = EINVAL;
820 			break;
821 		}
822 		else if (!c)
823 			c = 1;
824 		if (!(w & ~0x7F) && ume_d[w])
825 		{
826 			if (s)
827 			{
828 				s = 0;
829 				*t++ = '-';
830 			}
831 			*t++ = w;
832 		}
833 		else if (t >= (te - (4 + s)))
834 		{
835 			e = E2BIG;
836 			break;
837 		}
838 		else
839 		{
840 			if (!s)
841 			{
842 				s = 1;
843 				*t++ = '+';
844 			}
845 			*t++ = ume_M[(w >> 12) & 0x3F];
846 			*t++ = ume_M[(w >> 6) & 0x3F];
847 			*t++ = ume_M[w & 0x3F];
848 		}
849 		f += c;
850 	}
851 	if (s)
852 		*t++ = '-';
853 	*fn -= (n = (char*)f - (*fb));
854 	*fb = (char*)f;
855 	*tn -= (char*)t - (*tb);
856 	*tb = (char*)t;
857 	RETURN(e, n, fn);
858 }
859 
860 /*
861  * convert ucs-2 to bin with no byte swap
862  */
863 
864 static size_t
ucs2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)865 ucs2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
866 {
867 	register unsigned char*		f;
868 	register unsigned char*		fe;
869 	register unsigned char*		t;
870 	register unsigned char*		te;
871 	register int			w;
872 	size_t				n;
873 	int				e;
874 
875 	e = 0;
876 	f = (unsigned char*)(*fb);
877 	fe = f + (*fn);
878 	t = (unsigned char*)(*tb);
879 	te = t + (*tn);
880 	while (f < (fe - 1) && t < te)
881 	{
882 		w = *f++;
883 		w = (w << 8) | *f++;
884 		if (!(w & ~0xFF))
885 			*t++ = w;
886 		else if (t >= (te - 1))
887 		{
888 			f -= 2;
889 			e = E2BIG;
890 			break;
891 		}
892 		else
893 		{
894 			*t++ = (w >> 8) & 0xFF;
895 			*t++ = w & 0xFF;
896 		}
897 	}
898 	*fn -= (char*)f - (*fb);
899 	*fb = (char*)f;
900 	*tn -= (n = (char*)t - (*tb));
901 	*tb = (char*)t;
902 	RETURN(e, n, fn);
903 }
904 
905 /*
906  * convert bin to ucs-2 with no byte swap
907  */
908 
909 static size_t
bin2ucs(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)910 bin2ucs(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
911 {
912 	register unsigned char*		f;
913 	register unsigned char*		fe;
914 	register unsigned char*		t;
915 	register unsigned char*		te;
916 	register int			c;
917 	wchar_t				w;
918 	size_t				n;
919 	int				e;
920 
921 	e = 0;
922 	f = (unsigned char*)(*fb);
923 	fe = f + (*fn);
924 	t = (unsigned char*)(*tb);
925 	te = t + (*tn);
926 	while (f < fe && t < (te - 1))
927 	{
928 		if (!mbwide())
929 		{
930 			c = 1;
931 			w = *f;
932 		}
933 		if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
934 		{
935 			e = EINVAL;
936 			break;
937 		}
938 		else if (!c)
939 			c = 1;
940 		*t++ = (w >> 8) & 0xFF;
941 		*t++ = w & 0xFF;
942 		f += c;
943 	}
944 	*fn -= (n = (char*)f - (*fb));
945 	*fb = (char*)f;
946 	*tn -= (char*)t - (*tb);
947 	*tb = (char*)t;
948 	RETURN(e, n, fn);
949 }
950 
951 /*
952  * convert ucs-2 to bin with byte swap
953  */
954 
955 static size_t
scu2bin(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)956 scu2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
957 {
958 	register unsigned char*		f;
959 	register unsigned char*		fe;
960 	register unsigned char*		t;
961 	register unsigned char*		te;
962 	register int			w;
963 	size_t				n;
964 	int				e;
965 
966 	e = 0;
967 	f = (unsigned char*)(*fb);
968 	fe = f + (*fn);
969 	t = (unsigned char*)(*tb);
970 	te = t + (*tn);
971 	while (f < (fe - 1) && t < te)
972 	{
973 		w = *f++;
974 		w = w | (*f++ << 8);
975 		if (!(w & ~0xFF))
976 			*t++ = w;
977 		else if (t >= (te - 1))
978 		{
979 			f -= 2;
980 			e = E2BIG;
981 			break;
982 		}
983 		else
984 		{
985 			*t++ = (w >> 8) & 0xFF;
986 			*t++ = w & 0xFF;
987 		}
988 	}
989 	*fn -= (char*)f - (*fb);
990 	*fb = (char*)f;
991 	*tn -= (n = (char*)t - (*tb));
992 	*tb = (char*)t;
993 	RETURN(e, n, fn);
994 }
995 
996 /*
997  * convert bin to ucs-2 with byte swap
998  */
999 
1000 static size_t
bin2scu(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)1001 bin2scu(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1002 {
1003 	register unsigned char*		f;
1004 	register unsigned char*		fe;
1005 	register unsigned char*		t;
1006 	register unsigned char*		te;
1007 	register int			c;
1008 	wchar_t				w;
1009 	size_t				n;
1010 	int				e;
1011 
1012 	e = 0;
1013 	f = (unsigned char*)(*fb);
1014 	fe = f + (*fn);
1015 	t = (unsigned char*)(*tb);
1016 	te = t + (*tn);
1017 	while (f < fe && t < (te - 1))
1018 	{
1019 		if (!mbwide())
1020 		{
1021 			c = 1;
1022 			w = *f;
1023 		}
1024 		else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0)
1025 		{
1026 			e = EINVAL;
1027 			break;
1028 		}
1029 		else if (!c)
1030 			c = 1;
1031 		*t++ = w & 0xFF;
1032 		*t++ = (w >> 8) & 0xFF;
1033 		f += c;
1034 	}
1035 	*fn -= (n = (char*)f - (*fb));
1036 	*fb = (char*)f;
1037 	*tn -= (char*)t - (*tb);
1038 	*tb = (char*)t;
1039 	RETURN(e, n, fn);
1040 }
1041 
1042 /*
1043  * open a character code conversion map from f to t
1044  */
1045 
1046 _ast_iconv_t
_ast_iconv_open(const char * t,const char * f)1047 _ast_iconv_open(const char* t, const char* f)
1048 {
1049 	register Conv_t*	cc;
1050 	int			fc;
1051 	int			tc;
1052 	int			i;
1053 
1054 	char			fr[64];
1055 	char			to[64];
1056 
1057 #if DEBUG_TRACE
1058 error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s t=%s\n", __LINE__, f, t);
1059 #endif
1060 	if (!t || !*t || *t == '-' && !*(t + 1) || !strcasecmp(t, name_local) || !strcasecmp(t, name_native))
1061 		t = name_native;
1062 	if (!f || !*f || *f == '-' && !*(f + 1) || !strcasecmp(t, name_local) || !strcasecmp(f, name_native))
1063 		f = name_native;
1064 
1065 	/*
1066 	 * the ast identify is always (iconv_t)(0)
1067 	 */
1068 
1069 	if (t == f)
1070 		return (iconv_t)(0);
1071 	fc = _ast_iconv_name(f, fr, sizeof(fr));
1072 	tc = _ast_iconv_name(t, to, sizeof(to));
1073 #if DEBUG_TRACE
1074 error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s:%s:%d t=%s:%s:%d\n", __LINE__, f, fr, fc, t, to, tc);
1075 #endif
1076 	if (fc != CC_ICONV && fc == tc || streq(fr, to))
1077 		return (iconv_t)(0);
1078 
1079 	/*
1080 	 * first check the free list
1081 	 */
1082 
1083 	for (i = 0; i < elementsof(freelist); i++)
1084 		if ((cc = freelist[i]) && streq(to, cc->to.name) && streq(fr, cc->from.name))
1085 		{
1086 			freelist[i] = 0;
1087 #if _lib_iconv_open
1088 			/*
1089 			 * reset the shift state if any
1090 			 */
1091 
1092 			if (cc->cvt != (iconv_t)(-1))
1093 				iconv(cc->cvt, NiL, NiL, NiL, NiL);
1094 #endif
1095 			return cc;
1096 		}
1097 
1098 	/*
1099 	 * allocate a new one
1100 	 */
1101 
1102 	if (!(cc = newof(0, Conv_t, 1, strlen(to) + strlen(fr) + 2)))
1103 		return (iconv_t)(-1);
1104 	cc->to.name = (char*)(cc + 1);
1105 	cc->from.name = strcopy(cc->to.name, to) + 1;
1106 	strcpy(cc->from.name, fr);
1107 	cc->cvt = (iconv_t)(-1);
1108 
1109 	/*
1110 	 * 8 bit maps are the easiest
1111 	 */
1112 
1113 	if (fc >= 0 && tc >= 0)
1114 		cc->from.map = ccmap(fc, tc);
1115 #if _lib_iconv_open
1116 	else if ((cc->cvt = iconv_open(to, fr)) != (iconv_t)(-1))
1117 		cc->from.fun = (_ast_iconv_f)iconv;
1118 #endif
1119 #if _UWIN
1120 	else if ((cc->cvt = _win_iconv_open(cc, to, fr)) != (_ast_iconv_t)(-1))
1121 		cc->from.fun = (_ast_iconv_f)_win_iconv;
1122 #endif
1123 	else
1124 	{
1125 		switch (fc)
1126 		{
1127 		case CC_UTF:
1128 			cc->from.fun = utf2bin;
1129 			break;
1130 		case CC_UME:
1131 			cc->from.fun = ume2bin;
1132 			break;
1133 		case CC_UCS:
1134 			cc->from.fun = ucs2bin;
1135 			break;
1136 		case CC_SCU:
1137 			cc->from.fun = scu2bin;
1138 			break;
1139 		case CC_ASCII:
1140 			break;
1141 		default:
1142 			if (fc < 0)
1143 				goto nope;
1144 			cc->from.map = ccmap(fc, CC_ASCII);
1145 			break;
1146 		}
1147 		switch (tc)
1148 		{
1149 		case CC_UTF:
1150 			cc->to.fun = bin2utf;
1151 			break;
1152 		case CC_UME:
1153 			cc->to.fun = bin2ume;
1154 			break;
1155 		case CC_UCS:
1156 			cc->to.fun = bin2ucs;
1157 			break;
1158 		case CC_SCU:
1159 			cc->to.fun = bin2scu;
1160 			break;
1161 		case CC_ASCII:
1162 			break;
1163 		default:
1164 			if (tc < 0)
1165 				goto nope;
1166 			cc->to.map = ccmap(CC_ASCII, tc);
1167 			break;
1168 		}
1169 	}
1170 	return (iconv_t)cc;
1171  nope:
1172 	return (iconv_t)(-1);
1173 }
1174 
1175 /*
1176  * close a character code conversion map
1177  */
1178 
1179 int
_ast_iconv_close(_ast_iconv_t cd)1180 _ast_iconv_close(_ast_iconv_t cd)
1181 {
1182 	Conv_t*	cc;
1183 	Conv_t*	oc;
1184 	int	i;
1185 	int	r = 0;
1186 
1187 	if (cd == (_ast_iconv_t)(-1))
1188 		return -1;
1189 	if (!(cc = (Conv_t*)cd))
1190 		return 0;
1191 
1192 	/*
1193 	 * add to the free list
1194 	 */
1195 
1196 	i = freeindex;
1197 	for (;;)
1198 	{
1199 		if (++ i >= elementsof(freelist))
1200 			i = 0;
1201 		if (!freelist[i])
1202 			break;
1203 		if (i == freeindex)
1204 		{
1205 			if (++ i >= elementsof(freelist))
1206 				i = 0;
1207 
1208 			/*
1209 			 * close the oldest
1210 			 */
1211 
1212 			if (oc = freelist[i])
1213 			{
1214 #if _lib_iconv_open
1215 				if (oc->cvt != (iconv_t)(-1))
1216 					r = iconv_close(oc->cvt);
1217 #endif
1218 				if (oc->buf)
1219 					free(oc->buf);
1220 				free(oc);
1221 			}
1222 			break;
1223 		}
1224 	}
1225 	freelist[freeindex = i] = cc;
1226 	return r;
1227 }
1228 
1229 /*
1230  * copy *fb size *fn to *tb size *tn
1231  * fb,fn tb,tn updated on return
1232  */
1233 
1234 size_t
_ast_iconv(_ast_iconv_t cd,char ** fb,size_t * fn,char ** tb,size_t * tn)1235 _ast_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn)
1236 {
1237 	Conv_t*				cc = (Conv_t*)cd;
1238 	register unsigned char*		f;
1239 	register unsigned char*		t;
1240 	register unsigned char*		e;
1241 	register const unsigned char*	m;
1242 	register size_t			n;
1243 	char*				b;
1244 	char*				tfb;
1245 	size_t				tfn;
1246 	size_t				i;
1247 
1248 	if (!fb || !*fb)
1249 	{
1250 		/* TODO: reset to the initial state */
1251 		if (!tb || !*tb)
1252 			return 0;
1253 		/* TODO: write the initial state shift sequence */
1254 		return 0;
1255 	}
1256 	n = *tn;
1257 	if (cc)
1258 	{
1259 		if (cc->from.fun)
1260 		{
1261 			if (cc->to.fun)
1262 			{
1263 				if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1264 				{
1265 					errno = ENOMEM;
1266 					return -1;
1267 				}
1268 				b = cc->buf;
1269 				i = cc->size;
1270 				tfb = *fb;
1271 				tfn = *fn;
1272 				if ((*cc->from.fun)(cc->cvt, &tfb, &tfn, &b, &i) == (size_t)(-1))
1273 					return -1;
1274 				tfn = b - cc->buf;
1275 				tfb = cc->buf;
1276 				n = (*cc->to.fun)(cc->cvt, &tfb, &tfn, tb, tn);
1277 				i = tfb - cc->buf;
1278 				*fb += i;
1279 				*fn -= i;
1280 				return n;
1281 			}
1282 			if ((*cc->from.fun)(cc->cvt, fb, fn, tb, tn) == (size_t)(-1))
1283 				return -1;
1284 			n -= *tn;
1285 			if (m = cc->to.map)
1286 			{
1287 				e = (unsigned char*)(*tb);
1288 				for (t = e - n; t < e; t++)
1289 					*t = m[*t];
1290 			}
1291 			return n;
1292 		}
1293 		else if (cc->to.fun)
1294 		{
1295 			if (!(m = cc->from.map))
1296 				return (*cc->to.fun)(cc->cvt, fb, fn, tb, tn);
1297 			if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0)))
1298 			{
1299 				errno = ENOMEM;
1300 				return -1;
1301 			}
1302 			if ((n = *fn) > cc->size)
1303 				n = cc->size;
1304 			f = (unsigned char*)(*fb);
1305 			e = f + n;
1306 			t = (unsigned char*)(b = cc->buf);
1307 			while (f < e)
1308 				*t++ = m[*f++];
1309 			n = (*cc->to.fun)(cc->cvt, &b, fn, tb, tn);
1310 			*fb += b - cc->buf;
1311 			return n;
1312 		}
1313 	}
1314 	if (n > *fn)
1315 		n = *fn;
1316 	if (cc && (m = cc->from.map))
1317 	{
1318 		f = (unsigned char*)(*fb);
1319 		e = f + n;
1320 		t = (unsigned char*)(*tb);
1321 		while (f < e)
1322 			*t++ = m[*f++];
1323 	}
1324 	else
1325 		memcpy(*tb, *fb, n);
1326 	*fb += n;
1327 	*fn -= n;
1328 	*tb += n;
1329 	*tn -= n;
1330 	return n;
1331 }
1332 
1333 /*
1334  * write *fb size *fn to op
1335  * fb,fn updated on return
1336  * total bytes written to op returned
1337  */
1338 
1339 ssize_t
_ast_iconv_write(_ast_iconv_t cd,Sfio_t * op,char ** fb,size_t * fn,size_t * e)1340 _ast_iconv_write(_ast_iconv_t cd, Sfio_t* op, char** fb, size_t* fn, size_t* e)
1341 {
1342 	char*		tb;
1343 	char*		ts;
1344 	size_t		tn;
1345 	size_t		r;
1346 
1347 	r = 0;
1348 	tn = 0;
1349 	while (*fn > 0)
1350 	{
1351 		if (!(tb = (char*)sfreserve(op, -(tn + 1), SF_WRITE|SF_LOCKR)))
1352 			return r ? r : -1;
1353 		ts = tb;
1354 		tn = sfvalue(op);
1355 #if DEBUG_TRACE
1356 error(DEBUG_TRACE, "AHA#%d iconv_write ts=%p tn=%d", __LINE__, ts, tn);
1357 		for (;;)
1358 #else
1359 		while (_ast_iconv(cd, fb, fn, &ts, &tn) == (size_t)(-1))
1360 #endif
1361 		{
1362 #if DEBUG_TRACE
1363 			ssize_t	_r;
1364 error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d `%-.*s'", __LINE__, *fn, tn, *fn, *fb);
1365 			_r = _ast_iconv(cd, fb, fn, &ts, &tn);
1366 error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d [%d]", __LINE__, *fn, tn, _r);
1367 			if (_r != (size_t)(-1))
1368 				break;
1369 #endif
1370 			if (errno == E2BIG)
1371 				break;
1372 			if (e)
1373 				(*e)++;
1374 			if (!tn)
1375 				break;
1376 			*ts++ = *(*fb)++;
1377 			tn--;
1378 			(*fn)--;
1379 		}
1380 #if DEBUG_TRACE
1381 error(DEBUG_TRACE, "AHA#%d iconv_write %d", __LINE__, ts - tb);
1382 #endif
1383 
1384 		sfwrite(op, tb, ts - tb);
1385 		r += ts - tb;
1386 	}
1387 	return r;
1388 }
1389 
1390 /*
1391  * move n bytes from ip to op
1392  */
1393 
1394 ssize_t
_ast_iconv_move(_ast_iconv_t cd,Sfio_t * ip,Sfio_t * op,size_t n,size_t * e)1395 _ast_iconv_move(_ast_iconv_t cd, Sfio_t* ip, Sfio_t* op, size_t n, size_t* e)
1396 {
1397 	char*		fb;
1398 	char*		fs;
1399 	char*		tb;
1400 	char*		ts;
1401 	size_t		fn;
1402 	size_t		fo;
1403 	size_t		tn;
1404 	size_t		i;
1405 	ssize_t		r = 0;
1406 	int		locked;
1407 
1408 	fn = n;
1409 	for (;;)
1410 	{
1411 		if (fn != SF_UNBOUND)
1412 			fn = -((ssize_t)(fn & (((size_t)(~0))>>1)));
1413 		if (!(fb = (char*)sfreserve(ip, fn, locked = SF_LOCKR)) &&
1414 		    !(fb = (char*)sfreserve(ip, fn, locked = 0)))
1415 			break;
1416 		fs = fb;
1417 		fn = fo = sfvalue(ip);
1418 		if (!(tb = (char*)sfreserve(op, SF_UNBOUND, SF_WRITE|SF_LOCKR)))
1419 		{
1420 			sfread(ip, fb, 0);
1421 			return r ? r : -1;
1422 		}
1423 		ts = tb;
1424 		tn = sfvalue(op);
1425 		while (_ast_iconv(cd, &fs, &fn, &ts, &tn) != (size_t)(-1) && fn > 0)
1426 		{
1427 			if (tn > 0)
1428 			{
1429 				*ts++ = '_';
1430 				tn--;
1431 			}
1432 			if (e)
1433 				(*e)++;
1434 			fs++;
1435 			fn--;
1436 		}
1437 		sfwrite(op, tb, ts - tb);
1438 		r += ts - tb;
1439 		if (locked)
1440 			sfread(ip, fb, fs - fb);
1441 		else
1442 			for (i = fn; --i >= (fs - fb);)
1443 				sfungetc(ip, fb[i]);
1444 		if (n != SF_UNBOUND)
1445 		{
1446 			if (n <= (fs - fb))
1447 				break;
1448 			n -= fs - fb;
1449 		}
1450 		if (fn == fo)
1451 			fn++;
1452 	}
1453 	return r;
1454 }
1455 
1456 /*
1457  * iconv_list_t iterator
1458  * call with arg 0 to start
1459  * prev return value is current arg
1460  */
1461 
1462 _ast_iconv_list_t*
_ast_iconv_list(_ast_iconv_list_t * cp)1463 _ast_iconv_list(_ast_iconv_list_t* cp)
1464 {
1465 #if _UWIN
1466 	struct dirent*	ent;
1467 
1468 	if (!cp)
1469 	{
1470 		if (!(cp = newof(0, _ast_iconv_list_t, 1, 0)))
1471 			return ccmaplist(NiL);
1472 		if (!(cp->data = opendir(_win_maps)))
1473 		{
1474 			free(cp);
1475 			return ccmaplist(NiL);
1476 		}
1477 	}
1478 	if (cp->data)
1479 	{
1480 		if (ent = readdir((DIR*)cp->data))
1481 		{
1482 			cp->name = cp->match = cp->desc = (const char*)ent->d_name;
1483 			return cp;
1484 		}
1485 		closedir((DIR*)cp->data);
1486 		free(cp);
1487 		return ccmaplist(NiL);
1488 	}
1489 #else
1490 	if (!cp)
1491 		return ccmaplist(NiL);
1492 #endif
1493 	if (cp->ccode >= 0)
1494 		return (cp = ccmaplist(cp)) ? cp : (_ast_iconv_list_t*)codes;
1495 	return (++cp)->name ? cp : (_ast_iconv_list_t*)0;
1496 }
1497