xref: /freebsd/contrib/nvi/common/conv.c (revision 0957b409a90fd597c1e9124cbaf3edd2b488f4ac)
1 /*-
2  * Copyright (c) 1993, 1994
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 1993, 1994, 1995, 1996
5  *	Keith Bostic.  All rights reserved.
6  * Copyright (c) 2011, 2012
7  *	Zhihao Yuan.  All rights reserved.
8  *
9  * See the LICENSE file for redistribution information.
10  */
11 
12 #include "config.h"
13 
14 #ifndef lint
15 static const char sccsid[] = "$Id: conv.c,v 2.40 2014/02/27 16:25:29 zy Exp $";
16 #endif /* not lint */
17 
18 #include <sys/types.h>
19 #include <sys/queue.h>
20 #include <sys/time.h>
21 
22 #include <bitstring.h>
23 #include <errno.h>
24 #include <limits.h>
25 #include <langinfo.h>
26 #include <locale.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <strings.h>
31 #include <unistd.h>
32 
33 #include "common.h"
34 
/*
 * codeset --
 *	Get the locale encoding.
 *
 *	The name is looked up once with nl_langinfo(CODESET) and a private
 *	copy of it is cached: POSIX allows the string returned by
 *	nl_langinfo() to be overwritten or invalidated by later calls to
 *	nl_langinfo() or setlocale(), so the returned pointer itself must
 *	not be kept.  If the copy cannot be allocated, the nl_langinfo()
 *	result is returned uncached and the lookup is retried next time.
 *
 * PUBLIC: char * codeset(void);
 */
char *
codeset(void)
{
	static char *cs;
	const char *name;
	size_t size;

	if (cs != NULL)
		return cs;

	name = nl_langinfo(CODESET);
	size = strlen(name) + 1;
	if ((cs = malloc(size)) == NULL)
		return (char *)name;
	memcpy(cs, name, size);

	return cs;
}
51 
52 #ifdef USE_WIDECHAR
53 static int
54 raw2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
55     CHAR_T **dst)
56 {
57 	int i;
58 	CHAR_T **tostr = &cw->bp1.wc;
59 	size_t  *blen = &cw->blen1;
60 
61 	BINC_RETW(NULL, *tostr, *blen, len);
62 
63 	*tolen = len;
64 	for (i = 0; i < len; ++i)
65 		(*tostr)[i] = (u_char) str[i];
66 
67 	*dst = cw->bp1.wc;
68 
69 	return 0;
70 }
71 
/* Capacity, in bytes, of the local staging buffer used around iconv. */
#define CONV_BUFFER_SIZE    512

/*
 * CONVERT --
 *	Run iconv over the input at str, of which left bytes remain,
 *	writing at most CONV_BUFFER_SIZE bytes into the caller's local
 *	"buffer".  str and left are advanced by iconv; afterwards src
 *	points at buffer and len holds the number of bytes produced.
 *	Jumps to the caller's "err" label if iconv fails for any reason
 *	other than E2BIG, or, after setting the caller's "error" to -left,
 *	if no output was produced.
 *
 * IC_RESET --
 *	Reset the caller's iconv descriptor "id" to its initial state,
 *	unless it is (iconv_t)-1.
 *
 * Both macros expand to nothing when USE_ICONV is not defined.
 */
#ifdef USE_ICONV
#define CONVERT(str, left, src, len)					\
	do {								\
		char *cv_out = buffer;					\
		size_t cv_room = CONV_BUFFER_SIZE;			\
		errno = 0;						\
		if (iconv(id, (iconv_src_t)&str, &left, &cv_out,	\
		    &cv_room) == (size_t)-1 && errno != E2BIG)		\
			goto err;					\
		len = CONV_BUFFER_SIZE - cv_room;			\
		if (len == 0) {						\
			error = -left;					\
			goto err;					\
		}							\
		src = buffer;						\
	} while (0)

#define IC_RESET()							\
	do {								\
		if (id != (iconv_t)-1)					\
			(void)iconv(id, NULL, NULL, NULL, NULL);	\
	} while (0)
#else
#define CONVERT(str, left, src, len)
#define IC_RESET()
#endif
103 
104 static int
105 default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
106     size_t *tolen, CHAR_T **dst, iconv_t id)
107 {
108 	size_t i = 0, j;
109 	CHAR_T **tostr = &cw->bp1.wc;
110 	size_t *blen = &cw->blen1;
111 	mbstate_t mbs;
112 	size_t n;
113 	ssize_t nlen = len;
114 	char *src = (char *)str;
115 #ifdef USE_ICONV
116 	char buffer[CONV_BUFFER_SIZE];
117 #endif
118 	size_t left = len;
119 	int error = 1;
120 
121 	BZERO(&mbs, 1);
122 	BINC_RETW(NULL, *tostr, *blen, nlen);
123 
124 #ifdef USE_ICONV
125 	if (id != (iconv_t)-1)
126 		CONVERT(str, left, src, len);
127 #endif
128 
129 	for (i = 0, j = 0; j < len; ) {
130 		n = mbrtowc((*tostr)+i, src+j, len-j, &mbs);
131 		/* NULL character converted */
132 		if (n == -2)
133 			error = -(len-j);
134 		if (n == -1 || n == -2)
135 			goto err;
136 		if (n == 0)
137 			n = 1;
138 		j += n;
139 		if (++i >= *blen) {
140 			nlen += 256;
141 			BINC_RETW(NULL, *tostr, *blen, nlen);
142 		}
143 		if (id != (iconv_t)-1 && j == len && left) {
144 			CONVERT(str, left, src, len);
145 			j = 0;
146 		}
147 	}
148 
149 	error = 0;
150 err:
151 	*tolen = i;
152 	*dst = cw->bp1.wc;
153 	IC_RESET();
154 
155 	return error;
156 }
157 
158 static int
159 fe_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
160     CHAR_T **dst)
161 {
162 	return default_char2int(sp, str, len, cw, tolen, dst,
163 	    sp->conv.id[IC_FE_CHAR2INT]);
164 }
165 
166 static int
167 ie_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
168     CHAR_T **dst)
169 {
170 	return default_char2int(sp, str, len, cw, tolen, dst,
171 	    sp->conv.id[IC_IE_CHAR2INT]);
172 }
173 
174 static int
175 cs_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
176     CHAR_T **dst)
177 {
178 	return default_char2int(sp, str, len, cw, tolen, dst, (iconv_t)-1);
179 }
180 
181 static int
182 int2raw(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, size_t *tolen,
183     char **dst)
184 {
185 	int i;
186 	char **tostr = &cw->bp1.c;
187 	size_t  *blen = &cw->blen1;
188 
189 	BINC_RETC(NULL, *tostr, *blen, len);
190 
191 	*tolen = len;
192 	for (i = 0; i < len; ++i)
193 		(*tostr)[i] = str[i];
194 
195 	*dst = cw->bp1.c;
196 
197 	return 0;
198 }
199 
200 static int
201 default_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
202     size_t *tolen, char **pdst, iconv_t id)
203 {
204 	size_t i, j, offset = 0;
205 	char **tostr = &cw->bp1.c;
206 	size_t *blen = &cw->blen1;
207 	mbstate_t mbs;
208 	size_t n;
209 	ssize_t  nlen = len + MB_CUR_MAX;
210 	char *dst;
211 	size_t buflen;
212 #ifdef USE_ICONV
213 	char buffer[CONV_BUFFER_SIZE];
214 #endif
215 	int error = 1;
216 
217 /* convert first len bytes of buffer and append it to cw->bp
218  * len is adjusted => 0
219  * offset contains the offset in cw->bp and is adjusted
220  * cw->bp is grown as required
221  */
222 #ifdef USE_ICONV
223 #define CONVERT2(_buffer, lenp, cw, offset)				\
224 	do {								\
225 		char *bp = _buffer;					\
226 		int ret;						\
227 		do {							\
228 			size_t outleft = cw->blen1 - offset;		\
229 			char *obp = cw->bp1.c + offset;			\
230 			if (cw->blen1 < offset + MB_CUR_MAX) {		\
231 				nlen += 256;				\
232 				BINC_RETC(NULL, cw->bp1.c, cw->blen1,	\
233 				    nlen);				\
234 			}						\
235 			errno = 0;					\
236 			ret = iconv(id, (iconv_src_t)&bp, lenp, &obp,	\
237 			    &outleft);					\
238 			if (ret == -1 && errno != E2BIG)		\
239 				goto err;				\
240 			offset = cw->blen1 - outleft;			\
241 		} while (ret != 0); 					\
242 	} while (0)
243 #else
244 #define CONVERT2(_buffer, lenp, cw, offset)
245 #endif
246 
247 
248 	BZERO(&mbs, 1);
249 	BINC_RETC(NULL, *tostr, *blen, nlen);
250 	dst = *tostr; buflen = *blen;
251 
252 #ifdef USE_ICONV
253 	if (id != (iconv_t)-1) {
254 		dst = buffer; buflen = CONV_BUFFER_SIZE;
255 	}
256 #endif
257 
258 	for (i = 0, j = 0; i < len; ++i) {
259 		n = wcrtomb(dst+j, str[i], &mbs);
260 		if (n == -1)
261 			goto err;
262 		j += n;
263 		if (buflen < j + MB_CUR_MAX) {
264 			if (id != (iconv_t)-1) {
265 				CONVERT2(buffer, &j, cw, offset);
266 			} else {
267 				nlen += 256;
268 				BINC_RETC(NULL, *tostr, *blen, nlen);
269 				dst = *tostr; buflen = *blen;
270 			}
271 		}
272 	}
273 
274 	n = wcrtomb(dst+j, L'\0', &mbs);
275 	j += n - 1;				/* don't count NUL at the end */
276 	*tolen = j;
277 
278 	if (id != (iconv_t)-1) {
279 		CONVERT2(buffer, &j, cw, offset);
280 		/* back to the initial state */
281 		CONVERT2(NULL, NULL, cw, offset);
282 		*tolen = offset;
283 	}
284 
285 	error = 0;
286 err:
287 	if (error)
288 		*tolen = j;
289 	*pdst = cw->bp1.c;
290 	IC_RESET();
291 
292 	return error;
293 }
294 
295 static int
296 fe_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
297     size_t *tolen, char **dst)
298 {
299 	return default_int2char(sp, str, len, cw, tolen, dst,
300 		sp->conv.id[IC_FE_INT2CHAR]);
301 }
302 
303 static int
304 cs_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
305     size_t *tolen, char **dst)
306 {
307 	return default_int2char(sp, str, len, cw, tolen, dst, (iconv_t)-1);
308 }
309 
310 #endif
311 
312 /*
313  * conv_init --
314  *	Initialize the iconv environment.
315  *
316  * PUBLIC: void conv_init(SCR *, SCR *);
317  */
318 void
319 conv_init(SCR *orig, SCR *sp)
320 {
321 	int i;
322 
323 	if (orig == NULL)
324 		setlocale(LC_ALL, "");
325 	if (orig != NULL)
326 		BCOPY(&orig->conv, &sp->conv, 1);
327 #ifdef USE_WIDECHAR
328 	else {
329 		char *ctype = setlocale(LC_CTYPE, NULL);
330 
331 		/*
332 		 * XXX
333 		 * This hack fixes the libncursesw issue on FreeBSD.
334 		 */
335 		if (!strcmp(ctype, "ko_KR.CP949"))
336 			setlocale(LC_CTYPE, "ko_KR.eucKR");
337 		else if (!strcmp(ctype, "zh_CN.GB2312"))
338 			setlocale(LC_CTYPE, "zh_CN.eucCN");
339 		else if (!strcmp(ctype, "zh_CN.GBK"))
340 			setlocale(LC_CTYPE, "zh_CN.GB18030");
341 
342 		/*
343 		 * Switch to 8bit mode if locale is C;
344 		 * LC_CTYPE should be reseted to C if unmatched.
345 		 */
346 		if (!strcmp(ctype, "C") || !strcmp(ctype, "POSIX")) {
347 			sp->conv.sys2int = sp->conv.file2int = raw2int;
348 			sp->conv.int2sys = sp->conv.int2file = int2raw;
349 			sp->conv.input2int = raw2int;
350 		} else {
351 			sp->conv.sys2int = cs_char2int;
352 			sp->conv.int2sys = cs_int2char;
353 			sp->conv.file2int = fe_char2int;
354 			sp->conv.int2file = fe_int2char;
355 			sp->conv.input2int = ie_char2int;
356 		}
357 #ifdef USE_ICONV
358 		o_set(sp, O_INPUTENCODING, OS_STRDUP, codeset(), 0);
359 #endif
360 	}
361 #endif
362 
363 	/* iconv descriptors must be distinct to screens. */
364 	for (i = 0; i <= IC_IE_TO_UTF16; ++i)
365 		sp->conv.id[i] = (iconv_t)-1;
366 #ifdef USE_ICONV
367 	conv_enc(sp, O_INPUTENCODING, 0);
368 #endif
369 }
370 
371 /*
372  * conv_enc --
373  *	Convert file/input encoding.
374  *
375  * PUBLIC: int conv_enc(SCR *, int, char *);
376  */
377 int
378 conv_enc(SCR *sp, int option, char *enc)
379 {
380 #if defined(USE_WIDECHAR) && defined(USE_ICONV)
381 	iconv_t *c2w, *w2c;
382 	iconv_t id_c2w, id_w2c;
383 
384 	switch (option) {
385 	case O_FILEENCODING:
386 		c2w = sp->conv.id + IC_FE_CHAR2INT;
387 		w2c = sp->conv.id + IC_FE_INT2CHAR;
388 		if (!enc)
389 			enc = O_STR(sp, O_FILEENCODING);
390 
391 		if (strcasecmp(codeset(), enc)) {
392 			if ((id_c2w = iconv_open(codeset(), enc)) ==
393 			    (iconv_t)-1)
394 				goto err;
395 			if ((id_w2c = iconv_open(enc, codeset())) ==
396 			    (iconv_t)-1)
397 				goto err;
398 		} else {
399 			id_c2w = (iconv_t)-1;
400 			id_w2c = (iconv_t)-1;
401 		}
402 
403 		break;
404 
405 	case O_INPUTENCODING:
406 		c2w = sp->conv.id + IC_IE_CHAR2INT;
407 		w2c = sp->conv.id + IC_IE_TO_UTF16;
408 		if (!enc)
409 			enc = O_STR(sp, O_INPUTENCODING);
410 
411 		if (strcasecmp(codeset(), enc)) {
412 			if ((id_c2w = iconv_open(codeset(), enc)) ==
413 			    (iconv_t)-1)
414 				goto err;
415 		} else
416 			id_c2w = (iconv_t)-1;
417 
418 		/* UTF-16 can not be locale and can not be inputed. */
419 		if ((id_w2c = iconv_open("utf-16be", enc)) == (iconv_t)-1)
420 			goto err;
421 
422 		break;
423 
424 	default:
425 		abort();
426 	}
427 
428 	if (*c2w != (iconv_t)-1)
429 		iconv_close(*c2w);
430 	if (*w2c != (iconv_t)-1)
431 		iconv_close(*w2c);
432 
433 	*c2w = id_c2w;
434 	*w2c = id_w2c;
435 
436 	F_CLR(sp, SC_CONV_ERROR);
437 	F_SET(sp, SC_SCR_REFORMAT);
438 
439 	return 0;
440 err:
441 #endif
442 	switch (option) {
443 	case O_FILEENCODING:
444 		msgq(sp, M_ERR, "321|File encoding conversion not supported");
445 		break;
446 	case O_INPUTENCODING:
447 		msgq(sp, M_ERR, "322|Input encoding conversion not supported");
448 		break;
449 	}
450 	return 1;
451 }
452 
453 /*
454  * conv_end --
455  *	Close the iconv descriptors, release the buffer.
456  *
457  * PUBLIC: void conv_end(SCR *);
458  */
459 void
460 conv_end(SCR *sp)
461 {
462 #if defined(USE_WIDECHAR) && defined(USE_ICONV)
463 	int i;
464 	for (i = 0; i <= IC_IE_TO_UTF16; ++i)
465 		if (sp->conv.id[i] != (iconv_t)-1)
466 			iconv_close(sp->conv.id[i]);
467 	if (sp->cw.bp1.c != NULL)
468 		free(sp->cw.bp1.c);
469 #endif
470 }
471