xref: /freebsd/contrib/nvi/common/conv.c (revision faf25f48d601ae39f5752602f3020e2e92605625)
1 /*-
2  * Copyright (c) 1993, 1994
3  *	The Regents of the University of California.  All rights reserved.
4  * Copyright (c) 1993, 1994, 1995, 1996
5  *	Keith Bostic.  All rights reserved.
6  * Copyright (c) 2011, 2012
7  *	Zhihao Yuan.  All rights reserved.
8  *
9  * See the LICENSE file for redistribution information.
10  */
11 
12 #include "config.h"
13 
14 #include <sys/types.h>
15 #include <sys/queue.h>
16 #include <sys/time.h>
17 
18 #include <bitstring.h>
19 #include <errno.h>
20 #include <limits.h>
21 #include <langinfo.h>
22 #include <locale.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <strings.h>
27 #include <unistd.h>
28 
29 #include "common.h"
30 
/*
 * codeset --
 *	Get the locale encoding.
 *
 *	The name is looked up once, on the first call, and the same answer
 *	is returned for the rest of the process.  POSIX allows the string
 *	returned by nl_langinfo() to be overwritten or invalidated by a
 *	later nl_langinfo() or setlocale() call, so a private copy is kept
 *	instead of the returned pointer.
 *
 * PUBLIC: char * codeset(void);
 */
char *
codeset(void)
{
	static char cs[64];
	static int cached;

	if (!cached) {
		const char *name = nl_langinfo(CODESET);

		/* snprintf always NUL-terminates; codeset names are short. */
		(void)snprintf(cs, sizeof(cs), "%s", name != NULL ? name : "");
		cached = 1;
	}

	return cs;
}
47 
48 #ifdef USE_WIDECHAR
49 static int
50 raw2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
51     CHAR_T **dst)
52 {
53 	int i;
54 	CHAR_T **tostr = &cw->bp1.wc;
55 	size_t  *blen = &cw->blen1;
56 
57 	BINC_RETW(NULL, *tostr, *blen, len);
58 
59 	*tolen = len;
60 	for (i = 0; i < len; ++i)
61 		(*tostr)[i] = (u_char) str[i];
62 
63 	*dst = cw->bp1.wc;
64 
65 	return 0;
66 }
67 
#define CONV_BUFFER_SIZE    512
/*
 * CONVERT --
 *	Run the next piece of str through iconv into the caller's local
 *	`buffer` (CONV_BUFFER_SIZE bytes).
 *
 *	str/left are advanced/decremented by iconv; len receives the number
 *	of bytes placed in the buffer and src is pointed at the buffer.
 *	An iconv failure other than E2BIG jumps to the caller's `err` label;
 *	if nothing at all could be produced, `error` is set to -left first.
 *
 *	Expects `id`, `buffer`, `error` and an `err:` label in the
 *	expanding function.
 */
#ifdef USE_ICONV
#define CONVERT(str, left, src, len)					\
	do {								\
		char *cv_out_ = buffer;					\
		size_t cv_room_ = CONV_BUFFER_SIZE;			\
		errno = 0;						\
		if (iconv(id, (iconv_src_t)&str, &left, &cv_out_,	\
		    &cv_room_) == (size_t)-1 && errno != E2BIG)		\
			goto err;					\
		len = CONV_BUFFER_SIZE - cv_room_;			\
		if (len == 0) {						\
			error = -left;					\
			goto err;					\
		}							\
		src = buffer;						\
	} while (0)

/*
 * IC_RESET --
 *	Put the caller's iconv descriptor `id`, if there is one, back into
 *	its initial conversion state.
 */
#define IC_RESET()							\
	do {								\
		if (id != (iconv_t)-1) {				\
			(void)iconv(id, NULL, NULL, NULL, NULL);	\
		}							\
	} while (0)
#else
#define CONVERT(str, left, src, len)
#define IC_RESET()
#endif
99 
100 static int
101 default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
102     size_t *tolen, CHAR_T **dst, iconv_t id)
103 {
104 	size_t i = 0, j;
105 	CHAR_T **tostr = &cw->bp1.wc;
106 	size_t *blen = &cw->blen1;
107 	mbstate_t mbs;
108 	size_t n;
109 	ssize_t nlen = len;
110 	char *src = (char *)str;
111 #ifdef USE_ICONV
112 	char buffer[CONV_BUFFER_SIZE];
113 #endif
114 	size_t left = len;
115 	int error = 1;
116 
117 	memset(&mbs, 0, sizeof(mbs));
118 	BINC_RETW(NULL, *tostr, *blen, nlen);
119 
120 #ifdef USE_ICONV
121 	if (id != (iconv_t)-1)
122 		CONVERT(str, left, src, len);
123 #endif
124 
125 	for (i = 0, j = 0; j < len; ) {
126 		n = mbrtowc((*tostr)+i, src+j, len-j, &mbs);
127 		/* NULL character converted */
128 		if (n == -2)
129 			error = -(len-j);
130 		if (n == -1 || n == -2)
131 			goto err;
132 		if (n == 0)
133 			n = 1;
134 		j += n;
135 		if (++i >= *blen) {
136 			nlen += 256;
137 			BINC_RETW(NULL, *tostr, *blen, nlen);
138 		}
139 		if (id != (iconv_t)-1 && j == len && left) {
140 			CONVERT(str, left, src, len);
141 			j = 0;
142 		}
143 	}
144 
145 	error = 0;
146 err:
147 	*tolen = i;
148 	*dst = cw->bp1.wc;
149 	IC_RESET();
150 
151 	return error;
152 }
153 
154 static int
155 fe_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
156     CHAR_T **dst)
157 {
158 	return default_char2int(sp, str, len, cw, tolen, dst,
159 	    sp->conv.id[IC_FE_CHAR2INT]);
160 }
161 
162 static int
163 ie_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
164     CHAR_T **dst)
165 {
166 	return default_char2int(sp, str, len, cw, tolen, dst,
167 	    sp->conv.id[IC_IE_CHAR2INT]);
168 }
169 
170 static int
171 cs_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
172     CHAR_T **dst)
173 {
174 	return default_char2int(sp, str, len, cw, tolen, dst, (iconv_t)-1);
175 }
176 
177 static int
178 int2raw(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, size_t *tolen,
179     char **dst)
180 {
181 	int i;
182 	char **tostr = &cw->bp1.c;
183 	size_t  *blen = &cw->blen1;
184 
185 	BINC_RETC(NULL, *tostr, *blen, len);
186 
187 	*tolen = len;
188 	for (i = 0; i < len; ++i)
189 		(*tostr)[i] = str[i];
190 
191 	*dst = cw->bp1.c;
192 
193 	return 0;
194 }
195 
196 static int
197 default_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
198     size_t *tolen, char **pdst, iconv_t id)
199 {
200 	size_t i, j, offset = 0;
201 	char **tostr = &cw->bp1.c;
202 	size_t *blen = &cw->blen1;
203 	mbstate_t mbs;
204 	size_t n;
205 	ssize_t  nlen = len + MB_CUR_MAX;
206 	char *dst;
207 	size_t buflen;
208 #ifdef USE_ICONV
209 	char buffer[CONV_BUFFER_SIZE];
210 #endif
211 	int error = 1;
212 
213 /* convert first len bytes of buffer and append it to cw->bp
214  * len is adjusted => 0
215  * offset contains the offset in cw->bp and is adjusted
216  * cw->bp is grown as required
217  */
218 #ifdef USE_ICONV
219 #define CONVERT2(_buffer, lenp, cw, offset)				\
220 	do {								\
221 		char *bp = _buffer;					\
222 		int ret;						\
223 		do {							\
224 			size_t outleft = cw->blen1 - offset;		\
225 			char *obp = cw->bp1.c + offset;			\
226 			if (cw->blen1 < offset + MB_CUR_MAX) {		\
227 				nlen += 256;				\
228 				BINC_RETC(NULL, cw->bp1.c, cw->blen1,	\
229 				    nlen);				\
230 			}						\
231 			errno = 0;					\
232 			ret = iconv(id, (iconv_src_t)&bp, lenp, &obp,	\
233 			    &outleft);					\
234 			if (ret == -1 && errno != E2BIG)		\
235 				goto err;				\
236 			offset = cw->blen1 - outleft;			\
237 		} while (ret != 0); 					\
238 	} while (0)
239 #else
240 #define CONVERT2(_buffer, lenp, cw, offset)
241 #endif
242 
243 
244 	memset(&mbs, 0, sizeof(mbs));
245 	BINC_RETC(NULL, *tostr, *blen, nlen);
246 	dst = *tostr; buflen = *blen;
247 
248 #ifdef USE_ICONV
249 	if (id != (iconv_t)-1) {
250 		dst = buffer; buflen = CONV_BUFFER_SIZE;
251 	}
252 #endif
253 
254 	for (i = 0, j = 0; i < len; ++i) {
255 		n = wcrtomb(dst+j, str[i], &mbs);
256 		if (n == -1)
257 			goto err;
258 		j += n;
259 		if (buflen < j + MB_CUR_MAX) {
260 			if (id != (iconv_t)-1) {
261 				CONVERT2(buffer, &j, cw, offset);
262 			} else {
263 				nlen += 256;
264 				BINC_RETC(NULL, *tostr, *blen, nlen);
265 				dst = *tostr; buflen = *blen;
266 			}
267 		}
268 	}
269 
270 	n = wcrtomb(dst+j, L'\0', &mbs);
271 	j += n - 1;				/* don't count NUL at the end */
272 	*tolen = j;
273 
274 	if (id != (iconv_t)-1) {
275 		CONVERT2(buffer, &j, cw, offset);
276 		/* back to the initial state */
277 		CONVERT2(NULL, NULL, cw, offset);
278 		*tolen = offset;
279 	}
280 
281 	error = 0;
282 err:
283 	if (error)
284 		*tolen = j;
285 	*pdst = cw->bp1.c;
286 	IC_RESET();
287 
288 	return error;
289 }
290 
291 static int
292 fe_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
293     size_t *tolen, char **dst)
294 {
295 	return default_int2char(sp, str, len, cw, tolen, dst,
296 		sp->conv.id[IC_FE_INT2CHAR]);
297 }
298 
299 static int
300 cs_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
301     size_t *tolen, char **dst)
302 {
303 	return default_int2char(sp, str, len, cw, tolen, dst, (iconv_t)-1);
304 }
305 
306 #endif
307 
308 /*
309  * conv_init --
310  *	Initialize the iconv environment.
311  *
312  * PUBLIC: void conv_init(SCR *, SCR *);
313  */
314 void
315 conv_init(SCR *orig, SCR *sp)
316 {
317 	int i;
318 
319 	if (orig == NULL)
320 		setlocale(LC_ALL, "");
321 	if (orig != NULL)
322 		memmove(&sp->conv, &orig->conv, sizeof(CONV));
323 #ifdef USE_WIDECHAR
324 	else {
325 		char *ctype = setlocale(LC_CTYPE, NULL);
326 
327 		/*
328 		 * XXX
329 		 * This hack fixes the libncursesw issue on FreeBSD.
330 		 */
331 		if (!strcmp(ctype, "ko_KR.CP949"))
332 			setlocale(LC_CTYPE, "ko_KR.eucKR");
333 		else if (!strcmp(ctype, "zh_CN.GB2312"))
334 			setlocale(LC_CTYPE, "zh_CN.eucCN");
335 		else if (!strcmp(ctype, "zh_CN.GBK"))
336 			setlocale(LC_CTYPE, "zh_CN.GB18030");
337 
338 		/*
339 		 * Switch to 8bit mode if locale is C;
340 		 * LC_CTYPE should be reseted to C if unmatched.
341 		 */
342 		if (!strcmp(ctype, "C") || !strcmp(ctype, "POSIX")) {
343 			sp->conv.sys2int = sp->conv.file2int = raw2int;
344 			sp->conv.int2sys = sp->conv.int2file = int2raw;
345 			sp->conv.input2int = raw2int;
346 		} else {
347 			sp->conv.sys2int = cs_char2int;
348 			sp->conv.int2sys = cs_int2char;
349 			sp->conv.file2int = fe_char2int;
350 			sp->conv.int2file = fe_int2char;
351 			sp->conv.input2int = ie_char2int;
352 		}
353 #ifdef USE_ICONV
354 		o_set(sp, O_INPUTENCODING, OS_STRDUP, codeset(), 0);
355 #endif
356 	}
357 #endif
358 
359 	/* iconv descriptors must be distinct to screens. */
360 	for (i = 0; i <= IC_IE_TO_UTF16; ++i)
361 		sp->conv.id[i] = (iconv_t)-1;
362 #ifdef USE_ICONV
363 	conv_enc(sp, O_INPUTENCODING, 0);
364 #endif
365 }
366 
367 /*
368  * conv_enc --
369  *	Convert file/input encoding.
370  *
371  * PUBLIC: int conv_enc(SCR *, int, char *);
372  */
373 int
374 conv_enc(SCR *sp, int option, char *enc)
375 {
376 #if defined(USE_WIDECHAR) && defined(USE_ICONV)
377 	iconv_t *c2w, *w2c;
378 	iconv_t id_c2w, id_w2c;
379 
380 	switch (option) {
381 	case O_FILEENCODING:
382 		c2w = sp->conv.id + IC_FE_CHAR2INT;
383 		w2c = sp->conv.id + IC_FE_INT2CHAR;
384 		if (!enc)
385 			enc = O_STR(sp, O_FILEENCODING);
386 
387 		if (strcasecmp(codeset(), enc)) {
388 			if ((id_c2w = iconv_open(codeset(), enc)) ==
389 			    (iconv_t)-1)
390 				goto err;
391 			if ((id_w2c = iconv_open(enc, codeset())) ==
392 			    (iconv_t)-1)
393 				goto err;
394 		} else {
395 			id_c2w = (iconv_t)-1;
396 			id_w2c = (iconv_t)-1;
397 		}
398 
399 		break;
400 
401 	case O_INPUTENCODING:
402 		c2w = sp->conv.id + IC_IE_CHAR2INT;
403 		w2c = sp->conv.id + IC_IE_TO_UTF16;
404 		if (!enc)
405 			enc = O_STR(sp, O_INPUTENCODING);
406 
407 		if (strcasecmp(codeset(), enc)) {
408 			if ((id_c2w = iconv_open(codeset(), enc)) ==
409 			    (iconv_t)-1)
410 				goto err;
411 		} else
412 			id_c2w = (iconv_t)-1;
413 
414 		/* UTF-16 can not be locale and can not be inputed. */
415 		if ((id_w2c = iconv_open("utf-16be", enc)) == (iconv_t)-1)
416 			goto err;
417 
418 		break;
419 
420 	default:
421 		abort();
422 	}
423 
424 	if (*c2w != (iconv_t)-1)
425 		iconv_close(*c2w);
426 	if (*w2c != (iconv_t)-1)
427 		iconv_close(*w2c);
428 
429 	*c2w = id_c2w;
430 	*w2c = id_w2c;
431 
432 	F_CLR(sp, SC_CONV_ERROR);
433 	F_SET(sp, SC_SCR_REFORMAT);
434 
435 	return 0;
436 err:
437 #endif
438 	switch (option) {
439 	case O_FILEENCODING:
440 		msgq(sp, M_ERR, "321|File encoding conversion not supported");
441 		break;
442 	case O_INPUTENCODING:
443 		msgq(sp, M_ERR, "322|Input encoding conversion not supported");
444 		break;
445 	}
446 	return 1;
447 }
448 
449 /*
450  * conv_end --
451  *	Close the iconv descriptors, release the buffer.
452  *
453  * PUBLIC: void conv_end(SCR *);
454  */
455 void
456 conv_end(SCR *sp)
457 {
458 #if defined(USE_WIDECHAR) && defined(USE_ICONV)
459 	int i;
460 	for (i = 0; i <= IC_IE_TO_UTF16; ++i)
461 		if (sp->conv.id[i] != (iconv_t)-1)
462 			iconv_close(sp->conv.id[i]);
463 	free(sp->cw.bp1.c);
464 #endif
465 }
466