xref: /freebsd/contrib/less/charset.c (revision 5521ff5a4d1929056e7ffc982fac3341ca54df7c)
1 /*
2  * Copyright (C) 1984-2000  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information about less, or for information on how to
8  * contact the author, see the README file.
9  */
10 
11 
12 /*
13  * Functions to define the character set
14  * and do things specific to the character set.
15  */
16 
#include "less.h"
#include <stdio.h>
#if HAVE_LOCALE
#include <locale.h>
#include <ctype.h>
#endif
22 
/*
 * UTF-8 mode flag.
 * Starts at 0; icharset() sets it to 1 (through the p_flag pointer of
 * the "utf-8" entry in the charsets table) when that charset is selected.
 */
public int utf_mode = 0;
24 
25 /*
26  * Predefined character sets,
27  * selected by the LESSCHARSET environment variable.
28  */
29 struct charset {
30 	char *name;
31 	int *p_flag;
32 	char *desc;
33 } charsets[] = {
34 	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
35 	{ "dos",	NULL,       "8bcccbcc12bc5b95.b." },
36 	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
37 	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
38 	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
39 	{ "latin1",	NULL,       "8bcccbcc18b95.33b." },
40 	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
41 	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
42 	{ NULL, NULL, NULL }
43 };
44 
/*
 * Flag bits stored in each chardef[] entry.
 * ichardef() and ilocale() give "binary" characters both bits,
 * "control" characters only IS_CONTROL_CHAR, and normal characters 0.
 */
#define	IS_BINARY_CHAR	01
#define	IS_CONTROL_CHAR	02

/* Classification flags for each of the 256 character codes. */
static char chardef[256];
/* printf-style format used by prchar() as its last resort; set by setbinfmt(). */
static char *binfmt = NULL;
/* Display attribute chosen by setbinfmt() from a leading "*x" prefix. */
public int binattr = AT_STANDOUT;
51 
52 
53 /*
54  * Define a charset, given a description string.
55  * The string consists of 256 letters,
56  * one for each character in the charset.
57  * If the string is shorter than 256 letters, missing letters
58  * are taken to be identical to the last one.
59  * A decimal number followed by a letter is taken to be a
60  * repetition of the letter.
61  *
62  * Each letter is one of:
63  *	. normal character
64  *	b binary character
65  *	c control character
66  */
67 	static void
68 ichardef(s)
69 	char *s;
70 {
71 	register char *cp;
72 	register int n;
73 	register char v;
74 
75 	n = 0;
76 	v = 0;
77 	cp = chardef;
78 	while (*s != '\0')
79 	{
80 		switch (*s++)
81 		{
82 		case '.':
83 			v = 0;
84 			break;
85 		case 'c':
86 			v = IS_CONTROL_CHAR;
87 			break;
88 		case 'b':
89 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
90 			break;
91 
92 		case '0': case '1': case '2': case '3': case '4':
93 		case '5': case '6': case '7': case '8': case '9':
94 			n = (10 * n) + (s[-1] - '0');
95 			continue;
96 
97 		default:
98 			error("invalid chardef", NULL_PARG);
99 			quit(QUIT_ERROR);
100 			/*NOTREACHED*/
101 		}
102 
103 		do
104 		{
105 			if (cp >= chardef + sizeof(chardef))
106 			{
107 				error("chardef longer than 256", NULL_PARG);
108 				quit(QUIT_ERROR);
109 				/*NOTREACHED*/
110 			}
111 			*cp++ = v;
112 		} while (--n > 0);
113 		n = 0;
114 	}
115 
116 	while (cp < chardef + sizeof(chardef))
117 		*cp++ = v;
118 }
119 
120 /*
121  * Define a charset, given a charset name.
122  * The valid charset names are listed in the "charsets" array.
123  */
124 	static int
125 icharset(name)
126 	register char *name;
127 {
128 	register struct charset *p;
129 
130 	if (name == NULL || *name == '\0')
131 		return (0);
132 
133 	for (p = charsets;  p->name != NULL;  p++)
134 	{
135 		if (strcmp(name, p->name) == 0)
136 		{
137 			ichardef(p->desc);
138 			if (p->p_flag != NULL)
139 				*(p->p_flag) = 1;
140 			return (1);
141 		}
142 	}
143 
144 	error("invalid charset name", NULL_PARG);
145 	quit(QUIT_ERROR);
146 	/*NOTREACHED*/
147 }
148 
#if HAVE_LOCALE
/*
 * Build the charset table from the current locale.
 * After setlocale(LC_ALL, ""), each of the 256 character codes is
 * classified: printable ones are normal, non-printable ones that
 * iscntrl() accepts are control characters, and everything else
 * is binary.
 */
	static void
ilocale()
{
	register int ch;
	register char flags;

	setlocale(LC_ALL, "");
	for (ch = 0;  ch < (int) sizeof(chardef);  ch++)
	{
		if (isprint(ch))
			flags = 0;
		else if (iscntrl(ch))
			flags = IS_CONTROL_CHAR;
		else
			flags = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[ch] = flags;
	}
}
#endif
170 
171 /*
172  * Define the printing format for control chars.
173  */
174    	public void
175 setbinfmt(s)
176 	char *s;
177 {
178 	if (s == NULL || *s == '\0')
179 		s = "*s<%X>";
180 	/*
181 	 * Select the attributes if it starts with "*".
182 	 */
183 	if (*s == '*')
184 	{
185 		switch (s[1])
186 		{
187 		case 'd':  binattr = AT_BOLD;      break;
188 		case 'k':  binattr = AT_BLINK;     break;
189 		case 's':  binattr = AT_STANDOUT;  break;
190 		case 'u':  binattr = AT_UNDERLINE; break;
191 		default:   binattr = AT_NORMAL;    break;
192 		}
193 		s += 2;
194 	}
195 	binfmt = s;
196 }
197 
198 /*
199  * Initialize charset data structures.
200  */
201 	public void
202 init_charset()
203 {
204 	register char *s;
205 
206 	s = lgetenv("LESSBINFMT");
207 	setbinfmt(s);
208 
209 	/*
210 	 * See if environment variable LESSCHARSET is defined.
211 	 */
212 	s = lgetenv("LESSCHARSET");
213 	if (icharset(s))
214 		return;
215 	/*
216 	 * LESSCHARSET is not defined: try LESSCHARDEF.
217 	 */
218 	s = lgetenv("LESSCHARDEF");
219 	if (s != NULL && *s != '\0')
220 	{
221 		ichardef(s);
222 		return;
223 	}
224 
225 #if HAVE_STRSTR
226 	/*
227 	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
228 	 */
229 	if ((s = lgetenv("LC_ALL")) != NULL ||
230 	    (s = lgetenv("LC_CTYPE")) != NULL ||
231 	    (s = lgetenv("LANG")) != NULL)
232 	{
233 		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
234 			if (icharset("utf-8"))
235 				return;
236 	}
237 #endif
238 
239 #if HAVE_LOCALE
240 	/*
241 	 * Use setlocale.
242 	 */
243 	ilocale();
244 #else
245 	/*
246 	 * Default to "latin1".
247 	 */
248 	(void) icharset("latin1");
249 #endif
250 }
251 
252 /*
253  * Is a given character a "binary" character?
254  */
255 	public int
256 binary_char(c)
257 	unsigned char c;
258 {
259 	c &= 0377;
260 	return (chardef[c] & IS_BINARY_CHAR);
261 }
262 
263 /*
264  * Is a given character a "control" character?
265  */
266 	public int
267 control_char(c)
268 	int c;
269 {
270 	c &= 0377;
271 	return (chardef[c] & IS_CONTROL_CHAR);
272 }
273 
274 /*
275  * Return the printable form of a character.
276  * For example, in the "ascii" charset '\3' is printed as "^C".
277  */
278 	public char *
279 prchar(c)
280 	int c;
281 {
282 	static char buf[8];
283 
284 	c &= 0377;
285 	if (!control_char(c))
286 		sprintf(buf, "%c", c);
287 	else if (c == ESC)
288 		sprintf(buf, "ESC");
289 	else if (c < 128 && !control_char(c ^ 0100))
290 		sprintf(buf, "^%c", c ^ 0100);
291 	else
292 		sprintf(buf, binfmt, c);
293 	return (buf);
294 }
295