xref: /freebsd/contrib/less/charset.c (revision 5773cccf19ef7b97e56c1101aa481c43149224da)
1 /*
2  * Copyright (C) 1984-2000  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information about less, or for information on how to
8  * contact the author, see the README file.
9  */
10 
11 
12 /*
13  * Functions to define the character set
14  * and do things specific to the character set.
15  */
16 
17 #include "less.h"
18 #if HAVE_LOCALE
19 #include <locale.h>
20 #include <ctype.h>
21 #endif
22 
23 public int utf_mode = 0;
24 
25 /*
26  * Predefined character sets,
27  * selected by the LESSCHARSET environment variable.
28  */
29 struct charset {
30 	char *name;
31 	int *p_flag;
32 	char *desc;
33 } charsets[] = {
34 	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
35 	{ "dos",	NULL,       "8bcccbcc12bc5b223.b" },
36 	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
37 	{ "IBM-1047",	NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
38 	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
39 	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
40 	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
41 	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
42 	{ NULL, NULL, NULL }
43 };
44 
/*
 * Alternative names for entries in the charsets table.
 * "name" is the alias and "oname" is the charset name it stands for.
 * The table is terminated by an entry whose name is NULL.
 */
struct cs_alias {
	char *name;
	char *oname;
};

struct cs_alias cs_aliases[] = {
	{ "latin1", "iso8859" },
	{ "latin9", "iso8859" },
	{ NULL,     NULL }
};
53 
54 #define	IS_BINARY_CHAR	01
55 #define	IS_CONTROL_CHAR	02
56 
57 static char chardef[256];
58 static char *binfmt = NULL;
59 public int binattr = AT_STANDOUT;
60 
61 
62 /*
63  * Define a charset, given a description string.
64  * The string consists of 256 letters,
65  * one for each character in the charset.
66  * If the string is shorter than 256 letters, missing letters
67  * are taken to be identical to the last one.
68  * A decimal number followed by a letter is taken to be a
69  * repetition of the letter.
70  *
71  * Each letter is one of:
72  *	. normal character
73  *	b binary character
74  *	c control character
75  */
76 	static void
77 ichardef(s)
78 	char *s;
79 {
80 	register char *cp;
81 	register int n;
82 	register char v;
83 
84 	n = 0;
85 	v = 0;
86 	cp = chardef;
87 	while (*s != '\0')
88 	{
89 		switch (*s++)
90 		{
91 		case '.':
92 			v = 0;
93 			break;
94 		case 'c':
95 			v = IS_CONTROL_CHAR;
96 			break;
97 		case 'b':
98 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
99 			break;
100 
101 		case '0': case '1': case '2': case '3': case '4':
102 		case '5': case '6': case '7': case '8': case '9':
103 			n = (10 * n) + (s[-1] - '0');
104 			continue;
105 
106 		default:
107 			error("invalid chardef", NULL_PARG);
108 			quit(QUIT_ERROR);
109 			/*NOTREACHED*/
110 		}
111 
112 		do
113 		{
114 			if (cp >= chardef + sizeof(chardef))
115 			{
116 				error("chardef longer than 256", NULL_PARG);
117 				quit(QUIT_ERROR);
118 				/*NOTREACHED*/
119 			}
120 			*cp++ = v;
121 		} while (--n > 0);
122 		n = 0;
123 	}
124 
125 	while (cp < chardef + sizeof(chardef))
126 		*cp++ = v;
127 }
128 
129 /*
130  * Define a charset, given a charset name.
131  * The valid charset names are listed in the "charsets" array.
132  */
133 	static int
134 icharset(name)
135 	register char *name;
136 {
137 	register struct charset *p;
138 	register struct cs_alias *a;
139 
140 	if (name == NULL || *name == '\0')
141 		return (0);
142 
143 	/* First see if the name is an alias. */
144 	for (a = cs_aliases;  a->name != NULL;  a++)
145 	{
146 		if (strcmp(name, a->name) == 0)
147 		{
148 			name = a->oname;
149 			break;
150 		}
151 	}
152 
153 	for (p = charsets;  p->name != NULL;  p++)
154 	{
155 		if (strcmp(name, p->name) == 0)
156 		{
157 			ichardef(p->desc);
158 			if (p->p_flag != NULL)
159 				*(p->p_flag) = 1;
160 			return (1);
161 		}
162 	}
163 
164 	error("invalid charset name", NULL_PARG);
165 	quit(QUIT_ERROR);
166 	/*NOTREACHED*/
167 }
168 
#if HAVE_LOCALE
/*
 * Fill in the charset table from the current locale.
 * After setlocale(LC_ALL, ""), every character value that isprint()
 * accepts is normal, every other value that iscntrl() accepts is a
 * control character, and anything else is binary.
 */
	static void
ilocale()
{
	register int ch;
	register char flags;

	setlocale(LC_ALL, "");
	ch = 0;
	while (ch < (int) sizeof(chardef))
	{
		if (isprint(ch))
			flags = 0;
		else if (iscntrl(ch))
			flags = IS_CONTROL_CHAR;
		else
			flags = IS_BINARY_CHAR|IS_CONTROL_CHAR;
		chardef[ch] = flags;
		ch++;
	}
}
#endif
190 
191 /*
192  * Define the printing format for control chars.
193  */
194    	public void
195 setbinfmt(s)
196 	char *s;
197 {
198 	if (s == NULL || *s == '\0')
199 		s = "*s<%X>";
200 	/*
201 	 * Select the attributes if it starts with "*".
202 	 */
203 	if (*s == '*')
204 	{
205 		switch (s[1])
206 		{
207 		case 'd':  binattr = AT_BOLD;      break;
208 		case 'k':  binattr = AT_BLINK;     break;
209 		case 's':  binattr = AT_STANDOUT;  break;
210 		case 'u':  binattr = AT_UNDERLINE; break;
211 		default:   binattr = AT_NORMAL;    break;
212 		}
213 		s += 2;
214 	}
215 	binfmt = s;
216 }
217 
218 /*
219  * Initialize charset data structures.
220  */
221 	public void
222 init_charset()
223 {
224 	register char *s;
225 
226 	s = lgetenv("LESSBINFMT");
227 	setbinfmt(s);
228 
229 	/*
230 	 * See if environment variable LESSCHARSET is defined.
231 	 */
232 	s = lgetenv("LESSCHARSET");
233 	if (icharset(s))
234 		return;
235 	/*
236 	 * LESSCHARSET is not defined: try LESSCHARDEF.
237 	 */
238 	s = lgetenv("LESSCHARDEF");
239 	if (s != NULL && *s != '\0')
240 	{
241 		ichardef(s);
242 		return;
243 	}
244 
245 #if HAVE_STRSTR
246 	/*
247 	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
248 	 */
249 	if ((s = lgetenv("LC_ALL")) != NULL ||
250 	    (s = lgetenv("LC_CTYPE")) != NULL ||
251 	    (s = lgetenv("LANG")) != NULL)
252 	{
253 		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
254 			if (icharset("utf-8"))
255 				return;
256 	}
257 #endif
258 
259 #if HAVE_LOCALE
260 	/*
261 	 * Use setlocale.
262 	 */
263 	ilocale();
264 #else
265 #if MSDOS_COMPILER
266 	/*
267 	 * Default to "dos".
268 	 */
269 	(void) icharset("dos");
270 #else
271 	/*
272 	 * Default to "latin1".
273 	 */
274 	(void) icharset("latin1");
275 #endif
276 #endif
277 }
278 
279 /*
280  * Is a given character a "binary" character?
281  */
282 	public int
283 binary_char(c)
284 	unsigned char c;
285 {
286 	c &= 0377;
287 	return (chardef[c] & IS_BINARY_CHAR);
288 }
289 
290 /*
291  * Is a given character a "control" character?
292  */
293 	public int
294 control_char(c)
295 	int c;
296 {
297 	c &= 0377;
298 	return (chardef[c] & IS_CONTROL_CHAR);
299 }
300 
301 /*
302  * Return the printable form of a character.
303  * For example, in the "ascii" charset '\3' is printed as "^C".
304  */
305 	public char *
306 prchar(c)
307 	int c;
308 {
309 	static char buf[8];
310 
311 	c &= 0377;
312 	if (!control_char(c))
313 		sprintf(buf, "%c", c);
314 	else if (c == ESC)
315 		sprintf(buf, "ESC");
316 #if IS_EBCDIC_HOST
317 	else if (!binary_char(c) && c < 64)
318 		sprintf(buf, "^%c",
319 		/*
320 		 * This array roughly inverts CONTROL() #defined in less.h,
321 	 	 * and should be kept in sync with CONTROL() and IBM-1047.
322  	 	 */
323 		"@ABC.I.?...KLMNO"
324 		"PQRS.JH.XY.."
325 		"\\]^_"
326 		"......W[.....EFG"
327 		"..V....D....TU.Z"[c]);
328 #else
329   	else if (c < 128 && !control_char(c ^ 0100))
330   		sprintf(buf, "^%c", c ^ 0100);
331 #endif
332 	else
333 		sprintf(buf, binfmt, c);
334 	return (buf);
335 }
336