xref: /freebsd/contrib/less/charset.c (revision 3642298923e528d795e3a30ec165d2b469e28b40)
1 /*
2  * Copyright (C) 1984-2002  Mark Nudelman
3  *
4  * You may distribute under the terms of either the GNU General Public
5  * License or the Less License, as specified in the README file.
6  *
7  * For more information about less, or for information on how to
8  * contact the author, see the README file.
9  */
10 
11 
12 /*
13  * Functions to define the character set
14  * and do things specific to the character set.
15  */
16 
17 #include "less.h"
18 #if HAVE_LOCALE
19 #include <locale.h>
20 #include <ctype.h>
21 #endif
22 
23 public int utf_mode = 0;
24 
25 /*
26  * Predefined character sets,
27  * selected by the LESSCHARSET environment variable.
28  */
29 struct charset {
30 	char *name;
31 	int *p_flag;
32 	char *desc;
33 } charsets[] = {
34 	{ "ascii",	NULL,       "8bcccbcc18b95.b" },
35 	{ "dos",	NULL,       "8bcccbcc12bc5b223.b" },
36 	{ "ebcdic",	NULL,       "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
37 	{ "IBM-1047",	NULL,       "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
38 	{ "iso8859",	NULL,       "8bcccbcc18b95.33b." },
39 	{ "koi8-r",	NULL,       "8bcccbcc18b95.b128." },
40 	{ "next",	NULL,       "8bcccbcc18b95.bb125.bb" },
41 	{ "utf-8",	&utf_mode,  "8bcccbcc18b." },
42 	{ NULL, NULL, NULL }
43 };
44 
/*
 * An alternate name for a predefined charset.
 *	name	the alias
 *	oname	the name in the charsets table that it stands for
 */
struct cs_alias {
	char *name;
	char *oname;
};

/*
 * Known aliases, consulted by icharset() before the charsets table.
 * The list ends at the entry whose name is NULL.
 */
struct cs_alias cs_aliases[] = {
	{ "latin1",	"iso8859" },
	{ "latin9",	"iso8859" },
	{ NULL, NULL }
};
53 
54 #define	IS_BINARY_CHAR	01
55 #define	IS_CONTROL_CHAR	02
56 
57 static char chardef[256];
58 static char *binfmt = NULL;
59 public int binattr = AT_STANDOUT;
60 
61 
62 /*
63  * Define a charset, given a description string.
64  * The string consists of 256 letters,
65  * one for each character in the charset.
66  * If the string is shorter than 256 letters, missing letters
67  * are taken to be identical to the last one.
68  * A decimal number followed by a letter is taken to be a
69  * repetition of the letter.
70  *
71  * Each letter is one of:
72  *	. normal character
73  *	b binary character
74  *	c control character
75  */
76 	static void
77 ichardef(s)
78 	char *s;
79 {
80 	register char *cp;
81 	register int n;
82 	register char v;
83 
84 	n = 0;
85 	v = 0;
86 	cp = chardef;
87 	while (*s != '\0')
88 	{
89 		switch (*s++)
90 		{
91 		case '.':
92 			v = 0;
93 			break;
94 		case 'c':
95 			v = IS_CONTROL_CHAR;
96 			break;
97 		case 'b':
98 			v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
99 			break;
100 
101 		case '0': case '1': case '2': case '3': case '4':
102 		case '5': case '6': case '7': case '8': case '9':
103 			n = (10 * n) + (s[-1] - '0');
104 			continue;
105 
106 		default:
107 			error("invalid chardef", NULL_PARG);
108 			quit(QUIT_ERROR);
109 			/*NOTREACHED*/
110 		}
111 
112 		do
113 		{
114 			if (cp >= chardef + sizeof(chardef))
115 			{
116 				error("chardef longer than 256", NULL_PARG);
117 				quit(QUIT_ERROR);
118 				/*NOTREACHED*/
119 			}
120 			*cp++ = v;
121 		} while (--n > 0);
122 		n = 0;
123 	}
124 
125 	while (cp < chardef + sizeof(chardef))
126 		*cp++ = v;
127 }
128 
129 /*
130  * Define a charset, given a charset name.
131  * The valid charset names are listed in the "charsets" array.
132  */
133 	static int
134 icharset(name)
135 	register char *name;
136 {
137 	register struct charset *p;
138 	register struct cs_alias *a;
139 
140 	if (name == NULL || *name == '\0')
141 		return (0);
142 
143 	/* First see if the name is an alias. */
144 	for (a = cs_aliases;  a->name != NULL;  a++)
145 	{
146 		if (strcmp(name, a->name) == 0)
147 		{
148 			name = a->oname;
149 			break;
150 		}
151 	}
152 
153 	for (p = charsets;  p->name != NULL;  p++)
154 	{
155 		if (strcmp(name, p->name) == 0)
156 		{
157 			ichardef(p->desc);
158 			if (p->p_flag != NULL)
159 				*(p->p_flag) = 1;
160 			return (1);
161 		}
162 	}
163 
164 	error("invalid charset name", NULL_PARG);
165 	quit(QUIT_ERROR);
166 	/*NOTREACHED*/
167 	return (0);
168 }
169 
170 #if HAVE_LOCALE
171 /*
172  * Define a charset, given a locale name.
173  */
174 	static void
175 ilocale()
176 {
177 	register int c;
178 
179 	setlocale(LC_ALL, "");
180 	for (c = 0;  c < (int) sizeof(chardef);  c++)
181 	{
182 		if (isprint(c))
183 			chardef[c] = 0;
184 		else if (iscntrl(c))
185 			chardef[c] = IS_CONTROL_CHAR;
186 		else
187 			chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
188 	}
189 }
190 #endif
191 
192 /*
193  * Define the printing format for control chars.
194  */
195    	public void
196 setbinfmt(s)
197 	char *s;
198 {
199 	if (s == NULL || *s == '\0')
200 		s = "*s<%X>";
201 	/*
202 	 * Select the attributes if it starts with "*".
203 	 */
204 	if (*s == '*')
205 	{
206 		switch (s[1])
207 		{
208 		case 'd':  binattr = AT_BOLD;      break;
209 		case 'k':  binattr = AT_BLINK;     break;
210 		case 's':  binattr = AT_STANDOUT;  break;
211 		case 'u':  binattr = AT_UNDERLINE; break;
212 		default:   binattr = AT_NORMAL;    break;
213 		}
214 		s += 2;
215 	}
216 	binfmt = s;
217 }
218 
219 /*
220  * Initialize charset data structures.
221  */
222 	public void
223 init_charset()
224 {
225 	register char *s;
226 
227 	s = lgetenv("LESSBINFMT");
228 	setbinfmt(s);
229 
230 	/*
231 	 * See if environment variable LESSCHARSET is defined.
232 	 */
233 	s = lgetenv("LESSCHARSET");
234 	if (icharset(s))
235 		return;
236 	/*
237 	 * LESSCHARSET is not defined: try LESSCHARDEF.
238 	 */
239 	s = lgetenv("LESSCHARDEF");
240 	if (s != NULL && *s != '\0')
241 	{
242 		ichardef(s);
243 		return;
244 	}
245 
246 #if HAVE_STRSTR
247 	/*
248 	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
249 	 */
250 	if ((s = lgetenv("LC_ALL")) != NULL ||
251 	    (s = lgetenv("LC_CTYPE")) != NULL ||
252 	    (s = lgetenv("LANG")) != NULL)
253 	{
254 		if (strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL)
255 			if (icharset("utf-8"))
256 				return;
257 	}
258 #endif
259 
260 #if HAVE_LOCALE
261 	/*
262 	 * Use setlocale.
263 	 */
264 	ilocale();
265 #else
266 #if MSDOS_COMPILER
267 	/*
268 	 * Default to "dos".
269 	 */
270 	(void) icharset("dos");
271 #else
272 	/*
273 	 * Default to "latin1".
274 	 */
275 	(void) icharset("latin1");
276 #endif
277 #endif
278 }
279 
280 /*
281  * Is a given character a "binary" character?
282  */
283 	public int
284 binary_char(c)
285 	unsigned char c;
286 {
287 	c &= 0377;
288 	return (chardef[c] & IS_BINARY_CHAR);
289 }
290 
291 /*
292  * Is a given character a "control" character?
293  */
294 	public int
295 control_char(c)
296 	int c;
297 {
298 	c &= 0377;
299 	return (chardef[c] & IS_CONTROL_CHAR);
300 }
301 
302 /*
303  * Return the printable form of a character.
304  * For example, in the "ascii" charset '\3' is printed as "^C".
305  */
306 	public char *
307 prchar(c)
308 	int c;
309 {
310 	static char buf[8];
311 
312 	c &= 0377;
313 	if (!control_char(c))
314 		sprintf(buf, "%c", c);
315 	else if (c == ESC)
316 		sprintf(buf, "ESC");
317 #if IS_EBCDIC_HOST
318 	else if (!binary_char(c) && c < 64)
319 		sprintf(buf, "^%c",
320 		/*
321 		 * This array roughly inverts CONTROL() #defined in less.h,
322 	 	 * and should be kept in sync with CONTROL() and IBM-1047.
323  	 	 */
324 		"@ABC.I.?...KLMNO"
325 		"PQRS.JH.XY.."
326 		"\\]^_"
327 		"......W[.....EFG"
328 		"..V....D....TU.Z"[c]);
329 #else
330   	else if (c < 128 && !control_char(c ^ 0100))
331   		sprintf(buf, "^%c", c ^ 0100);
332 #endif
333 	else
334 		sprintf(buf, binfmt, c);
335 	return (buf);
336 }
337