xref: /freebsd/usr.bin/tr/str.c (revision 7660b554bc59a07be0431c17e0e33815818baa69)
1 /*-
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 
36 __FBSDID("$FreeBSD$");
37 
38 #ifndef lint
39 static const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
40 #endif
41 
42 #include <sys/cdefs.h>
43 #include <sys/types.h>
44 
45 #include <ctype.h>
46 #include <err.h>
47 #include <stddef.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 
52 #include "extern.h"
53 
54 static int      backslash(STR *, int *);
55 static int	bracket(STR *);
56 static int	c_class(const void *, const void *);
57 static void	genclass(STR *);
58 static void	genequiv(STR *);
59 static int      genrange(STR *, int);
60 static void	genseq(STR *);
61 
62 int
63 next(s)
64 	STR *s;
65 {
66 	int ch, is_octal;
67 
68 	switch (s->state) {
69 	case EOS:
70 		return (0);
71 	case INFINITE:
72 		return (1);
73 	case NORMAL:
74 		switch (ch = (u_char)*s->str) {
75 		case '\0':
76 			s->state = EOS;
77 			return (0);
78 		case '\\':
79 			s->lastch = backslash(s, &is_octal);
80 			break;
81 		case '[':
82 			if (bracket(s))
83 				return (next(s));
84 			/* FALLTHROUGH */
85 		default:
86 			is_octal = 0;
87 			++s->str;
88 			s->lastch = ch;
89 			break;
90 		}
91 
92 		/* We can start a range at any time. */
93 		if (s->str[0] == '-' && genrange(s, is_octal))
94 			return (next(s));
95 		return (1);
96 	case RANGE:
97 		if (s->cnt-- == 0) {
98 			s->state = NORMAL;
99 			return (next(s));
100 		}
101 		++s->lastch;
102 		return (1);
103 	case SEQUENCE:
104 		if (s->cnt-- == 0) {
105 			s->state = NORMAL;
106 			return (next(s));
107 		}
108 		return (1);
109 	case SET:
110 	case SET_UPPER:
111 	case SET_LOWER:
112 		if ((ch = s->set[s->cnt++]) == OOBCH) {
113 			s->state = NORMAL;
114 			return (next(s));
115 		}
116 		s->lastch = ch;
117 		return (1);
118 	default:
119 		return (0);
120 	}
121 	/* NOTREACHED */
122 }
123 
124 static int
125 bracket(s)
126 	STR *s;
127 {
128 	char *p;
129 
130 	switch (s->str[1]) {
131 	case ':':				/* "[:class:]" */
132 		if ((p = strchr(s->str + 2, ']')) == NULL)
133 			return (0);
134 		if (*(p - 1) != ':' || p - s->str < 4)
135 			goto repeat;
136 		*(p - 1) = '\0';
137 		s->str += 2;
138 		genclass(s);
139 		s->str = p + 1;
140 		return (1);
141 	case '=':				/* "[=equiv=]" */
142 		if ((p = strchr(s->str + 2, ']')) == NULL)
143 			return (0);
144 		if (*(p - 1) != '=' || p - s->str < 4)
145 			goto repeat;
146 		s->str += 2;
147 		genequiv(s);
148 		return (1);
149 	default:				/* "[\###*n]" or "[#*n]" */
150 	repeat:
151 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
152 			return (0);
153 		if (p[0] != '*' || index(p, ']') == NULL)
154 			return (0);
155 		s->str += 1;
156 		genseq(s);
157 		return (1);
158 	}
159 	/* NOTREACHED */
160 }
161 
162 typedef struct {
163 	const char *name;
164 	int (*func)(int);
165 	int *set;
166 } CLASS;
167 
168 static CLASS classes[] = {
169 #undef isalnum
170 	{ "alnum",  isalnum,  NULL },
171 #undef isalpha
172 	{ "alpha",  isalpha,  NULL },
173 #undef isblank
174 	{ "blank",  isblank,  NULL },
175 #undef iscntrl
176 	{ "cntrl",  iscntrl,  NULL },
177 #undef isdigit
178 	{ "digit",  isdigit,  NULL },
179 #undef isgraph
180 	{ "graph",  isgraph,  NULL },
181 #undef islower
182 	{ "lower",  islower,  NULL },
183 #undef isprint
184 	{ "print",  isprint,  NULL },
185 #undef ispunct
186 	{ "punct",  ispunct,  NULL },
187 #undef isspace
188 	{ "space",  isspace,  NULL },
189 #undef isupper
190 	{ "upper",  isupper,  NULL },
191 #undef isxdigit
192 	{ "xdigit", isxdigit, NULL },
193 };
194 
195 static void
196 genclass(s)
197 	STR *s;
198 {
199 	int cnt, (*func)(int);
200 	CLASS *cp, tmp;
201 	int *p;
202 
203 	tmp.name = s->str;
204 	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
205 	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
206 		errx(1, "unknown class %s", s->str);
207 
208 	if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
209 		err(1, "genclass() malloc");
210 	for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
211 		if ((func)(cnt))
212 			*p++ = cnt;
213 	*p = OOBCH;
214 
215 	s->cnt = 0;
216 	s->set = cp->set;
217 	if (strcmp(s->str, "upper") == 0)
218 		s->state = SET_UPPER;
219 	else if (strcmp(s->str, "lower") == 0)
220 		s->state = SET_LOWER;
221 	else
222 		s->state = SET;
223 }
224 
225 static int
226 c_class(a, b)
227 	const void *a, *b;
228 {
229 	return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name));
230 }
231 
232 static void
233 genequiv(s)
234 	STR *s;
235 {
236 	int i, p, pri;
237 	char src[2], dst[3];
238 
239 	if (*s->str == '\\') {
240 		s->equiv[0] = backslash(s, NULL);
241 		if (*s->str != '=')
242 			errx(1, "misplaced equivalence equals sign");
243 		s->str += 2;
244 	} else {
245 		s->equiv[0] = s->str[0];
246 		if (s->str[1] != '=')
247 			errx(1, "misplaced equivalence equals sign");
248 		s->str += 3;
249 	}
250 
251 	/*
252 	 * Calculate the set of all characters in the same equivalence class
253 	 * as the specified character (they will have the same primary
254 	 * collation weights).
255 	 * XXX Knows too much about how strxfrm() is implemented. Assumes
256 	 * it fills the string with primary collation weight bytes. Only one-
257 	 * to-one mappings are supported.
258 	 */
259 	src[0] = s->equiv[0];
260 	src[1] = '\0';
261 	if (strxfrm(dst, src, sizeof(dst)) == 1) {
262 		pri = (unsigned char)*dst;
263 		for (p = 1, i = 1; i < NCHARS; i++) {
264 			*src = i;
265 			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
266 			    pri == (unsigned char)*dst)
267 				s->equiv[p++] = i;
268 		}
269 		s->equiv[p] = OOBCH;
270 	}
271 
272 	s->cnt = 0;
273 	s->state = SET;
274 	s->set = s->equiv;
275 }
276 
277 static int
278 genrange(STR *s, int was_octal)
279 {
280 	int stopval, octal;
281 	char *savestart;
282 	int n, cnt, *p;
283 
284 	octal = 0;
285 	savestart = s->str;
286 	stopval = *++s->str == '\\' ? backslash(s, &octal) : (u_char)*s->str++;
287 	if (!octal)
288 		octal = was_octal;
289 
290 	if ((octal && stopval < s->lastch) ||
291 	    (!octal &&
292 	     charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0)) {
293 		s->str = savestart;
294 		return (0);
295 	}
296 	if (octal) {
297 		s->cnt = stopval - s->lastch + 1;
298 		s->state = RANGE;
299 		--s->lastch;
300 		return (1);
301 	}
302 	if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
303 		err(1, "genrange() malloc");
304 	for (cnt = 0; cnt < NCHARS; cnt++)
305 		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
306 		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
307 			*p++ = cnt;
308 	*p = OOBCH;
309 	n = p - s->set;
310 
311 	s->cnt = 0;
312 	s->state = SET;
313 	if (n > 1)
314 		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
315 	return (1);
316 }
317 
318 static void
319 genseq(s)
320 	STR *s;
321 {
322 	char *ep;
323 
324 	if (s->which == STRING1)
325 		errx(1, "sequences only valid in string2");
326 
327 	if (*s->str == '\\')
328 		s->lastch = backslash(s, NULL);
329 	else
330 		s->lastch = *s->str++;
331 	if (*s->str != '*')
332 		errx(1, "misplaced sequence asterisk");
333 
334 	switch (*++s->str) {
335 	case '\\':
336 		s->cnt = backslash(s, NULL);
337 		break;
338 	case ']':
339 		s->cnt = 0;
340 		++s->str;
341 		break;
342 	default:
343 		if (isdigit((u_char)*s->str)) {
344 			s->cnt = strtol(s->str, &ep, 0);
345 			if (*ep == ']') {
346 				s->str = ep + 1;
347 				break;
348 			}
349 		}
350 		errx(1, "illegal sequence count");
351 		/* NOTREACHED */
352 	}
353 
354 	s->state = s->cnt ? SEQUENCE : INFINITE;
355 }
356 
357 /*
358  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
359  * an escape code or a literal character.
360  */
361 static int
362 backslash(STR *s, int *is_octal)
363 {
364 	int ch, cnt, val;
365 
366 	if (is_octal != NULL)
367 		*is_octal = 0;
368 	for (cnt = val = 0;;) {
369 		ch = (u_char)*++s->str;
370 		if (!isdigit(ch))
371 			break;
372 		val = val * 8 + ch - '0';
373 		if (++cnt == 3) {
374 			++s->str;
375 			break;
376 		}
377 	}
378 	if (cnt) {
379 		if (is_octal != NULL)
380 			*is_octal = 1;
381 		return (val);
382 	}
383 	if (ch != '\0')
384 		++s->str;
385 	switch (ch) {
386 		case 'a':			/* escape characters */
387 			return ('\7');
388 		case 'b':
389 			return ('\b');
390 		case 'f':
391 			return ('\f');
392 		case 'n':
393 			return ('\n');
394 		case 'r':
395 			return ('\r');
396 		case 't':
397 			return ('\t');
398 		case 'v':
399 			return ('\13');
400 		case '\0':			/*  \" -> \ */
401 			s->state = EOS;
402 			return ('\\');
403 		default:			/* \x" -> x */
404 			return (ch);
405 	}
406 }
407