xref: /freebsd/usr.bin/tr/str.c (revision a5921bc3653e2e286715e6fe8d473ec0d02da38c)
1 /*-
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 
32 __FBSDID("$FreeBSD$");
33 
34 #ifndef lint
35 static const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
36 #endif
37 
38 #include <sys/types.h>
39 
40 #include <ctype.h>
41 #include <err.h>
42 #include <errno.h>
43 #include <stddef.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <wchar.h>
48 #include <wctype.h>
49 
50 #include "extern.h"
51 
52 static int      backslash(STR *, int *);
53 static int	bracket(STR *);
54 static void	genclass(STR *);
55 static void	genequiv(STR *);
56 static int      genrange(STR *, int);
57 static void	genseq(STR *);
58 
59 wint_t
60 next(STR *s)
61 {
62 	int is_octal;
63 	wint_t ch;
64 	wchar_t wch;
65 	size_t clen;
66 
67 	switch (s->state) {
68 	case EOS:
69 		return (0);
70 	case INFINITE:
71 		return (1);
72 	case NORMAL:
73 		switch (*s->str) {
74 		case '\0':
75 			s->state = EOS;
76 			return (0);
77 		case '\\':
78 			s->lastch = backslash(s, &is_octal);
79 			break;
80 		case '[':
81 			if (bracket(s))
82 				return (next(s));
83 			/* FALLTHROUGH */
84 		default:
85 			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
86 			if (clen == (size_t)-1 || clen == (size_t)-2 ||
87 			    clen == 0)
88 				errc(1, EILSEQ, NULL);
89 			is_octal = 0;
90 			s->lastch = wch;
91 			s->str += clen;
92 			break;
93 		}
94 
95 		/* We can start a range at any time. */
96 		if (s->str[0] == '-' && genrange(s, is_octal))
97 			return (next(s));
98 		return (1);
99 	case RANGE:
100 		if (s->cnt-- == 0) {
101 			s->state = NORMAL;
102 			return (next(s));
103 		}
104 		++s->lastch;
105 		return (1);
106 	case SEQUENCE:
107 		if (s->cnt-- == 0) {
108 			s->state = NORMAL;
109 			return (next(s));
110 		}
111 		return (1);
112 	case CCLASS:
113 	case CCLASS_UPPER:
114 	case CCLASS_LOWER:
115 		s->cnt++;
116 		ch = nextwctype(s->lastch, s->cclass);
117 		if (ch == -1) {
118 			s->state = NORMAL;
119 			return (next(s));
120 		}
121 		s->lastch = ch;
122 		return (1);
123 	case SET:
124 		if ((ch = s->set[s->cnt++]) == OOBCH) {
125 			s->state = NORMAL;
126 			return (next(s));
127 		}
128 		s->lastch = ch;
129 		return (1);
130 	default:
131 		return (0);
132 	}
133 	/* NOTREACHED */
134 }
135 
136 static int
137 bracket(STR *s)
138 {
139 	char *p;
140 
141 	switch (s->str[1]) {
142 	case ':':				/* "[:class:]" */
143 		if ((p = strchr(s->str + 2, ']')) == NULL)
144 			return (0);
145 		if (*(p - 1) != ':' || p - s->str < 4)
146 			goto repeat;
147 		*(p - 1) = '\0';
148 		s->str += 2;
149 		genclass(s);
150 		s->str = p + 1;
151 		return (1);
152 	case '=':				/* "[=equiv=]" */
153 		if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL)
154 			return (0);
155 		if (*(p - 1) != '=' || p - s->str < 4)
156 			goto repeat;
157 		s->str += 2;
158 		genequiv(s);
159 		return (1);
160 	default:				/* "[\###*n]" or "[#*n]" */
161 	repeat:
162 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
163 			return (0);
164 		if (p[0] != '*' || strchr(p, ']') == NULL)
165 			return (0);
166 		s->str += 1;
167 		genseq(s);
168 		return (1);
169 	}
170 	/* NOTREACHED */
171 }
172 
173 static void
174 genclass(STR *s)
175 {
176 
177 	if ((s->cclass = wctype(s->str)) == 0)
178 		errx(1, "unknown class %s", s->str);
179 	s->cnt = 0;
180 	s->lastch = -1;		/* incremented before check in next() */
181 	if (strcmp(s->str, "upper") == 0)
182 		s->state = CCLASS_UPPER;
183 	else if (strcmp(s->str, "lower") == 0)
184 		s->state = CCLASS_LOWER;
185 	else
186 		s->state = CCLASS;
187 }
188 
189 static void
190 genequiv(STR *s)
191 {
192 	int i, p, pri;
193 	char src[2], dst[3];
194 	size_t clen;
195 	wchar_t wc;
196 
197 	if (*s->str == '\\') {
198 		s->equiv[0] = backslash(s, NULL);
199 		if (*s->str != '=')
200 			errx(1, "misplaced equivalence equals sign");
201 		s->str += 2;
202 	} else {
203 		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
204 		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
205 			errc(1, EILSEQ, NULL);
206 		s->equiv[0] = wc;
207 		if (s->str[clen] != '=')
208 			errx(1, "misplaced equivalence equals sign");
209 		s->str += clen + 2;
210 	}
211 
212 	/*
213 	 * Calculate the set of all characters in the same equivalence class
214 	 * as the specified character (they will have the same primary
215 	 * collation weights).
216 	 * XXX Knows too much about how strxfrm() is implemented. Assumes
217 	 * it fills the string with primary collation weight bytes. Only one-
218 	 * to-one mappings are supported.
219 	 * XXX Equivalence classes not supported in multibyte locales.
220 	 */
221 	src[0] = (char)s->equiv[0];
222 	src[1] = '\0';
223 	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
224 		pri = (unsigned char)*dst;
225 		for (p = 1, i = 1; i < NCHARS_SB; i++) {
226 			*src = i;
227 			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
228 			    pri == (unsigned char)*dst)
229 				s->equiv[p++] = i;
230 		}
231 		s->equiv[p] = OOBCH;
232 	}
233 
234 	s->cnt = 0;
235 	s->state = SET;
236 	s->set = s->equiv;
237 }
238 
239 static int
240 genrange(STR *s, int was_octal)
241 {
242 	int stopval, octal;
243 	char *savestart;
244 	int n, cnt, *p;
245 	size_t clen;
246 	wchar_t wc;
247 
248 	octal = 0;
249 	savestart = s->str;
250 	if (*++s->str == '\\')
251 		stopval = backslash(s, &octal);
252 	else {
253 		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
254 		if (clen == (size_t)-1 || clen == (size_t)-2)
255 			errc(1, EILSEQ, NULL);
256 		stopval = wc;
257 		s->str += clen;
258 	}
259 	/*
260 	 * XXX Characters are not ordered according to collating sequence in
261 	 * multibyte locales.
262 	 */
263 	if (octal || was_octal || MB_CUR_MAX > 1) {
264 		if (stopval < s->lastch) {
265 			s->str = savestart;
266 			return (0);
267 		}
268 		s->cnt = stopval - s->lastch + 1;
269 		s->state = RANGE;
270 		--s->lastch;
271 		return (1);
272 	}
273 	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
274 		s->str = savestart;
275 		return (0);
276 	}
277 	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
278 		err(1, "genrange() malloc");
279 	for (cnt = 0; cnt < NCHARS_SB; cnt++)
280 		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
281 		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
282 			*p++ = cnt;
283 	*p = OOBCH;
284 	n = p - s->set;
285 
286 	s->cnt = 0;
287 	s->state = SET;
288 	if (n > 1)
289 		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
290 	return (1);
291 }
292 
293 static void
294 genseq(STR *s)
295 {
296 	char *ep;
297 	wchar_t wc;
298 	size_t clen;
299 
300 	if (s->which == STRING1)
301 		errx(1, "sequences only valid in string2");
302 
303 	if (*s->str == '\\')
304 		s->lastch = backslash(s, NULL);
305 	else {
306 		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
307 		if (clen == (size_t)-1 || clen == (size_t)-2)
308 			errc(1, EILSEQ, NULL);
309 		s->lastch = wc;
310 		s->str += clen;
311 	}
312 	if (*s->str != '*')
313 		errx(1, "misplaced sequence asterisk");
314 
315 	switch (*++s->str) {
316 	case '\\':
317 		s->cnt = backslash(s, NULL);
318 		break;
319 	case ']':
320 		s->cnt = 0;
321 		++s->str;
322 		break;
323 	default:
324 		if (isdigit((u_char)*s->str)) {
325 			s->cnt = strtol(s->str, &ep, 0);
326 			if (*ep == ']') {
327 				s->str = ep + 1;
328 				break;
329 			}
330 		}
331 		errx(1, "illegal sequence count");
332 		/* NOTREACHED */
333 	}
334 
335 	s->state = s->cnt ? SEQUENCE : INFINITE;
336 }
337 
338 /*
339  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
340  * an escape code or a literal character.
341  */
342 static int
343 backslash(STR *s, int *is_octal)
344 {
345 	int ch, cnt, val;
346 
347 	if (is_octal != NULL)
348 		*is_octal = 0;
349 	for (cnt = val = 0;;) {
350 		ch = (u_char)*++s->str;
351 		if (!isdigit(ch) || ch > '7')
352 			break;
353 		val = val * 8 + ch - '0';
354 		if (++cnt == 3) {
355 			++s->str;
356 			break;
357 		}
358 	}
359 	if (cnt) {
360 		if (is_octal != NULL)
361 			*is_octal = 1;
362 		return (val);
363 	}
364 	if (ch != '\0')
365 		++s->str;
366 	switch (ch) {
367 		case 'a':			/* escape characters */
368 			return ('\7');
369 		case 'b':
370 			return ('\b');
371 		case 'f':
372 			return ('\f');
373 		case 'n':
374 			return ('\n');
375 		case 'r':
376 			return ('\r');
377 		case 't':
378 			return ('\t');
379 		case 'v':
380 			return ('\13');
381 		case '\0':			/*  \" -> \ */
382 			s->state = EOS;
383 			return ('\\');
384 		default:			/* \x" -> x */
385 			return (ch);
386 	}
387 }
388