xref: /freebsd/usr.bin/tr/str.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*-
2  * Copyright (c) 1991, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 4. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 
32 __FBSDID("$FreeBSD$");
33 
34 #ifndef lint
35 static const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
36 #endif
37 
38 #include <sys/types.h>
39 
40 #include <ctype.h>
41 #include <err.h>
42 #include <errno.h>
43 #include <stddef.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <wchar.h>
48 #include <wctype.h>
49 
50 #include "extern.h"
51 
52 static int      backslash(STR *, int *);
53 static int	bracket(STR *);
54 static void	genclass(STR *);
55 static void	genequiv(STR *);
56 static int      genrange(STR *, int);
57 static void	genseq(STR *);
58 
59 wint_t
60 next(s)
61 	STR *s;
62 {
63 	int is_octal;
64 	wint_t ch;
65 	wchar_t wch;
66 	size_t clen;
67 
68 	switch (s->state) {
69 	case EOS:
70 		return (0);
71 	case INFINITE:
72 		return (1);
73 	case NORMAL:
74 		switch (*s->str) {
75 		case '\0':
76 			s->state = EOS;
77 			return (0);
78 		case '\\':
79 			s->lastch = backslash(s, &is_octal);
80 			break;
81 		case '[':
82 			if (bracket(s))
83 				return (next(s));
84 			/* FALLTHROUGH */
85 		default:
86 			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
87 			if (clen == (size_t)-1 || clen == (size_t)-2 ||
88 			    clen == 0)
89 				errc(1, EILSEQ, NULL);
90 			is_octal = 0;
91 			s->lastch = wch;
92 			s->str += clen;
93 			break;
94 		}
95 
96 		/* We can start a range at any time. */
97 		if (s->str[0] == '-' && genrange(s, is_octal))
98 			return (next(s));
99 		return (1);
100 	case RANGE:
101 		if (s->cnt-- == 0) {
102 			s->state = NORMAL;
103 			return (next(s));
104 		}
105 		++s->lastch;
106 		return (1);
107 	case SEQUENCE:
108 		if (s->cnt-- == 0) {
109 			s->state = NORMAL;
110 			return (next(s));
111 		}
112 		return (1);
113 	case CCLASS:
114 	case CCLASS_UPPER:
115 	case CCLASS_LOWER:
116 		s->cnt++;
117 		ch = nextwctype(s->lastch, s->cclass);
118 		if (ch == -1) {
119 			s->state = NORMAL;
120 			return (next(s));
121 		}
122 		s->lastch = ch;
123 		return (1);
124 	case SET:
125 		if ((ch = s->set[s->cnt++]) == OOBCH) {
126 			s->state = NORMAL;
127 			return (next(s));
128 		}
129 		s->lastch = ch;
130 		return (1);
131 	default:
132 		return (0);
133 	}
134 	/* NOTREACHED */
135 }
136 
137 static int
138 bracket(s)
139 	STR *s;
140 {
141 	char *p;
142 
143 	switch (s->str[1]) {
144 	case ':':				/* "[:class:]" */
145 		if ((p = strchr(s->str + 2, ']')) == NULL)
146 			return (0);
147 		if (*(p - 1) != ':' || p - s->str < 4)
148 			goto repeat;
149 		*(p - 1) = '\0';
150 		s->str += 2;
151 		genclass(s);
152 		s->str = p + 1;
153 		return (1);
154 	case '=':				/* "[=equiv=]" */
155 		if (s->str[2] == '\0' || (p = strchr(s->str + 3, ']')) == NULL)
156 			return (0);
157 		if (*(p - 1) != '=' || p - s->str < 4)
158 			goto repeat;
159 		s->str += 2;
160 		genequiv(s);
161 		return (1);
162 	default:				/* "[\###*n]" or "[#*n]" */
163 	repeat:
164 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
165 			return (0);
166 		if (p[0] != '*' || index(p, ']') == NULL)
167 			return (0);
168 		s->str += 1;
169 		genseq(s);
170 		return (1);
171 	}
172 	/* NOTREACHED */
173 }
174 
175 static void
176 genclass(s)
177 	STR *s;
178 {
179 
180 	if ((s->cclass = wctype(s->str)) == 0)
181 		errx(1, "unknown class %s", s->str);
182 	s->cnt = 0;
183 	s->lastch = -1;		/* incremented before check in next() */
184 	if (strcmp(s->str, "upper") == 0)
185 		s->state = CCLASS_UPPER;
186 	else if (strcmp(s->str, "lower") == 0)
187 		s->state = CCLASS_LOWER;
188 	else
189 		s->state = CCLASS;
190 }
191 
192 static void
193 genequiv(s)
194 	STR *s;
195 {
196 	int i, p, pri;
197 	char src[2], dst[3];
198 	size_t clen;
199 	wchar_t wc;
200 
201 	if (*s->str == '\\') {
202 		s->equiv[0] = backslash(s, NULL);
203 		if (*s->str != '=')
204 			errx(1, "misplaced equivalence equals sign");
205 		s->str += 2;
206 	} else {
207 		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
208 		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
209 			errc(1, EILSEQ, NULL);
210 		s->equiv[0] = wc;
211 		if (s->str[clen] != '=')
212 			errx(1, "misplaced equivalence equals sign");
213 		s->str += clen + 2;
214 	}
215 
216 	/*
217 	 * Calculate the set of all characters in the same equivalence class
218 	 * as the specified character (they will have the same primary
219 	 * collation weights).
220 	 * XXX Knows too much about how strxfrm() is implemented. Assumes
221 	 * it fills the string with primary collation weight bytes. Only one-
222 	 * to-one mappings are supported.
223 	 * XXX Equivalence classes not supported in multibyte locales.
224 	 */
225 	src[0] = (char)s->equiv[0];
226 	src[1] = '\0';
227 	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
228 		pri = (unsigned char)*dst;
229 		for (p = 1, i = 1; i < NCHARS_SB; i++) {
230 			*src = i;
231 			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
232 			    pri == (unsigned char)*dst)
233 				s->equiv[p++] = i;
234 		}
235 		s->equiv[p] = OOBCH;
236 	}
237 
238 	s->cnt = 0;
239 	s->state = SET;
240 	s->set = s->equiv;
241 }
242 
243 static int
244 genrange(STR *s, int was_octal)
245 {
246 	int stopval, octal;
247 	char *savestart;
248 	int n, cnt, *p;
249 	size_t clen;
250 	wchar_t wc;
251 
252 	octal = 0;
253 	savestart = s->str;
254 	if (*++s->str == '\\')
255 		stopval = backslash(s, &octal);
256 	else {
257 		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
258 		if (clen == (size_t)-1 || clen == (size_t)-2)
259 			errc(1, EILSEQ, NULL);
260 		stopval = wc;
261 		s->str += clen;
262 	}
263 	/*
264 	 * XXX Characters are not ordered according to collating sequence in
265 	 * multibyte locales.
266 	 */
267 	if (octal || was_octal || MB_CUR_MAX > 1) {
268 		if (stopval < s->lastch) {
269 			s->str = savestart;
270 			return (0);
271 		}
272 		s->cnt = stopval - s->lastch + 1;
273 		s->state = RANGE;
274 		--s->lastch;
275 		return (1);
276 	}
277 	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
278 		s->str = savestart;
279 		return (0);
280 	}
281 	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
282 		err(1, "genrange() malloc");
283 	for (cnt = 0; cnt < NCHARS_SB; cnt++)
284 		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
285 		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
286 			*p++ = cnt;
287 	*p = OOBCH;
288 	n = p - s->set;
289 
290 	s->cnt = 0;
291 	s->state = SET;
292 	if (n > 1)
293 		mergesort(s->set, n, sizeof(*(s->set)), charcoll);
294 	return (1);
295 }
296 
297 static void
298 genseq(s)
299 	STR *s;
300 {
301 	char *ep;
302 	wchar_t wc;
303 	size_t clen;
304 
305 	if (s->which == STRING1)
306 		errx(1, "sequences only valid in string2");
307 
308 	if (*s->str == '\\')
309 		s->lastch = backslash(s, NULL);
310 	else {
311 		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
312 		if (clen == (size_t)-1 || clen == (size_t)-2)
313 			errc(1, EILSEQ, NULL);
314 		s->lastch = wc;
315 		s->str += clen;
316 	}
317 	if (*s->str != '*')
318 		errx(1, "misplaced sequence asterisk");
319 
320 	switch (*++s->str) {
321 	case '\\':
322 		s->cnt = backslash(s, NULL);
323 		break;
324 	case ']':
325 		s->cnt = 0;
326 		++s->str;
327 		break;
328 	default:
329 		if (isdigit((u_char)*s->str)) {
330 			s->cnt = strtol(s->str, &ep, 0);
331 			if (*ep == ']') {
332 				s->str = ep + 1;
333 				break;
334 			}
335 		}
336 		errx(1, "illegal sequence count");
337 		/* NOTREACHED */
338 	}
339 
340 	s->state = s->cnt ? SEQUENCE : INFINITE;
341 }
342 
343 /*
344  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
345  * an escape code or a literal character.
346  */
347 static int
348 backslash(STR *s, int *is_octal)
349 {
350 	int ch, cnt, val;
351 
352 	if (is_octal != NULL)
353 		*is_octal = 0;
354 	for (cnt = val = 0;;) {
355 		ch = (u_char)*++s->str;
356 		if (!isdigit(ch) || ch > '7')
357 			break;
358 		val = val * 8 + ch - '0';
359 		if (++cnt == 3) {
360 			++s->str;
361 			break;
362 		}
363 	}
364 	if (cnt) {
365 		if (is_octal != NULL)
366 			*is_octal = 1;
367 		return (val);
368 	}
369 	if (ch != '\0')
370 		++s->str;
371 	switch (ch) {
372 		case 'a':			/* escape characters */
373 			return ('\7');
374 		case 'b':
375 			return ('\b');
376 		case 'f':
377 			return ('\f');
378 		case 'n':
379 			return ('\n');
380 		case 'r':
381 			return ('\r');
382 		case 't':
383 			return ('\t');
384 		case 'v':
385 			return ('\13');
386 		case '\0':			/*  \" -> \ */
387 			s->state = EOS;
388 			return ('\\');
389 		default:			/* \x" -> x */
390 			return (ch);
391 	}
392 }
393