xref: /freebsd/usr.bin/tr/cset.c (revision fbf96e52bbd90bbbb9c9e2ae6fbc101fa6ebd080)
1 /*-
2  * Copyright (c) 2004 Tim J. Robbins.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 /*
27  * "Set of characters" ADT implemented as a splay tree of extents, with
28  * a lookup table cache to simplify looking up the first bunch of
29  * characters (which are presumably more common than others).
30  */
31 
32 #include <sys/cdefs.h>
33 __FBSDID("$FreeBSD$");
34 
35 #include <assert.h>
36 #include <stdbool.h>
37 #include <stdlib.h>
38 #include <wchar.h>
39 #include <wctype.h>
40 #include "cset.h"
41 
42 static struct csnode *	cset_delete(struct csnode *, wchar_t);
43 static __inline int	cset_rangecmp(struct csnode *, wchar_t);
44 static struct csnode *	cset_splay(struct csnode *, wchar_t);
45 
46 /*
47  * cset_alloc --
48  *	Allocate a set of characters.
49  */
50 struct cset *
51 cset_alloc(void)
52 {
53 	struct cset *cs;
54 
55 	if ((cs = malloc(sizeof(*cs))) == NULL)
56 		return (NULL);
57 	cs->cs_root = NULL;
58 	cs->cs_classes = NULL;
59 	cs->cs_havecache = false;
60 	cs->cs_invert = false;
61 	return (cs);
62 }
63 
64 /*
65  * cset_add --
66  *	Add a character to the set.
67  */
68 bool
69 cset_add(struct cset *cs, wchar_t ch)
70 {
71 	struct csnode *csn, *ncsn;
72 	wchar_t oval;
73 
74 	cs->cs_havecache = false;
75 
76 	/*
77 	 * Inserting into empty tree; new item becomes the root.
78 	 */
79 	if (cs->cs_root == NULL) {
80 		csn = malloc(sizeof(*cs->cs_root));
81 		if (csn == NULL)
82 			return (false);
83 		csn->csn_left = csn->csn_right = NULL;
84 		csn->csn_min = csn->csn_max = ch;
85 		cs->cs_root = csn;
86 		return (true);
87 	}
88 
89 	/*
90 	 * Splay to check whether the item already exists, and otherwise,
91 	 * where we should put it.
92 	 */
93 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
94 
95 	/*
96 	 * Easy cases where we can avoid allocating a new node:
97 	 *	(a) node already exists.
98 	 *	(b) we can lower the extent's "min" to accomodate this
99 	 *	    character without having to coalesce.
100 	 *	(c) we can raise the extent's "max" without having
101 	 *	    to coalesce.
102 	 */
103 	if (cset_rangecmp(csn, ch) == 0)
104 		return (true);
105 	if (ch + 1 == csn->csn_min && (csn->csn_left == NULL ||
106 	    ch > csn->csn_left->csn_max + 1)) {
107 		csn->csn_min--;
108 		return (true);
109 	}
110 	if (ch == csn->csn_max + 1 && (csn->csn_right == NULL ||
111 	    ch + 1 < csn->csn_right->csn_min)) {
112 		csn->csn_max++;
113 		return (true);
114 	}
115 
116 	/*
117 	 * Allocate a new node and link it into the tree as a direct
118 	 * child of the root.
119 	 */
120 	ncsn = malloc(sizeof(*ncsn));
121 	if (ncsn == NULL)
122 		return (false);
123 	ncsn->csn_min = ncsn->csn_max = ch;
124 	if (cset_rangecmp(csn, ch) < 0) {
125 		ncsn->csn_left = csn->csn_left;
126 		ncsn->csn_right = csn;
127 		csn->csn_left = NULL;
128 	} else {
129 		ncsn->csn_right = csn->csn_right;
130 		ncsn->csn_left = csn;
131 		csn->csn_right = NULL;
132 	}
133 	cs->cs_root = ncsn;
134 
135 	/*
136 	 * Splay to bring the newly inserted node to the root, then
137 	 * coalesce with left and right neighbours if possible.
138 	 */
139 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
140 	if (csn->csn_left != NULL &&
141 	    csn->csn_left->csn_max + 1 == csn->csn_min) {
142 		oval = csn->csn_left->csn_min;
143 		cs->cs_root = cset_delete(cs->cs_root,
144 		    csn->csn_left->csn_min);
145 		ncsn->csn_min = oval;
146 	}
147 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
148 	if (csn->csn_right != NULL &&
149 	    csn->csn_right->csn_min - 1 == csn->csn_max) {
150 		oval = csn->csn_right->csn_max;
151 		cs->cs_root = cset_delete(cs->cs_root,
152 		    csn->csn_right->csn_min);
153 		ncsn->csn_max = oval;
154 	}
155 
156 	return (true);
157 }
158 
159 /*
160  * cset_in_hard --
161  *	Determine whether a character is in the set without using
162  *	the cache.
163  */
164 bool
165 cset_in_hard(struct cset *cs, wchar_t ch)
166 {
167 	struct csclass *csc;
168 
169 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
170 		if (csc->csc_invert ^ iswctype(ch, csc->csc_type) != 0)
171 			return (cs->cs_invert ^ true);
172 	if (cs->cs_root != NULL) {
173 		cs->cs_root = cset_splay(cs->cs_root, ch);
174 		return (cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch) == 0);
175 	}
176 	return (cs->cs_invert ^ false);
177 }
178 
179 /*
180  * cset_cache --
181  *	Update the cache.
182  */
183 void
184 cset_cache(struct cset *cs)
185 {
186 	wchar_t i;
187 
188 	for (i = 0; i < CS_CACHE_SIZE; i++)
189 		cs->cs_cache[i] = cset_in_hard(cs, i);
190 
191 	cs->cs_havecache = true;
192 }
193 
194 /*
195  * cset_invert --
196  *	Invert the character set.
197  */
198 void
199 cset_invert(struct cset *cs)
200 {
201 
202 	cs->cs_invert ^= true;
203 	cs->cs_havecache = false;
204 }
205 
206 /*
207  * cset_addclass --
208  *	Add a wctype()-style character class to the set, optionally
209  *	inverting it.
210  */
211 bool
212 cset_addclass(struct cset *cs, wctype_t type, bool invert)
213 {
214 	struct csclass *csc;
215 
216 	csc = malloc(sizeof(*csc));
217 	if (csc == NULL)
218 		return (false);
219 	csc->csc_type = type;
220 	csc->csc_invert = invert;
221 	csc->csc_next = cs->cs_classes;
222 	cs->cs_classes = csc;
223 	cs->cs_havecache = false;
224 	return (true);
225 }
226 
227 static __inline int
228 cset_rangecmp(struct csnode *t, wchar_t ch)
229 {
230 
231 	if (ch < t->csn_min)
232 		return (-1);
233 	if (ch > t->csn_max)
234 		return (1);
235 	return (0);
236 }
237 
238 static struct csnode *
239 cset_splay(struct csnode *t, wchar_t ch)
240 {
241 	struct csnode N, *l, *r, *y;
242 
243 	/*
244 	 * Based on public domain code from Sleator.
245 	 */
246 
247 	assert(t != NULL);
248 
249 	N.csn_left = N.csn_right = NULL;
250 	l = r = &N;
251 	for (;;) {
252 		if (cset_rangecmp(t, ch) < 0) {
253 			if (t->csn_left != NULL &&
254 			    cset_rangecmp(t->csn_left, ch) < 0) {
255 				y = t->csn_left;
256 				t->csn_left = y->csn_right;
257 				y->csn_right = t;
258 				t = y;
259 			}
260 			if (t->csn_left == NULL)
261 				break;
262 			r->csn_left = t;
263 			r = t;
264 			t = t->csn_left;
265 		} else if (cset_rangecmp(t, ch) > 0) {
266 			if (t->csn_right != NULL &&
267 			    cset_rangecmp(t->csn_right, ch) > 0) {
268 				y = t->csn_right;
269 				t->csn_right = y->csn_left;
270 				y->csn_left = t;
271 				t = y;
272 			}
273 			if (t->csn_right == NULL)
274 				break;
275 			l->csn_right = t;
276 			l = t;
277 			t = t->csn_right;
278 		} else
279 			break;
280 	}
281 	l->csn_right = t->csn_left;
282 	r->csn_left = t->csn_right;
283 	t->csn_left = N.csn_right;
284 	t->csn_right = N.csn_left;
285 	return (t);
286 }
287 
288 static struct csnode *
289 cset_delete(struct csnode *t, wchar_t ch)
290 {
291 	struct csnode *x;
292 
293 	assert(t != NULL);
294 	t = cset_splay(t, ch);
295 	assert(cset_rangecmp(t, ch) == 0);
296 	if (t->csn_left == NULL)
297 		x = t->csn_right;
298 	else {
299 		x = cset_splay(t->csn_left, ch);
300 		x->csn_right = t->csn_right;
301 	}
302 	free(t);
303 	return x;
304 }
305