xref: /freebsd/usr.bin/tr/cset.c (revision 22cf89c938886d14f5796fc49f9f020c23ea8eaf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2004 Tim J. Robbins.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 /*
29  * "Set of characters" ADT implemented as a splay tree of extents, with
30  * a lookup table cache to simplify looking up the first bunch of
31  * characters (which are presumably more common than others).
32  */
33 
34 #include <sys/cdefs.h>
35 #include <assert.h>
36 #include <stdbool.h>
37 #include <stdlib.h>
38 #include <wchar.h>
39 #include <wctype.h>
40 #include "cset.h"
41 
42 static struct csnode *	cset_delete(struct csnode *, wchar_t);
43 static __inline int	cset_rangecmp(struct csnode *, wchar_t);
44 static struct csnode *	cset_splay(struct csnode *, wchar_t);
45 
46 /*
47  * cset_alloc --
48  *	Allocate a set of characters.
49  */
50 struct cset *
51 cset_alloc(void)
52 {
53 	struct cset *cs;
54 
55 	if ((cs = malloc(sizeof(*cs))) == NULL)
56 		return (NULL);
57 	cs->cs_root = NULL;
58 	cs->cs_classes = NULL;
59 	cs->cs_havecache = false;
60 	cs->cs_invert = false;
61 	return (cs);
62 }
63 
64 /*
65  * cset_add --
66  *	Add a character to the set.
67  */
68 bool
69 cset_add(struct cset *cs, wchar_t ch)
70 {
71 	struct csnode *csn, *ncsn;
72 	wchar_t oval;
73 
74 	cs->cs_havecache = false;
75 
76 	/*
77 	 * Inserting into empty tree; new item becomes the root.
78 	 */
79 	if (cs->cs_root == NULL) {
80 		csn = malloc(sizeof(*cs->cs_root));
81 		if (csn == NULL)
82 			return (false);
83 		csn->csn_left = csn->csn_right = NULL;
84 		csn->csn_min = csn->csn_max = ch;
85 		cs->cs_root = csn;
86 		return (true);
87 	}
88 
89 	/*
90 	 * Splay to check whether the item already exists, and otherwise,
91 	 * where we should put it.
92 	 */
93 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
94 
95 	/*
96 	 * Avoid adding duplicate nodes.
97 	 */
98 	if (cset_rangecmp(csn, ch) == 0)
99 		return (true);
100 
101 	/*
102 	 * Allocate a new node and make it the new root.
103 	 */
104 	ncsn = malloc(sizeof(*ncsn));
105 	if (ncsn == NULL)
106 		return (false);
107 	ncsn->csn_min = ncsn->csn_max = ch;
108 	if (cset_rangecmp(csn, ch) < 0) {
109 		ncsn->csn_left = csn->csn_left;
110 		ncsn->csn_right = csn;
111 		csn->csn_left = NULL;
112 	} else {
113 		ncsn->csn_right = csn->csn_right;
114 		ncsn->csn_left = csn;
115 		csn->csn_right = NULL;
116 	}
117 	cs->cs_root = ncsn;
118 
119 	/*
120 	 * Coalesce with left and right neighbours if possible.
121 	 */
122 	if (ncsn->csn_left != NULL) {
123 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
124 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
125 			oval = ncsn->csn_left->csn_min;
126 			ncsn->csn_left = cset_delete(ncsn->csn_left,
127 			    ncsn->csn_left->csn_min);
128 			ncsn->csn_min = oval;
129 		}
130 	}
131 	if (ncsn->csn_right != NULL) {
132 		ncsn->csn_right = cset_splay(ncsn->csn_right,
133 		    ncsn->csn_max + 1);
134 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
135 			oval = ncsn->csn_right->csn_max;
136 			ncsn->csn_right = cset_delete(ncsn->csn_right,
137 			    ncsn->csn_right->csn_min);
138 			ncsn->csn_max = oval;
139 		}
140 	}
141 
142 	return (true);
143 }
144 
145 /*
146  * cset_in_hard --
147  *	Determine whether a character is in the set without using
148  *	the cache.
149  */
150 bool
151 cset_in_hard(struct cset *cs, wchar_t ch)
152 {
153 	struct csclass *csc;
154 
155 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
156 		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
157 			return (cs->cs_invert ^ true);
158 	if (cs->cs_root != NULL) {
159 		cs->cs_root = cset_splay(cs->cs_root, ch);
160 		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
161 	}
162 	return (cs->cs_invert ^ false);
163 }
164 
165 /*
166  * cset_cache --
167  *	Update the cache.
168  */
169 void
170 cset_cache(struct cset *cs)
171 {
172 	wchar_t i;
173 
174 	for (i = 0; i < CS_CACHE_SIZE; i++)
175 		cs->cs_cache[i] = cset_in_hard(cs, i);
176 
177 	cs->cs_havecache = true;
178 }
179 
180 /*
181  * cset_invert --
182  *	Invert the character set.
183  */
184 void
185 cset_invert(struct cset *cs)
186 {
187 
188 	cs->cs_invert ^= true;
189 	cs->cs_havecache = false;
190 }
191 
192 /*
193  * cset_addclass --
194  *	Add a wctype()-style character class to the set, optionally
195  *	inverting it.
196  */
197 bool
198 cset_addclass(struct cset *cs, wctype_t type, bool invert)
199 {
200 	struct csclass *csc;
201 
202 	csc = malloc(sizeof(*csc));
203 	if (csc == NULL)
204 		return (false);
205 	csc->csc_type = type;
206 	csc->csc_invert = invert;
207 	csc->csc_next = cs->cs_classes;
208 	cs->cs_classes = csc;
209 	cs->cs_havecache = false;
210 	return (true);
211 }
212 
213 static __inline int
214 cset_rangecmp(struct csnode *t, wchar_t ch)
215 {
216 
217 	if (ch < t->csn_min)
218 		return (-1);
219 	if (ch > t->csn_max)
220 		return (1);
221 	return (0);
222 }
223 
224 static struct csnode *
225 cset_splay(struct csnode *t, wchar_t ch)
226 {
227 	struct csnode N, *l, *r, *y;
228 
229 	/*
230 	 * Based on public domain code from Sleator.
231 	 */
232 
233 	assert(t != NULL);
234 
235 	N.csn_left = N.csn_right = NULL;
236 	l = r = &N;
237 	for (;;) {
238 		if (cset_rangecmp(t, ch) < 0) {
239 			if (t->csn_left != NULL &&
240 			    cset_rangecmp(t->csn_left, ch) < 0) {
241 				y = t->csn_left;
242 				t->csn_left = y->csn_right;
243 				y->csn_right = t;
244 				t = y;
245 			}
246 			if (t->csn_left == NULL)
247 				break;
248 			r->csn_left = t;
249 			r = t;
250 			t = t->csn_left;
251 		} else if (cset_rangecmp(t, ch) > 0) {
252 			if (t->csn_right != NULL &&
253 			    cset_rangecmp(t->csn_right, ch) > 0) {
254 				y = t->csn_right;
255 				t->csn_right = y->csn_left;
256 				y->csn_left = t;
257 				t = y;
258 			}
259 			if (t->csn_right == NULL)
260 				break;
261 			l->csn_right = t;
262 			l = t;
263 			t = t->csn_right;
264 		} else
265 			break;
266 	}
267 	l->csn_right = t->csn_left;
268 	r->csn_left = t->csn_right;
269 	t->csn_left = N.csn_right;
270 	t->csn_right = N.csn_left;
271 	return (t);
272 }
273 
274 static struct csnode *
275 cset_delete(struct csnode *t, wchar_t ch)
276 {
277 	struct csnode *x;
278 
279 	assert(t != NULL);
280 	t = cset_splay(t, ch);
281 	assert(cset_rangecmp(t, ch) == 0);
282 	if (t->csn_left == NULL)
283 		x = t->csn_right;
284 	else {
285 		x = cset_splay(t->csn_left, ch);
286 		x->csn_right = t->csn_right;
287 	}
288 	free(t);
289 	return x;
290 }
291