xref: /freebsd/usr.bin/tr/cset.c (revision b5a3a89c50671a1ad29e7c43fe15e7b16feac239)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2004 Tim J. Robbins.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 /*
29  * "Set of characters" ADT implemented as a splay tree of extents, with
30  * a lookup table cache to simplify looking up the first bunch of
31  * characters (which are presumably more common than others).
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <assert.h>
38 #include <stdbool.h>
39 #include <stdlib.h>
40 #include <wchar.h>
41 #include <wctype.h>
42 #include "cset.h"
43 
44 static struct csnode *	cset_delete(struct csnode *, wchar_t);
45 static __inline int	cset_rangecmp(struct csnode *, wchar_t);
46 static struct csnode *	cset_splay(struct csnode *, wchar_t);
47 
48 /*
49  * cset_alloc --
50  *	Allocate a set of characters.
51  */
52 struct cset *
53 cset_alloc(void)
54 {
55 	struct cset *cs;
56 
57 	if ((cs = malloc(sizeof(*cs))) == NULL)
58 		return (NULL);
59 	cs->cs_root = NULL;
60 	cs->cs_classes = NULL;
61 	cs->cs_havecache = false;
62 	cs->cs_invert = false;
63 	return (cs);
64 }
65 
66 /*
67  * cset_add --
68  *	Add a character to the set.
69  */
70 bool
71 cset_add(struct cset *cs, wchar_t ch)
72 {
73 	struct csnode *csn, *ncsn;
74 	wchar_t oval;
75 
76 	cs->cs_havecache = false;
77 
78 	/*
79 	 * Inserting into empty tree; new item becomes the root.
80 	 */
81 	if (cs->cs_root == NULL) {
82 		csn = malloc(sizeof(*cs->cs_root));
83 		if (csn == NULL)
84 			return (false);
85 		csn->csn_left = csn->csn_right = NULL;
86 		csn->csn_min = csn->csn_max = ch;
87 		cs->cs_root = csn;
88 		return (true);
89 	}
90 
91 	/*
92 	 * Splay to check whether the item already exists, and otherwise,
93 	 * where we should put it.
94 	 */
95 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
96 
97 	/*
98 	 * Avoid adding duplicate nodes.
99 	 */
100 	if (cset_rangecmp(csn, ch) == 0)
101 		return (true);
102 
103 	/*
104 	 * Allocate a new node and make it the new root.
105 	 */
106 	ncsn = malloc(sizeof(*ncsn));
107 	if (ncsn == NULL)
108 		return (false);
109 	ncsn->csn_min = ncsn->csn_max = ch;
110 	if (cset_rangecmp(csn, ch) < 0) {
111 		ncsn->csn_left = csn->csn_left;
112 		ncsn->csn_right = csn;
113 		csn->csn_left = NULL;
114 	} else {
115 		ncsn->csn_right = csn->csn_right;
116 		ncsn->csn_left = csn;
117 		csn->csn_right = NULL;
118 	}
119 	cs->cs_root = ncsn;
120 
121 	/*
122 	 * Coalesce with left and right neighbours if possible.
123 	 */
124 	if (ncsn->csn_left != NULL) {
125 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
126 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
127 			oval = ncsn->csn_left->csn_min;
128 			ncsn->csn_left = cset_delete(ncsn->csn_left,
129 			    ncsn->csn_left->csn_min);
130 			ncsn->csn_min = oval;
131 		}
132 	}
133 	if (ncsn->csn_right != NULL) {
134 		ncsn->csn_right = cset_splay(ncsn->csn_right,
135 		    ncsn->csn_max + 1);
136 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
137 			oval = ncsn->csn_right->csn_max;
138 			ncsn->csn_right = cset_delete(ncsn->csn_right,
139 			    ncsn->csn_right->csn_min);
140 			ncsn->csn_max = oval;
141 		}
142 	}
143 
144 	return (true);
145 }
146 
147 /*
148  * cset_in_hard --
149  *	Determine whether a character is in the set without using
150  *	the cache.
151  */
152 bool
153 cset_in_hard(struct cset *cs, wchar_t ch)
154 {
155 	struct csclass *csc;
156 
157 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
158 		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
159 			return (cs->cs_invert ^ true);
160 	if (cs->cs_root != NULL) {
161 		cs->cs_root = cset_splay(cs->cs_root, ch);
162 		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
163 	}
164 	return (cs->cs_invert ^ false);
165 }
166 
167 /*
168  * cset_cache --
169  *	Update the cache.
170  */
171 void
172 cset_cache(struct cset *cs)
173 {
174 	wchar_t i;
175 
176 	for (i = 0; i < CS_CACHE_SIZE; i++)
177 		cs->cs_cache[i] = cset_in_hard(cs, i);
178 
179 	cs->cs_havecache = true;
180 }
181 
182 /*
183  * cset_invert --
184  *	Invert the character set.
185  */
186 void
187 cset_invert(struct cset *cs)
188 {
189 
190 	cs->cs_invert ^= true;
191 	cs->cs_havecache = false;
192 }
193 
194 /*
195  * cset_addclass --
196  *	Add a wctype()-style character class to the set, optionally
197  *	inverting it.
198  */
199 bool
200 cset_addclass(struct cset *cs, wctype_t type, bool invert)
201 {
202 	struct csclass *csc;
203 
204 	csc = malloc(sizeof(*csc));
205 	if (csc == NULL)
206 		return (false);
207 	csc->csc_type = type;
208 	csc->csc_invert = invert;
209 	csc->csc_next = cs->cs_classes;
210 	cs->cs_classes = csc;
211 	cs->cs_havecache = false;
212 	return (true);
213 }
214 
215 static __inline int
216 cset_rangecmp(struct csnode *t, wchar_t ch)
217 {
218 
219 	if (ch < t->csn_min)
220 		return (-1);
221 	if (ch > t->csn_max)
222 		return (1);
223 	return (0);
224 }
225 
226 static struct csnode *
227 cset_splay(struct csnode *t, wchar_t ch)
228 {
229 	struct csnode N, *l, *r, *y;
230 
231 	/*
232 	 * Based on public domain code from Sleator.
233 	 */
234 
235 	assert(t != NULL);
236 
237 	N.csn_left = N.csn_right = NULL;
238 	l = r = &N;
239 	for (;;) {
240 		if (cset_rangecmp(t, ch) < 0) {
241 			if (t->csn_left != NULL &&
242 			    cset_rangecmp(t->csn_left, ch) < 0) {
243 				y = t->csn_left;
244 				t->csn_left = y->csn_right;
245 				y->csn_right = t;
246 				t = y;
247 			}
248 			if (t->csn_left == NULL)
249 				break;
250 			r->csn_left = t;
251 			r = t;
252 			t = t->csn_left;
253 		} else if (cset_rangecmp(t, ch) > 0) {
254 			if (t->csn_right != NULL &&
255 			    cset_rangecmp(t->csn_right, ch) > 0) {
256 				y = t->csn_right;
257 				t->csn_right = y->csn_left;
258 				y->csn_left = t;
259 				t = y;
260 			}
261 			if (t->csn_right == NULL)
262 				break;
263 			l->csn_right = t;
264 			l = t;
265 			t = t->csn_right;
266 		} else
267 			break;
268 	}
269 	l->csn_right = t->csn_left;
270 	r->csn_left = t->csn_right;
271 	t->csn_left = N.csn_right;
272 	t->csn_right = N.csn_left;
273 	return (t);
274 }
275 
276 static struct csnode *
277 cset_delete(struct csnode *t, wchar_t ch)
278 {
279 	struct csnode *x;
280 
281 	assert(t != NULL);
282 	t = cset_splay(t, ch);
283 	assert(cset_rangecmp(t, ch) == 0);
284 	if (t->csn_left == NULL)
285 		x = t->csn_right;
286 	else {
287 		x = cset_splay(t->csn_left, ch);
288 		x->csn_right = t->csn_right;
289 	}
290 	free(t);
291 	return x;
292 }
293