xref: /titanic_52/usr/src/cmd/tr/cset.c (revision 163bd69b3c164dda2a59c7f08ca788e7d6ba9bea)
1*163bd69bSGarrett D'Amore /*
2*163bd69bSGarrett D'Amore  * Copyright (c) 2004 Tim J. Robbins.
3*163bd69bSGarrett D'Amore  * All rights reserved.
4*163bd69bSGarrett D'Amore  *
5*163bd69bSGarrett D'Amore  * Redistribution and use in source and binary forms, with or without
6*163bd69bSGarrett D'Amore  * modification, are permitted provided that the following conditions
7*163bd69bSGarrett D'Amore  * are met:
8*163bd69bSGarrett D'Amore  * 1. Redistributions of source code must retain the above copyright
9*163bd69bSGarrett D'Amore  *    notice, this list of conditions and the following disclaimer.
10*163bd69bSGarrett D'Amore  * 2. Redistributions in binary form must reproduce the above copyright
11*163bd69bSGarrett D'Amore  *    notice, this list of conditions and the following disclaimer in the
12*163bd69bSGarrett D'Amore  *    documentation and/or other materials provided with the distribution.
13*163bd69bSGarrett D'Amore  *
14*163bd69bSGarrett D'Amore  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15*163bd69bSGarrett D'Amore  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16*163bd69bSGarrett D'Amore  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17*163bd69bSGarrett D'Amore  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18*163bd69bSGarrett D'Amore  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19*163bd69bSGarrett D'Amore  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20*163bd69bSGarrett D'Amore  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21*163bd69bSGarrett D'Amore  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22*163bd69bSGarrett D'Amore  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23*163bd69bSGarrett D'Amore  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24*163bd69bSGarrett D'Amore  * SUCH DAMAGE.
25*163bd69bSGarrett D'Amore  */
26*163bd69bSGarrett D'Amore /*
27*163bd69bSGarrett D'Amore  * "Set of characters" ADT implemented as a splay tree of extents, with
28*163bd69bSGarrett D'Amore  * a lookup table cache to simplify looking up the first bunch of
29*163bd69bSGarrett D'Amore  * characters (which are presumably more common than others).
30*163bd69bSGarrett D'Amore  */
31*163bd69bSGarrett D'Amore 
32*163bd69bSGarrett D'Amore #include <assert.h>
33*163bd69bSGarrett D'Amore #include <stdbool.h>
34*163bd69bSGarrett D'Amore #include <stdlib.h>
35*163bd69bSGarrett D'Amore #include <wchar.h>
36*163bd69bSGarrett D'Amore #include <wctype.h>
37*163bd69bSGarrett D'Amore #include "cset.h"
38*163bd69bSGarrett D'Amore 
39*163bd69bSGarrett D'Amore static struct csnode	*cset_delete(struct csnode *, wchar_t);
40*163bd69bSGarrett D'Amore static int		cset_rangecmp(struct csnode *, wchar_t);
41*163bd69bSGarrett D'Amore static struct csnode	*cset_splay(struct csnode *, wchar_t);
42*163bd69bSGarrett D'Amore 
43*163bd69bSGarrett D'Amore /*
44*163bd69bSGarrett D'Amore  * cset_alloc --
45*163bd69bSGarrett D'Amore  *	Allocate a set of characters.
46*163bd69bSGarrett D'Amore  */
47*163bd69bSGarrett D'Amore struct cset *
48*163bd69bSGarrett D'Amore cset_alloc(void)
49*163bd69bSGarrett D'Amore {
50*163bd69bSGarrett D'Amore 	struct cset *cs;
51*163bd69bSGarrett D'Amore 
52*163bd69bSGarrett D'Amore 	if ((cs = malloc(sizeof (*cs))) == NULL)
53*163bd69bSGarrett D'Amore 		return (NULL);
54*163bd69bSGarrett D'Amore 	cs->cs_root = NULL;
55*163bd69bSGarrett D'Amore 	cs->cs_classes = NULL;
56*163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
57*163bd69bSGarrett D'Amore 	cs->cs_invert = false;
58*163bd69bSGarrett D'Amore 	return (cs);
59*163bd69bSGarrett D'Amore }
60*163bd69bSGarrett D'Amore 
61*163bd69bSGarrett D'Amore /*
62*163bd69bSGarrett D'Amore  * cset_add --
63*163bd69bSGarrett D'Amore  *	Add a character to the set.
64*163bd69bSGarrett D'Amore  */
65*163bd69bSGarrett D'Amore bool
66*163bd69bSGarrett D'Amore cset_add(struct cset *cs, wchar_t ch)
67*163bd69bSGarrett D'Amore {
68*163bd69bSGarrett D'Amore 	struct csnode *csn, *ncsn;
69*163bd69bSGarrett D'Amore 	wchar_t oval;
70*163bd69bSGarrett D'Amore 
71*163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
72*163bd69bSGarrett D'Amore 
73*163bd69bSGarrett D'Amore 	/*
74*163bd69bSGarrett D'Amore 	 * Inserting into empty tree; new item becomes the root.
75*163bd69bSGarrett D'Amore 	 */
76*163bd69bSGarrett D'Amore 	if (cs->cs_root == NULL) {
77*163bd69bSGarrett D'Amore 		csn = malloc(sizeof (*cs->cs_root));
78*163bd69bSGarrett D'Amore 		if (csn == NULL)
79*163bd69bSGarrett D'Amore 			return (false);
80*163bd69bSGarrett D'Amore 		csn->csn_left = csn->csn_right = NULL;
81*163bd69bSGarrett D'Amore 		csn->csn_min = csn->csn_max = ch;
82*163bd69bSGarrett D'Amore 		cs->cs_root = csn;
83*163bd69bSGarrett D'Amore 		return (true);
84*163bd69bSGarrett D'Amore 	}
85*163bd69bSGarrett D'Amore 
86*163bd69bSGarrett D'Amore 	/*
87*163bd69bSGarrett D'Amore 	 * Splay to check whether the item already exists, and otherwise,
88*163bd69bSGarrett D'Amore 	 * where we should put it.
89*163bd69bSGarrett D'Amore 	 */
90*163bd69bSGarrett D'Amore 	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
91*163bd69bSGarrett D'Amore 
92*163bd69bSGarrett D'Amore 	/*
93*163bd69bSGarrett D'Amore 	 * Avoid adding duplicate nodes.
94*163bd69bSGarrett D'Amore 	 */
95*163bd69bSGarrett D'Amore 	if (cset_rangecmp(csn, ch) == 0)
96*163bd69bSGarrett D'Amore 		return (true);
97*163bd69bSGarrett D'Amore 
98*163bd69bSGarrett D'Amore 	/*
99*163bd69bSGarrett D'Amore 	 * Allocate a new node and make it the new root.
100*163bd69bSGarrett D'Amore 	 */
101*163bd69bSGarrett D'Amore 	ncsn = malloc(sizeof (*ncsn));
102*163bd69bSGarrett D'Amore 	if (ncsn == NULL)
103*163bd69bSGarrett D'Amore 		return (false);
104*163bd69bSGarrett D'Amore 	ncsn->csn_min = ncsn->csn_max = ch;
105*163bd69bSGarrett D'Amore 	if (cset_rangecmp(csn, ch) < 0) {
106*163bd69bSGarrett D'Amore 		ncsn->csn_left = csn->csn_left;
107*163bd69bSGarrett D'Amore 		ncsn->csn_right = csn;
108*163bd69bSGarrett D'Amore 		csn->csn_left = NULL;
109*163bd69bSGarrett D'Amore 	} else {
110*163bd69bSGarrett D'Amore 		ncsn->csn_right = csn->csn_right;
111*163bd69bSGarrett D'Amore 		ncsn->csn_left = csn;
112*163bd69bSGarrett D'Amore 		csn->csn_right = NULL;
113*163bd69bSGarrett D'Amore 	}
114*163bd69bSGarrett D'Amore 	cs->cs_root = ncsn;
115*163bd69bSGarrett D'Amore 
116*163bd69bSGarrett D'Amore 	/*
117*163bd69bSGarrett D'Amore 	 * Coalesce with left and right neighbours if possible.
118*163bd69bSGarrett D'Amore 	 */
119*163bd69bSGarrett D'Amore 	if (ncsn->csn_left != NULL) {
120*163bd69bSGarrett D'Amore 		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
121*163bd69bSGarrett D'Amore 		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
122*163bd69bSGarrett D'Amore 			oval = ncsn->csn_left->csn_min;
123*163bd69bSGarrett D'Amore 			ncsn->csn_left = cset_delete(ncsn->csn_left,
124*163bd69bSGarrett D'Amore 			    ncsn->csn_left->csn_min);
125*163bd69bSGarrett D'Amore 			ncsn->csn_min = oval;
126*163bd69bSGarrett D'Amore 		}
127*163bd69bSGarrett D'Amore 	}
128*163bd69bSGarrett D'Amore 	if (ncsn->csn_right != NULL) {
129*163bd69bSGarrett D'Amore 		ncsn->csn_right = cset_splay(ncsn->csn_right,
130*163bd69bSGarrett D'Amore 		    ncsn->csn_max + 1);
131*163bd69bSGarrett D'Amore 		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
132*163bd69bSGarrett D'Amore 			oval = ncsn->csn_right->csn_max;
133*163bd69bSGarrett D'Amore 			ncsn->csn_right = cset_delete(ncsn->csn_right,
134*163bd69bSGarrett D'Amore 			    ncsn->csn_right->csn_min);
135*163bd69bSGarrett D'Amore 			ncsn->csn_max = oval;
136*163bd69bSGarrett D'Amore 		}
137*163bd69bSGarrett D'Amore 	}
138*163bd69bSGarrett D'Amore 
139*163bd69bSGarrett D'Amore 	return (true);
140*163bd69bSGarrett D'Amore }
141*163bd69bSGarrett D'Amore 
142*163bd69bSGarrett D'Amore /*
143*163bd69bSGarrett D'Amore  * cset_in_hard --
144*163bd69bSGarrett D'Amore  *	Determine whether a character is in the set without using
145*163bd69bSGarrett D'Amore  *	the cache.
146*163bd69bSGarrett D'Amore  */
147*163bd69bSGarrett D'Amore bool
148*163bd69bSGarrett D'Amore cset_in_hard(struct cset *cs, wchar_t ch)
149*163bd69bSGarrett D'Amore {
150*163bd69bSGarrett D'Amore 	struct csclass *csc;
151*163bd69bSGarrett D'Amore 
152*163bd69bSGarrett D'Amore 	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
153*163bd69bSGarrett D'Amore 		if ((csc->csc_invert ^ iswctype(ch, csc->csc_type)) != 0)
154*163bd69bSGarrett D'Amore 			return (cs->cs_invert ^ true);
155*163bd69bSGarrett D'Amore 	if (cs->cs_root != NULL) {
156*163bd69bSGarrett D'Amore 		cs->cs_root = cset_splay(cs->cs_root, ch);
157*163bd69bSGarrett D'Amore 		return ((cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch)) == 0);
158*163bd69bSGarrett D'Amore 	}
159*163bd69bSGarrett D'Amore 	return (cs->cs_invert ^ false);
160*163bd69bSGarrett D'Amore }
161*163bd69bSGarrett D'Amore 
162*163bd69bSGarrett D'Amore /*
163*163bd69bSGarrett D'Amore  * cset_cache --
164*163bd69bSGarrett D'Amore  *	Update the cache.
165*163bd69bSGarrett D'Amore  */
166*163bd69bSGarrett D'Amore void
167*163bd69bSGarrett D'Amore cset_cache(struct cset *cs)
168*163bd69bSGarrett D'Amore {
169*163bd69bSGarrett D'Amore 	wchar_t i;
170*163bd69bSGarrett D'Amore 
171*163bd69bSGarrett D'Amore 	for (i = 0; i < CS_CACHE_SIZE; i++)
172*163bd69bSGarrett D'Amore 		cs->cs_cache[i] = cset_in_hard(cs, i);
173*163bd69bSGarrett D'Amore 
174*163bd69bSGarrett D'Amore 	cs->cs_havecache = true;
175*163bd69bSGarrett D'Amore }
176*163bd69bSGarrett D'Amore 
177*163bd69bSGarrett D'Amore /*
178*163bd69bSGarrett D'Amore  * cset_invert --
179*163bd69bSGarrett D'Amore  *	Invert the character set.
180*163bd69bSGarrett D'Amore  */
181*163bd69bSGarrett D'Amore void
182*163bd69bSGarrett D'Amore cset_invert(struct cset *cs)
183*163bd69bSGarrett D'Amore {
184*163bd69bSGarrett D'Amore 
185*163bd69bSGarrett D'Amore 	cs->cs_invert ^= true;
186*163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
187*163bd69bSGarrett D'Amore }
188*163bd69bSGarrett D'Amore 
189*163bd69bSGarrett D'Amore /*
190*163bd69bSGarrett D'Amore  * cset_addclass --
191*163bd69bSGarrett D'Amore  *	Add a wctype()-style character class to the set, optionally
192*163bd69bSGarrett D'Amore  *	inverting it.
193*163bd69bSGarrett D'Amore  */
194*163bd69bSGarrett D'Amore bool
195*163bd69bSGarrett D'Amore cset_addclass(struct cset *cs, wctype_t type, bool invert)
196*163bd69bSGarrett D'Amore {
197*163bd69bSGarrett D'Amore 	struct csclass *csc;
198*163bd69bSGarrett D'Amore 
199*163bd69bSGarrett D'Amore 	csc = malloc(sizeof (*csc));
200*163bd69bSGarrett D'Amore 	if (csc == NULL)
201*163bd69bSGarrett D'Amore 		return (false);
202*163bd69bSGarrett D'Amore 	csc->csc_type = type;
203*163bd69bSGarrett D'Amore 	csc->csc_invert = invert;
204*163bd69bSGarrett D'Amore 	csc->csc_next = cs->cs_classes;
205*163bd69bSGarrett D'Amore 	cs->cs_classes = csc;
206*163bd69bSGarrett D'Amore 	cs->cs_havecache = false;
207*163bd69bSGarrett D'Amore 	return (true);
208*163bd69bSGarrett D'Amore }
209*163bd69bSGarrett D'Amore 
210*163bd69bSGarrett D'Amore static int
211*163bd69bSGarrett D'Amore cset_rangecmp(struct csnode *t, wchar_t ch)
212*163bd69bSGarrett D'Amore {
213*163bd69bSGarrett D'Amore 
214*163bd69bSGarrett D'Amore 	if (ch < t->csn_min)
215*163bd69bSGarrett D'Amore 		return (-1);
216*163bd69bSGarrett D'Amore 	if (ch > t->csn_max)
217*163bd69bSGarrett D'Amore 		return (1);
218*163bd69bSGarrett D'Amore 	return (0);
219*163bd69bSGarrett D'Amore }
220*163bd69bSGarrett D'Amore 
221*163bd69bSGarrett D'Amore static struct csnode *
222*163bd69bSGarrett D'Amore cset_splay(struct csnode *t, wchar_t ch)
223*163bd69bSGarrett D'Amore {
224*163bd69bSGarrett D'Amore 	struct csnode N, *l, *r, *y;
225*163bd69bSGarrett D'Amore 
226*163bd69bSGarrett D'Amore 	/*
227*163bd69bSGarrett D'Amore 	 * Based on public domain code from Sleator.
228*163bd69bSGarrett D'Amore 	 */
229*163bd69bSGarrett D'Amore 
230*163bd69bSGarrett D'Amore 	assert(t != NULL);
231*163bd69bSGarrett D'Amore 
232*163bd69bSGarrett D'Amore 	N.csn_left = N.csn_right = NULL;
233*163bd69bSGarrett D'Amore 	l = r = &N;
234*163bd69bSGarrett D'Amore 	for (;;) {
235*163bd69bSGarrett D'Amore 		if (cset_rangecmp(t, ch) < 0) {
236*163bd69bSGarrett D'Amore 			if (t->csn_left != NULL &&
237*163bd69bSGarrett D'Amore 			    cset_rangecmp(t->csn_left, ch) < 0) {
238*163bd69bSGarrett D'Amore 				y = t->csn_left;
239*163bd69bSGarrett D'Amore 				t->csn_left = y->csn_right;
240*163bd69bSGarrett D'Amore 				y->csn_right = t;
241*163bd69bSGarrett D'Amore 				t = y;
242*163bd69bSGarrett D'Amore 			}
243*163bd69bSGarrett D'Amore 			if (t->csn_left == NULL)
244*163bd69bSGarrett D'Amore 				break;
245*163bd69bSGarrett D'Amore 			r->csn_left = t;
246*163bd69bSGarrett D'Amore 			r = t;
247*163bd69bSGarrett D'Amore 			t = t->csn_left;
248*163bd69bSGarrett D'Amore 		} else if (cset_rangecmp(t, ch) > 0) {
249*163bd69bSGarrett D'Amore 			if (t->csn_right != NULL &&
250*163bd69bSGarrett D'Amore 			    cset_rangecmp(t->csn_right, ch) > 0) {
251*163bd69bSGarrett D'Amore 				y = t->csn_right;
252*163bd69bSGarrett D'Amore 				t->csn_right = y->csn_left;
253*163bd69bSGarrett D'Amore 				y->csn_left = t;
254*163bd69bSGarrett D'Amore 				t = y;
255*163bd69bSGarrett D'Amore 			}
256*163bd69bSGarrett D'Amore 			if (t->csn_right == NULL)
257*163bd69bSGarrett D'Amore 				break;
258*163bd69bSGarrett D'Amore 			l->csn_right = t;
259*163bd69bSGarrett D'Amore 			l = t;
260*163bd69bSGarrett D'Amore 			t = t->csn_right;
261*163bd69bSGarrett D'Amore 		} else
262*163bd69bSGarrett D'Amore 			break;
263*163bd69bSGarrett D'Amore 	}
264*163bd69bSGarrett D'Amore 	l->csn_right = t->csn_left;
265*163bd69bSGarrett D'Amore 	r->csn_left = t->csn_right;
266*163bd69bSGarrett D'Amore 	t->csn_left = N.csn_right;
267*163bd69bSGarrett D'Amore 	t->csn_right = N.csn_left;
268*163bd69bSGarrett D'Amore 	return (t);
269*163bd69bSGarrett D'Amore }
270*163bd69bSGarrett D'Amore 
271*163bd69bSGarrett D'Amore static struct csnode *
272*163bd69bSGarrett D'Amore cset_delete(struct csnode *t, wchar_t ch)
273*163bd69bSGarrett D'Amore {
274*163bd69bSGarrett D'Amore 	struct csnode *x;
275*163bd69bSGarrett D'Amore 
276*163bd69bSGarrett D'Amore 	assert(t != NULL);
277*163bd69bSGarrett D'Amore 	t = cset_splay(t, ch);
278*163bd69bSGarrett D'Amore 	assert(cset_rangecmp(t, ch) == 0);
279*163bd69bSGarrett D'Amore 	if (t->csn_left == NULL)
280*163bd69bSGarrett D'Amore 		x = t->csn_right;
281*163bd69bSGarrett D'Amore 	else {
282*163bd69bSGarrett D'Amore 		x = cset_splay(t->csn_left, ch);
283*163bd69bSGarrett D'Amore 		x->csn_right = t->csn_right;
284*163bd69bSGarrett D'Amore 	}
285*163bd69bSGarrett D'Amore 	free(t);
286*163bd69bSGarrett D'Amore 	return (x);
287*163bd69bSGarrett D'Amore }
288