xref: /titanic_51/usr/src/cmd/tr/cset.c (revision eb8f03cde11ee9fc86ee179ef61fc5861756b1c6)
1  /*
2   * Copyright (c) 2004 Tim J. Robbins.
3   * All rights reserved.
4   *
5   * Redistribution and use in source and binary forms, with or without
6   * modification, are permitted provided that the following conditions
7   * are met:
8   * 1. Redistributions of source code must retain the above copyright
9   *    notice, this list of conditions and the following disclaimer.
10   * 2. Redistributions in binary form must reproduce the above copyright
11   *    notice, this list of conditions and the following disclaimer in the
12   *    documentation and/or other materials provided with the distribution.
13   *
14   * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17   * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24   * SUCH DAMAGE.
25   */
26  /*
27   * "Set of characters" ADT implemented as a splay tree of extents, with
28   * a lookup table cache to simplify looking up the first bunch of
29   * characters (which are presumably more common than others).
30   */
31  
32  #include <assert.h>
33  #include <stdbool.h>
34  #include <stdlib.h>
35  #include <wchar.h>
36  #include <wctype.h>
37  #include "cset.h"
38  
39  static struct csnode	*cset_delete(struct csnode *, wchar_t);
40  static int		cset_rangecmp(struct csnode *, wchar_t);
41  static struct csnode	*cset_splay(struct csnode *, wchar_t);
42  
43  /*
44   * cset_alloc --
45   *	Allocate a set of characters.
46   */
47  struct cset *
48  cset_alloc(void)
49  {
50  	struct cset *cs;
51  
52  	if ((cs = malloc(sizeof (*cs))) == NULL)
53  		return (NULL);
54  	cs->cs_root = NULL;
55  	cs->cs_classes = NULL;
56  	cs->cs_havecache = false;
57  	cs->cs_invert = false;
58  	return (cs);
59  }
60  
61  /*
62   * cset_add --
63   *	Add a character to the set.
64   */
65  bool
66  cset_add(struct cset *cs, wchar_t ch)
67  {
68  	struct csnode *csn, *ncsn;
69  	wchar_t oval;
70  
71  	cs->cs_havecache = false;
72  
73  	/*
74  	 * Inserting into empty tree; new item becomes the root.
75  	 */
76  	if (cs->cs_root == NULL) {
77  		csn = malloc(sizeof (*cs->cs_root));
78  		if (csn == NULL)
79  			return (false);
80  		csn->csn_left = csn->csn_right = NULL;
81  		csn->csn_min = csn->csn_max = ch;
82  		cs->cs_root = csn;
83  		return (true);
84  	}
85  
86  	/*
87  	 * Splay to check whether the item already exists, and otherwise,
88  	 * where we should put it.
89  	 */
90  	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
91  
92  	/*
93  	 * Avoid adding duplicate nodes.
94  	 */
95  	if (cset_rangecmp(csn, ch) == 0)
96  		return (true);
97  
98  	/*
99  	 * Allocate a new node and make it the new root.
100  	 */
101  	ncsn = malloc(sizeof (*ncsn));
102  	if (ncsn == NULL)
103  		return (false);
104  	ncsn->csn_min = ncsn->csn_max = ch;
105  	if (cset_rangecmp(csn, ch) < 0) {
106  		ncsn->csn_left = csn->csn_left;
107  		ncsn->csn_right = csn;
108  		csn->csn_left = NULL;
109  	} else {
110  		ncsn->csn_right = csn->csn_right;
111  		ncsn->csn_left = csn;
112  		csn->csn_right = NULL;
113  	}
114  	cs->cs_root = ncsn;
115  
116  	/*
117  	 * Coalesce with left and right neighbours if possible.
118  	 */
119  	if (ncsn->csn_left != NULL) {
120  		ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1);
121  		if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) {
122  			oval = ncsn->csn_left->csn_min;
123  			ncsn->csn_left = cset_delete(ncsn->csn_left,
124  			    ncsn->csn_left->csn_min);
125  			ncsn->csn_min = oval;
126  		}
127  	}
128  	if (ncsn->csn_right != NULL) {
129  		ncsn->csn_right = cset_splay(ncsn->csn_right,
130  		    ncsn->csn_max + 1);
131  		if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) {
132  			oval = ncsn->csn_right->csn_max;
133  			ncsn->csn_right = cset_delete(ncsn->csn_right,
134  			    ncsn->csn_right->csn_min);
135  			ncsn->csn_max = oval;
136  		}
137  	}
138  
139  	return (true);
140  }
141  
142  /*
143   * cset_in_hard --
144   *	Determine whether a character is in the set without using
145   *	the cache.
146   */
147  bool
148  cset_in_hard(struct cset *cs, wchar_t ch)
149  {
150  	struct csclass *csc;
151  
152  	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
153  		if ((csc->csc_invert ^ iswctype(ch, csc->csc_type)) != 0)
154  			return (cs->cs_invert ^ true);
155  	if (cs->cs_root != NULL) {
156  		cs->cs_root = cset_splay(cs->cs_root, ch);
157  		return ((cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch)) == 0);
158  	}
159  	return (cs->cs_invert ^ false);
160  }
161  
162  /*
163   * cset_cache --
164   *	Update the cache.
165   */
166  void
167  cset_cache(struct cset *cs)
168  {
169  	wchar_t i;
170  
171  	for (i = 0; i < CS_CACHE_SIZE; i++)
172  		cs->cs_cache[i] = cset_in_hard(cs, i);
173  
174  	cs->cs_havecache = true;
175  }
176  
177  /*
178   * cset_invert --
179   *	Invert the character set.
180   */
181  void
182  cset_invert(struct cset *cs)
183  {
184  
185  	cs->cs_invert ^= true;
186  	cs->cs_havecache = false;
187  }
188  
189  /*
190   * cset_addclass --
191   *	Add a wctype()-style character class to the set, optionally
192   *	inverting it.
193   */
194  bool
195  cset_addclass(struct cset *cs, wctype_t type, bool invert)
196  {
197  	struct csclass *csc;
198  
199  	csc = malloc(sizeof (*csc));
200  	if (csc == NULL)
201  		return (false);
202  	csc->csc_type = type;
203  	csc->csc_invert = invert;
204  	csc->csc_next = cs->cs_classes;
205  	cs->cs_classes = csc;
206  	cs->cs_havecache = false;
207  	return (true);
208  }
209  
210  static int
211  cset_rangecmp(struct csnode *t, wchar_t ch)
212  {
213  
214  	if (ch < t->csn_min)
215  		return (-1);
216  	if (ch > t->csn_max)
217  		return (1);
218  	return (0);
219  }
220  
221  static struct csnode *
222  cset_splay(struct csnode *t, wchar_t ch)
223  {
224  	struct csnode N, *l, *r, *y;
225  
226  	/*
227  	 * Based on public domain code from Sleator.
228  	 */
229  
230  	assert(t != NULL);
231  
232  	N.csn_left = N.csn_right = NULL;
233  	l = r = &N;
234  	for (;;) {
235  		if (cset_rangecmp(t, ch) < 0) {
236  			if (t->csn_left != NULL &&
237  			    cset_rangecmp(t->csn_left, ch) < 0) {
238  				y = t->csn_left;
239  				t->csn_left = y->csn_right;
240  				y->csn_right = t;
241  				t = y;
242  			}
243  			if (t->csn_left == NULL)
244  				break;
245  			r->csn_left = t;
246  			r = t;
247  			t = t->csn_left;
248  		} else if (cset_rangecmp(t, ch) > 0) {
249  			if (t->csn_right != NULL &&
250  			    cset_rangecmp(t->csn_right, ch) > 0) {
251  				y = t->csn_right;
252  				t->csn_right = y->csn_left;
253  				y->csn_left = t;
254  				t = y;
255  			}
256  			if (t->csn_right == NULL)
257  				break;
258  			l->csn_right = t;
259  			l = t;
260  			t = t->csn_right;
261  		} else
262  			break;
263  	}
264  	l->csn_right = t->csn_left;
265  	r->csn_left = t->csn_right;
266  	t->csn_left = N.csn_right;
267  	t->csn_right = N.csn_left;
268  	return (t);
269  }
270  
271  static struct csnode *
272  cset_delete(struct csnode *t, wchar_t ch)
273  {
274  	struct csnode *x;
275  
276  	assert(t != NULL);
277  	t = cset_splay(t, ch);
278  	assert(cset_rangecmp(t, ch) == 0);
279  	if (t->csn_left == NULL)
280  		x = t->csn_right;
281  	else {
282  		x = cset_splay(t->csn_left, ch);
283  		x->csn_right = t->csn_right;
284  	}
285  	free(t);
286  	return (x);
287  }
288