1 /*- 2 * Copyright (c) 2004 Tim J. Robbins. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * "Set of characters" ADT implemented as a splay tree of extents, with 28 * a lookup table cache to simplify looking up the first bunch of 29 * characters (which are presumably more common than others). 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include <assert.h> 36 #include <stdbool.h> 37 #include <stdlib.h> 38 #include <wchar.h> 39 #include <wctype.h> 40 #include "cset.h" 41 42 static struct csnode * cset_delete(struct csnode *, wchar_t); 43 static __inline int cset_rangecmp(struct csnode *, wchar_t); 44 static struct csnode * cset_splay(struct csnode *, wchar_t); 45 46 /* 47 * cset_alloc -- 48 * Allocate a set of characters. 49 */ 50 struct cset * 51 cset_alloc(void) 52 { 53 struct cset *cs; 54 55 if ((cs = malloc(sizeof(*cs))) == NULL) 56 return (NULL); 57 cs->cs_root = NULL; 58 cs->cs_classes = NULL; 59 cs->cs_havecache = false; 60 cs->cs_invert = false; 61 return (cs); 62 } 63 64 /* 65 * cset_add -- 66 * Add a character to the set. 67 */ 68 bool 69 cset_add(struct cset *cs, wchar_t ch) 70 { 71 struct csnode *csn, *ncsn; 72 wchar_t oval; 73 74 cs->cs_havecache = false; 75 76 /* 77 * Inserting into empty tree; new item becomes the root. 78 */ 79 if (cs->cs_root == NULL) { 80 csn = malloc(sizeof(*cs->cs_root)); 81 if (csn == NULL) 82 return (false); 83 csn->csn_left = csn->csn_right = NULL; 84 csn->csn_min = csn->csn_max = ch; 85 cs->cs_root = csn; 86 return (true); 87 } 88 89 /* 90 * Splay to check whether the item already exists, and otherwise, 91 * where we should put it. 92 */ 93 csn = cs->cs_root = cset_splay(cs->cs_root, ch); 94 95 /* 96 * Easy cases where we can avoid allocating a new node: 97 * (a) node already exists. 98 * (b) we can lower the extent's "min" to accomodate this 99 * character without having to coalesce. 100 * (c) we can raise the extent's "max" without having 101 * to coalesce. 102 */ 103 if (cset_rangecmp(csn, ch) == 0) 104 return (true); 105 if (ch + 1 == csn->csn_min && (csn->csn_left == NULL || 106 ch > csn->csn_left->csn_max + 1)) { 107 csn->csn_min--; 108 return (true); 109 } 110 if (ch == csn->csn_max + 1 && (csn->csn_right == NULL || 111 ch + 1 < csn->csn_right->csn_min)) { 112 csn->csn_max++; 113 return (true); 114 } 115 116 /* 117 * Allocate a new node and link it into the tree as a direct 118 * child of the root. 119 */ 120 ncsn = malloc(sizeof(*ncsn)); 121 if (ncsn == NULL) 122 return (false); 123 ncsn->csn_min = ncsn->csn_max = ch; 124 if (cset_rangecmp(csn, ch) < 0) { 125 ncsn->csn_left = csn->csn_left; 126 ncsn->csn_right = csn; 127 csn->csn_left = NULL; 128 } else { 129 ncsn->csn_right = csn->csn_right; 130 ncsn->csn_left = csn; 131 csn->csn_right = NULL; 132 } 133 cs->cs_root = ncsn; 134 135 /* 136 * Splay to bring the newly inserted node to the root, then 137 * coalesce with left and right neighbours if possible. 138 */ 139 csn = cs->cs_root = cset_splay(cs->cs_root, ch); 140 if (csn->csn_left != NULL && 141 csn->csn_left->csn_max + 1 == csn->csn_min) { 142 oval = csn->csn_left->csn_min; 143 cs->cs_root = cset_delete(cs->cs_root, 144 csn->csn_left->csn_min); 145 ncsn->csn_min = oval; 146 } 147 csn = cs->cs_root = cset_splay(cs->cs_root, ch); 148 if (csn->csn_right != NULL && 149 csn->csn_right->csn_min - 1 == csn->csn_max) { 150 oval = csn->csn_right->csn_max; 151 cs->cs_root = cset_delete(cs->cs_root, 152 csn->csn_right->csn_min); 153 ncsn->csn_max = oval; 154 } 155 156 return (true); 157 } 158 159 /* 160 * cset_in_hard -- 161 * Determine whether a character is in the set without using 162 * the cache. 163 */ 164 bool 165 cset_in_hard(struct cset *cs, wchar_t ch) 166 { 167 struct csclass *csc; 168 169 for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) 170 if (csc->csc_invert ^ iswctype(ch, csc->csc_type) != 0) 171 return (cs->cs_invert ^ true); 172 if (cs->cs_root != NULL) { 173 cs->cs_root = cset_splay(cs->cs_root, ch); 174 return (cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch) == 0); 175 } 176 return (cs->cs_invert ^ false); 177 } 178 179 /* 180 * cset_cache -- 181 * Update the cache. 182 */ 183 void 184 cset_cache(struct cset *cs) 185 { 186 wchar_t i; 187 188 for (i = 0; i < CS_CACHE_SIZE; i++) 189 cs->cs_cache[i] = cset_in_hard(cs, i); 190 191 cs->cs_havecache = true; 192 } 193 194 /* 195 * cset_invert -- 196 * Invert the character set. 197 */ 198 void 199 cset_invert(struct cset *cs) 200 { 201 202 cs->cs_invert ^= true; 203 cs->cs_havecache = false; 204 } 205 206 /* 207 * cset_addclass -- 208 * Add a wctype()-style character class to the set, optionally 209 * inverting it. 210 */ 211 bool 212 cset_addclass(struct cset *cs, wctype_t type, bool invert) 213 { 214 struct csclass *csc; 215 216 csc = malloc(sizeof(*csc)); 217 if (csc == NULL) 218 return (false); 219 csc->csc_type = type; 220 csc->csc_invert = invert; 221 csc->csc_next = cs->cs_classes; 222 cs->cs_classes = csc; 223 cs->cs_havecache = false; 224 return (true); 225 } 226 227 static __inline int 228 cset_rangecmp(struct csnode *t, wchar_t ch) 229 { 230 231 if (ch < t->csn_min) 232 return (-1); 233 if (ch > t->csn_max) 234 return (1); 235 return (0); 236 } 237 238 static struct csnode * 239 cset_splay(struct csnode *t, wchar_t ch) 240 { 241 struct csnode N, *l, *r, *y; 242 243 /* 244 * Based on public domain code from Sleator. 245 */ 246 247 assert(t != NULL); 248 249 N.csn_left = N.csn_right = NULL; 250 l = r = &N; 251 for (;;) { 252 if (cset_rangecmp(t, ch) < 0) { 253 if (t->csn_left != NULL && 254 cset_rangecmp(t->csn_left, ch) < 0) { 255 y = t->csn_left; 256 t->csn_left = y->csn_right; 257 y->csn_right = t; 258 t = y; 259 } 260 if (t->csn_left == NULL) 261 break; 262 r->csn_left = t; 263 r = t; 264 t = t->csn_left; 265 } else if (cset_rangecmp(t, ch) > 0) { 266 if (t->csn_right != NULL && 267 cset_rangecmp(t->csn_right, ch) > 0) { 268 y = t->csn_right; 269 t->csn_right = y->csn_left; 270 y->csn_left = t; 271 t = y; 272 } 273 if (t->csn_right == NULL) 274 break; 275 l->csn_right = t; 276 l = t; 277 t = t->csn_right; 278 } else 279 break; 280 } 281 l->csn_right = t->csn_left; 282 r->csn_left = t->csn_right; 283 t->csn_left = N.csn_right; 284 t->csn_right = N.csn_left; 285 return (t); 286 } 287 288 static struct csnode * 289 cset_delete(struct csnode *t, wchar_t ch) 290 { 291 struct csnode *x; 292 293 assert(t != NULL); 294 t = cset_splay(t, ch); 295 assert(cset_rangecmp(t, ch) == 0); 296 if (t->csn_left == NULL) 297 x = t->csn_right; 298 else { 299 x = cset_splay(t->csn_left, ch); 300 x->csn_right = t->csn_right; 301 } 302 free(t); 303 return x; 304 } 305