1 /* 2 * Copyright (c) 2004 Tim J. Robbins. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * "Set of characters" ADT implemented as a splay tree of extents, with 28 * a lookup table cache to simplify looking up the first bunch of 29 * characters (which are presumably more common than others). 30 */ 31 32 #include <assert.h> 33 #include <stdbool.h> 34 #include <stdlib.h> 35 #include <wchar.h> 36 #include <wctype.h> 37 #include "cset.h" 38 39 static struct csnode *cset_delete(struct csnode *, wchar_t); 40 static int cset_rangecmp(struct csnode *, wchar_t); 41 static struct csnode *cset_splay(struct csnode *, wchar_t); 42 43 /* 44 * cset_alloc -- 45 * Allocate a set of characters. 46 */ 47 struct cset * 48 cset_alloc(void) 49 { 50 struct cset *cs; 51 52 if ((cs = malloc(sizeof (*cs))) == NULL) 53 return (NULL); 54 cs->cs_root = NULL; 55 cs->cs_classes = NULL; 56 cs->cs_havecache = false; 57 cs->cs_invert = false; 58 return (cs); 59 } 60 61 /* 62 * cset_add -- 63 * Add a character to the set. 64 */ 65 bool 66 cset_add(struct cset *cs, wchar_t ch) 67 { 68 struct csnode *csn, *ncsn; 69 wchar_t oval; 70 71 cs->cs_havecache = false; 72 73 /* 74 * Inserting into empty tree; new item becomes the root. 75 */ 76 if (cs->cs_root == NULL) { 77 csn = malloc(sizeof (*cs->cs_root)); 78 if (csn == NULL) 79 return (false); 80 csn->csn_left = csn->csn_right = NULL; 81 csn->csn_min = csn->csn_max = ch; 82 cs->cs_root = csn; 83 return (true); 84 } 85 86 /* 87 * Splay to check whether the item already exists, and otherwise, 88 * where we should put it. 89 */ 90 csn = cs->cs_root = cset_splay(cs->cs_root, ch); 91 92 /* 93 * Avoid adding duplicate nodes. 94 */ 95 if (cset_rangecmp(csn, ch) == 0) 96 return (true); 97 98 /* 99 * Allocate a new node and make it the new root. 100 */ 101 ncsn = malloc(sizeof (*ncsn)); 102 if (ncsn == NULL) 103 return (false); 104 ncsn->csn_min = ncsn->csn_max = ch; 105 if (cset_rangecmp(csn, ch) < 0) { 106 ncsn->csn_left = csn->csn_left; 107 ncsn->csn_right = csn; 108 csn->csn_left = NULL; 109 } else { 110 ncsn->csn_right = csn->csn_right; 111 ncsn->csn_left = csn; 112 csn->csn_right = NULL; 113 } 114 cs->cs_root = ncsn; 115 116 /* 117 * Coalesce with left and right neighbours if possible. 118 */ 119 if (ncsn->csn_left != NULL) { 120 ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1); 121 if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) { 122 oval = ncsn->csn_left->csn_min; 123 ncsn->csn_left = cset_delete(ncsn->csn_left, 124 ncsn->csn_left->csn_min); 125 ncsn->csn_min = oval; 126 } 127 } 128 if (ncsn->csn_right != NULL) { 129 ncsn->csn_right = cset_splay(ncsn->csn_right, 130 ncsn->csn_max + 1); 131 if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) { 132 oval = ncsn->csn_right->csn_max; 133 ncsn->csn_right = cset_delete(ncsn->csn_right, 134 ncsn->csn_right->csn_min); 135 ncsn->csn_max = oval; 136 } 137 } 138 139 return (true); 140 } 141 142 /* 143 * cset_in_hard -- 144 * Determine whether a character is in the set without using 145 * the cache. 146 */ 147 bool 148 cset_in_hard(struct cset *cs, wchar_t ch) 149 { 150 struct csclass *csc; 151 152 for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) 153 if ((csc->csc_invert ^ iswctype(ch, csc->csc_type)) != 0) 154 return (cs->cs_invert ^ true); 155 if (cs->cs_root != NULL) { 156 cs->cs_root = cset_splay(cs->cs_root, ch); 157 return ((cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch)) == 0); 158 } 159 return (cs->cs_invert ^ false); 160 } 161 162 /* 163 * cset_cache -- 164 * Update the cache. 165 */ 166 void 167 cset_cache(struct cset *cs) 168 { 169 wchar_t i; 170 171 for (i = 0; i < CS_CACHE_SIZE; i++) 172 cs->cs_cache[i] = cset_in_hard(cs, i); 173 174 cs->cs_havecache = true; 175 } 176 177 /* 178 * cset_invert -- 179 * Invert the character set. 180 */ 181 void 182 cset_invert(struct cset *cs) 183 { 184 185 cs->cs_invert ^= true; 186 cs->cs_havecache = false; 187 } 188 189 /* 190 * cset_addclass -- 191 * Add a wctype()-style character class to the set, optionally 192 * inverting it. 193 */ 194 bool 195 cset_addclass(struct cset *cs, wctype_t type, bool invert) 196 { 197 struct csclass *csc; 198 199 csc = malloc(sizeof (*csc)); 200 if (csc == NULL) 201 return (false); 202 csc->csc_type = type; 203 csc->csc_invert = invert; 204 csc->csc_next = cs->cs_classes; 205 cs->cs_classes = csc; 206 cs->cs_havecache = false; 207 return (true); 208 } 209 210 static int 211 cset_rangecmp(struct csnode *t, wchar_t ch) 212 { 213 214 if (ch < t->csn_min) 215 return (-1); 216 if (ch > t->csn_max) 217 return (1); 218 return (0); 219 } 220 221 static struct csnode * 222 cset_splay(struct csnode *t, wchar_t ch) 223 { 224 struct csnode N, *l, *r, *y; 225 226 /* 227 * Based on public domain code from Sleator. 228 */ 229 230 assert(t != NULL); 231 232 N.csn_left = N.csn_right = NULL; 233 l = r = &N; 234 for (;;) { 235 if (cset_rangecmp(t, ch) < 0) { 236 if (t->csn_left != NULL && 237 cset_rangecmp(t->csn_left, ch) < 0) { 238 y = t->csn_left; 239 t->csn_left = y->csn_right; 240 y->csn_right = t; 241 t = y; 242 } 243 if (t->csn_left == NULL) 244 break; 245 r->csn_left = t; 246 r = t; 247 t = t->csn_left; 248 } else if (cset_rangecmp(t, ch) > 0) { 249 if (t->csn_right != NULL && 250 cset_rangecmp(t->csn_right, ch) > 0) { 251 y = t->csn_right; 252 t->csn_right = y->csn_left; 253 y->csn_left = t; 254 t = y; 255 } 256 if (t->csn_right == NULL) 257 break; 258 l->csn_right = t; 259 l = t; 260 t = t->csn_right; 261 } else 262 break; 263 } 264 l->csn_right = t->csn_left; 265 r->csn_left = t->csn_right; 266 t->csn_left = N.csn_right; 267 t->csn_right = N.csn_left; 268 return (t); 269 } 270 271 static struct csnode * 272 cset_delete(struct csnode *t, wchar_t ch) 273 { 274 struct csnode *x; 275 276 assert(t != NULL); 277 t = cset_splay(t, ch); 278 assert(cset_rangecmp(t, ch) == 0); 279 if (t->csn_left == NULL) 280 x = t->csn_right; 281 else { 282 x = cset_splay(t->csn_left, ch); 283 x->csn_right = t->csn_right; 284 } 285 free(t); 286 return (x); 287 } 288