1 /*- 2 * Copyright (c) 2004 Tim J. Robbins. 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 /* 27 * "Set of characters" ADT implemented as a splay tree of extents, with 28 * a lookup table cache to simplify looking up the first bunch of 29 * characters (which are presumably more common than others). 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include <assert.h> 36 #include <stdbool.h> 37 #include <stdlib.h> 38 #include <wchar.h> 39 #include <wctype.h> 40 #include "cset.h" 41 42 static struct csnode * cset_delete(struct csnode *, wchar_t); 43 static __inline int cset_rangecmp(struct csnode *, wchar_t); 44 static struct csnode * cset_splay(struct csnode *, wchar_t); 45 46 /* 47 * cset_alloc -- 48 * Allocate a set of characters. 49 */ 50 struct cset * 51 cset_alloc(void) 52 { 53 struct cset *cs; 54 55 if ((cs = malloc(sizeof(*cs))) == NULL) 56 return (NULL); 57 cs->cs_root = NULL; 58 cs->cs_classes = NULL; 59 cs->cs_havecache = false; 60 cs->cs_invert = false; 61 return (cs); 62 } 63 64 /* 65 * cset_add -- 66 * Add a character to the set. 67 */ 68 bool 69 cset_add(struct cset *cs, wchar_t ch) 70 { 71 struct csnode *csn, *ncsn; 72 wchar_t oval; 73 74 cs->cs_havecache = false; 75 76 /* 77 * Inserting into empty tree; new item becomes the root. 78 */ 79 if (cs->cs_root == NULL) { 80 csn = malloc(sizeof(*cs->cs_root)); 81 if (csn == NULL) 82 return (false); 83 csn->csn_left = csn->csn_right = NULL; 84 csn->csn_min = csn->csn_max = ch; 85 cs->cs_root = csn; 86 return (true); 87 } 88 89 /* 90 * Splay to check whether the item already exists, and otherwise, 91 * where we should put it. 92 */ 93 csn = cs->cs_root = cset_splay(cs->cs_root, ch); 94 95 /* 96 * Avoid adding duplicate nodes. 97 */ 98 if (cset_rangecmp(csn, ch) == 0) 99 return (true); 100 101 /* 102 * Allocate a new node and make it the new root. 103 */ 104 ncsn = malloc(sizeof(*ncsn)); 105 if (ncsn == NULL) 106 return (false); 107 ncsn->csn_min = ncsn->csn_max = ch; 108 if (cset_rangecmp(csn, ch) < 0) { 109 ncsn->csn_left = csn->csn_left; 110 ncsn->csn_right = csn; 111 csn->csn_left = NULL; 112 } else { 113 ncsn->csn_right = csn->csn_right; 114 ncsn->csn_left = csn; 115 csn->csn_right = NULL; 116 } 117 cs->cs_root = ncsn; 118 119 /* 120 * Coalesce with left and right neighbours if possible. 121 */ 122 if (ncsn->csn_left != NULL) { 123 ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1); 124 if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) { 125 oval = ncsn->csn_left->csn_min; 126 ncsn->csn_left = cset_delete(ncsn->csn_left, 127 ncsn->csn_left->csn_min); 128 ncsn->csn_min = oval; 129 } 130 } 131 if (ncsn->csn_right != NULL) { 132 ncsn->csn_right = cset_splay(ncsn->csn_right, 133 ncsn->csn_max + 1); 134 if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) { 135 oval = ncsn->csn_right->csn_max; 136 ncsn->csn_right = cset_delete(ncsn->csn_right, 137 ncsn->csn_right->csn_min); 138 ncsn->csn_max = oval; 139 } 140 } 141 142 return (true); 143 } 144 145 /* 146 * cset_in_hard -- 147 * Determine whether a character is in the set without using 148 * the cache. 149 */ 150 bool 151 cset_in_hard(struct cset *cs, wchar_t ch) 152 { 153 struct csclass *csc; 154 155 for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) 156 if (csc->csc_invert ^ iswctype(ch, csc->csc_type) != 0) 157 return (cs->cs_invert ^ true); 158 if (cs->cs_root != NULL) { 159 cs->cs_root = cset_splay(cs->cs_root, ch); 160 return (cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch) == 0); 161 } 162 return (cs->cs_invert ^ false); 163 } 164 165 /* 166 * cset_cache -- 167 * Update the cache. 168 */ 169 void 170 cset_cache(struct cset *cs) 171 { 172 wchar_t i; 173 174 for (i = 0; i < CS_CACHE_SIZE; i++) 175 cs->cs_cache[i] = cset_in_hard(cs, i); 176 177 cs->cs_havecache = true; 178 } 179 180 /* 181 * cset_invert -- 182 * Invert the character set. 183 */ 184 void 185 cset_invert(struct cset *cs) 186 { 187 188 cs->cs_invert ^= true; 189 cs->cs_havecache = false; 190 } 191 192 /* 193 * cset_addclass -- 194 * Add a wctype()-style character class to the set, optionally 195 * inverting it. 196 */ 197 bool 198 cset_addclass(struct cset *cs, wctype_t type, bool invert) 199 { 200 struct csclass *csc; 201 202 csc = malloc(sizeof(*csc)); 203 if (csc == NULL) 204 return (false); 205 csc->csc_type = type; 206 csc->csc_invert = invert; 207 csc->csc_next = cs->cs_classes; 208 cs->cs_classes = csc; 209 cs->cs_havecache = false; 210 return (true); 211 } 212 213 static __inline int 214 cset_rangecmp(struct csnode *t, wchar_t ch) 215 { 216 217 if (ch < t->csn_min) 218 return (-1); 219 if (ch > t->csn_max) 220 return (1); 221 return (0); 222 } 223 224 static struct csnode * 225 cset_splay(struct csnode *t, wchar_t ch) 226 { 227 struct csnode N, *l, *r, *y; 228 229 /* 230 * Based on public domain code from Sleator. 231 */ 232 233 assert(t != NULL); 234 235 N.csn_left = N.csn_right = NULL; 236 l = r = &N; 237 for (;;) { 238 if (cset_rangecmp(t, ch) < 0) { 239 if (t->csn_left != NULL && 240 cset_rangecmp(t->csn_left, ch) < 0) { 241 y = t->csn_left; 242 t->csn_left = y->csn_right; 243 y->csn_right = t; 244 t = y; 245 } 246 if (t->csn_left == NULL) 247 break; 248 r->csn_left = t; 249 r = t; 250 t = t->csn_left; 251 } else if (cset_rangecmp(t, ch) > 0) { 252 if (t->csn_right != NULL && 253 cset_rangecmp(t->csn_right, ch) > 0) { 254 y = t->csn_right; 255 t->csn_right = y->csn_left; 256 y->csn_left = t; 257 t = y; 258 } 259 if (t->csn_right == NULL) 260 break; 261 l->csn_right = t; 262 l = t; 263 t = t->csn_right; 264 } else 265 break; 266 } 267 l->csn_right = t->csn_left; 268 r->csn_left = t->csn_right; 269 t->csn_left = N.csn_right; 270 t->csn_right = N.csn_left; 271 return (t); 272 } 273 274 static struct csnode * 275 cset_delete(struct csnode *t, wchar_t ch) 276 { 277 struct csnode *x; 278 279 assert(t != NULL); 280 t = cset_splay(t, ch); 281 assert(cset_rangecmp(t, ch) == 0); 282 if (t->csn_left == NULL) 283 x = t->csn_right; 284 else { 285 x = cset_splay(t->csn_left, ch); 286 x->csn_right = t->csn_right; 287 } 288 free(t); 289 return x; 290 } 291