1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2004 Tim J. Robbins. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 /* 29 * "Set of characters" ADT implemented as a splay tree of extents, with 30 * a lookup table cache to simplify looking up the first bunch of 31 * characters (which are presumably more common than others). 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include <assert.h> 38 #include <stdbool.h> 39 #include <stdlib.h> 40 #include <wchar.h> 41 #include <wctype.h> 42 #include "cset.h" 43 44 static struct csnode * cset_delete(struct csnode *, wchar_t); 45 static __inline int cset_rangecmp(struct csnode *, wchar_t); 46 static struct csnode * cset_splay(struct csnode *, wchar_t); 47 48 /* 49 * cset_alloc -- 50 * Allocate a set of characters. 51 */ 52 struct cset * 53 cset_alloc(void) 54 { 55 struct cset *cs; 56 57 if ((cs = malloc(sizeof(*cs))) == NULL) 58 return (NULL); 59 cs->cs_root = NULL; 60 cs->cs_classes = NULL; 61 cs->cs_havecache = false; 62 cs->cs_invert = false; 63 return (cs); 64 } 65 66 /* 67 * cset_add -- 68 * Add a character to the set. 69 */ 70 bool 71 cset_add(struct cset *cs, wchar_t ch) 72 { 73 struct csnode *csn, *ncsn; 74 wchar_t oval; 75 76 cs->cs_havecache = false; 77 78 /* 79 * Inserting into empty tree; new item becomes the root. 80 */ 81 if (cs->cs_root == NULL) { 82 csn = malloc(sizeof(*cs->cs_root)); 83 if (csn == NULL) 84 return (false); 85 csn->csn_left = csn->csn_right = NULL; 86 csn->csn_min = csn->csn_max = ch; 87 cs->cs_root = csn; 88 return (true); 89 } 90 91 /* 92 * Splay to check whether the item already exists, and otherwise, 93 * where we should put it. 94 */ 95 csn = cs->cs_root = cset_splay(cs->cs_root, ch); 96 97 /* 98 * Avoid adding duplicate nodes. 99 */ 100 if (cset_rangecmp(csn, ch) == 0) 101 return (true); 102 103 /* 104 * Allocate a new node and make it the new root. 105 */ 106 ncsn = malloc(sizeof(*ncsn)); 107 if (ncsn == NULL) 108 return (false); 109 ncsn->csn_min = ncsn->csn_max = ch; 110 if (cset_rangecmp(csn, ch) < 0) { 111 ncsn->csn_left = csn->csn_left; 112 ncsn->csn_right = csn; 113 csn->csn_left = NULL; 114 } else { 115 ncsn->csn_right = csn->csn_right; 116 ncsn->csn_left = csn; 117 csn->csn_right = NULL; 118 } 119 cs->cs_root = ncsn; 120 121 /* 122 * Coalesce with left and right neighbours if possible. 123 */ 124 if (ncsn->csn_left != NULL) { 125 ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1); 126 if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) { 127 oval = ncsn->csn_left->csn_min; 128 ncsn->csn_left = cset_delete(ncsn->csn_left, 129 ncsn->csn_left->csn_min); 130 ncsn->csn_min = oval; 131 } 132 } 133 if (ncsn->csn_right != NULL) { 134 ncsn->csn_right = cset_splay(ncsn->csn_right, 135 ncsn->csn_max + 1); 136 if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) { 137 oval = ncsn->csn_right->csn_max; 138 ncsn->csn_right = cset_delete(ncsn->csn_right, 139 ncsn->csn_right->csn_min); 140 ncsn->csn_max = oval; 141 } 142 } 143 144 return (true); 145 } 146 147 /* 148 * cset_in_hard -- 149 * Determine whether a character is in the set without using 150 * the cache. 151 */ 152 bool 153 cset_in_hard(struct cset *cs, wchar_t ch) 154 { 155 struct csclass *csc; 156 157 for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) 158 if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0)) 159 return (cs->cs_invert ^ true); 160 if (cs->cs_root != NULL) { 161 cs->cs_root = cset_splay(cs->cs_root, ch); 162 return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0)); 163 } 164 return (cs->cs_invert ^ false); 165 } 166 167 /* 168 * cset_cache -- 169 * Update the cache. 170 */ 171 void 172 cset_cache(struct cset *cs) 173 { 174 wchar_t i; 175 176 for (i = 0; i < CS_CACHE_SIZE; i++) 177 cs->cs_cache[i] = cset_in_hard(cs, i); 178 179 cs->cs_havecache = true; 180 } 181 182 /* 183 * cset_invert -- 184 * Invert the character set. 185 */ 186 void 187 cset_invert(struct cset *cs) 188 { 189 190 cs->cs_invert ^= true; 191 cs->cs_havecache = false; 192 } 193 194 /* 195 * cset_addclass -- 196 * Add a wctype()-style character class to the set, optionally 197 * inverting it. 198 */ 199 bool 200 cset_addclass(struct cset *cs, wctype_t type, bool invert) 201 { 202 struct csclass *csc; 203 204 csc = malloc(sizeof(*csc)); 205 if (csc == NULL) 206 return (false); 207 csc->csc_type = type; 208 csc->csc_invert = invert; 209 csc->csc_next = cs->cs_classes; 210 cs->cs_classes = csc; 211 cs->cs_havecache = false; 212 return (true); 213 } 214 215 static __inline int 216 cset_rangecmp(struct csnode *t, wchar_t ch) 217 { 218 219 if (ch < t->csn_min) 220 return (-1); 221 if (ch > t->csn_max) 222 return (1); 223 return (0); 224 } 225 226 static struct csnode * 227 cset_splay(struct csnode *t, wchar_t ch) 228 { 229 struct csnode N, *l, *r, *y; 230 231 /* 232 * Based on public domain code from Sleator. 233 */ 234 235 assert(t != NULL); 236 237 N.csn_left = N.csn_right = NULL; 238 l = r = &N; 239 for (;;) { 240 if (cset_rangecmp(t, ch) < 0) { 241 if (t->csn_left != NULL && 242 cset_rangecmp(t->csn_left, ch) < 0) { 243 y = t->csn_left; 244 t->csn_left = y->csn_right; 245 y->csn_right = t; 246 t = y; 247 } 248 if (t->csn_left == NULL) 249 break; 250 r->csn_left = t; 251 r = t; 252 t = t->csn_left; 253 } else if (cset_rangecmp(t, ch) > 0) { 254 if (t->csn_right != NULL && 255 cset_rangecmp(t->csn_right, ch) > 0) { 256 y = t->csn_right; 257 t->csn_right = y->csn_left; 258 y->csn_left = t; 259 t = y; 260 } 261 if (t->csn_right == NULL) 262 break; 263 l->csn_right = t; 264 l = t; 265 t = t->csn_right; 266 } else 267 break; 268 } 269 l->csn_right = t->csn_left; 270 r->csn_left = t->csn_right; 271 t->csn_left = N.csn_right; 272 t->csn_right = N.csn_left; 273 return (t); 274 } 275 276 static struct csnode * 277 cset_delete(struct csnode *t, wchar_t ch) 278 { 279 struct csnode *x; 280 281 assert(t != NULL); 282 t = cset_splay(t, ch); 283 assert(cset_rangecmp(t, ch) == 0); 284 if (t->csn_left == NULL) 285 x = t->csn_right; 286 else { 287 x = cset_splay(t->csn_left, ch); 288 x->csn_right = t->csn_right; 289 } 290 free(t); 291 return x; 292 } 293