1163bd69bSGarrett D'Amore /* 2163bd69bSGarrett D'Amore * Copyright (c) 2004 Tim J. Robbins. 3163bd69bSGarrett D'Amore * All rights reserved. 4163bd69bSGarrett D'Amore * 5163bd69bSGarrett D'Amore * Redistribution and use in source and binary forms, with or without 6163bd69bSGarrett D'Amore * modification, are permitted provided that the following conditions 7163bd69bSGarrett D'Amore * are met: 8163bd69bSGarrett D'Amore * 1. Redistributions of source code must retain the above copyright 9163bd69bSGarrett D'Amore * notice, this list of conditions and the following disclaimer. 10163bd69bSGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright 11163bd69bSGarrett D'Amore * notice, this list of conditions and the following disclaimer in the 12163bd69bSGarrett D'Amore * documentation and/or other materials provided with the distribution. 13163bd69bSGarrett D'Amore * 14163bd69bSGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15163bd69bSGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16163bd69bSGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17163bd69bSGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18163bd69bSGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19163bd69bSGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20163bd69bSGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21163bd69bSGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22163bd69bSGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23163bd69bSGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24163bd69bSGarrett D'Amore * SUCH DAMAGE. 25163bd69bSGarrett D'Amore */ 26163bd69bSGarrett D'Amore /* 27163bd69bSGarrett D'Amore * "Set of characters" ADT implemented as a splay tree of extents, with 28163bd69bSGarrett D'Amore * a lookup table cache to simplify looking up the first bunch of 29163bd69bSGarrett D'Amore * characters (which are presumably more common than others). 30163bd69bSGarrett D'Amore */ 31163bd69bSGarrett D'Amore 32163bd69bSGarrett D'Amore #include <assert.h> 33163bd69bSGarrett D'Amore #include <stdbool.h> 34163bd69bSGarrett D'Amore #include <stdlib.h> 35163bd69bSGarrett D'Amore #include <wchar.h> 36163bd69bSGarrett D'Amore #include <wctype.h> 37163bd69bSGarrett D'Amore #include "cset.h" 38163bd69bSGarrett D'Amore 39163bd69bSGarrett D'Amore static struct csnode *cset_delete(struct csnode *, wchar_t); 40163bd69bSGarrett D'Amore static int cset_rangecmp(struct csnode *, wchar_t); 41163bd69bSGarrett D'Amore static struct csnode *cset_splay(struct csnode *, wchar_t); 42163bd69bSGarrett D'Amore 43163bd69bSGarrett D'Amore /* 44163bd69bSGarrett D'Amore * cset_alloc -- 45163bd69bSGarrett D'Amore * Allocate a set of characters. 46163bd69bSGarrett D'Amore */ 47163bd69bSGarrett D'Amore struct cset * 48163bd69bSGarrett D'Amore cset_alloc(void) 49163bd69bSGarrett D'Amore { 50163bd69bSGarrett D'Amore struct cset *cs; 51163bd69bSGarrett D'Amore 52163bd69bSGarrett D'Amore if ((cs = malloc(sizeof (*cs))) == NULL) 53163bd69bSGarrett D'Amore return (NULL); 54163bd69bSGarrett D'Amore cs->cs_root = NULL; 55163bd69bSGarrett D'Amore cs->cs_classes = NULL; 56163bd69bSGarrett D'Amore cs->cs_havecache = false; 57163bd69bSGarrett D'Amore cs->cs_invert = false; 58163bd69bSGarrett D'Amore return (cs); 59163bd69bSGarrett D'Amore } 60163bd69bSGarrett D'Amore 61163bd69bSGarrett D'Amore /* 62163bd69bSGarrett D'Amore * cset_add -- 63163bd69bSGarrett D'Amore * Add a character to the set. 64163bd69bSGarrett D'Amore */ 65163bd69bSGarrett D'Amore bool 66163bd69bSGarrett D'Amore cset_add(struct cset *cs, wchar_t ch) 67163bd69bSGarrett D'Amore { 68163bd69bSGarrett D'Amore struct csnode *csn, *ncsn; 69163bd69bSGarrett D'Amore wchar_t oval; 70163bd69bSGarrett D'Amore 71163bd69bSGarrett D'Amore cs->cs_havecache = false; 72163bd69bSGarrett D'Amore 73163bd69bSGarrett D'Amore /* 74163bd69bSGarrett D'Amore * Inserting into empty tree; new item becomes the root. 75163bd69bSGarrett D'Amore */ 76163bd69bSGarrett D'Amore if (cs->cs_root == NULL) { 77163bd69bSGarrett D'Amore csn = malloc(sizeof (*cs->cs_root)); 78163bd69bSGarrett D'Amore if (csn == NULL) 79163bd69bSGarrett D'Amore return (false); 80163bd69bSGarrett D'Amore csn->csn_left = csn->csn_right = NULL; 81163bd69bSGarrett D'Amore csn->csn_min = csn->csn_max = ch; 82163bd69bSGarrett D'Amore cs->cs_root = csn; 83163bd69bSGarrett D'Amore return (true); 84163bd69bSGarrett D'Amore } 85163bd69bSGarrett D'Amore 86163bd69bSGarrett D'Amore /* 87163bd69bSGarrett D'Amore * Splay to check whether the item already exists, and otherwise, 88163bd69bSGarrett D'Amore * where we should put it. 89163bd69bSGarrett D'Amore */ 90163bd69bSGarrett D'Amore csn = cs->cs_root = cset_splay(cs->cs_root, ch); 91163bd69bSGarrett D'Amore 92163bd69bSGarrett D'Amore /* 93163bd69bSGarrett D'Amore * Avoid adding duplicate nodes. 94163bd69bSGarrett D'Amore */ 95163bd69bSGarrett D'Amore if (cset_rangecmp(csn, ch) == 0) 96163bd69bSGarrett D'Amore return (true); 97163bd69bSGarrett D'Amore 98163bd69bSGarrett D'Amore /* 99163bd69bSGarrett D'Amore * Allocate a new node and make it the new root. 100163bd69bSGarrett D'Amore */ 101163bd69bSGarrett D'Amore ncsn = malloc(sizeof (*ncsn)); 102163bd69bSGarrett D'Amore if (ncsn == NULL) 103163bd69bSGarrett D'Amore return (false); 104163bd69bSGarrett D'Amore ncsn->csn_min = ncsn->csn_max = ch; 105163bd69bSGarrett D'Amore if (cset_rangecmp(csn, ch) < 0) { 106163bd69bSGarrett D'Amore ncsn->csn_left = csn->csn_left; 107163bd69bSGarrett D'Amore ncsn->csn_right = csn; 108163bd69bSGarrett D'Amore csn->csn_left = NULL; 109163bd69bSGarrett D'Amore } else { 110163bd69bSGarrett D'Amore ncsn->csn_right = csn->csn_right; 111163bd69bSGarrett D'Amore ncsn->csn_left = csn; 112163bd69bSGarrett D'Amore csn->csn_right = NULL; 113163bd69bSGarrett D'Amore } 114163bd69bSGarrett D'Amore cs->cs_root = ncsn; 115163bd69bSGarrett D'Amore 116163bd69bSGarrett D'Amore /* 117163bd69bSGarrett D'Amore * Coalesce with left and right neighbours if possible. 118163bd69bSGarrett D'Amore */ 119163bd69bSGarrett D'Amore if (ncsn->csn_left != NULL) { 120163bd69bSGarrett D'Amore ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1); 121163bd69bSGarrett D'Amore if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) { 122163bd69bSGarrett D'Amore oval = ncsn->csn_left->csn_min; 123163bd69bSGarrett D'Amore ncsn->csn_left = cset_delete(ncsn->csn_left, 124163bd69bSGarrett D'Amore ncsn->csn_left->csn_min); 125163bd69bSGarrett D'Amore ncsn->csn_min = oval; 126163bd69bSGarrett D'Amore } 127163bd69bSGarrett D'Amore } 128163bd69bSGarrett D'Amore if (ncsn->csn_right != NULL) { 129163bd69bSGarrett D'Amore ncsn->csn_right = cset_splay(ncsn->csn_right, 130163bd69bSGarrett D'Amore ncsn->csn_max + 1); 131163bd69bSGarrett D'Amore if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) { 132163bd69bSGarrett D'Amore oval = ncsn->csn_right->csn_max; 133163bd69bSGarrett D'Amore ncsn->csn_right = cset_delete(ncsn->csn_right, 134163bd69bSGarrett D'Amore ncsn->csn_right->csn_min); 135163bd69bSGarrett D'Amore ncsn->csn_max = oval; 136163bd69bSGarrett D'Amore } 137163bd69bSGarrett D'Amore } 138163bd69bSGarrett D'Amore 139163bd69bSGarrett D'Amore return (true); 140163bd69bSGarrett D'Amore } 141163bd69bSGarrett D'Amore 142163bd69bSGarrett D'Amore /* 143163bd69bSGarrett D'Amore * cset_in_hard -- 144163bd69bSGarrett D'Amore * Determine whether a character is in the set without using 145163bd69bSGarrett D'Amore * the cache. 146163bd69bSGarrett D'Amore */ 147163bd69bSGarrett D'Amore bool 148163bd69bSGarrett D'Amore cset_in_hard(struct cset *cs, wchar_t ch) 149163bd69bSGarrett D'Amore { 150163bd69bSGarrett D'Amore struct csclass *csc; 151163bd69bSGarrett D'Amore 152163bd69bSGarrett D'Amore for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) 153*84cf253fSRichard Lowe if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0)) 154163bd69bSGarrett D'Amore return (cs->cs_invert ^ true); 155163bd69bSGarrett D'Amore if (cs->cs_root != NULL) { 156163bd69bSGarrett D'Amore cs->cs_root = cset_splay(cs->cs_root, ch); 157*84cf253fSRichard Lowe return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0)); 158163bd69bSGarrett D'Amore } 159163bd69bSGarrett D'Amore return (cs->cs_invert ^ false); 160163bd69bSGarrett D'Amore } 161163bd69bSGarrett D'Amore 162163bd69bSGarrett D'Amore /* 163163bd69bSGarrett D'Amore * cset_cache -- 164163bd69bSGarrett D'Amore * Update the cache. 165163bd69bSGarrett D'Amore */ 166163bd69bSGarrett D'Amore void 167163bd69bSGarrett D'Amore cset_cache(struct cset *cs) 168163bd69bSGarrett D'Amore { 169163bd69bSGarrett D'Amore wchar_t i; 170163bd69bSGarrett D'Amore 171163bd69bSGarrett D'Amore for (i = 0; i < CS_CACHE_SIZE; i++) 172163bd69bSGarrett D'Amore cs->cs_cache[i] = cset_in_hard(cs, i); 173163bd69bSGarrett D'Amore 174163bd69bSGarrett D'Amore cs->cs_havecache = true; 175163bd69bSGarrett D'Amore } 176163bd69bSGarrett D'Amore 177163bd69bSGarrett D'Amore /* 178163bd69bSGarrett D'Amore * cset_invert -- 179163bd69bSGarrett D'Amore * Invert the character set. 180163bd69bSGarrett D'Amore */ 181163bd69bSGarrett D'Amore void 182163bd69bSGarrett D'Amore cset_invert(struct cset *cs) 183163bd69bSGarrett D'Amore { 184163bd69bSGarrett D'Amore 185163bd69bSGarrett D'Amore cs->cs_invert ^= true; 186163bd69bSGarrett D'Amore cs->cs_havecache = false; 187163bd69bSGarrett D'Amore } 188163bd69bSGarrett D'Amore 189163bd69bSGarrett D'Amore /* 190163bd69bSGarrett D'Amore * cset_addclass -- 191163bd69bSGarrett D'Amore * Add a wctype()-style character class to the set, optionally 192163bd69bSGarrett D'Amore * inverting it. 193163bd69bSGarrett D'Amore */ 194163bd69bSGarrett D'Amore bool 195163bd69bSGarrett D'Amore cset_addclass(struct cset *cs, wctype_t type, bool invert) 196163bd69bSGarrett D'Amore { 197163bd69bSGarrett D'Amore struct csclass *csc; 198163bd69bSGarrett D'Amore 199163bd69bSGarrett D'Amore csc = malloc(sizeof (*csc)); 200163bd69bSGarrett D'Amore if (csc == NULL) 201163bd69bSGarrett D'Amore return (false); 202163bd69bSGarrett D'Amore csc->csc_type = type; 203163bd69bSGarrett D'Amore csc->csc_invert = invert; 204163bd69bSGarrett D'Amore csc->csc_next = cs->cs_classes; 205163bd69bSGarrett D'Amore cs->cs_classes = csc; 206163bd69bSGarrett D'Amore cs->cs_havecache = false; 207163bd69bSGarrett D'Amore return (true); 208163bd69bSGarrett D'Amore } 209163bd69bSGarrett D'Amore 210163bd69bSGarrett D'Amore static int 211163bd69bSGarrett D'Amore cset_rangecmp(struct csnode *t, wchar_t ch) 212163bd69bSGarrett D'Amore { 213163bd69bSGarrett D'Amore 214163bd69bSGarrett D'Amore if (ch < t->csn_min) 215163bd69bSGarrett D'Amore return (-1); 216163bd69bSGarrett D'Amore if (ch > t->csn_max) 217163bd69bSGarrett D'Amore return (1); 218163bd69bSGarrett D'Amore return (0); 219163bd69bSGarrett D'Amore } 220163bd69bSGarrett D'Amore 221163bd69bSGarrett D'Amore static struct csnode * 222163bd69bSGarrett D'Amore cset_splay(struct csnode *t, wchar_t ch) 223163bd69bSGarrett D'Amore { 224163bd69bSGarrett D'Amore struct csnode N, *l, *r, *y; 225163bd69bSGarrett D'Amore 226163bd69bSGarrett D'Amore /* 227163bd69bSGarrett D'Amore * Based on public domain code from Sleator. 228163bd69bSGarrett D'Amore */ 229163bd69bSGarrett D'Amore 230163bd69bSGarrett D'Amore assert(t != NULL); 231163bd69bSGarrett D'Amore 232163bd69bSGarrett D'Amore N.csn_left = N.csn_right = NULL; 233163bd69bSGarrett D'Amore l = r = &N; 234163bd69bSGarrett D'Amore for (;;) { 235163bd69bSGarrett D'Amore if (cset_rangecmp(t, ch) < 0) { 236163bd69bSGarrett D'Amore if (t->csn_left != NULL && 237163bd69bSGarrett D'Amore cset_rangecmp(t->csn_left, ch) < 0) { 238163bd69bSGarrett D'Amore y = t->csn_left; 239163bd69bSGarrett D'Amore t->csn_left = y->csn_right; 240163bd69bSGarrett D'Amore y->csn_right = t; 241163bd69bSGarrett D'Amore t = y; 242163bd69bSGarrett D'Amore } 243163bd69bSGarrett D'Amore if (t->csn_left == NULL) 244163bd69bSGarrett D'Amore break; 245163bd69bSGarrett D'Amore r->csn_left = t; 246163bd69bSGarrett D'Amore r = t; 247163bd69bSGarrett D'Amore t = t->csn_left; 248163bd69bSGarrett D'Amore } else if (cset_rangecmp(t, ch) > 0) { 249163bd69bSGarrett D'Amore if (t->csn_right != NULL && 250163bd69bSGarrett D'Amore cset_rangecmp(t->csn_right, ch) > 0) { 251163bd69bSGarrett D'Amore y = t->csn_right; 252163bd69bSGarrett D'Amore t->csn_right = y->csn_left; 253163bd69bSGarrett D'Amore y->csn_left = t; 254163bd69bSGarrett D'Amore t = y; 255163bd69bSGarrett D'Amore } 256163bd69bSGarrett D'Amore if (t->csn_right == NULL) 257163bd69bSGarrett D'Amore break; 258163bd69bSGarrett D'Amore l->csn_right = t; 259163bd69bSGarrett D'Amore l = t; 260163bd69bSGarrett D'Amore t = t->csn_right; 261163bd69bSGarrett D'Amore } else 262163bd69bSGarrett D'Amore break; 263163bd69bSGarrett D'Amore } 264163bd69bSGarrett D'Amore l->csn_right = t->csn_left; 265163bd69bSGarrett D'Amore r->csn_left = t->csn_right; 266163bd69bSGarrett D'Amore t->csn_left = N.csn_right; 267163bd69bSGarrett D'Amore t->csn_right = N.csn_left; 268163bd69bSGarrett D'Amore return (t); 269163bd69bSGarrett D'Amore } 270163bd69bSGarrett D'Amore 271163bd69bSGarrett D'Amore static struct csnode * 272163bd69bSGarrett D'Amore cset_delete(struct csnode *t, wchar_t ch) 273163bd69bSGarrett D'Amore { 274163bd69bSGarrett D'Amore struct csnode *x; 275163bd69bSGarrett D'Amore 276163bd69bSGarrett D'Amore assert(t != NULL); 277163bd69bSGarrett D'Amore t = cset_splay(t, ch); 278163bd69bSGarrett D'Amore assert(cset_rangecmp(t, ch) == 0); 279163bd69bSGarrett D'Amore if (t->csn_left == NULL) 280163bd69bSGarrett D'Amore x = t->csn_right; 281163bd69bSGarrett D'Amore else { 282163bd69bSGarrett D'Amore x = cset_splay(t->csn_left, ch); 283163bd69bSGarrett D'Amore x->csn_right = t->csn_right; 284163bd69bSGarrett D'Amore } 285163bd69bSGarrett D'Amore free(t); 286163bd69bSGarrett D'Amore return (x); 287163bd69bSGarrett D'Amore } 288