1*163bd69bSGarrett D'Amore /* 2*163bd69bSGarrett D'Amore * Copyright (c) 2004 Tim J. Robbins. 3*163bd69bSGarrett D'Amore * All rights reserved. 4*163bd69bSGarrett D'Amore * 5*163bd69bSGarrett D'Amore * Redistribution and use in source and binary forms, with or without 6*163bd69bSGarrett D'Amore * modification, are permitted provided that the following conditions 7*163bd69bSGarrett D'Amore * are met: 8*163bd69bSGarrett D'Amore * 1. Redistributions of source code must retain the above copyright 9*163bd69bSGarrett D'Amore * notice, this list of conditions and the following disclaimer. 10*163bd69bSGarrett D'Amore * 2. Redistributions in binary form must reproduce the above copyright 11*163bd69bSGarrett D'Amore * notice, this list of conditions and the following disclaimer in the 12*163bd69bSGarrett D'Amore * documentation and/or other materials provided with the distribution. 13*163bd69bSGarrett D'Amore * 14*163bd69bSGarrett D'Amore * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15*163bd69bSGarrett D'Amore * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16*163bd69bSGarrett D'Amore * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17*163bd69bSGarrett D'Amore * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18*163bd69bSGarrett D'Amore * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19*163bd69bSGarrett D'Amore * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20*163bd69bSGarrett D'Amore * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21*163bd69bSGarrett D'Amore * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22*163bd69bSGarrett D'Amore * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23*163bd69bSGarrett D'Amore * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24*163bd69bSGarrett D'Amore * SUCH DAMAGE. 25*163bd69bSGarrett D'Amore */ 26*163bd69bSGarrett D'Amore /* 27*163bd69bSGarrett D'Amore * "Set of characters" ADT implemented as a splay tree of extents, with 28*163bd69bSGarrett D'Amore * a lookup table cache to simplify looking up the first bunch of 29*163bd69bSGarrett D'Amore * characters (which are presumably more common than others). 30*163bd69bSGarrett D'Amore */ 31*163bd69bSGarrett D'Amore 32*163bd69bSGarrett D'Amore #include <assert.h> 33*163bd69bSGarrett D'Amore #include <stdbool.h> 34*163bd69bSGarrett D'Amore #include <stdlib.h> 35*163bd69bSGarrett D'Amore #include <wchar.h> 36*163bd69bSGarrett D'Amore #include <wctype.h> 37*163bd69bSGarrett D'Amore #include "cset.h" 38*163bd69bSGarrett D'Amore 39*163bd69bSGarrett D'Amore static struct csnode *cset_delete(struct csnode *, wchar_t); 40*163bd69bSGarrett D'Amore static int cset_rangecmp(struct csnode *, wchar_t); 41*163bd69bSGarrett D'Amore static struct csnode *cset_splay(struct csnode *, wchar_t); 42*163bd69bSGarrett D'Amore 43*163bd69bSGarrett D'Amore /* 44*163bd69bSGarrett D'Amore * cset_alloc -- 45*163bd69bSGarrett D'Amore * Allocate a set of characters. 46*163bd69bSGarrett D'Amore */ 47*163bd69bSGarrett D'Amore struct cset * 48*163bd69bSGarrett D'Amore cset_alloc(void) 49*163bd69bSGarrett D'Amore { 50*163bd69bSGarrett D'Amore struct cset *cs; 51*163bd69bSGarrett D'Amore 52*163bd69bSGarrett D'Amore if ((cs = malloc(sizeof (*cs))) == NULL) 53*163bd69bSGarrett D'Amore return (NULL); 54*163bd69bSGarrett D'Amore cs->cs_root = NULL; 55*163bd69bSGarrett D'Amore cs->cs_classes = NULL; 56*163bd69bSGarrett D'Amore cs->cs_havecache = false; 57*163bd69bSGarrett D'Amore cs->cs_invert = false; 58*163bd69bSGarrett D'Amore return (cs); 59*163bd69bSGarrett D'Amore } 60*163bd69bSGarrett D'Amore 61*163bd69bSGarrett D'Amore /* 62*163bd69bSGarrett D'Amore * cset_add -- 63*163bd69bSGarrett D'Amore * Add a character to the set. 64*163bd69bSGarrett D'Amore */ 65*163bd69bSGarrett D'Amore bool 66*163bd69bSGarrett D'Amore cset_add(struct cset *cs, wchar_t ch) 67*163bd69bSGarrett D'Amore { 68*163bd69bSGarrett D'Amore struct csnode *csn, *ncsn; 69*163bd69bSGarrett D'Amore wchar_t oval; 70*163bd69bSGarrett D'Amore 71*163bd69bSGarrett D'Amore cs->cs_havecache = false; 72*163bd69bSGarrett D'Amore 73*163bd69bSGarrett D'Amore /* 74*163bd69bSGarrett D'Amore * Inserting into empty tree; new item becomes the root. 75*163bd69bSGarrett D'Amore */ 76*163bd69bSGarrett D'Amore if (cs->cs_root == NULL) { 77*163bd69bSGarrett D'Amore csn = malloc(sizeof (*cs->cs_root)); 78*163bd69bSGarrett D'Amore if (csn == NULL) 79*163bd69bSGarrett D'Amore return (false); 80*163bd69bSGarrett D'Amore csn->csn_left = csn->csn_right = NULL; 81*163bd69bSGarrett D'Amore csn->csn_min = csn->csn_max = ch; 82*163bd69bSGarrett D'Amore cs->cs_root = csn; 83*163bd69bSGarrett D'Amore return (true); 84*163bd69bSGarrett D'Amore } 85*163bd69bSGarrett D'Amore 86*163bd69bSGarrett D'Amore /* 87*163bd69bSGarrett D'Amore * Splay to check whether the item already exists, and otherwise, 88*163bd69bSGarrett D'Amore * where we should put it. 89*163bd69bSGarrett D'Amore */ 90*163bd69bSGarrett D'Amore csn = cs->cs_root = cset_splay(cs->cs_root, ch); 91*163bd69bSGarrett D'Amore 92*163bd69bSGarrett D'Amore /* 93*163bd69bSGarrett D'Amore * Avoid adding duplicate nodes. 94*163bd69bSGarrett D'Amore */ 95*163bd69bSGarrett D'Amore if (cset_rangecmp(csn, ch) == 0) 96*163bd69bSGarrett D'Amore return (true); 97*163bd69bSGarrett D'Amore 98*163bd69bSGarrett D'Amore /* 99*163bd69bSGarrett D'Amore * Allocate a new node and make it the new root. 100*163bd69bSGarrett D'Amore */ 101*163bd69bSGarrett D'Amore ncsn = malloc(sizeof (*ncsn)); 102*163bd69bSGarrett D'Amore if (ncsn == NULL) 103*163bd69bSGarrett D'Amore return (false); 104*163bd69bSGarrett D'Amore ncsn->csn_min = ncsn->csn_max = ch; 105*163bd69bSGarrett D'Amore if (cset_rangecmp(csn, ch) < 0) { 106*163bd69bSGarrett D'Amore ncsn->csn_left = csn->csn_left; 107*163bd69bSGarrett D'Amore ncsn->csn_right = csn; 108*163bd69bSGarrett D'Amore csn->csn_left = NULL; 109*163bd69bSGarrett D'Amore } else { 110*163bd69bSGarrett D'Amore ncsn->csn_right = csn->csn_right; 111*163bd69bSGarrett D'Amore ncsn->csn_left = csn; 112*163bd69bSGarrett D'Amore csn->csn_right = NULL; 113*163bd69bSGarrett D'Amore } 114*163bd69bSGarrett D'Amore cs->cs_root = ncsn; 115*163bd69bSGarrett D'Amore 116*163bd69bSGarrett D'Amore /* 117*163bd69bSGarrett D'Amore * Coalesce with left and right neighbours if possible. 118*163bd69bSGarrett D'Amore */ 119*163bd69bSGarrett D'Amore if (ncsn->csn_left != NULL) { 120*163bd69bSGarrett D'Amore ncsn->csn_left = cset_splay(ncsn->csn_left, ncsn->csn_min - 1); 121*163bd69bSGarrett D'Amore if (ncsn->csn_left->csn_max == ncsn->csn_min - 1) { 122*163bd69bSGarrett D'Amore oval = ncsn->csn_left->csn_min; 123*163bd69bSGarrett D'Amore ncsn->csn_left = cset_delete(ncsn->csn_left, 124*163bd69bSGarrett D'Amore ncsn->csn_left->csn_min); 125*163bd69bSGarrett D'Amore ncsn->csn_min = oval; 126*163bd69bSGarrett D'Amore } 127*163bd69bSGarrett D'Amore } 128*163bd69bSGarrett D'Amore if (ncsn->csn_right != NULL) { 129*163bd69bSGarrett D'Amore ncsn->csn_right = cset_splay(ncsn->csn_right, 130*163bd69bSGarrett D'Amore ncsn->csn_max + 1); 131*163bd69bSGarrett D'Amore if (ncsn->csn_right->csn_min == ncsn->csn_max + 1) { 132*163bd69bSGarrett D'Amore oval = ncsn->csn_right->csn_max; 133*163bd69bSGarrett D'Amore ncsn->csn_right = cset_delete(ncsn->csn_right, 134*163bd69bSGarrett D'Amore ncsn->csn_right->csn_min); 135*163bd69bSGarrett D'Amore ncsn->csn_max = oval; 136*163bd69bSGarrett D'Amore } 137*163bd69bSGarrett D'Amore } 138*163bd69bSGarrett D'Amore 139*163bd69bSGarrett D'Amore return (true); 140*163bd69bSGarrett D'Amore } 141*163bd69bSGarrett D'Amore 142*163bd69bSGarrett D'Amore /* 143*163bd69bSGarrett D'Amore * cset_in_hard -- 144*163bd69bSGarrett D'Amore * Determine whether a character is in the set without using 145*163bd69bSGarrett D'Amore * the cache. 146*163bd69bSGarrett D'Amore */ 147*163bd69bSGarrett D'Amore bool 148*163bd69bSGarrett D'Amore cset_in_hard(struct cset *cs, wchar_t ch) 149*163bd69bSGarrett D'Amore { 150*163bd69bSGarrett D'Amore struct csclass *csc; 151*163bd69bSGarrett D'Amore 152*163bd69bSGarrett D'Amore for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next) 153*163bd69bSGarrett D'Amore if ((csc->csc_invert ^ iswctype(ch, csc->csc_type)) != 0) 154*163bd69bSGarrett D'Amore return (cs->cs_invert ^ true); 155*163bd69bSGarrett D'Amore if (cs->cs_root != NULL) { 156*163bd69bSGarrett D'Amore cs->cs_root = cset_splay(cs->cs_root, ch); 157*163bd69bSGarrett D'Amore return ((cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch)) == 0); 158*163bd69bSGarrett D'Amore } 159*163bd69bSGarrett D'Amore return (cs->cs_invert ^ false); 160*163bd69bSGarrett D'Amore } 161*163bd69bSGarrett D'Amore 162*163bd69bSGarrett D'Amore /* 163*163bd69bSGarrett D'Amore * cset_cache -- 164*163bd69bSGarrett D'Amore * Update the cache. 165*163bd69bSGarrett D'Amore */ 166*163bd69bSGarrett D'Amore void 167*163bd69bSGarrett D'Amore cset_cache(struct cset *cs) 168*163bd69bSGarrett D'Amore { 169*163bd69bSGarrett D'Amore wchar_t i; 170*163bd69bSGarrett D'Amore 171*163bd69bSGarrett D'Amore for (i = 0; i < CS_CACHE_SIZE; i++) 172*163bd69bSGarrett D'Amore cs->cs_cache[i] = cset_in_hard(cs, i); 173*163bd69bSGarrett D'Amore 174*163bd69bSGarrett D'Amore cs->cs_havecache = true; 175*163bd69bSGarrett D'Amore } 176*163bd69bSGarrett D'Amore 177*163bd69bSGarrett D'Amore /* 178*163bd69bSGarrett D'Amore * cset_invert -- 179*163bd69bSGarrett D'Amore * Invert the character set. 180*163bd69bSGarrett D'Amore */ 181*163bd69bSGarrett D'Amore void 182*163bd69bSGarrett D'Amore cset_invert(struct cset *cs) 183*163bd69bSGarrett D'Amore { 184*163bd69bSGarrett D'Amore 185*163bd69bSGarrett D'Amore cs->cs_invert ^= true; 186*163bd69bSGarrett D'Amore cs->cs_havecache = false; 187*163bd69bSGarrett D'Amore } 188*163bd69bSGarrett D'Amore 189*163bd69bSGarrett D'Amore /* 190*163bd69bSGarrett D'Amore * cset_addclass -- 191*163bd69bSGarrett D'Amore * Add a wctype()-style character class to the set, optionally 192*163bd69bSGarrett D'Amore * inverting it. 193*163bd69bSGarrett D'Amore */ 194*163bd69bSGarrett D'Amore bool 195*163bd69bSGarrett D'Amore cset_addclass(struct cset *cs, wctype_t type, bool invert) 196*163bd69bSGarrett D'Amore { 197*163bd69bSGarrett D'Amore struct csclass *csc; 198*163bd69bSGarrett D'Amore 199*163bd69bSGarrett D'Amore csc = malloc(sizeof (*csc)); 200*163bd69bSGarrett D'Amore if (csc == NULL) 201*163bd69bSGarrett D'Amore return (false); 202*163bd69bSGarrett D'Amore csc->csc_type = type; 203*163bd69bSGarrett D'Amore csc->csc_invert = invert; 204*163bd69bSGarrett D'Amore csc->csc_next = cs->cs_classes; 205*163bd69bSGarrett D'Amore cs->cs_classes = csc; 206*163bd69bSGarrett D'Amore cs->cs_havecache = false; 207*163bd69bSGarrett D'Amore return (true); 208*163bd69bSGarrett D'Amore } 209*163bd69bSGarrett D'Amore 210*163bd69bSGarrett D'Amore static int 211*163bd69bSGarrett D'Amore cset_rangecmp(struct csnode *t, wchar_t ch) 212*163bd69bSGarrett D'Amore { 213*163bd69bSGarrett D'Amore 214*163bd69bSGarrett D'Amore if (ch < t->csn_min) 215*163bd69bSGarrett D'Amore return (-1); 216*163bd69bSGarrett D'Amore if (ch > t->csn_max) 217*163bd69bSGarrett D'Amore return (1); 218*163bd69bSGarrett D'Amore return (0); 219*163bd69bSGarrett D'Amore } 220*163bd69bSGarrett D'Amore 221*163bd69bSGarrett D'Amore static struct csnode * 222*163bd69bSGarrett D'Amore cset_splay(struct csnode *t, wchar_t ch) 223*163bd69bSGarrett D'Amore { 224*163bd69bSGarrett D'Amore struct csnode N, *l, *r, *y; 225*163bd69bSGarrett D'Amore 226*163bd69bSGarrett D'Amore /* 227*163bd69bSGarrett D'Amore * Based on public domain code from Sleator. 228*163bd69bSGarrett D'Amore */ 229*163bd69bSGarrett D'Amore 230*163bd69bSGarrett D'Amore assert(t != NULL); 231*163bd69bSGarrett D'Amore 232*163bd69bSGarrett D'Amore N.csn_left = N.csn_right = NULL; 233*163bd69bSGarrett D'Amore l = r = &N; 234*163bd69bSGarrett D'Amore for (;;) { 235*163bd69bSGarrett D'Amore if (cset_rangecmp(t, ch) < 0) { 236*163bd69bSGarrett D'Amore if (t->csn_left != NULL && 237*163bd69bSGarrett D'Amore cset_rangecmp(t->csn_left, ch) < 0) { 238*163bd69bSGarrett D'Amore y = t->csn_left; 239*163bd69bSGarrett D'Amore t->csn_left = y->csn_right; 240*163bd69bSGarrett D'Amore y->csn_right = t; 241*163bd69bSGarrett D'Amore t = y; 242*163bd69bSGarrett D'Amore } 243*163bd69bSGarrett D'Amore if (t->csn_left == NULL) 244*163bd69bSGarrett D'Amore break; 245*163bd69bSGarrett D'Amore r->csn_left = t; 246*163bd69bSGarrett D'Amore r = t; 247*163bd69bSGarrett D'Amore t = t->csn_left; 248*163bd69bSGarrett D'Amore } else if (cset_rangecmp(t, ch) > 0) { 249*163bd69bSGarrett D'Amore if (t->csn_right != NULL && 250*163bd69bSGarrett D'Amore cset_rangecmp(t->csn_right, ch) > 0) { 251*163bd69bSGarrett D'Amore y = t->csn_right; 252*163bd69bSGarrett D'Amore t->csn_right = y->csn_left; 253*163bd69bSGarrett D'Amore y->csn_left = t; 254*163bd69bSGarrett D'Amore t = y; 255*163bd69bSGarrett D'Amore } 256*163bd69bSGarrett D'Amore if (t->csn_right == NULL) 257*163bd69bSGarrett D'Amore break; 258*163bd69bSGarrett D'Amore l->csn_right = t; 259*163bd69bSGarrett D'Amore l = t; 260*163bd69bSGarrett D'Amore t = t->csn_right; 261*163bd69bSGarrett D'Amore } else 262*163bd69bSGarrett D'Amore break; 263*163bd69bSGarrett D'Amore } 264*163bd69bSGarrett D'Amore l->csn_right = t->csn_left; 265*163bd69bSGarrett D'Amore r->csn_left = t->csn_right; 266*163bd69bSGarrett D'Amore t->csn_left = N.csn_right; 267*163bd69bSGarrett D'Amore t->csn_right = N.csn_left; 268*163bd69bSGarrett D'Amore return (t); 269*163bd69bSGarrett D'Amore } 270*163bd69bSGarrett D'Amore 271*163bd69bSGarrett D'Amore static struct csnode * 272*163bd69bSGarrett D'Amore cset_delete(struct csnode *t, wchar_t ch) 273*163bd69bSGarrett D'Amore { 274*163bd69bSGarrett D'Amore struct csnode *x; 275*163bd69bSGarrett D'Amore 276*163bd69bSGarrett D'Amore assert(t != NULL); 277*163bd69bSGarrett D'Amore t = cset_splay(t, ch); 278*163bd69bSGarrett D'Amore assert(cset_rangecmp(t, ch) == 0); 279*163bd69bSGarrett D'Amore if (t->csn_left == NULL) 280*163bd69bSGarrett D'Amore x = t->csn_right; 281*163bd69bSGarrett D'Amore else { 282*163bd69bSGarrett D'Amore x = cset_splay(t->csn_left, ch); 283*163bd69bSGarrett D'Amore x->csn_right = t->csn_right; 284*163bd69bSGarrett D'Amore } 285*163bd69bSGarrett D'Amore free(t); 286*163bd69bSGarrett D'Amore return (x); 287*163bd69bSGarrett D'Amore } 288