1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 /* 8 * Copyright (c) 1990, 1993, 1994, 1995, 1996 9 * Keith Bostic. All rights reserved. 10 */ 11 /* 12 * Copyright (c) 1990, 1993, 1994, 1995 13 * The Regents of the University of California. All rights reserved. 14 * 15 * This code is derived from software contributed to Berkeley by 16 * Mike Olson. 17 * 18 * Redistribution and use in source and binary forms, with or without 19 * modification, are permitted provided that the following conditions 20 * are met: 21 * 1. Redistributions of source code must retain the above copyright 22 * notice, this list of conditions and the following disclaimer. 23 * 2. Redistributions in binary form must reproduce the above copyright 24 * notice, this list of conditions and the following disclaimer in the 25 * documentation and/or other materials provided with the distribution. 26 * 3. All advertising materials mentioning features or use of this software 27 * must display the following acknowledgement: 28 * This product includes software developed by the University of 29 * California, Berkeley and its contributors. 30 * 4. Neither the name of the University nor the names of its contributors 31 * may be used to endorse or promote products derived from this software 32 * without specific prior written permission. 33 * 34 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 35 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 36 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 37 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 38 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 39 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 40 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 41 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 42 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 43 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 44 * SUCH DAMAGE. 45 */ 46 47 #include "config.h" 48 49 #ifndef lint 50 static const char sccsid[] = "@(#)bt_search.c 10.25 (Sleepycat) 12/16/98"; 51 #endif /* not lint */ 52 53 #ifndef NO_SYSTEM_INCLUDES 54 #include <sys/types.h> 55 56 #include <errno.h> 57 #include <string.h> 58 #endif 59 60 #include "db_int.h" 61 #include "db_page.h" 62 #include "btree.h" 63 64 /* 65 * __bam_search -- 66 * Search a btree for a key. 67 * 68 * PUBLIC: int __bam_search __P((DBC *, 69 * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *)); 70 */ 71 int 72 __bam_search(dbc, key, flags, stop, recnop, exactp) 73 DBC *dbc; 74 const DBT *key; 75 u_int32_t flags; 76 int stop, *exactp; 77 db_recno_t *recnop; 78 { 79 BTREE *t; 80 CURSOR *cp; 81 DB *dbp; 82 DB_LOCK lock; 83 PAGE *h; 84 db_indx_t base, i, indx, lim; 85 db_pgno_t pg; 86 db_recno_t recno; 87 int cmp, jump, ret, stack; 88 89 dbp = dbc->dbp; 90 cp = dbc->internal; 91 t = dbp->internal; 92 recno = 0; 93 94 BT_STK_CLR(cp); 95 96 /* 97 * There are several ways we search a btree tree. The flags argument 98 * specifies if we're acquiring read or write locks, if we position 99 * to the first or last item in a set of duplicates, if we return 100 * deleted items, and if we are locking pairs of pages. In addition, 101 * if we're modifying record numbers, we have to lock the entire tree 102 * regardless. See btree.h for more details. 103 * 104 * If write-locking pages, we need to know whether or not to acquire a 105 * write lock on a page before getting it. This depends on how deep it 106 * is in tree, which we don't know until we acquire the root page. So, 107 * if we need to lock the root page we may have to upgrade it later, 108 * because we won't get the correct lock initially. 109 * 110 * Retrieve the root page. 111 */ 112 pg = PGNO_ROOT; 113 stack = F_ISSET(dbp, DB_BT_RECNUM) && LF_ISSET(S_STACK); 114 if ((ret = __bam_lget(dbc, 115 0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) 116 return (ret); 117 if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { 118 (void)__BT_LPUT(dbc, lock); 119 return (ret); 120 } 121 122 /* 123 * Decide if we need to save this page; if we do, write lock it. 124 * We deliberately don't lock-couple on this call. If the tree 125 * is tiny, i.e., one page, and two threads are busily updating 126 * the root page, we're almost guaranteed deadlocks galore, as 127 * each one gets a read lock and then blocks the other's attempt 128 * for a write lock. 129 */ 130 if (!stack && 131 ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || 132 (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { 133 (void)memp_fput(dbp->mpf, h, 0); 134 (void)__BT_LPUT(dbc, lock); 135 if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) 136 return (ret); 137 if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { 138 (void)__BT_LPUT(dbc, lock); 139 return (ret); 140 } 141 stack = 1; 142 } 143 144 for (;;) { 145 /* 146 * Do a binary search on the current page. If we're searching 147 * a leaf page, we have to manipulate the indices in groups of 148 * two. If we're searching an internal page, they're an index 149 * per page item. If we find an exact match on a leaf page, 150 * we're done. 151 */ 152 jump = TYPE(h) == P_LBTREE ? P_INDX : O_INDX; 153 for (base = 0, 154 lim = NUM_ENT(h) / (db_indx_t)jump; lim != 0; lim >>= 1) { 155 indx = base + ((lim >> 1) * jump); 156 if ((cmp = 157 __bam_cmp(dbp, key, h, indx, t->bt_compare)) == 0) { 158 if (TYPE(h) == P_LBTREE) 159 goto match; 160 goto next; 161 } 162 if (cmp > 0) { 163 base = indx + jump; 164 --lim; 165 } 166 } 167 168 /* 169 * No match found. Base is the smallest index greater than 170 * key and may be zero or a last + O_INDX index. 171 * 172 * If it's a leaf page, return base as the "found" value. 173 * Delete only deletes exact matches. 174 */ 175 if (TYPE(h) == P_LBTREE) { 176 *exactp = 0; 177 178 if (LF_ISSET(S_EXACT)) 179 goto notfound; 180 181 /* 182 * !!! 183 * Possibly returning a deleted record -- DB_SET_RANGE, 184 * DB_KEYFIRST and DB_KEYLAST don't require an exact 185 * match, and we don't want to walk multiple pages here 186 * to find an undeleted record. This is handled in the 187 * __bam_c_search() routine. 188 */ 189 BT_STK_ENTER(cp, h, base, lock, ret); 190 return (ret); 191 } 192 193 /* 194 * If it's not a leaf page, record the internal page (which is 195 * a parent page for the key). Decrement the base by 1 if it's 196 * non-zero so that if a split later occurs, the inserted page 197 * will be to the right of the saved page. 198 */ 199 indx = base > 0 ? base - O_INDX : base; 200 201 /* 202 * If we're trying to calculate the record number, sum up 203 * all the record numbers on this page up to the indx point. 204 */ 205 if (recnop != NULL) 206 for (i = 0; i < indx; ++i) 207 recno += GET_BINTERNAL(h, i)->nrecs; 208 209 next: pg = GET_BINTERNAL(h, indx)->pgno; 210 if (stack) { 211 /* Return if this is the lowest page wanted. */ 212 if (LF_ISSET(S_PARENT) && stop == h->level) { 213 BT_STK_ENTER(cp, h, indx, lock, ret); 214 return (ret); 215 } 216 BT_STK_PUSH(cp, h, indx, lock, ret); 217 if (ret != 0) 218 goto err; 219 220 if ((ret = 221 __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) 222 goto err; 223 } else { 224 /* 225 * Decide if we want to return a reference to the next 226 * page in the return stack. If so, lock it and never 227 * unlock it. 228 */ 229 if ((LF_ISSET(S_PARENT) && 230 (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || 231 (h->level - 1) == LEAFLEVEL) 232 stack = 1; 233 234 (void)memp_fput(dbp->mpf, h, 0); 235 236 if ((ret = 237 __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ? 238 DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) 239 goto err; 240 } 241 if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) 242 goto err; 243 } 244 /* NOTREACHED */ 245 246 match: *exactp = 1; 247 248 /* 249 * If we're trying to calculate the record number, add in the 250 * offset on this page and correct for the fact that records 251 * in the tree are 0-based. 252 */ 253 if (recnop != NULL) 254 *recnop = recno + (indx / P_INDX) + 1; 255 256 /* 257 * If we got here, we know that we have a btree leaf page. 258 * 259 * If there are duplicates, go to the first/last one. This is 260 * safe because we know that we're not going to leave the page, 261 * all duplicate sets that are not on overflow pages exist on a 262 * single leaf page. 263 */ 264 if (LF_ISSET(S_DUPLAST)) 265 while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && 266 h->inp[indx] == h->inp[indx + P_INDX]) 267 indx += P_INDX; 268 else 269 while (indx > 0 && 270 h->inp[indx] == h->inp[indx - P_INDX]) 271 indx -= P_INDX; 272 273 /* 274 * Now check if we are allowed to return deleted items; if not 275 * find the next (or previous) non-deleted item. 276 */ 277 if (LF_ISSET(S_DELNO)) { 278 if (LF_ISSET(S_DUPLAST)) 279 while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) && 280 indx > 0 && 281 h->inp[indx] == h->inp[indx - P_INDX]) 282 indx -= P_INDX; 283 else 284 while (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type) && 285 indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && 286 h->inp[indx] == h->inp[indx + P_INDX]) 287 indx += P_INDX; 288 289 if (B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) 290 goto notfound; 291 } 292 293 BT_STK_ENTER(cp, h, indx, lock, ret); 294 return (ret); 295 296 notfound: 297 (void)memp_fput(dbp->mpf, h, 0); 298 (void)__BT_LPUT(dbc, lock); 299 ret = DB_NOTFOUND; 300 301 err: if (cp->csp > cp->sp) { 302 BT_STK_POP(cp); 303 __bam_stkrel(dbc, 0); 304 } 305 return (ret); 306 } 307 308 /* 309 * __bam_stkrel -- 310 * Release all pages currently held in the stack. 311 * 312 * PUBLIC: int __bam_stkrel __P((DBC *, int)); 313 */ 314 int 315 __bam_stkrel(dbc, nolocks) 316 DBC *dbc; 317 int nolocks; 318 { 319 CURSOR *cp; 320 DB *dbp; 321 EPG *epg; 322 323 dbp = dbc->dbp; 324 cp = dbc->internal; 325 326 /* Release inner pages first. */ 327 for (epg = cp->sp; epg <= cp->csp; ++epg) { 328 if (epg->page != NULL) 329 (void)memp_fput(dbp->mpf, epg->page, 0); 330 if (epg->lock != LOCK_INVALID) 331 if (nolocks) 332 (void)__BT_LPUT(dbc, epg->lock); 333 else 334 (void)__BT_TLPUT(dbc, epg->lock); 335 } 336 337 /* Clear the stack, all pages have been released. */ 338 BT_STK_CLR(cp); 339 340 return (0); 341 } 342 343 /* 344 * __bam_stkgrow -- 345 * Grow the stack. 346 * 347 * PUBLIC: int __bam_stkgrow __P((CURSOR *)); 348 */ 349 int 350 __bam_stkgrow(cp) 351 CURSOR *cp; 352 { 353 EPG *p; 354 size_t entries; 355 int ret; 356 357 entries = cp->esp - cp->sp; 358 359 if ((ret = __os_calloc(entries * 2, sizeof(EPG), &p)) != 0) 360 return (ret); 361 memcpy(p, cp->sp, entries * sizeof(EPG)); 362 if (cp->sp != cp->stack) 363 __os_free(cp->sp, entries * sizeof(EPG)); 364 cp->sp = p; 365 cp->csp = p + entries; 366 cp->esp = p + entries * 2; 367 return (0); 368 } 369