1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 /* 8 * Copyright (c) 1990, 1993, 1994, 1995, 1996 9 * Keith Bostic. All rights reserved. 10 */ 11 /* 12 * Copyright (c) 1990, 1993 13 * The Regents of the University of California. All rights reserved. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. All advertising materials mentioning features or use of this software 24 * must display the following acknowledgement: 25 * This product includes software developed by the University of 26 * California, Berkeley and its contributors. 27 * 4. Neither the name of the University nor the names of its contributors 28 * may be used to endorse or promote products derived from this software 29 * without specific prior written permission. 30 * 31 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 32 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 33 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 35 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 36 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 37 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 38 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 39 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 40 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 41 * SUCH DAMAGE. 42 */ 43 44 #include "config.h" 45 46 #ifndef lint 47 static const char sccsid[] = "@(#)bt_rsearch.c 10.21 (Sleepycat) 12/2/98"; 48 #endif /* not lint */ 49 50 #ifndef NO_SYSTEM_INCLUDES 51 #include <sys/types.h> 52 #endif 53 54 #include "db_int.h" 55 #include "db_page.h" 56 #include "btree.h" 57 58 /* 59 * __bam_rsearch -- 60 * Search a btree for a record number. 61 * 62 * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); 63 */ 64 int 65 __bam_rsearch(dbc, recnop, flags, stop, exactp) 66 DBC *dbc; 67 db_recno_t *recnop; 68 u_int32_t flags; 69 int stop, *exactp; 70 { 71 BINTERNAL *bi; 72 CURSOR *cp; 73 DB *dbp; 74 DB_LOCK lock; 75 PAGE *h; 76 RINTERNAL *ri; 77 db_indx_t indx, top; 78 db_pgno_t pg; 79 db_recno_t i, recno, total; 80 int ret, stack; 81 82 dbp = dbc->dbp; 83 cp = dbc->internal; 84 85 BT_STK_CLR(cp); 86 87 /* 88 * There are several ways we search a btree tree. The flags argument 89 * specifies if we're acquiring read or write locks and if we are 90 * locking pairs of pages. In addition, if we're adding or deleting 91 * an item, we have to lock the entire tree, regardless. See btree.h 92 * for more details. 93 * 94 * If write-locking pages, we need to know whether or not to acquire a 95 * write lock on a page before getting it. This depends on how deep it 96 * is in tree, which we don't know until we acquire the root page. So, 97 * if we need to lock the root page we may have to upgrade it later, 98 * because we won't get the correct lock initially. 99 * 100 * Retrieve the root page. 101 */ 102 pg = PGNO_ROOT; 103 stack = LF_ISSET(S_STACK); 104 if ((ret = __bam_lget(dbc, 105 0, pg, stack ? DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) 106 return (ret); 107 if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { 108 (void)__BT_LPUT(dbc, lock); 109 return (ret); 110 } 111 112 /* 113 * Decide if we need to save this page; if we do, write lock it. 114 * We deliberately don't lock-couple on this call. If the tree 115 * is tiny, i.e., one page, and two threads are busily updating 116 * the root page, we're almost guaranteed deadlocks galore, as 117 * each one gets a read lock and then blocks the other's attempt 118 * for a write lock. 119 */ 120 if (!stack && 121 ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || 122 (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { 123 (void)memp_fput(dbp->mpf, h, 0); 124 (void)__BT_LPUT(dbc, lock); 125 if ((ret = __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) 126 return (ret); 127 if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { 128 (void)__BT_LPUT(dbc, lock); 129 return (ret); 130 } 131 stack = 1; 132 } 133 134 /* 135 * If appending to the tree, set the record number now -- we have the 136 * root page locked. 137 * 138 * Delete only deletes exact matches, read only returns exact matches. 139 * Note, this is different from __bam_search(), which returns non-exact 140 * matches for read. 141 * 142 * The record may not exist. We can only return the correct location 143 * for the record immediately after the last record in the tree, so do 144 * a fast check now. 145 */ 146 total = RE_NREC(h); 147 if (LF_ISSET(S_APPEND)) { 148 *exactp = 0; 149 *recnop = recno = total + 1; 150 } else { 151 recno = *recnop; 152 if (recno <= total) 153 *exactp = 1; 154 else { 155 *exactp = 0; 156 if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) { 157 (void)memp_fput(dbp->mpf, h, 0); 158 (void)__BT_LPUT(dbc, lock); 159 return (DB_NOTFOUND); 160 } 161 } 162 } 163 164 /* 165 * !!! 166 * Record numbers in the tree are 0-based, but the recno is 167 * 1-based. All of the calculations below have to take this 168 * into account. 169 */ 170 for (total = 0;;) { 171 switch (TYPE(h)) { 172 case P_LBTREE: 173 recno -= total; 174 175 /* 176 * There may be logically deleted records on the page, 177 * walk the page correcting for them. The record may 178 * not exist if there are enough deleted records in the 179 * page. 180 */ 181 if (recno <= (db_recno_t)NUM_ENT(h) / P_INDX) 182 for (i = recno - 1;; --i) { 183 if (B_DISSET(GET_BKEYDATA(h, 184 i * P_INDX + O_INDX)->type)) 185 ++recno; 186 if (i == 0) 187 break; 188 } 189 if (recno > (db_recno_t)NUM_ENT(h) / P_INDX) { 190 *exactp = 0; 191 if (!LF_ISSET(S_PAST_EOF) || recno > 192 (db_recno_t)(NUM_ENT(h) / P_INDX + 1)) { 193 ret = DB_NOTFOUND; 194 goto err; 195 } 196 197 } 198 199 /* Correct from 1-based to 0-based for a page offset. */ 200 --recno; 201 BT_STK_ENTER(cp, h, recno * P_INDX, lock, ret); 202 return (ret); 203 case P_IBTREE: 204 for (indx = 0, top = NUM_ENT(h);;) { 205 bi = GET_BINTERNAL(h, indx); 206 if (++indx == top || total + bi->nrecs >= recno) 207 break; 208 total += bi->nrecs; 209 } 210 pg = bi->pgno; 211 break; 212 case P_LRECNO: 213 recno -= total; 214 215 /* Correct from 1-based to 0-based for a page offset. */ 216 --recno; 217 BT_STK_ENTER(cp, h, recno, lock, ret); 218 return (ret); 219 case P_IRECNO: 220 for (indx = 0, top = NUM_ENT(h);;) { 221 ri = GET_RINTERNAL(h, indx); 222 if (++indx == top || total + ri->nrecs >= recno) 223 break; 224 total += ri->nrecs; 225 } 226 pg = ri->pgno; 227 break; 228 default: 229 return (__db_pgfmt(dbp, h->pgno)); 230 } 231 --indx; 232 233 if (stack) { 234 /* Return if this is the lowest page wanted. */ 235 if (LF_ISSET(S_PARENT) && stop == h->level) { 236 BT_STK_ENTER(cp, h, indx, lock, ret); 237 return (ret); 238 } 239 BT_STK_PUSH(cp, h, indx, lock, ret); 240 if (ret != 0) 241 goto err; 242 243 if ((ret = 244 __bam_lget(dbc, 0, pg, DB_LOCK_WRITE, &lock)) != 0) 245 goto err; 246 } else { 247 /* 248 * Decide if we want to return a pointer to the next 249 * page in the stack. If we do, write lock it and 250 * never unlock it. 251 */ 252 if ((LF_ISSET(S_PARENT) && 253 (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || 254 (h->level - 1) == LEAFLEVEL) 255 stack = 1; 256 257 (void)memp_fput(dbp->mpf, h, 0); 258 259 if ((ret = 260 __bam_lget(dbc, 1, pg, stack && LF_ISSET(S_WRITE) ? 261 DB_LOCK_WRITE : DB_LOCK_READ, &lock)) != 0) 262 goto err; 263 } 264 265 if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) 266 goto err; 267 } 268 /* NOTREACHED */ 269 270 err: BT_STK_POP(cp); 271 __bam_stkrel(dbc, 0); 272 return (ret); 273 } 274 275 /* 276 * __bam_adjust -- 277 * Adjust the tree after adding or deleting a record. 278 * 279 * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); 280 */ 281 int 282 __bam_adjust(dbc, adjust) 283 DBC *dbc; 284 int32_t adjust; 285 { 286 CURSOR *cp; 287 DB *dbp; 288 EPG *epg; 289 PAGE *h; 290 int ret; 291 292 dbp = dbc->dbp; 293 cp = dbc->internal; 294 295 /* Update the record counts for the tree. */ 296 for (epg = cp->sp; epg <= cp->csp; ++epg) { 297 h = epg->page; 298 if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { 299 if (DB_LOGGING(dbc) && 300 (ret = __bam_cadjust_log(dbp->dbenv->lg_info, 301 dbc->txn, &LSN(h), 0, dbp->log_fileid, 302 PGNO(h), &LSN(h), (u_int32_t)epg->indx, 303 adjust, 1)) != 0) 304 return (ret); 305 306 if (TYPE(h) == P_IBTREE) 307 GET_BINTERNAL(h, epg->indx)->nrecs += adjust; 308 else 309 GET_RINTERNAL(h, epg->indx)->nrecs += adjust; 310 311 if (PGNO(h) == PGNO_ROOT) 312 RE_NREC_ADJ(h, adjust); 313 314 if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) 315 return (ret); 316 } 317 } 318 return (0); 319 } 320 321 /* 322 * __bam_nrecs -- 323 * Return the number of records in the tree. 324 * 325 * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); 326 */ 327 int 328 __bam_nrecs(dbc, rep) 329 DBC *dbc; 330 db_recno_t *rep; 331 { 332 DB *dbp; 333 DB_LOCK lock; 334 PAGE *h; 335 db_pgno_t pgno; 336 int ret; 337 338 dbp = dbc->dbp; 339 340 pgno = PGNO_ROOT; 341 if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &lock)) != 0) 342 return (ret); 343 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) 344 return (ret); 345 346 *rep = RE_NREC(h); 347 348 (void)memp_fput(dbp->mpf, h, 0); 349 (void)__BT_TLPUT(dbc, lock); 350 351 return (0); 352 } 353 354 /* 355 * __bam_total -- 356 * Return the number of records below a page. 357 * 358 * PUBLIC: db_recno_t __bam_total __P((PAGE *)); 359 */ 360 db_recno_t 361 __bam_total(h) 362 PAGE *h; 363 { 364 db_recno_t nrecs; 365 db_indx_t indx, top; 366 367 nrecs = 0; 368 top = NUM_ENT(h); 369 370 switch (TYPE(h)) { 371 case P_LBTREE: 372 /* Check for logically deleted records. */ 373 for (indx = 0; indx < top; indx += P_INDX) 374 if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) 375 ++nrecs; 376 break; 377 case P_IBTREE: 378 for (indx = 0; indx < top; indx += O_INDX) 379 nrecs += GET_BINTERNAL(h, indx)->nrecs; 380 break; 381 case P_LRECNO: 382 nrecs = NUM_ENT(h); 383 break; 384 case P_IRECNO: 385 for (indx = 0; indx < top; indx += O_INDX) 386 nrecs += GET_RINTERNAL(h, indx)->nrecs; 387 break; 388 } 389 390 return (nrecs); 391 } 392