1 /* 2 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 #pragma ident "%Z%%M% %I% %E% SMI" 7 8 /* 9 * Copyright (C) 2002 by the Massachusetts Institute of Technology. 10 * All rights reserved. 11 * 12 * Export of this software from the United States of America may 13 * require a specific license from the United States Government. 14 * It is the responsibility of any person or organization contemplating 15 * export to obtain such a license before exporting. 16 * 17 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and 18 * distribute this software and its documentation for any purpose and 19 * without fee is hereby granted, provided that the above copyright 20 * notice appear in all copies and that both that copyright notice and 21 * this permission notice appear in supporting documentation, and that 22 * the name of M.I.T. not be used in advertising or publicity pertaining 23 * to distribution of the software without specific, written prior 24 * permission. Furthermore if you modify this software you must label 25 * your software as modified software and not distribute it in such a 26 * fashion that it might be confused with the original M.I.T. software. 27 * M.I.T. makes no representations about the suitability of 28 * this software for any purpose. It is provided "as is" without express 29 * or implied warranty. 30 */ 31 32 /*- 33 * Copyright (c) 1990, 1993, 1994 34 * The Regents of the University of California. All rights reserved. 35 * 36 * This code is derived from software contributed to Berkeley by 37 * Mike Olson. 38 * 39 * Redistribution and use in source and binary forms, with or without 40 * modification, are permitted provided that the following conditions 41 * are met: 42 * 1. Redistributions of source code must retain the above copyright 43 * notice, this list of conditions and the following disclaimer. 44 * 2. Redistributions in binary form must reproduce the above copyright 45 * notice, this list of conditions and the following disclaimer in the 46 * documentation and/or other materials provided with the distribution. 47 * 3. All advertising materials mentioning features or use of this software 48 * must display the following acknowledgement: 49 * This product includes software developed by the University of 50 * California, Berkeley and its contributors. 51 * 4. Neither the name of the University nor the names of its contributors 52 * may be used to endorse or promote products derived from this software 53 * without specific prior written permission. 54 * 55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 65 * SUCH DAMAGE. 66 */ 67 68 #if defined(LIBC_SCCS) && !defined(lint) 69 static char sccsid[] = "@(#)bt_seq.c 8.9 (Berkeley) 6/20/95"; 70 #endif /* LIBC_SCCS and not lint */ 71 72 #include <sys/types.h> 73 74 #include <errno.h> 75 #include <stddef.h> 76 #include <stdio.h> 77 #include <stdlib.h> 78 #include <string.h> 79 80 #include "db-int.h" 81 #include "btree.h" 82 83 static int __bt_first __P((BTREE *, const DBT *, EPG *, int *)); 84 static int __bt_seqadv __P((BTREE *, EPG *, int)); 85 static int __bt_seqset __P((BTREE *, EPG *, DBT *, int)); 86 87 /* 88 * Sequential scan support. 89 * 90 * The tree can be scanned sequentially, starting from either end of the 91 * tree or from any specific key. A scan request before any scanning is 92 * done is initialized as starting from the least node. 93 */ 94 95 /* 96 * __bt_seq -- 97 * Btree sequential scan interface. 98 * 99 * Parameters: 100 * dbp: pointer to access method 101 * key: key for positioning and return value 102 * data: data return value 103 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. 104 * 105 * Returns: 106 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 107 */ 108 int 109 __bt_seq(dbp, key, data, flags) 110 const DB *dbp; 111 DBT *key, *data; 112 u_int flags; 113 { 114 BTREE *t; 115 EPG e; 116 int status; 117 118 t = dbp->internal; 119 120 /* Toss any page pinned across calls. */ 121 if (t->bt_pinned != NULL) { 122 mpool_put(t->bt_mp, t->bt_pinned, 0); 123 t->bt_pinned = NULL; 124 } 125 126 /* 127 * If scan unitialized as yet, or starting at a specific record, set 128 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin 129 * the page the cursor references if they're successful. 130 */ 131 switch (flags) { 132 case R_NEXT: 133 case R_PREV: 134 if (F_ISSET(&t->bt_cursor, CURS_INIT)) { 135 status = __bt_seqadv(t, &e, flags); 136 break; 137 } 138 /* FALLTHROUGH */ 139 case R_FIRST: 140 case R_LAST: 141 case R_CURSOR: 142 status = __bt_seqset(t, &e, key, flags); 143 break; 144 default: 145 errno = EINVAL; 146 return (RET_ERROR); 147 } 148 149 if (status == RET_SUCCESS) { 150 __bt_setcur(t, e.page->pgno, e.index); 151 152 status = 153 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); 154 155 /* 156 * If the user is doing concurrent access, we copied the 157 * key/data, toss the page. 158 */ 159 if (F_ISSET(t, B_DB_LOCK)) 160 mpool_put(t->bt_mp, e.page, 0); 161 else 162 t->bt_pinned = e.page; 163 } 164 return (status); 165 } 166 167 /* 168 * __bt_seqset -- 169 * Set the sequential scan to a specific key. 170 * 171 * Parameters: 172 * t: tree 173 * ep: storage for returned key 174 * key: key for initial scan position 175 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV 176 * 177 * Side effects: 178 * Pins the page the cursor references. 179 * 180 * Returns: 181 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 182 */ 183 static int 184 __bt_seqset(t, ep, key, flags) 185 BTREE *t; 186 EPG *ep; 187 DBT *key; 188 int flags; 189 { 190 PAGE *h; 191 db_pgno_t pg; 192 int exact; 193 194 /* 195 * Find the first, last or specific key in the tree and point the 196 * cursor at it. The cursor may not be moved until a new key has 197 * been found. 198 */ 199 switch (flags) { 200 case R_CURSOR: /* Keyed scan. */ 201 /* 202 * Find the first instance of the key or the smallest key 203 * which is greater than or equal to the specified key. 204 */ 205 if (key->data == NULL || key->size == 0) { 206 errno = EINVAL; 207 return (RET_ERROR); 208 } 209 return (__bt_first(t, key, ep, &exact)); 210 case R_FIRST: /* First record. */ 211 case R_NEXT: 212 /* Walk down the left-hand side of the tree. */ 213 for (pg = P_ROOT;;) { 214 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 215 return (RET_ERROR); 216 217 /* Check for an empty tree. */ 218 if (NEXTINDEX(h) == 0) { 219 mpool_put(t->bt_mp, h, 0); 220 return (RET_SPECIAL); 221 } 222 223 if (h->flags & (P_BLEAF | P_RLEAF)) 224 break; 225 pg = GETBINTERNAL(h, 0)->pgno; 226 mpool_put(t->bt_mp, h, 0); 227 } 228 ep->page = h; 229 ep->index = 0; 230 break; 231 case R_LAST: /* Last record. */ 232 case R_PREV: 233 /* Walk down the right-hand side of the tree. */ 234 for (pg = P_ROOT;;) { 235 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 236 return (RET_ERROR); 237 238 /* Check for an empty tree. */ 239 if (NEXTINDEX(h) == 0) { 240 mpool_put(t->bt_mp, h, 0); 241 return (RET_SPECIAL); 242 } 243 244 if (h->flags & (P_BLEAF | P_RLEAF)) 245 break; 246 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; 247 mpool_put(t->bt_mp, h, 0); 248 } 249 250 ep->page = h; 251 ep->index = NEXTINDEX(h) - 1; 252 break; 253 } 254 return (RET_SUCCESS); 255 } 256 257 /* 258 * __bt_seqadvance -- 259 * Advance the sequential scan. 260 * 261 * Parameters: 262 * t: tree 263 * flags: R_NEXT, R_PREV 264 * 265 * Side effects: 266 * Pins the page the new key/data record is on. 267 * 268 * Returns: 269 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 270 */ 271 static int 272 __bt_seqadv(t, ep, flags) 273 BTREE *t; 274 EPG *ep; 275 int flags; 276 { 277 CURSOR *c; 278 PAGE *h; 279 indx_t idx; 280 db_pgno_t pg; 281 int exact, rval; 282 283 /* 284 * There are a couple of states that we can be in. The cursor has 285 * been initialized by the time we get here, but that's all we know. 286 */ 287 c = &t->bt_cursor; 288 289 /* 290 * The cursor was deleted and there weren't any duplicate records, 291 * so the cursor's key was saved. Find out where that key would 292 * be in the current tree. If the returned key is an exact match, 293 * it means that a key/data pair was inserted into the tree after 294 * the delete. We could reasonably return the key, but the problem 295 * is that this is the access pattern we'll see if the user is 296 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes 297 * the cursor record and then replaces it, so the cursor was saved, 298 * and we'll simply return the same "new" record until the user 299 * notices and doesn't do a put() of it. Since the key is an exact 300 * match, we could as easily put the new record before the cursor, 301 * and we've made no guarantee to return it. So, move forward or 302 * back a record if it's an exact match. 303 * 304 * XXX 305 * In the current implementation, put's to the cursor are done with 306 * delete/add pairs. This has two consequences. First, it means 307 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit 308 * the same behavior as above. Second, you can return the same key 309 * twice if you have duplicate records. The scenario is that the 310 * cursor record is deleted, moving the cursor forward or backward 311 * to a duplicate. The add then inserts the new record at a location 312 * ahead of the cursor because duplicates aren't sorted in any way, 313 * and the new record is later returned. This has to be fixed at some 314 * point. 315 */ 316 if (F_ISSET(c, CURS_ACQUIRE)) { 317 if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR) 318 return (RET_ERROR); 319 if (!exact) 320 return (rval); 321 /* 322 * XXX 323 * Kluge -- get, release, get the page. 324 */ 325 c->pg.pgno = ep->page->pgno; 326 c->pg.index = ep->index; 327 mpool_put(t->bt_mp, ep->page, 0); 328 } 329 330 /* Get the page referenced by the cursor. */ 331 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 332 return (RET_ERROR); 333 334 /* 335 * Find the next/previous record in the tree and point the cursor at 336 * it. The cursor may not be moved until a new key has been found. 337 */ 338 switch (flags) { 339 case R_NEXT: /* Next record. */ 340 /* 341 * The cursor was deleted in duplicate records, and moved 342 * forward to a record that has yet to be returned. Clear 343 * that flag, and return the record. 344 */ 345 if (F_ISSET(c, CURS_AFTER)) 346 goto usecurrent; 347 idx = c->pg.index; 348 if (++idx == NEXTINDEX(h)) { 349 pg = h->nextpg; 350 mpool_put(t->bt_mp, h, 0); 351 if (pg == P_INVALID) 352 return (RET_SPECIAL); 353 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 354 return (RET_ERROR); 355 idx = 0; 356 } 357 break; 358 case R_PREV: /* Previous record. */ 359 /* 360 * The cursor was deleted in duplicate records, and moved 361 * backward to a record that has yet to be returned. Clear 362 * that flag, and return the record. 363 */ 364 if (F_ISSET(c, CURS_BEFORE)) { 365 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); 366 ep->page = h; 367 ep->index = c->pg.index; 368 return (RET_SUCCESS); 369 } 370 idx = c->pg.index; 371 if (idx == 0) { 372 pg = h->prevpg; 373 mpool_put(t->bt_mp, h, 0); 374 if (pg == P_INVALID) 375 return (RET_SPECIAL); 376 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 377 return (RET_ERROR); 378 idx = NEXTINDEX(h) - 1; 379 } else 380 --idx; 381 break; 382 } 383 384 ep->page = h; 385 ep->index = idx; 386 return (RET_SUCCESS); 387 } 388 389 /* 390 * __bt_first -- 391 * Find the first entry. 392 * 393 * Parameters: 394 * t: the tree 395 * key: the key 396 * erval: return EPG 397 * exactp: pointer to exact match flag 398 * 399 * Returns: 400 * The first entry in the tree greater than or equal to key, 401 * or RET_SPECIAL if no such key exists. 402 */ 403 static int 404 __bt_first(t, key, erval, exactp) 405 BTREE *t; 406 const DBT *key; 407 EPG *erval; 408 int *exactp; 409 { 410 PAGE *h; 411 EPG *ep, save; 412 db_pgno_t pg; 413 414 /* 415 * Find any matching record; __bt_search pins the page. 416 * 417 * If it's an exact match and duplicates are possible, walk backwards 418 * in the tree until we find the first one. Otherwise, make sure it's 419 * a valid key (__bt_search may return an index just past the end of a 420 * page) and return it. 421 */ 422 if ((ep = __bt_search(t, key, exactp)) == NULL) 423 return (RET_SPECIAL); 424 if (*exactp) { 425 if (F_ISSET(t, B_NODUPS)) { 426 *erval = *ep; 427 return (RET_SUCCESS); 428 } 429 430 /* 431 * Walk backwards, as long as the entry matches and there are 432 * keys left in the tree. Save a copy of each match in case 433 * we go too far. 434 */ 435 save = *ep; 436 h = ep->page; 437 do { 438 if (save.page->pgno != ep->page->pgno) { 439 mpool_put(t->bt_mp, save.page, 0); 440 save = *ep; 441 } else 442 save.index = ep->index; 443 444 /* 445 * Don't unpin the page the last (or original) match 446 * was on, but make sure it's unpinned if an error 447 * occurs. 448 */ 449 if (ep->index == 0) { 450 if (h->prevpg == P_INVALID) 451 break; 452 if (h->pgno != save.page->pgno) 453 mpool_put(t->bt_mp, h, 0); 454 if ((h = mpool_get(t->bt_mp, 455 h->prevpg, 0)) == NULL) { 456 if (h->pgno == save.page->pgno) 457 mpool_put(t->bt_mp, 458 save.page, 0); 459 return (RET_ERROR); 460 } 461 ep->page = h; 462 ep->index = NEXTINDEX(h); 463 } 464 --ep->index; 465 } while (__bt_cmp(t, key, ep) == 0); 466 467 /* 468 * Reach here with the last page that was looked at pinned, 469 * which may or may not be the same as the last (or original) 470 * match page. If it's not useful, release it. 471 */ 472 if (h->pgno != save.page->pgno) 473 mpool_put(t->bt_mp, h, 0); 474 475 *erval = save; 476 return (RET_SUCCESS); 477 } 478 479 /* If at the end of a page, find the next entry. */ 480 if (ep->index == NEXTINDEX(ep->page)) { 481 h = ep->page; 482 pg = h->nextpg; 483 mpool_put(t->bt_mp, h, 0); 484 if (pg == P_INVALID) 485 return (RET_SPECIAL); 486 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 487 return (RET_ERROR); 488 ep->index = 0; 489 ep->page = h; 490 } 491 *erval = *ep; 492 return (RET_SUCCESS); 493 } 494 495 /* 496 * __bt_setcur -- 497 * Set the cursor to an entry in the tree. 498 * 499 * Parameters: 500 * t: the tree 501 * pgno: page number 502 * index: page index 503 */ 504 void 505 __bt_setcur(t, pgno, idx) 506 BTREE *t; 507 db_pgno_t pgno; 508 u_int idx; 509 { 510 /* Lose any already deleted key. */ 511 if (t->bt_cursor.key.data != NULL) { 512 free(t->bt_cursor.key.data); 513 t->bt_cursor.key.size = 0; 514 t->bt_cursor.key.data = NULL; 515 } 516 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 517 518 /* Update the cursor. */ 519 t->bt_cursor.pg.pgno = pgno; 520 t->bt_cursor.pg.index = idx; 521 F_SET(&t->bt_cursor, CURS_INIT); 522 } 523 524 /* Recursive descent cursor. */ 525 typedef struct rcursor_ { 526 CURSOR cursor; 527 size_t ssize; 528 EPGNO *stack; 529 EPGNO *sp; 530 } RCURSOR; 531 #define RCURSOR_MINSS 64 532 533 static int bt_rcinit(void **); 534 static void bt_rcdestroy(void **); 535 static int bt_rcpush(RCURSOR *, db_pgno_t, u_int); 536 static EPGNO *bt_rcpop(RCURSOR *); 537 static void bt_rcclr(RCURSOR *); 538 static int bt_rcgrowstk(RCURSOR *); 539 static int bt_rseqset(BTREE *, EPG *, DBT *, RCURSOR *, int); 540 static int bt_rseqadv(BTREE *, EPG *, RCURSOR *, int); 541 542 static int 543 bt_rcinit(curs) 544 void **curs; 545 { 546 RCURSOR *rc; 547 548 rc = *curs = malloc(sizeof(RCURSOR)); 549 if (rc == NULL) { 550 errno = ENOMEM; 551 return RET_ERROR; 552 } 553 memset(rc, 0, sizeof(*rc)); 554 555 rc->ssize = RCURSOR_MINSS; 556 rc->stack = malloc(rc->ssize * sizeof(EPGNO)); 557 if (rc->stack == NULL) { 558 free(rc); 559 errno = ENOMEM; 560 return RET_ERROR; 561 } 562 bt_rcclr(rc); 563 return RET_SUCCESS; 564 } 565 566 static void 567 bt_rcdestroy(curs) 568 void **curs; 569 { 570 RCURSOR *rc; 571 572 rc = *curs; 573 free(rc->stack); 574 free(rc); 575 *curs = NULL; 576 } 577 578 static int 579 bt_rcpush(rc, p, i) 580 RCURSOR *rc; 581 db_pgno_t p; 582 u_int i; 583 { 584 int status; 585 586 rc->sp->pgno = p; 587 rc->sp->index = i; 588 if (++rc->sp > rc->stack + rc->ssize) { 589 status = bt_rcgrowstk(rc); 590 if (status != RET_SUCCESS) 591 return status; 592 } 593 return RET_SUCCESS; 594 } 595 596 static EPGNO * 597 bt_rcpop(rc) 598 RCURSOR *rc; 599 { 600 return (rc->sp == rc->stack) ? NULL : --rc->sp; 601 } 602 603 static void 604 bt_rcclr(rc) 605 RCURSOR *rc; 606 { 607 rc->sp = rc->stack; 608 } 609 610 static int 611 bt_rcgrowstk(rc) 612 RCURSOR *rc; 613 { 614 size_t osize; 615 EPGNO *e; 616 617 osize = rc->ssize; 618 rc->ssize *= 2; 619 e = realloc(rc->stack, rc->ssize * sizeof(EPGNO)); 620 if (e == NULL) { 621 rc->ssize = osize; 622 errno = ENOMEM; 623 return RET_ERROR; 624 } 625 rc->stack = e; 626 return RET_SUCCESS; 627 } 628 629 /* 630 * bt_rseq -- 631 * Like __bt_seq but does recursive descent tree traversal 632 * instead of using the prev/next pointers. 633 */ 634 int 635 bt_rseq(dbp, key, data, curs, flags) 636 const DB *dbp; 637 DBT *key, *data; 638 void **curs; 639 u_int flags; 640 { 641 RCURSOR *rc; 642 BTREE *t; 643 EPG e; 644 int status; 645 646 t = dbp->internal; 647 648 /* Toss any page pinned across calls. */ 649 if (t->bt_pinned != NULL) { 650 mpool_put(t->bt_mp, t->bt_pinned, 0); 651 t->bt_pinned = NULL; 652 } 653 654 if (curs == NULL) { 655 errno = EINVAL; 656 return RET_ERROR; 657 } 658 if (*curs == NULL) { 659 status = bt_rcinit(curs); 660 if (status != RET_SUCCESS) 661 return RET_ERROR; 662 } 663 rc = *curs; 664 665 /* 666 * If scan unitialized as yet, or starting at a specific record, set 667 * the scan to a specific key. Both bt_rseqset and bt_rseqadv pin 668 * the page the cursor references if they're successful. 669 */ 670 switch (flags) { 671 case R_NEXT: 672 case R_PREV: 673 if (F_ISSET(&rc->cursor, CURS_INIT)) { 674 status = bt_rseqadv(t, &e, rc, flags); 675 break; 676 } 677 /* FALLTHROUGH */ 678 case R_FIRST: 679 case R_LAST: 680 case R_CURSOR: 681 status = bt_rseqset(t, &e, key, rc, flags); 682 break; 683 default: 684 errno = EINVAL; 685 return (RET_ERROR); 686 } 687 688 if (status == RET_SUCCESS) { 689 status = 690 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); 691 692 /* 693 * If the user is doing concurrent access, we copied the 694 * key/data, toss the page. 695 */ 696 if (F_ISSET(t, B_DB_LOCK)) 697 mpool_put(t->bt_mp, e.page, 0); 698 else 699 t->bt_pinned = e.page; 700 } else if (status == RET_SPECIAL) 701 bt_rcdestroy(curs); 702 return (status); 703 } 704 705 /* 706 * bt_rseqset -- 707 * Set the sequential scan to a specific key. 708 * 709 * Parameters: 710 * t: tree 711 * ep: storage for returned key 712 * key: key for initial scan position 713 * rc: recursion cursor 714 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV 715 * 716 * Side effects: 717 * Pins the page the cursor references. 718 * Updates rc's stack and cursor. 719 * 720 * Returns: 721 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 722 */ 723 static int 724 bt_rseqset(t, ep, key, rc, flags) 725 BTREE *t; 726 EPG *ep; 727 DBT *key; 728 RCURSOR *rc; 729 int flags; 730 { 731 PAGE *h; 732 db_pgno_t pg; 733 int status; 734 735 /* 736 * Find the first, last or specific key in the tree and point the 737 * cursor at it. The cursor may not be moved until a new key has 738 * been found. 739 */ 740 switch (flags) { 741 case R_CURSOR: /* Not implemented. */ 742 errno = EINVAL; 743 return RET_ERROR; 744 case R_FIRST: /* First record. */ 745 case R_NEXT: 746 bt_rcclr(rc); 747 /* Walk down the left-hand side of the tree. */ 748 for (pg = P_ROOT;;) { 749 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 750 return (RET_ERROR); 751 752 /* Check for an empty tree. */ 753 if (NEXTINDEX(h) == 0) { 754 mpool_put(t->bt_mp, h, 0); 755 return (RET_SPECIAL); 756 } 757 758 if (h->flags & (P_BLEAF | P_RLEAF)) 759 break; 760 pg = GETBINTERNAL(h, 0)->pgno; 761 status = bt_rcpush(rc, h->pgno, 0); 762 mpool_put(t->bt_mp, h, 0); 763 if (status != RET_SUCCESS) 764 return status; 765 } 766 ep->page = h; 767 ep->index = 0; 768 break; 769 case R_LAST: /* Last record. */ 770 case R_PREV: 771 bt_rcclr(rc); 772 /* Walk down the right-hand side of the tree. */ 773 for (pg = P_ROOT;;) { 774 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 775 return (RET_ERROR); 776 777 /* Check for an empty tree. */ 778 if (NEXTINDEX(h) == 0) { 779 mpool_put(t->bt_mp, h, 0); 780 return (RET_SPECIAL); 781 } 782 783 if (h->flags & (P_BLEAF | P_RLEAF)) 784 break; 785 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; 786 status = bt_rcpush(rc, h->pgno, NEXTINDEX(h) - 1); 787 mpool_put(t->bt_mp, h, 0); 788 if (status != RET_SUCCESS) 789 return status; 790 } 791 ep->page = h; 792 ep->index = NEXTINDEX(h) - 1; 793 break; 794 } 795 rc->cursor.pg.pgno = ep->page->pgno; 796 rc->cursor.pg.index = ep->index; 797 F_CLR(&rc->cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 798 F_SET(&rc->cursor, CURS_INIT); 799 return (RET_SUCCESS); 800 } 801 802 /* 803 * bt_rseqadvance -- 804 * Advance the sequential scan. 805 * 806 * Parameters: 807 * t: tree 808 * ep: return page 809 * rc: recursion cursor 810 * flags: R_NEXT, R_PREV 811 * 812 * Side effects: 813 * Pins the page the new key/data record is on. 814 * Updates rc's stack and cursor. 815 * 816 * Returns: 817 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 818 */ 819 static int 820 bt_rseqadv(t, ep, rc, flags) 821 BTREE *t; 822 EPG *ep; 823 RCURSOR *rc; 824 int flags; 825 { 826 CURSOR *c; 827 PAGE *h; 828 indx_t idx; 829 db_pgno_t pg; 830 int status; 831 EPGNO *e; 832 833 /* 834 * There are a couple of states that we can be in. The cursor has 835 * been initialized by the time we get here, but that's all we know. 836 */ 837 c = &rc->cursor; 838 839 /* Get the page referenced by the cursor. */ 840 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 841 return (RET_ERROR); 842 843 /* 844 * Find the next/previous record in the tree and point the cursor at 845 * it. The cursor may not be moved until a new key has been found. 846 */ 847 switch (flags) { 848 case R_NEXT: /* Next record. */ 849 idx = c->pg.index; 850 while (++idx == NEXTINDEX(h)) { 851 /* Crawl up if we hit the right edge. */ 852 e = bt_rcpop(rc); 853 mpool_put(t->bt_mp, h, 0); 854 if (e == NULL) /* Hit the right edge of root. */ 855 return RET_SPECIAL; 856 idx = e->index; 857 pg = e->pgno; 858 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 859 return (RET_ERROR); 860 } 861 while (!(h->flags & (P_BLEAF | P_RLEAF))) { 862 /* Crawl down the left until we hit a leaf. */ 863 status = bt_rcpush(rc, h->pgno, idx); 864 pg = GETBINTERNAL(h, idx)->pgno; 865 mpool_put(t->bt_mp, h, 0); 866 if (status != RET_SUCCESS) 867 return status; 868 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 869 return (RET_ERROR); 870 idx = 0; 871 } 872 break; 873 case R_PREV: /* Previous record. */ 874 idx = c->pg.index; 875 while (!idx) { 876 /* Crawl up if we hit the left edge. */ 877 e = bt_rcpop(rc); 878 mpool_put(t->bt_mp, h, 0); 879 if (e == NULL) /* Hit the left edge of root. */ 880 return RET_SPECIAL; 881 idx = e->index; 882 pg = e->pgno; 883 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 884 return (RET_ERROR); 885 } 886 idx--; 887 while (!(h->flags & (P_BLEAF | P_RLEAF))) { 888 /* Crawl down the right until we hit a leaf. */ 889 status = bt_rcpush(rc, h->pgno, idx); 890 pg = GETBINTERNAL(h, idx)->pgno; 891 mpool_put(t->bt_mp, h, 0); 892 if (status != RET_SUCCESS) 893 return status; 894 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 895 return (RET_ERROR); 896 idx = NEXTINDEX(h) - 1; 897 } 898 break; 899 } 900 901 ep->page = h; 902 ep->index = idx; 903 c->pg.pgno = h->pgno; 904 c->pg.index = idx; 905 F_CLR(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 906 F_SET(c, CURS_INIT); 907 return (RET_SUCCESS); 908 } 909