1 /*
2 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
5
6 /*
7 * Copyright (C) 2002 by the Massachusetts Institute of Technology.
8 * All rights reserved.
9 *
10 * Export of this software from the United States of America may
11 * require a specific license from the United States Government.
12 * It is the responsibility of any person or organization contemplating
13 * export to obtain such a license before exporting.
14 *
15 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
16 * distribute this software and its documentation for any purpose and
17 * without fee is hereby granted, provided that the above copyright
18 * notice appear in all copies and that both that copyright notice and
19 * this permission notice appear in supporting documentation, and that
20 * the name of M.I.T. not be used in advertising or publicity pertaining
21 * to distribution of the software without specific, written prior
22 * permission. Furthermore if you modify this software you must label
23 * your software as modified software and not distribute it in such a
24 * fashion that it might be confused with the original M.I.T. software.
25 * M.I.T. makes no representations about the suitability of
26 * this software for any purpose. It is provided "as is" without express
27 * or implied warranty.
28 */
29
30 /*-
31 * Copyright (c) 1990, 1993, 1994
32 * The Regents of the University of California. All rights reserved.
33 *
34 * This code is derived from software contributed to Berkeley by
35 * Mike Olson.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 * notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 * notice, this list of conditions and the following disclaimer in the
44 * documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 * must display the following acknowledgement:
47 * This product includes software developed by the University of
48 * California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 * may be used to endorse or promote products derived from this software
51 * without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 */
65
66 #if defined(LIBC_SCCS) && !defined(lint)
67 static char sccsid[] = "@(#)bt_seq.c 8.9 (Berkeley) 6/20/95";
68 #endif /* LIBC_SCCS and not lint */
69
70 #include <sys/types.h>
71
72 #include <errno.h>
73 #include <stddef.h>
74 #include <stdio.h>
75 #include <stdlib.h>
76 #include <string.h>
77
78 #include "db-int.h"
79 #include "btree.h"
80
81 static int __bt_first __P((BTREE *, const DBT *, EPG *, int *));
82 static int __bt_seqadv __P((BTREE *, EPG *, int));
83 static int __bt_seqset __P((BTREE *, EPG *, DBT *, int));
84
85 /*
86 * Sequential scan support.
87 *
88 * The tree can be scanned sequentially, starting from either end of the
89 * tree or from any specific key. A scan request before any scanning is
90 * done is initialized as starting from the least node.
91 */
92
93 /*
94 * __bt_seq --
95 * Btree sequential scan interface.
96 *
97 * Parameters:
98 * dbp: pointer to access method
99 * key: key for positioning and return value
100 * data: data return value
101 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
102 *
103 * Returns:
104 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
105 */
106 int
__bt_seq(dbp,key,data,flags)107 __bt_seq(dbp, key, data, flags)
108 const DB *dbp;
109 DBT *key, *data;
110 u_int flags;
111 {
112 BTREE *t;
113 EPG e;
114 int status;
115
116 t = dbp->internal;
117
118 /* Toss any page pinned across calls. */
119 if (t->bt_pinned != NULL) {
120 mpool_put(t->bt_mp, t->bt_pinned, 0);
121 t->bt_pinned = NULL;
122 }
123
124 /*
125 * If scan unitialized as yet, or starting at a specific record, set
126 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin
127 * the page the cursor references if they're successful.
128 */
129 switch (flags) {
130 case R_NEXT:
131 case R_PREV:
132 if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
133 status = __bt_seqadv(t, &e, flags);
134 break;
135 }
136 /* FALLTHROUGH */
137 case R_FIRST:
138 case R_LAST:
139 case R_CURSOR:
140 status = __bt_seqset(t, &e, key, flags);
141 break;
142 default:
143 errno = EINVAL;
144 return (RET_ERROR);
145 }
146
147 if (status == RET_SUCCESS) {
148 __bt_setcur(t, e.page->pgno, e.index);
149
150 status =
151 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
152
153 /*
154 * If the user is doing concurrent access, we copied the
155 * key/data, toss the page.
156 */
157 if (F_ISSET(t, B_DB_LOCK))
158 mpool_put(t->bt_mp, e.page, 0);
159 else
160 t->bt_pinned = e.page;
161 }
162 return (status);
163 }
164
165 /*
166 * __bt_seqset --
167 * Set the sequential scan to a specific key.
168 *
169 * Parameters:
170 * t: tree
171 * ep: storage for returned key
172 * key: key for initial scan position
173 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
174 *
175 * Side effects:
176 * Pins the page the cursor references.
177 *
178 * Returns:
179 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
180 */
181 static int
__bt_seqset(t,ep,key,flags)182 __bt_seqset(t, ep, key, flags)
183 BTREE *t;
184 EPG *ep;
185 DBT *key;
186 int flags;
187 {
188 PAGE *h;
189 db_pgno_t pg;
190 int exact;
191
192 /*
193 * Find the first, last or specific key in the tree and point the
194 * cursor at it. The cursor may not be moved until a new key has
195 * been found.
196 */
197 switch (flags) {
198 case R_CURSOR: /* Keyed scan. */
199 /*
200 * Find the first instance of the key or the smallest key
201 * which is greater than or equal to the specified key.
202 */
203 if (key->data == NULL || key->size == 0) {
204 errno = EINVAL;
205 return (RET_ERROR);
206 }
207 return (__bt_first(t, key, ep, &exact));
208 case R_FIRST: /* First record. */
209 case R_NEXT:
210 /* Walk down the left-hand side of the tree. */
211 for (pg = P_ROOT;;) {
212 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
213 return (RET_ERROR);
214
215 /* Check for an empty tree. */
216 if (NEXTINDEX(h) == 0) {
217 mpool_put(t->bt_mp, h, 0);
218 return (RET_SPECIAL);
219 }
220
221 if (h->flags & (P_BLEAF | P_RLEAF))
222 break;
223 pg = GETBINTERNAL(h, 0)->pgno;
224 mpool_put(t->bt_mp, h, 0);
225 }
226 ep->page = h;
227 ep->index = 0;
228 break;
229 case R_LAST: /* Last record. */
230 case R_PREV:
231 /* Walk down the right-hand side of the tree. */
232 for (pg = P_ROOT;;) {
233 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
234 return (RET_ERROR);
235
236 /* Check for an empty tree. */
237 if (NEXTINDEX(h) == 0) {
238 mpool_put(t->bt_mp, h, 0);
239 return (RET_SPECIAL);
240 }
241
242 if (h->flags & (P_BLEAF | P_RLEAF))
243 break;
244 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
245 mpool_put(t->bt_mp, h, 0);
246 }
247
248 ep->page = h;
249 ep->index = NEXTINDEX(h) - 1;
250 break;
251 }
252 return (RET_SUCCESS);
253 }
254
255 /*
256 * __bt_seqadvance --
257 * Advance the sequential scan.
258 *
259 * Parameters:
260 * t: tree
261 * flags: R_NEXT, R_PREV
262 *
263 * Side effects:
264 * Pins the page the new key/data record is on.
265 *
266 * Returns:
267 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
268 */
269 static int
__bt_seqadv(t,ep,flags)270 __bt_seqadv(t, ep, flags)
271 BTREE *t;
272 EPG *ep;
273 int flags;
274 {
275 CURSOR *c;
276 PAGE *h;
277 indx_t idx;
278 db_pgno_t pg;
279 int exact, rval;
280
281 /*
282 * There are a couple of states that we can be in. The cursor has
283 * been initialized by the time we get here, but that's all we know.
284 */
285 c = &t->bt_cursor;
286
287 /*
288 * The cursor was deleted and there weren't any duplicate records,
289 * so the cursor's key was saved. Find out where that key would
290 * be in the current tree. If the returned key is an exact match,
291 * it means that a key/data pair was inserted into the tree after
292 * the delete. We could reasonably return the key, but the problem
293 * is that this is the access pattern we'll see if the user is
294 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
295 * the cursor record and then replaces it, so the cursor was saved,
296 * and we'll simply return the same "new" record until the user
297 * notices and doesn't do a put() of it. Since the key is an exact
298 * match, we could as easily put the new record before the cursor,
299 * and we've made no guarantee to return it. So, move forward or
300 * back a record if it's an exact match.
301 *
302 * XXX
303 * In the current implementation, put's to the cursor are done with
304 * delete/add pairs. This has two consequences. First, it means
305 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
306 * the same behavior as above. Second, you can return the same key
307 * twice if you have duplicate records. The scenario is that the
308 * cursor record is deleted, moving the cursor forward or backward
309 * to a duplicate. The add then inserts the new record at a location
310 * ahead of the cursor because duplicates aren't sorted in any way,
311 * and the new record is later returned. This has to be fixed at some
312 * point.
313 */
314 if (F_ISSET(c, CURS_ACQUIRE)) {
315 if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
316 return (RET_ERROR);
317 if (!exact)
318 return (rval);
319 /*
320 * XXX
321 * Kluge -- get, release, get the page.
322 */
323 c->pg.pgno = ep->page->pgno;
324 c->pg.index = ep->index;
325 mpool_put(t->bt_mp, ep->page, 0);
326 }
327
328 /* Get the page referenced by the cursor. */
329 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
330 return (RET_ERROR);
331
332 /*
333 * Find the next/previous record in the tree and point the cursor at
334 * it. The cursor may not be moved until a new key has been found.
335 */
336 switch (flags) {
337 case R_NEXT: /* Next record. */
338 /*
339 * The cursor was deleted in duplicate records, and moved
340 * forward to a record that has yet to be returned. Clear
341 * that flag, and return the record.
342 */
343 if (F_ISSET(c, CURS_AFTER))
344 goto usecurrent;
345 idx = c->pg.index;
346 if (++idx == NEXTINDEX(h)) {
347 pg = h->nextpg;
348 mpool_put(t->bt_mp, h, 0);
349 if (pg == P_INVALID)
350 return (RET_SPECIAL);
351 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
352 return (RET_ERROR);
353 idx = 0;
354 }
355 break;
356 case R_PREV: /* Previous record. */
357 /*
358 * The cursor was deleted in duplicate records, and moved
359 * backward to a record that has yet to be returned. Clear
360 * that flag, and return the record.
361 */
362 if (F_ISSET(c, CURS_BEFORE)) {
363 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE);
364 ep->page = h;
365 ep->index = c->pg.index;
366 return (RET_SUCCESS);
367 }
368 idx = c->pg.index;
369 if (idx == 0) {
370 pg = h->prevpg;
371 mpool_put(t->bt_mp, h, 0);
372 if (pg == P_INVALID)
373 return (RET_SPECIAL);
374 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
375 return (RET_ERROR);
376 idx = NEXTINDEX(h) - 1;
377 } else
378 --idx;
379 break;
380 }
381
382 ep->page = h;
383 ep->index = idx;
384 return (RET_SUCCESS);
385 }
386
387 /*
388 * __bt_first --
389 * Find the first entry.
390 *
391 * Parameters:
392 * t: the tree
393 * key: the key
394 * erval: return EPG
395 * exactp: pointer to exact match flag
396 *
397 * Returns:
398 * The first entry in the tree greater than or equal to key,
399 * or RET_SPECIAL if no such key exists.
400 */
401 static int
__bt_first(t,key,erval,exactp)402 __bt_first(t, key, erval, exactp)
403 BTREE *t;
404 const DBT *key;
405 EPG *erval;
406 int *exactp;
407 {
408 PAGE *h;
409 EPG *ep, save;
410 db_pgno_t pg;
411
412 /*
413 * Find any matching record; __bt_search pins the page.
414 *
415 * If it's an exact match and duplicates are possible, walk backwards
416 * in the tree until we find the first one. Otherwise, make sure it's
417 * a valid key (__bt_search may return an index just past the end of a
418 * page) and return it.
419 */
420 if ((ep = __bt_search(t, key, exactp)) == NULL)
421 return (RET_SPECIAL);
422 if (*exactp) {
423 if (F_ISSET(t, B_NODUPS)) {
424 *erval = *ep;
425 return (RET_SUCCESS);
426 }
427
428 /*
429 * Walk backwards, as long as the entry matches and there are
430 * keys left in the tree. Save a copy of each match in case
431 * we go too far.
432 */
433 save = *ep;
434 h = ep->page;
435 do {
436 if (save.page->pgno != ep->page->pgno) {
437 mpool_put(t->bt_mp, save.page, 0);
438 save = *ep;
439 } else
440 save.index = ep->index;
441
442 /*
443 * Don't unpin the page the last (or original) match
444 * was on, but make sure it's unpinned if an error
445 * occurs.
446 */
447 if (ep->index == 0) {
448 if (h->prevpg == P_INVALID)
449 break;
450 if (h->pgno != save.page->pgno)
451 mpool_put(t->bt_mp, h, 0);
452 if ((h = mpool_get(t->bt_mp,
453 h->prevpg, 0)) == NULL) {
454 if (h->pgno == save.page->pgno)
455 mpool_put(t->bt_mp,
456 save.page, 0);
457 return (RET_ERROR);
458 }
459 ep->page = h;
460 ep->index = NEXTINDEX(h);
461 }
462 --ep->index;
463 } while (__bt_cmp(t, key, ep) == 0);
464
465 /*
466 * Reach here with the last page that was looked at pinned,
467 * which may or may not be the same as the last (or original)
468 * match page. If it's not useful, release it.
469 */
470 if (h->pgno != save.page->pgno)
471 mpool_put(t->bt_mp, h, 0);
472
473 *erval = save;
474 return (RET_SUCCESS);
475 }
476
477 /* If at the end of a page, find the next entry. */
478 if (ep->index == NEXTINDEX(ep->page)) {
479 h = ep->page;
480 pg = h->nextpg;
481 mpool_put(t->bt_mp, h, 0);
482 if (pg == P_INVALID)
483 return (RET_SPECIAL);
484 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
485 return (RET_ERROR);
486 ep->index = 0;
487 ep->page = h;
488 }
489 *erval = *ep;
490 return (RET_SUCCESS);
491 }
492
493 /*
494 * __bt_setcur --
495 * Set the cursor to an entry in the tree.
496 *
497 * Parameters:
498 * t: the tree
499 * pgno: page number
500 * index: page index
501 */
502 void
__bt_setcur(t,pgno,idx)503 __bt_setcur(t, pgno, idx)
504 BTREE *t;
505 db_pgno_t pgno;
506 u_int idx;
507 {
508 /* Lose any already deleted key. */
509 if (t->bt_cursor.key.data != NULL) {
510 free(t->bt_cursor.key.data);
511 t->bt_cursor.key.size = 0;
512 t->bt_cursor.key.data = NULL;
513 }
514 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
515
516 /* Update the cursor. */
517 t->bt_cursor.pg.pgno = pgno;
518 t->bt_cursor.pg.index = idx;
519 F_SET(&t->bt_cursor, CURS_INIT);
520 }
521
522 /* Recursive descent cursor. */
523 typedef struct rcursor_ {
524 CURSOR cursor;
525 size_t ssize;
526 EPGNO *stack;
527 EPGNO *sp;
528 } RCURSOR;
529 #define RCURSOR_MINSS 64
530
531 static int bt_rcinit(void **);
532 static void bt_rcdestroy(void **);
533 static int bt_rcpush(RCURSOR *, db_pgno_t, u_int);
534 static EPGNO *bt_rcpop(RCURSOR *);
535 static void bt_rcclr(RCURSOR *);
536 static int bt_rcgrowstk(RCURSOR *);
537 static int bt_rseqset(BTREE *, EPG *, DBT *, RCURSOR *, int);
538 static int bt_rseqadv(BTREE *, EPG *, RCURSOR *, int);
539
540 static int
bt_rcinit(curs)541 bt_rcinit(curs)
542 void **curs;
543 {
544 RCURSOR *rc;
545
546 rc = *curs = malloc(sizeof(RCURSOR));
547 if (rc == NULL) {
548 errno = ENOMEM;
549 return RET_ERROR;
550 }
551 memset(rc, 0, sizeof(*rc));
552
553 rc->ssize = RCURSOR_MINSS;
554 rc->stack = malloc(rc->ssize * sizeof(EPGNO));
555 if (rc->stack == NULL) {
556 free(rc);
557 errno = ENOMEM;
558 return RET_ERROR;
559 }
560 bt_rcclr(rc);
561 return RET_SUCCESS;
562 }
563
564 static void
bt_rcdestroy(curs)565 bt_rcdestroy(curs)
566 void **curs;
567 {
568 RCURSOR *rc;
569
570 rc = *curs;
571 free(rc->stack);
572 free(rc);
573 *curs = NULL;
574 }
575
576 static int
bt_rcpush(rc,p,i)577 bt_rcpush(rc, p, i)
578 RCURSOR *rc;
579 db_pgno_t p;
580 u_int i;
581 {
582 int status;
583
584 rc->sp->pgno = p;
585 rc->sp->index = i;
586 if (++rc->sp > rc->stack + rc->ssize) {
587 status = bt_rcgrowstk(rc);
588 if (status != RET_SUCCESS)
589 return status;
590 }
591 return RET_SUCCESS;
592 }
593
594 static EPGNO *
bt_rcpop(rc)595 bt_rcpop(rc)
596 RCURSOR *rc;
597 {
598 return (rc->sp == rc->stack) ? NULL : --rc->sp;
599 }
600
601 static void
bt_rcclr(rc)602 bt_rcclr(rc)
603 RCURSOR *rc;
604 {
605 rc->sp = rc->stack;
606 }
607
608 static int
bt_rcgrowstk(rc)609 bt_rcgrowstk(rc)
610 RCURSOR *rc;
611 {
612 size_t osize;
613 EPGNO *e;
614
615 osize = rc->ssize;
616 rc->ssize *= 2;
617 e = realloc(rc->stack, rc->ssize * sizeof(EPGNO));
618 if (e == NULL) {
619 rc->ssize = osize;
620 errno = ENOMEM;
621 return RET_ERROR;
622 }
623 rc->stack = e;
624 return RET_SUCCESS;
625 }
626
627 /*
628 * bt_rseq --
629 * Like __bt_seq but does recursive descent tree traversal
630 * instead of using the prev/next pointers.
631 */
632 int
bt_rseq(dbp,key,data,curs,flags)633 bt_rseq(dbp, key, data, curs, flags)
634 const DB *dbp;
635 DBT *key, *data;
636 void **curs;
637 u_int flags;
638 {
639 RCURSOR *rc;
640 BTREE *t;
641 EPG e;
642 int status;
643
644 t = dbp->internal;
645
646 /* Toss any page pinned across calls. */
647 if (t->bt_pinned != NULL) {
648 mpool_put(t->bt_mp, t->bt_pinned, 0);
649 t->bt_pinned = NULL;
650 }
651
652 if (curs == NULL) {
653 errno = EINVAL;
654 return RET_ERROR;
655 }
656 if (*curs == NULL) {
657 status = bt_rcinit(curs);
658 if (status != RET_SUCCESS)
659 return RET_ERROR;
660 }
661 rc = *curs;
662
663 /*
664 * If scan unitialized as yet, or starting at a specific record, set
665 * the scan to a specific key. Both bt_rseqset and bt_rseqadv pin
666 * the page the cursor references if they're successful.
667 */
668 switch (flags) {
669 case R_NEXT:
670 case R_PREV:
671 if (F_ISSET(&rc->cursor, CURS_INIT)) {
672 status = bt_rseqadv(t, &e, rc, flags);
673 break;
674 }
675 /* FALLTHROUGH */
676 case R_FIRST:
677 case R_LAST:
678 case R_CURSOR:
679 status = bt_rseqset(t, &e, key, rc, flags);
680 break;
681 default:
682 errno = EINVAL;
683 return (RET_ERROR);
684 }
685
686 if (status == RET_SUCCESS) {
687 status =
688 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
689
690 /*
691 * If the user is doing concurrent access, we copied the
692 * key/data, toss the page.
693 */
694 if (F_ISSET(t, B_DB_LOCK))
695 mpool_put(t->bt_mp, e.page, 0);
696 else
697 t->bt_pinned = e.page;
698 } else if (status == RET_SPECIAL)
699 bt_rcdestroy(curs);
700 return (status);
701 }
702
703 /*
704 * bt_rseqset --
705 * Set the sequential scan to a specific key.
706 *
707 * Parameters:
708 * t: tree
709 * ep: storage for returned key
710 * key: key for initial scan position
711 * rc: recursion cursor
712 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
713 *
714 * Side effects:
715 * Pins the page the cursor references.
716 * Updates rc's stack and cursor.
717 *
718 * Returns:
719 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
720 */
721 static int
bt_rseqset(t,ep,key,rc,flags)722 bt_rseqset(t, ep, key, rc, flags)
723 BTREE *t;
724 EPG *ep;
725 DBT *key;
726 RCURSOR *rc;
727 int flags;
728 {
729 PAGE *h;
730 db_pgno_t pg;
731 int status;
732
733 /*
734 * Find the first, last or specific key in the tree and point the
735 * cursor at it. The cursor may not be moved until a new key has
736 * been found.
737 */
738 switch (flags) {
739 case R_CURSOR: /* Not implemented. */
740 errno = EINVAL;
741 return RET_ERROR;
742 case R_FIRST: /* First record. */
743 case R_NEXT:
744 bt_rcclr(rc);
745 /* Walk down the left-hand side of the tree. */
746 for (pg = P_ROOT;;) {
747 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
748 return (RET_ERROR);
749
750 /* Check for an empty tree. */
751 if (NEXTINDEX(h) == 0) {
752 mpool_put(t->bt_mp, h, 0);
753 return (RET_SPECIAL);
754 }
755
756 if (h->flags & (P_BLEAF | P_RLEAF))
757 break;
758 pg = GETBINTERNAL(h, 0)->pgno;
759 status = bt_rcpush(rc, h->pgno, 0);
760 mpool_put(t->bt_mp, h, 0);
761 if (status != RET_SUCCESS)
762 return status;
763 }
764 ep->page = h;
765 ep->index = 0;
766 break;
767 case R_LAST: /* Last record. */
768 case R_PREV:
769 bt_rcclr(rc);
770 /* Walk down the right-hand side of the tree. */
771 for (pg = P_ROOT;;) {
772 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
773 return (RET_ERROR);
774
775 /* Check for an empty tree. */
776 if (NEXTINDEX(h) == 0) {
777 mpool_put(t->bt_mp, h, 0);
778 return (RET_SPECIAL);
779 }
780
781 if (h->flags & (P_BLEAF | P_RLEAF))
782 break;
783 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
784 status = bt_rcpush(rc, h->pgno, NEXTINDEX(h) - 1);
785 mpool_put(t->bt_mp, h, 0);
786 if (status != RET_SUCCESS)
787 return status;
788 }
789 ep->page = h;
790 ep->index = NEXTINDEX(h) - 1;
791 break;
792 }
793 rc->cursor.pg.pgno = ep->page->pgno;
794 rc->cursor.pg.index = ep->index;
795 F_CLR(&rc->cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
796 F_SET(&rc->cursor, CURS_INIT);
797 return (RET_SUCCESS);
798 }
799
800 /*
801 * bt_rseqadvance --
802 * Advance the sequential scan.
803 *
804 * Parameters:
805 * t: tree
806 * ep: return page
807 * rc: recursion cursor
808 * flags: R_NEXT, R_PREV
809 *
810 * Side effects:
811 * Pins the page the new key/data record is on.
812 * Updates rc's stack and cursor.
813 *
814 * Returns:
815 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
816 */
817 static int
bt_rseqadv(t,ep,rc,flags)818 bt_rseqadv(t, ep, rc, flags)
819 BTREE *t;
820 EPG *ep;
821 RCURSOR *rc;
822 int flags;
823 {
824 CURSOR *c;
825 PAGE *h;
826 indx_t idx;
827 db_pgno_t pg;
828 int status;
829 EPGNO *e;
830
831 /*
832 * There are a couple of states that we can be in. The cursor has
833 * been initialized by the time we get here, but that's all we know.
834 */
835 c = &rc->cursor;
836
837 /* Get the page referenced by the cursor. */
838 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
839 return (RET_ERROR);
840
841 /*
842 * Find the next/previous record in the tree and point the cursor at
843 * it. The cursor may not be moved until a new key has been found.
844 */
845 switch (flags) {
846 case R_NEXT: /* Next record. */
847 idx = c->pg.index;
848 while (++idx == NEXTINDEX(h)) {
849 /* Crawl up if we hit the right edge. */
850 e = bt_rcpop(rc);
851 mpool_put(t->bt_mp, h, 0);
852 if (e == NULL) /* Hit the right edge of root. */
853 return RET_SPECIAL;
854 idx = e->index;
855 pg = e->pgno;
856 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
857 return (RET_ERROR);
858 }
859 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
860 /* Crawl down the left until we hit a leaf. */
861 status = bt_rcpush(rc, h->pgno, idx);
862 pg = GETBINTERNAL(h, idx)->pgno;
863 mpool_put(t->bt_mp, h, 0);
864 if (status != RET_SUCCESS)
865 return status;
866 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
867 return (RET_ERROR);
868 idx = 0;
869 }
870 break;
871 case R_PREV: /* Previous record. */
872 idx = c->pg.index;
873 while (!idx) {
874 /* Crawl up if we hit the left edge. */
875 e = bt_rcpop(rc);
876 mpool_put(t->bt_mp, h, 0);
877 if (e == NULL) /* Hit the left edge of root. */
878 return RET_SPECIAL;
879 idx = e->index;
880 pg = e->pgno;
881 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
882 return (RET_ERROR);
883 }
884 idx--;
885 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
886 /* Crawl down the right until we hit a leaf. */
887 status = bt_rcpush(rc, h->pgno, idx);
888 pg = GETBINTERNAL(h, idx)->pgno;
889 mpool_put(t->bt_mp, h, 0);
890 if (status != RET_SUCCESS)
891 return status;
892 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
893 return (RET_ERROR);
894 idx = NEXTINDEX(h) - 1;
895 }
896 break;
897 }
898
899 ep->page = h;
900 ep->index = idx;
901 c->pg.pgno = h->pgno;
902 c->pg.index = idx;
903 F_CLR(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
904 F_SET(c, CURS_INIT);
905 return (RET_SUCCESS);
906 }
907