1 /*
2 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
3 * Use is subject to license terms.
4 */
5
6 #pragma ident "%Z%%M% %I% %E% SMI"
7
8 /*
9 * Copyright (C) 2002 by the Massachusetts Institute of Technology.
10 * All rights reserved.
11 *
12 * Export of this software from the United States of America may
13 * require a specific license from the United States Government.
14 * It is the responsibility of any person or organization contemplating
15 * export to obtain such a license before exporting.
16 *
17 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
18 * distribute this software and its documentation for any purpose and
19 * without fee is hereby granted, provided that the above copyright
20 * notice appear in all copies and that both that copyright notice and
21 * this permission notice appear in supporting documentation, and that
22 * the name of M.I.T. not be used in advertising or publicity pertaining
23 * to distribution of the software without specific, written prior
24 * permission. Furthermore if you modify this software you must label
25 * your software as modified software and not distribute it in such a
26 * fashion that it might be confused with the original M.I.T. software.
27 * M.I.T. makes no representations about the suitability of
28 * this software for any purpose. It is provided "as is" without express
29 * or implied warranty.
30 */
31
32 /*-
33 * Copyright (c) 1990, 1993, 1994
34 * The Regents of the University of California. All rights reserved.
35 *
36 * This code is derived from software contributed to Berkeley by
37 * Mike Olson.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 * notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 * notice, this list of conditions and the following disclaimer in the
46 * documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 * must display the following acknowledgement:
49 * This product includes software developed by the University of
50 * California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 * may be used to endorse or promote products derived from this software
53 * without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 */
67
68 #if defined(LIBC_SCCS) && !defined(lint)
69 static char sccsid[] = "@(#)bt_seq.c 8.9 (Berkeley) 6/20/95";
70 #endif /* LIBC_SCCS and not lint */
71
72 #include <sys/types.h>
73
74 #include <errno.h>
75 #include <stddef.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
79
80 #include "db-int.h"
81 #include "btree.h"
82
83 static int __bt_first __P((BTREE *, const DBT *, EPG *, int *));
84 static int __bt_seqadv __P((BTREE *, EPG *, int));
85 static int __bt_seqset __P((BTREE *, EPG *, DBT *, int));
86
87 /*
88 * Sequential scan support.
89 *
90 * The tree can be scanned sequentially, starting from either end of the
91 * tree or from any specific key. A scan request before any scanning is
92 * done is initialized as starting from the least node.
93 */
94
95 /*
96 * __bt_seq --
97 * Btree sequential scan interface.
98 *
99 * Parameters:
100 * dbp: pointer to access method
101 * key: key for positioning and return value
102 * data: data return value
103 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
104 *
105 * Returns:
106 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
107 */
108 int
__bt_seq(dbp,key,data,flags)109 __bt_seq(dbp, key, data, flags)
110 const DB *dbp;
111 DBT *key, *data;
112 u_int flags;
113 {
114 BTREE *t;
115 EPG e;
116 int status;
117
118 t = dbp->internal;
119
120 /* Toss any page pinned across calls. */
121 if (t->bt_pinned != NULL) {
122 mpool_put(t->bt_mp, t->bt_pinned, 0);
123 t->bt_pinned = NULL;
124 }
125
126 /*
127 * If scan unitialized as yet, or starting at a specific record, set
128 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin
129 * the page the cursor references if they're successful.
130 */
131 switch (flags) {
132 case R_NEXT:
133 case R_PREV:
134 if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
135 status = __bt_seqadv(t, &e, flags);
136 break;
137 }
138 /* FALLTHROUGH */
139 case R_FIRST:
140 case R_LAST:
141 case R_CURSOR:
142 status = __bt_seqset(t, &e, key, flags);
143 break;
144 default:
145 errno = EINVAL;
146 return (RET_ERROR);
147 }
148
149 if (status == RET_SUCCESS) {
150 __bt_setcur(t, e.page->pgno, e.index);
151
152 status =
153 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
154
155 /*
156 * If the user is doing concurrent access, we copied the
157 * key/data, toss the page.
158 */
159 if (F_ISSET(t, B_DB_LOCK))
160 mpool_put(t->bt_mp, e.page, 0);
161 else
162 t->bt_pinned = e.page;
163 }
164 return (status);
165 }
166
167 /*
168 * __bt_seqset --
169 * Set the sequential scan to a specific key.
170 *
171 * Parameters:
172 * t: tree
173 * ep: storage for returned key
174 * key: key for initial scan position
175 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
176 *
177 * Side effects:
178 * Pins the page the cursor references.
179 *
180 * Returns:
181 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
182 */
183 static int
__bt_seqset(t,ep,key,flags)184 __bt_seqset(t, ep, key, flags)
185 BTREE *t;
186 EPG *ep;
187 DBT *key;
188 int flags;
189 {
190 PAGE *h;
191 db_pgno_t pg;
192 int exact;
193
194 /*
195 * Find the first, last or specific key in the tree and point the
196 * cursor at it. The cursor may not be moved until a new key has
197 * been found.
198 */
199 switch (flags) {
200 case R_CURSOR: /* Keyed scan. */
201 /*
202 * Find the first instance of the key or the smallest key
203 * which is greater than or equal to the specified key.
204 */
205 if (key->data == NULL || key->size == 0) {
206 errno = EINVAL;
207 return (RET_ERROR);
208 }
209 return (__bt_first(t, key, ep, &exact));
210 case R_FIRST: /* First record. */
211 case R_NEXT:
212 /* Walk down the left-hand side of the tree. */
213 for (pg = P_ROOT;;) {
214 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
215 return (RET_ERROR);
216
217 /* Check for an empty tree. */
218 if (NEXTINDEX(h) == 0) {
219 mpool_put(t->bt_mp, h, 0);
220 return (RET_SPECIAL);
221 }
222
223 if (h->flags & (P_BLEAF | P_RLEAF))
224 break;
225 pg = GETBINTERNAL(h, 0)->pgno;
226 mpool_put(t->bt_mp, h, 0);
227 }
228 ep->page = h;
229 ep->index = 0;
230 break;
231 case R_LAST: /* Last record. */
232 case R_PREV:
233 /* Walk down the right-hand side of the tree. */
234 for (pg = P_ROOT;;) {
235 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
236 return (RET_ERROR);
237
238 /* Check for an empty tree. */
239 if (NEXTINDEX(h) == 0) {
240 mpool_put(t->bt_mp, h, 0);
241 return (RET_SPECIAL);
242 }
243
244 if (h->flags & (P_BLEAF | P_RLEAF))
245 break;
246 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
247 mpool_put(t->bt_mp, h, 0);
248 }
249
250 ep->page = h;
251 ep->index = NEXTINDEX(h) - 1;
252 break;
253 }
254 return (RET_SUCCESS);
255 }
256
257 /*
258 * __bt_seqadvance --
259 * Advance the sequential scan.
260 *
261 * Parameters:
262 * t: tree
263 * flags: R_NEXT, R_PREV
264 *
265 * Side effects:
266 * Pins the page the new key/data record is on.
267 *
268 * Returns:
269 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
270 */
271 static int
__bt_seqadv(t,ep,flags)272 __bt_seqadv(t, ep, flags)
273 BTREE *t;
274 EPG *ep;
275 int flags;
276 {
277 CURSOR *c;
278 PAGE *h;
279 indx_t idx;
280 db_pgno_t pg;
281 int exact, rval;
282
283 /*
284 * There are a couple of states that we can be in. The cursor has
285 * been initialized by the time we get here, but that's all we know.
286 */
287 c = &t->bt_cursor;
288
289 /*
290 * The cursor was deleted and there weren't any duplicate records,
291 * so the cursor's key was saved. Find out where that key would
292 * be in the current tree. If the returned key is an exact match,
293 * it means that a key/data pair was inserted into the tree after
294 * the delete. We could reasonably return the key, but the problem
295 * is that this is the access pattern we'll see if the user is
296 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
297 * the cursor record and then replaces it, so the cursor was saved,
298 * and we'll simply return the same "new" record until the user
299 * notices and doesn't do a put() of it. Since the key is an exact
300 * match, we could as easily put the new record before the cursor,
301 * and we've made no guarantee to return it. So, move forward or
302 * back a record if it's an exact match.
303 *
304 * XXX
305 * In the current implementation, put's to the cursor are done with
306 * delete/add pairs. This has two consequences. First, it means
307 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
308 * the same behavior as above. Second, you can return the same key
309 * twice if you have duplicate records. The scenario is that the
310 * cursor record is deleted, moving the cursor forward or backward
311 * to a duplicate. The add then inserts the new record at a location
312 * ahead of the cursor because duplicates aren't sorted in any way,
313 * and the new record is later returned. This has to be fixed at some
314 * point.
315 */
316 if (F_ISSET(c, CURS_ACQUIRE)) {
317 if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
318 return (RET_ERROR);
319 if (!exact)
320 return (rval);
321 /*
322 * XXX
323 * Kluge -- get, release, get the page.
324 */
325 c->pg.pgno = ep->page->pgno;
326 c->pg.index = ep->index;
327 mpool_put(t->bt_mp, ep->page, 0);
328 }
329
330 /* Get the page referenced by the cursor. */
331 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
332 return (RET_ERROR);
333
334 /*
335 * Find the next/previous record in the tree and point the cursor at
336 * it. The cursor may not be moved until a new key has been found.
337 */
338 switch (flags) {
339 case R_NEXT: /* Next record. */
340 /*
341 * The cursor was deleted in duplicate records, and moved
342 * forward to a record that has yet to be returned. Clear
343 * that flag, and return the record.
344 */
345 if (F_ISSET(c, CURS_AFTER))
346 goto usecurrent;
347 idx = c->pg.index;
348 if (++idx == NEXTINDEX(h)) {
349 pg = h->nextpg;
350 mpool_put(t->bt_mp, h, 0);
351 if (pg == P_INVALID)
352 return (RET_SPECIAL);
353 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
354 return (RET_ERROR);
355 idx = 0;
356 }
357 break;
358 case R_PREV: /* Previous record. */
359 /*
360 * The cursor was deleted in duplicate records, and moved
361 * backward to a record that has yet to be returned. Clear
362 * that flag, and return the record.
363 */
364 if (F_ISSET(c, CURS_BEFORE)) {
365 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE);
366 ep->page = h;
367 ep->index = c->pg.index;
368 return (RET_SUCCESS);
369 }
370 idx = c->pg.index;
371 if (idx == 0) {
372 pg = h->prevpg;
373 mpool_put(t->bt_mp, h, 0);
374 if (pg == P_INVALID)
375 return (RET_SPECIAL);
376 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
377 return (RET_ERROR);
378 idx = NEXTINDEX(h) - 1;
379 } else
380 --idx;
381 break;
382 }
383
384 ep->page = h;
385 ep->index = idx;
386 return (RET_SUCCESS);
387 }
388
389 /*
390 * __bt_first --
391 * Find the first entry.
392 *
393 * Parameters:
394 * t: the tree
395 * key: the key
396 * erval: return EPG
397 * exactp: pointer to exact match flag
398 *
399 * Returns:
400 * The first entry in the tree greater than or equal to key,
401 * or RET_SPECIAL if no such key exists.
402 */
403 static int
__bt_first(t,key,erval,exactp)404 __bt_first(t, key, erval, exactp)
405 BTREE *t;
406 const DBT *key;
407 EPG *erval;
408 int *exactp;
409 {
410 PAGE *h;
411 EPG *ep, save;
412 db_pgno_t pg;
413
414 /*
415 * Find any matching record; __bt_search pins the page.
416 *
417 * If it's an exact match and duplicates are possible, walk backwards
418 * in the tree until we find the first one. Otherwise, make sure it's
419 * a valid key (__bt_search may return an index just past the end of a
420 * page) and return it.
421 */
422 if ((ep = __bt_search(t, key, exactp)) == NULL)
423 return (RET_SPECIAL);
424 if (*exactp) {
425 if (F_ISSET(t, B_NODUPS)) {
426 *erval = *ep;
427 return (RET_SUCCESS);
428 }
429
430 /*
431 * Walk backwards, as long as the entry matches and there are
432 * keys left in the tree. Save a copy of each match in case
433 * we go too far.
434 */
435 save = *ep;
436 h = ep->page;
437 do {
438 if (save.page->pgno != ep->page->pgno) {
439 mpool_put(t->bt_mp, save.page, 0);
440 save = *ep;
441 } else
442 save.index = ep->index;
443
444 /*
445 * Don't unpin the page the last (or original) match
446 * was on, but make sure it's unpinned if an error
447 * occurs.
448 */
449 if (ep->index == 0) {
450 if (h->prevpg == P_INVALID)
451 break;
452 if (h->pgno != save.page->pgno)
453 mpool_put(t->bt_mp, h, 0);
454 if ((h = mpool_get(t->bt_mp,
455 h->prevpg, 0)) == NULL) {
456 if (h->pgno == save.page->pgno)
457 mpool_put(t->bt_mp,
458 save.page, 0);
459 return (RET_ERROR);
460 }
461 ep->page = h;
462 ep->index = NEXTINDEX(h);
463 }
464 --ep->index;
465 } while (__bt_cmp(t, key, ep) == 0);
466
467 /*
468 * Reach here with the last page that was looked at pinned,
469 * which may or may not be the same as the last (or original)
470 * match page. If it's not useful, release it.
471 */
472 if (h->pgno != save.page->pgno)
473 mpool_put(t->bt_mp, h, 0);
474
475 *erval = save;
476 return (RET_SUCCESS);
477 }
478
479 /* If at the end of a page, find the next entry. */
480 if (ep->index == NEXTINDEX(ep->page)) {
481 h = ep->page;
482 pg = h->nextpg;
483 mpool_put(t->bt_mp, h, 0);
484 if (pg == P_INVALID)
485 return (RET_SPECIAL);
486 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
487 return (RET_ERROR);
488 ep->index = 0;
489 ep->page = h;
490 }
491 *erval = *ep;
492 return (RET_SUCCESS);
493 }
494
495 /*
496 * __bt_setcur --
497 * Set the cursor to an entry in the tree.
498 *
499 * Parameters:
500 * t: the tree
501 * pgno: page number
502 * index: page index
503 */
504 void
__bt_setcur(t,pgno,idx)505 __bt_setcur(t, pgno, idx)
506 BTREE *t;
507 db_pgno_t pgno;
508 u_int idx;
509 {
510 /* Lose any already deleted key. */
511 if (t->bt_cursor.key.data != NULL) {
512 free(t->bt_cursor.key.data);
513 t->bt_cursor.key.size = 0;
514 t->bt_cursor.key.data = NULL;
515 }
516 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
517
518 /* Update the cursor. */
519 t->bt_cursor.pg.pgno = pgno;
520 t->bt_cursor.pg.index = idx;
521 F_SET(&t->bt_cursor, CURS_INIT);
522 }
523
524 /* Recursive descent cursor. */
525 typedef struct rcursor_ {
526 CURSOR cursor;
527 size_t ssize;
528 EPGNO *stack;
529 EPGNO *sp;
530 } RCURSOR;
531 #define RCURSOR_MINSS 64
532
533 static int bt_rcinit(void **);
534 static void bt_rcdestroy(void **);
535 static int bt_rcpush(RCURSOR *, db_pgno_t, u_int);
536 static EPGNO *bt_rcpop(RCURSOR *);
537 static void bt_rcclr(RCURSOR *);
538 static int bt_rcgrowstk(RCURSOR *);
539 static int bt_rseqset(BTREE *, EPG *, DBT *, RCURSOR *, int);
540 static int bt_rseqadv(BTREE *, EPG *, RCURSOR *, int);
541
542 static int
bt_rcinit(curs)543 bt_rcinit(curs)
544 void **curs;
545 {
546 RCURSOR *rc;
547
548 rc = *curs = malloc(sizeof(RCURSOR));
549 if (rc == NULL) {
550 errno = ENOMEM;
551 return RET_ERROR;
552 }
553 memset(rc, 0, sizeof(*rc));
554
555 rc->ssize = RCURSOR_MINSS;
556 rc->stack = malloc(rc->ssize * sizeof(EPGNO));
557 if (rc->stack == NULL) {
558 free(rc);
559 errno = ENOMEM;
560 return RET_ERROR;
561 }
562 bt_rcclr(rc);
563 return RET_SUCCESS;
564 }
565
566 static void
bt_rcdestroy(curs)567 bt_rcdestroy(curs)
568 void **curs;
569 {
570 RCURSOR *rc;
571
572 rc = *curs;
573 free(rc->stack);
574 free(rc);
575 *curs = NULL;
576 }
577
578 static int
bt_rcpush(rc,p,i)579 bt_rcpush(rc, p, i)
580 RCURSOR *rc;
581 db_pgno_t p;
582 u_int i;
583 {
584 int status;
585
586 rc->sp->pgno = p;
587 rc->sp->index = i;
588 if (++rc->sp > rc->stack + rc->ssize) {
589 status = bt_rcgrowstk(rc);
590 if (status != RET_SUCCESS)
591 return status;
592 }
593 return RET_SUCCESS;
594 }
595
596 static EPGNO *
bt_rcpop(rc)597 bt_rcpop(rc)
598 RCURSOR *rc;
599 {
600 return (rc->sp == rc->stack) ? NULL : --rc->sp;
601 }
602
603 static void
bt_rcclr(rc)604 bt_rcclr(rc)
605 RCURSOR *rc;
606 {
607 rc->sp = rc->stack;
608 }
609
610 static int
bt_rcgrowstk(rc)611 bt_rcgrowstk(rc)
612 RCURSOR *rc;
613 {
614 size_t osize;
615 EPGNO *e;
616
617 osize = rc->ssize;
618 rc->ssize *= 2;
619 e = realloc(rc->stack, rc->ssize * sizeof(EPGNO));
620 if (e == NULL) {
621 rc->ssize = osize;
622 errno = ENOMEM;
623 return RET_ERROR;
624 }
625 rc->stack = e;
626 return RET_SUCCESS;
627 }
628
629 /*
630 * bt_rseq --
631 * Like __bt_seq but does recursive descent tree traversal
632 * instead of using the prev/next pointers.
633 */
634 int
bt_rseq(dbp,key,data,curs,flags)635 bt_rseq(dbp, key, data, curs, flags)
636 const DB *dbp;
637 DBT *key, *data;
638 void **curs;
639 u_int flags;
640 {
641 RCURSOR *rc;
642 BTREE *t;
643 EPG e;
644 int status;
645
646 t = dbp->internal;
647
648 /* Toss any page pinned across calls. */
649 if (t->bt_pinned != NULL) {
650 mpool_put(t->bt_mp, t->bt_pinned, 0);
651 t->bt_pinned = NULL;
652 }
653
654 if (curs == NULL) {
655 errno = EINVAL;
656 return RET_ERROR;
657 }
658 if (*curs == NULL) {
659 status = bt_rcinit(curs);
660 if (status != RET_SUCCESS)
661 return RET_ERROR;
662 }
663 rc = *curs;
664
665 /*
666 * If scan unitialized as yet, or starting at a specific record, set
667 * the scan to a specific key. Both bt_rseqset and bt_rseqadv pin
668 * the page the cursor references if they're successful.
669 */
670 switch (flags) {
671 case R_NEXT:
672 case R_PREV:
673 if (F_ISSET(&rc->cursor, CURS_INIT)) {
674 status = bt_rseqadv(t, &e, rc, flags);
675 break;
676 }
677 /* FALLTHROUGH */
678 case R_FIRST:
679 case R_LAST:
680 case R_CURSOR:
681 status = bt_rseqset(t, &e, key, rc, flags);
682 break;
683 default:
684 errno = EINVAL;
685 return (RET_ERROR);
686 }
687
688 if (status == RET_SUCCESS) {
689 status =
690 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
691
692 /*
693 * If the user is doing concurrent access, we copied the
694 * key/data, toss the page.
695 */
696 if (F_ISSET(t, B_DB_LOCK))
697 mpool_put(t->bt_mp, e.page, 0);
698 else
699 t->bt_pinned = e.page;
700 } else if (status == RET_SPECIAL)
701 bt_rcdestroy(curs);
702 return (status);
703 }
704
705 /*
706 * bt_rseqset --
707 * Set the sequential scan to a specific key.
708 *
709 * Parameters:
710 * t: tree
711 * ep: storage for returned key
712 * key: key for initial scan position
713 * rc: recursion cursor
714 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
715 *
716 * Side effects:
717 * Pins the page the cursor references.
718 * Updates rc's stack and cursor.
719 *
720 * Returns:
721 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
722 */
723 static int
bt_rseqset(t,ep,key,rc,flags)724 bt_rseqset(t, ep, key, rc, flags)
725 BTREE *t;
726 EPG *ep;
727 DBT *key;
728 RCURSOR *rc;
729 int flags;
730 {
731 PAGE *h;
732 db_pgno_t pg;
733 int status;
734
735 /*
736 * Find the first, last or specific key in the tree and point the
737 * cursor at it. The cursor may not be moved until a new key has
738 * been found.
739 */
740 switch (flags) {
741 case R_CURSOR: /* Not implemented. */
742 errno = EINVAL;
743 return RET_ERROR;
744 case R_FIRST: /* First record. */
745 case R_NEXT:
746 bt_rcclr(rc);
747 /* Walk down the left-hand side of the tree. */
748 for (pg = P_ROOT;;) {
749 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
750 return (RET_ERROR);
751
752 /* Check for an empty tree. */
753 if (NEXTINDEX(h) == 0) {
754 mpool_put(t->bt_mp, h, 0);
755 return (RET_SPECIAL);
756 }
757
758 if (h->flags & (P_BLEAF | P_RLEAF))
759 break;
760 pg = GETBINTERNAL(h, 0)->pgno;
761 status = bt_rcpush(rc, h->pgno, 0);
762 mpool_put(t->bt_mp, h, 0);
763 if (status != RET_SUCCESS)
764 return status;
765 }
766 ep->page = h;
767 ep->index = 0;
768 break;
769 case R_LAST: /* Last record. */
770 case R_PREV:
771 bt_rcclr(rc);
772 /* Walk down the right-hand side of the tree. */
773 for (pg = P_ROOT;;) {
774 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
775 return (RET_ERROR);
776
777 /* Check for an empty tree. */
778 if (NEXTINDEX(h) == 0) {
779 mpool_put(t->bt_mp, h, 0);
780 return (RET_SPECIAL);
781 }
782
783 if (h->flags & (P_BLEAF | P_RLEAF))
784 break;
785 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
786 status = bt_rcpush(rc, h->pgno, NEXTINDEX(h) - 1);
787 mpool_put(t->bt_mp, h, 0);
788 if (status != RET_SUCCESS)
789 return status;
790 }
791 ep->page = h;
792 ep->index = NEXTINDEX(h) - 1;
793 break;
794 }
795 rc->cursor.pg.pgno = ep->page->pgno;
796 rc->cursor.pg.index = ep->index;
797 F_CLR(&rc->cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
798 F_SET(&rc->cursor, CURS_INIT);
799 return (RET_SUCCESS);
800 }
801
802 /*
803 * bt_rseqadvance --
804 * Advance the sequential scan.
805 *
806 * Parameters:
807 * t: tree
808 * ep: return page
809 * rc: recursion cursor
810 * flags: R_NEXT, R_PREV
811 *
812 * Side effects:
813 * Pins the page the new key/data record is on.
814 * Updates rc's stack and cursor.
815 *
816 * Returns:
817 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
818 */
819 static int
bt_rseqadv(t,ep,rc,flags)820 bt_rseqadv(t, ep, rc, flags)
821 BTREE *t;
822 EPG *ep;
823 RCURSOR *rc;
824 int flags;
825 {
826 CURSOR *c;
827 PAGE *h;
828 indx_t idx;
829 db_pgno_t pg;
830 int status;
831 EPGNO *e;
832
833 /*
834 * There are a couple of states that we can be in. The cursor has
835 * been initialized by the time we get here, but that's all we know.
836 */
837 c = &rc->cursor;
838
839 /* Get the page referenced by the cursor. */
840 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
841 return (RET_ERROR);
842
843 /*
844 * Find the next/previous record in the tree and point the cursor at
845 * it. The cursor may not be moved until a new key has been found.
846 */
847 switch (flags) {
848 case R_NEXT: /* Next record. */
849 idx = c->pg.index;
850 while (++idx == NEXTINDEX(h)) {
851 /* Crawl up if we hit the right edge. */
852 e = bt_rcpop(rc);
853 mpool_put(t->bt_mp, h, 0);
854 if (e == NULL) /* Hit the right edge of root. */
855 return RET_SPECIAL;
856 idx = e->index;
857 pg = e->pgno;
858 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
859 return (RET_ERROR);
860 }
861 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
862 /* Crawl down the left until we hit a leaf. */
863 status = bt_rcpush(rc, h->pgno, idx);
864 pg = GETBINTERNAL(h, idx)->pgno;
865 mpool_put(t->bt_mp, h, 0);
866 if (status != RET_SUCCESS)
867 return status;
868 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
869 return (RET_ERROR);
870 idx = 0;
871 }
872 break;
873 case R_PREV: /* Previous record. */
874 idx = c->pg.index;
875 while (!idx) {
876 /* Crawl up if we hit the left edge. */
877 e = bt_rcpop(rc);
878 mpool_put(t->bt_mp, h, 0);
879 if (e == NULL) /* Hit the left edge of root. */
880 return RET_SPECIAL;
881 idx = e->index;
882 pg = e->pgno;
883 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
884 return (RET_ERROR);
885 }
886 idx--;
887 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
888 /* Crawl down the right until we hit a leaf. */
889 status = bt_rcpush(rc, h->pgno, idx);
890 pg = GETBINTERNAL(h, idx)->pgno;
891 mpool_put(t->bt_mp, h, 0);
892 if (status != RET_SUCCESS)
893 return status;
894 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
895 return (RET_ERROR);
896 idx = NEXTINDEX(h) - 1;
897 }
898 break;
899 }
900
901 ep->page = h;
902 ep->index = idx;
903 c->pg.pgno = h->pgno;
904 c->pg.index = idx;
905 F_CLR(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
906 F_SET(c, CURS_INIT);
907 return (RET_SUCCESS);
908 }
909