1 /*
2 * Copyright (C) 2002, 2016 by the Massachusetts Institute of Technology.
3 * All rights reserved.
4 *
5 * Export of this software from the United States of America may
6 * require a specific license from the United States Government.
7 * It is the responsibility of any person or organization contemplating
8 * export to obtain such a license before exporting.
9 *
10 * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
11 * distribute this software and its documentation for any purpose and
12 * without fee is hereby granted, provided that the above copyright
13 * notice appear in all copies and that both that copyright notice and
14 * this permission notice appear in supporting documentation, and that
15 * the name of M.I.T. not be used in advertising or publicity pertaining
16 * to distribution of the software without specific, written prior
17 * permission. Furthermore if you modify this software you must label
18 * your software as modified software and not distribute it in such a
19 * fashion that it might be confused with the original M.I.T. software.
20 * M.I.T. makes no representations about the suitability of
21 * this software for any purpose. It is provided "as is" without express
22 * or implied warranty.
23 */
24
25 /*-
26 * Copyright (c) 1990, 1993, 1994
27 * The Regents of the University of California. All rights reserved.
28 *
29 * This code is derived from software contributed to Berkeley by
30 * Mike Olson.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 */
60
61 #if defined(LIBC_SCCS) && !defined(lint)
62 static char sccsid[] = "@(#)bt_seq.c 8.9 (Berkeley) 6/20/95";
63 #endif /* LIBC_SCCS and not lint */
64
65 #include <sys/types.h>
66
67 #include <errno.h>
68 #include <stddef.h>
69 #include <stdio.h>
70 #include <stdlib.h>
71 #include <string.h>
72
73 #include "db-int.h"
74 #include "btree.h"
75
76 static int __bt_first __P((BTREE *, const DBT *, EPG *, int *));
77 static int __bt_seqadv __P((BTREE *, EPG *, int));
78 static int __bt_seqset __P((BTREE *, EPG *, DBT *, int));
79
80 static int bt_rseq_next(BTREE *, EPG *);
81 static int bt_rseq_prev(BTREE *, EPG *);
82
83 /*
84 * Sequential scan support.
85 *
86 * The tree can be scanned sequentially, starting from either end of the
87 * tree or from any specific key. A scan request before any scanning is
88 * done is initialized as starting from the least node.
89 */
90
91 /*
92 * __bt_seq --
93 * Btree sequential scan interface.
94 *
95 * Parameters:
96 * dbp: pointer to access method
97 * key: key for positioning and return value
98 * data: data return value
99 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
100 *
101 * Returns:
102 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
103 */
104 int
__bt_seq(dbp,key,data,flags)105 __bt_seq(dbp, key, data, flags)
106 const DB *dbp;
107 DBT *key, *data;
108 u_int flags;
109 {
110 BTREE *t;
111 EPG e;
112 int status;
113
114 t = dbp->internal;
115
116 /* Toss any page pinned across calls. */
117 if (t->bt_pinned != NULL) {
118 mpool_put(t->bt_mp, t->bt_pinned, 0);
119 t->bt_pinned = NULL;
120 }
121
122 /*
123 * If scan unitialized as yet, or starting at a specific record, set
124 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin
125 * the page the cursor references if they're successful.
126 */
127 switch (flags) {
128 case R_NEXT:
129 case R_PREV:
130 case R_RNEXT:
131 case R_RPREV:
132 if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
133 status = __bt_seqadv(t, &e, flags);
134 break;
135 }
136 /* FALLTHROUGH */
137 case R_FIRST:
138 case R_LAST:
139 case R_CURSOR:
140 status = __bt_seqset(t, &e, key, flags);
141 break;
142 default:
143 errno = EINVAL;
144 return (RET_ERROR);
145 }
146
147 if (status == RET_SUCCESS) {
148 __bt_setcur(t, e.page->pgno, e.index);
149
150 status =
151 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
152
153 /*
154 * If the user is doing concurrent access, we copied the
155 * key/data, toss the page.
156 */
157 if (F_ISSET(t, B_DB_LOCK))
158 mpool_put(t->bt_mp, e.page, 0);
159 else
160 t->bt_pinned = e.page;
161 }
162 return (status);
163 }
164
165 /*
166 * __bt_seqset --
167 * Set the sequential scan to a specific key.
168 *
169 * Parameters:
170 * t: tree
171 * ep: storage for returned key
172 * key: key for initial scan position
173 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
174 *
175 * Side effects:
176 * Pins the page the cursor references.
177 *
178 * Returns:
179 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
180 */
181 static int
__bt_seqset(t,ep,key,flags)182 __bt_seqset(t, ep, key, flags)
183 BTREE *t;
184 EPG *ep;
185 DBT *key;
186 int flags;
187 {
188 PAGE *h;
189 db_pgno_t pg;
190 int exact;
191
192 /*
193 * Find the first, last or specific key in the tree and point the
194 * cursor at it. The cursor may not be moved until a new key has
195 * been found.
196 */
197 switch (flags) {
198 case R_CURSOR: /* Keyed scan. */
199 /*
200 * Find the first instance of the key or the smallest key
201 * which is greater than or equal to the specified key.
202 */
203 if (key->data == NULL || key->size == 0) {
204 errno = EINVAL;
205 return (RET_ERROR);
206 }
207 return (__bt_first(t, key, ep, &exact));
208 case R_FIRST: /* First record. */
209 case R_NEXT:
210 case R_RNEXT:
211 BT_CLR(t);
212 /* Walk down the left-hand side of the tree. */
213 for (pg = P_ROOT;;) {
214 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
215 return (RET_ERROR);
216
217 /* Check for an empty tree. */
218 if (NEXTINDEX(h) == 0) {
219 mpool_put(t->bt_mp, h, 0);
220 return (RET_SPECIAL);
221 }
222
223 if (h->flags & (P_BLEAF | P_RLEAF))
224 break;
225 pg = GETBINTERNAL(h, 0)->pgno;
226 BT_PUSH(t, h->pgno, 0);
227 mpool_put(t->bt_mp, h, 0);
228 }
229 ep->page = h;
230 ep->index = 0;
231 break;
232 case R_LAST: /* Last record. */
233 case R_PREV:
234 case R_RPREV:
235 BT_CLR(t);
236 /* Walk down the right-hand side of the tree. */
237 for (pg = P_ROOT;;) {
238 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
239 return (RET_ERROR);
240
241 /* Check for an empty tree. */
242 if (NEXTINDEX(h) == 0) {
243 mpool_put(t->bt_mp, h, 0);
244 return (RET_SPECIAL);
245 }
246
247 if (h->flags & (P_BLEAF | P_RLEAF))
248 break;
249 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
250 BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1);
251 mpool_put(t->bt_mp, h, 0);
252 }
253
254 ep->page = h;
255 ep->index = NEXTINDEX(h) - 1;
256 break;
257 }
258 return (RET_SUCCESS);
259 }
260
261 /*
262 * __bt_seqadvance --
263 * Advance the sequential scan.
264 *
265 * Parameters:
266 * t: tree
267 * flags: R_NEXT, R_PREV
268 *
269 * Side effects:
270 * Pins the page the new key/data record is on.
271 *
272 * Returns:
273 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
274 */
275 static int
__bt_seqadv(t,ep,flags)276 __bt_seqadv(t, ep, flags)
277 BTREE *t;
278 EPG *ep;
279 int flags;
280 {
281 CURSOR *c;
282 PAGE *h;
283 indx_t idx = 0;
284 db_pgno_t pg;
285 int exact, rval;
286
287 /*
288 * There are a couple of states that we can be in. The cursor has
289 * been initialized by the time we get here, but that's all we know.
290 */
291 c = &t->bt_cursor;
292
293 /*
294 * The cursor was deleted and there weren't any duplicate records,
295 * so the cursor's key was saved. Find out where that key would
296 * be in the current tree. If the returned key is an exact match,
297 * it means that a key/data pair was inserted into the tree after
298 * the delete. We could reasonably return the key, but the problem
299 * is that this is the access pattern we'll see if the user is
300 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
301 * the cursor record and then replaces it, so the cursor was saved,
302 * and we'll simply return the same "new" record until the user
303 * notices and doesn't do a put() of it. Since the key is an exact
304 * match, we could as easily put the new record before the cursor,
305 * and we've made no guarantee to return it. So, move forward or
306 * back a record if it's an exact match.
307 *
308 * XXX
309 * In the current implementation, put's to the cursor are done with
310 * delete/add pairs. This has two consequences. First, it means
311 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
312 * the same behavior as above. Second, you can return the same key
313 * twice if you have duplicate records. The scenario is that the
314 * cursor record is deleted, moving the cursor forward or backward
315 * to a duplicate. The add then inserts the new record at a location
316 * ahead of the cursor because duplicates aren't sorted in any way,
317 * and the new record is later returned. This has to be fixed at some
318 * point.
319 */
320 if (F_ISSET(c, CURS_ACQUIRE)) {
321 if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
322 return (RET_ERROR);
323 if (!exact)
324 return (rval);
325 /*
326 * XXX
327 * Kluge -- get, release, get the page.
328 */
329 c->pg.pgno = ep->page->pgno;
330 c->pg.index = ep->index;
331 mpool_put(t->bt_mp, ep->page, 0);
332 }
333
334 /* Get the page referenced by the cursor. */
335 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
336 return (RET_ERROR);
337
338 /*
339 * Find the next/previous record in the tree and point the cursor at
340 * it. The cursor may not be moved until a new key has been found.
341 */
342 switch (flags) {
343 case R_NEXT: /* Next record. */
344 case R_RNEXT:
345 /*
346 * The cursor was deleted in duplicate records, and moved
347 * forward to a record that has yet to be returned. Clear
348 * that flag, and return the record.
349 */
350 if (F_ISSET(c, CURS_AFTER))
351 goto usecurrent;
352 idx = c->pg.index;
353 if (++idx == NEXTINDEX(h)) {
354 if (flags == R_RNEXT) {
355 ep->page = h;
356 ep->index = idx;
357 return (bt_rseq_next(t, ep));
358 }
359 pg = h->nextpg;
360 mpool_put(t->bt_mp, h, 0);
361 if (pg == P_INVALID)
362 return (RET_SPECIAL);
363 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
364 return (RET_ERROR);
365 idx = 0;
366 }
367 break;
368 case R_PREV: /* Previous record. */
369 case R_RPREV:
370 /*
371 * The cursor was deleted in duplicate records, and moved
372 * backward to a record that has yet to be returned. Clear
373 * that flag, and return the record.
374 */
375 if (F_ISSET(c, CURS_BEFORE)) {
376 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE);
377 ep->page = h;
378 ep->index = c->pg.index;
379 return (RET_SUCCESS);
380 }
381 idx = c->pg.index;
382 if (idx == 0) {
383 if (flags == R_RPREV) {
384 ep->page = h;
385 ep->index = idx;
386 return (bt_rseq_prev(t, ep));
387 }
388 pg = h->prevpg;
389 mpool_put(t->bt_mp, h, 0);
390 if (pg == P_INVALID)
391 return (RET_SPECIAL);
392 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
393 return (RET_ERROR);
394 idx = NEXTINDEX(h) - 1;
395 } else
396 --idx;
397 break;
398 }
399
400 ep->page = h;
401 ep->index = idx;
402 return (RET_SUCCESS);
403 }
404
405 /*
406 * Get the first item on the next page, but by going up and down the tree.
407 */
408 static int
bt_rseq_next(BTREE * t,EPG * ep)409 bt_rseq_next(BTREE *t, EPG *ep)
410 {
411 PAGE *h;
412 indx_t idx;
413 EPGNO *up;
414 db_pgno_t pg;
415
416 h = ep->page;
417 idx = ep->index;
418 do {
419 /* Move up the tree. */
420 up = BT_POP(t);
421 mpool_put(t->bt_mp, h, 0);
422 /* Did we hit the right edge of the root? */
423 if (up == NULL)
424 return (RET_SPECIAL);
425 if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
426 return (RET_ERROR);
427 idx = up->index;
428 } while (++idx == NEXTINDEX(h));
429
430 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
431 /* Move back down the tree. */
432 BT_PUSH(t, h->pgno, idx);
433 pg = GETBINTERNAL(h, idx)->pgno;
434 mpool_put(t->bt_mp, h, 0);
435 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
436 return (RET_ERROR);
437 idx = 0;
438 }
439 ep->page = h;
440 ep->index = idx;
441 return (RET_SUCCESS);
442 }
443
444 /*
445 * Get the last item on the previous page, but by going up and down the tree.
446 */
447 static int
bt_rseq_prev(BTREE * t,EPG * ep)448 bt_rseq_prev(BTREE *t, EPG *ep)
449 {
450 PAGE *h;
451 indx_t idx;
452 EPGNO *up;
453 db_pgno_t pg;
454
455 h = ep->page;
456 idx = ep->index;
457 do {
458 /* Move up the tree. */
459 up = BT_POP(t);
460 mpool_put(t->bt_mp, h, 0);
461 /* Did we hit the left edge of the root? */
462 if (up == NULL)
463 return (RET_SPECIAL);
464 if ((h = mpool_get(t->bt_mp, up->pgno, 0)) == NULL)
465 return (RET_ERROR);
466 idx = up->index;
467 } while (idx == 0);
468 --idx;
469 while (!(h->flags & (P_BLEAF | P_RLEAF))) {
470 /* Move back down the tree. */
471 BT_PUSH(t, h->pgno, idx);
472 pg = GETBINTERNAL(h, idx)->pgno;
473 mpool_put(t->bt_mp, h, 0);
474 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
475 return (RET_ERROR);
476 idx = NEXTINDEX(h) - 1;
477 }
478 ep->page = h;
479 ep->index = idx;
480 return (RET_SUCCESS);
481 }
482
483 /*
484 * __bt_first --
485 * Find the first entry.
486 *
487 * Parameters:
488 * t: the tree
489 * key: the key
490 * erval: return EPG
491 * exactp: pointer to exact match flag
492 *
493 * Returns:
494 * The first entry in the tree greater than or equal to key,
495 * or RET_SPECIAL if no such key exists.
496 */
497 static int
__bt_first(t,key,erval,exactp)498 __bt_first(t, key, erval, exactp)
499 BTREE *t;
500 const DBT *key;
501 EPG *erval;
502 int *exactp;
503 {
504 PAGE *h, *hprev;
505 EPG *ep, save;
506 db_pgno_t pg;
507
508 /*
509 * Find any matching record; __bt_search pins the page.
510 *
511 * If it's an exact match and duplicates are possible, walk backwards
512 * in the tree until we find the first one. Otherwise, make sure it's
513 * a valid key (__bt_search may return an index just past the end of a
514 * page) and return it.
515 */
516 if ((ep = __bt_search(t, key, exactp)) == NULL)
517 return (RET_SPECIAL);
518 if (*exactp) {
519 if (F_ISSET(t, B_NODUPS)) {
520 *erval = *ep;
521 return (RET_SUCCESS);
522 }
523
524 /*
525 * Walk backwards, as long as the entry matches and there are
526 * keys left in the tree. Save a copy of each match in case
527 * we go too far.
528 */
529 save = *ep;
530 h = ep->page;
531 do {
532 if (save.page->pgno != ep->page->pgno) {
533 mpool_put(t->bt_mp, save.page, 0);
534 save = *ep;
535 } else
536 save.index = ep->index;
537
538 /*
539 * Don't unpin the page the last (or original) match
540 * was on, but make sure it's unpinned if an error
541 * occurs.
542 */
543 if (ep->index == 0) {
544 if (h->prevpg == P_INVALID)
545 break;
546 if (h->pgno != save.page->pgno)
547 mpool_put(t->bt_mp, h, 0);
548 if ((hprev = mpool_get(t->bt_mp,
549 h->prevpg, 0)) == NULL) {
550 if (h->pgno == save.page->pgno)
551 mpool_put(t->bt_mp,
552 save.page, 0);
553 return (RET_ERROR);
554 }
555 ep->page = h = hprev;
556 ep->index = NEXTINDEX(h);
557 }
558 --ep->index;
559 } while (__bt_cmp(t, key, ep) == 0);
560
561 /*
562 * Reach here with the last page that was looked at pinned,
563 * which may or may not be the same as the last (or original)
564 * match page. If it's not useful, release it.
565 */
566 if (h->pgno != save.page->pgno)
567 mpool_put(t->bt_mp, h, 0);
568
569 *erval = save;
570 return (RET_SUCCESS);
571 }
572
573 /* If at the end of a page, find the next entry. */
574 if (ep->index == NEXTINDEX(ep->page)) {
575 h = ep->page;
576 pg = h->nextpg;
577 mpool_put(t->bt_mp, h, 0);
578 if (pg == P_INVALID)
579 return (RET_SPECIAL);
580 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
581 return (RET_ERROR);
582 ep->index = 0;
583 ep->page = h;
584 }
585 *erval = *ep;
586 return (RET_SUCCESS);
587 }
588
589 /*
590 * __bt_setcur --
591 * Set the cursor to an entry in the tree.
592 *
593 * Parameters:
594 * t: the tree
595 * pgno: page number
596 * index: page index
597 */
598 void
__bt_setcur(t,pgno,idx)599 __bt_setcur(t, pgno, idx)
600 BTREE *t;
601 db_pgno_t pgno;
602 u_int idx;
603 {
604 /* Lose any already deleted key. */
605 if (t->bt_cursor.key.data != NULL) {
606 free(t->bt_cursor.key.data);
607 t->bt_cursor.key.size = 0;
608 t->bt_cursor.key.data = NULL;
609 }
610 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
611
612 /* Update the cursor. */
613 t->bt_cursor.pg.pgno = pgno;
614 t->bt_cursor.pg.index = idx;
615 F_SET(&t->bt_cursor, CURS_INIT);
616 }
617