1 /*- 2 * Copyright (c) 1990, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 4. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 */ 32 33 #if defined(LIBC_SCCS) && !defined(lint) 34 static char sccsid[] = "@(#)bt_seq.c 8.7 (Berkeley) 7/20/94"; 35 #endif /* LIBC_SCCS and not lint */ 36 #include <sys/cdefs.h> 37 __FBSDID("$FreeBSD$"); 38 39 #include <sys/types.h> 40 41 #include <errno.h> 42 #include <stddef.h> 43 #include <stdio.h> 44 #include <stdlib.h> 45 46 #include <db.h> 47 #include "btree.h" 48 49 static int __bt_first(BTREE *, const DBT *, EPG *, int *); 50 static int __bt_seqadv(BTREE *, EPG *, int); 51 static int __bt_seqset(BTREE *, EPG *, DBT *, int); 52 53 /* 54 * Sequential scan support. 55 * 56 * The tree can be scanned sequentially, starting from either end of the 57 * tree or from any specific key. A scan request before any scanning is 58 * done is initialized as starting from the least node. 59 */ 60 61 /* 62 * __bt_seq -- 63 * Btree sequential scan interface. 64 * 65 * Parameters: 66 * dbp: pointer to access method 67 * key: key for positioning and return value 68 * data: data return value 69 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. 70 * 71 * Returns: 72 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 73 */ 74 int 75 __bt_seq(const DB *dbp, DBT *key, DBT *data, u_int flags) 76 { 77 BTREE *t; 78 EPG e; 79 int status; 80 81 t = dbp->internal; 82 83 /* Toss any page pinned across calls. */ 84 if (t->bt_pinned != NULL) { 85 mpool_put(t->bt_mp, t->bt_pinned, 0); 86 t->bt_pinned = NULL; 87 } 88 89 /* 90 * If scan unitialized as yet, or starting at a specific record, set 91 * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin 92 * the page the cursor references if they're successful. 93 */ 94 switch (flags) { 95 case R_NEXT: 96 case R_PREV: 97 if (F_ISSET(&t->bt_cursor, CURS_INIT)) { 98 status = __bt_seqadv(t, &e, flags); 99 break; 100 } 101 /* FALLTHROUGH */ 102 case R_FIRST: 103 case R_LAST: 104 case R_CURSOR: 105 status = __bt_seqset(t, &e, key, flags); 106 break; 107 default: 108 errno = EINVAL; 109 return (RET_ERROR); 110 } 111 112 if (status == RET_SUCCESS) { 113 __bt_setcur(t, e.page->pgno, e.index); 114 115 status = 116 __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); 117 118 /* 119 * If the user is doing concurrent access, we copied the 120 * key/data, toss the page. 121 */ 122 if (F_ISSET(t, B_DB_LOCK)) 123 mpool_put(t->bt_mp, e.page, 0); 124 else 125 t->bt_pinned = e.page; 126 } 127 return (status); 128 } 129 130 /* 131 * __bt_seqset -- 132 * Set the sequential scan to a specific key. 133 * 134 * Parameters: 135 * t: tree 136 * ep: storage for returned key 137 * key: key for initial scan position 138 * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV 139 * 140 * Side effects: 141 * Pins the page the cursor references. 142 * 143 * Returns: 144 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 145 */ 146 static int 147 __bt_seqset(BTREE *t, EPG *ep, DBT *key, int flags) 148 { 149 PAGE *h; 150 pgno_t pg; 151 int exact; 152 153 /* 154 * Find the first, last or specific key in the tree and point the 155 * cursor at it. The cursor may not be moved until a new key has 156 * been found. 157 */ 158 switch (flags) { 159 case R_CURSOR: /* Keyed scan. */ 160 /* 161 * Find the first instance of the key or the smallest key 162 * which is greater than or equal to the specified key. 163 */ 164 if (key->data == NULL || key->size == 0) { 165 errno = EINVAL; 166 return (RET_ERROR); 167 } 168 return (__bt_first(t, key, ep, &exact)); 169 case R_FIRST: /* First record. */ 170 case R_NEXT: 171 /* Walk down the left-hand side of the tree. */ 172 for (pg = P_ROOT;;) { 173 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 174 return (RET_ERROR); 175 176 /* Check for an empty tree. */ 177 if (NEXTINDEX(h) == 0) { 178 mpool_put(t->bt_mp, h, 0); 179 return (RET_SPECIAL); 180 } 181 182 if (h->flags & (P_BLEAF | P_RLEAF)) 183 break; 184 pg = GETBINTERNAL(h, 0)->pgno; 185 mpool_put(t->bt_mp, h, 0); 186 } 187 ep->page = h; 188 ep->index = 0; 189 break; 190 case R_LAST: /* Last record. */ 191 case R_PREV: 192 /* Walk down the right-hand side of the tree. */ 193 for (pg = P_ROOT;;) { 194 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 195 return (RET_ERROR); 196 197 /* Check for an empty tree. */ 198 if (NEXTINDEX(h) == 0) { 199 mpool_put(t->bt_mp, h, 0); 200 return (RET_SPECIAL); 201 } 202 203 if (h->flags & (P_BLEAF | P_RLEAF)) 204 break; 205 pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; 206 mpool_put(t->bt_mp, h, 0); 207 } 208 209 ep->page = h; 210 ep->index = NEXTINDEX(h) - 1; 211 break; 212 } 213 return (RET_SUCCESS); 214 } 215 216 /* 217 * __bt_seqadvance -- 218 * Advance the sequential scan. 219 * 220 * Parameters: 221 * t: tree 222 * flags: R_NEXT, R_PREV 223 * 224 * Side effects: 225 * Pins the page the new key/data record is on. 226 * 227 * Returns: 228 * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. 229 */ 230 static int 231 __bt_seqadv(BTREE *t, EPG *ep, int flags) 232 { 233 CURSOR *c; 234 PAGE *h; 235 indx_t idx; 236 pgno_t pg; 237 int exact; 238 239 /* 240 * There are a couple of states that we can be in. The cursor has 241 * been initialized by the time we get here, but that's all we know. 242 */ 243 c = &t->bt_cursor; 244 245 /* 246 * The cursor was deleted where there weren't any duplicate records, 247 * so the key was saved. Find out where that key would go in the 248 * current tree. It doesn't matter if the returned key is an exact 249 * match or not -- if it's an exact match, the record was added after 250 * the delete so we can just return it. If not, as long as there's 251 * a record there, return it. 252 */ 253 if (F_ISSET(c, CURS_ACQUIRE)) 254 return (__bt_first(t, &c->key, ep, &exact)); 255 256 /* Get the page referenced by the cursor. */ 257 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 258 return (RET_ERROR); 259 260 /* 261 * Find the next/previous record in the tree and point the cursor at 262 * it. The cursor may not be moved until a new key has been found. 263 */ 264 switch (flags) { 265 case R_NEXT: /* Next record. */ 266 /* 267 * The cursor was deleted in duplicate records, and moved 268 * forward to a record that has yet to be returned. Clear 269 * that flag, and return the record. 270 */ 271 if (F_ISSET(c, CURS_AFTER)) 272 goto usecurrent; 273 idx = c->pg.index; 274 if (++idx == NEXTINDEX(h)) { 275 pg = h->nextpg; 276 mpool_put(t->bt_mp, h, 0); 277 if (pg == P_INVALID) 278 return (RET_SPECIAL); 279 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 280 return (RET_ERROR); 281 idx = 0; 282 } 283 break; 284 case R_PREV: /* Previous record. */ 285 /* 286 * The cursor was deleted in duplicate records, and moved 287 * backward to a record that has yet to be returned. Clear 288 * that flag, and return the record. 289 */ 290 if (F_ISSET(c, CURS_BEFORE)) { 291 usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); 292 ep->page = h; 293 ep->index = c->pg.index; 294 return (RET_SUCCESS); 295 } 296 idx = c->pg.index; 297 if (idx == 0) { 298 pg = h->prevpg; 299 mpool_put(t->bt_mp, h, 0); 300 if (pg == P_INVALID) 301 return (RET_SPECIAL); 302 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 303 return (RET_ERROR); 304 idx = NEXTINDEX(h) - 1; 305 } else 306 --idx; 307 break; 308 } 309 310 ep->page = h; 311 ep->index = idx; 312 return (RET_SUCCESS); 313 } 314 315 /* 316 * __bt_first -- 317 * Find the first entry. 318 * 319 * Parameters: 320 * t: the tree 321 * key: the key 322 * erval: return EPG 323 * exactp: pointer to exact match flag 324 * 325 * Returns: 326 * The first entry in the tree greater than or equal to key, 327 * or RET_SPECIAL if no such key exists. 328 */ 329 static int 330 __bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp) 331 { 332 PAGE *h; 333 EPG *ep, save; 334 pgno_t pg; 335 336 /* 337 * Find any matching record; __bt_search pins the page. 338 * 339 * If it's an exact match and duplicates are possible, walk backwards 340 * in the tree until we find the first one. Otherwise, make sure it's 341 * a valid key (__bt_search may return an index just past the end of a 342 * page) and return it. 343 */ 344 if ((ep = __bt_search(t, key, exactp)) == NULL) 345 return (0); 346 if (*exactp) { 347 if (F_ISSET(t, B_NODUPS)) { 348 *erval = *ep; 349 return (RET_SUCCESS); 350 } 351 352 /* 353 * Walk backwards, as long as the entry matches and there are 354 * keys left in the tree. Save a copy of each match in case 355 * we go too far. 356 */ 357 save = *ep; 358 h = ep->page; 359 do { 360 if (save.page->pgno != ep->page->pgno) { 361 mpool_put(t->bt_mp, save.page, 0); 362 save = *ep; 363 } else 364 save.index = ep->index; 365 366 /* 367 * Don't unpin the page the last (or original) match 368 * was on, but make sure it's unpinned if an error 369 * occurs. 370 */ 371 if (ep->index == 0) { 372 if (h->prevpg == P_INVALID) 373 break; 374 if (h->pgno != save.page->pgno) 375 mpool_put(t->bt_mp, h, 0); 376 if ((h = mpool_get(t->bt_mp, 377 h->prevpg, 0)) == NULL) { 378 if (h->pgno == save.page->pgno) 379 mpool_put(t->bt_mp, 380 save.page, 0); 381 return (RET_ERROR); 382 } 383 ep->page = h; 384 ep->index = NEXTINDEX(h); 385 } 386 --ep->index; 387 } while (__bt_cmp(t, key, ep) == 0); 388 389 /* 390 * Reach here with the last page that was looked at pinned, 391 * which may or may not be the same as the last (or original) 392 * match page. If it's not useful, release it. 393 */ 394 if (h->pgno != save.page->pgno) 395 mpool_put(t->bt_mp, h, 0); 396 397 *erval = save; 398 return (RET_SUCCESS); 399 } 400 401 /* If at the end of a page, find the next entry. */ 402 if (ep->index == NEXTINDEX(ep->page)) { 403 h = ep->page; 404 pg = h->nextpg; 405 mpool_put(t->bt_mp, h, 0); 406 if (pg == P_INVALID) 407 return (RET_SPECIAL); 408 if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) 409 return (RET_ERROR); 410 ep->index = 0; 411 ep->page = h; 412 } 413 *erval = *ep; 414 return (RET_SUCCESS); 415 } 416 417 /* 418 * __bt_setcur -- 419 * Set the cursor to an entry in the tree. 420 * 421 * Parameters: 422 * t: the tree 423 * pgno: page number 424 * idx: page index 425 */ 426 void 427 __bt_setcur(BTREE *t, pgno_t pgno, u_int idx) 428 { 429 /* Lose any already deleted key. */ 430 if (t->bt_cursor.key.data != NULL) { 431 free(t->bt_cursor.key.data); 432 t->bt_cursor.key.size = 0; 433 t->bt_cursor.key.data = NULL; 434 } 435 F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); 436 437 /* Update the cursor. */ 438 t->bt_cursor.pg.pgno = pgno; 439 t->bt_cursor.pg.index = idx; 440 F_SET(&t->bt_cursor, CURS_INIT); 441 } 442