1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 8 #include "config.h" 9 10 #ifndef lint 11 static const char sccsid[] = "@(#)bt_cursor.c 10.81 (Sleepycat) 12/16/98"; 12 #endif /* not lint */ 13 14 #ifndef NO_SYSTEM_INCLUDES 15 #include <sys/types.h> 16 17 #include <errno.h> 18 #include <stdlib.h> 19 #include <string.h> 20 #endif 21 22 #include "db_int.h" 23 #include "db_page.h" 24 #include "btree.h" 25 #include "shqueue.h" 26 #include "db_shash.h" 27 #include "lock.h" 28 #include "lock_ext.h" 29 30 static int __bam_c_close __P((DBC *)); 31 static int __bam_c_del __P((DBC *, u_int32_t)); 32 static int __bam_c_destroy __P((DBC *)); 33 static int __bam_c_first __P((DBC *, CURSOR *)); 34 static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t)); 35 static int __bam_c_getstack __P((DBC *, CURSOR *)); 36 static int __bam_c_last __P((DBC *, CURSOR *)); 37 static int __bam_c_next __P((DBC *, CURSOR *, int)); 38 static int __bam_c_physdel __P((DBC *, CURSOR *, PAGE *)); 39 static int __bam_c_prev __P((DBC *, CURSOR *)); 40 static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t)); 41 static void __bam_c_reset __P((CURSOR *)); 42 static int __bam_c_rget __P((DBC *, DBT *, u_int32_t)); 43 static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *)); 44 static int __bam_dsearch __P((DBC *, CURSOR *, DBT *, u_int32_t *)); 45 46 /* Discard the current page/lock held by a cursor. */ 47 #undef DISCARD 48 #define DISCARD(dbc, cp) { \ 49 if ((cp)->page != NULL) { \ 50 (void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0); \ 51 (cp)->page = NULL; \ 52 } \ 53 if ((cp)->lock != LOCK_INVALID) { \ 54 (void)__BT_TLPUT((dbc), (cp)->lock); \ 55 (cp)->lock = LOCK_INVALID; \ 56 } \ 57 } 58 59 /* If the cursor references a deleted record. */ 60 #undef IS_CUR_DELETED 61 #define IS_CUR_DELETED(cp) \ 62 (((cp)->dpgno == PGNO_INVALID && \ 63 B_DISSET(GET_BKEYDATA((cp)->page, \ 64 (cp)->indx + O_INDX)->type)) || \ 65 ((cp)->dpgno != PGNO_INVALID && \ 66 B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type))) 67 68 /* If the cursor and index combination references a deleted record. */ 69 #undef IS_DELETED 70 #define IS_DELETED(cp, indx) \ 71 (((cp)->dpgno == PGNO_INVALID && \ 72 B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) || \ 73 ((cp)->dpgno != PGNO_INVALID && \ 74 B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type))) 75 76 /* 77 * Test to see if two cursors could point to duplicates of the same key, 78 * whether on-page or off-page. The leaf page numbers must be the same 79 * in both cases. In the case of off-page duplicates, the key indices 80 * on the leaf page will be the same. In the case of on-page duplicates, 81 * the duplicate page number must not be set, and the key index offsets 82 * must be the same. For the last test, as the saved copy of the cursor 83 * will not have a valid page pointer, we use the cursor's. 84 */ 85 #undef POSSIBLE_DUPLICATE 86 #define POSSIBLE_DUPLICATE(cursor, saved_copy) \ 87 ((cursor)->pgno == (saved_copy).pgno && \ 88 ((cursor)->indx == (saved_copy).indx || \ 89 ((cursor)->dpgno == PGNO_INVALID && \ 90 (saved_copy).dpgno == PGNO_INVALID && \ 91 (cursor)->page->inp[(cursor)->indx] == \ 92 (cursor)->page->inp[(saved_copy).indx]))) 93 94 /* 95 * __bam_c_reset -- 96 * Initialize internal cursor structure. 97 */ 98 static void 99 __bam_c_reset(cp) 100 CURSOR *cp; 101 { 102 cp->sp = cp->csp = cp->stack; 103 cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]); 104 cp->page = NULL; 105 cp->pgno = PGNO_INVALID; 106 cp->indx = 0; 107 cp->dpgno = PGNO_INVALID; 108 cp->dindx = 0; 109 cp->lock = LOCK_INVALID; 110 cp->mode = DB_LOCK_NG; 111 cp->recno = RECNO_OOB; 112 cp->flags = 0; 113 } 114 115 /* 116 * __bam_c_init -- 117 * Initialize the access private portion of a cursor 118 * 119 * PUBLIC: int __bam_c_init __P((DBC *)); 120 */ 121 int 122 __bam_c_init(dbc) 123 DBC *dbc; 124 { 125 DB *dbp; 126 CURSOR *cp; 127 int ret; 128 129 if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0) 130 return (ret); 131 132 dbp = dbc->dbp; 133 cp->dbc = dbc; 134 135 /* 136 * Logical record numbers are always the same size, and we don't want 137 * to have to check for space every time we return one. Allocate it 138 * in advance. 139 */ 140 if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { 141 if ((ret = __os_malloc(sizeof(db_recno_t), 142 NULL, &dbc->rkey.data)) != 0) { 143 __os_free(cp, sizeof(CURSOR)); 144 return (ret); 145 } 146 dbc->rkey.ulen = sizeof(db_recno_t); 147 } 148 149 /* Initialize methods. */ 150 dbc->internal = cp; 151 if (dbp->type == DB_BTREE) { 152 dbc->c_am_close = __bam_c_close; 153 dbc->c_am_destroy = __bam_c_destroy; 154 dbc->c_del = __bam_c_del; 155 dbc->c_get = __bam_c_get; 156 dbc->c_put = __bam_c_put; 157 } else { 158 dbc->c_am_close = __bam_c_close; 159 dbc->c_am_destroy = __bam_c_destroy; 160 dbc->c_del = __ram_c_del; 161 dbc->c_get = __ram_c_get; 162 dbc->c_put = __ram_c_put; 163 } 164 165 /* Initialize dynamic information. */ 166 __bam_c_reset(cp); 167 168 return (0); 169 } 170 171 /* 172 * __bam_c_close -- 173 * Close down the cursor from a single use. 174 */ 175 static int 176 __bam_c_close(dbc) 177 DBC *dbc; 178 { 179 CURSOR *cp; 180 DB *dbp; 181 int ret; 182 183 dbp = dbc->dbp; 184 cp = dbc->internal; 185 ret = 0; 186 187 /* 188 * If a cursor deleted a btree key, perform the actual deletion. 189 * (Recno keys are either deleted immediately or never deleted.) 190 */ 191 if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED)) 192 ret = __bam_c_physdel(dbc, cp, NULL); 193 194 /* Discard any locks not acquired inside of a transaction. */ 195 if (cp->lock != LOCK_INVALID) { 196 (void)__BT_TLPUT(dbc, cp->lock); 197 cp->lock = LOCK_INVALID; 198 } 199 200 /* Sanity checks. */ 201 #ifdef DIAGNOSTIC 202 if (cp->csp != cp->stack) 203 __db_err(dbp->dbenv, "btree cursor close: stack not empty"); 204 #endif 205 206 /* Initialize dynamic information. */ 207 __bam_c_reset(cp); 208 209 return (ret); 210 } 211 212 /* 213 * __bam_c_destroy -- 214 * Close a single cursor -- internal version. 215 */ 216 static int 217 __bam_c_destroy(dbc) 218 DBC *dbc; 219 { 220 /* Discard the structures. */ 221 __os_free(dbc->internal, sizeof(CURSOR)); 222 223 return (0); 224 } 225 226 /* 227 * __bam_c_del -- 228 * Delete using a cursor. 229 */ 230 static int 231 __bam_c_del(dbc, flags) 232 DBC *dbc; 233 u_int32_t flags; 234 { 235 CURSOR *cp; 236 DB *dbp; 237 DB_LOCK lock; 238 PAGE *h; 239 db_pgno_t pgno; 240 db_indx_t indx; 241 int ret; 242 243 dbp = dbc->dbp; 244 cp = dbc->internal; 245 h = NULL; 246 247 DB_PANIC_CHECK(dbp); 248 249 /* Check for invalid flags. */ 250 if ((ret = __db_cdelchk(dbp, flags, 251 F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) 252 return (ret); 253 254 /* 255 * If we are running CDB, this had better be either a write 256 * cursor or an immediate writer. 257 */ 258 if (F_ISSET(dbp, DB_AM_CDB)) 259 if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) 260 return (EINVAL); 261 262 DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags); 263 264 /* If already deleted, return failure. */ 265 if (F_ISSET(cp, C_DELETED)) 266 return (DB_KEYEMPTY); 267 268 /* 269 * We don't physically delete the record until the cursor moves, 270 * so we have to have a long-lived write lock on the page instead 271 * of a long-lived read lock. Note, we have to have a read lock 272 * to even get here, so we simply discard it. 273 */ 274 if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) { 275 if ((ret = __bam_lget(dbc, 276 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) 277 goto err; 278 (void)__BT_TLPUT(dbc, cp->lock); 279 cp->lock = lock; 280 cp->mode = DB_LOCK_WRITE; 281 } 282 283 /* 284 * Acquire the underlying page (which may be different from the above 285 * page because it may be a duplicate page), and set the on-page and 286 * in-cursor delete flags. We don't need to lock it as we've already 287 * write-locked the page leading to it. 288 */ 289 if (cp->dpgno == PGNO_INVALID) { 290 pgno = cp->pgno; 291 indx = cp->indx; 292 } else { 293 pgno = cp->dpgno; 294 indx = cp->dindx; 295 } 296 297 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) 298 goto err; 299 300 /* Log the change. */ 301 if (DB_LOGGING(dbc) && 302 (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h), 303 0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) { 304 (void)memp_fput(dbp->mpf, h, 0); 305 goto err; 306 } 307 308 /* 309 * Set the intent-to-delete flag on the page and update all cursors. */ 310 if (cp->dpgno == PGNO_INVALID) 311 B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type); 312 else 313 B_DSET(GET_BKEYDATA(h, indx)->type); 314 (void)__bam_ca_delete(dbp, pgno, indx, 1); 315 316 ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); 317 h = NULL; 318 319 /* 320 * If the tree has record numbers, we have to adjust the counts. 321 * 322 * !!! 323 * This test is right -- we don't yet support duplicates and record 324 * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM 325 * set. 326 */ 327 if (F_ISSET(dbp, DB_BT_RECNUM)) { 328 if ((ret = __bam_c_getstack(dbc, cp)) != 0) 329 goto err; 330 if ((ret = __bam_adjust(dbc, -1)) != 0) 331 goto err; 332 (void)__bam_stkrel(dbc, 0); 333 } 334 335 err: if (h != NULL) 336 (void)memp_fput(dbp->mpf, h, 0); 337 return (ret); 338 } 339 340 /* 341 * __bam_c_get -- 342 * Get using a cursor (btree). 343 */ 344 static int 345 __bam_c_get(dbc, key, data, flags) 346 DBC *dbc; 347 DBT *key, *data; 348 u_int32_t flags; 349 { 350 CURSOR *cp, copy, start; 351 DB *dbp; 352 PAGE *h; 353 int exact, ret, tmp_rmw; 354 355 dbp = dbc->dbp; 356 cp = dbc->internal; 357 358 DB_PANIC_CHECK(dbp); 359 360 /* Check for invalid flags. */ 361 if ((ret = __db_cgetchk(dbp, 362 key, data, flags, cp->pgno != PGNO_INVALID)) != 0) 363 return (ret); 364 365 /* Clear OR'd in additional bits so we can check for flag equality. */ 366 tmp_rmw = 0; 367 if (LF_ISSET(DB_RMW)) { 368 if (!F_ISSET(dbp, DB_AM_CDB)) { 369 tmp_rmw = 1; 370 F_SET(dbc, DBC_RMW); 371 } 372 LF_CLR(DB_RMW); 373 } 374 375 DEBUG_LREAD(dbc, dbc->txn, "bam_c_get", 376 flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); 377 378 /* 379 * Return a cursor's record number. It has nothing to do with the 380 * cursor get code except that it's been rammed into the interface. 381 */ 382 if (flags == DB_GET_RECNO) { 383 ret = __bam_c_rget(dbc, data, flags); 384 if (tmp_rmw) 385 F_CLR(dbc, DBC_RMW); 386 return (ret); 387 } 388 389 /* 390 * Initialize the cursor for a new retrieval. Clear the cursor's 391 * page pointer, it was set before this operation, and no longer 392 * has any meaning. 393 */ 394 cp->page = NULL; 395 copy = *cp; 396 cp->lock = LOCK_INVALID; 397 398 switch (flags) { 399 case DB_CURRENT: 400 /* It's not possible to return a deleted record. */ 401 if (F_ISSET(cp, C_DELETED)) { 402 ret = DB_KEYEMPTY; 403 goto err; 404 } 405 406 /* Acquire the current page. */ 407 if ((ret = __bam_lget(dbc, 408 0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0) 409 ret = memp_fget(dbp->mpf, 410 cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno, 411 0, &cp->page); 412 if (ret != 0) 413 goto err; 414 break; 415 case DB_NEXT_DUP: 416 if (cp->pgno == PGNO_INVALID) { 417 ret = EINVAL; 418 goto err; 419 } 420 if ((ret = __bam_c_next(dbc, cp, 1)) != 0) 421 goto err; 422 423 /* Make sure we didn't go past the end of the duplicates. */ 424 if (!POSSIBLE_DUPLICATE(cp, copy)) { 425 ret = DB_NOTFOUND; 426 goto err; 427 } 428 break; 429 case DB_NEXT: 430 if (cp->pgno != PGNO_INVALID) { 431 if ((ret = __bam_c_next(dbc, cp, 1)) != 0) 432 goto err; 433 break; 434 } 435 /* FALLTHROUGH */ 436 case DB_FIRST: 437 if ((ret = __bam_c_first(dbc, cp)) != 0) 438 goto err; 439 break; 440 case DB_PREV: 441 if (cp->pgno != PGNO_INVALID) { 442 if ((ret = __bam_c_prev(dbc, cp)) != 0) 443 goto err; 444 break; 445 } 446 /* FALLTHROUGH */ 447 case DB_LAST: 448 if ((ret = __bam_c_last(dbc, cp)) != 0) 449 goto err; 450 break; 451 case DB_SET: 452 if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) 453 goto err; 454 455 /* 456 * We cannot currently be referencing a deleted record, but we 457 * may be referencing off-page duplicates. 458 * 459 * If we're referencing off-page duplicates, move off-page. 460 * If we moved off-page, move to the next non-deleted record. 461 * If we moved to the next non-deleted record, check to make 462 * sure we didn't switch records because our current record 463 * had no non-deleted data items. 464 */ 465 start = *cp; 466 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) 467 goto err; 468 if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) { 469 if ((ret = __bam_c_next(dbc, cp, 0)) != 0) 470 goto err; 471 if (!POSSIBLE_DUPLICATE(cp, start)) { 472 ret = DB_NOTFOUND; 473 goto err; 474 } 475 } 476 break; 477 case DB_SET_RECNO: 478 if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) 479 goto err; 480 break; 481 case DB_GET_BOTH: 482 if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) { 483 /* Acquire the current page. */ 484 if ((ret = memp_fget(dbp->mpf, 485 cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno, 486 0, &cp->page)) != 0) 487 goto err; 488 489 /* If DBC_CONTINUE, move to the next item. */ 490 if (F_ISSET(dbc, DBC_CONTINUE) && 491 (ret = __bam_c_next(dbc, cp, 1)) != 0) 492 goto err; 493 } else { 494 if ((ret = 495 __bam_c_search(dbc, cp, key, flags, &exact)) != 0) 496 goto err; 497 498 /* 499 * We may be referencing a duplicates page. Move to 500 * the first duplicate. 501 */ 502 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) 503 goto err; 504 } 505 506 /* Search for a matching entry. */ 507 if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0) 508 goto err; 509 510 /* Ignore deleted entries. */ 511 if (IS_CUR_DELETED(cp)) { 512 ret = DB_NOTFOUND; 513 goto err; 514 } 515 break; 516 case DB_SET_RANGE: 517 if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0) 518 goto err; 519 520 /* 521 * As we didn't require an exact match, the search function 522 * may have returned an entry past the end of the page. If 523 * so, move to the next entry. 524 */ 525 if (cp->indx == NUM_ENT(cp->page) && 526 (ret = __bam_c_next(dbc, cp, 0)) != 0) 527 goto err; 528 529 /* 530 * We may be referencing off-page duplicates, if so, move 531 * off-page. 532 */ 533 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) 534 goto err; 535 536 /* 537 * We may be referencing a deleted record, if so, move to 538 * the next non-deleted record. 539 */ 540 if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0) 541 goto err; 542 break; 543 } 544 545 /* 546 * Return the key if the user didn't give us one. If we've moved to 547 * a duplicate page, we may no longer have a pointer to the main page, 548 * so we have to go get it. We know that it's already read-locked, 549 * however, so we don't have to acquire a new lock. 550 */ 551 if (flags != DB_SET) { 552 if (cp->dpgno != PGNO_INVALID) { 553 if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) 554 goto err; 555 } else 556 h = cp->page; 557 ret = __db_ret(dbp, 558 h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen); 559 if (cp->dpgno != PGNO_INVALID) 560 (void)memp_fput(dbp->mpf, h, 0); 561 if (ret) 562 goto err; 563 } 564 565 /* Return the data. */ 566 if ((ret = __db_ret(dbp, cp->page, 567 cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx, 568 data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) 569 goto err; 570 571 /* 572 * If the previous cursor record has been deleted, physically delete 573 * the entry from the page. We clear the deleted flag before we call 574 * the underlying delete routine so that, if an error occurs, and we 575 * restore the cursor, the deleted flag is cleared. This is because, 576 * if we manage to physically modify the page, and then restore the 577 * cursor, we might try to repeat the page modification when closing 578 * the cursor. 579 */ 580 if (F_ISSET(©, C_DELETED)) { 581 F_CLR(©, C_DELETED); 582 if ((ret = __bam_c_physdel(dbc, ©, cp->page)) != 0) 583 goto err; 584 } 585 F_CLR(cp, C_DELETED); 586 587 /* Release the previous lock, if any; the current lock is retained. */ 588 if (copy.lock != LOCK_INVALID) 589 (void)__BT_TLPUT(dbc, copy.lock); 590 591 /* Release the current page. */ 592 if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) 593 goto err; 594 595 if (0) { 596 err: if (cp->page != NULL) 597 (void)memp_fput(dbp->mpf, cp->page, 0); 598 if (cp->lock != LOCK_INVALID) 599 (void)__BT_TLPUT(dbc, cp->lock); 600 *cp = copy; 601 } 602 603 /* Release temporary lock upgrade. */ 604 if (tmp_rmw) 605 F_CLR(dbc, DBC_RMW); 606 607 return (ret); 608 } 609 610 /* 611 * __bam_dsearch -- 612 * Search for a matching data item (or the first data item that's 613 * equal to or greater than the one we're searching for). 614 */ 615 static int 616 __bam_dsearch(dbc, cp, data, iflagp) 617 DBC *dbc; 618 CURSOR *cp; 619 DBT *data; 620 u_int32_t *iflagp; 621 { 622 DB *dbp; 623 CURSOR copy, last; 624 int cmp, ret; 625 626 dbp = dbc->dbp; 627 628 /* 629 * If iflagp is non-NULL, we're doing an insert. 630 * 631 * If the duplicates are off-page, use the duplicate search routine. 632 */ 633 if (cp->dpgno != PGNO_INVALID) { 634 if ((ret = __db_dsearch(dbc, iflagp != NULL, 635 data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0) 636 return (ret); 637 cp->dpgno = cp->page->pgno; 638 639 if (iflagp == NULL) { 640 if (cmp != 0) 641 return (DB_NOTFOUND); 642 return (0); 643 } 644 *iflagp = DB_BEFORE; 645 return (0); 646 } 647 648 /* Otherwise, do the search ourselves. */ 649 copy = *cp; 650 for (;;) { 651 /* Save the last interesting cursor position. */ 652 last = *cp; 653 654 /* See if the data item matches the one we're looking for. */ 655 if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX, 656 dbp->dup_compare == NULL ? 657 __bam_defcmp : dbp->dup_compare)) == 0) { 658 if (iflagp != NULL) 659 *iflagp = DB_AFTER; 660 return (0); 661 } 662 663 /* 664 * If duplicate entries are sorted, we're done if we find a 665 * page entry that sorts greater than the application item. 666 * If doing an insert, return success, otherwise DB_NOTFOUND. 667 */ 668 if (dbp->dup_compare != NULL && cmp < 0) { 669 if (iflagp == NULL) 670 return (DB_NOTFOUND); 671 *iflagp = DB_BEFORE; 672 return (0); 673 } 674 675 /* 676 * Move to the next item. If we reach the end of the page and 677 * we're doing an insert, set the cursor to the last item and 678 * set the referenced memory location so callers know to insert 679 * after the item, instead of before it. If not inserting, we 680 * return DB_NOTFOUND. 681 */ 682 if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) { 683 if (iflagp == NULL) 684 return (DB_NOTFOUND); 685 goto use_last; 686 } 687 688 /* 689 * Make sure we didn't go past the end of the duplicates. The 690 * error conditions are the same as above. 691 */ 692 if (!POSSIBLE_DUPLICATE(cp, copy)) { 693 if (iflagp == NULL) 694 return (DB_NOTFOUND); 695 use_last: *cp = last; 696 *iflagp = DB_AFTER; 697 return (0); 698 } 699 } 700 /* NOTREACHED */ 701 } 702 703 /* 704 * __bam_c_rget -- 705 * Return the record number for a cursor. 706 */ 707 static int 708 __bam_c_rget(dbc, data, flags) 709 DBC *dbc; 710 DBT *data; 711 u_int32_t flags; 712 { 713 CURSOR *cp; 714 DB *dbp; 715 DBT dbt; 716 db_recno_t recno; 717 int exact, ret; 718 719 COMPQUIET(flags, 0); 720 dbp = dbc->dbp; 721 cp = dbc->internal; 722 723 /* Get the page with the current item on it. */ 724 if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) 725 return (ret); 726 727 /* Get a copy of the key. */ 728 memset(&dbt, 0, sizeof(DBT)); 729 dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL; 730 if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0) 731 goto err; 732 733 exact = 1; 734 if ((ret = __bam_search(dbc, &dbt, 735 F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 736 1, &recno, &exact)) != 0) 737 goto err; 738 739 ret = __db_retcopy(data, &recno, sizeof(recno), 740 &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc); 741 742 /* Release the stack. */ 743 __bam_stkrel(dbc, 0); 744 745 err: (void)memp_fput(dbp->mpf, cp->page, 0); 746 __os_free(dbt.data, dbt.size); 747 return (ret); 748 } 749 750 /* 751 * __bam_c_put -- 752 * Put using a cursor. 753 */ 754 static int 755 __bam_c_put(dbc, key, data, flags) 756 DBC *dbc; 757 DBT *key, *data; 758 u_int32_t flags; 759 { 760 CURSOR *cp, copy; 761 DB *dbp; 762 DBT dbt; 763 db_indx_t indx; 764 db_pgno_t pgno; 765 u_int32_t iiflags, iiop; 766 int exact, needkey, ret, stack; 767 void *arg; 768 769 dbp = dbc->dbp; 770 cp = dbc->internal; 771 772 DB_PANIC_CHECK(dbp); 773 774 DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put", 775 flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL, 776 data, flags); 777 778 if ((ret = __db_cputchk(dbp, key, data, flags, 779 F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0) 780 return (ret); 781 782 /* 783 * If we are running CDB, this had better be either a write 784 * cursor or an immediate writer. If it's a regular writer, 785 * that means we have an IWRITE lock and we need to upgrade 786 * it to a write lock. 787 */ 788 if (F_ISSET(dbp, DB_AM_CDB)) { 789 if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER)) 790 return (EINVAL); 791 792 if (F_ISSET(dbc, DBC_RMW) && 793 (ret = lock_get(dbp->dbenv->lk_info, dbc->locker, 794 DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, 795 &dbc->mylock)) != 0) 796 return (EAGAIN); 797 } 798 799 if (0) { 800 split: /* 801 * To split, we need a valid key for the page. Since it's a 802 * cursor, we have to build one. 803 * 804 * Acquire a copy of a key from the page. 805 */ 806 if (needkey) { 807 memset(&dbt, 0, sizeof(DBT)); 808 if ((ret = __db_ret(dbp, cp->page, indx, 809 &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) 810 goto err; 811 arg = &dbt; 812 } else 813 arg = key; 814 815 /* 816 * Discard any locks and pinned pages (the locks are discarded 817 * even if we're running with transactions, as they lock pages 818 * that we're sorry we ever acquired). If stack is set and the 819 * cursor entries are valid, they point to the same entries as 820 * the stack, don't free them twice. 821 */ 822 if (stack) { 823 (void)__bam_stkrel(dbc, 1); 824 stack = 0; 825 } else 826 DISCARD(dbc, cp); 827 828 /* 829 * Restore the cursor to its original value. This is necessary 830 * for two reasons. First, we are about to copy it in case of 831 * error, again. Second, we adjust cursors during the split, 832 * and we have to ensure this cursor is adjusted appropriately, 833 * along with all the other cursors. 834 */ 835 *cp = copy; 836 837 if ((ret = __bam_split(dbc, arg)) != 0) 838 goto err; 839 } 840 841 /* 842 * Initialize the cursor for a new retrieval. Clear the cursor's 843 * page pointer, it was set before this operation, and no longer 844 * has any meaning. 845 */ 846 cp->page = NULL; 847 copy = *cp; 848 cp->lock = LOCK_INVALID; 849 850 iiflags = needkey = ret = stack = 0; 851 switch (flags) { 852 case DB_AFTER: 853 case DB_BEFORE: 854 case DB_CURRENT: 855 needkey = 1; 856 if (cp->dpgno == PGNO_INVALID) { 857 pgno = cp->pgno; 858 indx = cp->indx; 859 } else { 860 pgno = cp->dpgno; 861 indx = cp->dindx; 862 } 863 864 /* 865 * !!! 866 * This test is right -- we don't yet support duplicates and 867 * record numbers in the same tree, so ignore duplicates if 868 * DB_BT_RECNUM set. 869 */ 870 if (F_ISSET(dbp, DB_BT_RECNUM) && 871 (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) { 872 /* Acquire a complete stack. */ 873 if ((ret = __bam_c_getstack(dbc, cp)) != 0) 874 goto err; 875 cp->page = cp->csp->page; 876 877 stack = 1; 878 iiflags = BI_DOINCR; 879 } else { 880 /* Acquire the current page. */ 881 if ((ret = __bam_lget(dbc, 882 0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0) 883 ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page); 884 if (ret != 0) 885 goto err; 886 887 iiflags = 0; 888 } 889 890 /* 891 * If the user has specified a duplicate comparison function, 892 * we return an error if DB_CURRENT was specified and the 893 * replacement data doesn't compare equal to the current data. 894 * This stops apps from screwing up the duplicate sort order. 895 */ 896 if (flags == DB_CURRENT && dbp->dup_compare != NULL) 897 if (__bam_cmp(dbp, data, 898 cp->page, indx, dbp->dup_compare) != 0) { 899 ret = EINVAL; 900 goto err; 901 } 902 903 iiop = flags; 904 break; 905 case DB_KEYFIRST: 906 case DB_KEYLAST: 907 /* 908 * If we have a duplicate comparison function, we position to 909 * the first of any on-page duplicates, and use __bam_dsearch 910 * to search for the right slot. Otherwise, we position to 911 * the first/last of any on-page duplicates based on the flag 912 * value. 913 */ 914 if ((ret = __bam_c_search(dbc, cp, key, 915 flags == DB_KEYFIRST || dbp->dup_compare != NULL ? 916 DB_KEYFIRST : DB_KEYLAST, &exact)) != 0) 917 goto err; 918 stack = 1; 919 920 /* 921 * If an exact match: 922 * If duplicates aren't supported, replace the current 923 * item. (When implementing the DB->put function, our 924 * caller has already checked the DB_NOOVERWRITE flag.) 925 * 926 * If there's a duplicate comparison function, find the 927 * correct slot for this duplicate item. 928 * 929 * If there's no duplicate comparison function, set the 930 * insert flag based on the argument flags. 931 * 932 * If there's no match, the search function returned the 933 * smallest slot greater than the key, use it. 934 */ 935 if (exact) { 936 if (F_ISSET(dbp, DB_AM_DUP)) { 937 /* 938 * If at off-page duplicate page, move to the 939 * first or last entry -- if a comparison 940 * function was specified, start searching at 941 * the first entry. Otherwise, move based on 942 * the DB_KEYFIRST/DB_KEYLAST flags. 943 */ 944 if ((ret = __bam_dup(dbc, cp, cp->indx, 945 dbp->dup_compare == NULL && 946 flags != DB_KEYFIRST)) != 0) 947 goto err; 948 949 /* 950 * If there's a comparison function, search for 951 * the correct slot. Otherwise, set the insert 952 * flag based on the argment flag. 953 */ 954 if (dbp->dup_compare == NULL) 955 iiop = flags == DB_KEYFIRST ? 956 DB_BEFORE : DB_AFTER; 957 else 958 if ((ret = __bam_dsearch(dbc, 959 cp, data, &iiop)) != 0) 960 goto err; 961 } else 962 iiop = DB_CURRENT; 963 iiflags = 0; 964 } else { 965 iiop = DB_BEFORE; 966 iiflags = BI_NEWKEY; 967 } 968 969 if (cp->dpgno == PGNO_INVALID) { 970 pgno = cp->pgno; 971 indx = cp->indx; 972 } else { 973 pgno = cp->dpgno; 974 indx = cp->dindx; 975 } 976 break; 977 } 978 979 ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags); 980 981 if (ret == DB_NEEDSPLIT) 982 goto split; 983 if (ret != 0) 984 goto err; 985 986 /* 987 * Reset any cursors referencing this item that might have the item 988 * marked for deletion. 989 */ 990 if (iiop == DB_CURRENT) { 991 (void)__bam_ca_delete(dbp, pgno, indx, 0); 992 993 /* 994 * It's also possible that we are the cursor that had the 995 * item marked for deletion, in which case we want to make 996 * sure that we don't delete it because we had the delete 997 * flag set already. 998 */ 999 if (cp->pgno == copy.pgno && cp->indx == copy.indx && 1000 cp->dpgno == copy.dpgno && cp->dindx == copy.dindx) 1001 F_CLR(©, C_DELETED); 1002 } 1003 1004 /* 1005 * Update the cursor to point to the new entry. The new entry was 1006 * stored on the current page, because we split pages until it was 1007 * possible. 1008 */ 1009 if (cp->dpgno == PGNO_INVALID) 1010 cp->indx = indx; 1011 else 1012 cp->dindx = indx; 1013 1014 /* 1015 * If the previous cursor record has been deleted, physically delete 1016 * the entry from the page. We clear the deleted flag before we call 1017 * the underlying delete routine so that, if an error occurs, and we 1018 * restore the cursor, the deleted flag is cleared. This is because, 1019 * if we manage to physically modify the page, and then restore the 1020 * cursor, we might try to repeat the page modification when closing 1021 * the cursor. 1022 */ 1023 if (F_ISSET(©, C_DELETED)) { 1024 F_CLR(©, C_DELETED); 1025 if ((ret = __bam_c_physdel(dbc, ©, cp->page)) != 0) 1026 goto err; 1027 } 1028 F_CLR(cp, C_DELETED); 1029 1030 /* Release the previous lock, if any; the current lock is retained. */ 1031 if (copy.lock != LOCK_INVALID) 1032 (void)__BT_TLPUT(dbc, copy.lock); 1033 1034 /* 1035 * Discard any pages pinned in the tree and their locks, except for 1036 * the leaf page, for which we only discard the pin, not the lock. 1037 * 1038 * Note, the leaf page participated in the stack we acquired, and so 1039 * we have to adjust the stack as necessary. If there was only a 1040 * single page on the stack, we don't have to free further stack pages. 1041 */ 1042 if (stack && BT_STK_POP(cp) != NULL) 1043 (void)__bam_stkrel(dbc, 0); 1044 1045 /* Release the current page. */ 1046 if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) 1047 goto err; 1048 1049 if (0) { 1050 err: /* Discard any pinned pages. */ 1051 if (stack) 1052 (void)__bam_stkrel(dbc, 0); 1053 else 1054 DISCARD(dbc, cp); 1055 *cp = copy; 1056 } 1057 1058 if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) 1059 (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, 1060 DB_LOCK_IWRITE, 0); 1061 1062 return (ret); 1063 } 1064 1065 /* 1066 * __bam_c_first -- 1067 * Return the first record. 1068 */ 1069 static int 1070 __bam_c_first(dbc, cp) 1071 DBC *dbc; 1072 CURSOR *cp; 1073 { 1074 DB *dbp; 1075 db_pgno_t pgno; 1076 int ret; 1077 1078 dbp = dbc->dbp; 1079 1080 /* Walk down the left-hand side of the tree. */ 1081 for (pgno = PGNO_ROOT;;) { 1082 if ((ret = 1083 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) 1084 return (ret); 1085 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) 1086 return (ret); 1087 1088 /* If we find a leaf page, we're done. */ 1089 if (ISLEAF(cp->page)) 1090 break; 1091 1092 pgno = GET_BINTERNAL(cp->page, 0)->pgno; 1093 DISCARD(dbc, cp); 1094 } 1095 1096 cp->pgno = cp->page->pgno; 1097 cp->indx = 0; 1098 cp->dpgno = PGNO_INVALID; 1099 1100 /* Check for duplicates. */ 1101 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0) 1102 return (ret); 1103 1104 /* If on an empty page or a deleted record, move to the next one. */ 1105 if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp)) 1106 if ((ret = __bam_c_next(dbc, cp, 0)) != 0) 1107 return (ret); 1108 1109 return (0); 1110 } 1111 1112 /* 1113 * __bam_c_last -- 1114 * Return the last record. 1115 */ 1116 static int 1117 __bam_c_last(dbc, cp) 1118 DBC *dbc; 1119 CURSOR *cp; 1120 { 1121 DB *dbp; 1122 db_pgno_t pgno; 1123 int ret; 1124 1125 dbp = dbc->dbp; 1126 1127 /* Walk down the right-hand side of the tree. */ 1128 for (pgno = PGNO_ROOT;;) { 1129 if ((ret = 1130 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) 1131 return (ret); 1132 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) 1133 return (ret); 1134 1135 /* If we find a leaf page, we're done. */ 1136 if (ISLEAF(cp->page)) 1137 break; 1138 1139 pgno = 1140 GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno; 1141 DISCARD(dbc, cp); 1142 } 1143 1144 cp->pgno = cp->page->pgno; 1145 cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX; 1146 cp->dpgno = PGNO_INVALID; 1147 1148 /* Check for duplicates. */ 1149 if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0) 1150 return (ret); 1151 1152 /* If on an empty page or a deleted record, move to the next one. */ 1153 if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp)) 1154 if ((ret = __bam_c_prev(dbc, cp)) != 0) 1155 return (ret); 1156 1157 return (0); 1158 } 1159 1160 /* 1161 * __bam_c_next -- 1162 * Move to the next record. 1163 */ 1164 static int 1165 __bam_c_next(dbc, cp, initial_move) 1166 DBC *dbc; 1167 CURSOR *cp; 1168 int initial_move; 1169 { 1170 DB *dbp; 1171 db_indx_t adjust, indx; 1172 db_pgno_t pgno; 1173 int ret; 1174 1175 dbp = dbc->dbp; 1176 1177 /* 1178 * We're either moving through a page of duplicates or a btree leaf 1179 * page. 1180 */ 1181 if (cp->dpgno == PGNO_INVALID) { 1182 adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX; 1183 pgno = cp->pgno; 1184 indx = cp->indx; 1185 } else { 1186 adjust = O_INDX; 1187 pgno = cp->dpgno; 1188 indx = cp->dindx; 1189 } 1190 if (cp->page == NULL) { 1191 if ((ret = 1192 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) 1193 return (ret); 1194 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) 1195 return (ret); 1196 } 1197 1198 /* 1199 * If at the end of the page, move to a subsequent page. 1200 * 1201 * !!! 1202 * Check for >= NUM_ENT. If we're here as the result of a search that 1203 * landed us on NUM_ENT, we'll increment indx before we test. 1204 * 1205 * !!! 1206 * This code handles empty pages and pages with only deleted entries. 1207 */ 1208 if (initial_move) 1209 indx += adjust; 1210 for (;;) { 1211 if (indx >= NUM_ENT(cp->page)) { 1212 /* 1213 * If we're in a btree leaf page, we've reached the end 1214 * of the tree. If we've reached the end of a page of 1215 * duplicates, continue from the btree leaf page where 1216 * we found this page of duplicates. 1217 */ 1218 pgno = cp->page->next_pgno; 1219 if (pgno == PGNO_INVALID) { 1220 /* If in a btree leaf page, it's EOF. */ 1221 if (cp->dpgno == PGNO_INVALID) 1222 return (DB_NOTFOUND); 1223 1224 /* Continue from the last btree leaf page. */ 1225 cp->dpgno = PGNO_INVALID; 1226 1227 adjust = P_INDX; 1228 pgno = cp->pgno; 1229 indx = cp->indx + P_INDX; 1230 } else 1231 indx = 0; 1232 1233 DISCARD(dbc, cp); 1234 if ((ret = __bam_lget(dbc, 1235 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) 1236 return (ret); 1237 if ((ret = 1238 memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) 1239 return (ret); 1240 continue; 1241 } 1242 1243 /* Ignore deleted records. */ 1244 if (IS_DELETED(cp, indx)) { 1245 indx += adjust; 1246 continue; 1247 } 1248 1249 /* 1250 * If we're not in a duplicates page, check to see if we've 1251 * found a page of duplicates, in which case we move to the 1252 * first entry. 1253 */ 1254 if (cp->dpgno == PGNO_INVALID) { 1255 cp->pgno = cp->page->pgno; 1256 cp->indx = indx; 1257 1258 if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0) 1259 return (ret); 1260 if (cp->dpgno != PGNO_INVALID) { 1261 indx = cp->dindx; 1262 adjust = O_INDX; 1263 continue; 1264 } 1265 } else { 1266 cp->dpgno = cp->page->pgno; 1267 cp->dindx = indx; 1268 } 1269 break; 1270 } 1271 return (0); 1272 } 1273 1274 /* 1275 * __bam_c_prev -- 1276 * Move to the previous record. 1277 */ 1278 static int 1279 __bam_c_prev(dbc, cp) 1280 DBC *dbc; 1281 CURSOR *cp; 1282 { 1283 DB *dbp; 1284 db_indx_t indx, adjust; 1285 db_pgno_t pgno; 1286 int ret, set_indx; 1287 1288 dbp = dbc->dbp; 1289 1290 /* 1291 * We're either moving through a page of duplicates or a btree leaf 1292 * page. 1293 */ 1294 if (cp->dpgno == PGNO_INVALID) { 1295 adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX; 1296 pgno = cp->pgno; 1297 indx = cp->indx; 1298 } else { 1299 adjust = O_INDX; 1300 pgno = cp->dpgno; 1301 indx = cp->dindx; 1302 } 1303 if (cp->page == NULL) { 1304 if ((ret = 1305 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) 1306 return (ret); 1307 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) 1308 return (ret); 1309 } 1310 1311 /* 1312 * If at the beginning of the page, move to any previous one. 1313 * 1314 * !!! 1315 * This code handles empty pages and pages with only deleted entries. 1316 */ 1317 for (;;) { 1318 if (indx == 0) { 1319 /* 1320 * If we're in a btree leaf page, we've reached the 1321 * beginning of the tree. If we've reached the first 1322 * of a page of duplicates, continue from the btree 1323 * leaf page where we found this page of duplicates. 1324 */ 1325 pgno = cp->page->prev_pgno; 1326 if (pgno == PGNO_INVALID) { 1327 /* If in a btree leaf page, it's SOF. */ 1328 if (cp->dpgno == PGNO_INVALID) 1329 return (DB_NOTFOUND); 1330 1331 /* Continue from the last btree leaf page. */ 1332 cp->dpgno = PGNO_INVALID; 1333 1334 adjust = P_INDX; 1335 pgno = cp->pgno; 1336 indx = cp->indx; 1337 set_indx = 0; 1338 } else 1339 set_indx = 1; 1340 1341 DISCARD(dbc, cp); 1342 if ((ret = __bam_lget(dbc, 1343 0, pgno, DB_LOCK_READ, &cp->lock)) != 0) 1344 return (ret); 1345 if ((ret = 1346 memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) 1347 return (ret); 1348 1349 if (set_indx) 1350 indx = NUM_ENT(cp->page); 1351 if (indx == 0) 1352 continue; 1353 } 1354 1355 /* Ignore deleted records. */ 1356 indx -= adjust; 1357 if (IS_DELETED(cp, indx)) 1358 continue; 1359 1360 /* 1361 * If we're not in a duplicates page, check to see if we've 1362 * found a page of duplicates, in which case we move to the 1363 * last entry. 1364 */ 1365 if (cp->dpgno == PGNO_INVALID) { 1366 cp->pgno = cp->page->pgno; 1367 cp->indx = indx; 1368 1369 if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0) 1370 return (ret); 1371 if (cp->dpgno != PGNO_INVALID) { 1372 indx = cp->dindx + O_INDX; 1373 adjust = O_INDX; 1374 continue; 1375 } 1376 } else { 1377 cp->dpgno = cp->page->pgno; 1378 cp->dindx = indx; 1379 } 1380 break; 1381 } 1382 return (0); 1383 } 1384 1385 /* 1386 * __bam_c_search -- 1387 * Move to a specified record. 1388 */ 1389 static int 1390 __bam_c_search(dbc, cp, key, flags, exactp) 1391 DBC *dbc; 1392 CURSOR *cp; 1393 const DBT *key; 1394 u_int32_t flags; 1395 int *exactp; 1396 { 1397 BTREE *t; 1398 DB *dbp; 1399 DB_LOCK lock; 1400 PAGE *h; 1401 db_recno_t recno; 1402 db_indx_t indx; 1403 u_int32_t sflags; 1404 int cmp, needexact, ret; 1405 1406 dbp = dbc->dbp; 1407 t = dbp->internal; 1408 1409 /* Find an entry in the database. */ 1410 switch (flags) { 1411 case DB_SET_RECNO: 1412 if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) 1413 return (ret); 1414 sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; 1415 needexact = *exactp = 1; 1416 ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp); 1417 break; 1418 case DB_SET: 1419 case DB_GET_BOTH: 1420 sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; 1421 needexact = *exactp = 1; 1422 goto search; 1423 case DB_SET_RANGE: 1424 sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND; 1425 needexact = *exactp = 0; 1426 goto search; 1427 case DB_KEYFIRST: 1428 sflags = S_KEYFIRST; 1429 goto fast_search; 1430 case DB_KEYLAST: 1431 sflags = S_KEYLAST; 1432 fast_search: needexact = *exactp = 0; 1433 /* 1434 * If the application has a history of inserting into the first 1435 * or last pages of the database, we check those pages first to 1436 * avoid doing a full search. 1437 * 1438 * Record numbers can't be fast-tracked, the entire tree has to 1439 * be locked. 1440 */ 1441 h = NULL; 1442 lock = LOCK_INVALID; 1443 if (F_ISSET(dbp, DB_BT_RECNUM)) 1444 goto search; 1445 1446 /* Check if the application has a history of sorted input. */ 1447 if (t->bt_lpgno == PGNO_INVALID) 1448 goto search; 1449 1450 /* 1451 * Lock and retrieve the page on which we did the last insert. 1452 * It's okay if it doesn't exist, or if it's not the page type 1453 * we expected, it just means that the world changed. 1454 */ 1455 if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock)) 1456 goto fast_miss; 1457 if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h)) 1458 goto fast_miss; 1459 if (TYPE(h) != P_LBTREE) 1460 goto fast_miss; 1461 if (NUM_ENT(h) == 0) 1462 goto fast_miss; 1463 1464 /* 1465 * What we do here is test to see if we're at the beginning or 1466 * end of the tree and if the new item sorts before/after the 1467 * first/last page entry. We don't try and catch inserts into 1468 * the middle of the tree (although we could, as long as there 1469 * were two keys on the page and we saved both the index and 1470 * the page number of the last insert). 1471 */ 1472 if (h->next_pgno == PGNO_INVALID) { 1473 indx = NUM_ENT(h) - P_INDX; 1474 if ((cmp = 1475 __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0) 1476 goto try_begin; 1477 if (cmp > 0) { 1478 indx += P_INDX; 1479 goto fast_hit; 1480 } 1481 1482 /* 1483 * Found a duplicate. If doing DB_KEYLAST, we're at 1484 * the correct position, otherwise, move to the first 1485 * of the duplicates. 1486 */ 1487 if (flags == DB_KEYLAST) 1488 goto fast_hit; 1489 for (; 1490 indx > 0 && h->inp[indx - P_INDX] == h->inp[indx]; 1491 indx -= P_INDX) 1492 ; 1493 goto fast_hit; 1494 } 1495 try_begin: if (h->prev_pgno == PGNO_INVALID) { 1496 indx = 0; 1497 if ((cmp = 1498 __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0) 1499 goto fast_miss; 1500 if (cmp < 0) 1501 goto fast_hit; 1502 /* 1503 * Found a duplicate. If doing DB_KEYFIRST, we're at 1504 * the correct position, otherwise, move to the last 1505 * of the duplicates. 1506 */ 1507 if (flags == DB_KEYFIRST) 1508 goto fast_hit; 1509 for (; 1510 indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && 1511 h->inp[indx] == h->inp[indx + P_INDX]; 1512 indx += P_INDX) 1513 ; 1514 goto fast_hit; 1515 } 1516 goto fast_miss; 1517 1518 fast_hit: /* Set the exact match flag, we may have found a duplicate. */ 1519 *exactp = cmp == 0; 1520 1521 /* Enter the entry in the stack. */ 1522 BT_STK_CLR(cp); 1523 BT_STK_ENTER(cp, h, indx, lock, ret); 1524 break; 1525 1526 fast_miss: if (h != NULL) 1527 (void)memp_fput(dbp->mpf, h, 0); 1528 if (lock != LOCK_INVALID) 1529 (void)__BT_LPUT(dbc, lock); 1530 1531 search: ret = __bam_search(dbc, key, sflags, 1, NULL, exactp); 1532 break; 1533 default: /* XXX: Impossible. */ 1534 abort(); 1535 /* NOTREACHED */ 1536 } 1537 if (ret != 0) 1538 return (ret); 1539 1540 /* 1541 * Initialize the cursor to reference it. This has to be done 1542 * before we return (even with DB_NOTFOUND) because we have to 1543 * free the page(s) we locked in __bam_search. 1544 */ 1545 cp->page = cp->csp->page; 1546 cp->pgno = cp->csp->page->pgno; 1547 cp->indx = cp->csp->indx; 1548 cp->lock = cp->csp->lock; 1549 cp->dpgno = PGNO_INVALID; 1550 1551 /* 1552 * If we inserted a key into the first or last slot of the tree, 1553 * remember where it was so we can do it more quickly next time. 1554 */ 1555 if (flags == DB_KEYFIRST || flags == DB_KEYLAST) 1556 t->bt_lpgno = 1557 ((cp->page->next_pgno == PGNO_INVALID && 1558 cp->indx >= NUM_ENT(cp->page)) || 1559 (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ? 1560 cp->pgno : PGNO_INVALID; 1561 1562 /* If we need an exact match and didn't find one, we're done. */ 1563 if (needexact && *exactp == 0) 1564 return (DB_NOTFOUND); 1565 1566 return (0); 1567 } 1568 1569 /* 1570 * __bam_dup -- 1571 * Check for an off-page duplicates entry, and if found, move to the 1572 * first or last entry. 1573 * 1574 * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int)); 1575 */ 1576 int 1577 __bam_dup(dbc, cp, indx, last_dup) 1578 DBC *dbc; 1579 CURSOR *cp; 1580 u_int32_t indx; 1581 int last_dup; 1582 { 1583 BOVERFLOW *bo; 1584 DB *dbp; 1585 db_pgno_t pgno; 1586 int ret; 1587 1588 dbp = dbc->dbp; 1589 1590 /* 1591 * Check for an overflow entry. If we find one, move to the 1592 * duplicates page, and optionally move to the last record on 1593 * that page. 1594 * 1595 * !!! 1596 * We don't lock duplicates pages, we've already got the correct 1597 * lock on the main page. 1598 */ 1599 bo = GET_BOVERFLOW(cp->page, indx + O_INDX); 1600 if (B_TYPE(bo->type) != B_DUPLICATE) 1601 return (0); 1602 1603 pgno = bo->pgno; 1604 if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) 1605 return (ret); 1606 cp->page = NULL; 1607 if (last_dup) { 1608 if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0) 1609 return (ret); 1610 indx = NUM_ENT(cp->page) - O_INDX; 1611 } else { 1612 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0) 1613 return (ret); 1614 indx = 0; 1615 } 1616 1617 /* Update the cursor's duplicate information. */ 1618 cp->dpgno = cp->page->pgno; 1619 cp->dindx = indx; 1620 1621 return (0); 1622 } 1623 1624 /* 1625 * __bam_c_physdel -- 1626 * Actually do the cursor deletion. 1627 */ 1628 static int 1629 __bam_c_physdel(dbc, cp, h) 1630 DBC *dbc; 1631 CURSOR *cp; 1632 PAGE *h; 1633 { 1634 enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd; 1635 BOVERFLOW bo; 1636 DB *dbp; 1637 DBT dbt; 1638 DB_LOCK lock; 1639 db_indx_t indx; 1640 db_pgno_t pgno, next_pgno, prev_pgno; 1641 int delete_page, local_page, ret; 1642 1643 dbp = dbc->dbp; 1644 1645 delete_page = ret = 0; 1646 1647 /* Figure out what we're deleting. */ 1648 if (cp->dpgno == PGNO_INVALID) { 1649 pgno = cp->pgno; 1650 indx = cp->indx; 1651 } else { 1652 pgno = cp->dpgno; 1653 indx = cp->dindx; 1654 } 1655 1656 /* 1657 * If the item is referenced by another cursor, set that cursor's 1658 * delete flag and leave it up to it to do the delete. 1659 * 1660 * !!! 1661 * This test for > 0 is a tricky. There are two ways that we can 1662 * be called here. Either we are closing the cursor or we've moved 1663 * off the page with the deleted entry. In the first case, we've 1664 * already removed the cursor from the active queue, so we won't see 1665 * it in __bam_ca_delete. In the second case, it will be on a different 1666 * item, so we won't bother with it in __bam_ca_delete. 1667 */ 1668 if (__bam_ca_delete(dbp, pgno, indx, 1) > 0) 1669 return (0); 1670 1671 /* 1672 * If this is concurrent DB, upgrade the lock if necessary. 1673 */ 1674 if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) && 1675 (ret = lock_get(dbp->dbenv->lk_info, 1676 dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, 1677 &dbc->mylock)) != 0) 1678 return (EAGAIN); 1679 1680 /* 1681 * If we don't already have the page locked, get it and delete the 1682 * items. 1683 */ 1684 if ((h == NULL || h->pgno != pgno)) { 1685 if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0) 1686 return (ret); 1687 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) 1688 return (ret); 1689 local_page = 1; 1690 } else 1691 local_page = 0; 1692 1693 /* 1694 * If we're deleting a duplicate entry and there are other duplicate 1695 * entries remaining, call the common code to do the work and fix up 1696 * the parent page as necessary. Otherwise, do a normal btree delete. 1697 * 1698 * There are 5 possible cases: 1699 * 1700 * 1. It's not a duplicate item: do a normal btree delete. 1701 * 2. It's a duplicate item: 1702 * 2a: We delete an item from a page of duplicates, but there are 1703 * more items on the page. 1704 * 2b: We delete the last item from a page of duplicates, deleting 1705 * the last duplicate. 1706 * 2c: We delete the last item from a page of duplicates, but there 1707 * is a previous page of duplicates. 1708 * 2d: We delete the last item from a page of duplicates, but there 1709 * is a following page of duplicates. 1710 * 1711 * In the case of: 1712 * 1713 * 1: There's nothing further to do. 1714 * 2a: There's nothing further to do. 1715 * 2b: Do the normal btree delete instead of a duplicate delete, as 1716 * that deletes both the duplicate chain and the parent page's 1717 * entry. 1718 * 2c: There's nothing further to do. 1719 * 2d: Delete the duplicate, and update the parent page's entry. 1720 */ 1721 if (TYPE(h) == P_DUPLICATE) { 1722 pgno = PGNO(h); 1723 prev_pgno = PREV_PGNO(h); 1724 next_pgno = NEXT_PGNO(h); 1725 1726 if (NUM_ENT(h) == 1 && 1727 prev_pgno == PGNO_INVALID && next_pgno == PGNO_INVALID) 1728 cmd = DELETE_PAGE; 1729 else { 1730 cmd = DELETE_ITEM; 1731 1732 /* Delete the duplicate. */ 1733 if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0) 1734 goto err; 1735 1736 /* 1737 * 2a: h != NULL, h->pgno == pgno 1738 * 2b: We don't reach this clause, as the above test 1739 * was true. 1740 * 2c: h == NULL, prev_pgno != PGNO_INVALID 1741 * 2d: h != NULL, next_pgno != PGNO_INVALID 1742 * 1743 * Test for 2a and 2c: if we didn't empty the current 1744 * page or there was a previous page of duplicates, we 1745 * don't need to touch the parent page. 1746 */ 1747 if ((h != NULL && pgno == h->pgno) || 1748 prev_pgno != PGNO_INVALID) 1749 cmd = NOTHING_FURTHER; 1750 } 1751 1752 /* 1753 * Release any page we're holding and its lock. 1754 * 1755 * !!! 1756 * If there is no subsequent page in the duplicate chain, then 1757 * __db_drem will have put page "h" and set it to NULL. 1758 */ 1759 if (local_page) { 1760 if (h != NULL) 1761 (void)memp_fput(dbp->mpf, h, 0); 1762 (void)__BT_TLPUT(dbc, lock); 1763 local_page = 0; 1764 } 1765 1766 if (cmd == NOTHING_FURTHER) 1767 goto done; 1768 1769 /* Acquire the parent page and switch the index to its entry. */ 1770 if ((ret = 1771 __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0) 1772 goto err; 1773 if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) { 1774 (void)__BT_TLPUT(dbc, lock); 1775 goto err; 1776 } 1777 local_page = 1; 1778 indx = cp->indx; 1779 1780 if (cmd == DELETE_PAGE) 1781 goto btd; 1782 1783 /* 1784 * Copy, delete, update, add-back the parent page's data entry. 1785 * 1786 * XXX 1787 * This may be a performance/logging problem. We should add a 1788 * log message which simply logs/updates a random set of bytes 1789 * on a page, and use it instead of doing a delete/add pair. 1790 */ 1791 indx += O_INDX; 1792 bo = *GET_BOVERFLOW(h, indx); 1793 (void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE); 1794 bo.pgno = next_pgno; 1795 memset(&dbt, 0, sizeof(dbt)); 1796 dbt.data = &bo; 1797 dbt.size = BOVERFLOW_SIZE; 1798 (void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL); 1799 (void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY); 1800 goto done; 1801 } 1802 1803 btd: /* 1804 * If the page is going to be emptied, delete it. To delete a leaf 1805 * page we need a copy of a key from the page. We use the 0th page 1806 * index since it's the last key that the page held. 1807 * 1808 * We malloc the page information instead of using the return key/data 1809 * memory because we've already set them -- the reason we've already 1810 * set them is because we're (potentially) about to do a reverse split, 1811 * which would make our saved page information useless. 1812 * 1813 * !!! 1814 * The following operations to delete a page might deadlock. I think 1815 * that's OK. The problem is if we're deleting an item because we're 1816 * closing cursors because we've already deadlocked and want to call 1817 * txn_abort(). If we fail due to deadlock, we leave a locked empty 1818 * page in the tree, which won't be empty long because we're going to 1819 * undo the delete. 1820 */ 1821 if (NUM_ENT(h) == 2 && h->pgno != PGNO_ROOT) { 1822 memset(&dbt, 0, sizeof(DBT)); 1823 dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL; 1824 if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0) 1825 goto err; 1826 delete_page = 1; 1827 } 1828 1829 /* 1830 * Do a normal btree delete. 1831 * 1832 * !!! 1833 * Delete the key item first, otherwise the duplicate checks in 1834 * __bam_ditem() won't work! 1835 */ 1836 if ((ret = __bam_ditem(dbc, h, indx)) != 0) 1837 goto err; 1838 if ((ret = __bam_ditem(dbc, h, indx)) != 0) 1839 goto err; 1840 1841 /* Discard any remaining locks/pages. */ 1842 if (local_page) { 1843 (void)memp_fput(dbp->mpf, h, 0); 1844 (void)__BT_TLPUT(dbc, lock); 1845 local_page = 0; 1846 } 1847 1848 /* Delete the page if it was emptied. */ 1849 if (delete_page) 1850 ret = __bam_dpage(dbc, &dbt); 1851 1852 err: 1853 done: if (delete_page) 1854 __os_free(dbt.data, dbt.size); 1855 1856 if (local_page) { 1857 /* 1858 * It's possible for h to be NULL, as __db_drem may have 1859 * been relinking pages by the time that it deadlocked. 1860 */ 1861 if (h != NULL) 1862 (void)memp_fput(dbp->mpf, h, 0); 1863 (void)__BT_TLPUT(dbc, lock); 1864 } 1865 1866 if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW)) 1867 (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock, 1868 DB_LOCK_IWRITE, 0); 1869 1870 return (ret); 1871 } 1872 1873 /* 1874 * __bam_c_getstack -- 1875 * Acquire a full stack for a cursor. 1876 */ 1877 static int 1878 __bam_c_getstack(dbc, cp) 1879 DBC *dbc; 1880 CURSOR *cp; 1881 { 1882 DB *dbp; 1883 DBT dbt; 1884 PAGE *h; 1885 db_pgno_t pgno; 1886 int exact, ret; 1887 1888 dbp = dbc->dbp; 1889 h = NULL; 1890 memset(&dbt, 0, sizeof(DBT)); 1891 ret = 0; 1892 1893 /* Get the page with the current item on it. */ 1894 pgno = cp->pgno; 1895 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) 1896 return (ret); 1897 1898 /* Get a copy of a key from the page. */ 1899 dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL; 1900 if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0) 1901 goto err; 1902 1903 /* Get a write-locked stack for that page. */ 1904 exact = 0; 1905 ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact); 1906 1907 /* We no longer need the key or the page. */ 1908 err: if (h != NULL) 1909 (void)memp_fput(dbp->mpf, h, 0); 1910 if (dbt.data != NULL) 1911 __os_free(dbt.data, dbt.size); 1912 return (ret); 1913 } 1914