1 /*- 2 * Copyright (c) 1990, 1993, 1994 3 * The Regents of the University of California. All rights reserved. 4 * 5 * This code is derived from software contributed to Berkeley by 6 * Mike Olson. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the University of 19 * California, Berkeley and its contributors. 20 * 4. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 #if defined(LIBC_SCCS) && !defined(lint) 38 static char sccsid[] = "@(#)bt_delete.c 8.13 (Berkeley) 7/28/94"; 39 #endif /* LIBC_SCCS and not lint */ 40 41 #include <sys/types.h> 42 43 #include <errno.h> 44 #include <stdio.h> 45 #include <string.h> 46 47 #include <db.h> 48 #include "btree.h" 49 50 static int __bt_bdelete __P((BTREE *, const DBT *)); 51 static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int)); 52 static int __bt_pdelete __P((BTREE *, PAGE *)); 53 static int __bt_relink __P((BTREE *, PAGE *)); 54 static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *)); 55 56 /* 57 * __bt_delete 58 * Delete the item(s) referenced by a key. 59 * 60 * Return RET_SPECIAL if the key is not found. 61 */ 62 int 63 __bt_delete(dbp, key, flags) 64 const DB *dbp; 65 const DBT *key; 66 u_int flags; 67 { 68 BTREE *t; 69 CURSOR *c; 70 PAGE *h; 71 int status; 72 73 t = dbp->internal; 74 75 /* Toss any page pinned across calls. */ 76 if (t->bt_pinned != NULL) { 77 mpool_put(t->bt_mp, t->bt_pinned, 0); 78 t->bt_pinned = NULL; 79 } 80 81 /* Check for change to a read-only tree. */ 82 if (F_ISSET(t, B_RDONLY)) { 83 errno = EPERM; 84 return (RET_ERROR); 85 } 86 87 switch (flags) { 88 case 0: 89 status = __bt_bdelete(t, key); 90 break; 91 case R_CURSOR: 92 /* 93 * If flags is R_CURSOR, delete the cursor. Must already 94 * have started a scan and not have already deleted it. 95 */ 96 c = &t->bt_cursor; 97 if (F_ISSET(c, CURS_INIT)) { 98 if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) 99 return (RET_SPECIAL); 100 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 101 return (RET_ERROR); 102 103 /* 104 * If the page is about to be emptied, we'll need to 105 * delete it, which means we have to acquire a stack. 106 */ 107 if (NEXTINDEX(h) == 1) 108 if (__bt_stkacq(t, &h, &t->bt_cursor)) 109 return (RET_ERROR); 110 111 status = __bt_dleaf(t, NULL, h, c->pg.index); 112 113 if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { 114 if (__bt_pdelete(t, h)) 115 return (RET_ERROR); 116 } else 117 mpool_put(t->bt_mp, 118 h, status == RET_SUCCESS ? MPOOL_DIRTY : 0); 119 break; 120 } 121 /* FALLTHROUGH */ 122 default: 123 errno = EINVAL; 124 return (RET_ERROR); 125 } 126 if (status == RET_SUCCESS) 127 F_SET(t, B_MODIFIED); 128 return (status); 129 } 130 131 /* 132 * __bt_stkacq -- 133 * Acquire a stack so we can delete a cursor entry. 134 * 135 * Parameters: 136 * t: tree 137 * hp: pointer to current, pinned PAGE pointer 138 * c: pointer to the cursor 139 * 140 * Returns: 141 * 0 on success, 1 on failure 142 */ 143 static int 144 __bt_stkacq(t, hp, c) 145 BTREE *t; 146 PAGE **hp; 147 CURSOR *c; 148 { 149 BINTERNAL *bi; 150 EPG *e; 151 EPGNO *parent; 152 PAGE *h; 153 indx_t index; 154 pgno_t pgno; 155 recno_t nextpg, prevpg; 156 int exact, level; 157 158 /* 159 * Find the first occurrence of the key in the tree. Toss the 160 * currently locked page so we don't hit an already-locked page. 161 */ 162 h = *hp; 163 mpool_put(t->bt_mp, h, 0); 164 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 165 return (1); 166 h = e->page; 167 168 /* See if we got it in one shot. */ 169 if (h->pgno == c->pg.pgno) 170 goto ret; 171 172 /* 173 * Move right, looking for the page. At each move we have to move 174 * up the stack until we don't have to move to the next page. If 175 * we have to change pages at an internal level, we have to fix the 176 * stack back up. 177 */ 178 while (h->pgno != c->pg.pgno) { 179 if ((nextpg = h->nextpg) == P_INVALID) 180 break; 181 mpool_put(t->bt_mp, h, 0); 182 183 /* Move up the stack. */ 184 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 185 /* Get the parent page. */ 186 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 187 return (1); 188 189 /* Move to the next index. */ 190 if (parent->index != NEXTINDEX(h) - 1) { 191 index = parent->index + 1; 192 BT_PUSH(t, h->pgno, index); 193 break; 194 } 195 mpool_put(t->bt_mp, h, 0); 196 } 197 198 /* Restore the stack. */ 199 while (level--) { 200 /* Push the next level down onto the stack. */ 201 bi = GETBINTERNAL(h, index); 202 pgno = bi->pgno; 203 BT_PUSH(t, pgno, 0); 204 205 /* Lose the currently pinned page. */ 206 mpool_put(t->bt_mp, h, 0); 207 208 /* Get the next level down. */ 209 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 210 return (1); 211 index = 0; 212 } 213 mpool_put(t->bt_mp, h, 0); 214 if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) 215 return (1); 216 } 217 218 if (h->pgno == c->pg.pgno) 219 goto ret; 220 221 /* Reacquire the original stack. */ 222 mpool_put(t->bt_mp, h, 0); 223 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 224 return (1); 225 h = e->page; 226 227 /* 228 * Move left, looking for the page. At each move we have to move 229 * up the stack until we don't have to change pages to move to the 230 * next page. If we have to change pages at an internal level, we 231 * have to fix the stack back up. 232 */ 233 while (h->pgno != c->pg.pgno) { 234 if ((prevpg = h->prevpg) == P_INVALID) 235 break; 236 mpool_put(t->bt_mp, h, 0); 237 238 /* Move up the stack. */ 239 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 240 /* Get the parent page. */ 241 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 242 return (1); 243 244 /* Move to the next index. */ 245 if (parent->index != 0) { 246 index = parent->index - 1; 247 BT_PUSH(t, h->pgno, index); 248 break; 249 } 250 mpool_put(t->bt_mp, h, 0); 251 } 252 253 /* Restore the stack. */ 254 while (level--) { 255 /* Push the next level down onto the stack. */ 256 bi = GETBINTERNAL(h, index); 257 pgno = bi->pgno; 258 259 /* Lose the currently pinned page. */ 260 mpool_put(t->bt_mp, h, 0); 261 262 /* Get the next level down. */ 263 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 264 return (1); 265 266 index = NEXTINDEX(h) - 1; 267 BT_PUSH(t, pgno, index); 268 } 269 mpool_put(t->bt_mp, h, 0); 270 if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) 271 return (1); 272 } 273 274 275 ret: mpool_put(t->bt_mp, h, 0); 276 return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); 277 } 278 279 /* 280 * __bt_bdelete -- 281 * Delete all key/data pairs matching the specified key. 282 * 283 * Parameters: 284 * t: tree 285 * key: key to delete 286 * 287 * Returns: 288 * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. 289 */ 290 static int 291 __bt_bdelete(t, key) 292 BTREE *t; 293 const DBT *key; 294 { 295 EPG *e; 296 PAGE *h; 297 int deleted, exact, redo; 298 299 deleted = 0; 300 301 /* Find any matching record; __bt_search pins the page. */ 302 loop: if ((e = __bt_search(t, key, &exact)) == NULL) 303 return (deleted ? RET_SUCCESS : RET_ERROR); 304 if (!exact) { 305 mpool_put(t->bt_mp, e->page, 0); 306 return (deleted ? RET_SUCCESS : RET_SPECIAL); 307 } 308 309 /* 310 * Delete forward, then delete backward, from the found key. If 311 * there are duplicates and we reach either side of the page, do 312 * the key search again, so that we get them all. 313 */ 314 redo = 0; 315 h = e->page; 316 do { 317 if (__bt_dleaf(t, key, h, e->index)) { 318 mpool_put(t->bt_mp, h, 0); 319 return (RET_ERROR); 320 } 321 if (F_ISSET(t, B_NODUPS)) { 322 if (NEXTINDEX(h) == 0) { 323 if (__bt_pdelete(t, h)) 324 return (RET_ERROR); 325 } else 326 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 327 return (RET_SUCCESS); 328 } 329 deleted = 1; 330 } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); 331 332 /* Check for right-hand edge of the page. */ 333 if (e->index == NEXTINDEX(h)) 334 redo = 1; 335 336 /* Delete from the key to the beginning of the page. */ 337 while (e->index-- > 0) { 338 if (__bt_cmp(t, key, e) != 0) 339 break; 340 if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) { 341 mpool_put(t->bt_mp, h, 0); 342 return (RET_ERROR); 343 } 344 if (e->index == 0) 345 redo = 1; 346 } 347 348 /* Check for an empty page. */ 349 if (NEXTINDEX(h) == 0) { 350 if (__bt_pdelete(t, h)) 351 return (RET_ERROR); 352 goto loop; 353 } 354 355 /* Put the page. */ 356 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 357 358 if (redo) 359 goto loop; 360 return (RET_SUCCESS); 361 } 362 363 /* 364 * __bt_pdelete -- 365 * Delete a single page from the tree. 366 * 367 * Parameters: 368 * t: tree 369 * h: leaf page 370 * 371 * Returns: 372 * RET_SUCCESS, RET_ERROR. 373 * 374 * Side-effects: 375 * mpool_put's the page 376 */ 377 static int 378 __bt_pdelete(t, h) 379 BTREE *t; 380 PAGE *h; 381 { 382 BINTERNAL *bi; 383 PAGE *pg; 384 EPGNO *parent; 385 indx_t cnt, index, *ip, offset; 386 u_int32_t nksize; 387 char *from; 388 389 /* 390 * Walk the parent page stack -- a LIFO stack of the pages that were 391 * traversed when we searched for the page where the delete occurred. 392 * Each stack entry is a page number and a page index offset. The 393 * offset is for the page traversed on the search. We've just deleted 394 * a page, so we have to delete the key from the parent page. 395 * 396 * If the delete from the parent page makes it empty, this process may 397 * continue all the way up the tree. We stop if we reach the root page 398 * (which is never deleted, it's just not worth the effort) or if the 399 * delete does not empty the page. 400 */ 401 while ((parent = BT_POP(t)) != NULL) { 402 /* Get the parent page. */ 403 if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 404 return (RET_ERROR); 405 406 index = parent->index; 407 bi = GETBINTERNAL(pg, index); 408 409 /* Free any overflow pages. */ 410 if (bi->flags & P_BIGKEY && 411 __ovfl_delete(t, bi->bytes) == RET_ERROR) { 412 mpool_put(t->bt_mp, pg, 0); 413 return (RET_ERROR); 414 } 415 416 /* 417 * Free the parent if it has only the one key and it's not the 418 * root page. If it's the rootpage, turn it back into an empty 419 * leaf page. 420 */ 421 if (NEXTINDEX(pg) == 1) 422 if (pg->pgno == P_ROOT) { 423 pg->lower = BTDATAOFF; 424 pg->upper = t->bt_psize; 425 pg->flags = P_BLEAF; 426 } else { 427 if (__bt_relink(t, pg) || __bt_free(t, pg)) 428 return (RET_ERROR); 429 continue; 430 } 431 else { 432 /* Pack remaining key items at the end of the page. */ 433 nksize = NBINTERNAL(bi->ksize); 434 from = (char *)pg + pg->upper; 435 memmove(from + nksize, from, (char *)bi - from); 436 pg->upper += nksize; 437 438 /* Adjust indices' offsets, shift the indices down. */ 439 offset = pg->linp[index]; 440 for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip) 441 if (ip[0] < offset) 442 ip[0] += nksize; 443 for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip) 444 ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; 445 pg->lower -= sizeof(indx_t); 446 } 447 448 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 449 break; 450 } 451 452 /* Free the leaf page, as long as it wasn't the root. */ 453 if (h->pgno == P_ROOT) { 454 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 455 return (RET_SUCCESS); 456 } 457 return (__bt_relink(t, h) || __bt_free(t, h)); 458 } 459 460 /* 461 * __bt_dleaf -- 462 * Delete a single record from a leaf page. 463 * 464 * Parameters: 465 * t: tree 466 * key: referenced key 467 * h: page 468 * index: index on page to delete 469 * 470 * Returns: 471 * RET_SUCCESS, RET_ERROR. 472 */ 473 int 474 __bt_dleaf(t, key, h, index) 475 BTREE *t; 476 const DBT *key; 477 PAGE *h; 478 u_int index; 479 { 480 BLEAF *bl; 481 indx_t cnt, *ip, offset; 482 u_int32_t nbytes; 483 void *to; 484 char *from; 485 486 /* If this record is referenced by the cursor, delete the cursor. */ 487 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 488 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 489 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index && 490 __bt_curdel(t, key, h, index)) 491 return (RET_ERROR); 492 493 /* If the entry uses overflow pages, make them available for reuse. */ 494 to = bl = GETBLEAF(h, index); 495 if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) 496 return (RET_ERROR); 497 if (bl->flags & P_BIGDATA && 498 __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) 499 return (RET_ERROR); 500 501 /* Pack the remaining key/data items at the end of the page. */ 502 nbytes = NBLEAF(bl); 503 from = (char *)h + h->upper; 504 memmove(from + nbytes, from, (char *)to - from); 505 h->upper += nbytes; 506 507 /* Adjust the indices' offsets, shift the indices down. */ 508 offset = h->linp[index]; 509 for (cnt = index, ip = &h->linp[0]; cnt--; ++ip) 510 if (ip[0] < offset) 511 ip[0] += nbytes; 512 for (cnt = NEXTINDEX(h) - index; --cnt; ++ip) 513 ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; 514 h->lower -= sizeof(indx_t); 515 516 /* If the cursor is on this page, adjust it as necessary. */ 517 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 518 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 519 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index) 520 --t->bt_cursor.pg.index; 521 522 return (RET_SUCCESS); 523 } 524 525 /* 526 * __bt_curdel -- 527 * Delete the cursor. 528 * 529 * Parameters: 530 * t: tree 531 * key: referenced key (or NULL) 532 * h: page 533 * index: index on page to delete 534 * 535 * Returns: 536 * RET_SUCCESS, RET_ERROR. 537 */ 538 static int 539 __bt_curdel(t, key, h, index) 540 BTREE *t; 541 const DBT *key; 542 PAGE *h; 543 u_int index; 544 { 545 CURSOR *c; 546 EPG e; 547 PAGE *pg; 548 int curcopy, status; 549 550 /* 551 * If there are duplicates, move forward or backward to one. 552 * Otherwise, copy the key into the cursor area. 553 */ 554 c = &t->bt_cursor; 555 F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); 556 557 curcopy = 0; 558 if (!F_ISSET(t, B_NODUPS)) { 559 /* 560 * We're going to have to do comparisons. If we weren't 561 * provided a copy of the key, i.e. the user is deleting 562 * the current cursor position, get one. 563 */ 564 if (key == NULL) { 565 e.page = h; 566 e.index = index; 567 if ((status = __bt_ret(t, &e, 568 &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) 569 return (status); 570 curcopy = 1; 571 key = &c->key; 572 } 573 /* Check previous key, if not at the beginning of the page. */ 574 if (index > 0) { 575 e.page = h; 576 e.index = index - 1; 577 if (__bt_cmp(t, key, &e) == 0) { 578 F_SET(c, CURS_BEFORE); 579 goto dup2; 580 } 581 } 582 /* Check next key, if not at the end of the page. */ 583 if (index < NEXTINDEX(h) - 1) { 584 e.page = h; 585 e.index = index + 1; 586 if (__bt_cmp(t, key, &e) == 0) { 587 F_SET(c, CURS_AFTER); 588 goto dup2; 589 } 590 } 591 /* Check previous key if at the beginning of the page. */ 592 if (index == 0 && h->prevpg != P_INVALID) { 593 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 594 return (RET_ERROR); 595 e.page = pg; 596 e.index = NEXTINDEX(pg) - 1; 597 if (__bt_cmp(t, key, &e) == 0) { 598 F_SET(c, CURS_BEFORE); 599 goto dup1; 600 } 601 mpool_put(t->bt_mp, pg, 0); 602 } 603 /* Check next key if at the end of the page. */ 604 if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { 605 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 606 return (RET_ERROR); 607 e.page = pg; 608 e.index = 0; 609 if (__bt_cmp(t, key, &e) == 0) { 610 F_SET(c, CURS_AFTER); 611 dup1: mpool_put(t->bt_mp, pg, 0); 612 dup2: c->pg.pgno = e.page->pgno; 613 c->pg.index = e.index; 614 return (RET_SUCCESS); 615 } 616 mpool_put(t->bt_mp, pg, 0); 617 } 618 } 619 e.page = h; 620 e.index = index; 621 if (curcopy || (status = 622 __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { 623 F_SET(c, CURS_ACQUIRE); 624 return (RET_SUCCESS); 625 } 626 return (status); 627 } 628 629 /* 630 * __bt_relink -- 631 * Link around a deleted page. 632 * 633 * Parameters: 634 * t: tree 635 * h: page to be deleted 636 */ 637 static int 638 __bt_relink(t, h) 639 BTREE *t; 640 PAGE *h; 641 { 642 PAGE *pg; 643 644 if (h->nextpg != P_INVALID) { 645 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 646 return (RET_ERROR); 647 pg->prevpg = h->prevpg; 648 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 649 } 650 if (h->prevpg != P_INVALID) { 651 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 652 return (RET_ERROR); 653 pg->nextpg = h->nextpg; 654 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 655 } 656 return (0); 657 } 658