1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1990, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Mike Olson. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 */ 34 35 #include <sys/types.h> 36 37 #include <errno.h> 38 #include <stdio.h> 39 #include <string.h> 40 41 #include <db.h> 42 #include "btree.h" 43 44 static int __bt_bdelete(BTREE *, const DBT *); 45 static int __bt_curdel(BTREE *, const DBT *, PAGE *, u_int); 46 static int __bt_pdelete(BTREE *, PAGE *); 47 static int __bt_relink(BTREE *, PAGE *); 48 static int __bt_stkacq(BTREE *, PAGE **, CURSOR *); 49 50 /* 51 * __bt_delete 52 * Delete the item(s) referenced by a key. 53 * 54 * Return RET_SPECIAL if the key is not found. 55 */ 56 int 57 __bt_delete(const DB *dbp, const DBT *key, u_int flags) 58 { 59 BTREE *t; 60 CURSOR *c; 61 PAGE *h; 62 int status; 63 64 t = dbp->internal; 65 66 /* Toss any page pinned across calls. */ 67 if (t->bt_pinned != NULL) { 68 mpool_put(t->bt_mp, t->bt_pinned, 0); 69 t->bt_pinned = NULL; 70 } 71 72 /* Check for change to a read-only tree. */ 73 if (F_ISSET(t, B_RDONLY)) { 74 errno = EPERM; 75 return (RET_ERROR); 76 } 77 78 switch (flags) { 79 case 0: 80 status = __bt_bdelete(t, key); 81 break; 82 case R_CURSOR: 83 /* 84 * If flags is R_CURSOR, delete the cursor. Must already 85 * have started a scan and not have already deleted it. 86 */ 87 c = &t->bt_cursor; 88 if (F_ISSET(c, CURS_INIT)) { 89 if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) 90 return (RET_SPECIAL); 91 if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) 92 return (RET_ERROR); 93 94 /* 95 * If the page is about to be emptied, we'll need to 96 * delete it, which means we have to acquire a stack. 97 */ 98 if (NEXTINDEX(h) == 1) 99 if (__bt_stkacq(t, &h, &t->bt_cursor)) 100 return (RET_ERROR); 101 102 status = __bt_dleaf(t, NULL, h, c->pg.index); 103 104 if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { 105 if (__bt_pdelete(t, h)) 106 return (RET_ERROR); 107 } else 108 mpool_put(t->bt_mp, 109 h, status == RET_SUCCESS ? MPOOL_DIRTY : 0); 110 break; 111 } 112 /* FALLTHROUGH */ 113 default: 114 errno = EINVAL; 115 return (RET_ERROR); 116 } 117 if (status == RET_SUCCESS) 118 F_SET(t, B_MODIFIED); 119 return (status); 120 } 121 122 /* 123 * __bt_stkacq -- 124 * Acquire a stack so we can delete a cursor entry. 125 * 126 * Parameters: 127 * t: tree 128 * hp: pointer to current, pinned PAGE pointer 129 * c: pointer to the cursor 130 * 131 * Returns: 132 * 0 on success, 1 on failure 133 */ 134 static int 135 __bt_stkacq(BTREE *t, PAGE **hp, CURSOR *c) 136 { 137 BINTERNAL *bi; 138 EPG *e; 139 EPGNO *parent; 140 PAGE *h; 141 indx_t idx; 142 pgno_t pgno; 143 recno_t nextpg, prevpg; 144 int exact, level; 145 146 /* 147 * Find the first occurrence of the key in the tree. Toss the 148 * currently locked page so we don't hit an already-locked page. 149 */ 150 h = *hp; 151 mpool_put(t->bt_mp, h, 0); 152 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 153 return (1); 154 h = e->page; 155 156 /* See if we got it in one shot. */ 157 if (h->pgno == c->pg.pgno) 158 goto ret; 159 160 /* 161 * Move right, looking for the page. At each move we have to move 162 * up the stack until we don't have to move to the next page. If 163 * we have to change pages at an internal level, we have to fix the 164 * stack back up. 165 */ 166 while (h->pgno != c->pg.pgno) { 167 if ((nextpg = h->nextpg) == P_INVALID) 168 break; 169 mpool_put(t->bt_mp, h, 0); 170 171 /* Move up the stack. */ 172 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 173 /* Get the parent page. */ 174 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 175 return (1); 176 177 /* Move to the next index. */ 178 if (parent->index != NEXTINDEX(h) - 1) { 179 idx = parent->index + 1; 180 BT_PUSH(t, h->pgno, idx); 181 break; 182 } 183 mpool_put(t->bt_mp, h, 0); 184 } 185 186 /* Restore the stack. */ 187 while (level--) { 188 /* Push the next level down onto the stack. */ 189 bi = GETBINTERNAL(h, idx); 190 pgno = bi->pgno; 191 BT_PUSH(t, pgno, 0); 192 193 /* Lose the currently pinned page. */ 194 mpool_put(t->bt_mp, h, 0); 195 196 /* Get the next level down. */ 197 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 198 return (1); 199 idx = 0; 200 } 201 mpool_put(t->bt_mp, h, 0); 202 if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) 203 return (1); 204 } 205 206 if (h->pgno == c->pg.pgno) 207 goto ret; 208 209 /* Reacquire the original stack. */ 210 mpool_put(t->bt_mp, h, 0); 211 if ((e = __bt_search(t, &c->key, &exact)) == NULL) 212 return (1); 213 h = e->page; 214 215 /* 216 * Move left, looking for the page. At each move we have to move 217 * up the stack until we don't have to change pages to move to the 218 * next page. If we have to change pages at an internal level, we 219 * have to fix the stack back up. 220 */ 221 while (h->pgno != c->pg.pgno) { 222 if ((prevpg = h->prevpg) == P_INVALID) 223 break; 224 mpool_put(t->bt_mp, h, 0); 225 226 /* Move up the stack. */ 227 for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { 228 /* Get the parent page. */ 229 if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 230 return (1); 231 232 /* Move to the next index. */ 233 if (parent->index != 0) { 234 idx = parent->index - 1; 235 BT_PUSH(t, h->pgno, idx); 236 break; 237 } 238 mpool_put(t->bt_mp, h, 0); 239 } 240 241 /* Restore the stack. */ 242 while (level--) { 243 /* Push the next level down onto the stack. */ 244 bi = GETBINTERNAL(h, idx); 245 pgno = bi->pgno; 246 247 /* Lose the currently pinned page. */ 248 mpool_put(t->bt_mp, h, 0); 249 250 /* Get the next level down. */ 251 if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) 252 return (1); 253 254 idx = NEXTINDEX(h) - 1; 255 BT_PUSH(t, pgno, idx); 256 } 257 mpool_put(t->bt_mp, h, 0); 258 if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) 259 return (1); 260 } 261 262 263 ret: mpool_put(t->bt_mp, h, 0); 264 return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); 265 } 266 267 /* 268 * __bt_bdelete -- 269 * Delete all key/data pairs matching the specified key. 270 * 271 * Parameters: 272 * t: tree 273 * key: key to delete 274 * 275 * Returns: 276 * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. 277 */ 278 static int 279 __bt_bdelete(BTREE *t, const DBT *key) 280 { 281 EPG *e; 282 PAGE *h; 283 int deleted, exact, redo; 284 285 deleted = 0; 286 287 /* Find any matching record; __bt_search pins the page. */ 288 loop: if ((e = __bt_search(t, key, &exact)) == NULL) 289 return (deleted ? RET_SUCCESS : RET_ERROR); 290 if (!exact) { 291 mpool_put(t->bt_mp, e->page, 0); 292 return (deleted ? RET_SUCCESS : RET_SPECIAL); 293 } 294 295 /* 296 * Delete forward, then delete backward, from the found key. If 297 * there are duplicates and we reach either side of the page, do 298 * the key search again, so that we get them all. 299 */ 300 redo = 0; 301 h = e->page; 302 do { 303 if (__bt_dleaf(t, key, h, e->index)) { 304 mpool_put(t->bt_mp, h, 0); 305 return (RET_ERROR); 306 } 307 if (F_ISSET(t, B_NODUPS)) { 308 if (NEXTINDEX(h) == 0) { 309 if (__bt_pdelete(t, h)) 310 return (RET_ERROR); 311 } else 312 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 313 return (RET_SUCCESS); 314 } 315 deleted = 1; 316 } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); 317 318 /* Check for right-hand edge of the page. */ 319 if (e->index == NEXTINDEX(h)) 320 redo = 1; 321 322 /* Delete from the key to the beginning of the page. */ 323 while (e->index-- > 0) { 324 if (__bt_cmp(t, key, e) != 0) 325 break; 326 if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) { 327 mpool_put(t->bt_mp, h, 0); 328 return (RET_ERROR); 329 } 330 if (e->index == 0) 331 redo = 1; 332 } 333 334 /* Check for an empty page. */ 335 if (NEXTINDEX(h) == 0) { 336 if (__bt_pdelete(t, h)) 337 return (RET_ERROR); 338 goto loop; 339 } 340 341 /* Put the page. */ 342 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 343 344 if (redo) 345 goto loop; 346 return (RET_SUCCESS); 347 } 348 349 /* 350 * __bt_pdelete -- 351 * Delete a single page from the tree. 352 * 353 * Parameters: 354 * t: tree 355 * h: leaf page 356 * 357 * Returns: 358 * RET_SUCCESS, RET_ERROR. 359 * 360 * Side-effects: 361 * mpool_put's the page 362 */ 363 static int 364 __bt_pdelete(BTREE *t, PAGE *h) 365 { 366 BINTERNAL *bi; 367 PAGE *pg; 368 EPGNO *parent; 369 indx_t cnt, idx, *ip, offset; 370 u_int32_t nksize; 371 char *from; 372 373 /* 374 * Walk the parent page stack -- a LIFO stack of the pages that were 375 * traversed when we searched for the page where the delete occurred. 376 * Each stack entry is a page number and a page index offset. The 377 * offset is for the page traversed on the search. We've just deleted 378 * a page, so we have to delete the key from the parent page. 379 * 380 * If the delete from the parent page makes it empty, this process may 381 * continue all the way up the tree. We stop if we reach the root page 382 * (which is never deleted, it's just not worth the effort) or if the 383 * delete does not empty the page. 384 */ 385 while ((parent = BT_POP(t)) != NULL) { 386 /* Get the parent page. */ 387 if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) 388 return (RET_ERROR); 389 390 idx = parent->index; 391 bi = GETBINTERNAL(pg, idx); 392 393 /* Free any overflow pages. */ 394 if (bi->flags & P_BIGKEY && 395 __ovfl_delete(t, bi->bytes) == RET_ERROR) { 396 mpool_put(t->bt_mp, pg, 0); 397 return (RET_ERROR); 398 } 399 400 /* 401 * Free the parent if it has only the one key and it's not the 402 * root page. If it's the rootpage, turn it back into an empty 403 * leaf page. 404 */ 405 if (NEXTINDEX(pg) == 1) { 406 if (pg->pgno == P_ROOT) { 407 pg->lower = BTDATAOFF; 408 pg->upper = t->bt_psize; 409 pg->flags = P_BLEAF; 410 } else { 411 if (__bt_relink(t, pg) || __bt_free(t, pg)) 412 return (RET_ERROR); 413 continue; 414 } 415 } else { 416 /* Pack remaining key items at the end of the page. */ 417 nksize = NBINTERNAL(bi->ksize); 418 from = (char *)pg + pg->upper; 419 memmove(from + nksize, from, (char *)bi - from); 420 pg->upper += nksize; 421 422 /* Adjust indices' offsets, shift the indices down. */ 423 offset = pg->linp[idx]; 424 for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip) 425 if (ip[0] < offset) 426 ip[0] += nksize; 427 for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip) 428 ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; 429 pg->lower -= sizeof(indx_t); 430 } 431 432 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 433 break; 434 } 435 436 /* Free the leaf page, as long as it wasn't the root. */ 437 if (h->pgno == P_ROOT) { 438 mpool_put(t->bt_mp, h, MPOOL_DIRTY); 439 return (RET_SUCCESS); 440 } 441 return (__bt_relink(t, h) || __bt_free(t, h)); 442 } 443 444 /* 445 * __bt_dleaf -- 446 * Delete a single record from a leaf page. 447 * 448 * Parameters: 449 * t: tree 450 * key: referenced key 451 * h: page 452 * idx: index on page to delete 453 * 454 * Returns: 455 * RET_SUCCESS, RET_ERROR. 456 */ 457 int 458 __bt_dleaf(BTREE *t, const DBT *key, PAGE *h, u_int idx) 459 { 460 BLEAF *bl; 461 indx_t cnt, *ip, offset; 462 u_int32_t nbytes; 463 void *to; 464 char *from; 465 466 /* If this record is referenced by the cursor, delete the cursor. */ 467 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 468 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 469 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx && 470 __bt_curdel(t, key, h, idx)) 471 return (RET_ERROR); 472 473 /* If the entry uses overflow pages, make them available for reuse. */ 474 to = bl = GETBLEAF(h, idx); 475 if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) 476 return (RET_ERROR); 477 if (bl->flags & P_BIGDATA && 478 __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) 479 return (RET_ERROR); 480 481 /* Pack the remaining key/data items at the end of the page. */ 482 nbytes = NBLEAF(bl); 483 from = (char *)h + h->upper; 484 memmove(from + nbytes, from, (char *)to - from); 485 h->upper += nbytes; 486 487 /* Adjust the indices' offsets, shift the indices down. */ 488 offset = h->linp[idx]; 489 for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip) 490 if (ip[0] < offset) 491 ip[0] += nbytes; 492 for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip) 493 ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; 494 h->lower -= sizeof(indx_t); 495 496 /* If the cursor is on this page, adjust it as necessary. */ 497 if (F_ISSET(&t->bt_cursor, CURS_INIT) && 498 !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && 499 t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx) 500 --t->bt_cursor.pg.index; 501 502 return (RET_SUCCESS); 503 } 504 505 /* 506 * __bt_curdel -- 507 * Delete the cursor. 508 * 509 * Parameters: 510 * t: tree 511 * key: referenced key (or NULL) 512 * h: page 513 * idx: index on page to delete 514 * 515 * Returns: 516 * RET_SUCCESS, RET_ERROR. 517 */ 518 static int 519 __bt_curdel(BTREE *t, const DBT *key, PAGE *h, u_int idx) 520 { 521 CURSOR *c; 522 EPG e; 523 PAGE *pg; 524 int curcopy, status; 525 526 /* 527 * If there are duplicates, move forward or backward to one. 528 * Otherwise, copy the key into the cursor area. 529 */ 530 c = &t->bt_cursor; 531 F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); 532 533 curcopy = 0; 534 if (!F_ISSET(t, B_NODUPS)) { 535 /* 536 * We're going to have to do comparisons. If we weren't 537 * provided a copy of the key, i.e. the user is deleting 538 * the current cursor position, get one. 539 */ 540 if (key == NULL) { 541 e.page = h; 542 e.index = idx; 543 if ((status = __bt_ret(t, &e, 544 &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) 545 return (status); 546 curcopy = 1; 547 key = &c->key; 548 } 549 /* Check previous key, if not at the beginning of the page. */ 550 if (idx > 0) { 551 e.page = h; 552 e.index = idx - 1; 553 if (__bt_cmp(t, key, &e) == 0) { 554 F_SET(c, CURS_BEFORE); 555 goto dup2; 556 } 557 } 558 /* Check next key, if not at the end of the page. */ 559 if (idx < NEXTINDEX(h) - 1) { 560 e.page = h; 561 e.index = idx + 1; 562 if (__bt_cmp(t, key, &e) == 0) { 563 F_SET(c, CURS_AFTER); 564 goto dup2; 565 } 566 } 567 /* Check previous key if at the beginning of the page. */ 568 if (idx == 0 && h->prevpg != P_INVALID) { 569 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 570 return (RET_ERROR); 571 e.page = pg; 572 e.index = NEXTINDEX(pg) - 1; 573 if (__bt_cmp(t, key, &e) == 0) { 574 F_SET(c, CURS_BEFORE); 575 goto dup1; 576 } 577 mpool_put(t->bt_mp, pg, 0); 578 } 579 /* Check next key if at the end of the page. */ 580 if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { 581 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 582 return (RET_ERROR); 583 e.page = pg; 584 e.index = 0; 585 if (__bt_cmp(t, key, &e) == 0) { 586 F_SET(c, CURS_AFTER); 587 dup1: mpool_put(t->bt_mp, pg, 0); 588 dup2: c->pg.pgno = e.page->pgno; 589 c->pg.index = e.index; 590 return (RET_SUCCESS); 591 } 592 mpool_put(t->bt_mp, pg, 0); 593 } 594 } 595 e.page = h; 596 e.index = idx; 597 if (curcopy || (status = 598 __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { 599 F_SET(c, CURS_ACQUIRE); 600 return (RET_SUCCESS); 601 } 602 return (status); 603 } 604 605 /* 606 * __bt_relink -- 607 * Link around a deleted page. 608 * 609 * Parameters: 610 * t: tree 611 * h: page to be deleted 612 */ 613 static int 614 __bt_relink(BTREE *t, PAGE *h) 615 { 616 PAGE *pg; 617 618 if (h->nextpg != P_INVALID) { 619 if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) 620 return (RET_ERROR); 621 pg->prevpg = h->prevpg; 622 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 623 } 624 if (h->prevpg != P_INVALID) { 625 if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) 626 return (RET_ERROR); 627 pg->nextpg = h->nextpg; 628 mpool_put(t->bt_mp, pg, MPOOL_DIRTY); 629 } 630 return (0); 631 } 632