xref: /freebsd/lib/libc/db/btree/bt_delete.c (revision f126890ac5386406dadf7c4cfa9566cbb56537c5)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1990, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Mike Olson.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #include <sys/types.h>
36 
37 #include <errno.h>
38 #include <stdio.h>
39 #include <string.h>
40 
41 #include <db.h>
42 #include "btree.h"
43 
44 static int __bt_bdelete(BTREE *, const DBT *);
45 static int __bt_curdel(BTREE *, const DBT *, PAGE *, u_int);
46 static int __bt_pdelete(BTREE *, PAGE *);
47 static int __bt_relink(BTREE *, PAGE *);
48 static int __bt_stkacq(BTREE *, PAGE **, CURSOR *);
49 
50 /*
51  * __bt_delete
52  *	Delete the item(s) referenced by a key.
53  *
54  * Return RET_SPECIAL if the key is not found.
55  */
56 int
57 __bt_delete(const DB *dbp, const DBT *key, u_int flags)
58 {
59 	BTREE *t;
60 	CURSOR *c;
61 	PAGE *h;
62 	int status;
63 
64 	t = dbp->internal;
65 
66 	/* Toss any page pinned across calls. */
67 	if (t->bt_pinned != NULL) {
68 		mpool_put(t->bt_mp, t->bt_pinned, 0);
69 		t->bt_pinned = NULL;
70 	}
71 
72 	/* Check for change to a read-only tree. */
73 	if (F_ISSET(t, B_RDONLY)) {
74 		errno = EPERM;
75 		return (RET_ERROR);
76 	}
77 
78 	switch (flags) {
79 	case 0:
80 		status = __bt_bdelete(t, key);
81 		break;
82 	case R_CURSOR:
83 		/*
84 		 * If flags is R_CURSOR, delete the cursor.  Must already
85 		 * have started a scan and not have already deleted it.
86 		 */
87 		c = &t->bt_cursor;
88 		if (F_ISSET(c, CURS_INIT)) {
89 			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
90 				return (RET_SPECIAL);
91 			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
92 				return (RET_ERROR);
93 
94 			/*
95 			 * If the page is about to be emptied, we'll need to
96 			 * delete it, which means we have to acquire a stack.
97 			 */
98 			if (NEXTINDEX(h) == 1)
99 				if (__bt_stkacq(t, &h, &t->bt_cursor))
100 					return (RET_ERROR);
101 
102 			status = __bt_dleaf(t, NULL, h, c->pg.index);
103 
104 			if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
105 				if (__bt_pdelete(t, h))
106 					return (RET_ERROR);
107 			} else
108 				mpool_put(t->bt_mp,
109 				    h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);
110 			break;
111 		}
112 		/* FALLTHROUGH */
113 	default:
114 		errno = EINVAL;
115 		return (RET_ERROR);
116 	}
117 	if (status == RET_SUCCESS)
118 		F_SET(t, B_MODIFIED);
119 	return (status);
120 }
121 
122 /*
123  * __bt_stkacq --
124  *	Acquire a stack so we can delete a cursor entry.
125  *
126  * Parameters:
127  *	  t:	tree
128  *	 hp:	pointer to current, pinned PAGE pointer
129  *	  c:	pointer to the cursor
130  *
131  * Returns:
132  *	0 on success, 1 on failure
133  */
134 static int
135 __bt_stkacq(BTREE *t, PAGE **hp, CURSOR *c)
136 {
137 	BINTERNAL *bi;
138 	EPG *e;
139 	EPGNO *parent;
140 	PAGE *h;
141 	indx_t idx;
142 	pgno_t pgno;
143 	recno_t nextpg, prevpg;
144 	int exact, level;
145 
146 	/*
147 	 * Find the first occurrence of the key in the tree.  Toss the
148 	 * currently locked page so we don't hit an already-locked page.
149 	 */
150 	h = *hp;
151 	mpool_put(t->bt_mp, h, 0);
152 	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
153 		return (1);
154 	h = e->page;
155 
156 	/* See if we got it in one shot. */
157 	if (h->pgno == c->pg.pgno)
158 		goto ret;
159 
160 	/*
161 	 * Move right, looking for the page.  At each move we have to move
162 	 * up the stack until we don't have to move to the next page.  If
163 	 * we have to change pages at an internal level, we have to fix the
164 	 * stack back up.
165 	 */
166 	while (h->pgno != c->pg.pgno) {
167 		if ((nextpg = h->nextpg) == P_INVALID)
168 			break;
169 		mpool_put(t->bt_mp, h, 0);
170 
171 		/* Move up the stack. */
172 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
173 			/* Get the parent page. */
174 			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
175 				return (1);
176 
177 			/* Move to the next index. */
178 			if (parent->index != NEXTINDEX(h) - 1) {
179 				idx = parent->index + 1;
180 				BT_PUSH(t, h->pgno, idx);
181 				break;
182 			}
183 			mpool_put(t->bt_mp, h, 0);
184 		}
185 
186 		/* Restore the stack. */
187 		while (level--) {
188 			/* Push the next level down onto the stack. */
189 			bi = GETBINTERNAL(h, idx);
190 			pgno = bi->pgno;
191 			BT_PUSH(t, pgno, 0);
192 
193 			/* Lose the currently pinned page. */
194 			mpool_put(t->bt_mp, h, 0);
195 
196 			/* Get the next level down. */
197 			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
198 				return (1);
199 			idx = 0;
200 		}
201 		mpool_put(t->bt_mp, h, 0);
202 		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
203 			return (1);
204 	}
205 
206 	if (h->pgno == c->pg.pgno)
207 		goto ret;
208 
209 	/* Reacquire the original stack. */
210 	mpool_put(t->bt_mp, h, 0);
211 	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
212 		return (1);
213 	h = e->page;
214 
215 	/*
216 	 * Move left, looking for the page.  At each move we have to move
217 	 * up the stack until we don't have to change pages to move to the
218 	 * next page.  If we have to change pages at an internal level, we
219 	 * have to fix the stack back up.
220 	 */
221 	while (h->pgno != c->pg.pgno) {
222 		if ((prevpg = h->prevpg) == P_INVALID)
223 			break;
224 		mpool_put(t->bt_mp, h, 0);
225 
226 		/* Move up the stack. */
227 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
228 			/* Get the parent page. */
229 			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
230 				return (1);
231 
232 			/* Move to the next index. */
233 			if (parent->index != 0) {
234 				idx = parent->index - 1;
235 				BT_PUSH(t, h->pgno, idx);
236 				break;
237 			}
238 			mpool_put(t->bt_mp, h, 0);
239 		}
240 
241 		/* Restore the stack. */
242 		while (level--) {
243 			/* Push the next level down onto the stack. */
244 			bi = GETBINTERNAL(h, idx);
245 			pgno = bi->pgno;
246 
247 			/* Lose the currently pinned page. */
248 			mpool_put(t->bt_mp, h, 0);
249 
250 			/* Get the next level down. */
251 			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
252 				return (1);
253 
254 			idx = NEXTINDEX(h) - 1;
255 			BT_PUSH(t, pgno, idx);
256 		}
257 		mpool_put(t->bt_mp, h, 0);
258 		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
259 			return (1);
260 	}
261 
262 
263 ret:	mpool_put(t->bt_mp, h, 0);
264 	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
265 }
266 
267 /*
268  * __bt_bdelete --
269  *	Delete all key/data pairs matching the specified key.
270  *
271  * Parameters:
272  *	  t:	tree
273  *	key:	key to delete
274  *
275  * Returns:
276  *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
277  */
278 static int
279 __bt_bdelete(BTREE *t, const DBT *key)
280 {
281 	EPG *e;
282 	PAGE *h;
283 	int deleted, exact, redo;
284 
285 	deleted = 0;
286 
287 	/* Find any matching record; __bt_search pins the page. */
288 loop:	if ((e = __bt_search(t, key, &exact)) == NULL)
289 		return (deleted ? RET_SUCCESS : RET_ERROR);
290 	if (!exact) {
291 		mpool_put(t->bt_mp, e->page, 0);
292 		return (deleted ? RET_SUCCESS : RET_SPECIAL);
293 	}
294 
295 	/*
296 	 * Delete forward, then delete backward, from the found key.  If
297 	 * there are duplicates and we reach either side of the page, do
298 	 * the key search again, so that we get them all.
299 	 */
300 	redo = 0;
301 	h = e->page;
302 	do {
303 		if (__bt_dleaf(t, key, h, e->index)) {
304 			mpool_put(t->bt_mp, h, 0);
305 			return (RET_ERROR);
306 		}
307 		if (F_ISSET(t, B_NODUPS)) {
308 			if (NEXTINDEX(h) == 0) {
309 				if (__bt_pdelete(t, h))
310 					return (RET_ERROR);
311 			} else
312 				mpool_put(t->bt_mp, h, MPOOL_DIRTY);
313 			return (RET_SUCCESS);
314 		}
315 		deleted = 1;
316 	} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
317 
318 	/* Check for right-hand edge of the page. */
319 	if (e->index == NEXTINDEX(h))
320 		redo = 1;
321 
322 	/* Delete from the key to the beginning of the page. */
323 	while (e->index-- > 0) {
324 		if (__bt_cmp(t, key, e) != 0)
325 			break;
326 		if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
327 			mpool_put(t->bt_mp, h, 0);
328 			return (RET_ERROR);
329 		}
330 		if (e->index == 0)
331 			redo = 1;
332 	}
333 
334 	/* Check for an empty page. */
335 	if (NEXTINDEX(h) == 0) {
336 		if (__bt_pdelete(t, h))
337 			return (RET_ERROR);
338 		goto loop;
339 	}
340 
341 	/* Put the page. */
342 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
343 
344 	if (redo)
345 		goto loop;
346 	return (RET_SUCCESS);
347 }
348 
349 /*
350  * __bt_pdelete --
351  *	Delete a single page from the tree.
352  *
353  * Parameters:
354  *	t:	tree
355  *	h:	leaf page
356  *
357  * Returns:
358  *	RET_SUCCESS, RET_ERROR.
359  *
360  * Side-effects:
361  *	mpool_put's the page
362  */
363 static int
364 __bt_pdelete(BTREE *t, PAGE *h)
365 {
366 	BINTERNAL *bi;
367 	PAGE *pg;
368 	EPGNO *parent;
369 	indx_t cnt, idx, *ip, offset;
370 	u_int32_t nksize;
371 	char *from;
372 
373 	/*
374 	 * Walk the parent page stack -- a LIFO stack of the pages that were
375 	 * traversed when we searched for the page where the delete occurred.
376 	 * Each stack entry is a page number and a page index offset.  The
377 	 * offset is for the page traversed on the search.  We've just deleted
378 	 * a page, so we have to delete the key from the parent page.
379 	 *
380 	 * If the delete from the parent page makes it empty, this process may
381 	 * continue all the way up the tree.  We stop if we reach the root page
382 	 * (which is never deleted, it's just not worth the effort) or if the
383 	 * delete does not empty the page.
384 	 */
385 	while ((parent = BT_POP(t)) != NULL) {
386 		/* Get the parent page. */
387 		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
388 			return (RET_ERROR);
389 
390 		idx = parent->index;
391 		bi = GETBINTERNAL(pg, idx);
392 
393 		/* Free any overflow pages. */
394 		if (bi->flags & P_BIGKEY &&
395 		    __ovfl_delete(t, bi->bytes) == RET_ERROR) {
396 			mpool_put(t->bt_mp, pg, 0);
397 			return (RET_ERROR);
398 		}
399 
400 		/*
401 		 * Free the parent if it has only the one key and it's not the
402 		 * root page. If it's the rootpage, turn it back into an empty
403 		 * leaf page.
404 		 */
405 		if (NEXTINDEX(pg) == 1) {
406 			if (pg->pgno == P_ROOT) {
407 				pg->lower = BTDATAOFF;
408 				pg->upper = t->bt_psize;
409 				pg->flags = P_BLEAF;
410 			} else {
411 				if (__bt_relink(t, pg) || __bt_free(t, pg))
412 					return (RET_ERROR);
413 				continue;
414 			}
415 		} else {
416 			/* Pack remaining key items at the end of the page. */
417 			nksize = NBINTERNAL(bi->ksize);
418 			from = (char *)pg + pg->upper;
419 			memmove(from + nksize, from, (char *)bi - from);
420 			pg->upper += nksize;
421 
422 			/* Adjust indices' offsets, shift the indices down. */
423 			offset = pg->linp[idx];
424 			for (cnt = idx, ip = &pg->linp[0]; cnt--; ++ip)
425 				if (ip[0] < offset)
426 					ip[0] += nksize;
427 			for (cnt = NEXTINDEX(pg) - idx; --cnt; ++ip)
428 				ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
429 			pg->lower -= sizeof(indx_t);
430 		}
431 
432 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
433 		break;
434 	}
435 
436 	/* Free the leaf page, as long as it wasn't the root. */
437 	if (h->pgno == P_ROOT) {
438 		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
439 		return (RET_SUCCESS);
440 	}
441 	return (__bt_relink(t, h) || __bt_free(t, h));
442 }
443 
444 /*
445  * __bt_dleaf --
446  *	Delete a single record from a leaf page.
447  *
448  * Parameters:
449  *	t:	tree
450  *    key:	referenced key
451  *	h:	page
452  *	idx:	index on page to delete
453  *
454  * Returns:
455  *	RET_SUCCESS, RET_ERROR.
456  */
457 int
458 __bt_dleaf(BTREE *t, const DBT *key, PAGE *h, u_int idx)
459 {
460 	BLEAF *bl;
461 	indx_t cnt, *ip, offset;
462 	u_int32_t nbytes;
463 	void *to;
464 	char *from;
465 
466 	/* If this record is referenced by the cursor, delete the cursor. */
467 	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
468 	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
469 	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == idx &&
470 	    __bt_curdel(t, key, h, idx))
471 		return (RET_ERROR);
472 
473 	/* If the entry uses overflow pages, make them available for reuse. */
474 	to = bl = GETBLEAF(h, idx);
475 	if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
476 		return (RET_ERROR);
477 	if (bl->flags & P_BIGDATA &&
478 	    __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
479 		return (RET_ERROR);
480 
481 	/* Pack the remaining key/data items at the end of the page. */
482 	nbytes = NBLEAF(bl);
483 	from = (char *)h + h->upper;
484 	memmove(from + nbytes, from, (char *)to - from);
485 	h->upper += nbytes;
486 
487 	/* Adjust the indices' offsets, shift the indices down. */
488 	offset = h->linp[idx];
489 	for (cnt = idx, ip = &h->linp[0]; cnt--; ++ip)
490 		if (ip[0] < offset)
491 			ip[0] += nbytes;
492 	for (cnt = NEXTINDEX(h) - idx; --cnt; ++ip)
493 		ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
494 	h->lower -= sizeof(indx_t);
495 
496 	/* If the cursor is on this page, adjust it as necessary. */
497 	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
498 	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
499 	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > idx)
500 		--t->bt_cursor.pg.index;
501 
502 	return (RET_SUCCESS);
503 }
504 
505 /*
506  * __bt_curdel --
507  *	Delete the cursor.
508  *
509  * Parameters:
510  *	t:	tree
511  *    key:	referenced key (or NULL)
512  *	h:	page
513  *    idx:	index on page to delete
514  *
515  * Returns:
516  *	RET_SUCCESS, RET_ERROR.
517  */
518 static int
519 __bt_curdel(BTREE *t, const DBT *key, PAGE *h, u_int idx)
520 {
521 	CURSOR *c;
522 	EPG e;
523 	PAGE *pg;
524 	int curcopy, status;
525 
526 	/*
527 	 * If there are duplicates, move forward or backward to one.
528 	 * Otherwise, copy the key into the cursor area.
529 	 */
530 	c = &t->bt_cursor;
531 	F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);
532 
533 	curcopy = 0;
534 	if (!F_ISSET(t, B_NODUPS)) {
535 		/*
536 		 * We're going to have to do comparisons.  If we weren't
537 		 * provided a copy of the key, i.e. the user is deleting
538 		 * the current cursor position, get one.
539 		 */
540 		if (key == NULL) {
541 			e.page = h;
542 			e.index = idx;
543 			if ((status = __bt_ret(t, &e,
544 			    &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
545 				return (status);
546 			curcopy = 1;
547 			key = &c->key;
548 		}
549 		/* Check previous key, if not at the beginning of the page. */
550 		if (idx > 0) {
551 			e.page = h;
552 			e.index = idx - 1;
553 			if (__bt_cmp(t, key, &e) == 0) {
554 				F_SET(c, CURS_BEFORE);
555 				goto dup2;
556 			}
557 		}
558 		/* Check next key, if not at the end of the page. */
559 		if (idx < NEXTINDEX(h) - 1) {
560 			e.page = h;
561 			e.index = idx + 1;
562 			if (__bt_cmp(t, key, &e) == 0) {
563 				F_SET(c, CURS_AFTER);
564 				goto dup2;
565 			}
566 		}
567 		/* Check previous key if at the beginning of the page. */
568 		if (idx == 0 && h->prevpg != P_INVALID) {
569 			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
570 				return (RET_ERROR);
571 			e.page = pg;
572 			e.index = NEXTINDEX(pg) - 1;
573 			if (__bt_cmp(t, key, &e) == 0) {
574 				F_SET(c, CURS_BEFORE);
575 				goto dup1;
576 			}
577 			mpool_put(t->bt_mp, pg, 0);
578 		}
579 		/* Check next key if at the end of the page. */
580 		if (idx == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
581 			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
582 				return (RET_ERROR);
583 			e.page = pg;
584 			e.index = 0;
585 			if (__bt_cmp(t, key, &e) == 0) {
586 				F_SET(c, CURS_AFTER);
587 dup1:				mpool_put(t->bt_mp, pg, 0);
588 dup2:				c->pg.pgno = e.page->pgno;
589 				c->pg.index = e.index;
590 				return (RET_SUCCESS);
591 			}
592 			mpool_put(t->bt_mp, pg, 0);
593 		}
594 	}
595 	e.page = h;
596 	e.index = idx;
597 	if (curcopy || (status =
598 	    __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
599 		F_SET(c, CURS_ACQUIRE);
600 		return (RET_SUCCESS);
601 	}
602 	return (status);
603 }
604 
605 /*
606  * __bt_relink --
607  *	Link around a deleted page.
608  *
609  * Parameters:
610  *	t:	tree
611  *	h:	page to be deleted
612  */
613 static int
614 __bt_relink(BTREE *t, PAGE *h)
615 {
616 	PAGE *pg;
617 
618 	if (h->nextpg != P_INVALID) {
619 		if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
620 			return (RET_ERROR);
621 		pg->prevpg = h->prevpg;
622 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
623 	}
624 	if (h->prevpg != P_INVALID) {
625 		if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
626 			return (RET_ERROR);
627 		pg->nextpg = h->nextpg;
628 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
629 	}
630 	return (0);
631 }
632