xref: /freebsd/lib/libc/db/btree/bt_delete.c (revision 1e413cf93298b5b97441a21d9a50fdcd0ee9945e)
1 /*-
2  * Copyright (c) 1990, 1993, 1994
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * This code is derived from software contributed to Berkeley by
6  * Mike Olson.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 4. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #if defined(LIBC_SCCS) && !defined(lint)
34 static char sccsid[] = "@(#)bt_delete.c	8.13 (Berkeley) 7/28/94";
35 #endif /* LIBC_SCCS and not lint */
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include <sys/types.h>
40 
41 #include <errno.h>
42 #include <stdio.h>
43 #include <string.h>
44 
45 #include <db.h>
46 #include "btree.h"
47 
48 static int __bt_bdelete(BTREE *, const DBT *);
49 static int __bt_curdel(BTREE *, const DBT *, PAGE *, u_int);
50 static int __bt_pdelete(BTREE *, PAGE *);
51 static int __bt_relink(BTREE *, PAGE *);
52 static int __bt_stkacq(BTREE *, PAGE **, CURSOR *);
53 
54 /*
55  * __bt_delete
56  *	Delete the item(s) referenced by a key.
57  *
58  * Return RET_SPECIAL if the key is not found.
59  */
60 int
61 __bt_delete(dbp, key, flags)
62 	const DB *dbp;
63 	const DBT *key;
64 	u_int flags;
65 {
66 	BTREE *t;
67 	CURSOR *c;
68 	PAGE *h;
69 	int status;
70 
71 	t = dbp->internal;
72 
73 	/* Toss any page pinned across calls. */
74 	if (t->bt_pinned != NULL) {
75 		mpool_put(t->bt_mp, t->bt_pinned, 0);
76 		t->bt_pinned = NULL;
77 	}
78 
79 	/* Check for change to a read-only tree. */
80 	if (F_ISSET(t, B_RDONLY)) {
81 		errno = EPERM;
82 		return (RET_ERROR);
83 	}
84 
85 	switch (flags) {
86 	case 0:
87 		status = __bt_bdelete(t, key);
88 		break;
89 	case R_CURSOR:
90 		/*
91 		 * If flags is R_CURSOR, delete the cursor.  Must already
92 		 * have started a scan and not have already deleted it.
93 		 */
94 		c = &t->bt_cursor;
95 		if (F_ISSET(c, CURS_INIT)) {
96 			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
97 				return (RET_SPECIAL);
98 			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
99 				return (RET_ERROR);
100 
101 			/*
102 			 * If the page is about to be emptied, we'll need to
103 			 * delete it, which means we have to acquire a stack.
104 			 */
105 			if (NEXTINDEX(h) == 1)
106 				if (__bt_stkacq(t, &h, &t->bt_cursor))
107 					return (RET_ERROR);
108 
109 			status = __bt_dleaf(t, NULL, h, c->pg.index);
110 
111 			if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
112 				if (__bt_pdelete(t, h))
113 					return (RET_ERROR);
114 			} else
115 				mpool_put(t->bt_mp,
116 				    h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);
117 			break;
118 		}
119 		/* FALLTHROUGH */
120 	default:
121 		errno = EINVAL;
122 		return (RET_ERROR);
123 	}
124 	if (status == RET_SUCCESS)
125 		F_SET(t, B_MODIFIED);
126 	return (status);
127 }
128 
129 /*
130  * __bt_stkacq --
131  *	Acquire a stack so we can delete a cursor entry.
132  *
133  * Parameters:
134  *	  t:	tree
135  *	 hp:	pointer to current, pinned PAGE pointer
136  *	  c:	pointer to the cursor
137  *
138  * Returns:
139  *	0 on success, 1 on failure
140  */
141 static int
142 __bt_stkacq(t, hp, c)
143 	BTREE *t;
144 	PAGE **hp;
145 	CURSOR *c;
146 {
147 	BINTERNAL *bi;
148 	EPG *e;
149 	EPGNO *parent;
150 	PAGE *h;
151 	indx_t index;
152 	pgno_t pgno;
153 	recno_t nextpg, prevpg;
154 	int exact, level;
155 
156 	/*
157 	 * Find the first occurrence of the key in the tree.  Toss the
158 	 * currently locked page so we don't hit an already-locked page.
159 	 */
160 	h = *hp;
161 	mpool_put(t->bt_mp, h, 0);
162 	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
163 		return (1);
164 	h = e->page;
165 
166 	/* See if we got it in one shot. */
167 	if (h->pgno == c->pg.pgno)
168 		goto ret;
169 
170 	/*
171 	 * Move right, looking for the page.  At each move we have to move
172 	 * up the stack until we don't have to move to the next page.  If
173 	 * we have to change pages at an internal level, we have to fix the
174 	 * stack back up.
175 	 */
176 	while (h->pgno != c->pg.pgno) {
177 		if ((nextpg = h->nextpg) == P_INVALID)
178 			break;
179 		mpool_put(t->bt_mp, h, 0);
180 
181 		/* Move up the stack. */
182 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
183 			/* Get the parent page. */
184 			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
185 				return (1);
186 
187 			/* Move to the next index. */
188 			if (parent->index != NEXTINDEX(h) - 1) {
189 				index = parent->index + 1;
190 				BT_PUSH(t, h->pgno, index);
191 				break;
192 			}
193 			mpool_put(t->bt_mp, h, 0);
194 		}
195 
196 		/* Restore the stack. */
197 		while (level--) {
198 			/* Push the next level down onto the stack. */
199 			bi = GETBINTERNAL(h, index);
200 			pgno = bi->pgno;
201 			BT_PUSH(t, pgno, 0);
202 
203 			/* Lose the currently pinned page. */
204 			mpool_put(t->bt_mp, h, 0);
205 
206 			/* Get the next level down. */
207 			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
208 				return (1);
209 			index = 0;
210 		}
211 		mpool_put(t->bt_mp, h, 0);
212 		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
213 			return (1);
214 	}
215 
216 	if (h->pgno == c->pg.pgno)
217 		goto ret;
218 
219 	/* Reacquire the original stack. */
220 	mpool_put(t->bt_mp, h, 0);
221 	if ((e = __bt_search(t, &c->key, &exact)) == NULL)
222 		return (1);
223 	h = e->page;
224 
225 	/*
226 	 * Move left, looking for the page.  At each move we have to move
227 	 * up the stack until we don't have to change pages to move to the
228 	 * next page.  If we have to change pages at an internal level, we
229 	 * have to fix the stack back up.
230 	 */
231 	while (h->pgno != c->pg.pgno) {
232 		if ((prevpg = h->prevpg) == P_INVALID)
233 			break;
234 		mpool_put(t->bt_mp, h, 0);
235 
236 		/* Move up the stack. */
237 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
238 			/* Get the parent page. */
239 			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
240 				return (1);
241 
242 			/* Move to the next index. */
243 			if (parent->index != 0) {
244 				index = parent->index - 1;
245 				BT_PUSH(t, h->pgno, index);
246 				break;
247 			}
248 			mpool_put(t->bt_mp, h, 0);
249 		}
250 
251 		/* Restore the stack. */
252 		while (level--) {
253 			/* Push the next level down onto the stack. */
254 			bi = GETBINTERNAL(h, index);
255 			pgno = bi->pgno;
256 
257 			/* Lose the currently pinned page. */
258 			mpool_put(t->bt_mp, h, 0);
259 
260 			/* Get the next level down. */
261 			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
262 				return (1);
263 
264 			index = NEXTINDEX(h) - 1;
265 			BT_PUSH(t, pgno, index);
266 		}
267 		mpool_put(t->bt_mp, h, 0);
268 		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
269 			return (1);
270 	}
271 
272 
273 ret:	mpool_put(t->bt_mp, h, 0);
274 	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
275 }
276 
277 /*
278  * __bt_bdelete --
279  *	Delete all key/data pairs matching the specified key.
280  *
281  * Parameters:
282  *	  t:	tree
283  *	key:	key to delete
284  *
285  * Returns:
286  *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
287  */
288 static int
289 __bt_bdelete(t, key)
290 	BTREE *t;
291 	const DBT *key;
292 {
293 	EPG *e;
294 	PAGE *h;
295 	int deleted, exact, redo;
296 
297 	deleted = 0;
298 
299 	/* Find any matching record; __bt_search pins the page. */
300 loop:	if ((e = __bt_search(t, key, &exact)) == NULL)
301 		return (deleted ? RET_SUCCESS : RET_ERROR);
302 	if (!exact) {
303 		mpool_put(t->bt_mp, e->page, 0);
304 		return (deleted ? RET_SUCCESS : RET_SPECIAL);
305 	}
306 
307 	/*
308 	 * Delete forward, then delete backward, from the found key.  If
309 	 * there are duplicates and we reach either side of the page, do
310 	 * the key search again, so that we get them all.
311 	 */
312 	redo = 0;
313 	h = e->page;
314 	do {
315 		if (__bt_dleaf(t, key, h, e->index)) {
316 			mpool_put(t->bt_mp, h, 0);
317 			return (RET_ERROR);
318 		}
319 		if (F_ISSET(t, B_NODUPS)) {
320 			if (NEXTINDEX(h) == 0) {
321 				if (__bt_pdelete(t, h))
322 					return (RET_ERROR);
323 			} else
324 				mpool_put(t->bt_mp, h, MPOOL_DIRTY);
325 			return (RET_SUCCESS);
326 		}
327 		deleted = 1;
328 	} while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
329 
330 	/* Check for right-hand edge of the page. */
331 	if (e->index == NEXTINDEX(h))
332 		redo = 1;
333 
334 	/* Delete from the key to the beginning of the page. */
335 	while (e->index-- > 0) {
336 		if (__bt_cmp(t, key, e) != 0)
337 			break;
338 		if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
339 			mpool_put(t->bt_mp, h, 0);
340 			return (RET_ERROR);
341 		}
342 		if (e->index == 0)
343 			redo = 1;
344 	}
345 
346 	/* Check for an empty page. */
347 	if (NEXTINDEX(h) == 0) {
348 		if (__bt_pdelete(t, h))
349 			return (RET_ERROR);
350 		goto loop;
351 	}
352 
353 	/* Put the page. */
354 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);
355 
356 	if (redo)
357 		goto loop;
358 	return (RET_SUCCESS);
359 }
360 
361 /*
362  * __bt_pdelete --
363  *	Delete a single page from the tree.
364  *
365  * Parameters:
366  *	t:	tree
367  *	h:	leaf page
368  *
369  * Returns:
370  *	RET_SUCCESS, RET_ERROR.
371  *
372  * Side-effects:
373  *	mpool_put's the page
374  */
375 static int
376 __bt_pdelete(t, h)
377 	BTREE *t;
378 	PAGE *h;
379 {
380 	BINTERNAL *bi;
381 	PAGE *pg;
382 	EPGNO *parent;
383 	indx_t cnt, index, *ip, offset;
384 	u_int32_t nksize;
385 	char *from;
386 
387 	/*
388 	 * Walk the parent page stack -- a LIFO stack of the pages that were
389 	 * traversed when we searched for the page where the delete occurred.
390 	 * Each stack entry is a page number and a page index offset.  The
391 	 * offset is for the page traversed on the search.  We've just deleted
392 	 * a page, so we have to delete the key from the parent page.
393 	 *
394 	 * If the delete from the parent page makes it empty, this process may
395 	 * continue all the way up the tree.  We stop if we reach the root page
396 	 * (which is never deleted, it's just not worth the effort) or if the
397 	 * delete does not empty the page.
398 	 */
399 	while ((parent = BT_POP(t)) != NULL) {
400 		/* Get the parent page. */
401 		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
402 			return (RET_ERROR);
403 
404 		index = parent->index;
405 		bi = GETBINTERNAL(pg, index);
406 
407 		/* Free any overflow pages. */
408 		if (bi->flags & P_BIGKEY &&
409 		    __ovfl_delete(t, bi->bytes) == RET_ERROR) {
410 			mpool_put(t->bt_mp, pg, 0);
411 			return (RET_ERROR);
412 		}
413 
414 		/*
415 		 * Free the parent if it has only the one key and it's not the
416 		 * root page. If it's the rootpage, turn it back into an empty
417 		 * leaf page.
418 		 */
419 		if (NEXTINDEX(pg) == 1)
420 			if (pg->pgno == P_ROOT) {
421 				pg->lower = BTDATAOFF;
422 				pg->upper = t->bt_psize;
423 				pg->flags = P_BLEAF;
424 			} else {
425 				if (__bt_relink(t, pg) || __bt_free(t, pg))
426 					return (RET_ERROR);
427 				continue;
428 			}
429 		else {
430 			/* Pack remaining key items at the end of the page. */
431 			nksize = NBINTERNAL(bi->ksize);
432 			from = (char *)pg + pg->upper;
433 			memmove(from + nksize, from, (char *)bi - from);
434 			pg->upper += nksize;
435 
436 			/* Adjust indices' offsets, shift the indices down. */
437 			offset = pg->linp[index];
438 			for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip)
439 				if (ip[0] < offset)
440 					ip[0] += nksize;
441 			for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip)
442 				ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
443 			pg->lower -= sizeof(indx_t);
444 		}
445 
446 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
447 		break;
448 	}
449 
450 	/* Free the leaf page, as long as it wasn't the root. */
451 	if (h->pgno == P_ROOT) {
452 		mpool_put(t->bt_mp, h, MPOOL_DIRTY);
453 		return (RET_SUCCESS);
454 	}
455 	return (__bt_relink(t, h) || __bt_free(t, h));
456 }
457 
458 /*
459  * __bt_dleaf --
460  *	Delete a single record from a leaf page.
461  *
462  * Parameters:
463  *	t:	tree
464  *    key:	referenced key
465  *	h:	page
466  *	index:	index on page to delete
467  *
468  * Returns:
469  *	RET_SUCCESS, RET_ERROR.
470  */
471 int
472 __bt_dleaf(t, key, h, index)
473 	BTREE *t;
474 	const DBT *key;
475 	PAGE *h;
476 	u_int index;
477 {
478 	BLEAF *bl;
479 	indx_t cnt, *ip, offset;
480 	u_int32_t nbytes;
481 	void *to;
482 	char *from;
483 
484 	/* If this record is referenced by the cursor, delete the cursor. */
485 	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
486 	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
487 	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index &&
488 	    __bt_curdel(t, key, h, index))
489 		return (RET_ERROR);
490 
491 	/* If the entry uses overflow pages, make them available for reuse. */
492 	to = bl = GETBLEAF(h, index);
493 	if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
494 		return (RET_ERROR);
495 	if (bl->flags & P_BIGDATA &&
496 	    __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
497 		return (RET_ERROR);
498 
499 	/* Pack the remaining key/data items at the end of the page. */
500 	nbytes = NBLEAF(bl);
501 	from = (char *)h + h->upper;
502 	memmove(from + nbytes, from, (char *)to - from);
503 	h->upper += nbytes;
504 
505 	/* Adjust the indices' offsets, shift the indices down. */
506 	offset = h->linp[index];
507 	for (cnt = index, ip = &h->linp[0]; cnt--; ++ip)
508 		if (ip[0] < offset)
509 			ip[0] += nbytes;
510 	for (cnt = NEXTINDEX(h) - index; --cnt; ++ip)
511 		ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
512 	h->lower -= sizeof(indx_t);
513 
514 	/* If the cursor is on this page, adjust it as necessary. */
515 	if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
516 	    !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
517 	    t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index)
518 		--t->bt_cursor.pg.index;
519 
520 	return (RET_SUCCESS);
521 }
522 
523 /*
524  * __bt_curdel --
525  *	Delete the cursor.
526  *
527  * Parameters:
528  *	t:	tree
529  *    key:	referenced key (or NULL)
530  *	h:	page
531  *  index:	index on page to delete
532  *
533  * Returns:
534  *	RET_SUCCESS, RET_ERROR.
535  */
536 static int
537 __bt_curdel(t, key, h, index)
538 	BTREE *t;
539 	const DBT *key;
540 	PAGE *h;
541 	u_int index;
542 {
543 	CURSOR *c;
544 	EPG e;
545 	PAGE *pg;
546 	int curcopy, status;
547 
548 	/*
549 	 * If there are duplicates, move forward or backward to one.
550 	 * Otherwise, copy the key into the cursor area.
551 	 */
552 	c = &t->bt_cursor;
553 	F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);
554 
555 	curcopy = 0;
556 	if (!F_ISSET(t, B_NODUPS)) {
557 		/*
558 		 * We're going to have to do comparisons.  If we weren't
559 		 * provided a copy of the key, i.e. the user is deleting
560 		 * the current cursor position, get one.
561 		 */
562 		if (key == NULL) {
563 			e.page = h;
564 			e.index = index;
565 			if ((status = __bt_ret(t, &e,
566 			    &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
567 				return (status);
568 			curcopy = 1;
569 			key = &c->key;
570 		}
571 		/* Check previous key, if not at the beginning of the page. */
572 		if (index > 0) {
573 			e.page = h;
574 			e.index = index - 1;
575 			if (__bt_cmp(t, key, &e) == 0) {
576 				F_SET(c, CURS_BEFORE);
577 				goto dup2;
578 			}
579 		}
580 		/* Check next key, if not at the end of the page. */
581 		if (index < NEXTINDEX(h) - 1) {
582 			e.page = h;
583 			e.index = index + 1;
584 			if (__bt_cmp(t, key, &e) == 0) {
585 				F_SET(c, CURS_AFTER);
586 				goto dup2;
587 			}
588 		}
589 		/* Check previous key if at the beginning of the page. */
590 		if (index == 0 && h->prevpg != P_INVALID) {
591 			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
592 				return (RET_ERROR);
593 			e.page = pg;
594 			e.index = NEXTINDEX(pg) - 1;
595 			if (__bt_cmp(t, key, &e) == 0) {
596 				F_SET(c, CURS_BEFORE);
597 				goto dup1;
598 			}
599 			mpool_put(t->bt_mp, pg, 0);
600 		}
601 		/* Check next key if at the end of the page. */
602 		if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
603 			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
604 				return (RET_ERROR);
605 			e.page = pg;
606 			e.index = 0;
607 			if (__bt_cmp(t, key, &e) == 0) {
608 				F_SET(c, CURS_AFTER);
609 dup1:				mpool_put(t->bt_mp, pg, 0);
610 dup2:				c->pg.pgno = e.page->pgno;
611 				c->pg.index = e.index;
612 				return (RET_SUCCESS);
613 			}
614 			mpool_put(t->bt_mp, pg, 0);
615 		}
616 	}
617 	e.page = h;
618 	e.index = index;
619 	if (curcopy || (status =
620 	    __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
621 		F_SET(c, CURS_ACQUIRE);
622 		return (RET_SUCCESS);
623 	}
624 	return (status);
625 }
626 
627 /*
628  * __bt_relink --
629  *	Link around a deleted page.
630  *
631  * Parameters:
632  *	t:	tree
633  *	h:	page to be deleted
634  */
635 static int
636 __bt_relink(t, h)
637 	BTREE *t;
638 	PAGE *h;
639 {
640 	PAGE *pg;
641 
642 	if (h->nextpg != P_INVALID) {
643 		if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
644 			return (RET_ERROR);
645 		pg->prevpg = h->prevpg;
646 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
647 	}
648 	if (h->prevpg != P_INVALID) {
649 		if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
650 			return (RET_ERROR);
651 		pg->nextpg = h->nextpg;
652 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
653 	}
654 	return (0);
655 }
656