xref: /illumos-gate/usr/src/cmd/sendmail/db/btree/bt_cursor.c (revision 533affcbc7fc4d0c8132976ea454aaa715fe2307)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 
8 #include "config.h"
9 
10 #ifndef lint
11 static const char sccsid[] = "@(#)bt_cursor.c	10.81 (Sleepycat) 12/16/98";
12 #endif /* not lint */
13 
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
16 
17 #include <errno.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #endif
21 
22 #include "db_int.h"
23 #include "db_page.h"
24 #include "btree.h"
25 #include "shqueue.h"
26 #include "db_shash.h"
27 #include "lock.h"
28 #include "lock_ext.h"
29 
30 static int __bam_c_close __P((DBC *));
31 static int __bam_c_del __P((DBC *, u_int32_t));
32 static int __bam_c_destroy __P((DBC *));
33 static int __bam_c_first __P((DBC *, CURSOR *));
34 static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
35 static int __bam_c_getstack __P((DBC *, CURSOR *));
36 static int __bam_c_last __P((DBC *, CURSOR *));
37 static int __bam_c_next __P((DBC *, CURSOR *, int));
38 static int __bam_c_physdel __P((DBC *, CURSOR *, PAGE *));
39 static int __bam_c_prev __P((DBC *, CURSOR *));
40 static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
41 static void __bam_c_reset __P((CURSOR *));
42 static int __bam_c_rget __P((DBC *, DBT *, u_int32_t));
43 static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *));
44 static int __bam_dsearch __P((DBC *, CURSOR *,  DBT *, u_int32_t *));
45 
46 /* Discard the current page/lock held by a cursor. */
47 #undef	DISCARD
48 #define	DISCARD(dbc, cp) {						\
49 	if ((cp)->page != NULL) {					\
50 		(void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0);	\
51 		(cp)->page = NULL;					\
52 	}								\
53 	if ((cp)->lock != LOCK_INVALID) {				\
54 		(void)__BT_TLPUT((dbc), (cp)->lock);			\
55 		(cp)->lock = LOCK_INVALID;				\
56 	}								\
57 }
58 
59 /* If the cursor references a deleted record. */
60 #undef	IS_CUR_DELETED
61 #define	IS_CUR_DELETED(cp)						\
62 	(((cp)->dpgno == PGNO_INVALID &&				\
63 	B_DISSET(GET_BKEYDATA((cp)->page,				\
64 	(cp)->indx + O_INDX)->type)) ||					\
65 	((cp)->dpgno != PGNO_INVALID &&					\
66 	B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type)))
67 
68 /* If the cursor and index combination references a deleted record. */
69 #undef	IS_DELETED
70 #define	IS_DELETED(cp, indx)						\
71 	(((cp)->dpgno == PGNO_INVALID &&				\
72 	B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) ||	\
73 	((cp)->dpgno != PGNO_INVALID &&					\
74 	B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type)))
75 
76 /*
77  * Test to see if two cursors could point to duplicates of the same key,
78  * whether on-page or off-page.  The leaf page numbers must be the same
79  * in both cases.  In the case of off-page duplicates, the key indices
80  * on the leaf page will be the same.  In the case of on-page duplicates,
81  * the duplicate page number must not be set, and the key index offsets
82  * must be the same.  For the last test, as the saved copy of the cursor
83  * will not have a valid page pointer, we use the cursor's.
84  */
85 #undef	POSSIBLE_DUPLICATE
86 #define	POSSIBLE_DUPLICATE(cursor, saved_copy)				\
87 	((cursor)->pgno == (saved_copy).pgno &&				\
88 	((cursor)->indx == (saved_copy).indx ||				\
89 	((cursor)->dpgno == PGNO_INVALID &&				\
90 	    (saved_copy).dpgno == PGNO_INVALID &&			\
91 	    (cursor)->page->inp[(cursor)->indx] ==			\
92 	    (cursor)->page->inp[(saved_copy).indx])))
93 
94 /*
95  * __bam_c_reset --
96  *	Initialize internal cursor structure.
97  */
98 static void
99 __bam_c_reset(cp)
100 	CURSOR *cp;
101 {
102 	cp->sp = cp->csp = cp->stack;
103 	cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
104 	cp->page = NULL;
105 	cp->pgno = PGNO_INVALID;
106 	cp->indx = 0;
107 	cp->dpgno = PGNO_INVALID;
108 	cp->dindx = 0;
109 	cp->lock = LOCK_INVALID;
110 	cp->mode = DB_LOCK_NG;
111 	cp->recno = RECNO_OOB;
112 	cp->flags = 0;
113 }
114 
115 /*
116  * __bam_c_init --
117  *	Initialize the access private portion of a cursor
118  *
119  * PUBLIC: int __bam_c_init __P((DBC *));
120  */
121 int
122 __bam_c_init(dbc)
123 	DBC *dbc;
124 {
125 	DB *dbp;
126 	CURSOR *cp;
127 	int ret;
128 
129 	if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0)
130 		return (ret);
131 
132 	dbp = dbc->dbp;
133 	cp->dbc = dbc;
134 
135 	/*
136 	 * Logical record numbers are always the same size, and we don't want
137 	 * to have to check for space every time we return one.  Allocate it
138 	 * in advance.
139 	 */
140 	if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
141 		if ((ret = __os_malloc(sizeof(db_recno_t),
142 		    NULL, &dbc->rkey.data)) != 0) {
143 			__os_free(cp, sizeof(CURSOR));
144 			return (ret);
145 		}
146 		dbc->rkey.ulen = sizeof(db_recno_t);
147 	}
148 
149 	/* Initialize methods. */
150 	dbc->internal = cp;
151 	if (dbp->type == DB_BTREE) {
152 		dbc->c_am_close = __bam_c_close;
153 		dbc->c_am_destroy = __bam_c_destroy;
154 		dbc->c_del = __bam_c_del;
155 		dbc->c_get = __bam_c_get;
156 		dbc->c_put = __bam_c_put;
157 	} else {
158 		dbc->c_am_close = __bam_c_close;
159 		dbc->c_am_destroy = __bam_c_destroy;
160 		dbc->c_del = __ram_c_del;
161 		dbc->c_get = __ram_c_get;
162 		dbc->c_put = __ram_c_put;
163 	}
164 
165 	/* Initialize dynamic information. */
166 	__bam_c_reset(cp);
167 
168 	return (0);
169 }
170 
171 /*
172  * __bam_c_close --
173  *	Close down the cursor from a single use.
174  */
175 static int
176 __bam_c_close(dbc)
177 	DBC *dbc;
178 {
179 	CURSOR *cp;
180 	DB *dbp;
181 	int ret;
182 
183 	dbp = dbc->dbp;
184 	cp = dbc->internal;
185 	ret = 0;
186 
187 	/*
188 	 * If a cursor deleted a btree key, perform the actual deletion.
189 	 * (Recno keys are either deleted immediately or never deleted.)
190 	 */
191 	if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED))
192 		ret = __bam_c_physdel(dbc, cp, NULL);
193 
194 	/* Discard any locks not acquired inside of a transaction. */
195 	if (cp->lock != LOCK_INVALID) {
196 		(void)__BT_TLPUT(dbc, cp->lock);
197 		cp->lock = LOCK_INVALID;
198 	}
199 
200 	/* Sanity checks. */
201 #ifdef DIAGNOSTIC
202 	if (cp->csp != cp->stack)
203 		__db_err(dbp->dbenv, "btree cursor close: stack not empty");
204 #endif
205 
206 	/* Initialize dynamic information. */
207 	__bam_c_reset(cp);
208 
209 	return (ret);
210 }
211 
212 /*
213  * __bam_c_destroy --
214  *	Close a single cursor -- internal version.
215  */
216 static int
217 __bam_c_destroy(dbc)
218 	DBC *dbc;
219 {
220 	/* Discard the structures. */
221 	__os_free(dbc->internal, sizeof(CURSOR));
222 
223 	return (0);
224 }
225 
226 /*
227  * __bam_c_del --
228  *	Delete using a cursor.
229  */
230 static int
231 __bam_c_del(dbc, flags)
232 	DBC *dbc;
233 	u_int32_t flags;
234 {
235 	CURSOR *cp;
236 	DB *dbp;
237 	DB_LOCK lock;
238 	PAGE *h;
239 	db_pgno_t pgno;
240 	db_indx_t indx;
241 	int ret;
242 
243 	dbp = dbc->dbp;
244 	cp = dbc->internal;
245 	h = NULL;
246 
247 	DB_PANIC_CHECK(dbp);
248 
249 	/* Check for invalid flags. */
250 	if ((ret = __db_cdelchk(dbp, flags,
251 	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
252 		return (ret);
253 
254 	/*
255 	 * If we are running CDB, this had better be either a write
256 	 * cursor or an immediate writer.
257 	 */
258 	if (F_ISSET(dbp, DB_AM_CDB))
259 		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
260 			return (EINVAL);
261 
262 	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags);
263 
264 	/* If already deleted, return failure. */
265 	if (F_ISSET(cp, C_DELETED))
266 		return (DB_KEYEMPTY);
267 
268 	/*
269 	 * We don't physically delete the record until the cursor moves,
270 	 * so we have to have a long-lived write lock on the page instead
271 	 * of a long-lived read lock.  Note, we have to have a read lock
272 	 * to even get here, so we simply discard it.
273 	 */
274 	if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) {
275 		if ((ret = __bam_lget(dbc,
276 		    0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
277 			goto err;
278 		(void)__BT_TLPUT(dbc, cp->lock);
279 		cp->lock = lock;
280 		cp->mode = DB_LOCK_WRITE;
281 	}
282 
283 	/*
284 	 * Acquire the underlying page (which may be different from the above
285 	 * page because it may be a duplicate page), and set the on-page and
286 	 * in-cursor delete flags.  We don't need to lock it as we've already
287 	 * write-locked the page leading to it.
288 	 */
289 	if (cp->dpgno == PGNO_INVALID) {
290 		pgno = cp->pgno;
291 		indx = cp->indx;
292 	} else {
293 		pgno = cp->dpgno;
294 		indx = cp->dindx;
295 	}
296 
297 	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
298 		goto err;
299 
300 	/* Log the change. */
301 	if (DB_LOGGING(dbc) &&
302 	    (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h),
303 	    0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) {
304 		(void)memp_fput(dbp->mpf, h, 0);
305 		goto err;
306 	}
307 
308 	/*
309 	 * Set the intent-to-delete flag on the page and update all cursors. */
310 	if (cp->dpgno == PGNO_INVALID)
311 		B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
312 	else
313 		B_DSET(GET_BKEYDATA(h, indx)->type);
314 	(void)__bam_ca_delete(dbp, pgno, indx, 1);
315 
316 	ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
317 	h = NULL;
318 
319 	/*
320 	 * If the tree has record numbers, we have to adjust the counts.
321 	 *
322 	 * !!!
323 	 * This test is right -- we don't yet support duplicates and record
324 	 * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM
325 	 * set.
326 	 */
327 	if (F_ISSET(dbp, DB_BT_RECNUM)) {
328 		if ((ret = __bam_c_getstack(dbc, cp)) != 0)
329 			goto err;
330 		if ((ret = __bam_adjust(dbc, -1)) != 0)
331 			goto err;
332 		(void)__bam_stkrel(dbc, 0);
333 	}
334 
335 err:	if (h != NULL)
336 		(void)memp_fput(dbp->mpf, h, 0);
337 	return (ret);
338 }
339 
340 /*
341  * __bam_c_get --
342  *	Get using a cursor (btree).
343  */
344 static int
345 __bam_c_get(dbc, key, data, flags)
346 	DBC *dbc;
347 	DBT *key, *data;
348 	u_int32_t flags;
349 {
350 	CURSOR *cp, copy, start;
351 	DB *dbp;
352 	PAGE *h;
353 	int exact, ret, tmp_rmw;
354 
355 	dbp = dbc->dbp;
356 	cp = dbc->internal;
357 
358 	DB_PANIC_CHECK(dbp);
359 
360 	/* Check for invalid flags. */
361 	if ((ret = __db_cgetchk(dbp,
362 	    key, data, flags, cp->pgno != PGNO_INVALID)) != 0)
363 		return (ret);
364 
365 	/* Clear OR'd in additional bits so we can check for flag equality. */
366 	tmp_rmw = 0;
367 	if (LF_ISSET(DB_RMW)) {
368 		if (!F_ISSET(dbp, DB_AM_CDB)) {
369 			tmp_rmw = 1;
370 			F_SET(dbc, DBC_RMW);
371 		}
372 		LF_CLR(DB_RMW);
373 	}
374 
375 	DEBUG_LREAD(dbc, dbc->txn, "bam_c_get",
376 	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
377 
378 	/*
379 	 * Return a cursor's record number.  It has nothing to do with the
380 	 * cursor get code except that it's been rammed into the interface.
381 	 */
382 	if (flags == DB_GET_RECNO) {
383 		ret = __bam_c_rget(dbc, data, flags);
384 		if (tmp_rmw)
385 			F_CLR(dbc, DBC_RMW);
386 		return (ret);
387 	}
388 
389 	/*
390 	 * Initialize the cursor for a new retrieval.  Clear the cursor's
391 	 * page pointer, it was set before this operation, and no longer
392 	 * has any meaning.
393 	 */
394 	cp->page = NULL;
395 	copy = *cp;
396 	cp->lock = LOCK_INVALID;
397 
398 	switch (flags) {
399 	case DB_CURRENT:
400 		/* It's not possible to return a deleted record. */
401 		if (F_ISSET(cp, C_DELETED)) {
402 			ret = DB_KEYEMPTY;
403 			goto err;
404 		}
405 
406 		/* Acquire the current page. */
407 		if ((ret = __bam_lget(dbc,
408 		    0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0)
409 			ret = memp_fget(dbp->mpf,
410 			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
411 			    0, &cp->page);
412 		if (ret != 0)
413 			goto err;
414 		break;
415 	case DB_NEXT_DUP:
416 		if (cp->pgno == PGNO_INVALID) {
417 			ret = EINVAL;
418 			goto err;
419 		}
420 		if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
421 			goto err;
422 
423 		/* Make sure we didn't go past the end of the duplicates. */
424 		if (!POSSIBLE_DUPLICATE(cp, copy)) {
425 			ret = DB_NOTFOUND;
426 			goto err;
427 		}
428 		break;
429 	case DB_NEXT:
430 		if (cp->pgno != PGNO_INVALID) {
431 			if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
432 				goto err;
433 			break;
434 		}
435 		/* FALLTHROUGH */
436 	case DB_FIRST:
437 		if ((ret = __bam_c_first(dbc, cp)) != 0)
438 			goto err;
439 		break;
440 	case DB_PREV:
441 		if (cp->pgno != PGNO_INVALID) {
442 			if ((ret = __bam_c_prev(dbc, cp)) != 0)
443 				goto err;
444 			break;
445 		}
446 		/* FALLTHROUGH */
447 	case DB_LAST:
448 		if ((ret = __bam_c_last(dbc, cp)) != 0)
449 			goto err;
450 		break;
451 	case DB_SET:
452 		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
453 			goto err;
454 
455 		/*
456 		 * We cannot currently be referencing a deleted record, but we
457 		 * may be referencing off-page duplicates.
458 		 *
459 		 * If we're referencing off-page duplicates, move off-page.
460 		 * If we moved off-page, move to the next non-deleted record.
461 		 * If we moved to the next non-deleted record, check to make
462 		 * sure we didn't switch records because our current record
463 		 * had no non-deleted data items.
464 		 */
465 		start = *cp;
466 		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
467 			goto err;
468 		if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) {
469 			if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
470 				goto err;
471 			if (!POSSIBLE_DUPLICATE(cp, start)) {
472 				ret = DB_NOTFOUND;
473 				goto err;
474 			}
475 		}
476 		break;
477 	case DB_SET_RECNO:
478 		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
479 			goto err;
480 		break;
481 	case DB_GET_BOTH:
482 		if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) {
483 			/* Acquire the current page. */
484 			if ((ret = memp_fget(dbp->mpf,
485 			    cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
486 			    0, &cp->page)) != 0)
487 				goto err;
488 
489 			/* If DBC_CONTINUE, move to the next item. */
490 			if (F_ISSET(dbc, DBC_CONTINUE) &&
491 			    (ret = __bam_c_next(dbc, cp, 1)) != 0)
492 				goto err;
493 		} else {
494 			if ((ret =
495 			    __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
496 				goto err;
497 
498 			/*
499 			 * We may be referencing a duplicates page.  Move to
500 			 * the first duplicate.
501 			 */
502 			if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
503 				goto err;
504 		}
505 
506 		/* Search for a matching entry. */
507 		if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0)
508 			goto err;
509 
510 		/* Ignore deleted entries. */
511 		if (IS_CUR_DELETED(cp)) {
512 			ret = DB_NOTFOUND;
513 			goto err;
514 		}
515 		break;
516 	case DB_SET_RANGE:
517 		if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
518 			goto err;
519 
520 		/*
521 		 * As we didn't require an exact match, the search function
522 		 * may have returned an entry past the end of the page.  If
523 		 * so, move to the next entry.
524 		 */
525 		if (cp->indx == NUM_ENT(cp->page) &&
526 		    (ret = __bam_c_next(dbc, cp, 0)) != 0)
527 			goto err;
528 
529 		/*
530 		 * We may be referencing off-page duplicates, if so, move
531 		 * off-page.
532 		 */
533 		if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
534 			goto err;
535 
536 		/*
537 		 * We may be referencing a deleted record, if so, move to
538 		 * the next non-deleted record.
539 		 */
540 		if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0)
541 			goto err;
542 		break;
543 	}
544 
545 	/*
546 	 * Return the key if the user didn't give us one.  If we've moved to
547 	 * a duplicate page, we may no longer have a pointer to the main page,
548 	 * so we have to go get it.  We know that it's already read-locked,
549 	 * however, so we don't have to acquire a new lock.
550 	 */
551 	if (flags != DB_SET) {
552 		if (cp->dpgno != PGNO_INVALID) {
553 			if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
554 				goto err;
555 		} else
556 			h = cp->page;
557 		ret = __db_ret(dbp,
558 		    h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen);
559 		if (cp->dpgno != PGNO_INVALID)
560 			(void)memp_fput(dbp->mpf, h, 0);
561 		if (ret)
562 			goto err;
563 	}
564 
565 	/* Return the data. */
566 	if ((ret = __db_ret(dbp, cp->page,
567 	    cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx,
568 	    data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
569 		goto err;
570 
571 	/*
572 	 * If the previous cursor record has been deleted, physically delete
573 	 * the entry from the page.  We clear the deleted flag before we call
574 	 * the underlying delete routine so that, if an error occurs, and we
575 	 * restore the cursor, the deleted flag is cleared.  This is because,
576 	 * if we manage to physically modify the page, and then restore the
577 	 * cursor, we might try to repeat the page modification when closing
578 	 * the cursor.
579 	 */
580 	if (F_ISSET(&copy, C_DELETED)) {
581 		F_CLR(&copy, C_DELETED);
582 		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
583 			goto err;
584 	}
585 	F_CLR(cp, C_DELETED);
586 
587 	/* Release the previous lock, if any; the current lock is retained. */
588 	if (copy.lock != LOCK_INVALID)
589 		(void)__BT_TLPUT(dbc, copy.lock);
590 
591 	/* Release the current page. */
592 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
593 		goto err;
594 
595 	if (0) {
596 err:		if (cp->page != NULL)
597 			(void)memp_fput(dbp->mpf, cp->page, 0);
598 		if (cp->lock != LOCK_INVALID)
599 			(void)__BT_TLPUT(dbc, cp->lock);
600 		*cp = copy;
601 	}
602 
603 	/* Release temporary lock upgrade. */
604 	if (tmp_rmw)
605 		F_CLR(dbc, DBC_RMW);
606 
607 	return (ret);
608 }
609 
610 /*
611  * __bam_dsearch --
612  *	Search for a matching data item (or the first data item that's
613  *	equal to or greater than the one we're searching for).
614  */
615 static int
616 __bam_dsearch(dbc, cp, data, iflagp)
617 	DBC *dbc;
618 	CURSOR *cp;
619 	DBT *data;
620 	u_int32_t *iflagp;
621 {
622 	DB *dbp;
623 	CURSOR copy, last;
624 	int cmp, ret;
625 
626 	dbp = dbc->dbp;
627 
628 	/*
629 	 * If iflagp is non-NULL, we're doing an insert.
630 	 *
631 	 * If the duplicates are off-page, use the duplicate search routine.
632 	 */
633 	if (cp->dpgno != PGNO_INVALID) {
634 		if ((ret = __db_dsearch(dbc, iflagp != NULL,
635 		    data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0)
636 			return (ret);
637 		cp->dpgno = cp->page->pgno;
638 
639 		if (iflagp == NULL) {
640 			if (cmp != 0)
641 				return (DB_NOTFOUND);
642 			return (0);
643 		}
644 		*iflagp = DB_BEFORE;
645 		return (0);
646 	}
647 
648 	/* Otherwise, do the search ourselves. */
649 	copy = *cp;
650 	for (;;) {
651 		/* Save the last interesting cursor position. */
652 		last = *cp;
653 
654 		/* See if the data item matches the one we're looking for. */
655 		if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX,
656 		    dbp->dup_compare == NULL ?
657 		    __bam_defcmp : dbp->dup_compare)) == 0) {
658 			if (iflagp != NULL)
659 				*iflagp = DB_AFTER;
660 			return (0);
661 		}
662 
663 		/*
664 		 * If duplicate entries are sorted, we're done if we find a
665 		 * page entry that sorts greater than the application item.
666 		 * If doing an insert, return success, otherwise DB_NOTFOUND.
667 		 */
668 		if (dbp->dup_compare != NULL && cmp < 0) {
669 			if (iflagp == NULL)
670 				return (DB_NOTFOUND);
671 			*iflagp = DB_BEFORE;
672 			return (0);
673 		}
674 
675 		/*
676 		 * Move to the next item.  If we reach the end of the page and
677 		 * we're doing an insert, set the cursor to the last item and
678 		 * set the referenced memory location so callers know to insert
679 		 * after the item, instead of before it.  If not inserting, we
680 		 * return DB_NOTFOUND.
681 		 */
682 		if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) {
683 			if (iflagp == NULL)
684 				return (DB_NOTFOUND);
685 			goto use_last;
686 		}
687 
688 		/*
689 		 * Make sure we didn't go past the end of the duplicates.  The
690 		 * error conditions are the same as above.
691 		 */
692 		if (!POSSIBLE_DUPLICATE(cp, copy)) {
693 			if (iflagp == NULL)
694 				 return (DB_NOTFOUND);
695 use_last:		*cp = last;
696 			*iflagp = DB_AFTER;
697 			return (0);
698 		}
699 	}
700 	/* NOTREACHED */
701 }
702 
703 /*
704  * __bam_c_rget --
705  *	Return the record number for a cursor.
706  */
707 static int
708 __bam_c_rget(dbc, data, flags)
709 	DBC *dbc;
710 	DBT *data;
711 	u_int32_t flags;
712 {
713 	CURSOR *cp;
714 	DB *dbp;
715 	DBT dbt;
716 	db_recno_t recno;
717 	int exact, ret;
718 
719 	COMPQUIET(flags, 0);
720 	dbp = dbc->dbp;
721 	cp = dbc->internal;
722 
723 	/* Get the page with the current item on it. */
724 	if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
725 		return (ret);
726 
727 	/* Get a copy of the key. */
728 	memset(&dbt, 0, sizeof(DBT));
729 	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
730 	if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0)
731 		goto err;
732 
733 	exact = 1;
734 	if ((ret = __bam_search(dbc, &dbt,
735 	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
736 	    1, &recno, &exact)) != 0)
737 		goto err;
738 
739 	ret = __db_retcopy(data, &recno, sizeof(recno),
740 	    &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc);
741 
742 	/* Release the stack. */
743 	__bam_stkrel(dbc, 0);
744 
745 err:	(void)memp_fput(dbp->mpf, cp->page, 0);
746 	__os_free(dbt.data, dbt.size);
747 	return (ret);
748 }
749 
750 /*
751  * __bam_c_put --
752  *	Put using a cursor.
753  */
754 static int
755 __bam_c_put(dbc, key, data, flags)
756 	DBC *dbc;
757 	DBT *key, *data;
758 	u_int32_t flags;
759 {
760 	CURSOR *cp, copy;
761 	DB *dbp;
762 	DBT dbt;
763 	db_indx_t indx;
764 	db_pgno_t pgno;
765 	u_int32_t iiflags, iiop;
766 	int exact, needkey, ret, stack;
767 	void *arg;
768 
769 	dbp = dbc->dbp;
770 	cp = dbc->internal;
771 
772 	DB_PANIC_CHECK(dbp);
773 
774 	DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put",
775 	    flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
776 	    data, flags);
777 
778 	if ((ret = __db_cputchk(dbp, key, data, flags,
779 	    F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
780 		return (ret);
781 
782 	/*
783 	 * If we are running CDB, this had better be either a write
784 	 * cursor or an immediate writer.  If it's a regular writer,
785 	 * that means we have an IWRITE lock and we need to upgrade
786 	 * it to a write lock.
787 	 */
788 	if (F_ISSET(dbp, DB_AM_CDB)) {
789 		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
790 			return (EINVAL);
791 
792 		if (F_ISSET(dbc, DBC_RMW) &&
793 		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
794 		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
795 		    &dbc->mylock)) != 0)
796 			return (EAGAIN);
797 	}
798 
799 	if (0) {
800 split:		/*
801 		 * To split, we need a valid key for the page.  Since it's a
802 		 * cursor, we have to build one.
803 		 *
804 		 * Acquire a copy of a key from the page.
805 		 */
806 		if (needkey) {
807 			memset(&dbt, 0, sizeof(DBT));
808 			if ((ret = __db_ret(dbp, cp->page, indx,
809 			    &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
810 				goto err;
811 			arg = &dbt;
812 		} else
813 			arg = key;
814 
815 		/*
816 		 * Discard any locks and pinned pages (the locks are discarded
817 		 * even if we're running with transactions, as they lock pages
818 		 * that we're sorry we ever acquired).  If stack is set and the
819 		 * cursor entries are valid, they point to the same entries as
820 		 * the stack, don't free them twice.
821 		 */
822 		if (stack) {
823 			(void)__bam_stkrel(dbc, 1);
824 			stack = 0;
825 		} else
826 			DISCARD(dbc, cp);
827 
828 		/*
829 		 * Restore the cursor to its original value.  This is necessary
830 		 * for two reasons.  First, we are about to copy it in case of
831 		 * error, again.  Second, we adjust cursors during the split,
832 		 * and we have to ensure this cursor is adjusted appropriately,
833 		 * along with all the other cursors.
834 		 */
835 		*cp = copy;
836 
837 		if ((ret = __bam_split(dbc, arg)) != 0)
838 			goto err;
839 	}
840 
841 	/*
842 	 * Initialize the cursor for a new retrieval.  Clear the cursor's
843 	 * page pointer, it was set before this operation, and no longer
844 	 * has any meaning.
845 	 */
846 	cp->page = NULL;
847 	copy = *cp;
848 	cp->lock = LOCK_INVALID;
849 
850 	iiflags = needkey = ret = stack = 0;
851 	switch (flags) {
852 	case DB_AFTER:
853 	case DB_BEFORE:
854 	case DB_CURRENT:
855 		needkey = 1;
856 		if (cp->dpgno == PGNO_INVALID) {
857 			pgno = cp->pgno;
858 			indx = cp->indx;
859 		} else {
860 			pgno = cp->dpgno;
861 			indx = cp->dindx;
862 		}
863 
864 		/*
865 		 * !!!
866 		 * This test is right -- we don't yet support duplicates and
867 		 * record numbers in the same tree, so ignore duplicates if
868 		 * DB_BT_RECNUM set.
869 		 */
870 		if (F_ISSET(dbp, DB_BT_RECNUM) &&
871 		    (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) {
872 			/* Acquire a complete stack. */
873 			if ((ret = __bam_c_getstack(dbc, cp)) != 0)
874 				goto err;
875 			cp->page = cp->csp->page;
876 
877 			stack = 1;
878 			iiflags = BI_DOINCR;
879 		} else {
880 			/* Acquire the current page. */
881 			if ((ret = __bam_lget(dbc,
882 			    0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0)
883 				ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page);
884 			if (ret != 0)
885 				goto err;
886 
887 			iiflags = 0;
888 		}
889 
890 		/*
891 		 * If the user has specified a duplicate comparison function,
892 		 * we return an error if DB_CURRENT was specified and the
893 		 * replacement data doesn't compare equal to the current data.
894 		 * This stops apps from screwing up the duplicate sort order.
895 		 */
896 		if (flags == DB_CURRENT && dbp->dup_compare != NULL)
897 			if (__bam_cmp(dbp, data,
898 			    cp->page, indx, dbp->dup_compare) != 0) {
899 				ret = EINVAL;
900 				goto err;
901 			}
902 
903 		iiop = flags;
904 		break;
905 	case DB_KEYFIRST:
906 	case DB_KEYLAST:
907 		/*
908 		 * If we have a duplicate comparison function, we position to
909 		 * the first of any on-page duplicates, and use __bam_dsearch
910 		 * to search for the right slot.  Otherwise, we position to
911 		 * the first/last of any on-page duplicates based on the flag
912 		 * value.
913 		 */
914 		if ((ret = __bam_c_search(dbc, cp, key,
915 		    flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
916 		    DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
917 			goto err;
918 		stack = 1;
919 
920 		/*
921 		 * If an exact match:
922 		 *	If duplicates aren't supported, replace the current
923 		 *	item.  (When implementing the DB->put function, our
924 		 *	caller has already checked the DB_NOOVERWRITE flag.)
925 		 *
926 		 *	If there's a duplicate comparison function, find the
927 		 *	correct slot for this duplicate item.
928 		 *
929 		 *	If there's no duplicate comparison function, set the
930 		 *	insert flag based on the argument flags.
931 		 *
932 		 * If there's no match, the search function returned the
933 		 * smallest slot greater than the key, use it.
934 		 */
935 		if (exact) {
936 			if (F_ISSET(dbp, DB_AM_DUP)) {
937 				/*
938 				 * If at off-page duplicate page, move to the
939 				 * first or last entry -- if a comparison
940 				 * function was specified, start searching at
941 				 * the first entry.  Otherwise, move based on
942 				 * the DB_KEYFIRST/DB_KEYLAST flags.
943 				 */
944 				if ((ret = __bam_dup(dbc, cp, cp->indx,
945 				    dbp->dup_compare == NULL &&
946 				    flags != DB_KEYFIRST)) != 0)
947 					goto err;
948 
949 				/*
950 				 * If there's a comparison function, search for
951 				 * the correct slot.  Otherwise, set the insert
952 				 * flag based on the argment flag.
953 				 */
954 				if (dbp->dup_compare == NULL)
955 					iiop = flags == DB_KEYFIRST ?
956 					    DB_BEFORE : DB_AFTER;
957 				else
958 					if ((ret = __bam_dsearch(dbc,
959 					    cp, data, &iiop)) != 0)
960 						goto err;
961 			} else
962 				iiop = DB_CURRENT;
963 			iiflags = 0;
964 		} else {
965 			iiop = DB_BEFORE;
966 			iiflags = BI_NEWKEY;
967 		}
968 
969 		if (cp->dpgno == PGNO_INVALID) {
970 			pgno = cp->pgno;
971 			indx = cp->indx;
972 		} else {
973 			pgno = cp->dpgno;
974 			indx = cp->dindx;
975 		}
976 		break;
977 	}
978 
979 	ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags);
980 
981 	if (ret == DB_NEEDSPLIT)
982 		goto split;
983 	if (ret != 0)
984 		goto err;
985 
986 	/*
987 	 * Reset any cursors referencing this item that might have the item
988 	 * marked for deletion.
989 	 */
990 	if (iiop == DB_CURRENT) {
991 		(void)__bam_ca_delete(dbp, pgno, indx, 0);
992 
993 		/*
994 		 * It's also possible that we are the cursor that had the
995 		 * item marked for deletion, in which case we want to make
996 		 * sure that we don't delete it because we had the delete
997 		 * flag set already.
998 		 */
999 		if (cp->pgno == copy.pgno && cp->indx == copy.indx &&
1000 		    cp->dpgno == copy.dpgno && cp->dindx == copy.dindx)
1001 			F_CLR(&copy, C_DELETED);
1002 	}
1003 
1004 	/*
1005 	 * Update the cursor to point to the new entry.  The new entry was
1006 	 * stored on the current page, because we split pages until it was
1007 	 * possible.
1008 	 */
1009 	if (cp->dpgno == PGNO_INVALID)
1010 		cp->indx = indx;
1011 	else
1012 		cp->dindx = indx;
1013 
1014 	/*
1015 	 * If the previous cursor record has been deleted, physically delete
1016 	 * the entry from the page.  We clear the deleted flag before we call
1017 	 * the underlying delete routine so that, if an error occurs, and we
1018 	 * restore the cursor, the deleted flag is cleared.  This is because,
1019 	 * if we manage to physically modify the page, and then restore the
1020 	 * cursor, we might try to repeat the page modification when closing
1021 	 * the cursor.
1022 	 */
1023 	if (F_ISSET(&copy, C_DELETED)) {
1024 		F_CLR(&copy, C_DELETED);
1025 		if ((ret = __bam_c_physdel(dbc, &copy, cp->page)) != 0)
1026 			goto err;
1027 	}
1028 	F_CLR(cp, C_DELETED);
1029 
1030 	/* Release the previous lock, if any; the current lock is retained. */
1031 	if (copy.lock != LOCK_INVALID)
1032 		(void)__BT_TLPUT(dbc, copy.lock);
1033 
1034 	/*
1035 	 * Discard any pages pinned in the tree and their locks, except for
1036 	 * the leaf page, for which we only discard the pin, not the lock.
1037 	 *
1038 	 * Note, the leaf page participated in the stack we acquired, and so
1039 	 * we have to adjust the stack as necessary.  If there was only a
1040 	 * single page on the stack, we don't have to free further stack pages.
1041 	 */
1042 	if (stack && BT_STK_POP(cp) != NULL)
1043 		(void)__bam_stkrel(dbc, 0);
1044 
1045 	/* Release the current page. */
1046 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1047 		goto err;
1048 
1049 	if (0) {
1050 err:		/* Discard any pinned pages. */
1051 		if (stack)
1052 			(void)__bam_stkrel(dbc, 0);
1053 		else
1054 			DISCARD(dbc, cp);
1055 		*cp = copy;
1056 	}
1057 
1058 	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1059 		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1060 		    DB_LOCK_IWRITE, 0);
1061 
1062 	return (ret);
1063 }
1064 
1065 /*
1066  * __bam_c_first --
1067  *	Return the first record.
1068  */
1069 static int
1070 __bam_c_first(dbc, cp)
1071 	DBC *dbc;
1072 	CURSOR *cp;
1073 {
1074 	DB *dbp;
1075 	db_pgno_t pgno;
1076 	int ret;
1077 
1078 	dbp = dbc->dbp;
1079 
1080 	/* Walk down the left-hand side of the tree. */
1081 	for (pgno = PGNO_ROOT;;) {
1082 		if ((ret =
1083 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1084 			return (ret);
1085 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1086 			return (ret);
1087 
1088 		/* If we find a leaf page, we're done. */
1089 		if (ISLEAF(cp->page))
1090 			break;
1091 
1092 		pgno = GET_BINTERNAL(cp->page, 0)->pgno;
1093 		DISCARD(dbc, cp);
1094 	}
1095 
1096 	cp->pgno = cp->page->pgno;
1097 	cp->indx = 0;
1098 	cp->dpgno = PGNO_INVALID;
1099 
1100 	/* Check for duplicates. */
1101 	if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
1102 		return (ret);
1103 
1104 	/* If on an empty page or a deleted record, move to the next one. */
1105 	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1106 		if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
1107 			return (ret);
1108 
1109 	return (0);
1110 }
1111 
1112 /*
1113  * __bam_c_last --
1114  *	Return the last record.
1115  */
1116 static int
1117 __bam_c_last(dbc, cp)
1118 	DBC *dbc;
1119 	CURSOR *cp;
1120 {
1121 	DB *dbp;
1122 	db_pgno_t pgno;
1123 	int ret;
1124 
1125 	dbp = dbc->dbp;
1126 
1127 	/* Walk down the right-hand side of the tree. */
1128 	for (pgno = PGNO_ROOT;;) {
1129 		if ((ret =
1130 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1131 			return (ret);
1132 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1133 			return (ret);
1134 
1135 		/* If we find a leaf page, we're done. */
1136 		if (ISLEAF(cp->page))
1137 			break;
1138 
1139 		pgno =
1140 		    GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
1141 		DISCARD(dbc, cp);
1142 	}
1143 
1144 	cp->pgno = cp->page->pgno;
1145 	cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX;
1146 	cp->dpgno = PGNO_INVALID;
1147 
1148 	/* Check for duplicates. */
1149 	if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0)
1150 		return (ret);
1151 
1152 	/* If on an empty page or a deleted record, move to the next one. */
1153 	if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1154 		if ((ret = __bam_c_prev(dbc, cp)) != 0)
1155 			return (ret);
1156 
1157 	return (0);
1158 }
1159 
1160 /*
1161  * __bam_c_next --
1162  *	Move to the next record.
1163  */
1164 static int
1165 __bam_c_next(dbc, cp, initial_move)
1166 	DBC *dbc;
1167 	CURSOR *cp;
1168 	int initial_move;
1169 {
1170 	DB *dbp;
1171 	db_indx_t adjust, indx;
1172 	db_pgno_t pgno;
1173 	int ret;
1174 
1175 	dbp = dbc->dbp;
1176 
1177 	/*
1178 	 * We're either moving through a page of duplicates or a btree leaf
1179 	 * page.
1180 	 */
1181 	if (cp->dpgno == PGNO_INVALID) {
1182 		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1183 		pgno = cp->pgno;
1184 		indx = cp->indx;
1185 	} else {
1186 		adjust = O_INDX;
1187 		pgno = cp->dpgno;
1188 		indx = cp->dindx;
1189 	}
1190 	if (cp->page == NULL) {
1191 		if ((ret =
1192 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1193 			return (ret);
1194 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1195 			return (ret);
1196 	}
1197 
1198 	/*
1199 	 * If at the end of the page, move to a subsequent page.
1200 	 *
1201 	 * !!!
1202 	 * Check for >= NUM_ENT.  If we're here as the result of a search that
1203 	 * landed us on NUM_ENT, we'll increment indx before we test.
1204 	 *
1205 	 * !!!
1206 	 * This code handles empty pages and pages with only deleted entries.
1207 	 */
1208 	if (initial_move)
1209 		indx += adjust;
1210 	for (;;) {
1211 		if (indx >= NUM_ENT(cp->page)) {
1212 			/*
1213 			 * If we're in a btree leaf page, we've reached the end
1214 			 * of the tree.  If we've reached the end of a page of
1215 			 * duplicates, continue from the btree leaf page where
1216 			 * we found this page of duplicates.
1217 			 */
1218 			pgno = cp->page->next_pgno;
1219 			if (pgno == PGNO_INVALID) {
1220 				/* If in a btree leaf page, it's EOF. */
1221 				if (cp->dpgno == PGNO_INVALID)
1222 					return (DB_NOTFOUND);
1223 
1224 				/* Continue from the last btree leaf page. */
1225 				cp->dpgno = PGNO_INVALID;
1226 
1227 				adjust = P_INDX;
1228 				pgno = cp->pgno;
1229 				indx = cp->indx + P_INDX;
1230 			} else
1231 				indx = 0;
1232 
1233 			DISCARD(dbc, cp);
1234 			if ((ret = __bam_lget(dbc,
1235 			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1236 				return (ret);
1237 			if ((ret =
1238 			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1239 				return (ret);
1240 			continue;
1241 		}
1242 
1243 		/* Ignore deleted records. */
1244 		if (IS_DELETED(cp, indx)) {
1245 			indx += adjust;
1246 			continue;
1247 		}
1248 
1249 		/*
1250 		 * If we're not in a duplicates page, check to see if we've
1251 		 * found a page of duplicates, in which case we move to the
1252 		 * first entry.
1253 		 */
1254 		if (cp->dpgno == PGNO_INVALID) {
1255 			cp->pgno = cp->page->pgno;
1256 			cp->indx = indx;
1257 
1258 			if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0)
1259 				return (ret);
1260 			if (cp->dpgno != PGNO_INVALID) {
1261 				indx = cp->dindx;
1262 				adjust = O_INDX;
1263 				continue;
1264 			}
1265 		} else {
1266 			cp->dpgno = cp->page->pgno;
1267 			cp->dindx = indx;
1268 		}
1269 		break;
1270 	}
1271 	return (0);
1272 }
1273 
1274 /*
1275  * __bam_c_prev --
1276  *	Move to the previous record.
1277  */
1278 static int
1279 __bam_c_prev(dbc, cp)
1280 	DBC *dbc;
1281 	CURSOR *cp;
1282 {
1283 	DB *dbp;
1284 	db_indx_t indx, adjust;
1285 	db_pgno_t pgno;
1286 	int ret, set_indx;
1287 
1288 	dbp = dbc->dbp;
1289 
1290 	/*
1291 	 * We're either moving through a page of duplicates or a btree leaf
1292 	 * page.
1293 	 */
1294 	if (cp->dpgno == PGNO_INVALID) {
1295 		adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1296 		pgno = cp->pgno;
1297 		indx = cp->indx;
1298 	} else {
1299 		adjust = O_INDX;
1300 		pgno = cp->dpgno;
1301 		indx = cp->dindx;
1302 	}
1303 	if (cp->page == NULL) {
1304 		if ((ret =
1305 		    __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1306 			return (ret);
1307 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1308 			return (ret);
1309 	}
1310 
1311 	/*
1312 	 * If at the beginning of the page, move to any previous one.
1313 	 *
1314 	 * !!!
1315 	 * This code handles empty pages and pages with only deleted entries.
1316 	 */
1317 	for (;;) {
1318 		if (indx == 0) {
1319 			/*
1320 			 * If we're in a btree leaf page, we've reached the
1321 			 * beginning of the tree.  If we've reached the first
1322 			 * of a page of duplicates, continue from the btree
1323 			 * leaf page where we found this page of duplicates.
1324 			 */
1325 			pgno = cp->page->prev_pgno;
1326 			if (pgno == PGNO_INVALID) {
1327 				/* If in a btree leaf page, it's SOF. */
1328 				if (cp->dpgno == PGNO_INVALID)
1329 					return (DB_NOTFOUND);
1330 
1331 				/* Continue from the last btree leaf page. */
1332 				cp->dpgno = PGNO_INVALID;
1333 
1334 				adjust = P_INDX;
1335 				pgno = cp->pgno;
1336 				indx = cp->indx;
1337 				set_indx = 0;
1338 			} else
1339 				set_indx = 1;
1340 
1341 			DISCARD(dbc, cp);
1342 			if ((ret = __bam_lget(dbc,
1343 			    0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1344 				return (ret);
1345 			if ((ret =
1346 			    memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1347 				return (ret);
1348 
1349 			if (set_indx)
1350 				indx = NUM_ENT(cp->page);
1351 			if (indx == 0)
1352 				continue;
1353 		}
1354 
1355 		/* Ignore deleted records. */
1356 		indx -= adjust;
1357 		if (IS_DELETED(cp, indx))
1358 			continue;
1359 
1360 		/*
1361 		 * If we're not in a duplicates page, check to see if we've
1362 		 * found a page of duplicates, in which case we move to the
1363 		 * last entry.
1364 		 */
1365 		if (cp->dpgno == PGNO_INVALID) {
1366 			cp->pgno = cp->page->pgno;
1367 			cp->indx = indx;
1368 
1369 			if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0)
1370 				return (ret);
1371 			if (cp->dpgno != PGNO_INVALID) {
1372 				indx = cp->dindx + O_INDX;
1373 				adjust = O_INDX;
1374 				continue;
1375 			}
1376 		} else {
1377 			cp->dpgno = cp->page->pgno;
1378 			cp->dindx = indx;
1379 		}
1380 		break;
1381 	}
1382 	return (0);
1383 }
1384 
1385 /*
1386  * __bam_c_search --
1387  *	Move to a specified record.
1388  */
1389 static int
1390 __bam_c_search(dbc, cp, key, flags, exactp)
1391 	DBC *dbc;
1392 	CURSOR *cp;
1393 	const DBT *key;
1394 	u_int32_t flags;
1395 	int *exactp;
1396 {
1397 	BTREE *t;
1398 	DB *dbp;
1399 	DB_LOCK lock;
1400 	PAGE *h;
1401 	db_recno_t recno;
1402 	db_indx_t indx;
1403 	u_int32_t sflags;
1404 	int cmp, needexact, ret;
1405 
1406 	dbp = dbc->dbp;
1407 	t = dbp->internal;
1408 
1409 	/* Find an entry in the database. */
1410 	switch (flags) {
1411 	case DB_SET_RECNO:
1412 		if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
1413 			return (ret);
1414 		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1415 		needexact = *exactp = 1;
1416 		ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp);
1417 		break;
1418 	case DB_SET:
1419 	case DB_GET_BOTH:
1420 		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1421 		needexact = *exactp = 1;
1422 		goto search;
1423 	case DB_SET_RANGE:
1424 		sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1425 		needexact = *exactp = 0;
1426 		goto search;
1427 	case DB_KEYFIRST:
1428 		sflags = S_KEYFIRST;
1429 		goto fast_search;
1430 	case DB_KEYLAST:
1431 		sflags = S_KEYLAST;
1432 fast_search:	needexact = *exactp = 0;
1433 		/*
1434 		 * If the application has a history of inserting into the first
1435 		 * or last pages of the database, we check those pages first to
1436 		 * avoid doing a full search.
1437 		 *
1438 		 * Record numbers can't be fast-tracked, the entire tree has to
1439 		 * be locked.
1440 		 */
1441 		h = NULL;
1442 		lock = LOCK_INVALID;
1443 		if (F_ISSET(dbp, DB_BT_RECNUM))
1444 			goto search;
1445 
1446 		/* Check if the application has a history of sorted input. */
1447 		if (t->bt_lpgno == PGNO_INVALID)
1448 			goto search;
1449 
1450 		/*
1451 		 * Lock and retrieve the page on which we did the last insert.
1452 		 * It's okay if it doesn't exist, or if it's not the page type
1453 		 * we expected, it just means that the world changed.
1454 		 */
1455 		if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
1456 			goto fast_miss;
1457 		if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h))
1458 			goto fast_miss;
1459 		if (TYPE(h) != P_LBTREE)
1460 			goto fast_miss;
1461 		if (NUM_ENT(h) == 0)
1462 			goto fast_miss;
1463 
1464 		/*
1465 		 * What we do here is test to see if we're at the beginning or
1466 		 * end of the tree and if the new item sorts before/after the
1467 		 * first/last page entry.  We don't try and catch inserts into
1468 		 * the middle of the tree (although we could, as long as there
1469 		 * were two keys on the page and we saved both the index and
1470 		 * the page number of the last insert).
1471 		 */
1472 		if (h->next_pgno == PGNO_INVALID) {
1473 			indx = NUM_ENT(h) - P_INDX;
1474 			if ((cmp =
1475 			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0)
1476 				goto try_begin;
1477 			if (cmp > 0) {
1478 				indx += P_INDX;
1479 				goto fast_hit;
1480 			}
1481 
1482 			/*
1483 			 * Found a duplicate.  If doing DB_KEYLAST, we're at
1484 			 * the correct position, otherwise, move to the first
1485 			 * of the duplicates.
1486 			 */
1487 			if (flags == DB_KEYLAST)
1488 				goto fast_hit;
1489 			for (;
1490 			    indx > 0 && h->inp[indx - P_INDX] == h->inp[indx];
1491 			    indx -= P_INDX)
1492 				;
1493 			goto fast_hit;
1494 		}
1495 try_begin:	if (h->prev_pgno == PGNO_INVALID) {
1496 			indx = 0;
1497 			if ((cmp =
1498 			    __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0)
1499 				goto fast_miss;
1500 			if (cmp < 0)
1501 				goto fast_hit;
1502 			/*
1503 			 * Found a duplicate.  If doing DB_KEYFIRST, we're at
1504 			 * the correct position, otherwise, move to the last
1505 			 * of the duplicates.
1506 			 */
1507 			if (flags == DB_KEYFIRST)
1508 				goto fast_hit;
1509 			for (;
1510 			    indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
1511 			    h->inp[indx] == h->inp[indx + P_INDX];
1512 			    indx += P_INDX)
1513 				;
1514 			goto fast_hit;
1515 		}
1516 		goto fast_miss;
1517 
1518 fast_hit:	/* Set the exact match flag, we may have found a duplicate. */
1519 		*exactp = cmp == 0;
1520 
1521 		/* Enter the entry in the stack. */
1522 		BT_STK_CLR(cp);
1523 		BT_STK_ENTER(cp, h, indx, lock, ret);
1524 		break;
1525 
1526 fast_miss:	if (h != NULL)
1527 			(void)memp_fput(dbp->mpf, h, 0);
1528 		if (lock != LOCK_INVALID)
1529 			(void)__BT_LPUT(dbc, lock);
1530 
1531 search:		ret = __bam_search(dbc, key, sflags, 1, NULL, exactp);
1532 		break;
1533 	default:				/* XXX: Impossible. */
1534 		abort();
1535 		/* NOTREACHED */
1536 	}
1537 	if (ret != 0)
1538 		return (ret);
1539 
1540 	/*
1541 	 * Initialize the cursor to reference it.  This has to be done
1542 	 * before we return (even with DB_NOTFOUND) because we have to
1543 	 * free the page(s) we locked in __bam_search.
1544 	 */
1545 	cp->page = cp->csp->page;
1546 	cp->pgno = cp->csp->page->pgno;
1547 	cp->indx = cp->csp->indx;
1548 	cp->lock = cp->csp->lock;
1549 	cp->dpgno = PGNO_INVALID;
1550 
1551 	/*
1552 	 * If we inserted a key into the first or last slot of the tree,
1553 	 * remember where it was so we can do it more quickly next time.
1554 	 */
1555 	if (flags == DB_KEYFIRST || flags == DB_KEYLAST)
1556 		t->bt_lpgno =
1557 		    ((cp->page->next_pgno == PGNO_INVALID &&
1558 		    cp->indx >= NUM_ENT(cp->page)) ||
1559 		    (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ?
1560 		    cp->pgno : PGNO_INVALID;
1561 
1562 	/* If we need an exact match and didn't find one, we're done. */
1563 	if (needexact && *exactp == 0)
1564 		return (DB_NOTFOUND);
1565 
1566 	return (0);
1567 }
1568 
1569 /*
1570  * __bam_dup --
1571  *	Check for an off-page duplicates entry, and if found, move to the
1572  *	first or last entry.
1573  *
1574  * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int));
1575  */
1576 int
1577 __bam_dup(dbc, cp, indx, last_dup)
1578 	DBC *dbc;
1579 	CURSOR *cp;
1580 	u_int32_t indx;
1581 	int last_dup;
1582 {
1583 	BOVERFLOW *bo;
1584 	DB *dbp;
1585 	db_pgno_t pgno;
1586 	int ret;
1587 
1588 	dbp = dbc->dbp;
1589 
1590 	/*
1591 	 * Check for an overflow entry.  If we find one, move to the
1592 	 * duplicates page, and optionally move to the last record on
1593 	 * that page.
1594 	 *
1595 	 * !!!
1596 	 * We don't lock duplicates pages, we've already got the correct
1597 	 * lock on the main page.
1598 	 */
1599 	bo = GET_BOVERFLOW(cp->page, indx + O_INDX);
1600 	if (B_TYPE(bo->type) != B_DUPLICATE)
1601 		return (0);
1602 
1603 	pgno = bo->pgno;
1604 	if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1605 		return (ret);
1606 	cp->page = NULL;
1607 	if (last_dup) {
1608 		if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0)
1609 			return (ret);
1610 		indx = NUM_ENT(cp->page) - O_INDX;
1611 	} else {
1612 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1613 			return (ret);
1614 		indx = 0;
1615 	}
1616 
1617 	/* Update the cursor's duplicate information. */
1618 	cp->dpgno = cp->page->pgno;
1619 	cp->dindx = indx;
1620 
1621 	return (0);
1622 }
1623 
1624 /*
1625  * __bam_c_physdel --
1626  *	Actually do the cursor deletion.
1627  */
1628 static int
1629 __bam_c_physdel(dbc, cp, h)
1630 	DBC *dbc;
1631 	CURSOR *cp;
1632 	PAGE *h;
1633 {
1634 	enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd;
1635 	BOVERFLOW bo;
1636 	DB *dbp;
1637 	DBT dbt;
1638 	DB_LOCK lock;
1639 	db_indx_t indx;
1640 	db_pgno_t pgno, next_pgno, prev_pgno;
1641 	int delete_page, local_page, ret;
1642 
1643 	dbp = dbc->dbp;
1644 
1645 	delete_page = ret = 0;
1646 
1647 	/* Figure out what we're deleting. */
1648 	if (cp->dpgno == PGNO_INVALID) {
1649 		pgno = cp->pgno;
1650 		indx = cp->indx;
1651 	} else {
1652 		pgno = cp->dpgno;
1653 		indx = cp->dindx;
1654 	}
1655 
1656 	/*
1657 	 * If the item is referenced by another cursor, set that cursor's
1658 	 * delete flag and leave it up to it to do the delete.
1659 	 *
1660 	 * !!!
1661 	 * This test for > 0 is a tricky.  There are two ways that we can
1662 	 * be called here.  Either we are closing the cursor or we've moved
1663 	 * off the page with the deleted entry.  In the first case, we've
1664 	 * already removed the cursor from the active queue, so we won't see
1665 	 * it in __bam_ca_delete. In the second case, it will be on a different
1666 	 * item, so we won't bother with it in __bam_ca_delete.
1667 	 */
1668 	if (__bam_ca_delete(dbp, pgno, indx, 1) > 0)
1669 		return (0);
1670 
1671 	/*
1672 	 * If this is concurrent DB, upgrade the lock if necessary.
1673 	 */
1674 	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) &&
1675 	    (ret = lock_get(dbp->dbenv->lk_info,
1676 	    dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
1677 	    &dbc->mylock)) != 0)
1678 		return (EAGAIN);
1679 
1680 	/*
1681 	 * If we don't already have the page locked, get it and delete the
1682 	 * items.
1683 	 */
1684 	if ((h == NULL || h->pgno != pgno)) {
1685 		if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
1686 			return (ret);
1687 		if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1688 			return (ret);
1689 		local_page = 1;
1690 	} else
1691 		local_page = 0;
1692 
1693 	/*
1694 	 * If we're deleting a duplicate entry and there are other duplicate
1695 	 * entries remaining, call the common code to do the work and fix up
1696 	 * the parent page as necessary.  Otherwise, do a normal btree delete.
1697 	 *
1698 	 * There are 5 possible cases:
1699 	 *
1700 	 * 1. It's not a duplicate item: do a normal btree delete.
1701 	 * 2. It's a duplicate item:
1702 	 *	2a: We delete an item from a page of duplicates, but there are
1703 	 *	    more items on the page.
1704 	 *      2b: We delete the last item from a page of duplicates, deleting
1705 	 *	    the last duplicate.
1706 	 *      2c: We delete the last item from a page of duplicates, but there
1707 	 *	    is a previous page of duplicates.
1708 	 *      2d: We delete the last item from a page of duplicates, but there
1709 	 *	    is a following page of duplicates.
1710 	 *
1711 	 * In the case of:
1712 	 *
1713 	 *  1: There's nothing further to do.
1714 	 * 2a: There's nothing further to do.
1715 	 * 2b: Do the normal btree delete instead of a duplicate delete, as
1716 	 *     that deletes both the duplicate chain and the parent page's
1717 	 *     entry.
1718 	 * 2c: There's nothing further to do.
1719 	 * 2d: Delete the duplicate, and update the parent page's entry.
1720 	 */
1721 	if (TYPE(h) == P_DUPLICATE) {
1722 		pgno = PGNO(h);
1723 		prev_pgno = PREV_PGNO(h);
1724 		next_pgno = NEXT_PGNO(h);
1725 
1726 		if (NUM_ENT(h) == 1 &&
1727 		    prev_pgno == PGNO_INVALID && next_pgno == PGNO_INVALID)
1728 			cmd = DELETE_PAGE;
1729 		else {
1730 			cmd = DELETE_ITEM;
1731 
1732 			/* Delete the duplicate. */
1733 			if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0)
1734 				goto err;
1735 
1736 			/*
1737 			 * 2a: h != NULL, h->pgno == pgno
1738 			 * 2b: We don't reach this clause, as the above test
1739 			 *     was true.
1740 			 * 2c: h == NULL, prev_pgno != PGNO_INVALID
1741 			 * 2d: h != NULL, next_pgno != PGNO_INVALID
1742 			 *
1743 			 * Test for 2a and 2c: if we didn't empty the current
1744 			 * page or there was a previous page of duplicates, we
1745 			 * don't need to touch the parent page.
1746 			 */
1747 			if ((h != NULL && pgno == h->pgno) ||
1748 			    prev_pgno != PGNO_INVALID)
1749 				cmd = NOTHING_FURTHER;
1750 		}
1751 
1752 		/*
1753 		 * Release any page we're holding and its lock.
1754 		 *
1755 		 * !!!
1756 		 * If there is no subsequent page in the duplicate chain, then
1757 		 * __db_drem will have put page "h" and set it to NULL.
1758 		*/
1759 		if (local_page) {
1760 			if (h != NULL)
1761 				(void)memp_fput(dbp->mpf, h, 0);
1762 			(void)__BT_TLPUT(dbc, lock);
1763 			local_page = 0;
1764 		}
1765 
1766 		if (cmd == NOTHING_FURTHER)
1767 			goto done;
1768 
1769 		/* Acquire the parent page and switch the index to its entry. */
1770 		if ((ret =
1771 		    __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
1772 			goto err;
1773 		if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) {
1774 			(void)__BT_TLPUT(dbc, lock);
1775 			goto err;
1776 		}
1777 		local_page = 1;
1778 		indx = cp->indx;
1779 
1780 		if (cmd == DELETE_PAGE)
1781 			goto btd;
1782 
1783 		/*
1784 		 * Copy, delete, update, add-back the parent page's data entry.
1785 		 *
1786 		 * XXX
1787 		 * This may be a performance/logging problem.  We should add a
1788 		 * log message which simply logs/updates a random set of bytes
1789 		 * on a page, and use it instead of doing a delete/add pair.
1790 		 */
1791 		indx += O_INDX;
1792 		bo = *GET_BOVERFLOW(h, indx);
1793 		(void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE);
1794 		bo.pgno = next_pgno;
1795 		memset(&dbt, 0, sizeof(dbt));
1796 		dbt.data = &bo;
1797 		dbt.size = BOVERFLOW_SIZE;
1798 		(void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL);
1799 		(void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
1800 		goto done;
1801 	}
1802 
1803 btd:	/*
1804 	 * If the page is going to be emptied, delete it.  To delete a leaf
1805 	 * page we need a copy of a key from the page.  We use the 0th page
1806 	 * index since it's the last key that the page held.
1807 	 *
1808 	 * We malloc the page information instead of using the return key/data
1809 	 * memory because we've already set them -- the reason we've already
1810 	 * set them is because we're (potentially) about to do a reverse split,
1811 	 * which would make our saved page information useless.
1812 	 *
1813 	 * !!!
1814 	 * The following operations to delete a page might deadlock.  I think
1815 	 * that's OK.  The problem is if we're deleting an item because we're
1816 	 * closing cursors because we've already deadlocked and want to call
1817 	 * txn_abort().  If we fail due to deadlock, we leave a locked empty
1818 	 * page in the tree, which won't be empty long because we're going to
1819 	 * undo the delete.
1820 	 */
1821 	if (NUM_ENT(h) == 2 && h->pgno != PGNO_ROOT) {
1822 		memset(&dbt, 0, sizeof(DBT));
1823 		dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1824 		if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1825 			goto err;
1826 		delete_page = 1;
1827 	}
1828 
1829 	/*
1830 	 * Do a normal btree delete.
1831 	 *
1832 	 * !!!
1833 	 * Delete the key item first, otherwise the duplicate checks in
1834 	 * __bam_ditem() won't work!
1835 	 */
1836 	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1837 		goto err;
1838 	if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1839 		goto err;
1840 
1841 	/* Discard any remaining locks/pages. */
1842 	if (local_page) {
1843 		(void)memp_fput(dbp->mpf, h, 0);
1844 		(void)__BT_TLPUT(dbc, lock);
1845 		local_page = 0;
1846 	}
1847 
1848 	/* Delete the page if it was emptied. */
1849 	if (delete_page)
1850 		ret = __bam_dpage(dbc, &dbt);
1851 
1852 err:
1853 done:	if (delete_page)
1854 		__os_free(dbt.data, dbt.size);
1855 
1856 	if (local_page) {
1857 		/*
1858 		 * It's possible for h to be NULL, as __db_drem may have
1859 		 * been relinking pages by the time that it deadlocked.
1860 		 */
1861 		if (h != NULL)
1862 			(void)memp_fput(dbp->mpf, h, 0);
1863 		(void)__BT_TLPUT(dbc, lock);
1864 	}
1865 
1866 	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1867 		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1868 		    DB_LOCK_IWRITE, 0);
1869 
1870 	return (ret);
1871 }
1872 
1873 /*
1874  * __bam_c_getstack --
1875  *	Acquire a full stack for a cursor.
1876  */
1877 static int
1878 __bam_c_getstack(dbc, cp)
1879 	DBC *dbc;
1880 	CURSOR *cp;
1881 {
1882 	DB *dbp;
1883 	DBT dbt;
1884 	PAGE *h;
1885 	db_pgno_t pgno;
1886 	int exact, ret;
1887 
1888 	dbp = dbc->dbp;
1889 	h = NULL;
1890 	memset(&dbt, 0, sizeof(DBT));
1891 	ret = 0;
1892 
1893 	/* Get the page with the current item on it. */
1894 	pgno = cp->pgno;
1895 	if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1896 		return (ret);
1897 
1898 	/* Get a copy of a key from the page. */
1899 	dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1900 	if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1901 		goto err;
1902 
1903 	/* Get a write-locked stack for that page. */
1904 	exact = 0;
1905 	ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact);
1906 
1907 	/* We no longer need the key or the page. */
1908 err:	if (h != NULL)
1909 		(void)memp_fput(dbp->mpf, h, 0);
1910 	if (dbt.data != NULL)
1911 		__os_free(dbt.data, dbt.size);
1912 	return (ret);
1913 }
1914