1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
6 */
7
8 #include "config.h"
9
10 #ifndef lint
11 static const char sccsid[] = "@(#)bt_cursor.c 10.81 (Sleepycat) 12/16/98";
12 #endif /* not lint */
13
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
16
17 #include <errno.h>
18 #include <stdlib.h>
19 #include <string.h>
20 #endif
21
22 #include "db_int.h"
23 #include "db_page.h"
24 #include "btree.h"
25 #include "shqueue.h"
26 #include "db_shash.h"
27 #include "lock.h"
28 #include "lock_ext.h"
29
30 static int __bam_c_close __P((DBC *));
31 static int __bam_c_del __P((DBC *, u_int32_t));
32 static int __bam_c_destroy __P((DBC *));
33 static int __bam_c_first __P((DBC *, CURSOR *));
34 static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
35 static int __bam_c_getstack __P((DBC *, CURSOR *));
36 static int __bam_c_last __P((DBC *, CURSOR *));
37 static int __bam_c_next __P((DBC *, CURSOR *, int));
38 static int __bam_c_physdel __P((DBC *, CURSOR *, PAGE *));
39 static int __bam_c_prev __P((DBC *, CURSOR *));
40 static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
41 static void __bam_c_reset __P((CURSOR *));
42 static int __bam_c_rget __P((DBC *, DBT *, u_int32_t));
43 static int __bam_c_search __P((DBC *, CURSOR *, const DBT *, u_int32_t, int *));
44 static int __bam_dsearch __P((DBC *, CURSOR *, DBT *, u_int32_t *));
45
46 /* Discard the current page/lock held by a cursor. */
47 #undef DISCARD
48 #define DISCARD(dbc, cp) { \
49 if ((cp)->page != NULL) { \
50 (void)memp_fput((dbc)->dbp->mpf, (cp)->page, 0); \
51 (cp)->page = NULL; \
52 } \
53 if ((cp)->lock != LOCK_INVALID) { \
54 (void)__BT_TLPUT((dbc), (cp)->lock); \
55 (cp)->lock = LOCK_INVALID; \
56 } \
57 }
58
59 /* If the cursor references a deleted record. */
60 #undef IS_CUR_DELETED
61 #define IS_CUR_DELETED(cp) \
62 (((cp)->dpgno == PGNO_INVALID && \
63 B_DISSET(GET_BKEYDATA((cp)->page, \
64 (cp)->indx + O_INDX)->type)) || \
65 ((cp)->dpgno != PGNO_INVALID && \
66 B_DISSET(GET_BKEYDATA((cp)->page, (cp)->dindx)->type)))
67
68 /* If the cursor and index combination references a deleted record. */
69 #undef IS_DELETED
70 #define IS_DELETED(cp, indx) \
71 (((cp)->dpgno == PGNO_INVALID && \
72 B_DISSET(GET_BKEYDATA((cp)->page, (indx) + O_INDX)->type)) || \
73 ((cp)->dpgno != PGNO_INVALID && \
74 B_DISSET(GET_BKEYDATA((cp)->page, (indx))->type)))
75
76 /*
77 * Test to see if two cursors could point to duplicates of the same key,
78 * whether on-page or off-page. The leaf page numbers must be the same
79 * in both cases. In the case of off-page duplicates, the key indices
80 * on the leaf page will be the same. In the case of on-page duplicates,
81 * the duplicate page number must not be set, and the key index offsets
82 * must be the same. For the last test, as the saved copy of the cursor
83 * will not have a valid page pointer, we use the cursor's.
84 */
85 #undef POSSIBLE_DUPLICATE
86 #define POSSIBLE_DUPLICATE(cursor, saved_copy) \
87 ((cursor)->pgno == (saved_copy).pgno && \
88 ((cursor)->indx == (saved_copy).indx || \
89 ((cursor)->dpgno == PGNO_INVALID && \
90 (saved_copy).dpgno == PGNO_INVALID && \
91 (cursor)->page->inp[(cursor)->indx] == \
92 (cursor)->page->inp[(saved_copy).indx])))
93
94 /*
95 * __bam_c_reset --
96 * Initialize internal cursor structure.
97 */
98 static void
__bam_c_reset(cp)99 __bam_c_reset(cp)
100 CURSOR *cp;
101 {
102 cp->sp = cp->csp = cp->stack;
103 cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
104 cp->page = NULL;
105 cp->pgno = PGNO_INVALID;
106 cp->indx = 0;
107 cp->dpgno = PGNO_INVALID;
108 cp->dindx = 0;
109 cp->lock = LOCK_INVALID;
110 cp->mode = DB_LOCK_NG;
111 cp->recno = RECNO_OOB;
112 cp->flags = 0;
113 }
114
115 /*
116 * __bam_c_init --
117 * Initialize the access private portion of a cursor
118 *
119 * PUBLIC: int __bam_c_init __P((DBC *));
120 */
121 int
__bam_c_init(dbc)122 __bam_c_init(dbc)
123 DBC *dbc;
124 {
125 DB *dbp;
126 CURSOR *cp;
127 int ret;
128
129 if ((ret = __os_calloc(1, sizeof(CURSOR), &cp)) != 0)
130 return (ret);
131
132 dbp = dbc->dbp;
133 cp->dbc = dbc;
134
135 /*
136 * Logical record numbers are always the same size, and we don't want
137 * to have to check for space every time we return one. Allocate it
138 * in advance.
139 */
140 if (dbp->type == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
141 if ((ret = __os_malloc(sizeof(db_recno_t),
142 NULL, &dbc->rkey.data)) != 0) {
143 __os_free(cp, sizeof(CURSOR));
144 return (ret);
145 }
146 dbc->rkey.ulen = sizeof(db_recno_t);
147 }
148
149 /* Initialize methods. */
150 dbc->internal = cp;
151 if (dbp->type == DB_BTREE) {
152 dbc->c_am_close = __bam_c_close;
153 dbc->c_am_destroy = __bam_c_destroy;
154 dbc->c_del = __bam_c_del;
155 dbc->c_get = __bam_c_get;
156 dbc->c_put = __bam_c_put;
157 } else {
158 dbc->c_am_close = __bam_c_close;
159 dbc->c_am_destroy = __bam_c_destroy;
160 dbc->c_del = __ram_c_del;
161 dbc->c_get = __ram_c_get;
162 dbc->c_put = __ram_c_put;
163 }
164
165 /* Initialize dynamic information. */
166 __bam_c_reset(cp);
167
168 return (0);
169 }
170
171 /*
172 * __bam_c_close --
173 * Close down the cursor from a single use.
174 */
175 static int
__bam_c_close(dbc)176 __bam_c_close(dbc)
177 DBC *dbc;
178 {
179 CURSOR *cp;
180 DB *dbp;
181 int ret;
182
183 dbp = dbc->dbp;
184 cp = dbc->internal;
185 ret = 0;
186
187 /*
188 * If a cursor deleted a btree key, perform the actual deletion.
189 * (Recno keys are either deleted immediately or never deleted.)
190 */
191 if (dbp->type == DB_BTREE && F_ISSET(cp, C_DELETED))
192 ret = __bam_c_physdel(dbc, cp, NULL);
193
194 /* Discard any locks not acquired inside of a transaction. */
195 if (cp->lock != LOCK_INVALID) {
196 (void)__BT_TLPUT(dbc, cp->lock);
197 cp->lock = LOCK_INVALID;
198 }
199
200 /* Sanity checks. */
201 #ifdef DIAGNOSTIC
202 if (cp->csp != cp->stack)
203 __db_err(dbp->dbenv, "btree cursor close: stack not empty");
204 #endif
205
206 /* Initialize dynamic information. */
207 __bam_c_reset(cp);
208
209 return (ret);
210 }
211
212 /*
213 * __bam_c_destroy --
214 * Close a single cursor -- internal version.
215 */
216 static int
__bam_c_destroy(dbc)217 __bam_c_destroy(dbc)
218 DBC *dbc;
219 {
220 /* Discard the structures. */
221 __os_free(dbc->internal, sizeof(CURSOR));
222
223 return (0);
224 }
225
226 /*
227 * __bam_c_del --
228 * Delete using a cursor.
229 */
230 static int
__bam_c_del(dbc,flags)231 __bam_c_del(dbc, flags)
232 DBC *dbc;
233 u_int32_t flags;
234 {
235 CURSOR *cp;
236 DB *dbp;
237 DB_LOCK lock;
238 PAGE *h;
239 db_pgno_t pgno;
240 db_indx_t indx;
241 int ret;
242
243 dbp = dbc->dbp;
244 cp = dbc->internal;
245 h = NULL;
246
247 DB_PANIC_CHECK(dbp);
248
249 /* Check for invalid flags. */
250 if ((ret = __db_cdelchk(dbp, flags,
251 F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
252 return (ret);
253
254 /*
255 * If we are running CDB, this had better be either a write
256 * cursor or an immediate writer.
257 */
258 if (F_ISSET(dbp, DB_AM_CDB))
259 if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
260 return (EINVAL);
261
262 DEBUG_LWRITE(dbc, dbc->txn, "bam_c_del", NULL, NULL, flags);
263
264 /* If already deleted, return failure. */
265 if (F_ISSET(cp, C_DELETED))
266 return (DB_KEYEMPTY);
267
268 /*
269 * We don't physically delete the record until the cursor moves,
270 * so we have to have a long-lived write lock on the page instead
271 * of a long-lived read lock. Note, we have to have a read lock
272 * to even get here, so we simply discard it.
273 */
274 if (F_ISSET(dbp, DB_AM_LOCKING) && cp->mode != DB_LOCK_WRITE) {
275 if ((ret = __bam_lget(dbc,
276 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
277 goto err;
278 (void)__BT_TLPUT(dbc, cp->lock);
279 cp->lock = lock;
280 cp->mode = DB_LOCK_WRITE;
281 }
282
283 /*
284 * Acquire the underlying page (which may be different from the above
285 * page because it may be a duplicate page), and set the on-page and
286 * in-cursor delete flags. We don't need to lock it as we've already
287 * write-locked the page leading to it.
288 */
289 if (cp->dpgno == PGNO_INVALID) {
290 pgno = cp->pgno;
291 indx = cp->indx;
292 } else {
293 pgno = cp->dpgno;
294 indx = cp->dindx;
295 }
296
297 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
298 goto err;
299
300 /* Log the change. */
301 if (DB_LOGGING(dbc) &&
302 (ret = __bam_cdel_log(dbp->dbenv->lg_info, dbc->txn, &LSN(h),
303 0, dbp->log_fileid, PGNO(h), &LSN(h), indx)) != 0) {
304 (void)memp_fput(dbp->mpf, h, 0);
305 goto err;
306 }
307
308 /*
309 * Set the intent-to-delete flag on the page and update all cursors. */
310 if (cp->dpgno == PGNO_INVALID)
311 B_DSET(GET_BKEYDATA(h, indx + O_INDX)->type);
312 else
313 B_DSET(GET_BKEYDATA(h, indx)->type);
314 (void)__bam_ca_delete(dbp, pgno, indx, 1);
315
316 ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY);
317 h = NULL;
318
319 /*
320 * If the tree has record numbers, we have to adjust the counts.
321 *
322 * !!!
323 * This test is right -- we don't yet support duplicates and record
324 * numbers in the same tree, so ignore duplicates if DB_BT_RECNUM
325 * set.
326 */
327 if (F_ISSET(dbp, DB_BT_RECNUM)) {
328 if ((ret = __bam_c_getstack(dbc, cp)) != 0)
329 goto err;
330 if ((ret = __bam_adjust(dbc, -1)) != 0)
331 goto err;
332 (void)__bam_stkrel(dbc, 0);
333 }
334
335 err: if (h != NULL)
336 (void)memp_fput(dbp->mpf, h, 0);
337 return (ret);
338 }
339
340 /*
341 * __bam_c_get --
342 * Get using a cursor (btree).
343 */
344 static int
__bam_c_get(dbc,key,data,flags)345 __bam_c_get(dbc, key, data, flags)
346 DBC *dbc;
347 DBT *key, *data;
348 u_int32_t flags;
349 {
350 CURSOR *cp, copy, start;
351 DB *dbp;
352 PAGE *h;
353 int exact, ret, tmp_rmw;
354
355 dbp = dbc->dbp;
356 cp = dbc->internal;
357
358 DB_PANIC_CHECK(dbp);
359
360 /* Check for invalid flags. */
361 if ((ret = __db_cgetchk(dbp,
362 key, data, flags, cp->pgno != PGNO_INVALID)) != 0)
363 return (ret);
364
365 /* Clear OR'd in additional bits so we can check for flag equality. */
366 tmp_rmw = 0;
367 if (LF_ISSET(DB_RMW)) {
368 if (!F_ISSET(dbp, DB_AM_CDB)) {
369 tmp_rmw = 1;
370 F_SET(dbc, DBC_RMW);
371 }
372 LF_CLR(DB_RMW);
373 }
374
375 DEBUG_LREAD(dbc, dbc->txn, "bam_c_get",
376 flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
377
378 /*
379 * Return a cursor's record number. It has nothing to do with the
380 * cursor get code except that it's been rammed into the interface.
381 */
382 if (flags == DB_GET_RECNO) {
383 ret = __bam_c_rget(dbc, data, flags);
384 if (tmp_rmw)
385 F_CLR(dbc, DBC_RMW);
386 return (ret);
387 }
388
389 /*
390 * Initialize the cursor for a new retrieval. Clear the cursor's
391 * page pointer, it was set before this operation, and no longer
392 * has any meaning.
393 */
394 cp->page = NULL;
395 copy = *cp;
396 cp->lock = LOCK_INVALID;
397
398 switch (flags) {
399 case DB_CURRENT:
400 /* It's not possible to return a deleted record. */
401 if (F_ISSET(cp, C_DELETED)) {
402 ret = DB_KEYEMPTY;
403 goto err;
404 }
405
406 /* Acquire the current page. */
407 if ((ret = __bam_lget(dbc,
408 0, cp->pgno, DB_LOCK_READ, &cp->lock)) == 0)
409 ret = memp_fget(dbp->mpf,
410 cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
411 0, &cp->page);
412 if (ret != 0)
413 goto err;
414 break;
415 case DB_NEXT_DUP:
416 if (cp->pgno == PGNO_INVALID) {
417 ret = EINVAL;
418 goto err;
419 }
420 if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
421 goto err;
422
423 /* Make sure we didn't go past the end of the duplicates. */
424 if (!POSSIBLE_DUPLICATE(cp, copy)) {
425 ret = DB_NOTFOUND;
426 goto err;
427 }
428 break;
429 case DB_NEXT:
430 if (cp->pgno != PGNO_INVALID) {
431 if ((ret = __bam_c_next(dbc, cp, 1)) != 0)
432 goto err;
433 break;
434 }
435 /* FALLTHROUGH */
436 case DB_FIRST:
437 if ((ret = __bam_c_first(dbc, cp)) != 0)
438 goto err;
439 break;
440 case DB_PREV:
441 if (cp->pgno != PGNO_INVALID) {
442 if ((ret = __bam_c_prev(dbc, cp)) != 0)
443 goto err;
444 break;
445 }
446 /* FALLTHROUGH */
447 case DB_LAST:
448 if ((ret = __bam_c_last(dbc, cp)) != 0)
449 goto err;
450 break;
451 case DB_SET:
452 if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
453 goto err;
454
455 /*
456 * We cannot currently be referencing a deleted record, but we
457 * may be referencing off-page duplicates.
458 *
459 * If we're referencing off-page duplicates, move off-page.
460 * If we moved off-page, move to the next non-deleted record.
461 * If we moved to the next non-deleted record, check to make
462 * sure we didn't switch records because our current record
463 * had no non-deleted data items.
464 */
465 start = *cp;
466 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
467 goto err;
468 if (cp->dpgno != PGNO_INVALID && IS_CUR_DELETED(cp)) {
469 if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
470 goto err;
471 if (!POSSIBLE_DUPLICATE(cp, start)) {
472 ret = DB_NOTFOUND;
473 goto err;
474 }
475 }
476 break;
477 case DB_SET_RECNO:
478 if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
479 goto err;
480 break;
481 case DB_GET_BOTH:
482 if (F_ISSET(dbc, DBC_CONTINUE | DBC_KEYSET)) {
483 /* Acquire the current page. */
484 if ((ret = memp_fget(dbp->mpf,
485 cp->dpgno == PGNO_INVALID ? &cp->pgno : &cp->dpgno,
486 0, &cp->page)) != 0)
487 goto err;
488
489 /* If DBC_CONTINUE, move to the next item. */
490 if (F_ISSET(dbc, DBC_CONTINUE) &&
491 (ret = __bam_c_next(dbc, cp, 1)) != 0)
492 goto err;
493 } else {
494 if ((ret =
495 __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
496 goto err;
497
498 /*
499 * We may be referencing a duplicates page. Move to
500 * the first duplicate.
501 */
502 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
503 goto err;
504 }
505
506 /* Search for a matching entry. */
507 if ((ret = __bam_dsearch(dbc, cp, data, NULL)) != 0)
508 goto err;
509
510 /* Ignore deleted entries. */
511 if (IS_CUR_DELETED(cp)) {
512 ret = DB_NOTFOUND;
513 goto err;
514 }
515 break;
516 case DB_SET_RANGE:
517 if ((ret = __bam_c_search(dbc, cp, key, flags, &exact)) != 0)
518 goto err;
519
520 /*
521 * As we didn't require an exact match, the search function
522 * may have returned an entry past the end of the page. If
523 * so, move to the next entry.
524 */
525 if (cp->indx == NUM_ENT(cp->page) &&
526 (ret = __bam_c_next(dbc, cp, 0)) != 0)
527 goto err;
528
529 /*
530 * We may be referencing off-page duplicates, if so, move
531 * off-page.
532 */
533 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
534 goto err;
535
536 /*
537 * We may be referencing a deleted record, if so, move to
538 * the next non-deleted record.
539 */
540 if (IS_CUR_DELETED(cp) && (ret = __bam_c_next(dbc, cp, 0)) != 0)
541 goto err;
542 break;
543 }
544
545 /*
546 * Return the key if the user didn't give us one. If we've moved to
547 * a duplicate page, we may no longer have a pointer to the main page,
548 * so we have to go get it. We know that it's already read-locked,
549 * however, so we don't have to acquire a new lock.
550 */
551 if (flags != DB_SET) {
552 if (cp->dpgno != PGNO_INVALID) {
553 if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
554 goto err;
555 } else
556 h = cp->page;
557 ret = __db_ret(dbp,
558 h, cp->indx, key, &dbc->rkey.data, &dbc->rkey.ulen);
559 if (cp->dpgno != PGNO_INVALID)
560 (void)memp_fput(dbp->mpf, h, 0);
561 if (ret)
562 goto err;
563 }
564
565 /* Return the data. */
566 if ((ret = __db_ret(dbp, cp->page,
567 cp->dpgno == PGNO_INVALID ? cp->indx + O_INDX : cp->dindx,
568 data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
569 goto err;
570
571 /*
572 * If the previous cursor record has been deleted, physically delete
573 * the entry from the page. We clear the deleted flag before we call
574 * the underlying delete routine so that, if an error occurs, and we
575 * restore the cursor, the deleted flag is cleared. This is because,
576 * if we manage to physically modify the page, and then restore the
577 * cursor, we might try to repeat the page modification when closing
578 * the cursor.
579 */
580 if (F_ISSET(©, C_DELETED)) {
581 F_CLR(©, C_DELETED);
582 if ((ret = __bam_c_physdel(dbc, ©, cp->page)) != 0)
583 goto err;
584 }
585 F_CLR(cp, C_DELETED);
586
587 /* Release the previous lock, if any; the current lock is retained. */
588 if (copy.lock != LOCK_INVALID)
589 (void)__BT_TLPUT(dbc, copy.lock);
590
591 /* Release the current page. */
592 if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
593 goto err;
594
595 if (0) {
596 err: if (cp->page != NULL)
597 (void)memp_fput(dbp->mpf, cp->page, 0);
598 if (cp->lock != LOCK_INVALID)
599 (void)__BT_TLPUT(dbc, cp->lock);
600 *cp = copy;
601 }
602
603 /* Release temporary lock upgrade. */
604 if (tmp_rmw)
605 F_CLR(dbc, DBC_RMW);
606
607 return (ret);
608 }
609
610 /*
611 * __bam_dsearch --
612 * Search for a matching data item (or the first data item that's
613 * equal to or greater than the one we're searching for).
614 */
615 static int
__bam_dsearch(dbc,cp,data,iflagp)616 __bam_dsearch(dbc, cp, data, iflagp)
617 DBC *dbc;
618 CURSOR *cp;
619 DBT *data;
620 u_int32_t *iflagp;
621 {
622 DB *dbp;
623 CURSOR copy, last;
624 int cmp, ret;
625
626 dbp = dbc->dbp;
627
628 /*
629 * If iflagp is non-NULL, we're doing an insert.
630 *
631 * If the duplicates are off-page, use the duplicate search routine.
632 */
633 if (cp->dpgno != PGNO_INVALID) {
634 if ((ret = __db_dsearch(dbc, iflagp != NULL,
635 data, cp->dpgno, &cp->dindx, &cp->page, &cmp)) != 0)
636 return (ret);
637 cp->dpgno = cp->page->pgno;
638
639 if (iflagp == NULL) {
640 if (cmp != 0)
641 return (DB_NOTFOUND);
642 return (0);
643 }
644 *iflagp = DB_BEFORE;
645 return (0);
646 }
647
648 /* Otherwise, do the search ourselves. */
649 copy = *cp;
650 for (;;) {
651 /* Save the last interesting cursor position. */
652 last = *cp;
653
654 /* See if the data item matches the one we're looking for. */
655 if ((cmp = __bam_cmp(dbp, data, cp->page, cp->indx + O_INDX,
656 dbp->dup_compare == NULL ?
657 __bam_defcmp : dbp->dup_compare)) == 0) {
658 if (iflagp != NULL)
659 *iflagp = DB_AFTER;
660 return (0);
661 }
662
663 /*
664 * If duplicate entries are sorted, we're done if we find a
665 * page entry that sorts greater than the application item.
666 * If doing an insert, return success, otherwise DB_NOTFOUND.
667 */
668 if (dbp->dup_compare != NULL && cmp < 0) {
669 if (iflagp == NULL)
670 return (DB_NOTFOUND);
671 *iflagp = DB_BEFORE;
672 return (0);
673 }
674
675 /*
676 * Move to the next item. If we reach the end of the page and
677 * we're doing an insert, set the cursor to the last item and
678 * set the referenced memory location so callers know to insert
679 * after the item, instead of before it. If not inserting, we
680 * return DB_NOTFOUND.
681 */
682 if ((cp->indx += P_INDX) >= NUM_ENT(cp->page)) {
683 if (iflagp == NULL)
684 return (DB_NOTFOUND);
685 goto use_last;
686 }
687
688 /*
689 * Make sure we didn't go past the end of the duplicates. The
690 * error conditions are the same as above.
691 */
692 if (!POSSIBLE_DUPLICATE(cp, copy)) {
693 if (iflagp == NULL)
694 return (DB_NOTFOUND);
695 use_last: *cp = last;
696 *iflagp = DB_AFTER;
697 return (0);
698 }
699 }
700 /* NOTREACHED */
701 }
702
703 /*
704 * __bam_c_rget --
705 * Return the record number for a cursor.
706 */
707 static int
__bam_c_rget(dbc,data,flags)708 __bam_c_rget(dbc, data, flags)
709 DBC *dbc;
710 DBT *data;
711 u_int32_t flags;
712 {
713 CURSOR *cp;
714 DB *dbp;
715 DBT dbt;
716 db_recno_t recno;
717 int exact, ret;
718
719 COMPQUIET(flags, 0);
720 dbp = dbc->dbp;
721 cp = dbc->internal;
722
723 /* Get the page with the current item on it. */
724 if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
725 return (ret);
726
727 /* Get a copy of the key. */
728 memset(&dbt, 0, sizeof(DBT));
729 dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
730 if ((ret = __db_ret(dbp, cp->page, cp->indx, &dbt, NULL, NULL)) != 0)
731 goto err;
732
733 exact = 1;
734 if ((ret = __bam_search(dbc, &dbt,
735 F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
736 1, &recno, &exact)) != 0)
737 goto err;
738
739 ret = __db_retcopy(data, &recno, sizeof(recno),
740 &dbc->rdata.data, &dbc->rdata.ulen, dbp->db_malloc);
741
742 /* Release the stack. */
743 __bam_stkrel(dbc, 0);
744
745 err: (void)memp_fput(dbp->mpf, cp->page, 0);
746 __os_free(dbt.data, dbt.size);
747 return (ret);
748 }
749
750 /*
751 * __bam_c_put --
752 * Put using a cursor.
753 */
754 static int
__bam_c_put(dbc,key,data,flags)755 __bam_c_put(dbc, key, data, flags)
756 DBC *dbc;
757 DBT *key, *data;
758 u_int32_t flags;
759 {
760 CURSOR *cp, copy;
761 DB *dbp;
762 DBT dbt;
763 db_indx_t indx;
764 db_pgno_t pgno;
765 u_int32_t iiflags, iiop;
766 int exact, needkey, ret, stack;
767 void *arg;
768
769 dbp = dbc->dbp;
770 cp = dbc->internal;
771
772 DB_PANIC_CHECK(dbp);
773
774 DEBUG_LWRITE(dbc, dbc->txn, "bam_c_put",
775 flags == DB_KEYFIRST || flags == DB_KEYLAST ? key : NULL,
776 data, flags);
777
778 if ((ret = __db_cputchk(dbp, key, data, flags,
779 F_ISSET(dbp, DB_AM_RDONLY), cp->pgno != PGNO_INVALID)) != 0)
780 return (ret);
781
782 /*
783 * If we are running CDB, this had better be either a write
784 * cursor or an immediate writer. If it's a regular writer,
785 * that means we have an IWRITE lock and we need to upgrade
786 * it to a write lock.
787 */
788 if (F_ISSET(dbp, DB_AM_CDB)) {
789 if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
790 return (EINVAL);
791
792 if (F_ISSET(dbc, DBC_RMW) &&
793 (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
794 DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
795 &dbc->mylock)) != 0)
796 return (EAGAIN);
797 }
798
799 if (0) {
800 split: /*
801 * To split, we need a valid key for the page. Since it's a
802 * cursor, we have to build one.
803 *
804 * Acquire a copy of a key from the page.
805 */
806 if (needkey) {
807 memset(&dbt, 0, sizeof(DBT));
808 if ((ret = __db_ret(dbp, cp->page, indx,
809 &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
810 goto err;
811 arg = &dbt;
812 } else
813 arg = key;
814
815 /*
816 * Discard any locks and pinned pages (the locks are discarded
817 * even if we're running with transactions, as they lock pages
818 * that we're sorry we ever acquired). If stack is set and the
819 * cursor entries are valid, they point to the same entries as
820 * the stack, don't free them twice.
821 */
822 if (stack) {
823 (void)__bam_stkrel(dbc, 1);
824 stack = 0;
825 } else
826 DISCARD(dbc, cp);
827
828 /*
829 * Restore the cursor to its original value. This is necessary
830 * for two reasons. First, we are about to copy it in case of
831 * error, again. Second, we adjust cursors during the split,
832 * and we have to ensure this cursor is adjusted appropriately,
833 * along with all the other cursors.
834 */
835 *cp = copy;
836
837 if ((ret = __bam_split(dbc, arg)) != 0)
838 goto err;
839 }
840
841 /*
842 * Initialize the cursor for a new retrieval. Clear the cursor's
843 * page pointer, it was set before this operation, and no longer
844 * has any meaning.
845 */
846 cp->page = NULL;
847 copy = *cp;
848 cp->lock = LOCK_INVALID;
849
850 iiflags = needkey = ret = stack = 0;
851 switch (flags) {
852 case DB_AFTER:
853 case DB_BEFORE:
854 case DB_CURRENT:
855 needkey = 1;
856 if (cp->dpgno == PGNO_INVALID) {
857 pgno = cp->pgno;
858 indx = cp->indx;
859 } else {
860 pgno = cp->dpgno;
861 indx = cp->dindx;
862 }
863
864 /*
865 * !!!
866 * This test is right -- we don't yet support duplicates and
867 * record numbers in the same tree, so ignore duplicates if
868 * DB_BT_RECNUM set.
869 */
870 if (F_ISSET(dbp, DB_BT_RECNUM) &&
871 (flags != DB_CURRENT || F_ISSET(cp, C_DELETED))) {
872 /* Acquire a complete stack. */
873 if ((ret = __bam_c_getstack(dbc, cp)) != 0)
874 goto err;
875 cp->page = cp->csp->page;
876
877 stack = 1;
878 iiflags = BI_DOINCR;
879 } else {
880 /* Acquire the current page. */
881 if ((ret = __bam_lget(dbc,
882 0, cp->pgno, DB_LOCK_WRITE, &cp->lock)) == 0)
883 ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page);
884 if (ret != 0)
885 goto err;
886
887 iiflags = 0;
888 }
889
890 /*
891 * If the user has specified a duplicate comparison function,
892 * we return an error if DB_CURRENT was specified and the
893 * replacement data doesn't compare equal to the current data.
894 * This stops apps from screwing up the duplicate sort order.
895 */
896 if (flags == DB_CURRENT && dbp->dup_compare != NULL)
897 if (__bam_cmp(dbp, data,
898 cp->page, indx, dbp->dup_compare) != 0) {
899 ret = EINVAL;
900 goto err;
901 }
902
903 iiop = flags;
904 break;
905 case DB_KEYFIRST:
906 case DB_KEYLAST:
907 /*
908 * If we have a duplicate comparison function, we position to
909 * the first of any on-page duplicates, and use __bam_dsearch
910 * to search for the right slot. Otherwise, we position to
911 * the first/last of any on-page duplicates based on the flag
912 * value.
913 */
914 if ((ret = __bam_c_search(dbc, cp, key,
915 flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
916 DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
917 goto err;
918 stack = 1;
919
920 /*
921 * If an exact match:
922 * If duplicates aren't supported, replace the current
923 * item. (When implementing the DB->put function, our
924 * caller has already checked the DB_NOOVERWRITE flag.)
925 *
926 * If there's a duplicate comparison function, find the
927 * correct slot for this duplicate item.
928 *
929 * If there's no duplicate comparison function, set the
930 * insert flag based on the argument flags.
931 *
932 * If there's no match, the search function returned the
933 * smallest slot greater than the key, use it.
934 */
935 if (exact) {
936 if (F_ISSET(dbp, DB_AM_DUP)) {
937 /*
938 * If at off-page duplicate page, move to the
939 * first or last entry -- if a comparison
940 * function was specified, start searching at
941 * the first entry. Otherwise, move based on
942 * the DB_KEYFIRST/DB_KEYLAST flags.
943 */
944 if ((ret = __bam_dup(dbc, cp, cp->indx,
945 dbp->dup_compare == NULL &&
946 flags != DB_KEYFIRST)) != 0)
947 goto err;
948
949 /*
950 * If there's a comparison function, search for
951 * the correct slot. Otherwise, set the insert
952 * flag based on the argment flag.
953 */
954 if (dbp->dup_compare == NULL)
955 iiop = flags == DB_KEYFIRST ?
956 DB_BEFORE : DB_AFTER;
957 else
958 if ((ret = __bam_dsearch(dbc,
959 cp, data, &iiop)) != 0)
960 goto err;
961 } else
962 iiop = DB_CURRENT;
963 iiflags = 0;
964 } else {
965 iiop = DB_BEFORE;
966 iiflags = BI_NEWKEY;
967 }
968
969 if (cp->dpgno == PGNO_INVALID) {
970 pgno = cp->pgno;
971 indx = cp->indx;
972 } else {
973 pgno = cp->dpgno;
974 indx = cp->dindx;
975 }
976 break;
977 }
978
979 ret = __bam_iitem(dbc, &cp->page, &indx, key, data, iiop, iiflags);
980
981 if (ret == DB_NEEDSPLIT)
982 goto split;
983 if (ret != 0)
984 goto err;
985
986 /*
987 * Reset any cursors referencing this item that might have the item
988 * marked for deletion.
989 */
990 if (iiop == DB_CURRENT) {
991 (void)__bam_ca_delete(dbp, pgno, indx, 0);
992
993 /*
994 * It's also possible that we are the cursor that had the
995 * item marked for deletion, in which case we want to make
996 * sure that we don't delete it because we had the delete
997 * flag set already.
998 */
999 if (cp->pgno == copy.pgno && cp->indx == copy.indx &&
1000 cp->dpgno == copy.dpgno && cp->dindx == copy.dindx)
1001 F_CLR(©, C_DELETED);
1002 }
1003
1004 /*
1005 * Update the cursor to point to the new entry. The new entry was
1006 * stored on the current page, because we split pages until it was
1007 * possible.
1008 */
1009 if (cp->dpgno == PGNO_INVALID)
1010 cp->indx = indx;
1011 else
1012 cp->dindx = indx;
1013
1014 /*
1015 * If the previous cursor record has been deleted, physically delete
1016 * the entry from the page. We clear the deleted flag before we call
1017 * the underlying delete routine so that, if an error occurs, and we
1018 * restore the cursor, the deleted flag is cleared. This is because,
1019 * if we manage to physically modify the page, and then restore the
1020 * cursor, we might try to repeat the page modification when closing
1021 * the cursor.
1022 */
1023 if (F_ISSET(©, C_DELETED)) {
1024 F_CLR(©, C_DELETED);
1025 if ((ret = __bam_c_physdel(dbc, ©, cp->page)) != 0)
1026 goto err;
1027 }
1028 F_CLR(cp, C_DELETED);
1029
1030 /* Release the previous lock, if any; the current lock is retained. */
1031 if (copy.lock != LOCK_INVALID)
1032 (void)__BT_TLPUT(dbc, copy.lock);
1033
1034 /*
1035 * Discard any pages pinned in the tree and their locks, except for
1036 * the leaf page, for which we only discard the pin, not the lock.
1037 *
1038 * Note, the leaf page participated in the stack we acquired, and so
1039 * we have to adjust the stack as necessary. If there was only a
1040 * single page on the stack, we don't have to free further stack pages.
1041 */
1042 if (stack && BT_STK_POP(cp) != NULL)
1043 (void)__bam_stkrel(dbc, 0);
1044
1045 /* Release the current page. */
1046 if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1047 goto err;
1048
1049 if (0) {
1050 err: /* Discard any pinned pages. */
1051 if (stack)
1052 (void)__bam_stkrel(dbc, 0);
1053 else
1054 DISCARD(dbc, cp);
1055 *cp = copy;
1056 }
1057
1058 if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1059 (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1060 DB_LOCK_IWRITE, 0);
1061
1062 return (ret);
1063 }
1064
1065 /*
1066 * __bam_c_first --
1067 * Return the first record.
1068 */
1069 static int
__bam_c_first(dbc,cp)1070 __bam_c_first(dbc, cp)
1071 DBC *dbc;
1072 CURSOR *cp;
1073 {
1074 DB *dbp;
1075 db_pgno_t pgno;
1076 int ret;
1077
1078 dbp = dbc->dbp;
1079
1080 /* Walk down the left-hand side of the tree. */
1081 for (pgno = PGNO_ROOT;;) {
1082 if ((ret =
1083 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1084 return (ret);
1085 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1086 return (ret);
1087
1088 /* If we find a leaf page, we're done. */
1089 if (ISLEAF(cp->page))
1090 break;
1091
1092 pgno = GET_BINTERNAL(cp->page, 0)->pgno;
1093 DISCARD(dbc, cp);
1094 }
1095
1096 cp->pgno = cp->page->pgno;
1097 cp->indx = 0;
1098 cp->dpgno = PGNO_INVALID;
1099
1100 /* Check for duplicates. */
1101 if ((ret = __bam_dup(dbc, cp, cp->indx, 0)) != 0)
1102 return (ret);
1103
1104 /* If on an empty page or a deleted record, move to the next one. */
1105 if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1106 if ((ret = __bam_c_next(dbc, cp, 0)) != 0)
1107 return (ret);
1108
1109 return (0);
1110 }
1111
1112 /*
1113 * __bam_c_last --
1114 * Return the last record.
1115 */
1116 static int
__bam_c_last(dbc,cp)1117 __bam_c_last(dbc, cp)
1118 DBC *dbc;
1119 CURSOR *cp;
1120 {
1121 DB *dbp;
1122 db_pgno_t pgno;
1123 int ret;
1124
1125 dbp = dbc->dbp;
1126
1127 /* Walk down the right-hand side of the tree. */
1128 for (pgno = PGNO_ROOT;;) {
1129 if ((ret =
1130 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1131 return (ret);
1132 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1133 return (ret);
1134
1135 /* If we find a leaf page, we're done. */
1136 if (ISLEAF(cp->page))
1137 break;
1138
1139 pgno =
1140 GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
1141 DISCARD(dbc, cp);
1142 }
1143
1144 cp->pgno = cp->page->pgno;
1145 cp->indx = NUM_ENT(cp->page) == 0 ? 0 : NUM_ENT(cp->page) - P_INDX;
1146 cp->dpgno = PGNO_INVALID;
1147
1148 /* Check for duplicates. */
1149 if ((ret = __bam_dup(dbc, cp, cp->indx, 1)) != 0)
1150 return (ret);
1151
1152 /* If on an empty page or a deleted record, move to the next one. */
1153 if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(cp))
1154 if ((ret = __bam_c_prev(dbc, cp)) != 0)
1155 return (ret);
1156
1157 return (0);
1158 }
1159
1160 /*
1161 * __bam_c_next --
1162 * Move to the next record.
1163 */
1164 static int
__bam_c_next(dbc,cp,initial_move)1165 __bam_c_next(dbc, cp, initial_move)
1166 DBC *dbc;
1167 CURSOR *cp;
1168 int initial_move;
1169 {
1170 DB *dbp;
1171 db_indx_t adjust, indx;
1172 db_pgno_t pgno;
1173 int ret;
1174
1175 dbp = dbc->dbp;
1176
1177 /*
1178 * We're either moving through a page of duplicates or a btree leaf
1179 * page.
1180 */
1181 if (cp->dpgno == PGNO_INVALID) {
1182 adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1183 pgno = cp->pgno;
1184 indx = cp->indx;
1185 } else {
1186 adjust = O_INDX;
1187 pgno = cp->dpgno;
1188 indx = cp->dindx;
1189 }
1190 if (cp->page == NULL) {
1191 if ((ret =
1192 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1193 return (ret);
1194 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1195 return (ret);
1196 }
1197
1198 /*
1199 * If at the end of the page, move to a subsequent page.
1200 *
1201 * !!!
1202 * Check for >= NUM_ENT. If we're here as the result of a search that
1203 * landed us on NUM_ENT, we'll increment indx before we test.
1204 *
1205 * !!!
1206 * This code handles empty pages and pages with only deleted entries.
1207 */
1208 if (initial_move)
1209 indx += adjust;
1210 for (;;) {
1211 if (indx >= NUM_ENT(cp->page)) {
1212 /*
1213 * If we're in a btree leaf page, we've reached the end
1214 * of the tree. If we've reached the end of a page of
1215 * duplicates, continue from the btree leaf page where
1216 * we found this page of duplicates.
1217 */
1218 pgno = cp->page->next_pgno;
1219 if (pgno == PGNO_INVALID) {
1220 /* If in a btree leaf page, it's EOF. */
1221 if (cp->dpgno == PGNO_INVALID)
1222 return (DB_NOTFOUND);
1223
1224 /* Continue from the last btree leaf page. */
1225 cp->dpgno = PGNO_INVALID;
1226
1227 adjust = P_INDX;
1228 pgno = cp->pgno;
1229 indx = cp->indx + P_INDX;
1230 } else
1231 indx = 0;
1232
1233 DISCARD(dbc, cp);
1234 if ((ret = __bam_lget(dbc,
1235 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1236 return (ret);
1237 if ((ret =
1238 memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1239 return (ret);
1240 continue;
1241 }
1242
1243 /* Ignore deleted records. */
1244 if (IS_DELETED(cp, indx)) {
1245 indx += adjust;
1246 continue;
1247 }
1248
1249 /*
1250 * If we're not in a duplicates page, check to see if we've
1251 * found a page of duplicates, in which case we move to the
1252 * first entry.
1253 */
1254 if (cp->dpgno == PGNO_INVALID) {
1255 cp->pgno = cp->page->pgno;
1256 cp->indx = indx;
1257
1258 if ((ret = __bam_dup(dbc, cp, indx, 0)) != 0)
1259 return (ret);
1260 if (cp->dpgno != PGNO_INVALID) {
1261 indx = cp->dindx;
1262 adjust = O_INDX;
1263 continue;
1264 }
1265 } else {
1266 cp->dpgno = cp->page->pgno;
1267 cp->dindx = indx;
1268 }
1269 break;
1270 }
1271 return (0);
1272 }
1273
1274 /*
1275 * __bam_c_prev --
1276 * Move to the previous record.
1277 */
1278 static int
__bam_c_prev(dbc,cp)1279 __bam_c_prev(dbc, cp)
1280 DBC *dbc;
1281 CURSOR *cp;
1282 {
1283 DB *dbp;
1284 db_indx_t indx, adjust;
1285 db_pgno_t pgno;
1286 int ret, set_indx;
1287
1288 dbp = dbc->dbp;
1289
1290 /*
1291 * We're either moving through a page of duplicates or a btree leaf
1292 * page.
1293 */
1294 if (cp->dpgno == PGNO_INVALID) {
1295 adjust = dbp->type == DB_BTREE ? P_INDX : O_INDX;
1296 pgno = cp->pgno;
1297 indx = cp->indx;
1298 } else {
1299 adjust = O_INDX;
1300 pgno = cp->dpgno;
1301 indx = cp->dindx;
1302 }
1303 if (cp->page == NULL) {
1304 if ((ret =
1305 __bam_lget(dbc, 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1306 return (ret);
1307 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1308 return (ret);
1309 }
1310
1311 /*
1312 * If at the beginning of the page, move to any previous one.
1313 *
1314 * !!!
1315 * This code handles empty pages and pages with only deleted entries.
1316 */
1317 for (;;) {
1318 if (indx == 0) {
1319 /*
1320 * If we're in a btree leaf page, we've reached the
1321 * beginning of the tree. If we've reached the first
1322 * of a page of duplicates, continue from the btree
1323 * leaf page where we found this page of duplicates.
1324 */
1325 pgno = cp->page->prev_pgno;
1326 if (pgno == PGNO_INVALID) {
1327 /* If in a btree leaf page, it's SOF. */
1328 if (cp->dpgno == PGNO_INVALID)
1329 return (DB_NOTFOUND);
1330
1331 /* Continue from the last btree leaf page. */
1332 cp->dpgno = PGNO_INVALID;
1333
1334 adjust = P_INDX;
1335 pgno = cp->pgno;
1336 indx = cp->indx;
1337 set_indx = 0;
1338 } else
1339 set_indx = 1;
1340
1341 DISCARD(dbc, cp);
1342 if ((ret = __bam_lget(dbc,
1343 0, pgno, DB_LOCK_READ, &cp->lock)) != 0)
1344 return (ret);
1345 if ((ret =
1346 memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1347 return (ret);
1348
1349 if (set_indx)
1350 indx = NUM_ENT(cp->page);
1351 if (indx == 0)
1352 continue;
1353 }
1354
1355 /* Ignore deleted records. */
1356 indx -= adjust;
1357 if (IS_DELETED(cp, indx))
1358 continue;
1359
1360 /*
1361 * If we're not in a duplicates page, check to see if we've
1362 * found a page of duplicates, in which case we move to the
1363 * last entry.
1364 */
1365 if (cp->dpgno == PGNO_INVALID) {
1366 cp->pgno = cp->page->pgno;
1367 cp->indx = indx;
1368
1369 if ((ret = __bam_dup(dbc, cp, indx, 1)) != 0)
1370 return (ret);
1371 if (cp->dpgno != PGNO_INVALID) {
1372 indx = cp->dindx + O_INDX;
1373 adjust = O_INDX;
1374 continue;
1375 }
1376 } else {
1377 cp->dpgno = cp->page->pgno;
1378 cp->dindx = indx;
1379 }
1380 break;
1381 }
1382 return (0);
1383 }
1384
1385 /*
1386 * __bam_c_search --
1387 * Move to a specified record.
1388 */
1389 static int
__bam_c_search(dbc,cp,key,flags,exactp)1390 __bam_c_search(dbc, cp, key, flags, exactp)
1391 DBC *dbc;
1392 CURSOR *cp;
1393 const DBT *key;
1394 u_int32_t flags;
1395 int *exactp;
1396 {
1397 BTREE *t;
1398 DB *dbp;
1399 DB_LOCK lock;
1400 PAGE *h;
1401 db_recno_t recno;
1402 db_indx_t indx;
1403 u_int32_t sflags;
1404 int cmp, needexact, ret;
1405
1406 dbp = dbc->dbp;
1407 t = dbp->internal;
1408
1409 /* Find an entry in the database. */
1410 switch (flags) {
1411 case DB_SET_RECNO:
1412 if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
1413 return (ret);
1414 sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1415 needexact = *exactp = 1;
1416 ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp);
1417 break;
1418 case DB_SET:
1419 case DB_GET_BOTH:
1420 sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1421 needexact = *exactp = 1;
1422 goto search;
1423 case DB_SET_RANGE:
1424 sflags = F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND;
1425 needexact = *exactp = 0;
1426 goto search;
1427 case DB_KEYFIRST:
1428 sflags = S_KEYFIRST;
1429 goto fast_search;
1430 case DB_KEYLAST:
1431 sflags = S_KEYLAST;
1432 fast_search: needexact = *exactp = 0;
1433 /*
1434 * If the application has a history of inserting into the first
1435 * or last pages of the database, we check those pages first to
1436 * avoid doing a full search.
1437 *
1438 * Record numbers can't be fast-tracked, the entire tree has to
1439 * be locked.
1440 */
1441 h = NULL;
1442 lock = LOCK_INVALID;
1443 if (F_ISSET(dbp, DB_BT_RECNUM))
1444 goto search;
1445
1446 /* Check if the application has a history of sorted input. */
1447 if (t->bt_lpgno == PGNO_INVALID)
1448 goto search;
1449
1450 /*
1451 * Lock and retrieve the page on which we did the last insert.
1452 * It's okay if it doesn't exist, or if it's not the page type
1453 * we expected, it just means that the world changed.
1454 */
1455 if (__bam_lget(dbc, 0, t->bt_lpgno, DB_LOCK_WRITE, &lock))
1456 goto fast_miss;
1457 if (memp_fget(dbp->mpf, &t->bt_lpgno, 0, &h))
1458 goto fast_miss;
1459 if (TYPE(h) != P_LBTREE)
1460 goto fast_miss;
1461 if (NUM_ENT(h) == 0)
1462 goto fast_miss;
1463
1464 /*
1465 * What we do here is test to see if we're at the beginning or
1466 * end of the tree and if the new item sorts before/after the
1467 * first/last page entry. We don't try and catch inserts into
1468 * the middle of the tree (although we could, as long as there
1469 * were two keys on the page and we saved both the index and
1470 * the page number of the last insert).
1471 */
1472 if (h->next_pgno == PGNO_INVALID) {
1473 indx = NUM_ENT(h) - P_INDX;
1474 if ((cmp =
1475 __bam_cmp(dbp, key, h, indx, t->bt_compare)) < 0)
1476 goto try_begin;
1477 if (cmp > 0) {
1478 indx += P_INDX;
1479 goto fast_hit;
1480 }
1481
1482 /*
1483 * Found a duplicate. If doing DB_KEYLAST, we're at
1484 * the correct position, otherwise, move to the first
1485 * of the duplicates.
1486 */
1487 if (flags == DB_KEYLAST)
1488 goto fast_hit;
1489 for (;
1490 indx > 0 && h->inp[indx - P_INDX] == h->inp[indx];
1491 indx -= P_INDX)
1492 ;
1493 goto fast_hit;
1494 }
1495 try_begin: if (h->prev_pgno == PGNO_INVALID) {
1496 indx = 0;
1497 if ((cmp =
1498 __bam_cmp(dbp, key, h, indx, t->bt_compare)) > 0)
1499 goto fast_miss;
1500 if (cmp < 0)
1501 goto fast_hit;
1502 /*
1503 * Found a duplicate. If doing DB_KEYFIRST, we're at
1504 * the correct position, otherwise, move to the last
1505 * of the duplicates.
1506 */
1507 if (flags == DB_KEYFIRST)
1508 goto fast_hit;
1509 for (;
1510 indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
1511 h->inp[indx] == h->inp[indx + P_INDX];
1512 indx += P_INDX)
1513 ;
1514 goto fast_hit;
1515 }
1516 goto fast_miss;
1517
1518 fast_hit: /* Set the exact match flag, we may have found a duplicate. */
1519 *exactp = cmp == 0;
1520
1521 /* Enter the entry in the stack. */
1522 BT_STK_CLR(cp);
1523 BT_STK_ENTER(cp, h, indx, lock, ret);
1524 break;
1525
1526 fast_miss: if (h != NULL)
1527 (void)memp_fput(dbp->mpf, h, 0);
1528 if (lock != LOCK_INVALID)
1529 (void)__BT_LPUT(dbc, lock);
1530
1531 search: ret = __bam_search(dbc, key, sflags, 1, NULL, exactp);
1532 break;
1533 default: /* XXX: Impossible. */
1534 abort();
1535 /* NOTREACHED */
1536 }
1537 if (ret != 0)
1538 return (ret);
1539
1540 /*
1541 * Initialize the cursor to reference it. This has to be done
1542 * before we return (even with DB_NOTFOUND) because we have to
1543 * free the page(s) we locked in __bam_search.
1544 */
1545 cp->page = cp->csp->page;
1546 cp->pgno = cp->csp->page->pgno;
1547 cp->indx = cp->csp->indx;
1548 cp->lock = cp->csp->lock;
1549 cp->dpgno = PGNO_INVALID;
1550
1551 /*
1552 * If we inserted a key into the first or last slot of the tree,
1553 * remember where it was so we can do it more quickly next time.
1554 */
1555 if (flags == DB_KEYFIRST || flags == DB_KEYLAST)
1556 t->bt_lpgno =
1557 ((cp->page->next_pgno == PGNO_INVALID &&
1558 cp->indx >= NUM_ENT(cp->page)) ||
1559 (cp->page->prev_pgno == PGNO_INVALID && cp->indx == 0)) ?
1560 cp->pgno : PGNO_INVALID;
1561
1562 /* If we need an exact match and didn't find one, we're done. */
1563 if (needexact && *exactp == 0)
1564 return (DB_NOTFOUND);
1565
1566 return (0);
1567 }
1568
1569 /*
1570 * __bam_dup --
1571 * Check for an off-page duplicates entry, and if found, move to the
1572 * first or last entry.
1573 *
1574 * PUBLIC: int __bam_dup __P((DBC *, CURSOR *, u_int32_t, int));
1575 */
1576 int
__bam_dup(dbc,cp,indx,last_dup)1577 __bam_dup(dbc, cp, indx, last_dup)
1578 DBC *dbc;
1579 CURSOR *cp;
1580 u_int32_t indx;
1581 int last_dup;
1582 {
1583 BOVERFLOW *bo;
1584 DB *dbp;
1585 db_pgno_t pgno;
1586 int ret;
1587
1588 dbp = dbc->dbp;
1589
1590 /*
1591 * Check for an overflow entry. If we find one, move to the
1592 * duplicates page, and optionally move to the last record on
1593 * that page.
1594 *
1595 * !!!
1596 * We don't lock duplicates pages, we've already got the correct
1597 * lock on the main page.
1598 */
1599 bo = GET_BOVERFLOW(cp->page, indx + O_INDX);
1600 if (B_TYPE(bo->type) != B_DUPLICATE)
1601 return (0);
1602
1603 pgno = bo->pgno;
1604 if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
1605 return (ret);
1606 cp->page = NULL;
1607 if (last_dup) {
1608 if ((ret = __db_dend(dbc, pgno, &cp->page)) != 0)
1609 return (ret);
1610 indx = NUM_ENT(cp->page) - O_INDX;
1611 } else {
1612 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &cp->page)) != 0)
1613 return (ret);
1614 indx = 0;
1615 }
1616
1617 /* Update the cursor's duplicate information. */
1618 cp->dpgno = cp->page->pgno;
1619 cp->dindx = indx;
1620
1621 return (0);
1622 }
1623
1624 /*
1625 * __bam_c_physdel --
1626 * Actually do the cursor deletion.
1627 */
1628 static int
__bam_c_physdel(dbc,cp,h)1629 __bam_c_physdel(dbc, cp, h)
1630 DBC *dbc;
1631 CURSOR *cp;
1632 PAGE *h;
1633 {
1634 enum { DELETE_ITEM, DELETE_PAGE, NOTHING_FURTHER } cmd;
1635 BOVERFLOW bo;
1636 DB *dbp;
1637 DBT dbt;
1638 DB_LOCK lock;
1639 db_indx_t indx;
1640 db_pgno_t pgno, next_pgno, prev_pgno;
1641 int delete_page, local_page, ret;
1642
1643 dbp = dbc->dbp;
1644
1645 delete_page = ret = 0;
1646
1647 /* Figure out what we're deleting. */
1648 if (cp->dpgno == PGNO_INVALID) {
1649 pgno = cp->pgno;
1650 indx = cp->indx;
1651 } else {
1652 pgno = cp->dpgno;
1653 indx = cp->dindx;
1654 }
1655
1656 /*
1657 * If the item is referenced by another cursor, set that cursor's
1658 * delete flag and leave it up to it to do the delete.
1659 *
1660 * !!!
1661 * This test for > 0 is a tricky. There are two ways that we can
1662 * be called here. Either we are closing the cursor or we've moved
1663 * off the page with the deleted entry. In the first case, we've
1664 * already removed the cursor from the active queue, so we won't see
1665 * it in __bam_ca_delete. In the second case, it will be on a different
1666 * item, so we won't bother with it in __bam_ca_delete.
1667 */
1668 if (__bam_ca_delete(dbp, pgno, indx, 1) > 0)
1669 return (0);
1670
1671 /*
1672 * If this is concurrent DB, upgrade the lock if necessary.
1673 */
1674 if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW) &&
1675 (ret = lock_get(dbp->dbenv->lk_info,
1676 dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
1677 &dbc->mylock)) != 0)
1678 return (EAGAIN);
1679
1680 /*
1681 * If we don't already have the page locked, get it and delete the
1682 * items.
1683 */
1684 if ((h == NULL || h->pgno != pgno)) {
1685 if ((ret = __bam_lget(dbc, 0, pgno, DB_LOCK_WRITE, &lock)) != 0)
1686 return (ret);
1687 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1688 return (ret);
1689 local_page = 1;
1690 } else
1691 local_page = 0;
1692
1693 /*
1694 * If we're deleting a duplicate entry and there are other duplicate
1695 * entries remaining, call the common code to do the work and fix up
1696 * the parent page as necessary. Otherwise, do a normal btree delete.
1697 *
1698 * There are 5 possible cases:
1699 *
1700 * 1. It's not a duplicate item: do a normal btree delete.
1701 * 2. It's a duplicate item:
1702 * 2a: We delete an item from a page of duplicates, but there are
1703 * more items on the page.
1704 * 2b: We delete the last item from a page of duplicates, deleting
1705 * the last duplicate.
1706 * 2c: We delete the last item from a page of duplicates, but there
1707 * is a previous page of duplicates.
1708 * 2d: We delete the last item from a page of duplicates, but there
1709 * is a following page of duplicates.
1710 *
1711 * In the case of:
1712 *
1713 * 1: There's nothing further to do.
1714 * 2a: There's nothing further to do.
1715 * 2b: Do the normal btree delete instead of a duplicate delete, as
1716 * that deletes both the duplicate chain and the parent page's
1717 * entry.
1718 * 2c: There's nothing further to do.
1719 * 2d: Delete the duplicate, and update the parent page's entry.
1720 */
1721 if (TYPE(h) == P_DUPLICATE) {
1722 pgno = PGNO(h);
1723 prev_pgno = PREV_PGNO(h);
1724 next_pgno = NEXT_PGNO(h);
1725
1726 if (NUM_ENT(h) == 1 &&
1727 prev_pgno == PGNO_INVALID && next_pgno == PGNO_INVALID)
1728 cmd = DELETE_PAGE;
1729 else {
1730 cmd = DELETE_ITEM;
1731
1732 /* Delete the duplicate. */
1733 if ((ret = __db_drem(dbc, &h, indx, __bam_free)) != 0)
1734 goto err;
1735
1736 /*
1737 * 2a: h != NULL, h->pgno == pgno
1738 * 2b: We don't reach this clause, as the above test
1739 * was true.
1740 * 2c: h == NULL, prev_pgno != PGNO_INVALID
1741 * 2d: h != NULL, next_pgno != PGNO_INVALID
1742 *
1743 * Test for 2a and 2c: if we didn't empty the current
1744 * page or there was a previous page of duplicates, we
1745 * don't need to touch the parent page.
1746 */
1747 if ((h != NULL && pgno == h->pgno) ||
1748 prev_pgno != PGNO_INVALID)
1749 cmd = NOTHING_FURTHER;
1750 }
1751
1752 /*
1753 * Release any page we're holding and its lock.
1754 *
1755 * !!!
1756 * If there is no subsequent page in the duplicate chain, then
1757 * __db_drem will have put page "h" and set it to NULL.
1758 */
1759 if (local_page) {
1760 if (h != NULL)
1761 (void)memp_fput(dbp->mpf, h, 0);
1762 (void)__BT_TLPUT(dbc, lock);
1763 local_page = 0;
1764 }
1765
1766 if (cmd == NOTHING_FURTHER)
1767 goto done;
1768
1769 /* Acquire the parent page and switch the index to its entry. */
1770 if ((ret =
1771 __bam_lget(dbc, 0, cp->pgno, DB_LOCK_WRITE, &lock)) != 0)
1772 goto err;
1773 if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) {
1774 (void)__BT_TLPUT(dbc, lock);
1775 goto err;
1776 }
1777 local_page = 1;
1778 indx = cp->indx;
1779
1780 if (cmd == DELETE_PAGE)
1781 goto btd;
1782
1783 /*
1784 * Copy, delete, update, add-back the parent page's data entry.
1785 *
1786 * XXX
1787 * This may be a performance/logging problem. We should add a
1788 * log message which simply logs/updates a random set of bytes
1789 * on a page, and use it instead of doing a delete/add pair.
1790 */
1791 indx += O_INDX;
1792 bo = *GET_BOVERFLOW(h, indx);
1793 (void)__db_ditem(dbc, h, indx, BOVERFLOW_SIZE);
1794 bo.pgno = next_pgno;
1795 memset(&dbt, 0, sizeof(dbt));
1796 dbt.data = &bo;
1797 dbt.size = BOVERFLOW_SIZE;
1798 (void)__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &dbt, NULL);
1799 (void)memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY);
1800 goto done;
1801 }
1802
1803 btd: /*
1804 * If the page is going to be emptied, delete it. To delete a leaf
1805 * page we need a copy of a key from the page. We use the 0th page
1806 * index since it's the last key that the page held.
1807 *
1808 * We malloc the page information instead of using the return key/data
1809 * memory because we've already set them -- the reason we've already
1810 * set them is because we're (potentially) about to do a reverse split,
1811 * which would make our saved page information useless.
1812 *
1813 * !!!
1814 * The following operations to delete a page might deadlock. I think
1815 * that's OK. The problem is if we're deleting an item because we're
1816 * closing cursors because we've already deadlocked and want to call
1817 * txn_abort(). If we fail due to deadlock, we leave a locked empty
1818 * page in the tree, which won't be empty long because we're going to
1819 * undo the delete.
1820 */
1821 if (NUM_ENT(h) == 2 && h->pgno != PGNO_ROOT) {
1822 memset(&dbt, 0, sizeof(DBT));
1823 dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1824 if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1825 goto err;
1826 delete_page = 1;
1827 }
1828
1829 /*
1830 * Do a normal btree delete.
1831 *
1832 * !!!
1833 * Delete the key item first, otherwise the duplicate checks in
1834 * __bam_ditem() won't work!
1835 */
1836 if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1837 goto err;
1838 if ((ret = __bam_ditem(dbc, h, indx)) != 0)
1839 goto err;
1840
1841 /* Discard any remaining locks/pages. */
1842 if (local_page) {
1843 (void)memp_fput(dbp->mpf, h, 0);
1844 (void)__BT_TLPUT(dbc, lock);
1845 local_page = 0;
1846 }
1847
1848 /* Delete the page if it was emptied. */
1849 if (delete_page)
1850 ret = __bam_dpage(dbc, &dbt);
1851
1852 err:
1853 done: if (delete_page)
1854 __os_free(dbt.data, dbt.size);
1855
1856 if (local_page) {
1857 /*
1858 * It's possible for h to be NULL, as __db_drem may have
1859 * been relinking pages by the time that it deadlocked.
1860 */
1861 if (h != NULL)
1862 (void)memp_fput(dbp->mpf, h, 0);
1863 (void)__BT_TLPUT(dbc, lock);
1864 }
1865
1866 if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
1867 (void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
1868 DB_LOCK_IWRITE, 0);
1869
1870 return (ret);
1871 }
1872
1873 /*
1874 * __bam_c_getstack --
1875 * Acquire a full stack for a cursor.
1876 */
1877 static int
__bam_c_getstack(dbc,cp)1878 __bam_c_getstack(dbc, cp)
1879 DBC *dbc;
1880 CURSOR *cp;
1881 {
1882 DB *dbp;
1883 DBT dbt;
1884 PAGE *h;
1885 db_pgno_t pgno;
1886 int exact, ret;
1887
1888 dbp = dbc->dbp;
1889 h = NULL;
1890 memset(&dbt, 0, sizeof(DBT));
1891 ret = 0;
1892
1893 /* Get the page with the current item on it. */
1894 pgno = cp->pgno;
1895 if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
1896 return (ret);
1897
1898 /* Get a copy of a key from the page. */
1899 dbt.flags = DB_DBT_MALLOC | DB_DBT_INTERNAL;
1900 if ((ret = __db_ret(dbp, h, 0, &dbt, NULL, NULL)) != 0)
1901 goto err;
1902
1903 /* Get a write-locked stack for that page. */
1904 exact = 0;
1905 ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact);
1906
1907 /* We no longer need the key or the page. */
1908 err: if (h != NULL)
1909 (void)memp_fput(dbp->mpf, h, 0);
1910 if (dbt.data != NULL)
1911 __os_free(dbt.data, dbt.size);
1912 return (ret);
1913 }
1914