xref: /titanic_51/usr/src/cmd/sendmail/db/hash/hash_page.c (revision 7fd791373689a6af05e27efec3b1ab556e02aa23)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 /*
8  * Copyright (c) 1990, 1993, 1994
9  *	Margo Seltzer.  All rights reserved.
10  */
11 /*
12  * Copyright (c) 1990, 1993, 1994
13  *	The Regents of the University of California.  All rights reserved.
14  *
15  * This code is derived from software contributed to Berkeley by
16  * Margo Seltzer.
17  *
18  * Redistribution and use in source and binary forms, with or without
19  * modification, are permitted provided that the following conditions
20  * are met:
21  * 1. Redistributions of source code must retain the above copyright
22  *    notice, this list of conditions and the following disclaimer.
23  * 2. Redistributions in binary form must reproduce the above copyright
24  *    notice, this list of conditions and the following disclaimer in the
25  *    documentation and/or other materials provided with the distribution.
26  * 3. All advertising materials mentioning features or use of this software
27  *    must display the following acknowledgement:
28  *	This product includes software developed by the University of
29  *	California, Berkeley and its contributors.
30  * 4. Neither the name of the University nor the names of its contributors
31  *    may be used to endorse or promote products derived from this software
32  *    without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
35  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
37  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
38  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
40  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
41  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
42  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
43  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
44  * SUCH DAMAGE.
45  */
46 
47 #include "config.h"
48 
49 #ifndef lint
50 static const char sccsid[] = "@(#)hash_page.c	10.55 (Sleepycat) 1/3/99";
51 #endif /* not lint */
52 
53 /*
54  * PACKAGE:  hashing
55  *
56  * DESCRIPTION:
57  *      Page manipulation for hashing package.
58  *
59  * ROUTINES:
60  *
61  * External
62  *      __get_page
63  *      __add_ovflpage
64  *      __overflow_page
65  * Internal
66  *      open_temp
67  */
68 
69 #ifndef NO_SYSTEM_INCLUDES
70 #include <sys/types.h>
71 
72 #include <errno.h>
73 #include <string.h>
74 #endif
75 
76 #include "db_int.h"
77 #include "db_page.h"
78 #include "hash.h"
79 
80 static int __ham_lock_bucket __P((DBC *, db_lockmode_t));
81 
82 #ifdef DEBUG_SLOW
83 static void  __account_page(DB *, db_pgno_t, int);
84 #endif
85 
86 /*
87  * PUBLIC: int __ham_item __P((DBC *, db_lockmode_t));
88  */
89 int
90 __ham_item(dbc, mode)
91 	DBC *dbc;
92 	db_lockmode_t mode;
93 {
94 	DB *dbp;
95 	HASH_CURSOR *hcp;
96 	db_pgno_t next_pgno;
97 	int ret;
98 
99 	dbp = dbc->dbp;
100 	hcp = (HASH_CURSOR *)dbc->internal;
101 
102 	if (F_ISSET(hcp, H_DELETED))
103 		return (EINVAL);
104 	F_CLR(hcp, H_OK | H_NOMORE);
105 
106 	/* Check if we need to get a page for this cursor. */
107 	if ((ret = __ham_get_cpage(dbc, mode)) != 0)
108 		return (ret);
109 
110 	/* Check if we are looking for space in which to insert an item. */
111 	if (hcp->seek_size && hcp->seek_found_page == PGNO_INVALID
112 	    && hcp->seek_size < P_FREESPACE(hcp->pagep))
113 		hcp->seek_found_page = hcp->pgno;
114 
115 	/* Check if we need to go on to the next page. */
116 	if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno == PGNO_INVALID)
117 		/*
118 		 * ISDUP is set, and offset is at the beginning of the datum.
119 		 * We need to grab the length of the datum, then set the datum
120 		 * pointer to be the beginning of the datum.
121 		 */
122 		memcpy(&hcp->dup_len,
123 		    HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx)) +
124 		    hcp->dup_off, sizeof(db_indx_t));
125 	else if (F_ISSET(hcp, H_ISDUP)) {
126 		/* Make sure we're not about to run off the page. */
127 		if (hcp->dpagep == NULL && (ret = __ham_get_page(dbp,
128 		    hcp->dpgno, &hcp->dpagep)) != 0)
129 			return (ret);
130 
131 		if (hcp->dndx >= NUM_ENT(hcp->dpagep)) {
132 			if (NEXT_PGNO(hcp->dpagep) == PGNO_INVALID) {
133 				if (F_ISSET(hcp, H_DUPONLY)) {
134 					F_CLR(hcp, H_OK);
135 					F_SET(hcp, H_NOMORE);
136 					return (0);
137 				}
138 				if ((ret = __ham_put_page(dbp,
139 				    hcp->dpagep, 0)) != 0)
140 					return (ret);
141 				F_CLR(hcp, H_ISDUP);
142 				hcp->dpagep = NULL;
143 				hcp->dpgno = PGNO_INVALID;
144 				hcp->dndx = NDX_INVALID;
145 				hcp->bndx++;
146 			} else if ((ret = __ham_next_cpage(dbc,
147 			    NEXT_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0)
148 				return (ret);
149 		}
150 	}
151 
152 	if (hcp->bndx >= (db_indx_t)H_NUMPAIRS(hcp->pagep)) {
153 		/* Fetch next page. */
154 		if (NEXT_PGNO(hcp->pagep) == PGNO_INVALID) {
155 			F_SET(hcp, H_NOMORE);
156 			if (hcp->dpagep != NULL &&
157 			    (ret = __ham_put_page(dbp, hcp->dpagep, 0)) != 0)
158 				return (ret);
159 			hcp->dpgno = PGNO_INVALID;
160 			return (DB_NOTFOUND);
161 		}
162 		next_pgno = NEXT_PGNO(hcp->pagep);
163 		hcp->bndx = 0;
164 		if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
165 			return (ret);
166 	}
167 
168 	F_SET(hcp, H_OK);
169 	return (0);
170 }
171 
172 /*
173  * PUBLIC: int __ham_item_reset __P((DBC *));
174  */
175 int
176 __ham_item_reset(dbc)
177 	DBC *dbc;
178 {
179 	HASH_CURSOR *hcp;
180 	DB *dbp;
181 	int ret;
182 
183 	ret = 0;
184 	dbp = dbc->dbp;
185 	hcp = (HASH_CURSOR *)dbc->internal;
186 	if (hcp->pagep != NULL)
187 		ret = __ham_put_page(dbp, hcp->pagep, 0);
188 	if (ret == 0 && hcp->dpagep != NULL)
189 		ret = __ham_put_page(dbp, hcp->dpagep, 0);
190 
191 	__ham_item_init(hcp);
192 	return (ret);
193 }
194 
195 /*
196  * PUBLIC: void __ham_item_init __P((HASH_CURSOR *));
197  */
198 void
199 __ham_item_init(hcp)
200 	HASH_CURSOR *hcp;
201 {
202 	/*
203 	 * If this cursor still holds any locks, we must
204 	 * release them if we are not running with transactions.
205 	 */
206 	if (hcp->lock && hcp->dbc->txn == NULL)
207 	    (void)lock_put(hcp->dbc->dbp->dbenv->lk_info, hcp->lock);
208 
209 	/*
210 	 * The following fields must *not* be initialized here
211 	 * because they may have meaning across inits.
212 	 * 	hlock, hdr, split_buf, stats
213 	 */
214 	hcp->bucket = BUCKET_INVALID;
215 	hcp->lbucket = BUCKET_INVALID;
216 	hcp->lock = 0;
217 	hcp->pagep = NULL;
218 	hcp->pgno = PGNO_INVALID;
219 	hcp->bndx = NDX_INVALID;
220 	hcp->dpagep = NULL;
221 	hcp->dpgno = PGNO_INVALID;
222 	hcp->dndx = NDX_INVALID;
223 	hcp->dup_off = 0;
224 	hcp->dup_len = 0;
225 	hcp->dup_tlen = 0;
226 	hcp->seek_size = 0;
227 	hcp->seek_found_page = PGNO_INVALID;
228 	hcp->flags = 0;
229 }
230 
231 /*
232  * PUBLIC: int __ham_item_done __P((DBC *, int));
233  */
234 int
235 __ham_item_done(dbc, dirty)
236 	DBC *dbc;
237 	int dirty;
238 {
239 	DB *dbp;
240 	HASH_CURSOR *hcp;
241 	int ret, t_ret;
242 
243 	dbp = dbc->dbp;
244 	hcp = (HASH_CURSOR *)dbc->internal;
245 	t_ret = ret = 0;
246 
247 	if (hcp->pagep)
248 		ret = __ham_put_page(dbp, hcp->pagep,
249 		    dirty && hcp->dpagep == NULL);
250 	hcp->pagep = NULL;
251 
252 	if (hcp->dpagep)
253 		t_ret = __ham_put_page(dbp, hcp->dpagep, dirty);
254 	hcp->dpagep = NULL;
255 
256 	if (ret == 0 && t_ret != 0)
257 		ret = t_ret;
258 
259 	/*
260 	 * We don't throw out the page number since we might want to
261 	 * continue getting on this page.
262 	 */
263 	return (ret != 0 ? ret : t_ret);
264 }
265 
266 /*
267  * Returns the last item in a bucket.
268  *
269  * PUBLIC: int __ham_item_last __P((DBC *, db_lockmode_t));
270  */
271 int
272 __ham_item_last(dbc, mode)
273 	DBC *dbc;
274 	db_lockmode_t mode;
275 {
276 	HASH_CURSOR *hcp;
277 	int ret;
278 
279 	hcp = (HASH_CURSOR *)dbc->internal;
280 	if ((ret = __ham_item_reset(dbc)) != 0)
281 		return (ret);
282 
283 	hcp->bucket = hcp->hdr->max_bucket;
284 	F_SET(hcp, H_OK);
285 	return (__ham_item_prev(dbc, mode));
286 }
287 
288 /*
289  * PUBLIC: int __ham_item_first __P((DBC *, db_lockmode_t));
290  */
291 int
292 __ham_item_first(dbc, mode)
293 	DBC *dbc;
294 	db_lockmode_t mode;
295 {
296 	HASH_CURSOR *hcp;
297 	int ret;
298 
299 	hcp = (HASH_CURSOR *)dbc->internal;
300 	if ((ret = __ham_item_reset(dbc)) != 0)
301 		return (ret);
302 	F_SET(hcp, H_OK);
303 	hcp->bucket = 0;
304 	return (__ham_item_next(dbc, mode));
305 }
306 
307 /*
308  * __ham_item_prev --
309  *	Returns a pointer to key/data pair on a page.  In the case of
310  *	bigkeys, just returns the page number and index of the bigkey
311  *	pointer pair.
312  *
313  * PUBLIC: int __ham_item_prev __P((DBC *, db_lockmode_t));
314  */
315 int
316 __ham_item_prev(dbc, mode)
317 	DBC *dbc;
318 	db_lockmode_t mode;
319 {
320 	DB *dbp;
321 	HASH_CURSOR *hcp;
322 	db_pgno_t next_pgno;
323 	int ret;
324 
325 	dbp = dbc->dbp;
326 	hcp = (HASH_CURSOR *)dbc->internal;
327 	/*
328 	 * There are N cases for backing up in a hash file.
329 	 * Case 1: In the middle of a page, no duplicates, just dec the index.
330 	 * Case 2: In the middle of a duplicate set, back up one.
331 	 * Case 3: At the beginning of a duplicate set, get out of set and
332 	 *	back up to next key.
333 	 * Case 4: At the beginning of a page; go to previous page.
334 	 * Case 5: At the beginning of a bucket; go to prev bucket.
335 	 */
336 	F_CLR(hcp, H_OK | H_NOMORE | H_DELETED);
337 
338 	/*
339 	 * First handle the duplicates.  Either you'll get the key here
340 	 * or you'll exit the duplicate set and drop into the code below
341 	 * to handle backing up through keys.
342 	 */
343 	if (F_ISSET(hcp, H_ISDUP)) {
344 		if (hcp->dpgno == PGNO_INVALID) {
345 			/* Duplicates are on-page. */
346 			if (hcp->dup_off != 0)
347 				if ((ret = __ham_get_cpage(dbc, mode)) != 0)
348 					return (ret);
349 				else {
350 					HASH_CURSOR *h;
351 					h = hcp;
352 					memcpy(&h->dup_len, HKEYDATA_DATA(
353 					    H_PAIRDATA(h->pagep, h->bndx))
354 					    + h->dup_off - sizeof(db_indx_t),
355 					    sizeof(db_indx_t));
356 					hcp->dup_off -=
357 					    DUP_SIZE(hcp->dup_len);
358 					hcp->dndx--;
359 					return (__ham_item(dbc, mode));
360 				}
361 		} else if (hcp->dndx > 0) {	/* Duplicates are off-page. */
362 			hcp->dndx--;
363 			return (__ham_item(dbc, mode));
364 		} else if ((ret = __ham_get_cpage(dbc, mode)) != 0)
365 			return (ret);
366 		else if (PREV_PGNO(hcp->dpagep) == PGNO_INVALID) {
367 			if (F_ISSET(hcp, H_DUPONLY)) {
368 				F_CLR(hcp, H_OK);
369 				F_SET(hcp, H_NOMORE);
370 				return (0);
371 			} else {
372 				F_CLR(hcp, H_ISDUP); /* End of dups */
373 				hcp->dpgno = PGNO_INVALID;
374 				if (hcp->dpagep != NULL)
375 					(void)__ham_put_page(dbp,
376 					    hcp->dpagep, 0);
377 				hcp->dpagep = NULL;
378 			}
379 		} else if ((ret = __ham_next_cpage(dbc,
380 		    PREV_PGNO(hcp->dpagep), 0, H_ISDUP)) != 0)
381 			return (ret);
382 		else {
383 			hcp->dndx = NUM_ENT(hcp->pagep) - 1;
384 			return (__ham_item(dbc, mode));
385 		}
386 	}
387 
388 	/*
389 	 * If we get here, we are not in a duplicate set, and just need
390 	 * to back up the cursor.  There are still three cases:
391 	 * midpage, beginning of page, beginning of bucket.
392 	 */
393 
394 	if (F_ISSET(hcp, H_DUPONLY)) {
395 		F_CLR(hcp, H_OK);
396 		F_SET(hcp, H_NOMORE);
397 		return (0);
398 	}
399 
400 	if (hcp->bndx == 0) { 		/* Beginning of page. */
401 		if ((ret = __ham_get_cpage(dbc, mode)) != 0)
402 			return (ret);
403 		hcp->pgno = PREV_PGNO(hcp->pagep);
404 		if (hcp->pgno == PGNO_INVALID) {
405 			/* Beginning of bucket. */
406 			F_SET(hcp, H_NOMORE);
407 			return (DB_NOTFOUND);
408 		} else if ((ret =
409 		    __ham_next_cpage(dbc, hcp->pgno, 0, 0)) != 0)
410 			return (ret);
411 		else
412 			hcp->bndx = H_NUMPAIRS(hcp->pagep);
413 	}
414 
415 	/*
416 	 * Either we've got the cursor set up to be decremented, or we
417 	 * have to find the end of a bucket.
418 	 */
419 	if (hcp->bndx == NDX_INVALID) {
420 		if (hcp->pagep == NULL)
421 			next_pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
422 		else
423 			goto got_page;
424 
425 		do {
426 			if ((ret = __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
427 				return (ret);
428 got_page:		next_pgno = NEXT_PGNO(hcp->pagep);
429 			hcp->bndx = H_NUMPAIRS(hcp->pagep);
430 		} while (next_pgno != PGNO_INVALID);
431 
432 		if (hcp->bndx == 0) {
433 			/* Bucket was empty. */
434 			F_SET(hcp, H_NOMORE);
435 			return (DB_NOTFOUND);
436 		}
437 	}
438 
439 	hcp->bndx--;
440 
441 	return (__ham_item(dbc, mode));
442 }
443 
444 /*
445  * Sets the cursor to the next key/data pair on a page.
446  *
447  * PUBLIC: int __ham_item_next __P((DBC *, db_lockmode_t));
448  */
449 int
450 __ham_item_next(dbc, mode)
451 	DBC *dbc;
452 	db_lockmode_t mode;
453 {
454 	HASH_CURSOR *hcp;
455 
456 	hcp = (HASH_CURSOR *)dbc->internal;
457 	/*
458 	 * Deleted on-page duplicates are a weird case. If we delete the last
459 	 * one, then our cursor is at the very end of a duplicate set and
460 	 * we actually need to go on to the next key.
461 	 */
462 	if (F_ISSET(hcp, H_DELETED)) {
463 		if (hcp->bndx != NDX_INVALID &&
464 		    F_ISSET(hcp, H_ISDUP) &&
465 		    hcp->dpgno == PGNO_INVALID &&
466 		    hcp->dup_tlen == hcp->dup_off) {
467 			if (F_ISSET(hcp, H_DUPONLY)) {
468 				F_CLR(hcp, H_OK);
469 				F_SET(hcp, H_NOMORE);
470 				return (0);
471 			} else {
472 				F_CLR(hcp, H_ISDUP);
473 				hcp->dpgno = PGNO_INVALID;
474 				hcp->bndx++;
475 			}
476 		} else if (!F_ISSET(hcp, H_ISDUP) &&
477 		    F_ISSET(hcp, H_DUPONLY)) {
478 			F_CLR(hcp, H_OK);
479 			F_SET(hcp, H_NOMORE);
480 			return (0);
481 		}
482 		F_CLR(hcp, H_DELETED);
483 	} else if (hcp->bndx == NDX_INVALID) {
484 		hcp->bndx = 0;
485 		hcp->dpgno = PGNO_INVALID;
486 		F_CLR(hcp, H_ISDUP);
487 	} else if (F_ISSET(hcp, H_ISDUP) && hcp->dpgno != PGNO_INVALID)
488 		hcp->dndx++;
489 	else if (F_ISSET(hcp, H_ISDUP)) {
490 		if (hcp->dup_off + DUP_SIZE(hcp->dup_len) >=
491 		    hcp->dup_tlen && F_ISSET(hcp, H_DUPONLY)) {
492 			F_CLR(hcp, H_OK);
493 			F_SET(hcp, H_NOMORE);
494 			return (0);
495 		}
496 		hcp->dndx++;
497 		hcp->dup_off += DUP_SIZE(hcp->dup_len);
498 		if (hcp->dup_off >= hcp->dup_tlen) {
499 			F_CLR(hcp, H_ISDUP);
500 			hcp->dpgno = PGNO_INVALID;
501 			hcp->bndx++;
502 		}
503 	} else if (F_ISSET(hcp, H_DUPONLY)) {
504 		F_CLR(hcp, H_OK);
505 		F_SET(hcp, H_NOMORE);
506 		return (0);
507 	} else
508 		hcp->bndx++;
509 
510 	return (__ham_item(dbc, mode));
511 }
512 
513 /*
514  * PUBLIC: void __ham_putitem __P((PAGE *p, const DBT *, int));
515  *
516  * This is a little bit sleazy in that we're overloading the meaning
517  * of the H_OFFPAGE type here.  When we recover deletes, we have the
518  * entire entry instead of having only the DBT, so we'll pass type
519  * H_OFFPAGE to mean, "copy the whole entry" as opposed to constructing
520  * an H_KEYDATA around it.
521  */
522 void
523 __ham_putitem(p, dbt, type)
524 	PAGE *p;
525 	const DBT *dbt;
526 	int type;
527 {
528 	u_int16_t n, off;
529 
530 	n = NUM_ENT(p);
531 
532 	/* Put the item element on the page. */
533 	if (type == H_OFFPAGE) {
534 		off = HOFFSET(p) - dbt->size;
535 		HOFFSET(p) = p->inp[n] = off;
536 		memcpy(P_ENTRY(p, n), dbt->data, dbt->size);
537 	} else {
538 		off = HOFFSET(p) - HKEYDATA_SIZE(dbt->size);
539 		HOFFSET(p) = p->inp[n] = off;
540 		PUT_HKEYDATA(P_ENTRY(p, n), dbt->data, dbt->size, type);
541 	}
542 
543 	/* Adjust page info. */
544 	NUM_ENT(p) += 1;
545 }
546 
547 /*
548  * PUBLIC: void __ham_reputpair
549  * PUBLIC:    __P((PAGE *p, u_int32_t, u_int32_t, const DBT *, const DBT *));
550  *
551  * This is a special case to restore a key/data pair to its original
552  * location during recovery.  We are guaranteed that the pair fits
553  * on the page and is not the last pair on the page (because if it's
554  * the last pair, the normal insert works).
555  */
556 void
557 __ham_reputpair(p, psize, ndx, key, data)
558 	PAGE *p;
559 	u_int32_t psize, ndx;
560 	const DBT *key, *data;
561 {
562 	db_indx_t i, movebytes, newbytes;
563 	u_int8_t *from;
564 
565 	/* First shuffle the existing items up on the page.  */
566 	movebytes =
567 	    (ndx == 0 ? psize : p->inp[H_DATAINDEX(ndx - 1)]) - HOFFSET(p);
568 	newbytes = key->size + data->size;
569 	from = (u_int8_t *)p + HOFFSET(p);
570 	memmove(from - newbytes, from, movebytes);
571 
572 	/*
573 	 * Adjust the indices and move them up 2 spaces. Note that we
574 	 * have to check the exit condition inside the loop just in case
575 	 * we are dealing with index 0 (db_indx_t's are unsigned).
576 	 */
577 	for (i = NUM_ENT(p) - 1; ; i-- ) {
578 		p->inp[i + 2] = p->inp[i] - newbytes;
579 		if (i == H_KEYINDEX(ndx))
580 			break;
581 	}
582 
583 	/* Put the key and data on the page. */
584 	p->inp[H_KEYINDEX(ndx)] =
585 	    (ndx == 0 ? psize : p->inp[H_DATAINDEX(ndx - 1)]) - key->size;
586 	p->inp[H_DATAINDEX(ndx)] = p->inp[H_KEYINDEX(ndx)] - data->size;
587 	memcpy(P_ENTRY(p, H_KEYINDEX(ndx)), key->data, key->size);
588 	memcpy(P_ENTRY(p, H_DATAINDEX(ndx)), data->data, data->size);
589 
590 	/* Adjust page info. */
591 	HOFFSET(p) -= newbytes;
592 	NUM_ENT(p) += 2;
593 }
594 
595 
596 /*
597  * PUBLIC: int __ham_del_pair __P((DBC *, int));
598  */
599 int
600 __ham_del_pair(dbc, reclaim_page)
601 	DBC *dbc;
602 	int reclaim_page;
603 {
604 	DB *dbp;
605 	HASH_CURSOR *hcp;
606 	DBT data_dbt, key_dbt;
607 	DB_ENV *dbenv;
608 	DB_LSN new_lsn, *n_lsn, tmp_lsn;
609 	PAGE *p;
610 	db_indx_t ndx;
611 	db_pgno_t chg_pgno, pgno;
612 	int ret, tret;
613 
614 	dbp = dbc->dbp;
615 	hcp = (HASH_CURSOR *)dbc->internal;
616 
617 	dbenv = dbp->dbenv;
618 	ndx = hcp->bndx;
619 	if (hcp->pagep == NULL &&
620 	    (ret = __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0)
621 		return (ret);
622 
623 	p = hcp->pagep;
624 
625 	/*
626 	 * We optimize for the normal case which is when neither the key nor
627 	 * the data are large.  In this case, we write a single log record
628 	 * and do the delete.  If either is large, we'll call __big_delete
629 	 * to remove the big item and then update the page to remove the
630 	 * entry referring to the big item.
631 	 */
632 	ret = 0;
633 	if (HPAGE_PTYPE(H_PAIRKEY(p, ndx)) == H_OFFPAGE) {
634 		memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(p, H_KEYINDEX(ndx))),
635 		    sizeof(db_pgno_t));
636 		ret = __db_doff(dbc, pgno, __ham_del_page);
637 	}
638 
639 	if (ret == 0)
640 		switch (HPAGE_PTYPE(H_PAIRDATA(p, ndx))) {
641 		case H_OFFPAGE:
642 			memcpy(&pgno,
643 			    HOFFPAGE_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
644 			    sizeof(db_pgno_t));
645 			ret = __db_doff(dbc, pgno, __ham_del_page);
646 			break;
647 		case H_OFFDUP:
648 			memcpy(&pgno,
649 			    HOFFDUP_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))),
650 			    sizeof(db_pgno_t));
651 			ret = __db_ddup(dbc, pgno, __ham_del_page);
652 			F_CLR(hcp, H_ISDUP);
653 			break;
654 		case H_DUPLICATE:
655 			/*
656 			 * If we delete a pair that is/was a duplicate, then
657 			 * we had better clear the flag so that we update the
658 			 * cursor appropriately.
659 			 */
660 			F_CLR(hcp, H_ISDUP);
661 			break;
662 		}
663 
664 	if (ret)
665 		return (ret);
666 
667 	/* Now log the delete off this page. */
668 	if (DB_LOGGING(dbc)) {
669 		key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx));
670 		key_dbt.size =
671 		    LEN_HITEM(p, hcp->hdr->pagesize, H_KEYINDEX(ndx));
672 		data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx));
673 		data_dbt.size =
674 		    LEN_HITEM(p, hcp->hdr->pagesize, H_DATAINDEX(ndx));
675 
676 		if ((ret = __ham_insdel_log(dbenv->lg_info,
677 		    dbc->txn, &new_lsn, 0, DELPAIR,
678 		    dbp->log_fileid, PGNO(p), (u_int32_t)ndx,
679 		    &LSN(p), &key_dbt, &data_dbt)) != 0)
680 			return (ret);
681 
682 		/* Move lsn onto page. */
683 		LSN(p) = new_lsn;
684 	}
685 
686 	__ham_dpair(dbp, p, ndx);
687 
688 	/*
689 	 * If we are locking, we will not maintain this, because it is
690 	 * a hot spot.
691 	 * XXX perhaps we can retain incremental numbers and apply them
692 	 * later.
693 	 */
694 	if (!F_ISSET(dbp, DB_AM_LOCKING))
695 		--hcp->hdr->nelem;
696 
697 	/*
698 	 * If we need to reclaim the page, then check if the page is empty.
699 	 * There are two cases.  If it's empty and it's not the first page
700 	 * in the bucket (i.e., the bucket page) then we can simply remove
701 	 * it. If it is the first chain in the bucket, then we need to copy
702 	 * the second page into it and remove the second page.
703 	 */
704 	if (reclaim_page && NUM_ENT(p) == 0 && PREV_PGNO(p) == PGNO_INVALID &&
705 	    NEXT_PGNO(p) != PGNO_INVALID) {
706 		PAGE *n_pagep, *nn_pagep;
707 		db_pgno_t tmp_pgno;
708 
709 		/*
710 		 * First page in chain is empty and we know that there
711 		 * are more pages in the chain.
712 		 */
713 		if ((ret =
714 		    __ham_get_page(dbp, NEXT_PGNO(p), &n_pagep)) != 0)
715 			return (ret);
716 
717 		if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
718 			if ((ret =
719 			    __ham_get_page(dbp, NEXT_PGNO(n_pagep),
720 			    &nn_pagep)) != 0) {
721 				(void) __ham_put_page(dbp, n_pagep, 0);
722 				return (ret);
723 			}
724 		}
725 
726 		if (DB_LOGGING(dbc)) {
727 			key_dbt.data = n_pagep;
728 			key_dbt.size = hcp->hdr->pagesize;
729 			if ((ret = __ham_copypage_log(dbenv->lg_info,
730 			    dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(p),
731 			    &LSN(p), PGNO(n_pagep), &LSN(n_pagep),
732 			    NEXT_PGNO(n_pagep),
733 			    NEXT_PGNO(n_pagep) == PGNO_INVALID ? NULL :
734 			    &LSN(nn_pagep), &key_dbt)) != 0)
735 				return (ret);
736 
737 			/* Move lsn onto page. */
738 			LSN(p) = new_lsn;	/* Structure assignment. */
739 			LSN(n_pagep) = new_lsn;
740 			if (NEXT_PGNO(n_pagep) != PGNO_INVALID)
741 				LSN(nn_pagep) = new_lsn;
742 		}
743 		if (NEXT_PGNO(n_pagep) != PGNO_INVALID) {
744 			PREV_PGNO(nn_pagep) = PGNO(p);
745 			(void)__ham_put_page(dbp, nn_pagep, 1);
746 		}
747 
748 		tmp_pgno = PGNO(p);
749 		tmp_lsn = LSN(p);
750 		memcpy(p, n_pagep, hcp->hdr->pagesize);
751 		PGNO(p) = tmp_pgno;
752 		LSN(p) = tmp_lsn;
753 		PREV_PGNO(p) = PGNO_INVALID;
754 
755 		/*
756 		 * Cursor is advanced to the beginning of the next page.
757 		 */
758 		hcp->bndx = 0;
759 		hcp->pgno = PGNO(p);
760 		F_SET(hcp, H_DELETED);
761 		chg_pgno = PGNO(p);
762 		if ((ret = __ham_dirty_page(dbp, p)) != 0 ||
763 		    (ret = __ham_del_page(dbc, n_pagep)) != 0)
764 			return (ret);
765 	} else if (reclaim_page &&
766 	    NUM_ENT(p) == 0 && PREV_PGNO(p) != PGNO_INVALID) {
767 		PAGE *n_pagep, *p_pagep;
768 
769 		if ((ret =
770 		    __ham_get_page(dbp, PREV_PGNO(p), &p_pagep)) != 0)
771 			return (ret);
772 
773 		if (NEXT_PGNO(p) != PGNO_INVALID) {
774 			if ((ret = __ham_get_page(dbp,
775 			    NEXT_PGNO(p), &n_pagep)) != 0) {
776 				(void)__ham_put_page(dbp, p_pagep, 0);
777 				return (ret);
778 			}
779 			n_lsn = &LSN(n_pagep);
780 		} else {
781 			n_pagep = NULL;
782 			n_lsn = NULL;
783 		}
784 
785 		NEXT_PGNO(p_pagep) = NEXT_PGNO(p);
786 		if (n_pagep != NULL)
787 			PREV_PGNO(n_pagep) = PGNO(p_pagep);
788 
789 		if (DB_LOGGING(dbc)) {
790 			if ((ret = __ham_newpage_log(dbenv->lg_info,
791 			    dbc->txn, &new_lsn, 0, DELOVFL,
792 			    dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep),
793 			    PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0)
794 				return (ret);
795 
796 			/* Move lsn onto page. */
797 			LSN(p_pagep) = new_lsn;	/* Structure assignment. */
798 			if (n_pagep)
799 				LSN(n_pagep) = new_lsn;
800 			LSN(p) = new_lsn;
801 		}
802 		hcp->pgno = NEXT_PGNO(p);
803 		hcp->bndx = 0;
804 		/*
805 		 * Since we are about to delete the cursor page and we have
806 		 * just moved the cursor, we need to make sure that the
807 		 * old page pointer isn't left hanging around in the cursor.
808 		 */
809 		hcp->pagep = NULL;
810 		chg_pgno = PGNO(p);
811 		ret = __ham_del_page(dbc, p);
812 		if ((tret = __ham_put_page(dbp, p_pagep, 1)) != 0 &&
813 		    ret == 0)
814 			ret = tret;
815 		if (n_pagep != NULL &&
816 		    (tret = __ham_put_page(dbp, n_pagep, 1)) != 0 &&
817 		    ret == 0)
818 			ret = tret;
819 		if (ret != 0)
820 			return (ret);
821 	} else {
822 		/*
823 		 * Mark item deleted so that we don't try to return it, and
824 		 * so that we update the cursor correctly on the next call
825 		 * to next.
826 		 */
827 		F_SET(hcp, H_DELETED);
828 		chg_pgno = hcp->pgno;
829 		ret = __ham_dirty_page(dbp, p);
830 	}
831 	__ham_c_update(hcp, chg_pgno, 0, 0, 0);
832 
833 	/*
834 	 * Since we just deleted a pair from the master page, anything
835 	 * in hcp->dpgno should be cleared.
836 	 */
837 	hcp->dpgno = PGNO_INVALID;
838 
839 	F_CLR(hcp, H_OK);
840 	return (ret);
841 }
842 
843 /*
844  * __ham_replpair --
845  *	Given the key data indicated by the cursor, replace part/all of it
846  *	according to the fields in the dbt.
847  *
848  * PUBLIC: int __ham_replpair __P((DBC *, DBT *, u_int32_t));
849  */
850 int
851 __ham_replpair(dbc, dbt, make_dup)
852 	DBC *dbc;
853 	DBT *dbt;
854 	u_int32_t make_dup;
855 {
856 	DB *dbp;
857 	HASH_CURSOR *hcp;
858 	DBT old_dbt, tdata, tmp;
859 	DB_LSN	new_lsn;
860 	int32_t change;			/* XXX: Possible overflow. */
861 	u_int32_t len;
862 	int is_big, ret, type;
863 	u_int8_t *beg, *dest, *end, *hk, *src;
864 
865 	/*
866 	 * Big item replacements are handled in generic code.
867 	 * Items that fit on the current page fall into 4 classes.
868 	 * 1. On-page element, same size
869 	 * 2. On-page element, new is bigger (fits)
870 	 * 3. On-page element, new is bigger (does not fit)
871 	 * 4. On-page element, old is bigger
872 	 * Numbers 1, 2, and 4 are essentially the same (and should
873 	 * be the common case).  We handle case 3 as a delete and
874 	 * add.
875 	 */
876 	dbp = dbc->dbp;
877 	hcp = (HASH_CURSOR *)dbc->internal;
878 
879 	/*
880 	 * We need to compute the number of bytes that we are adding or
881 	 * removing from the entry.  Normally, we can simply substract
882 	 * the number of bytes we are replacing (dbt->dlen) from the
883 	 * number of bytes we are inserting (dbt->size).  However, if
884 	 * we are doing a partial put off the end of a record, then this
885 	 * formula doesn't work, because we are essentially adding
886 	 * new bytes.
887 	 */
888 	change = dbt->size - dbt->dlen;
889 
890 	hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
891 	is_big = HPAGE_PTYPE(hk) == H_OFFPAGE;
892 
893 	if (is_big)
894 		memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
895 	else
896 		len = LEN_HKEYDATA(hcp->pagep,
897 		    dbp->pgsize, H_DATAINDEX(hcp->bndx));
898 
899 	if (dbt->doff + dbt->dlen > len)
900 		change += dbt->doff + dbt->dlen - len;
901 
902 
903 	if (change > (int32_t)P_FREESPACE(hcp->pagep) || is_big) {
904 		/*
905 		 * Case 3 -- two subcases.
906 		 * A. This is not really a partial operation, but an overwrite.
907 		 *    Simple del and add works.
908 		 * B. This is a partial and we need to construct the data that
909 		 *    we are really inserting (yuck).
910 		 * In both cases, we need to grab the key off the page (in
911 		 * some cases we could do this outside of this routine; for
912 		 * cleanliness we do it here.  If you happen to be on a big
913 		 * key, this could be a performance hit).
914 		 */
915 		tmp.flags = 0;
916 		F_SET(&tmp, DB_DBT_MALLOC | DB_DBT_INTERNAL);
917 		if ((ret =
918 		    __db_ret(dbp, hcp->pagep, H_KEYINDEX(hcp->bndx),
919 		    &tmp, &dbc->rkey.data, &dbc->rkey.size)) != 0)
920 			return (ret);
921 
922 		if (dbt->doff == 0 && dbt->dlen == len) {
923 			ret = __ham_del_pair(dbc, 0);
924 			if (ret == 0)
925 			    ret = __ham_add_el(dbc, &tmp, dbt, H_KEYDATA);
926 		} else {					/* Case B */
927 			type = HPAGE_PTYPE(hk) != H_OFFPAGE ?
928 			    HPAGE_PTYPE(hk) : H_KEYDATA;
929 			tdata.flags = 0;
930 			F_SET(&tdata, DB_DBT_MALLOC | DB_DBT_INTERNAL);
931 
932 			if ((ret = __db_ret(dbp, hcp->pagep,
933 			    H_DATAINDEX(hcp->bndx), &tdata, &dbc->rdata.data,
934 			    &dbc->rdata.size)) != 0)
935 				goto err;
936 
937 			/* Now we can delete the item. */
938 			if ((ret = __ham_del_pair(dbc, 0)) != 0) {
939 				__os_free(tdata.data, tdata.size);
940 				goto err;
941 			}
942 
943 			/* Now shift old data around to make room for new. */
944 			if (change > 0) {
945 				 if ((ret = __os_realloc(&tdata.data,
946 				     tdata.size + change)) != 0)
947 					return (ret);
948 				memset((u_int8_t *)tdata.data + tdata.size,
949 				    0, change);
950 			}
951 			end = (u_int8_t *)tdata.data + tdata.size;
952 
953 			src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen;
954 			if (src < end && tdata.size > dbt->doff + dbt->dlen) {
955 				len = tdata.size - dbt->doff - dbt->dlen;
956 				dest = src + change;
957 				memmove(dest, src, len);
958 			}
959 			memcpy((u_int8_t *)tdata.data + dbt->doff,
960 			    dbt->data, dbt->size);
961 			tdata.size += change;
962 
963 			/* Now add the pair. */
964 			ret = __ham_add_el(dbc, &tmp, &tdata, type);
965 			__os_free(tdata.data, tdata.size);
966 		}
967 err:		__os_free(tmp.data, tmp.size);
968 		return (ret);
969 	}
970 
971 	/*
972 	 * Set up pointer into existing data. Do it before the log
973 	 * message so we can use it inside of the log setup.
974 	 */
975 	beg = HKEYDATA_DATA(H_PAIRDATA(hcp->pagep, hcp->bndx));
976 	beg += dbt->doff;
977 
978 	/*
979 	 * If we are going to have to move bytes at all, figure out
980 	 * all the parameters here.  Then log the call before moving
981 	 * anything around.
982 	 */
983 	if (DB_LOGGING(dbc)) {
984 		old_dbt.data = beg;
985 		old_dbt.size = dbt->dlen;
986 		if ((ret = __ham_replace_log(dbp->dbenv->lg_info,
987 		    dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(hcp->pagep),
988 		    (u_int32_t)H_DATAINDEX(hcp->bndx), &LSN(hcp->pagep),
989 		    (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0)
990 			return (ret);
991 
992 		LSN(hcp->pagep) = new_lsn;	/* Structure assignment. */
993 	}
994 
995 	__ham_onpage_replace(hcp->pagep, dbp->pgsize,
996 	    (u_int32_t)H_DATAINDEX(hcp->bndx), (int32_t)dbt->doff, change, dbt);
997 
998 	return (0);
999 }
1000 
1001 /*
1002  * Replace data on a page with new data, possibly growing or shrinking what's
1003  * there.  This is called on two different occasions. On one (from replpair)
1004  * we are interested in changing only the data.  On the other (from recovery)
1005  * we are replacing the entire data (header and all) with a new element.  In
1006  * the latter case, the off argument is negative.
1007  * pagep: the page that we're changing
1008  * ndx: page index of the element that is growing/shrinking.
1009  * off: Offset at which we are beginning the replacement.
1010  * change: the number of bytes (+ or -) that the element is growing/shrinking.
1011  * dbt: the new data that gets written at beg.
1012  * PUBLIC: void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t,
1013  * PUBLIC:     int32_t,  DBT *));
1014  */
1015 void
1016 __ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt)
1017 	PAGE *pagep;
1018 	size_t pgsize;
1019 	u_int32_t ndx;
1020 	int32_t off;
1021 	int32_t change;
1022 	DBT *dbt;
1023 {
1024 	db_indx_t i;
1025 	int32_t len;
1026 	u_int8_t *src, *dest;
1027 	int zero_me;
1028 
1029 	if (change != 0) {
1030 		zero_me = 0;
1031 		src = (u_int8_t *)(pagep) + HOFFSET(pagep);
1032 		if (off < 0)
1033 			len = pagep->inp[ndx] - HOFFSET(pagep);
1034 		else if ((u_int32_t)off >= LEN_HKEYDATA(pagep, pgsize, ndx)) {
1035 			len = HKEYDATA_DATA(P_ENTRY(pagep, ndx)) +
1036 			    LEN_HKEYDATA(pagep, pgsize, ndx) - src;
1037 			zero_me = 1;
1038 		} else
1039 			len = (HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off) - src;
1040 		dest = src - change;
1041 		memmove(dest, src, len);
1042 		if (zero_me)
1043 			memset(dest + len, 0, change);
1044 
1045 		/* Now update the indices. */
1046 		for (i = ndx; i < NUM_ENT(pagep); i++)
1047 			pagep->inp[i] -= change;
1048 		HOFFSET(pagep) -= change;
1049 	}
1050 	if (off >= 0)
1051 		memcpy(HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off,
1052 		    dbt->data, dbt->size);
1053 	else
1054 		memcpy(P_ENTRY(pagep, ndx), dbt->data, dbt->size);
1055 }
1056 
1057 /*
1058  * PUBLIC: int __ham_split_page __P((DBC *, u_int32_t, u_int32_t));
1059  */
1060 int
1061 __ham_split_page(dbc, obucket, nbucket)
1062 	DBC *dbc;
1063 	u_int32_t obucket, nbucket;
1064 {
1065 	DB *dbp;
1066 	HASH_CURSOR *hcp;
1067 	DBT key, page_dbt;
1068 	DB_ENV *dbenv;
1069 	DB_LSN new_lsn;
1070 	PAGE **pp, *old_pagep, *temp_pagep, *new_pagep;
1071 	db_indx_t n;
1072 	db_pgno_t bucket_pgno, next_pgno;
1073 	u_int32_t big_len, len;
1074 	int ret, tret;
1075 	void *big_buf;
1076 
1077 	dbp = dbc->dbp;
1078 	hcp = (HASH_CURSOR *)dbc->internal;
1079 	dbenv = dbp->dbenv;
1080 	temp_pagep = old_pagep = new_pagep = NULL;
1081 
1082 	bucket_pgno = BUCKET_TO_PAGE(hcp, obucket);
1083 	if ((ret = __ham_get_page(dbp, bucket_pgno, &old_pagep)) != 0)
1084 		return (ret);
1085 	if ((ret = __ham_new_page(dbp, BUCKET_TO_PAGE(hcp, nbucket), P_HASH,
1086 	    &new_pagep)) != 0)
1087 		goto err;
1088 
1089 	temp_pagep = hcp->split_buf;
1090 	memcpy(temp_pagep, old_pagep, hcp->hdr->pagesize);
1091 
1092 	if (DB_LOGGING(dbc)) {
1093 		page_dbt.size = hcp->hdr->pagesize;
1094 		page_dbt.data = old_pagep;
1095 		if ((ret = __ham_splitdata_log(dbenv->lg_info,
1096 		    dbc->txn, &new_lsn, 0, dbp->log_fileid, SPLITOLD,
1097 		    PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0)
1098 			goto err;
1099 	}
1100 
1101 	P_INIT(old_pagep, hcp->hdr->pagesize, PGNO(old_pagep), PGNO_INVALID,
1102 	    PGNO_INVALID, 0, P_HASH);
1103 
1104 	if (DB_LOGGING(dbc))
1105 		LSN(old_pagep) = new_lsn;	/* Structure assignment. */
1106 
1107 	big_len = 0;
1108 	big_buf = NULL;
1109 	key.flags = 0;
1110 	while (temp_pagep != NULL) {
1111 		for (n = 0; n < (db_indx_t)H_NUMPAIRS(temp_pagep); n++) {
1112 			if ((ret =
1113 			    __db_ret(dbp, temp_pagep, H_KEYINDEX(n),
1114 			    &key, &big_buf, &big_len)) != 0)
1115 				goto err;
1116 
1117 			if (__ham_call_hash(hcp, key.data, key.size)
1118 			    == obucket)
1119 				pp = &old_pagep;
1120 			else
1121 				pp = &new_pagep;
1122 
1123 			/*
1124 			 * Figure out how many bytes we need on the new
1125 			 * page to store the key/data pair.
1126 			 */
1127 
1128 			len = LEN_HITEM(temp_pagep, hcp->hdr->pagesize,
1129 			    H_DATAINDEX(n)) +
1130 			    LEN_HITEM(temp_pagep, hcp->hdr->pagesize,
1131 			    H_KEYINDEX(n)) +
1132 			    2 * sizeof(db_indx_t);
1133 
1134 			if (P_FREESPACE(*pp) < len) {
1135 				if (DB_LOGGING(dbc)) {
1136 					page_dbt.size = hcp->hdr->pagesize;
1137 					page_dbt.data = *pp;
1138 					if ((ret = __ham_splitdata_log(
1139 					    dbenv->lg_info, dbc->txn,
1140 					    &new_lsn, 0, dbp->log_fileid,
1141 					    SPLITNEW, PGNO(*pp), &page_dbt,
1142 					    &LSN(*pp))) != 0)
1143 						goto err;
1144 					LSN(*pp) = new_lsn;
1145 				}
1146 				if ((ret =
1147 				    __ham_add_ovflpage(dbc, *pp, 1, pp)) != 0)
1148 					goto err;
1149 			}
1150 			__ham_copy_item(dbp->pgsize,
1151 			    temp_pagep, H_KEYINDEX(n), *pp);
1152 			__ham_copy_item(dbp->pgsize,
1153 			    temp_pagep, H_DATAINDEX(n), *pp);
1154 		}
1155 		next_pgno = NEXT_PGNO(temp_pagep);
1156 
1157 		/* Clear temp_page; if it's a link overflow page, free it. */
1158 		if (PGNO(temp_pagep) != bucket_pgno && (ret =
1159 		    __ham_del_page(dbc, temp_pagep)) != 0)
1160 			goto err;
1161 
1162 		if (next_pgno == PGNO_INVALID)
1163 			temp_pagep = NULL;
1164 		else if ((ret =
1165 		    __ham_get_page(dbp, next_pgno, &temp_pagep)) != 0)
1166 			goto err;
1167 
1168 		if (temp_pagep != NULL && DB_LOGGING(dbc)) {
1169 			page_dbt.size = hcp->hdr->pagesize;
1170 			page_dbt.data = temp_pagep;
1171 			if ((ret = __ham_splitdata_log(dbenv->lg_info,
1172 			    dbc->txn, &new_lsn, 0, dbp->log_fileid,
1173 			    SPLITOLD, PGNO(temp_pagep),
1174 			    &page_dbt, &LSN(temp_pagep))) != 0)
1175 				goto err;
1176 			LSN(temp_pagep) = new_lsn;
1177 		}
1178 	}
1179 	if (big_buf != NULL)
1180 		__os_free(big_buf, big_len);
1181 
1182 	/*
1183 	 * If the original bucket spanned multiple pages, then we've got
1184 	 * a pointer to a page that used to be on the bucket chain.  It
1185 	 * should be deleted.
1186 	 */
1187 	if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno &&
1188 	    (ret = __ham_del_page(dbc, temp_pagep)) != 0)
1189 		goto err;
1190 
1191 	/*
1192 	 * Write new buckets out.
1193 	 */
1194 	if (DB_LOGGING(dbc)) {
1195 		page_dbt.size = hcp->hdr->pagesize;
1196 		page_dbt.data = old_pagep;
1197 		if ((ret = __ham_splitdata_log(dbenv->lg_info,
1198 		   dbc->txn, &new_lsn, 0, dbp->log_fileid,
1199 		   SPLITNEW, PGNO(old_pagep),
1200 		    &page_dbt, &LSN(old_pagep))) != 0)
1201 			goto err;
1202 		LSN(old_pagep) = new_lsn;
1203 
1204 		page_dbt.data = new_pagep;
1205 		if ((ret = __ham_splitdata_log(dbenv->lg_info,
1206 		    dbc->txn, &new_lsn, 0, dbp->log_fileid,
1207 		    SPLITNEW, PGNO(new_pagep), &page_dbt, &LSN(new_pagep))) != 0)
1208 			goto err;
1209 		LSN(new_pagep) = new_lsn;
1210 	}
1211 	ret = __ham_put_page(dbp, old_pagep, 1);
1212 	if ((tret = __ham_put_page(dbp, new_pagep, 1)) != 0 &&
1213 	    ret == 0)
1214 		ret = tret;
1215 
1216 	if (0) {
1217 err:		if (old_pagep != NULL)
1218 			(void)__ham_put_page(dbp, old_pagep, 1);
1219 		if (new_pagep != NULL)
1220 			(void)__ham_put_page(dbp, new_pagep, 1);
1221 		if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno)
1222 			(void)__ham_put_page(dbp, temp_pagep, 1);
1223 	}
1224 	return (ret);
1225 }
1226 
1227 /*
1228  * Add the given pair to the page.  The page in question may already be
1229  * held (i.e. it was already gotten).  If it is, then the page is passed
1230  * in via the pagep parameter.  On return, pagep will contain the page
1231  * to which we just added something.  This allows us to link overflow
1232  * pages and return the new page having correctly put the last page.
1233  *
1234  * PUBLIC: int __ham_add_el __P((DBC *, const DBT *, const DBT *, int));
1235  */
1236 int
1237 __ham_add_el(dbc, key, val, type)
1238 	DBC *dbc;
1239 	const DBT *key, *val;
1240 	int type;
1241 {
1242 	DB *dbp;
1243 	HASH_CURSOR *hcp;
1244 	const DBT *pkey, *pdata;
1245 	DBT key_dbt, data_dbt;
1246 	DB_LSN new_lsn;
1247 	HOFFPAGE doff, koff;
1248 	db_pgno_t next_pgno;
1249 	u_int32_t data_size, key_size, pairsize, rectype;
1250 	int do_expand, is_keybig, is_databig, ret;
1251 	int key_type, data_type;
1252 
1253 	dbp = dbc->dbp;
1254 	hcp = (HASH_CURSOR *)dbc->internal;
1255 	do_expand = 0;
1256 
1257 	if (hcp->pagep == NULL && (ret = __ham_get_page(dbp,
1258 	    hcp->seek_found_page != PGNO_INVALID ?  hcp->seek_found_page :
1259 	    hcp->pgno, &hcp->pagep)) != 0)
1260 		return (ret);
1261 
1262 	key_size = HKEYDATA_PSIZE(key->size);
1263 	data_size = HKEYDATA_PSIZE(val->size);
1264 	is_keybig = ISBIG(hcp, key->size);
1265 	is_databig = ISBIG(hcp, val->size);
1266 	if (is_keybig)
1267 		key_size = HOFFPAGE_PSIZE;
1268 	if (is_databig)
1269 		data_size = HOFFPAGE_PSIZE;
1270 
1271 	pairsize = key_size + data_size;
1272 
1273 	/* Advance to first page in chain with room for item. */
1274 	while (H_NUMPAIRS(hcp->pagep) && NEXT_PGNO(hcp->pagep) !=
1275 	    PGNO_INVALID) {
1276 		/*
1277 		 * This may not be the end of the chain, but the pair may fit
1278 		 * anyway.  Check if it's a bigpair that fits or a regular
1279 		 * pair that fits.
1280 		 */
1281 		if (P_FREESPACE(hcp->pagep) >= pairsize)
1282 			break;
1283 		next_pgno = NEXT_PGNO(hcp->pagep);
1284 		if ((ret =
1285 		    __ham_next_cpage(dbc, next_pgno, 0, 0)) != 0)
1286 			return (ret);
1287 	}
1288 
1289 	/*
1290 	 * Check if we need to allocate a new page.
1291 	 */
1292 	if (P_FREESPACE(hcp->pagep) < pairsize) {
1293 		do_expand = 1;
1294 		if ((ret = __ham_add_ovflpage(dbc,
1295 		    hcp->pagep, 1, &hcp->pagep)) !=  0)
1296 			return (ret);
1297 		hcp->pgno = PGNO(hcp->pagep);
1298 	}
1299 
1300 	/*
1301 	 * Update cursor.
1302 	 */
1303 	hcp->bndx = H_NUMPAIRS(hcp->pagep);
1304 	F_CLR(hcp, H_DELETED);
1305 	if (is_keybig) {
1306 		koff.type = H_OFFPAGE;
1307 		UMRW(koff.unused[0]);
1308 		UMRW(koff.unused[1]);
1309 		UMRW(koff.unused[2]);
1310 		if ((ret = __db_poff(dbc,
1311 		    key, &koff.pgno, __ham_overflow_page)) != 0)
1312 			return (ret);
1313 		koff.tlen = key->size;
1314 		key_dbt.data = &koff;
1315 		key_dbt.size = sizeof(koff);
1316 		pkey = &key_dbt;
1317 		key_type = H_OFFPAGE;
1318 	} else {
1319 		pkey = key;
1320 		key_type = H_KEYDATA;
1321 	}
1322 
1323 	if (is_databig) {
1324 		doff.type = H_OFFPAGE;
1325 		UMRW(doff.unused[0]);
1326 		UMRW(doff.unused[1]);
1327 		UMRW(doff.unused[2]);
1328 		if ((ret = __db_poff(dbc,
1329 		    val, &doff.pgno, __ham_overflow_page)) != 0)
1330 			return (ret);
1331 		doff.tlen = val->size;
1332 		data_dbt.data = &doff;
1333 		data_dbt.size = sizeof(doff);
1334 		pdata = &data_dbt;
1335 		data_type = H_OFFPAGE;
1336 	} else {
1337 		pdata = val;
1338 		data_type = type;
1339 	}
1340 
1341 	if (DB_LOGGING(dbc)) {
1342 		rectype = PUTPAIR;
1343 		if (is_databig)
1344 			rectype |= PAIR_DATAMASK;
1345 		if (is_keybig)
1346 			rectype |= PAIR_KEYMASK;
1347 
1348 		if ((ret = __ham_insdel_log(dbp->dbenv->lg_info,
1349 		    dbc->txn, &new_lsn, 0, rectype,
1350 		    dbp->log_fileid, PGNO(hcp->pagep),
1351 		    (u_int32_t)H_NUMPAIRS(hcp->pagep),
1352 		    &LSN(hcp->pagep), pkey, pdata)) != 0)
1353 			return (ret);
1354 
1355 		/* Move lsn onto page. */
1356 		LSN(hcp->pagep) = new_lsn;	/* Structure assignment. */
1357 	}
1358 
1359 	__ham_putitem(hcp->pagep, pkey, key_type);
1360 	__ham_putitem(hcp->pagep, pdata, data_type);
1361 
1362 	/*
1363 	 * For splits, we are going to update item_info's page number
1364 	 * field, so that we can easily return to the same page the
1365 	 * next time we come in here.  For other operations, this shouldn't
1366 	 * matter, since odds are this is the last thing that happens before
1367 	 * we return to the user program.
1368 	 */
1369 	hcp->pgno = PGNO(hcp->pagep);
1370 
1371 	/*
1372 	 * XXX Maybe keep incremental numbers here
1373 	 */
1374 	if (!F_ISSET(dbp, DB_AM_LOCKING))
1375 		hcp->hdr->nelem++;
1376 
1377 	if (do_expand || (hcp->hdr->ffactor != 0 &&
1378 	    (u_int32_t)H_NUMPAIRS(hcp->pagep) > hcp->hdr->ffactor))
1379 		F_SET(hcp, H_EXPAND);
1380 	return (0);
1381 }
1382 
1383 
1384 /*
1385  * Special __putitem call used in splitting -- copies one entry to
1386  * another.  Works for all types of hash entries (H_OFFPAGE, H_KEYDATA,
1387  * H_DUPLICATE, H_OFFDUP).  Since we log splits at a high level, we
1388  * do not need to do any logging here.
1389  *
1390  * PUBLIC: void __ham_copy_item __P((size_t, PAGE *, u_int32_t, PAGE *));
1391  */
1392 void
1393 __ham_copy_item(pgsize, src_page, src_ndx, dest_page)
1394 	size_t pgsize;
1395 	PAGE *src_page;
1396 	u_int32_t src_ndx;
1397 	PAGE *dest_page;
1398 {
1399 	u_int32_t len;
1400 	void *src, *dest;
1401 
1402 	/*
1403 	 * Copy the key and data entries onto this new page.
1404 	 */
1405 	src = P_ENTRY(src_page, src_ndx);
1406 
1407 	/* Set up space on dest. */
1408 	len = LEN_HITEM(src_page, pgsize, src_ndx);
1409 	HOFFSET(dest_page) -= len;
1410 	dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page);
1411 	dest = P_ENTRY(dest_page, NUM_ENT(dest_page));
1412 	NUM_ENT(dest_page)++;
1413 
1414 	memcpy(dest, src, len);
1415 }
1416 
1417 /*
1418  *
1419  * Returns:
1420  *      pointer on success
1421  *      NULL on error
1422  *
1423  * PUBLIC: int __ham_add_ovflpage __P((DBC *, PAGE *, int, PAGE **));
1424  */
1425 int
1426 __ham_add_ovflpage(dbc, pagep, release, pp)
1427 	DBC *dbc;
1428 	PAGE *pagep;
1429 	int release;
1430 	PAGE **pp;
1431 {
1432 	DB *dbp;
1433 	HASH_CURSOR *hcp;
1434 	DB_LSN new_lsn;
1435 	PAGE *new_pagep;
1436 	int ret;
1437 
1438 	dbp = dbc->dbp;
1439 	hcp = (HASH_CURSOR *)dbc->internal;
1440 
1441 	if ((ret = __ham_overflow_page(dbc, P_HASH, &new_pagep)) != 0)
1442 		return (ret);
1443 
1444 	if (DB_LOGGING(dbc)) {
1445 		if ((ret = __ham_newpage_log(dbp->dbenv->lg_info,
1446 		    dbc->txn, &new_lsn, 0, PUTOVFL,
1447 		    dbp->log_fileid, PGNO(pagep), &LSN(pagep),
1448 		    PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0)
1449 			return (ret);
1450 
1451 		/* Move lsn onto page. */
1452 		LSN(pagep) = LSN(new_pagep) = new_lsn;
1453 	}
1454 	NEXT_PGNO(pagep) = PGNO(new_pagep);
1455 	PREV_PGNO(new_pagep) = PGNO(pagep);
1456 
1457 	if (release)
1458 		ret = __ham_put_page(dbp, pagep, 1);
1459 
1460 	hcp->stats.hash_overflows++;
1461 	*pp = new_pagep;
1462 	return (ret);
1463 }
1464 
1465 
1466 /*
1467  * PUBLIC: int __ham_new_page __P((DB *, u_int32_t, u_int32_t, PAGE **));
1468  */
1469 int
1470 __ham_new_page(dbp, addr, type, pp)
1471 	DB *dbp;
1472 	u_int32_t addr, type;
1473 	PAGE **pp;
1474 {
1475 	PAGE *pagep;
1476 	int ret;
1477 
1478 	if ((ret = memp_fget(dbp->mpf,
1479 	    &addr, DB_MPOOL_CREATE, &pagep)) != 0)
1480 		return (ret);
1481 
1482 	/* This should not be necessary because page-in should do it. */
1483 	P_INIT(pagep, dbp->pgsize, addr, PGNO_INVALID, PGNO_INVALID, 0, type);
1484 
1485 	*pp = pagep;
1486 	return (0);
1487 }
1488 
1489 /*
1490  * PUBLIC: int __ham_del_page __P((DBC *, PAGE *));
1491  */
1492 int
1493 __ham_del_page(dbc, pagep)
1494 	DBC *dbc;
1495 	PAGE *pagep;
1496 {
1497 	DB *dbp;
1498 	HASH_CURSOR *hcp;
1499 	DB_LSN new_lsn;
1500 	int ret;
1501 
1502 	dbp = dbc->dbp;
1503 	hcp = (HASH_CURSOR *)dbc->internal;
1504 	ret = 0;
1505 	DIRTY_META(dbp, hcp, ret);
1506 	if (ret != 0) {
1507 		if (ret != EAGAIN)
1508 			__db_err(dbp->dbenv,
1509 			    "free_ovflpage: unable to lock meta data page %s\n",
1510 			    strerror(ret));
1511 		/*
1512 		 * If we are going to return an error, then we should free
1513 		 * the page, so it doesn't stay pinned forever.
1514 		 */
1515 		(void)__ham_put_page(dbp, pagep, 0);
1516 		return (ret);
1517 	}
1518 
1519 	if (DB_LOGGING(dbc)) {
1520 		if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info,
1521 		    dbc->txn, &new_lsn, 0, DELPGNO,
1522 		    dbp->log_fileid, PGNO(pagep), hcp->hdr->last_freed,
1523 		    (u_int32_t)TYPE(pagep), NEXT_PGNO(pagep), P_INVALID,
1524 		    &LSN(pagep), &hcp->hdr->lsn)) != 0)
1525 			return (ret);
1526 
1527 		hcp->hdr->lsn = new_lsn;
1528 		LSN(pagep) = new_lsn;
1529 	}
1530 
1531 #ifdef DIAGNOSTIC
1532 	{
1533 		db_pgno_t __pgno;
1534 		DB_LSN __lsn;
1535 		__pgno = pagep->pgno;
1536 		__lsn = pagep->lsn;
1537 		memset(pagep, 0xdb, dbp->pgsize);
1538 		pagep->pgno = __pgno;
1539 		pagep->lsn = __lsn;
1540 	}
1541 #endif
1542 	TYPE(pagep) = P_INVALID;
1543 	NEXT_PGNO(pagep) = hcp->hdr->last_freed;
1544 	hcp->hdr->last_freed = PGNO(pagep);
1545 
1546 	return (__ham_put_page(dbp, pagep, 1));
1547 }
1548 
1549 
1550 /*
1551  * PUBLIC: int __ham_put_page __P((DB *, PAGE *, int32_t));
1552  */
1553 int
1554 __ham_put_page(dbp, pagep, is_dirty)
1555 	DB *dbp;
1556 	PAGE *pagep;
1557 	int32_t is_dirty;
1558 {
1559 #ifdef DEBUG_SLOW
1560 	__account_page(dbp, ((BKT *)((char *)pagep - sizeof(BKT)))->pgno, -1);
1561 #endif
1562 	return (memp_fput(dbp->mpf, pagep, (is_dirty ? DB_MPOOL_DIRTY : 0)));
1563 }
1564 
1565 /*
1566  * __ham_dirty_page --
1567  *	Mark a page dirty.
1568  *
1569  * PUBLIC: int __ham_dirty_page __P((DB *, PAGE *));
1570  */
1571 int
1572 __ham_dirty_page(dbp, pagep)
1573 	DB *dbp;
1574 	PAGE *pagep;
1575 {
1576 	return (memp_fset(dbp->mpf, pagep, DB_MPOOL_DIRTY));
1577 }
1578 
1579 /*
1580  * PUBLIC: int __ham_get_page __P((DB *, db_pgno_t, PAGE **));
1581  */
1582 int
1583 __ham_get_page(dbp, addr, pagep)
1584 	DB *dbp;
1585 	db_pgno_t addr;
1586 	PAGE **pagep;
1587 {
1588 	int ret;
1589 
1590 	ret = memp_fget(dbp->mpf, &addr, DB_MPOOL_CREATE, pagep);
1591 #ifdef DEBUG_SLOW
1592 	if (*pagep != NULL)
1593 		__account_page(dbp, addr, 1);
1594 #endif
1595 	return (ret);
1596 }
1597 
1598 /*
1599  * PUBLIC: int __ham_overflow_page
1600  * PUBLIC:     __P((DBC *, u_int32_t, PAGE **));
1601  */
1602 int
1603 __ham_overflow_page(dbc, type, pp)
1604 	DBC *dbc;
1605 	u_int32_t type;
1606 	PAGE **pp;
1607 {
1608 	DB *dbp;
1609 	HASH_CURSOR *hcp;
1610 	DB_LSN *lsnp, new_lsn;
1611 	PAGE *p;
1612 	db_pgno_t new_addr, next_free, newalloc_flag;
1613 	u_int32_t offset, splitnum;
1614 	int ret;
1615 
1616 	ret = 0;
1617 	dbp = dbc->dbp;
1618 	hcp = (HASH_CURSOR *)dbc->internal;
1619 	DIRTY_META(dbp, hcp, ret);
1620 	if (ret != 0)
1621 		return (ret);
1622 
1623 	/*
1624 	 * This routine is split up into two parts.  First we have
1625 	 * to figure out the address of the new page that we are
1626 	 * allocating.  Then we have to log the allocation.  Only
1627 	 * after the log do we get to complete allocation of the
1628 	 * new page.
1629 	 */
1630 	new_addr = hcp->hdr->last_freed;
1631 	if (new_addr != PGNO_INVALID) {
1632 		if ((ret = __ham_get_page(dbp, new_addr, &p)) != 0)
1633 			return (ret);
1634 		next_free = NEXT_PGNO(p);
1635 		lsnp = &LSN(p);
1636 		newalloc_flag = 0;
1637 	} else {
1638 		splitnum = hcp->hdr->ovfl_point;
1639 		hcp->hdr->spares[splitnum]++;
1640 		offset = hcp->hdr->spares[splitnum] -
1641 		    (splitnum ? hcp->hdr->spares[splitnum - 1] : 0);
1642 		new_addr = PGNO_OF(hcp, hcp->hdr->ovfl_point, offset);
1643 		if (new_addr > MAX_PAGES(hcp)) {
1644 			__db_err(dbp->dbenv, "hash: out of file pages");
1645 			hcp->hdr->spares[splitnum]--;
1646 			return (ENOMEM);
1647 		}
1648 		next_free = PGNO_INVALID;
1649 		p = NULL;
1650 		lsnp = NULL;
1651 		newalloc_flag = 1;
1652 	}
1653 
1654 	if (DB_LOGGING(dbc)) {
1655 		if ((ret = __ham_newpgno_log(dbp->dbenv->lg_info,
1656 		    dbc->txn, &new_lsn, 0, ALLOCPGNO,
1657 		    dbp->log_fileid, new_addr, next_free,
1658 		    0, newalloc_flag, type, lsnp, &hcp->hdr->lsn)) != 0)
1659 			return (ret);
1660 
1661 		hcp->hdr->lsn = new_lsn;
1662 		if (lsnp != NULL)
1663 			*lsnp = new_lsn;
1664 	}
1665 
1666 	if (p != NULL) {
1667 		/* We just took something off the free list, initialize it. */
1668 		hcp->hdr->last_freed = next_free;
1669 		P_INIT(p, hcp->hdr->pagesize, PGNO(p), PGNO_INVALID,
1670 		    PGNO_INVALID, 0, (u_int8_t)type);
1671 	} else {
1672 		/* Get the new page. */
1673 		if ((ret = __ham_new_page(dbp, new_addr, type, &p)) != 0)
1674 			return (ret);
1675 	}
1676 	if (DB_LOGGING(dbc))
1677 		LSN(p) = new_lsn;
1678 
1679 	*pp = p;
1680 	return (0);
1681 }
1682 
1683 #ifdef DEBUG
1684 /*
1685  * PUBLIC: #ifdef DEBUG
1686  * PUBLIC: db_pgno_t __bucket_to_page __P((HASH_CURSOR *, db_pgno_t));
1687  * PUBLIC: #endif
1688  */
1689 db_pgno_t
1690 __bucket_to_page(hcp, n)
1691 	HASH_CURSOR *hcp;
1692 	db_pgno_t n;
1693 {
1694 	int ret_val;
1695 
1696 	ret_val = n + 1;
1697 	if (n != 0)
1698 		ret_val += hcp->hdr->spares[__db_log2(n + 1) - 1];
1699 	return (ret_val);
1700 }
1701 #endif
1702 
1703 /*
1704  * Create a bunch of overflow pages at the current split point.
1705  * PUBLIC: void __ham_init_ovflpages __P((DBC *));
1706  */
1707 void
1708 __ham_init_ovflpages(dbc)
1709 	DBC *dbc;
1710 {
1711 	DB *dbp;
1712 	HASH_CURSOR *hcp;
1713 	DB_LSN new_lsn;
1714 	PAGE *p;
1715 	db_pgno_t last_pgno, new_pgno;
1716 	u_int32_t i, curpages, numpages;
1717 
1718 	dbp = dbc->dbp;
1719 	hcp = (HASH_CURSOR *)dbc->internal;
1720 
1721 	curpages = hcp->hdr->spares[hcp->hdr->ovfl_point] -
1722 	    hcp->hdr->spares[hcp->hdr->ovfl_point - 1];
1723 	numpages = hcp->hdr->ovfl_point + 1 - curpages;
1724 
1725 	last_pgno = hcp->hdr->last_freed;
1726 	new_pgno = PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + 1);
1727 	if (DB_LOGGING(dbc)) {
1728 		(void)__ham_ovfl_log(dbp->dbenv->lg_info,
1729 		    dbc->txn, &new_lsn, 0, dbp->log_fileid, new_pgno,
1730 		    numpages, last_pgno, hcp->hdr->ovfl_point, &hcp->hdr->lsn);
1731 		hcp->hdr->lsn = new_lsn;
1732 	} else
1733 		ZERO_LSN(new_lsn);
1734 
1735 	hcp->hdr->spares[hcp->hdr->ovfl_point] += numpages;
1736 	for (i = numpages; i > 0; i--) {
1737 		if (__ham_new_page(dbp,
1738 		    PGNO_OF(hcp, hcp->hdr->ovfl_point, curpages + i),
1739 		    P_INVALID, &p) != 0)
1740 			break;
1741 		LSN(p) = new_lsn;
1742 		NEXT_PGNO(p) = last_pgno;
1743 		last_pgno = PGNO(p);
1744 		(void)__ham_put_page(dbp, p, 1);
1745 	}
1746 	hcp->hdr->last_freed = last_pgno;
1747 }
1748 
1749 /*
1750  * PUBLIC: int __ham_get_cpage __P((DBC *, db_lockmode_t));
1751  */
1752 int
1753 __ham_get_cpage(dbc, mode)
1754 	DBC *dbc;
1755 	db_lockmode_t mode;
1756 {
1757 	DB *dbp;
1758 	HASH_CURSOR *hcp;
1759 	int ret;
1760 
1761 	dbp = dbc->dbp;
1762 	hcp = (HASH_CURSOR *)dbc->internal;
1763 
1764 	/*
1765 	 * There are three cases with respect to buckets and locks.  If there
1766 	 * is no lock held, then if we are locking, we should get the lock.
1767 	 * If there is a lock held and it's for the current bucket, we don't
1768 	 * need to do anything.  If there is a lock, but it's for a different
1769 	 * bucket, then we need to release and get.
1770 	 */
1771 	if (F_ISSET(dbp, DB_AM_LOCKING)) {
1772 		if (hcp->lock != 0 && hcp->lbucket != hcp->bucket) {
1773 			/*
1774 			 * If this is the original lock, don't release it,
1775 			 * because we may need to restore it upon exit.
1776 			 */
1777 			if (dbc->txn == NULL &&
1778 			    !F_ISSET(hcp, H_ORIGINAL) && (ret =
1779 			    lock_put(dbp->dbenv->lk_info, hcp->lock)) != 0)
1780 				return (ret);
1781 			F_CLR(hcp, H_ORIGINAL);
1782 			hcp->lock = 0;
1783 		}
1784 		if (hcp->lock == 0 && (ret = __ham_lock_bucket(dbc, mode)) != 0)
1785 			return (ret);
1786 		hcp->lbucket = hcp->bucket;
1787 	}
1788 
1789 	if (hcp->pagep == NULL) {
1790 		if (hcp->pgno == PGNO_INVALID) {
1791 			hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
1792 			hcp->bndx = 0;
1793 		}
1794 
1795 		if ((ret =
1796 		    __ham_get_page(dbp, hcp->pgno, &hcp->pagep)) != 0)
1797 			return (ret);
1798 	}
1799 
1800 	if (hcp->dpgno != PGNO_INVALID && hcp->dpagep == NULL)
1801 		if ((ret =
1802 		    __ham_get_page(dbp, hcp->dpgno, &hcp->dpagep)) != 0)
1803 			return (ret);
1804 	return (0);
1805 }
1806 
1807 /*
1808  * Get a new page at the cursor, putting the last page if necessary.
1809  * If the flag is set to H_ISDUP, then we are talking about the
1810  * duplicate page, not the main page.
1811  *
1812  * PUBLIC: int __ham_next_cpage __P((DBC *, db_pgno_t, int, u_int32_t));
1813  */
1814 int
1815 __ham_next_cpage(dbc, pgno, dirty, flags)
1816 	DBC *dbc;
1817 	db_pgno_t pgno;
1818 	int dirty;
1819 	u_int32_t flags;
1820 {
1821 	DB *dbp;
1822 	HASH_CURSOR *hcp;
1823 	PAGE *p;
1824 	int ret;
1825 
1826 	dbp = dbc->dbp;
1827 	hcp = (HASH_CURSOR *)dbc->internal;
1828 	if (LF_ISSET(H_ISDUP) && hcp->dpagep != NULL &&
1829 	    (ret = __ham_put_page(dbp, hcp->dpagep, dirty)) != 0)
1830 		return (ret);
1831 	else if (!LF_ISSET(H_ISDUP) && hcp->pagep != NULL &&
1832 	    (ret = __ham_put_page(dbp, hcp->pagep, dirty)) != 0)
1833 		return (ret);
1834 
1835 	if ((ret = __ham_get_page(dbp, pgno, &p)) != 0)
1836 		return (ret);
1837 
1838 	if (LF_ISSET(H_ISDUP)) {
1839 		hcp->dpagep = p;
1840 		hcp->dpgno = pgno;
1841 		hcp->dndx = 0;
1842 	} else {
1843 		hcp->pagep = p;
1844 		hcp->pgno = pgno;
1845 		hcp->bndx = 0;
1846 	}
1847 
1848 	return (0);
1849 }
1850 
1851 /*
1852  * __ham_lock_bucket --
1853  *	Get the lock on a particular bucket.
1854  */
1855 static int
1856 __ham_lock_bucket(dbc, mode)
1857 	DBC *dbc;
1858 	db_lockmode_t mode;
1859 {
1860 	HASH_CURSOR *hcp;
1861 	int ret;
1862 
1863 	hcp = (HASH_CURSOR *)dbc->internal;
1864 	dbc->lock.pgno = (db_pgno_t)(hcp->bucket);
1865 	if (dbc->txn == NULL)
1866 		ret = lock_get(dbc->dbp->dbenv->lk_info, dbc->locker, 0,
1867 		    &dbc->lock_dbt, mode, &hcp->lock);
1868 	else
1869 		ret = lock_tget(dbc->dbp->dbenv->lk_info, dbc->txn, 0,
1870 		    &dbc->lock_dbt, mode, &hcp->lock);
1871 
1872 	return (ret < 0 ? EAGAIN : ret);
1873 }
1874 
1875 /*
1876  * __ham_dpair --
1877  *	Delete a pair on a page, paying no attention to what the pair
1878  *	represents.  The caller is responsible for freeing up duplicates
1879  *	or offpage entries that might be referenced by this pair.
1880  *
1881  * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t));
1882  */
1883 void
1884 __ham_dpair(dbp, p, pndx)
1885 	DB *dbp;
1886 	PAGE *p;
1887 	u_int32_t pndx;
1888 {
1889 	db_indx_t delta, n;
1890 	u_int8_t *dest, *src;
1891 
1892 	/*
1893 	 * Compute "delta", the amount we have to shift all of the
1894 	 * offsets.  To find the delta, we just need to calculate
1895 	 * the size of the pair of elements we are removing.
1896 	 */
1897 	delta = H_PAIRSIZE(p, dbp->pgsize, pndx);
1898 
1899 	/*
1900 	 * The hard case: we want to remove something other than
1901 	 * the last item on the page.  We need to shift data and
1902 	 * offsets down.
1903 	 */
1904 	if ((db_indx_t)pndx != H_NUMPAIRS(p) - 1) {
1905 		/*
1906 		 * Move the data: src is the first occupied byte on
1907 		 * the page. (Length is delta.)
1908 		 */
1909 		src = (u_int8_t *)p + HOFFSET(p);
1910 
1911 		/*
1912 		 * Destination is delta bytes beyond src.  This might
1913 		 * be an overlapping copy, so we have to use memmove.
1914 		 */
1915 		dest = src + delta;
1916 		memmove(dest, src, p->inp[H_DATAINDEX(pndx)] - HOFFSET(p));
1917 	}
1918 
1919 	/* Adjust the offsets. */
1920 	for (n = (db_indx_t)pndx; n < (db_indx_t)(H_NUMPAIRS(p) - 1); n++) {
1921 		p->inp[H_KEYINDEX(n)] = p->inp[H_KEYINDEX(n+1)] + delta;
1922 		p->inp[H_DATAINDEX(n)] = p->inp[H_DATAINDEX(n+1)] + delta;
1923 	}
1924 
1925 	/* Adjust page metadata. */
1926 	HOFFSET(p) = HOFFSET(p) + delta;
1927 	NUM_ENT(p) = NUM_ENT(p) - 2;
1928 }
1929 
1930