xref: /titanic_52/usr/src/cmd/sendmail/db/btree/bt_recno.c (revision 54925bf60766fbb4f1f2d7c843721406a7b7a3fb)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 
8 #include "config.h"
9 
10 #ifndef lint
11 static const char sccsid[] = "@(#)bt_recno.c	10.53 (Sleepycat) 12/11/98";
12 #endif /* not lint */
13 
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
16 
17 #include <errno.h>
18 #include <limits.h>
19 #include <string.h>
20 #endif
21 
22 #include "db_int.h"
23 #include "db_page.h"
24 #include "btree.h"
25 #include "db_ext.h"
26 #include "shqueue.h"
27 #include "db_shash.h"
28 #include "lock.h"
29 #include "lock_ext.h"
30 
31 static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
32 static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
33 static int __ram_fmap __P((DBC *, db_recno_t));
34 static int __ram_i_delete __P((DBC *));
35 static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
36 static int __ram_source __P((DB *, RECNO *, const char *));
37 static int __ram_sync __P((DB *, u_int32_t));
38 static int __ram_update __P((DBC *, db_recno_t, int));
39 static int __ram_vmap __P((DBC *, db_recno_t));
40 static int __ram_writeback __P((DBC *));
41 
42 /*
43  * In recno, there are two meanings to the on-page "deleted" flag.  If we're
44  * re-numbering records, it means the record was implicitly created.  We skip
45  * over implicitly created records if doing a cursor "next" or "prev", and
 * return DB_KEYEMPTY if they're explicitly requested.  If not re-numbering
47  * records, it means that the record was implicitly created, or was deleted.
48  * We skip over implicitly created or deleted records if doing a cursor "next"
49  * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
50  *
51  * If we're re-numbering records, then we have to detect in the cursor that
52  * a record was deleted, and adjust the cursor as necessary on the next get.
53  * If we're not re-numbering records, then we can detect that a record has
54  * been deleted by looking at the actual on-page record, so we completely
55  * ignore the cursor's delete flag.  This is different from the B+tree code.
56  * It also maintains whether the cursor references a deleted record in the
57  * cursor, and it doesn't always check the on-page value.
58  */
/*
 * Cursor "deleted" flag helpers.  The flag is only meaningful when record
 * numbers are mutable (DB_RE_RENUMBER); otherwise these are no-ops / false.
 *
 * The statement macros are wrapped in do { } while (0) so that they behave
 * as a single statement, e.g. in an unbraced if/else.
 */
#define	CD_SET(dbp, cp) do {						\
	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
		F_SET(cp, C_DELETED);					\
} while (0)
#define	CD_CLR(dbp, cp) do {						\
	if (F_ISSET(dbp, DB_RE_RENUMBER))				\
		F_CLR(cp, C_DELETED);					\
} while (0)
#define	CD_ISSET(dbp, cp)						\
	(F_ISSET(dbp, DB_RE_RENUMBER) && F_ISSET(cp, C_DELETED))
69 
70 /*
71  * __ram_open --
72  *	Recno open function.
73  *
74  * PUBLIC: int __ram_open __P((DB *, DB_INFO *));
75  */
76 int
77 __ram_open(dbp, dbinfo)
78 	DB *dbp;
79 	DB_INFO *dbinfo;
80 {
81 	BTREE *t;
82 	DBC *dbc;
83 	RECNO *rp;
84 	int ret, t_ret;
85 
86 	/* Allocate and initialize the private btree structure. */
87 	if ((ret = __os_calloc(1, sizeof(BTREE), &t)) != 0)
88 		return (ret);
89 	dbp->internal = t;
90 	__bam_setovflsize(dbp);
91 
92 	/* Allocate and initialize the private recno structure. */
93 	if ((ret = __os_calloc(1, sizeof(*rp), &rp)) != 0)
94 		return (ret);
95 	/* Link in the private recno structure. */
96 	t->recno = rp;
97 
98 	/*
99 	 * Intention is to make sure all of the user's selections are okay
100 	 * here and then use them without checking.
101 	 */
102 	if (dbinfo == NULL) {
103 		rp->re_delim = '\n';
104 		rp->re_pad = ' ';
105 		rp->re_fd = -1;
106 		F_SET(rp, RECNO_EOF);
107 	} else {
108 		/*
109 		 * If the user specified a source tree, open it and map it in.
110 		 *
111 		 * !!!
112 		 * We don't complain if the user specified transactions or
113 		 * threads.  It's possible to make it work, but you'd better
114 		 * know what you're doing!
115 		 */
116 		if (dbinfo->re_source == NULL) {
117 			rp->re_fd = -1;
118 			F_SET(rp, RECNO_EOF);
119 		} else {
120 			if ((ret =
121 			    __ram_source(dbp, rp, dbinfo->re_source)) != 0)
122 			goto err;
123 		}
124 
125 		/* Copy delimiter, length and padding values. */
126 		rp->re_delim =
127 		    F_ISSET(dbp, DB_RE_DELIMITER) ? dbinfo->re_delim : '\n';
128 		rp->re_pad = F_ISSET(dbp, DB_RE_PAD) ? dbinfo->re_pad : ' ';
129 
130 		if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
131 			if ((rp->re_len = dbinfo->re_len) == 0) {
132 				__db_err(dbp->dbenv,
133 				    "record length must be greater than 0");
134 				ret = EINVAL;
135 				goto err;
136 			}
137 		} else
138 			rp->re_len = 0;
139 	}
140 
141 	/* Initialize the remaining fields/methods of the DB. */
142 	dbp->am_close = __ram_close;
143 	dbp->del = __ram_delete;
144 	dbp->put = __ram_put;
145 	dbp->stat = __bam_stat;
146 	dbp->sync = __ram_sync;
147 
148 	/* Start up the tree. */
149 	if ((ret = __bam_read_root(dbp)) != 0)
150 		goto err;
151 
152 	/* Set the overflow page size. */
153 	__bam_setovflsize(dbp);
154 
155 	/* If we're snapshotting an underlying source file, do it now. */
156 	if (dbinfo != NULL && F_ISSET(dbinfo, DB_SNAPSHOT)) {
157 		/* Allocate a cursor. */
158 		if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
159 			goto err;
160 
161 		/* Do the snapshot. */
162 		if ((ret = __ram_update(dbc,
163 		    DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
164 			ret = 0;
165 
166 		/* Discard the cursor. */
167 		if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
168 			ret = t_ret;
169 
170 		if (ret != 0)
171 			goto err;
172 	}
173 
174 	return (0);
175 
176 err:	/* If we mmap'd a source file, discard it. */
177 	if (rp->re_smap != NULL)
178 		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
179 
180 	/* If we opened a source file, discard it. */
181 	if (rp->re_fd != -1)
182 		(void)__os_close(rp->re_fd);
183 	if (rp->re_source != NULL)
184 		__os_freestr(rp->re_source);
185 
186 	__os_free(rp, sizeof(*rp));
187 
188 	return (ret);
189 }
190 
191 /*
192  * __ram_delete --
193  *	Recno db->del function.
194  */
195 static int
196 __ram_delete(dbp, txn, key, flags)
197 	DB *dbp;
198 	DB_TXN *txn;
199 	DBT *key;
200 	u_int32_t flags;
201 {
202 	CURSOR *cp;
203 	DBC *dbc;
204 	db_recno_t recno;
205 	int ret, t_ret;
206 
207 	DB_PANIC_CHECK(dbp);
208 
209 	/* Check for invalid flags. */
210 	if ((ret = __db_delchk(dbp,
211 	    key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
212 		return (ret);
213 
214 	/* Acquire a cursor. */
215 	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
216 		return (ret);
217 
218 	DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags);
219 
220 	/* Check the user's record number and fill in as necessary. */
221 	if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
222 		goto err;
223 
224 	/* Do the delete. */
225 	cp = dbc->internal;
226 	cp->recno = recno;
227 	ret = __ram_i_delete(dbc);
228 
229 	/* Release the cursor. */
230 err:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
231 		ret = t_ret;
232 
233 	return (ret);
234 }
235 
236 /*
237  * __ram_i_delete --
238  *	Internal version of recno delete, called by __ram_delete and
239  *	__ram_c_del.
240  */
241 static int
242 __ram_i_delete(dbc)
243 	DBC *dbc;
244 {
245 	BKEYDATA bk;
246 	BTREE *t;
247 	CURSOR *cp;
248 	DB *dbp;
249 	DBT hdr, data;
250 	PAGE *h;
251 	db_indx_t indx;
252 	int exact, ret, stack;
253 
254 	dbp = dbc->dbp;
255 	cp = dbc->internal;
256 	t = dbp->internal;
257 	stack = 0;
258 
259 	/*
260 	 * If this is CDB and this isn't a write cursor, then it's an error.
261 	 * If it is a write cursor, but we don't yet hold the write lock, then
262 	 * we need to upgrade to the write lock.
263 	 */
264 	if (F_ISSET(dbp, DB_AM_CDB)) {
265 		/* Make sure it's a valid update cursor. */
266 		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
267 			return (EINVAL);
268 
269 		if (F_ISSET(dbc, DBC_RMW) &&
270 		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
271 		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
272 		    &dbc->mylock)) != 0)
273 			return (EAGAIN);
274 	}
275 
276 	/* Search the tree for the key; delete only deletes exact matches. */
277 	if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
278 		goto err;
279 	if (!exact) {
280 		ret = DB_NOTFOUND;
281 		goto err;
282 	}
283 	stack = 1;
284 
285 	h = cp->csp->page;
286 	indx = cp->csp->indx;
287 
288 	/*
289 	 * If re-numbering records, the on-page deleted flag can only mean
290 	 * that this record was implicitly created.  Applications aren't
291 	 * permitted to delete records they never created, return an error.
292 	 *
293 	 * If not re-numbering records, the on-page deleted flag means that
294 	 * this record was implicitly created, or, was deleted at some time.
295 	 * The former is an error because applications aren't permitted to
296 	 * delete records they never created, the latter is an error because
297 	 * if the record was "deleted", we could never have found it.
298 	 */
299 	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
300 		ret = DB_KEYEMPTY;
301 		goto err;
302 	}
303 
304 	if (F_ISSET(dbp, DB_RE_RENUMBER)) {
305 		/* Delete the item, adjust the counts, adjust the cursors. */
306 		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
307 			goto err;
308 		__bam_adjust(dbc, -1);
309 		__ram_ca(dbp, cp->recno, CA_DELETE);
310 
311 		/*
312 		 * If the page is empty, delete it.   The whole tree is locked
313 		 * so there are no preparations to make.
314 		 */
315 		if (NUM_ENT(h) == 0 && h->pgno != PGNO_ROOT) {
316 			stack = 0;
317 			ret = __bam_dpages(dbc);
318 		}
319 	} else {
320 		/* Use a delete/put pair to replace the record with a marker. */
321 		if ((ret = __bam_ditem(dbc, h, indx)) != 0)
322 			goto err;
323 
324 		B_TSET(bk.type, B_KEYDATA, 1);
325 		bk.len = 0;
326 		memset(&hdr, 0, sizeof(hdr));
327 		hdr.data = &bk;
328 		hdr.size = SSZA(BKEYDATA, data);
329 		memset(&data, 0, sizeof(data));
330 		data.data = (char *)"";
331 		data.size = 0;
332 		if ((ret = __db_pitem(dbc,
333 		    h, indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
334 			goto err;
335 	}
336 	F_SET(t->recno, RECNO_MODIFIED);
337 
338 err:	if (stack)
339 		__bam_stkrel(dbc, 0);
340 
341 	/* If we upgraded the CDB lock upon entry; downgrade it now. */
342 	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
343 		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
344 		    DB_LOCK_IWRITE, 0);
345 	return (ret);
346 }
347 
348 /*
349  * __ram_put --
350  *	Recno db->put function.
351  */
352 static int
353 __ram_put(dbp, txn, key, data, flags)
354 	DB *dbp;
355 	DB_TXN *txn;
356 	DBT *key, *data;
357 	u_int32_t flags;
358 {
359 	DBC *dbc;
360 	db_recno_t recno;
361 	int ret, t_ret;
362 
363 	DB_PANIC_CHECK(dbp);
364 
365 	/* Check for invalid flags. */
366 	if ((ret = __db_putchk(dbp,
367 	    key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0)
368 		return (ret);
369 
370 	/* Allocate a cursor. */
371 	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
372 		return (ret);
373 
374 	DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags);
375 
376 	/*
377 	 * If we're appending to the tree, make sure we've read in all of
378 	 * the backing source file.  Otherwise, check the user's record
379 	 * number and fill in as necessary.
380 	 */
381 	ret = flags == DB_APPEND ?
382 	    __ram_update(dbc, DB_MAX_RECORDS, 0) :
383 	    __ram_getno(dbc, key, &recno, 1);
384 
385 	/* Add the record. */
386 	if (ret == 0)
387 		ret = __ram_add(dbc, &recno, data, flags, 0);
388 
389 	/* Discard the cursor. */
390 	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
391 		ret = t_ret;
392 
393 	/* Return the record number if we're appending to the tree. */
394 	if (ret == 0 && flags == DB_APPEND)
395 		*(db_recno_t *)key->data = recno;
396 
397 	return (ret);
398 }
399 
400 /*
401  * __ram_sync --
402  *	Recno db->sync function.
403  */
404 static int
405 __ram_sync(dbp, flags)
406 	DB *dbp;
407 	u_int32_t flags;
408 {
409 	DBC *dbc;
410 	int ret, t_ret;
411 
412 	/*
413 	 * Sync the underlying btree.
414 	 *
415 	 * !!!
416 	 * We don't need to do a panic check or flags check, the "real"
417 	 * sync function does all that for us.
418 	 */
419 	if ((ret = __db_sync(dbp, flags)) != 0)
420 		return (ret);
421 
422 	/* Allocate a cursor. */
423 	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
424 		return (ret);
425 
426 	DEBUG_LWRITE(dbc, NULL, "ram_sync", NULL, NULL, flags);
427 
428 	/* Copy back the backing source file. */
429 	ret = __ram_writeback(dbc);
430 
431 	/* Discard the cursor. */
432 	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
433 		ret = t_ret;
434 
435 	return (ret);
436 }
437 
438 /*
439  * __ram_close --
440  *	Recno db->close function.
441  *
442  * PUBLIC: int __ram_close __P((DB *));
443  */
444 int
445 __ram_close(dbp)
446 	DB *dbp;
447 {
448 	RECNO *rp;
449 
450 	rp = ((BTREE *)dbp->internal)->recno;
451 
452 	/* Close any underlying mmap region. */
453 	if (rp->re_smap != NULL)
454 		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
455 
456 	/* Close any backing source file descriptor. */
457 	if (rp->re_fd != -1)
458 		(void)__os_close(rp->re_fd);
459 
460 	/* Free any backing source file name. */
461 	if (rp->re_source != NULL)
462 		__os_freestr(rp->re_source);
463 
464 	/* Free allocated memory. */
465 	__os_free(rp, sizeof(RECNO));
466 	((BTREE *)dbp->internal)->recno = NULL;
467 
468 	/* Close the underlying btree. */
469 	return (__bam_close(dbp));
470 }
471 
472 /*
473  * __ram_c_del --
474  *	Recno cursor->c_del function.
475  *
476  * PUBLIC: int __ram_c_del __P((DBC *, u_int32_t));
477  */
478 int
479 __ram_c_del(dbc, flags)
480 	DBC *dbc;
481 	u_int32_t flags;
482 {
483 	CURSOR *cp;
484 	DB *dbp;
485 	int ret;
486 
487 	dbp = dbc->dbp;
488 	cp = dbc->internal;
489 
490 	DB_PANIC_CHECK(dbp);
491 
492 	/* Check for invalid flags. */
493 	if ((ret = __db_cdelchk(dbp, flags,
494 	    F_ISSET(dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
495 		return (ret);
496 
497 	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_del", NULL, NULL, flags);
498 
499 	/*
500 	 * If we are running CDB, this had better be either a write
501 	 * cursor or an immediate writer.
502 	 */
503 	if (F_ISSET(dbp, DB_AM_CDB))
504 		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
505 			return (EINVAL);
506 
507 	/*
508 	 * The semantics of cursors during delete are as follows: if record
509 	 * numbers are mutable (DB_RE_RENUMBER is set), deleting a record
510 	 * causes the cursor to automatically point to the record immediately
511 	 * following.  In this case it is possible to use a single cursor for
512 	 * repeated delete operations, without intervening operations.
513 	 *
514 	 * If record numbers are not mutable, then records are replaced with
515 	 * a marker containing a delete flag.  If the record referenced by
516 	 * this cursor has already been deleted, we will detect that as part
517 	 * of the delete operation, and fail.
518 	 */
519 	return (__ram_i_delete(dbc));
520 }
521 
522 /*
523  * __ram_c_get --
524  *	Recno cursor->c_get function.
525  *
526  * PUBLIC: int __ram_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
527  */
528 int
529 __ram_c_get(dbc, key, data, flags)
530 	DBC *dbc;
531 	DBT *key, *data;
532 	u_int32_t flags;
533 {
534 	CURSOR *cp, copy;
535 	DB *dbp;
536 	PAGE *h;
537 	db_indx_t indx;
538 	int exact, ret, stack, tmp_rmw;
539 
540 	dbp = dbc->dbp;
541 	cp = dbc->internal;
542 
543 	DB_PANIC_CHECK(dbp);
544 
545 	/* Check for invalid flags. */
546 	if ((ret = __db_cgetchk(dbc->dbp,
547 	    key, data, flags, cp->recno != RECNO_OOB)) != 0)
548 		return (ret);
549 
550 	/* Clear OR'd in additional bits so we can check for flag equality. */
551 	tmp_rmw = 0;
552 	if (LF_ISSET(DB_RMW)) {
553 		if (!F_ISSET(dbp, DB_AM_CDB)) {
554 			tmp_rmw = 1;
555 			F_SET(dbc, DBC_RMW);
556 		}
557 		LF_CLR(DB_RMW);
558 	}
559 
560 	DEBUG_LREAD(dbc, dbc->txn, "ram_c_get",
561 	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
562 
563 	/* Initialize the cursor for a new retrieval. */
564 	copy = *cp;
565 
566 retry:	/* Update the record number. */
567 	stack = 0;
568 	switch (flags) {
569 	case DB_CURRENT:
570 		/*
571 		 * If record numbers are mutable: if we just deleted a record,
572 		 * there is no action necessary, we return the record following
573 		 * the deleted item by virtue of renumbering the tree.
574 		 */
575 		break;
576 	case DB_NEXT:
577 		/*
578 		 * If record numbers are mutable: if we just deleted a record,
579 		 * we have to avoid incrementing the record number so that we
580 		 * return the right record by virtue of renumbering the tree.
581 		 */
582 		if (CD_ISSET(dbp, cp))
583 			break;
584 
585 		if (cp->recno != RECNO_OOB) {
586 			++cp->recno;
587 			break;
588 		}
589 		/* FALLTHROUGH */
590 	case DB_FIRST:
591 		flags = DB_NEXT;
592 		cp->recno = 1;
593 		break;
594 	case DB_PREV:
595 		if (cp->recno != RECNO_OOB) {
596 			if (cp->recno == 1) {
597 				ret = DB_NOTFOUND;
598 				goto err;
599 			}
600 			--cp->recno;
601 			break;
602 		}
603 		/* FALLTHROUGH */
604 	case DB_LAST:
605 		flags = DB_PREV;
606 		if (((ret = __ram_update(dbc,
607 		    DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
608 			goto err;
609 		if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
610 			goto err;
611 		if (cp->recno == 0) {
612 			ret = DB_NOTFOUND;
613 			goto err;
614 		}
615 		break;
616 	case DB_SET:
617 	case DB_SET_RANGE:
618 		if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
619 			goto err;
620 		break;
621 	}
622 
623 	/* Return the key if the user didn't give us one. */
624 	if (flags != DB_SET && flags != DB_SET_RANGE &&
625 	    (ret = __db_retcopy(key, &cp->recno, sizeof(cp->recno),
626 	    &dbc->rkey.data, &dbc->rkey.ulen, dbp->db_malloc)) != 0)
627 		goto err;
628 
629 	/* Search the tree for the record. */
630 	if ((ret = __bam_rsearch(dbc, &cp->recno,
631 	    F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 1, &exact)) != 0)
632 		goto err;
633 	stack = 1;
634 	if (!exact) {
635 		ret = DB_NOTFOUND;
636 		goto err;
637 	}
638 	h = cp->csp->page;
639 	indx = cp->csp->indx;
640 
641 	/*
642 	 * If re-numbering records, the on-page deleted flag means this record
643 	 * was implicitly created.  If not re-numbering records, the on-page
644 	 * deleted flag means this record was implicitly created, or, it was
645 	 * deleted at some time.  Regardless, we skip such records if doing
646 	 * cursor next/prev operations, and fail if the application requested
647 	 * them explicitly.
648 	 */
649 	if (B_DISSET(GET_BKEYDATA(h, indx)->type)) {
650 		if (flags == DB_NEXT || flags == DB_PREV) {
651 			(void)__bam_stkrel(dbc, 0);
652 			goto retry;
653 		}
654 		ret = DB_KEYEMPTY;
655 		goto err;
656 	}
657 
658 	/* Return the data item. */
659 	if ((ret = __db_ret(dbp,
660 	    h, indx, data, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
661 		goto err;
662 
663 	/* The cursor was reset, no further delete adjustment is necessary. */
664 	CD_CLR(dbp, cp);
665 
666 err:	if (stack)
667 		(void)__bam_stkrel(dbc, 0);
668 
669 	/* Release temporary lock upgrade. */
670 	if (tmp_rmw)
671 		F_CLR(dbc, DBC_RMW);
672 
673 	if (ret != 0)
674 		*cp = copy;
675 
676 	return (ret);
677 }
678 
679 /*
680  * __ram_c_put --
681  *	Recno cursor->c_put function.
682  *
683  * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
684  */
685 int
686 __ram_c_put(dbc, key, data, flags)
687 	DBC *dbc;
688 	DBT *key, *data;
689 	u_int32_t flags;
690 {
691 	CURSOR *cp, copy;
692 	DB *dbp;
693 	int exact, ret;
694 	void *arg;
695 
696 	dbp = dbc->dbp;
697 	cp = dbc->internal;
698 
699 	DB_PANIC_CHECK(dbp);
700 
701 	if ((ret = __db_cputchk(dbc->dbp, key, data, flags,
702 	    F_ISSET(dbc->dbp, DB_AM_RDONLY), cp->recno != RECNO_OOB)) != 0)
703 		return (ret);
704 
705 	DEBUG_LWRITE(dbc, dbc->txn, "ram_c_put", NULL, data, flags);
706 
707 	/*
708 	 * If we are running CDB, this had better be either a write
709 	 * cursor or an immediate writer.  If it's a regular writer,
710 	 * that means we have an IWRITE lock and we need to upgrade
711 	 * it to a write lock.
712 	 */
713 	if (F_ISSET(dbp, DB_AM_CDB)) {
714 		if (!F_ISSET(dbc, DBC_RMW | DBC_WRITER))
715 			return (EINVAL);
716 
717 		if (F_ISSET(dbc, DBC_RMW) &&
718 		    (ret = lock_get(dbp->dbenv->lk_info, dbc->locker,
719 		    DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE,
720 		    &dbc->mylock)) != 0)
721 			return (EAGAIN);
722 	}
723 
724 	/* Initialize the cursor for a new retrieval. */
725 	copy = *cp;
726 
727 	/*
728 	 * To split, we need a valid key for the page.  Since it's a cursor,
729 	 * we have to build one.
730 	 *
731 	 * The split code discards all short-term locks and stack pages.
732 	 */
733 	if (0) {
734 split:		arg = &cp->recno;
735 		if ((ret = __bam_split(dbc, arg)) != 0)
736 			goto err;
737 	}
738 
739 	if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
740 		goto err;
741 	if (!exact) {
742 		ret = DB_NOTFOUND;
743 		goto err;
744 	}
745 	if ((ret = __bam_iitem(dbc, &cp->csp->page,
746 	    &cp->csp->indx, key, data, flags, 0)) == DB_NEEDSPLIT) {
747 		if ((ret = __bam_stkrel(dbc, 0)) != 0)
748 			goto err;
749 		goto split;
750 	}
751 	if ((ret = __bam_stkrel(dbc, 0)) != 0)
752 		goto err;
753 
754 	switch (flags) {
755 	case DB_AFTER:
756 		/* Adjust the cursors. */
757 		__ram_ca(dbp, cp->recno, CA_IAFTER);
758 
759 		/* Set this cursor to reference the new record. */
760 		cp->recno = copy.recno + 1;
761 		break;
762 	case DB_BEFORE:
763 		/* Adjust the cursors. */
764 		__ram_ca(dbp, cp->recno, CA_IBEFORE);
765 
766 		/* Set this cursor to reference the new record. */
767 		cp->recno = copy.recno;
768 		break;
769 	}
770 
771 	/* The cursor was reset, no further delete adjustment is necessary. */
772 	CD_CLR(dbp, cp);
773 
774 err:	if (F_ISSET(dbp, DB_AM_CDB) && F_ISSET(dbc, DBC_RMW))
775 		(void)__lock_downgrade(dbp->dbenv->lk_info, dbc->mylock,
776 		    DB_LOCK_IWRITE, 0);
777 
778 	if (ret != 0)
779 		*cp = copy;
780 
781 	return (ret);
782 }
783 
784 /*
785  * __ram_ca --
786  *	Adjust cursors.
787  *
788  * PUBLIC: void __ram_ca __P((DB *, db_recno_t, ca_recno_arg));
789  */
790 void
791 __ram_ca(dbp, recno, op)
792 	DB *dbp;
793 	db_recno_t recno;
794 	ca_recno_arg op;
795 {
796 	CURSOR *cp;
797 	DBC *dbc;
798 
799 	/*
800 	 * Adjust the cursors.  See the comment in __bam_ca_delete().
801 	 */
802 	DB_THREAD_LOCK(dbp);
803 	for (dbc = TAILQ_FIRST(&dbp->active_queue);
804 	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
805 		cp = dbc->internal;
806 		switch (op) {
807 		case CA_DELETE:
808 			if (recno > cp->recno)
809 				--cp->recno;
810 			if (recno == cp->recno)
811 				CD_SET(dbp, cp);
812 			break;
813 		case CA_IAFTER:
814 			if (recno > cp->recno)
815 				++cp->recno;
816 			break;
817 		case CA_IBEFORE:
818 			if (recno >= cp->recno)
819 				++cp->recno;
820 			break;
821 		}
822 	}
823 	DB_THREAD_UNLOCK(dbp);
824 }
825 
826 /*
827  * __ram_getno --
828  *	Check the user's record number, and make sure we've seen it.
829  *
830  * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
831  */
832 int
833 __ram_getno(dbc, key, rep, can_create)
834 	DBC *dbc;
835 	const DBT *key;
836 	db_recno_t *rep;
837 	int can_create;
838 {
839 	DB *dbp;
840 	db_recno_t recno;
841 
842 	dbp = dbc->dbp;
843 
844 	/* Check the user's record number. */
845 	if ((recno = *(db_recno_t *)key->data) == 0) {
846 		__db_err(dbp->dbenv, "illegal record number of 0");
847 		return (EINVAL);
848 	}
849 	if (rep != NULL)
850 		*rep = recno;
851 
852 	/*
853 	 * Btree can neither create records nor read them in.  Recno can
854 	 * do both, see if we can find the record.
855 	 */
856 	return (dbp->type == DB_RECNO ?
857 	    __ram_update(dbc, recno, can_create) : 0);
858 }
859 
860 /*
861  * __ram_update --
862  *	Ensure the tree has records up to and including the specified one.
863  */
864 static int
865 __ram_update(dbc, recno, can_create)
866 	DBC *dbc;
867 	db_recno_t recno;
868 	int can_create;
869 {
870 	BTREE *t;
871 	DB *dbp;
872 	RECNO *rp;
873 	db_recno_t nrecs;
874 	int ret;
875 
876 	dbp = dbc->dbp;
877 	t = dbp->internal;
878 	rp = t->recno;
879 
880 	/*
881 	 * If we can't create records and we've read the entire backing input
882 	 * file, we're done.
883 	 */
884 	if (!can_create && F_ISSET(rp, RECNO_EOF))
885 		return (0);
886 
887 	/*
888 	 * If we haven't seen this record yet, try to get it from the original
889 	 * file.
890 	 */
891 	if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
892 		return (ret);
893 	if (!F_ISSET(rp, RECNO_EOF) && recno > nrecs) {
894 		if ((ret = rp->re_irec(dbc, recno)) != 0)
895 			return (ret);
896 		if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
897 			return (ret);
898 	}
899 
900 	/*
901 	 * If we can create records, create empty ones up to the requested
902 	 * record.
903 	 */
904 	if (!can_create || recno <= nrecs + 1)
905 		return (0);
906 
907 	dbc->rdata.dlen = 0;
908 	dbc->rdata.doff = 0;
909 	dbc->rdata.flags = 0;
910 	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
911 		if (dbc->rdata.ulen < rp->re_len) {
912 			if ((ret =
913 			    __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
914 				dbc->rdata.ulen = 0;
915 				dbc->rdata.data = NULL;
916 				return (ret);
917 			}
918 			dbc->rdata.ulen = rp->re_len;
919 		}
920 		dbc->rdata.size = rp->re_len;
921 		memset(dbc->rdata.data, rp->re_pad, rp->re_len);
922 	} else
923 		dbc->rdata.size = 0;
924 
925 	while (recno > ++nrecs)
926 		if ((ret = __ram_add(dbc,
927 		    &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0)
928 			return (ret);
929 	return (0);
930 }
931 
932 /*
933  * __ram_source --
934  *	Load information about the backing file.
935  */
936 static int
937 __ram_source(dbp, rp, fname)
938 	DB *dbp;
939 	RECNO *rp;
940 	const char *fname;
941 {
942 	size_t size;
943 	u_int32_t bytes, mbytes, oflags;
944 	int ret;
945 
946 	/*
947 	 * !!!
948 	 * The caller has full responsibility for cleaning up on error --
949 	 * (it has to anyway, in case it fails after this routine succeeds).
950 	 */
951 	if ((ret = __db_appname(dbp->dbenv,
952 	    DB_APP_DATA, NULL, fname, 0, NULL, &rp->re_source)) != 0)
953 		return (ret);
954 
955 	oflags = F_ISSET(dbp, DB_AM_RDONLY) ? DB_RDONLY : 0;
956 	if ((ret =
957 	    __db_open(rp->re_source, oflags, oflags, 0, &rp->re_fd)) != 0) {
958 		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
959 		return (ret);
960 	}
961 
962 	/*
963 	 * XXX
964 	 * We'd like to test to see if the file is too big to mmap.  Since we
965 	 * don't know what size or type off_t's or size_t's are, or the largest
966 	 * unsigned integral type is, or what random insanity the local C
967 	 * compiler will perpetrate, doing the comparison in a portable way is
968 	 * flatly impossible.  Hope that mmap fails if the file is too large.
969 	 */
970 	if ((ret = __os_ioinfo(rp->re_source,
971 	    rp->re_fd, &mbytes, &bytes, NULL)) != 0) {
972 		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
973 		return (ret);
974 	}
975 	if (mbytes == 0 && bytes == 0) {
976 		F_SET(rp, RECNO_EOF);
977 		return (0);
978 	}
979 
980 	size = mbytes * MEGABYTE + bytes;
981 	if ((ret = __db_mapfile(rp->re_source,
982 	    rp->re_fd, (size_t)size, 1, &rp->re_smap)) != 0)
983 		return (ret);
984 	rp->re_cmap = rp->re_smap;
985 	rp->re_emap = (u_int8_t *)rp->re_smap + (rp->re_msize = size);
986 	rp->re_irec = F_ISSET(dbp, DB_RE_FIXEDLEN) ?  __ram_fmap : __ram_vmap;
987 	return (0);
988 }
989 
990 /*
991  * __ram_writeback --
992  *	Rewrite the backing file.
993  */
994 static int
995 __ram_writeback(dbc)
996 	DBC *dbc;
997 {
998 	DB *dbp;
999 	DBT key, data;
1000 	RECNO *rp;
1001 	db_recno_t keyno;
1002 	ssize_t nw;
1003 	int fd, ret, t_ret;
1004 	u_int8_t delim, *pad;
1005 
1006 	dbp = dbc->dbp;
1007 	rp = ((BTREE *)dbp->internal)->recno;
1008 
1009 	/* If the file wasn't modified, we're done. */
1010 	if (!F_ISSET(rp, RECNO_MODIFIED))
1011 		return (0);
1012 
1013 	/* If there's no backing source file, we're done. */
1014 	if (rp->re_source == NULL) {
1015 		F_CLR(rp, RECNO_MODIFIED);
1016 		return (0);
1017 	}
1018 
1019 	/*
1020 	 * Read any remaining records into the tree.
1021 	 *
1022 	 * !!!
1023 	 * This is why we can't support transactions when applications specify
1024 	 * backing (re_source) files.  At this point we have to read in the
1025 	 * rest of the records from the file so that we can write all of the
1026 	 * records back out again, which could modify a page for which we'd
1027 	 * have to log changes and which we don't have locked.  This could be
1028 	 * partially fixed by taking a snapshot of the entire file during the
1029 	 * db_open(), or, since db_open() isn't transaction protected, as part
1030 	 * of the first DB operation.  But, if a checkpoint occurs then, the
1031 	 * part of the log holding the copy of the file could be discarded, and
1032 	 * that would make it impossible to recover in the face of disaster.
1033 	 * This could all probably be fixed, but it would require transaction
1034 	 * protecting the backing source file, i.e. mpool would have to know
1035 	 * about it, and we don't want to go there.
1036 	 */
1037 	if ((ret =
1038 	    __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
1039 		return (ret);
1040 
1041 	/*
1042 	 * !!!
1043 	 * Close any underlying mmap region.  This is required for Windows NT
1044 	 * (4.0, Service Pack 2) -- if the file is still mapped, the following
1045 	 * open will fail.
1046 	 */
1047 	if (rp->re_smap != NULL) {
1048 		(void)__db_unmapfile(rp->re_smap, rp->re_msize);
1049 		rp->re_smap = NULL;
1050 	}
1051 
1052 	/* Get rid of any backing file descriptor, just on GP's. */
1053 	if (rp->re_fd != -1) {
1054 		(void)__os_close(rp->re_fd);
1055 		rp->re_fd = -1;
1056 	}
1057 
1058 	/* Open the file, truncating it. */
1059 	if ((ret = __db_open(rp->re_source,
1060 	    DB_SEQUENTIAL | DB_TRUNCATE,
1061 	    DB_SEQUENTIAL | DB_TRUNCATE, 0, &fd)) != 0) {
1062 		__db_err(dbp->dbenv, "%s: %s", rp->re_source, strerror(ret));
1063 		return (ret);
1064 	}
1065 
1066 	/*
1067 	 * We step through the records, writing each one out.  Use the record
1068 	 * number and the dbp->get() function, instead of a cursor, so we find
1069 	 * and write out "deleted" or non-existent records.
1070 	 */
1071 	memset(&key, 0, sizeof(key));
1072 	memset(&data, 0, sizeof(data));
1073 	key.size = sizeof(db_recno_t);
1074 	key.data = &keyno;
1075 
1076 	/*
1077 	 * We'll need the delimiter if we're doing variable-length records,
1078 	 * and the pad character if we're doing fixed-length records.
1079 	 */
1080 	delim = rp->re_delim;
1081 	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
1082 		if ((ret = __os_malloc(rp->re_len, NULL, &pad)) != 0)
1083 			goto err;
1084 		memset(pad, rp->re_pad, rp->re_len);
1085 	} else
1086 		COMPQUIET(pad, NULL);
1087 	for (keyno = 1;; ++keyno) {
1088 		switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
1089 		case 0:
1090 			if ((ret =
1091 			    __os_write(fd, data.data, data.size, &nw)) != 0)
1092 				goto err;
1093 			if (nw != (ssize_t)data.size) {
1094 				ret = EIO;
1095 				goto err;
1096 			}
1097 			break;
1098 		case DB_KEYEMPTY:
1099 			if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
1100 				if ((ret =
1101 				    __os_write(fd, pad, rp->re_len, &nw)) != 0)
1102 					goto err;
1103 				if (nw != (ssize_t)rp->re_len) {
1104 					ret = EIO;
1105 					goto err;
1106 				}
1107 			}
1108 			break;
1109 		case DB_NOTFOUND:
1110 			ret = 0;
1111 			goto done;
1112 		}
1113 		if (!F_ISSET(dbp, DB_RE_FIXEDLEN)) {
1114 			if ((ret = __os_write(fd, &delim, 1, &nw)) != 0)
1115 				goto err;
1116 			if (nw != 1) {
1117 				ret = EIO;
1118 				goto err;
1119 			}
1120 		}
1121 	}
1122 
1123 err:
1124 done:	/* Close the file descriptor. */
1125 	if ((t_ret = __os_close(fd)) != 0 || ret == 0)
1126 		ret = t_ret;
1127 
1128 	if (ret == 0)
1129 		F_CLR(rp, RECNO_MODIFIED);
1130 	return (ret);
1131 }
1132 
1133 /*
1134  * __ram_fmap --
1135  *	Get fixed length records from a file.
1136  */
1137 static int
1138 __ram_fmap(dbc, top)
1139 	DBC *dbc;
1140 	db_recno_t top;
1141 {
1142 	DB *dbp;
1143 	DBT data;
1144 	RECNO *rp;
1145 	db_recno_t recno;
1146 	u_int32_t len;
1147 	u_int8_t *sp, *ep, *p;
1148 	int ret;
1149 
1150 	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
1151 		return (ret);
1152 
1153 	dbp = dbc->dbp;
1154 	rp = ((BTREE *)(dbp->internal))->recno;
1155 
1156 	if (dbc->rdata.ulen < rp->re_len) {
1157 		if ((ret = __os_realloc(&dbc->rdata.data, rp->re_len)) != 0) {
1158 			dbc->rdata.ulen = 0;
1159 			dbc->rdata.data = NULL;
1160 			return (ret);
1161 		}
1162 		dbc->rdata.ulen = rp->re_len;
1163 	}
1164 
1165 	memset(&data, 0, sizeof(data));
1166 	data.data = dbc->rdata.data;
1167 	data.size = rp->re_len;
1168 
1169 	sp = (u_int8_t *)rp->re_cmap;
1170 	ep = (u_int8_t *)rp->re_emap;
1171 	while (recno < top) {
1172 		if (sp >= ep) {
1173 			F_SET(rp, RECNO_EOF);
1174 			return (DB_NOTFOUND);
1175 		}
1176 		len = rp->re_len;
1177 		for (p = dbc->rdata.data;
1178 		    sp < ep && len > 0; *p++ = *sp++, --len)
1179 			;
1180 
1181 		/*
1182 		 * Another process may have read this record from the input
1183 		 * file and stored it into the database already, in which
1184 		 * case we don't need to repeat that operation.  We detect
1185 		 * this by checking if the last record we've read is greater
1186 		 * or equal to the number of records in the database.
1187 		 *
1188 		 * XXX
1189 		 * We should just do a seek, since the records are fixed
1190 		 * length.
1191 		 */
1192 		if (rp->re_last >= recno) {
1193 			if (len != 0)
1194 				memset(p, rp->re_pad, len);
1195 
1196 			++recno;
1197 			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
1198 				return (ret);
1199 		}
1200 		++rp->re_last;
1201 	}
1202 	rp->re_cmap = sp;
1203 	return (0);
1204 }
1205 
1206 /*
1207  * __ram_vmap --
1208  *	Get variable length records from a file.
1209  */
1210 static int
1211 __ram_vmap(dbc, top)
1212 	DBC *dbc;
1213 	db_recno_t top;
1214 {
1215 	DBT data;
1216 	RECNO *rp;
1217 	db_recno_t recno;
1218 	u_int8_t *sp, *ep;
1219 	int delim, ret;
1220 
1221 	rp = ((BTREE *)(dbc->dbp->internal))->recno;
1222 
1223 	if ((ret = __bam_nrecs(dbc, &recno)) != 0)
1224 		return (ret);
1225 
1226 	memset(&data, 0, sizeof(data));
1227 
1228 	delim = rp->re_delim;
1229 
1230 	sp = (u_int8_t *)rp->re_cmap;
1231 	ep = (u_int8_t *)rp->re_emap;
1232 	while (recno < top) {
1233 		if (sp >= ep) {
1234 			F_SET(rp, RECNO_EOF);
1235 			return (DB_NOTFOUND);
1236 		}
1237 		for (data.data = sp; sp < ep && *sp != delim; ++sp)
1238 			;
1239 
1240 		/*
1241 		 * Another process may have read this record from the input
1242 		 * file and stored it into the database already, in which
1243 		 * case we don't need to repeat that operation.  We detect
1244 		 * this by checking if the last record we've read is greater
1245 		 * or equal to the number of records in the database.
1246 		 */
1247 		if (rp->re_last >= recno) {
1248 			data.size = sp - (u_int8_t *)data.data;
1249 			++recno;
1250 			if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
1251 				return (ret);
1252 		}
1253 		++rp->re_last;
1254 		++sp;
1255 	}
1256 	rp->re_cmap = sp;
1257 	return (0);
1258 }
1259 
1260 /*
1261  * __ram_add --
1262  *	Add records into the tree.
1263  */
1264 static int
1265 __ram_add(dbc, recnop, data, flags, bi_flags)
1266 	DBC *dbc;
1267 	db_recno_t *recnop;
1268 	DBT *data;
1269 	u_int32_t flags, bi_flags;
1270 {
1271 	BKEYDATA *bk;
1272 	CURSOR *cp;
1273 	DB *dbp;
1274 	PAGE *h;
1275 	db_indx_t indx;
1276 	int exact, isdeleted, ret, stack;
1277 
1278 	dbp = dbc->dbp;
1279 	cp = dbc->internal;
1280 
1281 retry:	/* Find the slot for insertion. */
1282 	if ((ret = __bam_rsearch(dbc, recnop,
1283 	    S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
1284 		return (ret);
1285 	h = cp->csp->page;
1286 	indx = cp->csp->indx;
1287 	stack = 1;
1288 
1289 	/*
1290 	 * If re-numbering records, the on-page deleted flag means this record
1291 	 * was implicitly created.  If not re-numbering records, the on-page
1292 	 * deleted flag means this record was implicitly created, or, it was
1293 	 * deleted at some time.
1294 	 *
1295 	 * If DB_NOOVERWRITE is set and the item already exists in the tree,
1296 	 * return an error unless the item was either marked for deletion or
1297 	 * only implicitly created.
1298 	 */
1299 	isdeleted = 0;
1300 	if (exact) {
1301 		bk = GET_BKEYDATA(h, indx);
1302 		if (B_DISSET(bk->type))
1303 			isdeleted = 1;
1304 		else
1305 			if (flags == DB_NOOVERWRITE) {
1306 				ret = DB_KEYEXIST;
1307 				goto err;
1308 			}
1309 	}
1310 
1311 	/*
1312 	 * Select the arguments for __bam_iitem() and do the insert.  If the
1313 	 * key is an exact match, or we're replacing the data item with a
1314 	 * new data item, replace the current item.  If the key isn't an exact
1315 	 * match, we're inserting a new key/data pair, before the search
1316 	 * location.
1317 	 */
1318 	switch (ret = __bam_iitem(dbc,
1319 	    &h, &indx, NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
1320 	case 0:
1321 		/*
1322 		 * Don't adjust anything.
1323 		 *
1324 		 * If we inserted a record, no cursors need adjusting because
1325 		 * the only new record it's possible to insert is at the very
1326 		 * end of the tree.  The necessary adjustments to the internal
1327 		 * page counts were made by __bam_iitem().
1328 		 *
1329 		 * If we overwrote a record, no cursors need adjusting because
1330 		 * future DBcursor->get calls will simply return the underlying
1331 		 * record (there's no adjustment made for the DB_CURRENT flag
1332 		 * when a cursor get operation immediately follows a cursor
1333 		 * delete operation, and the normal adjustment for the DB_NEXT
1334 		 * flag is still correct).
1335 		 */
1336 		break;
1337 	case DB_NEEDSPLIT:
1338 		/* Discard the stack of pages and split the page. */
1339 		(void)__bam_stkrel(dbc, 0);
1340 		stack = 0;
1341 
1342 		if ((ret = __bam_split(dbc, recnop)) != 0)
1343 			goto err;
1344 
1345 		goto retry;
1346 		/* NOTREACHED */
1347 	default:
1348 		goto err;
1349 	}
1350 
1351 
1352 err:	if (stack)
1353 		__bam_stkrel(dbc, 0);
1354 
1355 	return (ret);
1356 }
1357