xref: /illumos-gate/usr/src/cmd/sendmail/db/txn/txn.c (revision e5803b76927480e8f9b67b22201c484ccf4c2bcf)
1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998
5  *	Sleepycat Software.  All rights reserved.
6  */
7 /*
8  * Copyright (c) 1995, 1996
9  *	The President and Fellows of Harvard University.  All rights reserved.
10  *
11  * This code is derived from software contributed to Berkeley by
12  * Margo Seltzer.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. All advertising materials mentioning features or use of this software
23  *    must display the following acknowledgement:
24  *	This product includes software developed by the University of
25  *	California, Berkeley and its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  */
42 
43 #include "config.h"
44 
45 #ifndef lint
46 static const char sccsid[] = "@(#)txn.c	10.66 (Sleepycat) 1/3/99";
47 #endif /* not lint */
48 
49 
50 #ifndef NO_SYSTEM_INCLUDES
51 #include <sys/types.h>
52 
53 #include <errno.h>
54 #include <string.h>
55 #include <time.h>
56 #endif
57 
58 #include "db_int.h"
59 #include "shqueue.h"
60 #include "db_page.h"
61 #include "db_shash.h"
62 #include "txn.h"
63 #include "db_dispatch.h"
64 #include "lock.h"
65 #include "log.h"
66 #include "db_am.h"
67 #include "common_ext.h"
68 
69 static int  __txn_begin __P((DB_TXN *));
70 static int  __txn_check_running __P((const DB_TXN *, TXN_DETAIL **));
71 static int  __txn_end __P((DB_TXN *, int));
72 static void __txn_freekids __P((DB_TXN *));
73 static int  __txn_grow_region __P((DB_TXNMGR *));
74 static int  __txn_init __P((DB_TXNREGION *));
75 static int  __txn_undo __P((DB_TXN *));
76 static int  __txn_validate_region __P((DB_TXNMGR *));
77 
78 /*
79  * This file contains the top level routines of the transaction library.
80  * It assumes that a lock manager and log manager that conform to the db_log(3)
81  * and db_lock(3) interfaces exist.
82  *
83  * Initialize a transaction region in shared memory.
84  * Return 0 on success, errno on failure.
85  */
86 static int
87 __txn_init(txn_region)
88 	DB_TXNREGION *txn_region;
89 {
90 	time_t now;
91 
92 	(void)time(&now);
93 
94 	/* maxtxns is already initialized. */
95 	txn_region->magic = DB_TXNMAGIC;
96 	txn_region->version = DB_TXNVERSION;
97 	txn_region->last_txnid = TXN_MINIMUM;
98 	/*
99 	 * XXX
100 	 * If we ever do more types of locking and logging, this changes.
101 	 */
102 	txn_region->logtype = 0;
103 	txn_region->locktype = 0;
104 	txn_region->time_ckp = now;
105 	ZERO_LSN(txn_region->last_ckp);
106 	ZERO_LSN(txn_region->pending_ckp);
107 	SH_TAILQ_INIT(&txn_region->active_txn);
108 	__db_shalloc_init((void *)&txn_region[1],
109 	    TXN_REGION_SIZE(txn_region->maxtxns) - sizeof(DB_TXNREGION));
110 
111 	return (0);
112 }
113 
114 int
115 txn_open(path, flags, mode, dbenv, mgrpp)
116 	const char *path;
117 	u_int32_t flags;
118 	int mode;
119 	DB_ENV *dbenv;
120 	DB_TXNMGR **mgrpp;
121 {
122 	DB_TXNMGR *tmgrp;
123 	u_int32_t maxtxns;
124 	int ret;
125 
126 	/* Validate arguments. */
127 	if (dbenv == NULL)
128 		return (EINVAL);
129 #ifdef HAVE_SPINLOCKS
130 #define	OKFLAGS	(DB_CREATE | DB_THREAD | DB_TXN_NOSYNC)
131 #else
132 #define	OKFLAGS	(DB_CREATE | DB_TXN_NOSYNC)
133 #endif
134 	if ((ret = __db_fchk(dbenv, "txn_open", flags, OKFLAGS)) != 0)
135 		return (ret);
136 
137 	maxtxns = dbenv->tx_max != 0 ? dbenv->tx_max : 20;
138 
139 	/* Now, create the transaction manager structure and set its fields. */
140 	if ((ret = __os_calloc(1, sizeof(DB_TXNMGR), &tmgrp)) != 0)
141 		return (ret);
142 
143 	/* Initialize the transaction manager structure. */
144 	tmgrp->mutexp = NULL;
145 	tmgrp->dbenv = dbenv;
146 	tmgrp->recover =
147 	    dbenv->tx_recover == NULL ? __db_dispatch : dbenv->tx_recover;
148 	tmgrp->flags = LF_ISSET(DB_TXN_NOSYNC | DB_THREAD);
149 	TAILQ_INIT(&tmgrp->txn_chain);
150 
151 	/* Join/create the txn region. */
152 	tmgrp->reginfo.dbenv = dbenv;
153 	tmgrp->reginfo.appname = DB_APP_NONE;
154 	if (path == NULL)
155 		tmgrp->reginfo.path = NULL;
156 	else
157 		if ((ret = __os_strdup(path, &tmgrp->reginfo.path)) != 0)
158 			goto err;
159 	tmgrp->reginfo.file = DEFAULT_TXN_FILE;
160 	tmgrp->reginfo.mode = mode;
161 	tmgrp->reginfo.size = TXN_REGION_SIZE(maxtxns);
162 	tmgrp->reginfo.dbflags = flags;
163 	tmgrp->reginfo.addr = NULL;
164 	tmgrp->reginfo.fd = -1;
165 	tmgrp->reginfo.flags = dbenv->tx_max == 0 ? REGION_SIZEDEF : 0;
166 	if ((ret = __db_rattach(&tmgrp->reginfo)) != 0)
167 		goto err;
168 
169 	/* Fill in region-related fields. */
170 	tmgrp->region = tmgrp->reginfo.addr;
171 	tmgrp->mem = &tmgrp->region[1];
172 
173 	if (F_ISSET(&tmgrp->reginfo, REGION_CREATED)) {
174 		tmgrp->region->maxtxns = maxtxns;
175 		if ((ret = __txn_init(tmgrp->region)) != 0)
176 			goto err;
177 
178 	} else if (tmgrp->region->magic != DB_TXNMAGIC) {
179 		/* Check if valid region. */
180 		__db_err(dbenv, "txn_open: Bad magic number");
181 		ret = EINVAL;
182 		goto err;
183 	}
184 
185 	if (LF_ISSET(DB_THREAD)) {
186 		if ((ret = __db_shalloc(tmgrp->mem, sizeof(db_mutex_t),
187 		    MUTEX_ALIGNMENT, &tmgrp->mutexp)) == 0)
188 			/*
189 			 * Since we only get here if threading is turned on, we
190 			 * know that we have spinlocks, so the offset is going
191 			 * to be ignored.  We put 0 here as a valid placeholder.
192 			 */
193 			__db_mutex_init(tmgrp->mutexp, 0);
194 		if (ret != 0)
195 			goto err;
196 	}
197 
198 	UNLOCK_TXNREGION(tmgrp);
199 	*mgrpp = tmgrp;
200 	return (0);
201 
202 err:	if (tmgrp->reginfo.addr != NULL) {
203 		if (tmgrp->mutexp != NULL)
204 			__db_shalloc_free(tmgrp->mem, tmgrp->mutexp);
205 
206 		UNLOCK_TXNREGION(tmgrp);
207 		(void)__db_rdetach(&tmgrp->reginfo);
208 		if (F_ISSET(&tmgrp->reginfo, REGION_CREATED))
209 			(void)txn_unlink(path, 1, dbenv);
210 	}
211 
212 	if (tmgrp->reginfo.path != NULL)
213 		__os_freestr(tmgrp->reginfo.path);
214 	__os_free(tmgrp, sizeof(*tmgrp));
215 	return (ret);
216 }
217 
218 /*
219  * __txn_panic --
220  *	Panic a transaction region.
221  *
222  * PUBLIC: void __txn_panic __P((DB_ENV *));
223  */
224 void
225 __txn_panic(dbenv)
226 	DB_ENV *dbenv;
227 {
228 	if (dbenv->tx_info != NULL)
229 		dbenv->tx_info->region->hdr.panic = 1;
230 }
231 
232 /*
233  * txn_begin --
234  *	This is a wrapper to the actual begin process.  Normal txn_begin()
235  * allocates a DB_TXN structure for the caller, while txn_xa_begin() does
236  * not.  Other than that, both call into the common __txn_begin code().
237  *
238  * Internally, we use TXN_DETAIL structures, but the DB_TXN structure
239  * provides access to the transaction ID and the offset in the transaction
240  * region of the TXN_DETAIL structure.
241  */
242 int
243 txn_begin(tmgrp, parent, txnpp)
244 	DB_TXNMGR *tmgrp;
245 	DB_TXN *parent, **txnpp;
246 {
247 	DB_TXN *txn;
248 	int ret;
249 
250 	TXN_PANIC_CHECK(tmgrp);
251 
252 	if ((ret = __os_calloc(1, sizeof(DB_TXN), &txn)) != 0)
253 		return (ret);
254 
255 	txn->parent = parent;
256 	TAILQ_INIT(&txn->kids);
257 	txn->mgrp = tmgrp;
258 	txn->flags = TXN_MALLOC;
259 	if ((ret = __txn_begin(txn)) != 0) {
260 		__os_free(txn, sizeof(DB_TXN));
261 		txn = NULL;
262 	}
263 	if (txn != NULL && parent != NULL)
264 		TAILQ_INSERT_HEAD(&parent->kids, txn, klinks);
265 	*txnpp = txn;
266 	return (ret);
267 }
268 
269 /*
270  * __txn_xa_begin --
271  *	XA version of txn_begin.
272  *
273  * PUBLIC: int __txn_xa_begin __P((DB_ENV *, DB_TXN *));
274  */
275 int
276 __txn_xa_begin(dbenv, txn)
277 	DB_ENV *dbenv;
278 	DB_TXN *txn;
279 {
280 	TXN_PANIC_CHECK(dbenv->tx_info);
281 
282 	memset(txn, 0, sizeof(DB_TXN));
283 
284 	txn->mgrp = dbenv->tx_info;
285 
286 	return (__txn_begin(txn));
287 }
288 
289 /*
290  * __txn_begin --
291  *	Normal DB version of txn_begin.
292  */
293 static int
294 __txn_begin(txn)
295 	DB_TXN *txn;
296 {
297 	DB_LSN begin_lsn;
298 	DB_TXNMGR *mgr;
299 	TXN_DETAIL *td;
300 	size_t off;
301 	u_int32_t id;
302 	int ret;
303 
304 	/*
305 	 * We do not have to write begin records (and if we do not, then we
306 	 * need never write records for read-only transactions).  However,
307 	 * we do need to find the current LSN so that we can store it in the
308 	 * transaction structure, so we can know where to take checkpoints.
309 	 */
310 	mgr = txn->mgrp;
311 	if (mgr->dbenv->lg_info != NULL && (ret =
312 	    log_put(mgr->dbenv->lg_info, &begin_lsn, NULL, DB_CURLSN)) != 0)
313 		goto err2;
314 
315 	LOCK_TXNREGION(mgr);
316 
317 	/* Make sure that last_txnid is not going to wrap around. */
318 	if (mgr->region->last_txnid == TXN_INVALID) {
319 		__db_err(mgr->dbenv, "txn_begin: %s  %s",
320 		    "Transaction ID wrapping.",
321 		    "Snapshot your database and start a new log.");
322 		ret = EINVAL;
323 		goto err1;
324 	}
325 
326 	if ((ret = __txn_validate_region(mgr)) != 0)
327 		goto err1;
328 
329 	/* Allocate a new transaction detail structure. */
330 	if ((ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td)) != 0
331 	    && ret == ENOMEM && (ret = __txn_grow_region(mgr)) == 0)
332 	    	ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td);
333 	if (ret != 0)
334 		goto err1;
335 
336 	/* Place transaction on active transaction list. */
337 	SH_TAILQ_INSERT_HEAD(&mgr->region->active_txn, td, links, __txn_detail);
338 
339 	id = ++mgr->region->last_txnid;
340 	++mgr->region->nbegins;
341 
342 	td->txnid = id;
343 	td->begin_lsn = begin_lsn;
344 	ZERO_LSN(td->last_lsn);
345 	td->last_lock = 0;
346 	td->status = TXN_RUNNING;
347 	if (txn->parent != NULL)
348 		td->parent = txn->parent->off;
349 	else
350 		td->parent = 0;
351 
352 	off = (u_int8_t *)td - (u_int8_t *)mgr->region;
353 	UNLOCK_TXNREGION(mgr);
354 
355 	ZERO_LSN(txn->last_lsn);
356 	txn->txnid = id;
357 	txn->off = off;
358 
359 	if (F_ISSET(txn, TXN_MALLOC)) {
360 		LOCK_TXNTHREAD(mgr);
361 		TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links);
362 		UNLOCK_TXNTHREAD(mgr);
363 	}
364 
365 	return (0);
366 
367 err1:	UNLOCK_TXNREGION(mgr);
368 
369 err2:	return (ret);
370 }
371 /*
372  * txn_commit --
373  *	Commit a transaction.
374  */
375 int
376 txn_commit(txnp)
377 	DB_TXN *txnp;
378 {
379 	DB_LOG *logp;
380 	DB_TXNMGR *mgr;
381 	int ret;
382 
383 	mgr = txnp->mgrp;
384 
385 	TXN_PANIC_CHECK(mgr);
386 	if ((ret = __txn_check_running(txnp, NULL)) != 0)
387 		return (ret);
388 
389 	/*
390 	 * If there are any log records, write a log record and sync
391 	 * the log, else do no log writes.  If the commit is for a child
392 	 * transaction, we do not need to commit the child synchronously
393 	 * since if its parent aborts, it will abort too and its parent
394 	 * (or ultimate ancestor) will write synchronously.
395 	 */
396 	if ((logp = mgr->dbenv->lg_info) != NULL &&
397 	    !IS_ZERO_LSN(txnp->last_lsn)) {
398 		if (txnp->parent == NULL)
399 	    		ret = __txn_regop_log(logp, txnp, &txnp->last_lsn,
400 			    F_ISSET(mgr, DB_TXN_NOSYNC) ? 0 : DB_FLUSH,
401 			    TXN_COMMIT);
402 		else
403 	    		ret = __txn_child_log(logp, txnp, &txnp->last_lsn, 0,
404 			    TXN_COMMIT, txnp->parent->txnid);
405 		if (ret != 0)
406 			return (ret);
407 	}
408 
409 	/*
410 	 * If this is the senior ancestor (i.e., it has no children), then we
411 	 * can release all the child transactions since everyone is committing.
412 	 * Then we can release this transaction.  If this is not the ultimate
413 	 * ancestor, then we can neither free it or its children.
414 	 */
415 	if (txnp->parent == NULL)
416 		__txn_freekids(txnp);
417 
418 	return (__txn_end(txnp, 1));
419 }
420 
421 /*
422  * txn_abort --
423  *	Abort a transcation.
424  */
425 int
426 txn_abort(txnp)
427 	DB_TXN *txnp;
428 {
429 	int ret;
430 	DB_TXN *kids;
431 
432 	TXN_PANIC_CHECK(txnp->mgrp);
433 	if ((ret = __txn_check_running(txnp, NULL)) != 0)
434 		return (ret);
435 
436 	for (kids = TAILQ_FIRST(&txnp->kids);
437 	    kids != NULL;
438 	    kids = TAILQ_FIRST(&txnp->kids))
439 		txn_abort(kids);
440 
441 	if ((ret = __txn_undo(txnp)) != 0) {
442 		__db_err(txnp->mgrp->dbenv,
443 		    "txn_abort: Log undo failed %s", strerror(ret));
444 		return (ret);
445 	}
446 	return (__txn_end(txnp, 0));
447 }
448 
449 /*
450  * txn_prepare --
451  *	Flush the log so a future commit is guaranteed to succeed.
452  */
453 int
454 txn_prepare(txnp)
455 	DB_TXN *txnp;
456 {
457 	DBT xid;
458 	DB_ENV *dbenv;
459 	TXN_DETAIL *td;
460 	int ret;
461 
462 	if ((ret = __txn_check_running(txnp, &td)) != 0)
463 		return (ret);
464 
465 	dbenv = txnp->mgrp->dbenv;
466 	memset(&xid, 0, sizeof(xid));
467 	xid.data = td->xid;
468 	/*
469 	 * We indicate that a transaction is an XA transaction by putting
470 	 * a valid size in the xid.size fiels.  XA requires that the transaction
471 	 * be either ENDED or SUSPENDED when prepare is called, so we know
472 	 * that if the xa_status isn't in one of those states, but we are
473 	 * calling prepare that we are not an XA transaction.
474 	 */
475 	xid.size =
476 	    td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED ?
477 	    0 : sizeof(td->xid);
478 	if (dbenv->lg_info != NULL &&
479 	    (ret = __txn_xa_regop_log(dbenv->lg_info, txnp, &txnp->last_lsn,
480 	    F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, TXN_PREPARE,
481 	    &xid, td->format, td->gtrid, td->bqual, &td->begin_lsn)) != 0) {
482 		__db_err(dbenv,
483 		    "txn_prepare: log_write failed %s\n", strerror(ret));
484 		return (ret);
485 	}
486 
487 	LOCK_TXNTHREAD(txnp->mgrp);
488 	td->status = TXN_PREPARED;
489 	UNLOCK_TXNTHREAD(txnp->mgrp);
490 	return (ret);
491 }
492 
493 /*
494  * Return the transaction ID associated with a particular transaction
495  */
496 u_int32_t
497 txn_id(txnp)
498 	DB_TXN *txnp;
499 {
500 	return (txnp->txnid);
501 }
502 
503 /*
504  * txn_close --
505  *	Close the transaction region, does not imply a checkpoint.
506  */
507 int
508 txn_close(tmgrp)
509 	DB_TXNMGR *tmgrp;
510 {
511 	DB_TXN *txnp;
512 	int ret, t_ret;
513 
514 	TXN_PANIC_CHECK(tmgrp);
515 
516 	ret = 0;
517 
518 	/*
519 	 * This function had better only be called once per process
520 	 * (i.e., not per thread), so there should be no synchronization
521 	 * required.
522 	 */
523 	while ((txnp =
524 	    TAILQ_FIRST(&tmgrp->txn_chain)) != TAILQ_END(&tmgrp->txn_chain))
525 		if ((t_ret = txn_abort(txnp)) != 0) {
526 			__txn_end(txnp, 0);
527 			if (ret == 0)
528 				ret = t_ret;
529 		}
530 
531 	if (tmgrp->dbenv->lg_info &&
532 	    (t_ret = log_flush(tmgrp->dbenv->lg_info, NULL)) != 0 && ret == 0)
533 		ret = t_ret;
534 
535 	if (tmgrp->mutexp != NULL) {
536 		LOCK_TXNREGION(tmgrp);
537 		__db_shalloc_free(tmgrp->mem, tmgrp->mutexp);
538 		UNLOCK_TXNREGION(tmgrp);
539 	}
540 
541 	if ((t_ret = __db_rdetach(&tmgrp->reginfo)) != 0 && ret == 0)
542 		ret = t_ret;
543 
544 	if (tmgrp->reginfo.path != NULL)
545 		__os_freestr(tmgrp->reginfo.path);
546 	__os_free(tmgrp, sizeof(*tmgrp));
547 
548 	return (ret);
549 }
550 
551 /*
552  * txn_unlink --
553  *	Remove the transaction region.
554  */
555 int
556 txn_unlink(path, force, dbenv)
557 	const char *path;
558 	int force;
559 	DB_ENV *dbenv;
560 {
561 	REGINFO reginfo;
562 	int ret;
563 
564 	memset(&reginfo, 0, sizeof(reginfo));
565 	reginfo.dbenv = dbenv;
566 	reginfo.appname = DB_APP_NONE;
567 	if (path != NULL && (ret = __os_strdup(path, &reginfo.path)) != 0)
568 		return (ret);
569 	reginfo.file = DEFAULT_TXN_FILE;
570 	ret = __db_runlink(&reginfo, force);
571 	if (reginfo.path != NULL)
572 		__os_freestr(reginfo.path);
573 	return (ret);
574 }
575 
576 /* Internal routines. */
577 
578 /*
579  * Return 0 if the txnp is reasonable, otherwise returns EINVAL.
580  */
581 static int
582 __txn_check_running(txnp, tdp)
583 	const DB_TXN *txnp;
584 	TXN_DETAIL **tdp;
585 {
586 	TXN_DETAIL *tp;
587 
588 	tp = NULL;
589 	if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) {
590 		tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
591 		/*
592 		 * Child transactions could be marked committed which is OK.
593 		 */
594 		if (tp->status != TXN_RUNNING &&
595 		    tp->status != TXN_PREPARED && tp->status != TXN_COMMITTED)
596 			tp = NULL;
597 		if (tdp != NULL)
598 			*tdp = tp;
599 	}
600 
601 	return (tp == NULL ? EINVAL : 0);
602 }
603 
604 static int
605 __txn_end(txnp, is_commit)
606 	DB_TXN *txnp;
607 	int is_commit;
608 {
609 	DB_LOCKREQ request;
610 	DB_TXNMGR *mgr;
611 	TXN_DETAIL *tp;
612 	u_int32_t locker;
613 	int ret;
614 
615 	mgr = txnp->mgrp;
616 
617 	/* Release the locks. */
618 	locker = txnp->txnid;
619 	request.op = txnp->parent == NULL ||
620 	    is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT;
621 
622 	if (mgr->dbenv->lk_info) {
623 		ret =
624 		    lock_tvec(mgr->dbenv->lk_info, txnp, 0, &request, 1, NULL);
625 		if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) {
626 			__db_err(mgr->dbenv, "%s: release locks failed %s",
627 			    is_commit ? "txn_commit" : "txn_abort",
628 			    strerror(ret));
629 			return (ret);
630 		}
631 	}
632 
633 	/* End the transaction. */
634 	LOCK_TXNREGION(mgr);
635 
636 	/*
637 	 * Child transactions that are committing cannot be released until
638 	 * the parent commits, since the parent may abort, causing the child
639 	 * to abort as well.
640 	 */
641 	tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off);
642 	if (txnp->parent == NULL || !is_commit) {
643 		SH_TAILQ_REMOVE(&mgr->region->active_txn,
644 		    tp, links, __txn_detail);
645 
646 		__db_shalloc_free(mgr->mem, tp);
647 	} else
648 		tp->status = is_commit ? TXN_COMMITTED : TXN_ABORTED;
649 
650 	if (is_commit)
651 		mgr->region->ncommits++;
652 	else
653 		mgr->region->naborts++;
654 
655 	UNLOCK_TXNREGION(mgr);
656 
657 	/*
658 	 * If the transaction aborted, we can remove it from its parent links.
659 	 * If it committed, then we need to leave it on, since the parent can
660 	 * still abort.
661 	 */
662 	if (txnp->parent != NULL && !is_commit)
663 		TAILQ_REMOVE(&txnp->parent->kids, txnp, klinks);
664 
665 	/* Free the space. */
666 	if (F_ISSET(txnp, TXN_MALLOC) && (txnp->parent == NULL || !is_commit)) {
667 		LOCK_TXNTHREAD(mgr);
668 		TAILQ_REMOVE(&mgr->txn_chain, txnp, links);
669 		UNLOCK_TXNTHREAD(mgr);
670 
671 		__os_free(txnp, sizeof(*txnp));
672 	}
673 
674 	return (0);
675 }
676 
677 
678 /*
679  * __txn_undo --
680  *	Undo the transaction with id txnid.  Returns 0 on success and
681  *	errno on failure.
682  */
683 static int
684 __txn_undo(txnp)
685 	DB_TXN *txnp;
686 {
687 	DBT rdbt;
688 	DB_LOG *logp;
689 	DB_LSN key_lsn;
690 	DB_TXNMGR *mgr;
691 	int ret;
692 
693 	mgr = txnp->mgrp;
694 	logp = mgr->dbenv->lg_info;
695 	if (logp == NULL)
696 		return (0);
697 
698 	/*
699 	 * This is the simplest way to code this, but if the mallocs during
700 	 * recovery turn out to be a performance issue, we can do the
701 	 * allocation here and use DB_DBT_USERMEM.
702 	 */
703 	memset(&rdbt, 0, sizeof(rdbt));
704 	if (F_ISSET(logp, DB_AM_THREAD))
705 		F_SET(&rdbt, DB_DBT_MALLOC);
706 
707 	key_lsn = txnp->last_lsn;		/* structure assignment */
708 	for (ret = 0; ret == 0 && !IS_ZERO_LSN(key_lsn);) {
709 		/*
710 		 * The dispatch routine returns the lsn of the record
711 		 * before the current one in the key_lsn argument.
712 		 */
713 		if ((ret = log_get(logp, &key_lsn, &rdbt, DB_SET)) == 0) {
714 			ret =
715 			    mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL);
716 			if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) {
717 				__os_free(rdbt.data, rdbt.size);
718 				rdbt.data = NULL;
719 			}
720 		}
721 		if (ret != 0)
722 			return (ret);
723 	}
724 
725 	return (ret);
726 }
727 
728 /*
729  * Transaction checkpoint.
730  * If either kbytes or minutes is non-zero, then we only take the checkpoint
731  * more than "minutes" minutes have passed since the last checkpoint or if
732  * more than "kbytes" of log data have been written since the last checkpoint.
733  * When taking a checkpoint, find the oldest active transaction and figure out
734  * its first LSN.  This is the lowest LSN we can checkpoint, since any record
735  * written after since that point may be involved in a transaction and may
736  * therefore need to be undone in the case of an abort.
737  */
738 int
739 txn_checkpoint(mgr, kbytes, minutes)
740 	const DB_TXNMGR *mgr;
741 	u_int32_t kbytes, minutes;
742 {
743 	DB_LOG *dblp;
744 	DB_LSN ckp_lsn, sync_lsn, last_ckp;
745 	TXN_DETAIL *txnp;
746 	time_t last_ckp_time, now;
747 	u_int32_t kbytes_written;
748 	int ret;
749 
750 	TXN_PANIC_CHECK(mgr);
751 
752 	/*
753 	 * Check if we need to run recovery.
754 	 */
755 	ZERO_LSN(ckp_lsn);
756 	if (minutes != 0) {
757 		(void)time(&now);
758 
759 		LOCK_TXNREGION(mgr);
760 		last_ckp_time = mgr->region->time_ckp;
761 		UNLOCK_TXNREGION(mgr);
762 
763 		if (now - last_ckp_time >= (time_t)(minutes * 60))
764 			goto do_ckp;
765 	}
766 
767 	if (kbytes != 0) {
768 		dblp = mgr->dbenv->lg_info;
769 		LOCK_LOGREGION(dblp);
770 		kbytes_written =
771 		    dblp->lp->stat.st_wc_mbytes * 1024 +
772 		    dblp->lp->stat.st_wc_bytes / 1024;
773 		ckp_lsn = dblp->lp->lsn;
774 		UNLOCK_LOGREGION(dblp);
775 		if (kbytes_written >= (u_int32_t)kbytes)
776 			goto do_ckp;
777 	}
778 
779 	/*
780 	 * If we checked time and data and didn't go to checkpoint,
781 	 * we're done.
782 	 */
783 	if (minutes != 0 || kbytes != 0)
784 		return (0);
785 
786 do_ckp:
787 	if (IS_ZERO_LSN(ckp_lsn)) {
788 		dblp = mgr->dbenv->lg_info;
789 		LOCK_LOGREGION(dblp);
790 		ckp_lsn = dblp->lp->lsn;
791 		UNLOCK_LOGREGION(dblp);
792 	}
793 
794 	/*
795 	 * We have to find an LSN such that all transactions begun
796 	 * before that LSN are complete.
797 	 */
798 	LOCK_TXNREGION(mgr);
799 
800 	if (!IS_ZERO_LSN(mgr->region->pending_ckp))
801 		ckp_lsn = mgr->region->pending_ckp;
802 	else
803 		for (txnp =
804 		    SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail);
805 		    txnp != NULL;
806 		    txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) {
807 
808 			/*
809 			 * Look through the active transactions for the
810 			 * lowest begin lsn.
811 			 */
812 			if (!IS_ZERO_LSN(txnp->begin_lsn) &&
813 			    log_compare(&txnp->begin_lsn, &ckp_lsn) < 0)
814 				ckp_lsn = txnp->begin_lsn;
815 		}
816 
817 	mgr->region->pending_ckp = ckp_lsn;
818 	UNLOCK_TXNREGION(mgr);
819 
820 	/*
821 	 * memp_sync may change the lsn you pass it, so don't pass it
822 	 * the actual ckp_lsn, pass it a temp instead.
823 	 */
824 	sync_lsn = ckp_lsn;
825 	if (mgr->dbenv->mp_info != NULL &&
826 	    (ret = memp_sync(mgr->dbenv->mp_info, &sync_lsn)) != 0) {
827 		/*
828 		 * ret == DB_INCOMPLETE means that there are still buffers to
829 		 * flush, the checkpoint is not complete.  Wait and try again.
830 		 */
831 		if (ret > 0)
832 			__db_err(mgr->dbenv,
833 			    "txn_checkpoint: system failure in memp_sync %s\n",
834 			    strerror(ret));
835 		return (ret);
836 	}
837 	if (mgr->dbenv->lg_info != NULL) {
838 		LOCK_TXNREGION(mgr);
839 		last_ckp = mgr->region->last_ckp;
840 		ZERO_LSN(mgr->region->pending_ckp);
841 		UNLOCK_TXNREGION(mgr);
842 
843 		if ((ret = __txn_ckp_log(mgr->dbenv->lg_info,
844 		   NULL, &ckp_lsn, DB_CHECKPOINT, &ckp_lsn, &last_ckp)) != 0) {
845 			__db_err(mgr->dbenv,
846 			    "txn_checkpoint: log failed at LSN [%ld %ld] %s\n",
847 			    (long)ckp_lsn.file, (long)ckp_lsn.offset,
848 			    strerror(ret));
849 			return (ret);
850 		}
851 
852 		LOCK_TXNREGION(mgr);
853 		mgr->region->last_ckp = ckp_lsn;
854 		(void)time(&mgr->region->time_ckp);
855 		UNLOCK_TXNREGION(mgr);
856 	}
857 	return (0);
858 }
859 
860 /*
861  * __txn_validate_region --
862  *	Called at every interface to verify if the region has changed size,
863  *	and if so, to remap the region in and reset the process' pointers.
864  */
865 static int
866 __txn_validate_region(tp)
867 	DB_TXNMGR *tp;
868 {
869 	int ret;
870 
871 	if (tp->reginfo.size == tp->region->hdr.size)
872 		return (0);
873 
874 	/* Detach/reattach the region. */
875 	if ((ret = __db_rreattach(&tp->reginfo, tp->region->hdr.size)) != 0)
876 		return (ret);
877 
878 	/* Reset region information. */
879 	tp->region = tp->reginfo.addr;
880 	tp->mem = &tp->region[1];
881 
882 	return (0);
883 }
884 
885 static int
886 __txn_grow_region(tp)
887 	DB_TXNMGR *tp;
888 {
889 	size_t incr, oldsize;
890 	u_int32_t mutex_offset, oldmax;
891 	u_int8_t *curaddr;
892 	int ret;
893 
894 	oldmax = tp->region->maxtxns;
895 	incr = oldmax * sizeof(DB_TXN);
896 	mutex_offset = tp->mutexp != NULL ?
897 	    (u_int8_t *)tp->mutexp - (u_int8_t *)tp->region : 0;
898 
899 	oldsize = tp->reginfo.size;
900 	if ((ret = __db_rgrow(&tp->reginfo, oldsize + incr)) != 0)
901 		return (ret);
902 	tp->region = tp->reginfo.addr;
903 
904 	/* Throw the new space on the free list. */
905 	curaddr = (u_int8_t *)tp->region + oldsize;
906 	tp->mem = &tp->region[1];
907 	tp->mutexp = mutex_offset != 0 ?
908 	    (db_mutex_t *)((u_int8_t *)tp->region + mutex_offset) : NULL;
909 
910 	*((size_t *)curaddr) = incr - sizeof(size_t);
911 	curaddr += sizeof(size_t);
912 	__db_shalloc_free(tp->mem, curaddr);
913 
914 	tp->region->maxtxns = 2 * oldmax;
915 
916 	return (0);
917 }
918 
919 int
920 txn_stat(mgr, statp, db_malloc)
921 	DB_TXNMGR *mgr;
922 	DB_TXN_STAT **statp;
923 	void *(*db_malloc) __P((size_t));
924 {
925 	DB_TXN_STAT *stats;
926 	TXN_DETAIL *txnp;
927 	size_t nbytes;
928 	u_int32_t nactive, ndx;
929 	int ret;
930 
931 	TXN_PANIC_CHECK(mgr);
932 
933 	LOCK_TXNREGION(mgr);
934 	nactive = mgr->region->nbegins -
935 	    mgr->region->naborts - mgr->region->ncommits;
936 	UNLOCK_TXNREGION(mgr);
937 
938 	/*
939 	 * Allocate a bunch of extra active structures to handle any
940 	 * that have been created since we unlocked the region.
941 	 */
942 	nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200);
943 	if ((ret = __os_malloc(nbytes, db_malloc, &stats)) != 0)
944 		return (ret);
945 
946 	LOCK_TXNREGION(mgr);
947 	stats->st_last_txnid = mgr->region->last_txnid;
948 	stats->st_last_ckp = mgr->region->last_ckp;
949 	stats->st_maxtxns = mgr->region->maxtxns;
950 	stats->st_naborts = mgr->region->naborts;
951 	stats->st_nbegins = mgr->region->nbegins;
952 	stats->st_ncommits = mgr->region->ncommits;
953 	stats->st_pending_ckp = mgr->region->pending_ckp;
954 	stats->st_time_ckp = mgr->region->time_ckp;
955 	stats->st_nactive = stats->st_nbegins -
956 	    stats->st_naborts - stats->st_ncommits;
957 	if (stats->st_nactive > nactive + 200)
958 		stats->st_nactive = nactive + 200;
959 	stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1];
960 
961 	ndx = 0;
962 	for (txnp = SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail);
963 	    txnp != NULL;
964 	    txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) {
965 		stats->st_txnarray[ndx].txnid = txnp->txnid;
966 		stats->st_txnarray[ndx].lsn = txnp->begin_lsn;
967 		ndx++;
968 
969 		if (ndx >= stats->st_nactive)
970 			break;
971 	}
972 
973 	stats->st_region_wait = mgr->region->hdr.lock.mutex_set_wait;
974 	stats->st_region_nowait = mgr->region->hdr.lock.mutex_set_nowait;
975 	stats->st_refcnt = mgr->region->hdr.refcnt;
976 	stats->st_regsize = mgr->region->hdr.size;
977 
978 	UNLOCK_TXNREGION(mgr);
979 	*statp = stats;
980 	return (0);
981 }
982 
983 static void
984 __txn_freekids(txnp)
985 	DB_TXN *txnp;
986 {
987 	DB_TXNMGR *mgr;
988 	TXN_DETAIL *tp;
989 	DB_TXN *kids;
990 
991 	mgr = txnp->mgrp;
992 
993 	for (kids = TAILQ_FIRST(&txnp->kids);
994 	    kids != NULL;
995 	    kids = TAILQ_FIRST(&txnp->kids)) {
996 		/* Free any children of this transaction. */
997 		__txn_freekids(kids);
998 
999 		/* Free the transaction detail in the region. */
1000 		LOCK_TXNREGION(mgr);
1001 		tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + kids->off);
1002 		SH_TAILQ_REMOVE(&mgr->region->active_txn,
1003 		    tp, links, __txn_detail);
1004 
1005 		__db_shalloc_free(mgr->mem, tp);
1006 		UNLOCK_TXNREGION(mgr);
1007 
1008 		/* Now remove from its parent. */
1009 		TAILQ_REMOVE(&txnp->kids, kids, klinks);
1010 		if (F_ISSET(txnp, TXN_MALLOC)) {
1011 			LOCK_TXNTHREAD(mgr);
1012 			TAILQ_REMOVE(&mgr->txn_chain, kids, links);
1013 			UNLOCK_TXNTHREAD(mgr);
1014 			__os_free(kids, sizeof(*kids));
1015 		}
1016 	}
1017 }
1018 
1019 /*
1020  * __txn_is_ancestor --
1021  * 	Determine if a transaction is an ancestor of another transaction.
1022  * This is used during lock promotion when we do not have the per-process
1023  * data structures that link parents together.  Instead, we'll have to
1024  * follow the links in the transaction region.
1025  *
1026  * PUBLIC: int __txn_is_ancestor __P((DB_TXNMGR *, size_t, size_t));
1027  */
1028 int
1029 __txn_is_ancestor(mgr, hold_off, req_off)
1030 	DB_TXNMGR *mgr;
1031 	size_t hold_off, req_off;
1032 {
1033 	TXN_DETAIL *hold_tp, *req_tp;
1034 
1035 	hold_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + hold_off);
1036 	req_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + req_off);
1037 
1038 	while (req_tp->parent != 0) {
1039 		req_tp =
1040 		    (TXN_DETAIL *)((u_int8_t *)mgr->region + req_tp->parent);
1041 		if (req_tp->txnid == hold_tp->txnid)
1042 			return (1);
1043 	}
1044 
1045 	return (0);
1046 }
1047