1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
6 */
7 /*
8 * Copyright (c) 1995, 1996
9 * The President and Fellows of Harvard University. All rights reserved.
10 *
11 * This code is derived from software contributed to Berkeley by
12 * Margo Seltzer.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. All advertising materials mentioning features or use of this software
23 * must display the following acknowledgement:
24 * This product includes software developed by the University of
25 * California, Berkeley and its contributors.
26 * 4. Neither the name of the University nor the names of its contributors
27 * may be used to endorse or promote products derived from this software
28 * without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 */
42
43 #include "config.h"
44
#ifndef lint
/* Source identification string in "@(#)" form; left out when lint is defined. */
static const char sccsid[] = "@(#)txn.c 10.66 (Sleepycat) 1/3/99";
#endif /* not lint */
48
49
50 #ifndef NO_SYSTEM_INCLUDES
51 #include <sys/types.h>
52
53 #include <errno.h>
54 #include <string.h>
55 #include <time.h>
56 #endif
57
58 #include "db_int.h"
59 #include "shqueue.h"
60 #include "db_page.h"
61 #include "db_shash.h"
62 #include "txn.h"
63 #include "db_dispatch.h"
64 #include "lock.h"
65 #include "log.h"
66 #include "db_am.h"
67 #include "common_ext.h"
68
69 static int __txn_begin __P((DB_TXN *));
70 static int __txn_check_running __P((const DB_TXN *, TXN_DETAIL **));
71 static int __txn_end __P((DB_TXN *, int));
72 static void __txn_freekids __P((DB_TXN *));
73 static int __txn_grow_region __P((DB_TXNMGR *));
74 static int __txn_init __P((DB_TXNREGION *));
75 static int __txn_undo __P((DB_TXN *));
76 static int __txn_validate_region __P((DB_TXNMGR *));
77
78 /*
79 * This file contains the top level routines of the transaction library.
80 * It assumes that a lock manager and log manager that conform to the db_log(3)
81 * and db_lock(3) interfaces exist.
82 *
83 * Initialize a transaction region in shared memory.
84 * Return 0 on success, errno on failure.
85 */
86 static int
__txn_init(txn_region)87 __txn_init(txn_region)
88 DB_TXNREGION *txn_region;
89 {
90 time_t now;
91
92 (void)time(&now);
93
94 /* maxtxns is already initialized. */
95 txn_region->magic = DB_TXNMAGIC;
96 txn_region->version = DB_TXNVERSION;
97 txn_region->last_txnid = TXN_MINIMUM;
98 /*
99 * XXX
100 * If we ever do more types of locking and logging, this changes.
101 */
102 txn_region->logtype = 0;
103 txn_region->locktype = 0;
104 txn_region->time_ckp = now;
105 ZERO_LSN(txn_region->last_ckp);
106 ZERO_LSN(txn_region->pending_ckp);
107 SH_TAILQ_INIT(&txn_region->active_txn);
108 __db_shalloc_init((void *)&txn_region[1],
109 TXN_REGION_SIZE(txn_region->maxtxns) - sizeof(DB_TXNREGION));
110
111 return (0);
112 }
113
114 int
txn_open(path,flags,mode,dbenv,mgrpp)115 txn_open(path, flags, mode, dbenv, mgrpp)
116 const char *path;
117 u_int32_t flags;
118 int mode;
119 DB_ENV *dbenv;
120 DB_TXNMGR **mgrpp;
121 {
122 DB_TXNMGR *tmgrp;
123 u_int32_t maxtxns;
124 int ret;
125
126 /* Validate arguments. */
127 if (dbenv == NULL)
128 return (EINVAL);
129 #ifdef HAVE_SPINLOCKS
130 #define OKFLAGS (DB_CREATE | DB_THREAD | DB_TXN_NOSYNC)
131 #else
132 #define OKFLAGS (DB_CREATE | DB_TXN_NOSYNC)
133 #endif
134 if ((ret = __db_fchk(dbenv, "txn_open", flags, OKFLAGS)) != 0)
135 return (ret);
136
137 maxtxns = dbenv->tx_max != 0 ? dbenv->tx_max : 20;
138
139 /* Now, create the transaction manager structure and set its fields. */
140 if ((ret = __os_calloc(1, sizeof(DB_TXNMGR), &tmgrp)) != 0)
141 return (ret);
142
143 /* Initialize the transaction manager structure. */
144 tmgrp->mutexp = NULL;
145 tmgrp->dbenv = dbenv;
146 tmgrp->recover =
147 dbenv->tx_recover == NULL ? __db_dispatch : dbenv->tx_recover;
148 tmgrp->flags = LF_ISSET(DB_TXN_NOSYNC | DB_THREAD);
149 TAILQ_INIT(&tmgrp->txn_chain);
150
151 /* Join/create the txn region. */
152 tmgrp->reginfo.dbenv = dbenv;
153 tmgrp->reginfo.appname = DB_APP_NONE;
154 if (path == NULL)
155 tmgrp->reginfo.path = NULL;
156 else
157 if ((ret = __os_strdup(path, &tmgrp->reginfo.path)) != 0)
158 goto err;
159 tmgrp->reginfo.file = DEFAULT_TXN_FILE;
160 tmgrp->reginfo.mode = mode;
161 tmgrp->reginfo.size = TXN_REGION_SIZE(maxtxns);
162 tmgrp->reginfo.dbflags = flags;
163 tmgrp->reginfo.addr = NULL;
164 tmgrp->reginfo.fd = -1;
165 tmgrp->reginfo.flags = dbenv->tx_max == 0 ? REGION_SIZEDEF : 0;
166 if ((ret = __db_rattach(&tmgrp->reginfo)) != 0)
167 goto err;
168
169 /* Fill in region-related fields. */
170 tmgrp->region = tmgrp->reginfo.addr;
171 tmgrp->mem = &tmgrp->region[1];
172
173 if (F_ISSET(&tmgrp->reginfo, REGION_CREATED)) {
174 tmgrp->region->maxtxns = maxtxns;
175 if ((ret = __txn_init(tmgrp->region)) != 0)
176 goto err;
177
178 } else if (tmgrp->region->magic != DB_TXNMAGIC) {
179 /* Check if valid region. */
180 __db_err(dbenv, "txn_open: Bad magic number");
181 ret = EINVAL;
182 goto err;
183 }
184
185 if (LF_ISSET(DB_THREAD)) {
186 if ((ret = __db_shalloc(tmgrp->mem, sizeof(db_mutex_t),
187 MUTEX_ALIGNMENT, &tmgrp->mutexp)) == 0)
188 /*
189 * Since we only get here if threading is turned on, we
190 * know that we have spinlocks, so the offset is going
191 * to be ignored. We put 0 here as a valid placeholder.
192 */
193 __db_mutex_init(tmgrp->mutexp, 0);
194 if (ret != 0)
195 goto err;
196 }
197
198 UNLOCK_TXNREGION(tmgrp);
199 *mgrpp = tmgrp;
200 return (0);
201
202 err: if (tmgrp->reginfo.addr != NULL) {
203 if (tmgrp->mutexp != NULL)
204 __db_shalloc_free(tmgrp->mem, tmgrp->mutexp);
205
206 UNLOCK_TXNREGION(tmgrp);
207 (void)__db_rdetach(&tmgrp->reginfo);
208 if (F_ISSET(&tmgrp->reginfo, REGION_CREATED))
209 (void)txn_unlink(path, 1, dbenv);
210 }
211
212 if (tmgrp->reginfo.path != NULL)
213 __os_freestr(tmgrp->reginfo.path);
214 __os_free(tmgrp, sizeof(*tmgrp));
215 return (ret);
216 }
217
218 /*
219 * __txn_panic --
220 * Panic a transaction region.
221 *
222 * PUBLIC: void __txn_panic __P((DB_ENV *));
223 */
224 void
__txn_panic(dbenv)225 __txn_panic(dbenv)
226 DB_ENV *dbenv;
227 {
228 if (dbenv->tx_info != NULL)
229 dbenv->tx_info->region->hdr.panic = 1;
230 }
231
232 /*
233 * txn_begin --
234 * This is a wrapper to the actual begin process. Normal txn_begin()
235 * allocates a DB_TXN structure for the caller, while txn_xa_begin() does
236 * not. Other than that, both call into the common __txn_begin code().
237 *
238 * Internally, we use TXN_DETAIL structures, but the DB_TXN structure
239 * provides access to the transaction ID and the offset in the transaction
240 * region of the TXN_DETAIL structure.
241 */
242 int
txn_begin(tmgrp,parent,txnpp)243 txn_begin(tmgrp, parent, txnpp)
244 DB_TXNMGR *tmgrp;
245 DB_TXN *parent, **txnpp;
246 {
247 DB_TXN *txn;
248 int ret;
249
250 TXN_PANIC_CHECK(tmgrp);
251
252 if ((ret = __os_calloc(1, sizeof(DB_TXN), &txn)) != 0)
253 return (ret);
254
255 txn->parent = parent;
256 TAILQ_INIT(&txn->kids);
257 txn->mgrp = tmgrp;
258 txn->flags = TXN_MALLOC;
259 if ((ret = __txn_begin(txn)) != 0) {
260 __os_free(txn, sizeof(DB_TXN));
261 txn = NULL;
262 }
263 if (txn != NULL && parent != NULL)
264 TAILQ_INSERT_HEAD(&parent->kids, txn, klinks);
265 *txnpp = txn;
266 return (ret);
267 }
268
269 /*
270 * __txn_xa_begin --
271 * XA version of txn_begin.
272 *
273 * PUBLIC: int __txn_xa_begin __P((DB_ENV *, DB_TXN *));
274 */
275 int
__txn_xa_begin(dbenv,txn)276 __txn_xa_begin(dbenv, txn)
277 DB_ENV *dbenv;
278 DB_TXN *txn;
279 {
280 TXN_PANIC_CHECK(dbenv->tx_info);
281
282 memset(txn, 0, sizeof(DB_TXN));
283
284 txn->mgrp = dbenv->tx_info;
285
286 return (__txn_begin(txn));
287 }
288
289 /*
290 * __txn_begin --
291 * Normal DB version of txn_begin.
292 */
293 static int
__txn_begin(txn)294 __txn_begin(txn)
295 DB_TXN *txn;
296 {
297 DB_LSN begin_lsn;
298 DB_TXNMGR *mgr;
299 TXN_DETAIL *td;
300 size_t off;
301 u_int32_t id;
302 int ret;
303
304 /*
305 * We do not have to write begin records (and if we do not, then we
306 * need never write records for read-only transactions). However,
307 * we do need to find the current LSN so that we can store it in the
308 * transaction structure, so we can know where to take checkpoints.
309 */
310 mgr = txn->mgrp;
311 if (mgr->dbenv->lg_info != NULL && (ret =
312 log_put(mgr->dbenv->lg_info, &begin_lsn, NULL, DB_CURLSN)) != 0)
313 goto err2;
314
315 LOCK_TXNREGION(mgr);
316
317 /* Make sure that last_txnid is not going to wrap around. */
318 if (mgr->region->last_txnid == TXN_INVALID) {
319 __db_err(mgr->dbenv, "txn_begin: %s %s",
320 "Transaction ID wrapping.",
321 "Snapshot your database and start a new log.");
322 ret = EINVAL;
323 goto err1;
324 }
325
326 if ((ret = __txn_validate_region(mgr)) != 0)
327 goto err1;
328
329 /* Allocate a new transaction detail structure. */
330 if ((ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td)) != 0
331 && ret == ENOMEM && (ret = __txn_grow_region(mgr)) == 0)
332 ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td);
333 if (ret != 0)
334 goto err1;
335
336 /* Place transaction on active transaction list. */
337 SH_TAILQ_INSERT_HEAD(&mgr->region->active_txn, td, links, __txn_detail);
338
339 id = ++mgr->region->last_txnid;
340 ++mgr->region->nbegins;
341
342 td->txnid = id;
343 td->begin_lsn = begin_lsn;
344 ZERO_LSN(td->last_lsn);
345 td->last_lock = 0;
346 td->status = TXN_RUNNING;
347 if (txn->parent != NULL)
348 td->parent = txn->parent->off;
349 else
350 td->parent = 0;
351
352 off = (u_int8_t *)td - (u_int8_t *)mgr->region;
353 UNLOCK_TXNREGION(mgr);
354
355 ZERO_LSN(txn->last_lsn);
356 txn->txnid = id;
357 txn->off = off;
358
359 if (F_ISSET(txn, TXN_MALLOC)) {
360 LOCK_TXNTHREAD(mgr);
361 TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links);
362 UNLOCK_TXNTHREAD(mgr);
363 }
364
365 return (0);
366
367 err1: UNLOCK_TXNREGION(mgr);
368
369 err2: return (ret);
370 }
371 /*
372 * txn_commit --
373 * Commit a transaction.
374 */
375 int
txn_commit(txnp)376 txn_commit(txnp)
377 DB_TXN *txnp;
378 {
379 DB_LOG *logp;
380 DB_TXNMGR *mgr;
381 int ret;
382
383 mgr = txnp->mgrp;
384
385 TXN_PANIC_CHECK(mgr);
386 if ((ret = __txn_check_running(txnp, NULL)) != 0)
387 return (ret);
388
389 /*
390 * If there are any log records, write a log record and sync
391 * the log, else do no log writes. If the commit is for a child
392 * transaction, we do not need to commit the child synchronously
393 * since if its parent aborts, it will abort too and its parent
394 * (or ultimate ancestor) will write synchronously.
395 */
396 if ((logp = mgr->dbenv->lg_info) != NULL &&
397 !IS_ZERO_LSN(txnp->last_lsn)) {
398 if (txnp->parent == NULL)
399 ret = __txn_regop_log(logp, txnp, &txnp->last_lsn,
400 F_ISSET(mgr, DB_TXN_NOSYNC) ? 0 : DB_FLUSH,
401 TXN_COMMIT);
402 else
403 ret = __txn_child_log(logp, txnp, &txnp->last_lsn, 0,
404 TXN_COMMIT, txnp->parent->txnid);
405 if (ret != 0)
406 return (ret);
407 }
408
409 /*
410 * If this is the senior ancestor (i.e., it has no children), then we
411 * can release all the child transactions since everyone is committing.
412 * Then we can release this transaction. If this is not the ultimate
413 * ancestor, then we can neither free it or its children.
414 */
415 if (txnp->parent == NULL)
416 __txn_freekids(txnp);
417
418 return (__txn_end(txnp, 1));
419 }
420
421 /*
422 * txn_abort --
423 * Abort a transcation.
424 */
425 int
txn_abort(txnp)426 txn_abort(txnp)
427 DB_TXN *txnp;
428 {
429 int ret;
430 DB_TXN *kids;
431
432 TXN_PANIC_CHECK(txnp->mgrp);
433 if ((ret = __txn_check_running(txnp, NULL)) != 0)
434 return (ret);
435
436 for (kids = TAILQ_FIRST(&txnp->kids);
437 kids != NULL;
438 kids = TAILQ_FIRST(&txnp->kids))
439 txn_abort(kids);
440
441 if ((ret = __txn_undo(txnp)) != 0) {
442 __db_err(txnp->mgrp->dbenv,
443 "txn_abort: Log undo failed %s", strerror(ret));
444 return (ret);
445 }
446 return (__txn_end(txnp, 0));
447 }
448
449 /*
450 * txn_prepare --
451 * Flush the log so a future commit is guaranteed to succeed.
452 */
453 int
txn_prepare(txnp)454 txn_prepare(txnp)
455 DB_TXN *txnp;
456 {
457 DBT xid;
458 DB_ENV *dbenv;
459 TXN_DETAIL *td;
460 int ret;
461
462 if ((ret = __txn_check_running(txnp, &td)) != 0)
463 return (ret);
464
465 dbenv = txnp->mgrp->dbenv;
466 memset(&xid, 0, sizeof(xid));
467 xid.data = td->xid;
468 /*
469 * We indicate that a transaction is an XA transaction by putting
470 * a valid size in the xid.size fiels. XA requires that the transaction
471 * be either ENDED or SUSPENDED when prepare is called, so we know
472 * that if the xa_status isn't in one of those states, but we are
473 * calling prepare that we are not an XA transaction.
474 */
475 xid.size =
476 td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED ?
477 0 : sizeof(td->xid);
478 if (dbenv->lg_info != NULL &&
479 (ret = __txn_xa_regop_log(dbenv->lg_info, txnp, &txnp->last_lsn,
480 F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, TXN_PREPARE,
481 &xid, td->format, td->gtrid, td->bqual, &td->begin_lsn)) != 0) {
482 __db_err(dbenv,
483 "txn_prepare: log_write failed %s\n", strerror(ret));
484 return (ret);
485 }
486
487 LOCK_TXNTHREAD(txnp->mgrp);
488 td->status = TXN_PREPARED;
489 UNLOCK_TXNTHREAD(txnp->mgrp);
490 return (ret);
491 }
492
493 /*
494 * Return the transaction ID associated with a particular transaction
495 */
496 u_int32_t
txn_id(txnp)497 txn_id(txnp)
498 DB_TXN *txnp;
499 {
500 return (txnp->txnid);
501 }
502
503 /*
504 * txn_close --
505 * Close the transaction region, does not imply a checkpoint.
506 */
507 int
txn_close(tmgrp)508 txn_close(tmgrp)
509 DB_TXNMGR *tmgrp;
510 {
511 DB_TXN *txnp;
512 int ret, t_ret;
513
514 TXN_PANIC_CHECK(tmgrp);
515
516 ret = 0;
517
518 /*
519 * This function had better only be called once per process
520 * (i.e., not per thread), so there should be no synchronization
521 * required.
522 */
523 while ((txnp =
524 TAILQ_FIRST(&tmgrp->txn_chain)) != TAILQ_END(&tmgrp->txn_chain))
525 if ((t_ret = txn_abort(txnp)) != 0) {
526 __txn_end(txnp, 0);
527 if (ret == 0)
528 ret = t_ret;
529 }
530
531 if (tmgrp->dbenv->lg_info &&
532 (t_ret = log_flush(tmgrp->dbenv->lg_info, NULL)) != 0 && ret == 0)
533 ret = t_ret;
534
535 if (tmgrp->mutexp != NULL) {
536 LOCK_TXNREGION(tmgrp);
537 __db_shalloc_free(tmgrp->mem, tmgrp->mutexp);
538 UNLOCK_TXNREGION(tmgrp);
539 }
540
541 if ((t_ret = __db_rdetach(&tmgrp->reginfo)) != 0 && ret == 0)
542 ret = t_ret;
543
544 if (tmgrp->reginfo.path != NULL)
545 __os_freestr(tmgrp->reginfo.path);
546 __os_free(tmgrp, sizeof(*tmgrp));
547
548 return (ret);
549 }
550
551 /*
552 * txn_unlink --
553 * Remove the transaction region.
554 */
555 int
txn_unlink(path,force,dbenv)556 txn_unlink(path, force, dbenv)
557 const char *path;
558 int force;
559 DB_ENV *dbenv;
560 {
561 REGINFO reginfo;
562 int ret;
563
564 memset(®info, 0, sizeof(reginfo));
565 reginfo.dbenv = dbenv;
566 reginfo.appname = DB_APP_NONE;
567 if (path != NULL && (ret = __os_strdup(path, ®info.path)) != 0)
568 return (ret);
569 reginfo.file = DEFAULT_TXN_FILE;
570 ret = __db_runlink(®info, force);
571 if (reginfo.path != NULL)
572 __os_freestr(reginfo.path);
573 return (ret);
574 }
575
576 /* Internal routines. */
577
578 /*
579 * Return 0 if the txnp is reasonable, otherwise returns EINVAL.
580 */
581 static int
__txn_check_running(txnp,tdp)582 __txn_check_running(txnp, tdp)
583 const DB_TXN *txnp;
584 TXN_DETAIL **tdp;
585 {
586 TXN_DETAIL *tp;
587
588 tp = NULL;
589 if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) {
590 tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off);
591 /*
592 * Child transactions could be marked committed which is OK.
593 */
594 if (tp->status != TXN_RUNNING &&
595 tp->status != TXN_PREPARED && tp->status != TXN_COMMITTED)
596 tp = NULL;
597 if (tdp != NULL)
598 *tdp = tp;
599 }
600
601 return (tp == NULL ? EINVAL : 0);
602 }
603
604 static int
__txn_end(txnp,is_commit)605 __txn_end(txnp, is_commit)
606 DB_TXN *txnp;
607 int is_commit;
608 {
609 DB_LOCKREQ request;
610 DB_TXNMGR *mgr;
611 TXN_DETAIL *tp;
612 u_int32_t locker;
613 int ret;
614
615 mgr = txnp->mgrp;
616
617 /* Release the locks. */
618 locker = txnp->txnid;
619 request.op = txnp->parent == NULL ||
620 is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT;
621
622 if (mgr->dbenv->lk_info) {
623 ret =
624 lock_tvec(mgr->dbenv->lk_info, txnp, 0, &request, 1, NULL);
625 if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) {
626 __db_err(mgr->dbenv, "%s: release locks failed %s",
627 is_commit ? "txn_commit" : "txn_abort",
628 strerror(ret));
629 return (ret);
630 }
631 }
632
633 /* End the transaction. */
634 LOCK_TXNREGION(mgr);
635
636 /*
637 * Child transactions that are committing cannot be released until
638 * the parent commits, since the parent may abort, causing the child
639 * to abort as well.
640 */
641 tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off);
642 if (txnp->parent == NULL || !is_commit) {
643 SH_TAILQ_REMOVE(&mgr->region->active_txn,
644 tp, links, __txn_detail);
645
646 __db_shalloc_free(mgr->mem, tp);
647 } else
648 tp->status = is_commit ? TXN_COMMITTED : TXN_ABORTED;
649
650 if (is_commit)
651 mgr->region->ncommits++;
652 else
653 mgr->region->naborts++;
654
655 UNLOCK_TXNREGION(mgr);
656
657 /*
658 * If the transaction aborted, we can remove it from its parent links.
659 * If it committed, then we need to leave it on, since the parent can
660 * still abort.
661 */
662 if (txnp->parent != NULL && !is_commit)
663 TAILQ_REMOVE(&txnp->parent->kids, txnp, klinks);
664
665 /* Free the space. */
666 if (F_ISSET(txnp, TXN_MALLOC) && (txnp->parent == NULL || !is_commit)) {
667 LOCK_TXNTHREAD(mgr);
668 TAILQ_REMOVE(&mgr->txn_chain, txnp, links);
669 UNLOCK_TXNTHREAD(mgr);
670
671 __os_free(txnp, sizeof(*txnp));
672 }
673
674 return (0);
675 }
676
677
678 /*
679 * __txn_undo --
680 * Undo the transaction with id txnid. Returns 0 on success and
681 * errno on failure.
682 */
683 static int
__txn_undo(txnp)684 __txn_undo(txnp)
685 DB_TXN *txnp;
686 {
687 DBT rdbt;
688 DB_LOG *logp;
689 DB_LSN key_lsn;
690 DB_TXNMGR *mgr;
691 int ret;
692
693 mgr = txnp->mgrp;
694 logp = mgr->dbenv->lg_info;
695 if (logp == NULL)
696 return (0);
697
698 /*
699 * This is the simplest way to code this, but if the mallocs during
700 * recovery turn out to be a performance issue, we can do the
701 * allocation here and use DB_DBT_USERMEM.
702 */
703 memset(&rdbt, 0, sizeof(rdbt));
704 if (F_ISSET(logp, DB_AM_THREAD))
705 F_SET(&rdbt, DB_DBT_MALLOC);
706
707 key_lsn = txnp->last_lsn; /* structure assignment */
708 for (ret = 0; ret == 0 && !IS_ZERO_LSN(key_lsn);) {
709 /*
710 * The dispatch routine returns the lsn of the record
711 * before the current one in the key_lsn argument.
712 */
713 if ((ret = log_get(logp, &key_lsn, &rdbt, DB_SET)) == 0) {
714 ret =
715 mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL);
716 if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) {
717 __os_free(rdbt.data, rdbt.size);
718 rdbt.data = NULL;
719 }
720 }
721 if (ret != 0)
722 return (ret);
723 }
724
725 return (ret);
726 }
727
728 /*
729 * Transaction checkpoint.
730 * If either kbytes or minutes is non-zero, then we only take the checkpoint
731 * more than "minutes" minutes have passed since the last checkpoint or if
732 * more than "kbytes" of log data have been written since the last checkpoint.
733 * When taking a checkpoint, find the oldest active transaction and figure out
734 * its first LSN. This is the lowest LSN we can checkpoint, since any record
735 * written after since that point may be involved in a transaction and may
736 * therefore need to be undone in the case of an abort.
737 */
738 int
txn_checkpoint(mgr,kbytes,minutes)739 txn_checkpoint(mgr, kbytes, minutes)
740 const DB_TXNMGR *mgr;
741 u_int32_t kbytes, minutes;
742 {
743 DB_LOG *dblp;
744 DB_LSN ckp_lsn, sync_lsn, last_ckp;
745 TXN_DETAIL *txnp;
746 time_t last_ckp_time, now;
747 u_int32_t kbytes_written;
748 int ret;
749
750 TXN_PANIC_CHECK(mgr);
751
752 /*
753 * Check if we need to run recovery.
754 */
755 ZERO_LSN(ckp_lsn);
756 if (minutes != 0) {
757 (void)time(&now);
758
759 LOCK_TXNREGION(mgr);
760 last_ckp_time = mgr->region->time_ckp;
761 UNLOCK_TXNREGION(mgr);
762
763 if (now - last_ckp_time >= (time_t)(minutes * 60))
764 goto do_ckp;
765 }
766
767 if (kbytes != 0) {
768 dblp = mgr->dbenv->lg_info;
769 LOCK_LOGREGION(dblp);
770 kbytes_written =
771 dblp->lp->stat.st_wc_mbytes * 1024 +
772 dblp->lp->stat.st_wc_bytes / 1024;
773 ckp_lsn = dblp->lp->lsn;
774 UNLOCK_LOGREGION(dblp);
775 if (kbytes_written >= (u_int32_t)kbytes)
776 goto do_ckp;
777 }
778
779 /*
780 * If we checked time and data and didn't go to checkpoint,
781 * we're done.
782 */
783 if (minutes != 0 || kbytes != 0)
784 return (0);
785
786 do_ckp:
787 if (IS_ZERO_LSN(ckp_lsn)) {
788 dblp = mgr->dbenv->lg_info;
789 LOCK_LOGREGION(dblp);
790 ckp_lsn = dblp->lp->lsn;
791 UNLOCK_LOGREGION(dblp);
792 }
793
794 /*
795 * We have to find an LSN such that all transactions begun
796 * before that LSN are complete.
797 */
798 LOCK_TXNREGION(mgr);
799
800 if (!IS_ZERO_LSN(mgr->region->pending_ckp))
801 ckp_lsn = mgr->region->pending_ckp;
802 else
803 for (txnp =
804 SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail);
805 txnp != NULL;
806 txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) {
807
808 /*
809 * Look through the active transactions for the
810 * lowest begin lsn.
811 */
812 if (!IS_ZERO_LSN(txnp->begin_lsn) &&
813 log_compare(&txnp->begin_lsn, &ckp_lsn) < 0)
814 ckp_lsn = txnp->begin_lsn;
815 }
816
817 mgr->region->pending_ckp = ckp_lsn;
818 UNLOCK_TXNREGION(mgr);
819
820 /*
821 * memp_sync may change the lsn you pass it, so don't pass it
822 * the actual ckp_lsn, pass it a temp instead.
823 */
824 sync_lsn = ckp_lsn;
825 if (mgr->dbenv->mp_info != NULL &&
826 (ret = memp_sync(mgr->dbenv->mp_info, &sync_lsn)) != 0) {
827 /*
828 * ret == DB_INCOMPLETE means that there are still buffers to
829 * flush, the checkpoint is not complete. Wait and try again.
830 */
831 if (ret > 0)
832 __db_err(mgr->dbenv,
833 "txn_checkpoint: system failure in memp_sync %s\n",
834 strerror(ret));
835 return (ret);
836 }
837 if (mgr->dbenv->lg_info != NULL) {
838 LOCK_TXNREGION(mgr);
839 last_ckp = mgr->region->last_ckp;
840 ZERO_LSN(mgr->region->pending_ckp);
841 UNLOCK_TXNREGION(mgr);
842
843 if ((ret = __txn_ckp_log(mgr->dbenv->lg_info,
844 NULL, &ckp_lsn, DB_CHECKPOINT, &ckp_lsn, &last_ckp)) != 0) {
845 __db_err(mgr->dbenv,
846 "txn_checkpoint: log failed at LSN [%ld %ld] %s\n",
847 (long)ckp_lsn.file, (long)ckp_lsn.offset,
848 strerror(ret));
849 return (ret);
850 }
851
852 LOCK_TXNREGION(mgr);
853 mgr->region->last_ckp = ckp_lsn;
854 (void)time(&mgr->region->time_ckp);
855 UNLOCK_TXNREGION(mgr);
856 }
857 return (0);
858 }
859
860 /*
861 * __txn_validate_region --
862 * Called at every interface to verify if the region has changed size,
863 * and if so, to remap the region in and reset the process' pointers.
864 */
865 static int
__txn_validate_region(tp)866 __txn_validate_region(tp)
867 DB_TXNMGR *tp;
868 {
869 int ret;
870
871 if (tp->reginfo.size == tp->region->hdr.size)
872 return (0);
873
874 /* Detach/reattach the region. */
875 if ((ret = __db_rreattach(&tp->reginfo, tp->region->hdr.size)) != 0)
876 return (ret);
877
878 /* Reset region information. */
879 tp->region = tp->reginfo.addr;
880 tp->mem = &tp->region[1];
881
882 return (0);
883 }
884
885 static int
__txn_grow_region(tp)886 __txn_grow_region(tp)
887 DB_TXNMGR *tp;
888 {
889 size_t incr, oldsize;
890 u_int32_t mutex_offset, oldmax;
891 u_int8_t *curaddr;
892 int ret;
893
894 oldmax = tp->region->maxtxns;
895 incr = oldmax * sizeof(DB_TXN);
896 mutex_offset = tp->mutexp != NULL ?
897 (u_int8_t *)tp->mutexp - (u_int8_t *)tp->region : 0;
898
899 oldsize = tp->reginfo.size;
900 if ((ret = __db_rgrow(&tp->reginfo, oldsize + incr)) != 0)
901 return (ret);
902 tp->region = tp->reginfo.addr;
903
904 /* Throw the new space on the free list. */
905 curaddr = (u_int8_t *)tp->region + oldsize;
906 tp->mem = &tp->region[1];
907 tp->mutexp = mutex_offset != 0 ?
908 (db_mutex_t *)((u_int8_t *)tp->region + mutex_offset) : NULL;
909
910 *((size_t *)curaddr) = incr - sizeof(size_t);
911 curaddr += sizeof(size_t);
912 __db_shalloc_free(tp->mem, curaddr);
913
914 tp->region->maxtxns = 2 * oldmax;
915
916 return (0);
917 }
918
919 int
txn_stat(mgr,statp,db_malloc)920 txn_stat(mgr, statp, db_malloc)
921 DB_TXNMGR *mgr;
922 DB_TXN_STAT **statp;
923 void *(*db_malloc) __P((size_t));
924 {
925 DB_TXN_STAT *stats;
926 TXN_DETAIL *txnp;
927 size_t nbytes;
928 u_int32_t nactive, ndx;
929 int ret;
930
931 TXN_PANIC_CHECK(mgr);
932
933 LOCK_TXNREGION(mgr);
934 nactive = mgr->region->nbegins -
935 mgr->region->naborts - mgr->region->ncommits;
936 UNLOCK_TXNREGION(mgr);
937
938 /*
939 * Allocate a bunch of extra active structures to handle any
940 * that have been created since we unlocked the region.
941 */
942 nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200);
943 if ((ret = __os_malloc(nbytes, db_malloc, &stats)) != 0)
944 return (ret);
945
946 LOCK_TXNREGION(mgr);
947 stats->st_last_txnid = mgr->region->last_txnid;
948 stats->st_last_ckp = mgr->region->last_ckp;
949 stats->st_maxtxns = mgr->region->maxtxns;
950 stats->st_naborts = mgr->region->naborts;
951 stats->st_nbegins = mgr->region->nbegins;
952 stats->st_ncommits = mgr->region->ncommits;
953 stats->st_pending_ckp = mgr->region->pending_ckp;
954 stats->st_time_ckp = mgr->region->time_ckp;
955 stats->st_nactive = stats->st_nbegins -
956 stats->st_naborts - stats->st_ncommits;
957 if (stats->st_nactive > nactive + 200)
958 stats->st_nactive = nactive + 200;
959 stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1];
960
961 ndx = 0;
962 for (txnp = SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail);
963 txnp != NULL;
964 txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) {
965 stats->st_txnarray[ndx].txnid = txnp->txnid;
966 stats->st_txnarray[ndx].lsn = txnp->begin_lsn;
967 ndx++;
968
969 if (ndx >= stats->st_nactive)
970 break;
971 }
972
973 stats->st_region_wait = mgr->region->hdr.lock.mutex_set_wait;
974 stats->st_region_nowait = mgr->region->hdr.lock.mutex_set_nowait;
975 stats->st_refcnt = mgr->region->hdr.refcnt;
976 stats->st_regsize = mgr->region->hdr.size;
977
978 UNLOCK_TXNREGION(mgr);
979 *statp = stats;
980 return (0);
981 }
982
983 static void
__txn_freekids(txnp)984 __txn_freekids(txnp)
985 DB_TXN *txnp;
986 {
987 DB_TXNMGR *mgr;
988 TXN_DETAIL *tp;
989 DB_TXN *kids;
990
991 mgr = txnp->mgrp;
992
993 for (kids = TAILQ_FIRST(&txnp->kids);
994 kids != NULL;
995 kids = TAILQ_FIRST(&txnp->kids)) {
996 /* Free any children of this transaction. */
997 __txn_freekids(kids);
998
999 /* Free the transaction detail in the region. */
1000 LOCK_TXNREGION(mgr);
1001 tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + kids->off);
1002 SH_TAILQ_REMOVE(&mgr->region->active_txn,
1003 tp, links, __txn_detail);
1004
1005 __db_shalloc_free(mgr->mem, tp);
1006 UNLOCK_TXNREGION(mgr);
1007
1008 /* Now remove from its parent. */
1009 TAILQ_REMOVE(&txnp->kids, kids, klinks);
1010 if (F_ISSET(txnp, TXN_MALLOC)) {
1011 LOCK_TXNTHREAD(mgr);
1012 TAILQ_REMOVE(&mgr->txn_chain, kids, links);
1013 UNLOCK_TXNTHREAD(mgr);
1014 __os_free(kids, sizeof(*kids));
1015 }
1016 }
1017 }
1018
1019 /*
1020 * __txn_is_ancestor --
1021 * Determine if a transaction is an ancestor of another transaction.
1022 * This is used during lock promotion when we do not have the per-process
1023 * data structures that link parents together. Instead, we'll have to
1024 * follow the links in the transaction region.
1025 *
1026 * PUBLIC: int __txn_is_ancestor __P((DB_TXNMGR *, size_t, size_t));
1027 */
1028 int
__txn_is_ancestor(mgr,hold_off,req_off)1029 __txn_is_ancestor(mgr, hold_off, req_off)
1030 DB_TXNMGR *mgr;
1031 size_t hold_off, req_off;
1032 {
1033 TXN_DETAIL *hold_tp, *req_tp;
1034
1035 hold_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + hold_off);
1036 req_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + req_off);
1037
1038 while (req_tp->parent != 0) {
1039 req_tp =
1040 (TXN_DETAIL *)((u_int8_t *)mgr->region + req_tp->parent);
1041 if (req_tp->txnid == hold_tp->txnid)
1042 return (1);
1043 }
1044
1045 return (0);
1046 }
1047