1 /*- 2 * See the file LICENSE for redistribution information. 3 * 4 * Copyright (c) 1996, 1997, 1998 5 * Sleepycat Software. All rights reserved. 6 */ 7 /* 8 * Copyright (c) 1995, 1996 9 * The President and Fellows of Harvard University. All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * Margo Seltzer. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. All advertising materials mentioning features or use of this software 23 * must display the following acknowledgement: 24 * This product includes software developed by the University of 25 * California, Berkeley and its contributors. 26 * 4. Neither the name of the University nor the names of its contributors 27 * may be used to endorse or promote products derived from this software 28 * without specific prior written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 33 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 40 * SUCH DAMAGE. 41 */ 42 43 #include "config.h" 44 45 #ifndef lint 46 static const char sccsid[] = "@(#)txn.c 10.66 (Sleepycat) 1/3/99"; 47 #endif /* not lint */ 48 49 50 #ifndef NO_SYSTEM_INCLUDES 51 #include <sys/types.h> 52 53 #include <errno.h> 54 #include <string.h> 55 #include <time.h> 56 #endif 57 58 #include "db_int.h" 59 #include "shqueue.h" 60 #include "db_page.h" 61 #include "db_shash.h" 62 #include "txn.h" 63 #include "db_dispatch.h" 64 #include "lock.h" 65 #include "log.h" 66 #include "db_am.h" 67 #include "common_ext.h" 68 69 static int __txn_begin __P((DB_TXN *)); 70 static int __txn_check_running __P((const DB_TXN *, TXN_DETAIL **)); 71 static int __txn_end __P((DB_TXN *, int)); 72 static void __txn_freekids __P((DB_TXN *)); 73 static int __txn_grow_region __P((DB_TXNMGR *)); 74 static int __txn_init __P((DB_TXNREGION *)); 75 static int __txn_undo __P((DB_TXN *)); 76 static int __txn_validate_region __P((DB_TXNMGR *)); 77 78 /* 79 * This file contains the top level routines of the transaction library. 80 * It assumes that a lock manager and log manager that conform to the db_log(3) 81 * and db_lock(3) interfaces exist. 82 * 83 * Initialize a transaction region in shared memory. 84 * Return 0 on success, errno on failure. 85 */ 86 static int 87 __txn_init(txn_region) 88 DB_TXNREGION *txn_region; 89 { 90 time_t now; 91 92 (void)time(&now); 93 94 /* maxtxns is already initialized. */ 95 txn_region->magic = DB_TXNMAGIC; 96 txn_region->version = DB_TXNVERSION; 97 txn_region->last_txnid = TXN_MINIMUM; 98 /* 99 * XXX 100 * If we ever do more types of locking and logging, this changes. 101 */ 102 txn_region->logtype = 0; 103 txn_region->locktype = 0; 104 txn_region->time_ckp = now; 105 ZERO_LSN(txn_region->last_ckp); 106 ZERO_LSN(txn_region->pending_ckp); 107 SH_TAILQ_INIT(&txn_region->active_txn); 108 __db_shalloc_init((void *)&txn_region[1], 109 TXN_REGION_SIZE(txn_region->maxtxns) - sizeof(DB_TXNREGION)); 110 111 return (0); 112 } 113 114 int 115 txn_open(path, flags, mode, dbenv, mgrpp) 116 const char *path; 117 u_int32_t flags; 118 int mode; 119 DB_ENV *dbenv; 120 DB_TXNMGR **mgrpp; 121 { 122 DB_TXNMGR *tmgrp; 123 u_int32_t maxtxns; 124 int ret; 125 126 /* Validate arguments. */ 127 if (dbenv == NULL) 128 return (EINVAL); 129 #ifdef HAVE_SPINLOCKS 130 #define OKFLAGS (DB_CREATE | DB_THREAD | DB_TXN_NOSYNC) 131 #else 132 #define OKFLAGS (DB_CREATE | DB_TXN_NOSYNC) 133 #endif 134 if ((ret = __db_fchk(dbenv, "txn_open", flags, OKFLAGS)) != 0) 135 return (ret); 136 137 maxtxns = dbenv->tx_max != 0 ? dbenv->tx_max : 20; 138 139 /* Now, create the transaction manager structure and set its fields. */ 140 if ((ret = __os_calloc(1, sizeof(DB_TXNMGR), &tmgrp)) != 0) 141 return (ret); 142 143 /* Initialize the transaction manager structure. */ 144 tmgrp->mutexp = NULL; 145 tmgrp->dbenv = dbenv; 146 tmgrp->recover = 147 dbenv->tx_recover == NULL ? __db_dispatch : dbenv->tx_recover; 148 tmgrp->flags = LF_ISSET(DB_TXN_NOSYNC | DB_THREAD); 149 TAILQ_INIT(&tmgrp->txn_chain); 150 151 /* Join/create the txn region. */ 152 tmgrp->reginfo.dbenv = dbenv; 153 tmgrp->reginfo.appname = DB_APP_NONE; 154 if (path == NULL) 155 tmgrp->reginfo.path = NULL; 156 else 157 if ((ret = __os_strdup(path, &tmgrp->reginfo.path)) != 0) 158 goto err; 159 tmgrp->reginfo.file = DEFAULT_TXN_FILE; 160 tmgrp->reginfo.mode = mode; 161 tmgrp->reginfo.size = TXN_REGION_SIZE(maxtxns); 162 tmgrp->reginfo.dbflags = flags; 163 tmgrp->reginfo.addr = NULL; 164 tmgrp->reginfo.fd = -1; 165 tmgrp->reginfo.flags = dbenv->tx_max == 0 ? REGION_SIZEDEF : 0; 166 if ((ret = __db_rattach(&tmgrp->reginfo)) != 0) 167 goto err; 168 169 /* Fill in region-related fields. */ 170 tmgrp->region = tmgrp->reginfo.addr; 171 tmgrp->mem = &tmgrp->region[1]; 172 173 if (F_ISSET(&tmgrp->reginfo, REGION_CREATED)) { 174 tmgrp->region->maxtxns = maxtxns; 175 if ((ret = __txn_init(tmgrp->region)) != 0) 176 goto err; 177 178 } else if (tmgrp->region->magic != DB_TXNMAGIC) { 179 /* Check if valid region. */ 180 __db_err(dbenv, "txn_open: Bad magic number"); 181 ret = EINVAL; 182 goto err; 183 } 184 185 if (LF_ISSET(DB_THREAD)) { 186 if ((ret = __db_shalloc(tmgrp->mem, sizeof(db_mutex_t), 187 MUTEX_ALIGNMENT, &tmgrp->mutexp)) == 0) 188 /* 189 * Since we only get here if threading is turned on, we 190 * know that we have spinlocks, so the offset is going 191 * to be ignored. We put 0 here as a valid placeholder. 192 */ 193 __db_mutex_init(tmgrp->mutexp, 0); 194 if (ret != 0) 195 goto err; 196 } 197 198 UNLOCK_TXNREGION(tmgrp); 199 *mgrpp = tmgrp; 200 return (0); 201 202 err: if (tmgrp->reginfo.addr != NULL) { 203 if (tmgrp->mutexp != NULL) 204 __db_shalloc_free(tmgrp->mem, tmgrp->mutexp); 205 206 UNLOCK_TXNREGION(tmgrp); 207 (void)__db_rdetach(&tmgrp->reginfo); 208 if (F_ISSET(&tmgrp->reginfo, REGION_CREATED)) 209 (void)txn_unlink(path, 1, dbenv); 210 } 211 212 if (tmgrp->reginfo.path != NULL) 213 __os_freestr(tmgrp->reginfo.path); 214 __os_free(tmgrp, sizeof(*tmgrp)); 215 return (ret); 216 } 217 218 /* 219 * __txn_panic -- 220 * Panic a transaction region. 221 * 222 * PUBLIC: void __txn_panic __P((DB_ENV *)); 223 */ 224 void 225 __txn_panic(dbenv) 226 DB_ENV *dbenv; 227 { 228 if (dbenv->tx_info != NULL) 229 dbenv->tx_info->region->hdr.panic = 1; 230 } 231 232 /* 233 * txn_begin -- 234 * This is a wrapper to the actual begin process. Normal txn_begin() 235 * allocates a DB_TXN structure for the caller, while txn_xa_begin() does 236 * not. Other than that, both call into the common __txn_begin code(). 237 * 238 * Internally, we use TXN_DETAIL structures, but the DB_TXN structure 239 * provides access to the transaction ID and the offset in the transaction 240 * region of the TXN_DETAIL structure. 241 */ 242 int 243 txn_begin(tmgrp, parent, txnpp) 244 DB_TXNMGR *tmgrp; 245 DB_TXN *parent, **txnpp; 246 { 247 DB_TXN *txn; 248 int ret; 249 250 TXN_PANIC_CHECK(tmgrp); 251 252 if ((ret = __os_calloc(1, sizeof(DB_TXN), &txn)) != 0) 253 return (ret); 254 255 txn->parent = parent; 256 TAILQ_INIT(&txn->kids); 257 txn->mgrp = tmgrp; 258 txn->flags = TXN_MALLOC; 259 if ((ret = __txn_begin(txn)) != 0) { 260 __os_free(txn, sizeof(DB_TXN)); 261 txn = NULL; 262 } 263 if (txn != NULL && parent != NULL) 264 TAILQ_INSERT_HEAD(&parent->kids, txn, klinks); 265 *txnpp = txn; 266 return (ret); 267 } 268 269 /* 270 * __txn_xa_begin -- 271 * XA version of txn_begin. 272 * 273 * PUBLIC: int __txn_xa_begin __P((DB_ENV *, DB_TXN *)); 274 */ 275 int 276 __txn_xa_begin(dbenv, txn) 277 DB_ENV *dbenv; 278 DB_TXN *txn; 279 { 280 TXN_PANIC_CHECK(dbenv->tx_info); 281 282 memset(txn, 0, sizeof(DB_TXN)); 283 284 txn->mgrp = dbenv->tx_info; 285 286 return (__txn_begin(txn)); 287 } 288 289 /* 290 * __txn_begin -- 291 * Normal DB version of txn_begin. 292 */ 293 static int 294 __txn_begin(txn) 295 DB_TXN *txn; 296 { 297 DB_LSN begin_lsn; 298 DB_TXNMGR *mgr; 299 TXN_DETAIL *td; 300 size_t off; 301 u_int32_t id; 302 int ret; 303 304 /* 305 * We do not have to write begin records (and if we do not, then we 306 * need never write records for read-only transactions). However, 307 * we do need to find the current LSN so that we can store it in the 308 * transaction structure, so we can know where to take checkpoints. 309 */ 310 mgr = txn->mgrp; 311 if (mgr->dbenv->lg_info != NULL && (ret = 312 log_put(mgr->dbenv->lg_info, &begin_lsn, NULL, DB_CURLSN)) != 0) 313 goto err2; 314 315 LOCK_TXNREGION(mgr); 316 317 /* Make sure that last_txnid is not going to wrap around. */ 318 if (mgr->region->last_txnid == TXN_INVALID) { 319 __db_err(mgr->dbenv, "txn_begin: %s %s", 320 "Transaction ID wrapping.", 321 "Snapshot your database and start a new log."); 322 ret = EINVAL; 323 goto err1; 324 } 325 326 if ((ret = __txn_validate_region(mgr)) != 0) 327 goto err1; 328 329 /* Allocate a new transaction detail structure. */ 330 if ((ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td)) != 0 331 && ret == ENOMEM && (ret = __txn_grow_region(mgr)) == 0) 332 ret = __db_shalloc(mgr->mem, sizeof(TXN_DETAIL), 0, &td); 333 if (ret != 0) 334 goto err1; 335 336 /* Place transaction on active transaction list. */ 337 SH_TAILQ_INSERT_HEAD(&mgr->region->active_txn, td, links, __txn_detail); 338 339 id = ++mgr->region->last_txnid; 340 ++mgr->region->nbegins; 341 342 td->txnid = id; 343 td->begin_lsn = begin_lsn; 344 ZERO_LSN(td->last_lsn); 345 td->last_lock = 0; 346 td->status = TXN_RUNNING; 347 if (txn->parent != NULL) 348 td->parent = txn->parent->off; 349 else 350 td->parent = 0; 351 352 off = (u_int8_t *)td - (u_int8_t *)mgr->region; 353 UNLOCK_TXNREGION(mgr); 354 355 ZERO_LSN(txn->last_lsn); 356 txn->txnid = id; 357 txn->off = off; 358 359 if (F_ISSET(txn, TXN_MALLOC)) { 360 LOCK_TXNTHREAD(mgr); 361 TAILQ_INSERT_TAIL(&mgr->txn_chain, txn, links); 362 UNLOCK_TXNTHREAD(mgr); 363 } 364 365 return (0); 366 367 err1: UNLOCK_TXNREGION(mgr); 368 369 err2: return (ret); 370 } 371 /* 372 * txn_commit -- 373 * Commit a transaction. 374 */ 375 int 376 txn_commit(txnp) 377 DB_TXN *txnp; 378 { 379 DB_LOG *logp; 380 DB_TXNMGR *mgr; 381 int ret; 382 383 mgr = txnp->mgrp; 384 385 TXN_PANIC_CHECK(mgr); 386 if ((ret = __txn_check_running(txnp, NULL)) != 0) 387 return (ret); 388 389 /* 390 * If there are any log records, write a log record and sync 391 * the log, else do no log writes. If the commit is for a child 392 * transaction, we do not need to commit the child synchronously 393 * since if its parent aborts, it will abort too and its parent 394 * (or ultimate ancestor) will write synchronously. 395 */ 396 if ((logp = mgr->dbenv->lg_info) != NULL && 397 !IS_ZERO_LSN(txnp->last_lsn)) { 398 if (txnp->parent == NULL) 399 ret = __txn_regop_log(logp, txnp, &txnp->last_lsn, 400 F_ISSET(mgr, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, 401 TXN_COMMIT); 402 else 403 ret = __txn_child_log(logp, txnp, &txnp->last_lsn, 0, 404 TXN_COMMIT, txnp->parent->txnid); 405 if (ret != 0) 406 return (ret); 407 } 408 409 /* 410 * If this is the senior ancestor (i.e., it has no children), then we 411 * can release all the child transactions since everyone is committing. 412 * Then we can release this transaction. If this is not the ultimate 413 * ancestor, then we can neither free it or its children. 414 */ 415 if (txnp->parent == NULL) 416 __txn_freekids(txnp); 417 418 return (__txn_end(txnp, 1)); 419 } 420 421 /* 422 * txn_abort -- 423 * Abort a transcation. 424 */ 425 int 426 txn_abort(txnp) 427 DB_TXN *txnp; 428 { 429 int ret; 430 DB_TXN *kids; 431 432 TXN_PANIC_CHECK(txnp->mgrp); 433 if ((ret = __txn_check_running(txnp, NULL)) != 0) 434 return (ret); 435 436 for (kids = TAILQ_FIRST(&txnp->kids); 437 kids != NULL; 438 kids = TAILQ_FIRST(&txnp->kids)) 439 txn_abort(kids); 440 441 if ((ret = __txn_undo(txnp)) != 0) { 442 __db_err(txnp->mgrp->dbenv, 443 "txn_abort: Log undo failed %s", strerror(ret)); 444 return (ret); 445 } 446 return (__txn_end(txnp, 0)); 447 } 448 449 /* 450 * txn_prepare -- 451 * Flush the log so a future commit is guaranteed to succeed. 452 */ 453 int 454 txn_prepare(txnp) 455 DB_TXN *txnp; 456 { 457 DBT xid; 458 DB_ENV *dbenv; 459 TXN_DETAIL *td; 460 int ret; 461 462 if ((ret = __txn_check_running(txnp, &td)) != 0) 463 return (ret); 464 465 dbenv = txnp->mgrp->dbenv; 466 memset(&xid, 0, sizeof(xid)); 467 xid.data = td->xid; 468 /* 469 * We indicate that a transaction is an XA transaction by putting 470 * a valid size in the xid.size fiels. XA requires that the transaction 471 * be either ENDED or SUSPENDED when prepare is called, so we know 472 * that if the xa_status isn't in one of those states, but we are 473 * calling prepare that we are not an XA transaction. 474 */ 475 xid.size = 476 td->xa_status != TXN_XA_ENDED && td->xa_status != TXN_XA_SUSPENDED ? 477 0 : sizeof(td->xid); 478 if (dbenv->lg_info != NULL && 479 (ret = __txn_xa_regop_log(dbenv->lg_info, txnp, &txnp->last_lsn, 480 F_ISSET(txnp->mgrp, DB_TXN_NOSYNC) ? 0 : DB_FLUSH, TXN_PREPARE, 481 &xid, td->format, td->gtrid, td->bqual, &td->begin_lsn)) != 0) { 482 __db_err(dbenv, 483 "txn_prepare: log_write failed %s\n", strerror(ret)); 484 return (ret); 485 } 486 487 LOCK_TXNTHREAD(txnp->mgrp); 488 td->status = TXN_PREPARED; 489 UNLOCK_TXNTHREAD(txnp->mgrp); 490 return (ret); 491 } 492 493 /* 494 * Return the transaction ID associated with a particular transaction 495 */ 496 u_int32_t 497 txn_id(txnp) 498 DB_TXN *txnp; 499 { 500 return (txnp->txnid); 501 } 502 503 /* 504 * txn_close -- 505 * Close the transaction region, does not imply a checkpoint. 506 */ 507 int 508 txn_close(tmgrp) 509 DB_TXNMGR *tmgrp; 510 { 511 DB_TXN *txnp; 512 int ret, t_ret; 513 514 TXN_PANIC_CHECK(tmgrp); 515 516 ret = 0; 517 518 /* 519 * This function had better only be called once per process 520 * (i.e., not per thread), so there should be no synchronization 521 * required. 522 */ 523 while ((txnp = 524 TAILQ_FIRST(&tmgrp->txn_chain)) != TAILQ_END(&tmgrp->txn_chain)) 525 if ((t_ret = txn_abort(txnp)) != 0) { 526 __txn_end(txnp, 0); 527 if (ret == 0) 528 ret = t_ret; 529 } 530 531 if (tmgrp->dbenv->lg_info && 532 (t_ret = log_flush(tmgrp->dbenv->lg_info, NULL)) != 0 && ret == 0) 533 ret = t_ret; 534 535 if (tmgrp->mutexp != NULL) { 536 LOCK_TXNREGION(tmgrp); 537 __db_shalloc_free(tmgrp->mem, tmgrp->mutexp); 538 UNLOCK_TXNREGION(tmgrp); 539 } 540 541 if ((t_ret = __db_rdetach(&tmgrp->reginfo)) != 0 && ret == 0) 542 ret = t_ret; 543 544 if (tmgrp->reginfo.path != NULL) 545 __os_freestr(tmgrp->reginfo.path); 546 __os_free(tmgrp, sizeof(*tmgrp)); 547 548 return (ret); 549 } 550 551 /* 552 * txn_unlink -- 553 * Remove the transaction region. 554 */ 555 int 556 txn_unlink(path, force, dbenv) 557 const char *path; 558 int force; 559 DB_ENV *dbenv; 560 { 561 REGINFO reginfo; 562 int ret; 563 564 memset(®info, 0, sizeof(reginfo)); 565 reginfo.dbenv = dbenv; 566 reginfo.appname = DB_APP_NONE; 567 if (path != NULL && (ret = __os_strdup(path, ®info.path)) != 0) 568 return (ret); 569 reginfo.file = DEFAULT_TXN_FILE; 570 ret = __db_runlink(®info, force); 571 if (reginfo.path != NULL) 572 __os_freestr(reginfo.path); 573 return (ret); 574 } 575 576 /* Internal routines. */ 577 578 /* 579 * Return 0 if the txnp is reasonable, otherwise returns EINVAL. 580 */ 581 static int 582 __txn_check_running(txnp, tdp) 583 const DB_TXN *txnp; 584 TXN_DETAIL **tdp; 585 { 586 TXN_DETAIL *tp; 587 588 tp = NULL; 589 if (txnp != NULL && txnp->mgrp != NULL && txnp->mgrp->region != NULL) { 590 tp = (TXN_DETAIL *)((u_int8_t *)txnp->mgrp->region + txnp->off); 591 /* 592 * Child transactions could be marked committed which is OK. 593 */ 594 if (tp->status != TXN_RUNNING && 595 tp->status != TXN_PREPARED && tp->status != TXN_COMMITTED) 596 tp = NULL; 597 if (tdp != NULL) 598 *tdp = tp; 599 } 600 601 return (tp == NULL ? EINVAL : 0); 602 } 603 604 static int 605 __txn_end(txnp, is_commit) 606 DB_TXN *txnp; 607 int is_commit; 608 { 609 DB_LOCKREQ request; 610 DB_TXNMGR *mgr; 611 TXN_DETAIL *tp; 612 u_int32_t locker; 613 int ret; 614 615 mgr = txnp->mgrp; 616 617 /* Release the locks. */ 618 locker = txnp->txnid; 619 request.op = txnp->parent == NULL || 620 is_commit == 0 ? DB_LOCK_PUT_ALL : DB_LOCK_INHERIT; 621 622 if (mgr->dbenv->lk_info) { 623 ret = 624 lock_tvec(mgr->dbenv->lk_info, txnp, 0, &request, 1, NULL); 625 if (ret != 0 && (ret != DB_LOCK_DEADLOCK || is_commit)) { 626 __db_err(mgr->dbenv, "%s: release locks failed %s", 627 is_commit ? "txn_commit" : "txn_abort", 628 strerror(ret)); 629 return (ret); 630 } 631 } 632 633 /* End the transaction. */ 634 LOCK_TXNREGION(mgr); 635 636 /* 637 * Child transactions that are committing cannot be released until 638 * the parent commits, since the parent may abort, causing the child 639 * to abort as well. 640 */ 641 tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + txnp->off); 642 if (txnp->parent == NULL || !is_commit) { 643 SH_TAILQ_REMOVE(&mgr->region->active_txn, 644 tp, links, __txn_detail); 645 646 __db_shalloc_free(mgr->mem, tp); 647 } else 648 tp->status = is_commit ? TXN_COMMITTED : TXN_ABORTED; 649 650 if (is_commit) 651 mgr->region->ncommits++; 652 else 653 mgr->region->naborts++; 654 655 UNLOCK_TXNREGION(mgr); 656 657 /* 658 * If the transaction aborted, we can remove it from its parent links. 659 * If it committed, then we need to leave it on, since the parent can 660 * still abort. 661 */ 662 if (txnp->parent != NULL && !is_commit) 663 TAILQ_REMOVE(&txnp->parent->kids, txnp, klinks); 664 665 /* Free the space. */ 666 if (F_ISSET(txnp, TXN_MALLOC) && (txnp->parent == NULL || !is_commit)) { 667 LOCK_TXNTHREAD(mgr); 668 TAILQ_REMOVE(&mgr->txn_chain, txnp, links); 669 UNLOCK_TXNTHREAD(mgr); 670 671 __os_free(txnp, sizeof(*txnp)); 672 } 673 674 return (0); 675 } 676 677 678 /* 679 * __txn_undo -- 680 * Undo the transaction with id txnid. Returns 0 on success and 681 * errno on failure. 682 */ 683 static int 684 __txn_undo(txnp) 685 DB_TXN *txnp; 686 { 687 DBT rdbt; 688 DB_LOG *logp; 689 DB_LSN key_lsn; 690 DB_TXNMGR *mgr; 691 int ret; 692 693 mgr = txnp->mgrp; 694 logp = mgr->dbenv->lg_info; 695 if (logp == NULL) 696 return (0); 697 698 /* 699 * This is the simplest way to code this, but if the mallocs during 700 * recovery turn out to be a performance issue, we can do the 701 * allocation here and use DB_DBT_USERMEM. 702 */ 703 memset(&rdbt, 0, sizeof(rdbt)); 704 if (F_ISSET(logp, DB_AM_THREAD)) 705 F_SET(&rdbt, DB_DBT_MALLOC); 706 707 key_lsn = txnp->last_lsn; /* structure assignment */ 708 for (ret = 0; ret == 0 && !IS_ZERO_LSN(key_lsn);) { 709 /* 710 * The dispatch routine returns the lsn of the record 711 * before the current one in the key_lsn argument. 712 */ 713 if ((ret = log_get(logp, &key_lsn, &rdbt, DB_SET)) == 0) { 714 ret = 715 mgr->recover(logp, &rdbt, &key_lsn, TXN_UNDO, NULL); 716 if (F_ISSET(logp, DB_AM_THREAD) && rdbt.data != NULL) { 717 __os_free(rdbt.data, rdbt.size); 718 rdbt.data = NULL; 719 } 720 } 721 if (ret != 0) 722 return (ret); 723 } 724 725 return (ret); 726 } 727 728 /* 729 * Transaction checkpoint. 730 * If either kbytes or minutes is non-zero, then we only take the checkpoint 731 * more than "minutes" minutes have passed since the last checkpoint or if 732 * more than "kbytes" of log data have been written since the last checkpoint. 733 * When taking a checkpoint, find the oldest active transaction and figure out 734 * its first LSN. This is the lowest LSN we can checkpoint, since any record 735 * written after since that point may be involved in a transaction and may 736 * therefore need to be undone in the case of an abort. 737 */ 738 int 739 txn_checkpoint(mgr, kbytes, minutes) 740 const DB_TXNMGR *mgr; 741 u_int32_t kbytes, minutes; 742 { 743 DB_LOG *dblp; 744 DB_LSN ckp_lsn, sync_lsn, last_ckp; 745 TXN_DETAIL *txnp; 746 time_t last_ckp_time, now; 747 u_int32_t kbytes_written; 748 int ret; 749 750 TXN_PANIC_CHECK(mgr); 751 752 /* 753 * Check if we need to run recovery. 754 */ 755 ZERO_LSN(ckp_lsn); 756 if (minutes != 0) { 757 (void)time(&now); 758 759 LOCK_TXNREGION(mgr); 760 last_ckp_time = mgr->region->time_ckp; 761 UNLOCK_TXNREGION(mgr); 762 763 if (now - last_ckp_time >= (time_t)(minutes * 60)) 764 goto do_ckp; 765 } 766 767 if (kbytes != 0) { 768 dblp = mgr->dbenv->lg_info; 769 LOCK_LOGREGION(dblp); 770 kbytes_written = 771 dblp->lp->stat.st_wc_mbytes * 1024 + 772 dblp->lp->stat.st_wc_bytes / 1024; 773 ckp_lsn = dblp->lp->lsn; 774 UNLOCK_LOGREGION(dblp); 775 if (kbytes_written >= (u_int32_t)kbytes) 776 goto do_ckp; 777 } 778 779 /* 780 * If we checked time and data and didn't go to checkpoint, 781 * we're done. 782 */ 783 if (minutes != 0 || kbytes != 0) 784 return (0); 785 786 do_ckp: 787 if (IS_ZERO_LSN(ckp_lsn)) { 788 dblp = mgr->dbenv->lg_info; 789 LOCK_LOGREGION(dblp); 790 ckp_lsn = dblp->lp->lsn; 791 UNLOCK_LOGREGION(dblp); 792 } 793 794 /* 795 * We have to find an LSN such that all transactions begun 796 * before that LSN are complete. 797 */ 798 LOCK_TXNREGION(mgr); 799 800 if (!IS_ZERO_LSN(mgr->region->pending_ckp)) 801 ckp_lsn = mgr->region->pending_ckp; 802 else 803 for (txnp = 804 SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail); 805 txnp != NULL; 806 txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) { 807 808 /* 809 * Look through the active transactions for the 810 * lowest begin lsn. 811 */ 812 if (!IS_ZERO_LSN(txnp->begin_lsn) && 813 log_compare(&txnp->begin_lsn, &ckp_lsn) < 0) 814 ckp_lsn = txnp->begin_lsn; 815 } 816 817 mgr->region->pending_ckp = ckp_lsn; 818 UNLOCK_TXNREGION(mgr); 819 820 /* 821 * memp_sync may change the lsn you pass it, so don't pass it 822 * the actual ckp_lsn, pass it a temp instead. 823 */ 824 sync_lsn = ckp_lsn; 825 if (mgr->dbenv->mp_info != NULL && 826 (ret = memp_sync(mgr->dbenv->mp_info, &sync_lsn)) != 0) { 827 /* 828 * ret == DB_INCOMPLETE means that there are still buffers to 829 * flush, the checkpoint is not complete. Wait and try again. 830 */ 831 if (ret > 0) 832 __db_err(mgr->dbenv, 833 "txn_checkpoint: system failure in memp_sync %s\n", 834 strerror(ret)); 835 return (ret); 836 } 837 if (mgr->dbenv->lg_info != NULL) { 838 LOCK_TXNREGION(mgr); 839 last_ckp = mgr->region->last_ckp; 840 ZERO_LSN(mgr->region->pending_ckp); 841 UNLOCK_TXNREGION(mgr); 842 843 if ((ret = __txn_ckp_log(mgr->dbenv->lg_info, 844 NULL, &ckp_lsn, DB_CHECKPOINT, &ckp_lsn, &last_ckp)) != 0) { 845 __db_err(mgr->dbenv, 846 "txn_checkpoint: log failed at LSN [%ld %ld] %s\n", 847 (long)ckp_lsn.file, (long)ckp_lsn.offset, 848 strerror(ret)); 849 return (ret); 850 } 851 852 LOCK_TXNREGION(mgr); 853 mgr->region->last_ckp = ckp_lsn; 854 (void)time(&mgr->region->time_ckp); 855 UNLOCK_TXNREGION(mgr); 856 } 857 return (0); 858 } 859 860 /* 861 * __txn_validate_region -- 862 * Called at every interface to verify if the region has changed size, 863 * and if so, to remap the region in and reset the process' pointers. 864 */ 865 static int 866 __txn_validate_region(tp) 867 DB_TXNMGR *tp; 868 { 869 int ret; 870 871 if (tp->reginfo.size == tp->region->hdr.size) 872 return (0); 873 874 /* Detach/reattach the region. */ 875 if ((ret = __db_rreattach(&tp->reginfo, tp->region->hdr.size)) != 0) 876 return (ret); 877 878 /* Reset region information. */ 879 tp->region = tp->reginfo.addr; 880 tp->mem = &tp->region[1]; 881 882 return (0); 883 } 884 885 static int 886 __txn_grow_region(tp) 887 DB_TXNMGR *tp; 888 { 889 size_t incr, oldsize; 890 u_int32_t mutex_offset, oldmax; 891 u_int8_t *curaddr; 892 int ret; 893 894 oldmax = tp->region->maxtxns; 895 incr = oldmax * sizeof(DB_TXN); 896 mutex_offset = tp->mutexp != NULL ? 897 (u_int8_t *)tp->mutexp - (u_int8_t *)tp->region : 0; 898 899 oldsize = tp->reginfo.size; 900 if ((ret = __db_rgrow(&tp->reginfo, oldsize + incr)) != 0) 901 return (ret); 902 tp->region = tp->reginfo.addr; 903 904 /* Throw the new space on the free list. */ 905 curaddr = (u_int8_t *)tp->region + oldsize; 906 tp->mem = &tp->region[1]; 907 tp->mutexp = mutex_offset != 0 ? 908 (db_mutex_t *)((u_int8_t *)tp->region + mutex_offset) : NULL; 909 910 *((size_t *)curaddr) = incr - sizeof(size_t); 911 curaddr += sizeof(size_t); 912 __db_shalloc_free(tp->mem, curaddr); 913 914 tp->region->maxtxns = 2 * oldmax; 915 916 return (0); 917 } 918 919 int 920 txn_stat(mgr, statp, db_malloc) 921 DB_TXNMGR *mgr; 922 DB_TXN_STAT **statp; 923 void *(*db_malloc) __P((size_t)); 924 { 925 DB_TXN_STAT *stats; 926 TXN_DETAIL *txnp; 927 size_t nbytes; 928 u_int32_t nactive, ndx; 929 int ret; 930 931 TXN_PANIC_CHECK(mgr); 932 933 LOCK_TXNREGION(mgr); 934 nactive = mgr->region->nbegins - 935 mgr->region->naborts - mgr->region->ncommits; 936 UNLOCK_TXNREGION(mgr); 937 938 /* 939 * Allocate a bunch of extra active structures to handle any 940 * that have been created since we unlocked the region. 941 */ 942 nbytes = sizeof(DB_TXN_STAT) + sizeof(DB_TXN_ACTIVE) * (nactive + 200); 943 if ((ret = __os_malloc(nbytes, db_malloc, &stats)) != 0) 944 return (ret); 945 946 LOCK_TXNREGION(mgr); 947 stats->st_last_txnid = mgr->region->last_txnid; 948 stats->st_last_ckp = mgr->region->last_ckp; 949 stats->st_maxtxns = mgr->region->maxtxns; 950 stats->st_naborts = mgr->region->naborts; 951 stats->st_nbegins = mgr->region->nbegins; 952 stats->st_ncommits = mgr->region->ncommits; 953 stats->st_pending_ckp = mgr->region->pending_ckp; 954 stats->st_time_ckp = mgr->region->time_ckp; 955 stats->st_nactive = stats->st_nbegins - 956 stats->st_naborts - stats->st_ncommits; 957 if (stats->st_nactive > nactive + 200) 958 stats->st_nactive = nactive + 200; 959 stats->st_txnarray = (DB_TXN_ACTIVE *)&stats[1]; 960 961 ndx = 0; 962 for (txnp = SH_TAILQ_FIRST(&mgr->region->active_txn, __txn_detail); 963 txnp != NULL; 964 txnp = SH_TAILQ_NEXT(txnp, links, __txn_detail)) { 965 stats->st_txnarray[ndx].txnid = txnp->txnid; 966 stats->st_txnarray[ndx].lsn = txnp->begin_lsn; 967 ndx++; 968 969 if (ndx >= stats->st_nactive) 970 break; 971 } 972 973 stats->st_region_wait = mgr->region->hdr.lock.mutex_set_wait; 974 stats->st_region_nowait = mgr->region->hdr.lock.mutex_set_nowait; 975 stats->st_refcnt = mgr->region->hdr.refcnt; 976 stats->st_regsize = mgr->region->hdr.size; 977 978 UNLOCK_TXNREGION(mgr); 979 *statp = stats; 980 return (0); 981 } 982 983 static void 984 __txn_freekids(txnp) 985 DB_TXN *txnp; 986 { 987 DB_TXNMGR *mgr; 988 TXN_DETAIL *tp; 989 DB_TXN *kids; 990 991 mgr = txnp->mgrp; 992 993 for (kids = TAILQ_FIRST(&txnp->kids); 994 kids != NULL; 995 kids = TAILQ_FIRST(&txnp->kids)) { 996 /* Free any children of this transaction. */ 997 __txn_freekids(kids); 998 999 /* Free the transaction detail in the region. */ 1000 LOCK_TXNREGION(mgr); 1001 tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + kids->off); 1002 SH_TAILQ_REMOVE(&mgr->region->active_txn, 1003 tp, links, __txn_detail); 1004 1005 __db_shalloc_free(mgr->mem, tp); 1006 UNLOCK_TXNREGION(mgr); 1007 1008 /* Now remove from its parent. */ 1009 TAILQ_REMOVE(&txnp->kids, kids, klinks); 1010 if (F_ISSET(txnp, TXN_MALLOC)) { 1011 LOCK_TXNTHREAD(mgr); 1012 TAILQ_REMOVE(&mgr->txn_chain, kids, links); 1013 UNLOCK_TXNTHREAD(mgr); 1014 __os_free(kids, sizeof(*kids)); 1015 } 1016 } 1017 } 1018 1019 /* 1020 * __txn_is_ancestor -- 1021 * Determine if a transaction is an ancestor of another transaction. 1022 * This is used during lock promotion when we do not have the per-process 1023 * data structures that link parents together. Instead, we'll have to 1024 * follow the links in the transaction region. 1025 * 1026 * PUBLIC: int __txn_is_ancestor __P((DB_TXNMGR *, size_t, size_t)); 1027 */ 1028 int 1029 __txn_is_ancestor(mgr, hold_off, req_off) 1030 DB_TXNMGR *mgr; 1031 size_t hold_off, req_off; 1032 { 1033 TXN_DETAIL *hold_tp, *req_tp; 1034 1035 hold_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + hold_off); 1036 req_tp = (TXN_DETAIL *)((u_int8_t *)mgr->region + req_off); 1037 1038 while (req_tp->parent != 0) { 1039 req_tp = 1040 (TXN_DETAIL *)((u_int8_t *)mgr->region + req_tp->parent); 1041 if (req_tp->txnid == hold_tp->txnid) 1042 return (1); 1043 } 1044 1045 return (0); 1046 } 1047