/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/sysmacros.h>
#include <sys/modctl.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/inttypes.h>
#include <sys/vfs.h>
#include <sys/mntent.h>
#include <sys/conf.h>
#include <sys/param.h>
#include <sys/kstat.h>
#include <sys/cmn_err.h>
#include <sys/sdt.h>

#define	LUFS_GENID_PRIME	UINT64_C(4294967291)
#define	LUFS_GENID_BASE		UINT64_C(311)
#define	LUFS_NEXT_ID(id)	((uint32_t)(((id) * LUFS_GENID_BASE) % \
	LUFS_GENID_PRIME))

extern	kmutex_t	ufs_scan_lock;

static kmutex_t	log_mutex;	/* general purpose log layer lock */
kmutex_t	ml_scan;	/* Scan thread synchronization */
kcondvar_t	ml_scan_cv;	/* Scan thread synchronization */

struct kmem_cache	*lufs_sv;
struct kmem_cache	*lufs_bp;

/* Tunables */
uint_t		ldl_maxlogsize	= LDL_MAXLOGSIZE;
uint_t		ldl_minlogsize	= LDL_MINLOGSIZE;
uint_t		ldl_softlogcap	= LDL_SOFTLOGCAP;
uint32_t	ldl_divisor	= LDL_DIVISOR;
uint32_t	ldl_mintransfer	= LDL_MINTRANSFER;
uint32_t	ldl_maxtransfer	= LDL_MAXTRANSFER;
uint32_t	ldl_minbufsize	= LDL_MINBUFSIZE;
uint32_t	ldl_cgsizereq	= 0;

/* Generation of header ids */
static kmutex_t	genid_mutex;
static uint32_t	last_loghead_ident = UINT32_C(0);

/*
 * Logging delta and roll statistics
 */
struct delta_kstats {
	kstat_named_t ds_superblock_deltas;
	kstat_named_t ds_bitmap_deltas;
	kstat_named_t ds_suminfo_deltas;
	kstat_named_t ds_allocblk_deltas;
	kstat_named_t ds_ab0_deltas;
	kstat_named_t ds_dir_deltas;
	kstat_named_t ds_inode_deltas;
	kstat_named_t ds_fbiwrite_deltas;
	kstat_named_t ds_quota_deltas;
	kstat_named_t ds_shadow_deltas;

	kstat_named_t ds_superblock_rolled;
	kstat_named_t ds_bitmap_rolled;
	kstat_named_t ds_suminfo_rolled;
	kstat_named_t ds_allocblk_rolled;
	kstat_named_t ds_ab0_rolled;
	kstat_named_t ds_dir_rolled;
	kstat_named_t ds_inode_rolled;
	kstat_named_t ds_fbiwrite_rolled;
	kstat_named_t ds_quota_rolled;
	kstat_named_t ds_shadow_rolled;
} dkstats = {
	{ "superblock_deltas", KSTAT_DATA_UINT64 },
	{ "bitmap_deltas", KSTAT_DATA_UINT64 },
	{ "suminfo_deltas", KSTAT_DATA_UINT64 },
	{ "allocblk_deltas", KSTAT_DATA_UINT64 },
	{ "ab0_deltas", KSTAT_DATA_UINT64 },
	{ "dir_deltas", KSTAT_DATA_UINT64 },
	{ "inode_deltas", KSTAT_DATA_UINT64 },
	{ "fbiwrite_deltas", KSTAT_DATA_UINT64 },
	{ "quota_deltas", KSTAT_DATA_UINT64 },
	{ "shadow_deltas", KSTAT_DATA_UINT64 },

	{ "superblock_rolled", KSTAT_DATA_UINT64 },
	{ "bitmap_rolled", KSTAT_DATA_UINT64 },
	{ "suminfo_rolled", KSTAT_DATA_UINT64 },
	{ "allocblk_rolled", KSTAT_DATA_UINT64 },
	{ "ab0_rolled", KSTAT_DATA_UINT64 },
	{ "dir_rolled", KSTAT_DATA_UINT64 },
	{ "inode_rolled", KSTAT_DATA_UINT64 },
	{ "fbiwrite_rolled", KSTAT_DATA_UINT64 },
	{ "quota_rolled", KSTAT_DATA_UINT64 },
	{ "shadow_rolled", KSTAT_DATA_UINT64 }
};

uint64_t delta_stats[DT_MAX];
uint64_t roll_stats[DT_MAX];

/*
 * General logging kstats
 */
struct logstats logstats = {
	{ "master_reads", KSTAT_DATA_UINT64 },
	{ "master_writes", KSTAT_DATA_UINT64 },
	{ "log_reads_inmem", KSTAT_DATA_UINT64 },
	{ "log_reads", KSTAT_DATA_UINT64 },
	{ "log_writes", KSTAT_DATA_UINT64 },
	{ "log_master_reads", KSTAT_DATA_UINT64 },
	{ "log_roll_reads", KSTAT_DATA_UINT64 },
	{ "log_roll_writes", KSTAT_DATA_UINT64 }
};

int
trans_not_done(struct buf *cb)
{
	sema_v(&cb->b_io);
	return (0);
}

static void
trans_wait_panic(struct buf *cb)
{
	while ((cb->b_flags & B_DONE) == 0)
		drv_usecwait(10);
}

int
trans_not_wait(struct buf *cb)
{
	/*
	 * In case of panic, busy wait for completion
	 */
	if (panicstr)
		trans_wait_panic(cb);
	else
		sema_p(&cb->b_io);

	return (geterror(cb));
}

int
trans_wait(struct buf *cb)
{
	/*
	 * In case of panic, busy wait for completion and run md daemon queues
	 */
	if (panicstr)
		trans_wait_panic(cb);
	return (biowait(cb));
}

static void
setsum(int32_t *sp, int32_t *lp, int nb)
{
	int32_t csum = 0;

	*sp = 0;
	nb /= sizeof (int32_t);
	while (nb--)
		csum += *lp++;
	*sp = csum;
}

static int
checksum(int32_t *sp, int32_t *lp, int nb)
{
	int32_t ssum = *sp;

	setsum(sp, lp, nb);
	if (ssum != *sp) {
		*sp = ssum;
		return (0);
	}
	return (1);
}

void
lufs_unsnarf(ufsvfs_t *ufsvfsp)
{
	ml_unit_t *ul;
	mt_map_t *mtm;

	ul = ufsvfsp->vfs_log;
	if (ul == NULL)
		return;

	mtm = ul->un_logmap;

	/*
	 * Wait for a pending top_issue_sync which is
	 * dispatched (via taskq_dispatch()) but hasn't completed yet.
	 */

	mutex_enter(&mtm->mtm_lock);

	while (mtm->mtm_taskq_sync_count != 0) {
		cv_wait(&mtm->mtm_cv, &mtm->mtm_lock);
	}

	mutex_exit(&mtm->mtm_lock);

	/* Roll committed transactions */
	logmap_roll_dev(ul);

	/* Kill the roll thread */
	logmap_kill_roll(ul);

	/* release saved allocation info */
	if (ul->un_ebp)
		kmem_free(ul->un_ebp, ul->un_nbeb);

	/* release circular bufs */
	free_cirbuf(&ul->un_rdbuf);
	free_cirbuf(&ul->un_wrbuf);

	/* release maps */
	if (ul->un_logmap)
		ul->un_logmap = map_put(ul->un_logmap);
	if (ul->un_deltamap)
		ul->un_deltamap = map_put(ul->un_deltamap);
	if (ul->un_matamap)
		ul->un_matamap = map_put(ul->un_matamap);

	mutex_destroy(&ul->un_log_mutex);
	mutex_destroy(&ul->un_state_mutex);

	/* release state buffer MUST BE LAST!! (contains our ondisk data) */
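	/*
	 * un_bp is the anonymous buffer set up in lufs_snarf() to hold
	 * the on-disk ml_odunit state.
	 */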
	if (ul->un_bp)
		brelse(ul->un_bp);
	kmem_free(ul, sizeof (*ul));

	ufsvfsp->vfs_log = NULL;
}

int
lufs_snarf(ufsvfs_t *ufsvfsp, struct fs *fs, int ronly)
{
	buf_t		*bp, *tbp;
	ml_unit_t	*ul;
	extent_block_t	*ebp;
	ic_extent_block_t *nebp;
	size_t		nb;
	daddr_t		bno;	/* in disk blocks */
	int		i;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT(sizeof (ml_odunit_t) < DEV_BSIZE);

	/*
	 * Get the allocation table
	 * During a remount the superblock pointed to by the ufsvfsp
	 * is out of date.  Hence the need for the ``new'' superblock
	 * pointer, fs, passed in as a parameter.
	 */
	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, logbtodb(fs, fs->fs_logbno),
	    fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	ebp = (void *)bp->b_un.b_addr;
	if (!checksum(&ebp->chksum, (int32_t *)bp->b_un.b_addr,
	    fs->fs_bsize)) {
		brelse(bp);
		return (ENODEV);
	}

	/*
	 * It is possible to get log blocks with all zeros.
	 * We should also check for nextents to be zero in such case.
	 */
	if (ebp->type != LUFS_EXTENTS || ebp->nextents == 0) {
		brelse(bp);
		return (EDOM);
	}
	/*
	 * Put allocation into memory.  This requires conversion between
	 * the on-disk format of the extent (type extent_t) and the
	 * in-core format of the extent (type ic_extent_t).  The
	 * difference is that the in-core form of the extent block stores
	 * the physical offset of the extent in disk blocks, which
	 * can require more than a 32-bit field.
	 */
	nb = (size_t)(sizeof (ic_extent_block_t) +
	    ((ebp->nextents - 1) * sizeof (ic_extent_t)));
	nebp = kmem_alloc(nb, KM_SLEEP);
	nebp->ic_nextents = ebp->nextents;
	nebp->ic_nbytes = ebp->nbytes;
	nebp->ic_nextbno = ebp->nextbno;
	for (i = 0; i < ebp->nextents; i++) {
		nebp->ic_extents[i].ic_lbno = ebp->extents[i].lbno;
		nebp->ic_extents[i].ic_nbno = ebp->extents[i].nbno;
		nebp->ic_extents[i].ic_pbno =
		    logbtodb(fs, ebp->extents[i].pbno);
	}
	brelse(bp);

	/*
	 * Get the log state
	 */
	bno = nebp->ic_extents[0].ic_pbno;
	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno, DEV_BSIZE);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno + 1, DEV_BSIZE);
		if (bp->b_flags & B_ERROR) {
			brelse(bp);
			kmem_free(nebp, nb);
			return (EIO);
		}
	}

	/*
	 * Put ondisk struct into an anonymous buffer
	 * This buffer will contain the memory for the ml_odunit struct
	 */
	tbp = ngeteblk(dbtob(LS_SECTORS));
	tbp->b_edev = bp->b_edev;
	tbp->b_dev = bp->b_dev;
	tbp->b_blkno = bno;
	bcopy(bp->b_un.b_addr, tbp->b_un.b_addr, DEV_BSIZE);
	bcopy(bp->b_un.b_addr, tbp->b_un.b_addr + DEV_BSIZE, DEV_BSIZE);
	bp->b_flags |= (B_STALE | B_AGE);
	brelse(bp);
	bp = tbp;

	/*
	 * Verify the log state
	 *
	 * read/only mounts w/bad logs are allowed.  umount will
	 * eventually roll the bad log until the first IO error.
	 * fsck will then repair the file system.
	 *
	 * read/write mounts with bad logs are not allowed.
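	 *
	 * The state is rejected below if the on-disk checksum does not
	 * equal the sum of the head and tail idents, or if the version
	 * is not LUFS_VERSION_LATEST; a log marked bad (un_badlog) is
	 * rejected only for read/write mounts.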
	 *
	 */
	ul = (ml_unit_t *)kmem_zalloc(sizeof (*ul), KM_SLEEP);
	bcopy(bp->b_un.b_addr, &ul->un_ondisk, sizeof (ml_odunit_t));
	if ((ul->un_chksum != ul->un_head_ident + ul->un_tail_ident) ||
	    (ul->un_version != LUFS_VERSION_LATEST) ||
	    (!ronly && ul->un_badlog)) {
		kmem_free(ul, sizeof (*ul));
		brelse(bp);
		kmem_free(nebp, nb);
		return (EIO);
	}
	/*
	 * Initialize the incore-only fields
	 */
	if (ronly)
		ul->un_flags |= LDL_NOROLL;
	ul->un_bp = bp;
	ul->un_ufsvfs = ufsvfsp;
	ul->un_dev = ufsvfsp->vfs_dev;
	ul->un_ebp = nebp;
	ul->un_nbeb = nb;
	ul->un_maxresv = btodb(ul->un_logsize) * LDL_USABLE_BSIZE;
	ul->un_deltamap = map_get(ul, deltamaptype, DELTAMAP_NHASH);
	ul->un_logmap = map_get(ul, logmaptype, LOGMAP_NHASH);
	if (ul->un_debug & MT_MATAMAP)
		ul->un_matamap = map_get(ul, matamaptype, DELTAMAP_NHASH);
	mutex_init(&ul->un_log_mutex, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&ul->un_state_mutex, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Acquire the ufs_scan_lock before linking the mtm data
	 * structure so that we keep ufs_sync() and ufs_update() away
	 * when they execute the ufs_scan_inodes() run while we're in
	 * progress of enabling/disabling logging.
	 */
	mutex_enter(&ufs_scan_lock);
	ufsvfsp->vfs_log = ul;

	/* remember the state of the log before the log scan */
	logmap_logscan(ul);
	mutex_exit(&ufs_scan_lock);

	/*
	 * Error during scan
	 *
	 * If this is a read/only mount, ignore the error.
	 * At a later time umount/fsck will repair the fs.
	 *
	 */
	if (ul->un_flags & LDL_ERROR) {
		if (!ronly) {
			/*
			 * Acquire the ufs_scan_lock before de-linking
			 * the mtm data structure so that we keep ufs_sync()
			 * and ufs_update() away when they execute the
			 * ufs_scan_inodes() run while we're in progress of
			 * enabling/disabling logging.
			 */
			mutex_enter(&ufs_scan_lock);
			lufs_unsnarf(ufsvfsp);
			mutex_exit(&ufs_scan_lock);
			return (EIO);
		}
		ul->un_flags &= ~LDL_ERROR;
	}
	if (!ronly)
		logmap_start_roll(ul);
	return (0);
}

uint32_t
lufs_hd_genid(const ml_unit_t *up)
{
	uint32_t id;

	mutex_enter(&genid_mutex);

	/*
	 * The formula below implements an exponential, modular sequence.
	 *
	 *	ID(N) = (SEED * (BASE^N)) % PRIME
	 *
	 * The numbers will be pseudo random.  They depend on SEED, BASE and
	 * PRIME, but will sweep through almost all of the range 1....PRIME-1.
	 * Most importantly they will not repeat for PRIME-2 (4294967289)
	 * iterations.  If they were to repeat, that could possibly cause
	 * hangs, panics at mount/umount and failed mount operations.
	 */
	id = LUFS_NEXT_ID(last_loghead_ident);

	/* Check whether the new identity is already in use */
	if (up != NULL && up->un_head_ident == id) {
		DTRACE_PROBE1(head_ident_collision, uint32_t, id);

		/*
		 * The following preserves the algorithm for the fix for
		 * "panic: free: freeing free frag, dev:0x2000000018,
		 * blk:34605, cg:26, ino:148071,".
		 * If the header identity un_head_ident is equal to the
		 * present element in the sequence, the next element of the
		 * sequence is returned instead.
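		 *
		 * For example, with BASE = 311 and PRIME = 4294967291,
		 * an id of 1 is followed by 311, then 96721, then
		 * 30080231, each value being the previous one multiplied
		 * by 311 and reduced modulo 4294967291.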
		 */
		id = LUFS_NEXT_ID(id);
	}

	last_loghead_ident = id;

	mutex_exit(&genid_mutex);

	return (id);
}

static void
lufs_genid_init(void)
{
	uint64_t seed;

	/* Initialization */
	mutex_init(&genid_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* Seed the algorithm */
	do {
		timestruc_t tv;

		gethrestime(&tv);

		seed = (tv.tv_nsec << 3);
		seed ^= tv.tv_sec;

		last_loghead_ident = (uint32_t)(seed % LUFS_GENID_PRIME);
	} while (last_loghead_ident == UINT32_C(0));
}

static int
lufs_initialize(
	ufsvfs_t *ufsvfsp,
	daddr_t bno,
	size_t nb,
	struct fiolog *flp)
{
	ml_odunit_t	*ud, *ud2;
	buf_t		*bp;

	/* LINTED: warning: logical expression always true: op "||" */
	ASSERT(sizeof (ml_odunit_t) < DEV_BSIZE);
	ASSERT(nb >= ldl_minlogsize);

	bp = UFS_GETBLK(ufsvfsp, ufsvfsp->vfs_dev, bno, dbtob(LS_SECTORS));
	bzero(bp->b_un.b_addr, bp->b_bcount);

	ud = (void *)bp->b_un.b_addr;
	ud->od_version = LUFS_VERSION_LATEST;
	ud->od_maxtransfer = MIN(ufsvfsp->vfs_iotransz, ldl_maxtransfer);
	if (ud->od_maxtransfer < ldl_mintransfer)
		ud->od_maxtransfer = ldl_mintransfer;
	ud->od_devbsize = DEV_BSIZE;

	ud->od_requestsize = flp->nbytes_actual;
	ud->od_statesize = dbtob(LS_SECTORS);
	ud->od_logsize = nb - ud->od_statesize;

	ud->od_statebno = INT32_C(0);

	ud->od_head_ident = lufs_hd_genid(NULL);
	ud->od_tail_ident = ud->od_head_ident;
	ud->od_chksum = ud->od_head_ident + ud->od_tail_ident;

	ud->od_bol_lof = dbtob(ud->od_statebno) + ud->od_statesize;
	ud->od_eol_lof = ud->od_bol_lof + ud->od_logsize;
	ud->od_head_lof = ud->od_bol_lof;
	ud->od_tail_lof = ud->od_bol_lof;

	ASSERT(lufs_initialize_debug(ud));

	ud2 = (void *)(bp->b_un.b_addr + DEV_BSIZE);
	bcopy(ud, ud2, sizeof (*ud));

	UFS_BWRITE2(ufsvfsp, bp);
	if (bp->b_flags & B_ERROR) {
		brelse(bp);
		return (EIO);
	}
	brelse(bp);

	return (0);
}

/*
 * Free log space
 * Assumes the file system is write locked and is not logging
 */
static int
lufs_free(struct ufsvfs *ufsvfsp)
{
	int		error = 0, i, j;
	buf_t		*bp = NULL;
	extent_t	*ep;
	extent_block_t	*ebp;
	struct fs	*fs = ufsvfsp->vfs_fs;
	daddr_t		fno;
	int32_t		logbno;
	long		nfno;
	inode_t		*ip = NULL;
	char		clean;

	/*
	 * Nothing to free
	 */
	if (fs->fs_logbno == 0)
		return (0);

	/*
	 * Mark the file system as FSACTIVE and no log but honor the
	 * current value of fs_reclaim.  The reclaim thread could have
	 * been active when lufs_disable() was called and if fs_reclaim
	 * is reset to zero here it could lead to lost inodes.
	 */
	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
	mutex_enter(&ufsvfsp->vfs_lock);
	clean = fs->fs_clean;
	logbno = fs->fs_logbno;
	fs->fs_clean = FSACTIVE;
	fs->fs_logbno = INT32_C(0);
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
	if (ufsvfsp->vfs_bufp->b_flags & B_ERROR) {
		error = EIO;
		fs->fs_clean = clean;
		fs->fs_logbno = logbno;
		goto errout;
	}

	/*
	 * fetch the allocation block
	 *	superblock -> one block of extents -> log data
	 */
	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, logbtodb(fs, logbno),
	    fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		error = EIO;
		goto errout;
	}

	/*
	 * Free up the allocated space (dummy inode needed for free())
	 */
	ip = ufs_alloc_inode(ufsvfsp, UFSROOTINO);
	ebp = (void *)bp->b_un.b_addr;
	for (i = 0, ep = &ebp->extents[0]; i < ebp->nextents; ++i, ++ep) {
		fno = logbtofrag(fs, ep->pbno);
		nfno = dbtofsb(fs, ep->nbno);
		for (j = 0; j < nfno; j += fs->fs_frag, fno += fs->fs_frag)
			free(ip, fno, fs->fs_bsize, 0);
	}
	free(ip, logbtofrag(fs, logbno), fs->fs_bsize, 0);
	brelse(bp);
	bp = NULL;

	/*
	 * Push the metadata dirtied during the allocations
	 */
	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
	sbupdate(ufsvfsp->vfs_vfs);
	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
	bflush(ufsvfsp->vfs_dev);
	error = bfinval(ufsvfsp->vfs_dev, 0);
	if (error)
		goto errout;

	/*
	 * Free the dummy inode
	 */
	ufs_free_inode(ip);

	return (0);

errout:
	/*
	 * Free up all resources
	 */
	if (bp)
		brelse(bp);
	if (ip)
		ufs_free_inode(ip);
	return (error);
}

/*
 * Allocate log space
 * Assumes the file system is write locked and is not logging
 */
static int
lufs_alloc(struct ufsvfs *ufsvfsp, struct fiolog *flp, size_t minb, cred_t *cr)
{
	int		error = 0;
	buf_t		*bp = NULL;
	extent_t	*ep, *nep;
	extent_block_t	*ebp;
	struct fs	*fs = ufsvfsp->vfs_fs;
	daddr_t		fno;	/* in frags */
	daddr_t		bno;	/* in disk blocks */
	int32_t		logbno = INT32_C(0);	/* will be fs_logbno */
	struct inode	*ip = NULL;
	size_t		nb = flp->nbytes_actual;
	size_t		tb = 0;

	/*
	 * Mark the file system as FSACTIVE
	 */
	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_clean = FSACTIVE;
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;

	/*
	 * Allocate the allocation block (need dummy shadow inode;
	 * we use a shadow inode so the quota sub-system ignores
	 * the block allocations.)
	 *	superblock -> one block of extents -> log data
	 */
	ip = ufs_alloc_inode(ufsvfsp, UFSROOTINO);
	ip->i_mode = IFSHAD;	/* make the dummy a shadow inode */
	rw_enter(&ip->i_contents, RW_WRITER);
	fno = contigpref(ufsvfsp, nb + fs->fs_bsize, minb);
	error = alloc(ip, fno, fs->fs_bsize, &fno, cr);
	if (error)
		goto errout;
	bno = fsbtodb(fs, fno);

	bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev, bno, fs->fs_bsize);
	if (bp->b_flags & B_ERROR) {
		error = EIO;
		goto errout;
	}

	ebp = (void *)bp->b_un.b_addr;
	ebp->type = LUFS_EXTENTS;
	ebp->nextbno = UINT32_C(0);
	ebp->nextents = UINT32_C(0);
	ebp->chksum = INT32_C(0);
	if (fs->fs_magic == FS_MAGIC)
		logbno = bno;
	else
		logbno = dbtofsb(fs, bno);

	/*
	 * Initialize the first extent
	 */
	ep = &ebp->extents[0];
	error = alloc(ip, fno + fs->fs_frag, fs->fs_bsize, &fno, cr);
	if (error)
		goto errout;
	bno = fsbtodb(fs, fno);

	ep->lbno = UINT32_C(0);
	if (fs->fs_magic == FS_MAGIC)
		ep->pbno = (uint32_t)bno;
	else
		ep->pbno = (uint32_t)fno;
	ep->nbno = (uint32_t)fsbtodb(fs, fs->fs_frag);
	ebp->nextents = UINT32_C(1);
	tb = fs->fs_bsize;
	nb -= fs->fs_bsize;

	while (nb) {
		error = alloc(ip, fno + fs->fs_frag, fs->fs_bsize, &fno, cr);
		if (error) {
			if (tb < minb)
				goto errout;
			error = 0;
			break;
		}
		bno = fsbtodb(fs, fno);
		if ((daddr_t)(logbtodb(fs, ep->pbno) + ep->nbno) == bno)
			ep->nbno += (uint32_t)(fsbtodb(fs, fs->fs_frag));
		else {
			nep = ep + 1;
			if ((caddr_t)(nep + 1) >
			    (bp->b_un.b_addr + fs->fs_bsize)) {
				free(ip, fno, fs->fs_bsize, 0);
				break;
			}
			nep->lbno = ep->lbno + ep->nbno;
			if (fs->fs_magic == FS_MAGIC)
				nep->pbno = (uint32_t)bno;
			else
				nep->pbno = (uint32_t)fno;
			nep->nbno = (uint32_t)(fsbtodb(fs, fs->fs_frag));
			ebp->nextents++;
			ep = nep;
		}
		tb += fs->fs_bsize;
		nb -= fs->fs_bsize;
	}

	if (tb < minb) {	/* Failed to reach minimum log size */
		error = ENOSPC;
		goto errout;
	}

	ebp->nbytes = (uint32_t)tb;
	setsum(&ebp->chksum, (int32_t *)bp->b_un.b_addr, fs->fs_bsize);
	UFS_BWRITE2(ufsvfsp, bp);
	if (bp->b_flags & B_ERROR) {
		error = EIO;
		goto errout;
	}
	/*
	 * Initialize the first two sectors of the log
	 */
	error = lufs_initialize(ufsvfsp, logbtodb(fs, ebp->extents[0].pbno),
	    tb, flp);
	if (error)
		goto errout;

	/*
	 * We are done initializing the allocation block and the log
	 */
	brelse(bp);
	bp = NULL;

	/*
	 * Update the superblock and push the dirty metadata
	 */
	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
	sbupdate(ufsvfsp->vfs_vfs);
	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;
	bflush(ufsvfsp->vfs_dev);
	error = bfinval(ufsvfsp->vfs_dev, 1);
	if (error)
		goto errout;
	if (ufsvfsp->vfs_bufp->b_flags & B_ERROR) {
		error = EIO;
		goto errout;
	}

	/*
	 * Everything is safely on disk; update log space pointer in sb
	 */
	ufsvfsp->vfs_ulockfs.ul_sbowner = curthread;
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_logbno = (uint32_t)logbno;
	ufs_sbwrite(ufsvfsp);
	mutex_exit(&ufsvfsp->vfs_lock);
	ufsvfsp->vfs_ulockfs.ul_sbowner = (kthread_id_t)-1;

	/*
	 * Free the dummy inode
	 */
	rw_exit(&ip->i_contents);
	ufs_free_inode(ip);

	/* inform user of real log size */
	flp->nbytes_actual = tb;
	return (0);

errout:
	/*
	 * Free all resources
	 */
	if (bp)
		brelse(bp);
	if (logbno) {
		fs->fs_logbno = logbno;
		(void) lufs_free(ufsvfsp);
	}
	if (ip) {
		rw_exit(&ip->i_contents);
		ufs_free_inode(ip);
	}
	return (error);
}

/*
 * Disable logging
 */
int
lufs_disable(vnode_t *vp, struct fiolog *flp)
{
	int		error = 0;
	inode_t		*ip = VTOI(vp);
	ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
	struct fs	*fs = ufsvfsp->vfs_fs;
	struct lockfs	lf;
	struct ulockfs	*ulp;

	flp->error = FIOLOG_ENONE;

	/*
	 * Logging is already disabled; done
	 */
	if (fs->fs_logbno == 0 || ufsvfsp->vfs_log == NULL)
		return (0);

	/*
	 * Readonly file system
	 */
	if (fs->fs_ronly) {
		flp->error = FIOLOG_EROFS;
		return (0);
	}

	/*
	 * File system must be write locked to disable logging
	 */
	error = ufs_fiolfss(vp, &lf);
	if (error) {
		return (error);
	}
	if (!LOCKFS_IS_ULOCK(&lf)) {
		flp->error = FIOLOG_EULOCK;
		return (0);
	}
	lf.lf_lock = LOCKFS_WLOCK;
	lf.lf_flags = 0;
	lf.lf_comment = NULL;
	error = ufs_fiolfs(vp, &lf, 1);
	if (error) {
		flp->error = FIOLOG_EWLOCK;
		return (0);
	}

	if (ufsvfsp->vfs_log == NULL || fs->fs_logbno == 0)
		goto errout;

	/*
	 * WE ARE COMMITTED TO DISABLING LOGGING PAST THIS POINT
	 */

	/*
	 * Disable logging:
	 * Suspend the reclaim thread and force the delete thread to exit.
	 *	When a nologging mount has completed there may still be
	 *	work for reclaim to do so just suspend this thread until
	 *	it's [deadlock-] safe for it to continue.  The delete
	 *	thread won't be needed as ufs_iinactive() calls
	 *	ufs_delete() when logging is disabled.
	 * Freeze and drain reader ops.
	 *	Commit any outstanding reader transactions (ufs_flush).
	 *	Set the ``unmounted'' bit in the ufstrans struct.
	 *	If debug, remove metadata from matamap.
	 *	Disable matamap processing.
	 *	NULL the trans ops table.
	 *	Free all of the incore structs related to logging.
	 * Allow reader ops.
	 */
	ufs_thread_suspend(&ufsvfsp->vfs_reclaim);
	ufs_thread_exit(&ufsvfsp->vfs_delete);

	vfs_lock_wait(ufsvfsp->vfs_vfs);
	ulp = &ufsvfsp->vfs_ulockfs;
	mutex_enter(&ulp->ul_lock);
	atomic_add_long(&ufs_quiesce_pend, 1);
	(void) ufs_quiesce(ulp);

	(void) ufs_flush(ufsvfsp->vfs_vfs);

	TRANS_MATA_UMOUNT(ufsvfsp);
	ufsvfsp->vfs_domatamap = 0;

	/*
	 * Free all of the incore structs
	 * Acquire the ufs_scan_lock before de-linking the mtm data
	 * structure so that we keep ufs_sync() and ufs_update() away
	 * when they execute the ufs_scan_inodes() run while we're in
	 * progress of enabling/disabling logging.
	 */
	mutex_enter(&ufs_scan_lock);
	(void) lufs_unsnarf(ufsvfsp);
	mutex_exit(&ufs_scan_lock);

	atomic_add_long(&ufs_quiesce_pend, -1);
	mutex_exit(&ulp->ul_lock);
	vfs_setmntopt(ufsvfsp->vfs_vfs, MNTOPT_NOLOGGING, NULL, 0);
	vfs_unlock(ufsvfsp->vfs_vfs);

	fs->fs_rolled = FS_ALL_ROLLED;
	ufsvfsp->vfs_nolog_si = 0;

	/*
	 * Free the log space and mark the superblock as FSACTIVE
	 */
	(void) lufs_free(ufsvfsp);

	/*
	 * Allow the reclaim thread to continue.
	 */
	ufs_thread_continue(&ufsvfsp->vfs_reclaim);

	/*
	 * Unlock the file system
	 */
	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	error = ufs_fiolfs(vp, &lf, 1);
	if (error)
		flp->error = FIOLOG_ENOULOCK;

	return (0);

errout:
	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	(void) ufs_fiolfs(vp, &lf, 1);
	return (error);
}

/*
 * Enable logging
 */
int
lufs_enable(struct vnode *vp, struct fiolog *flp, cred_t *cr)
{
	int		error;
	int		reclaim;
	inode_t		*ip = VTOI(vp);
	ufsvfs_t	*ufsvfsp = ip->i_ufsvfs;
	struct fs	*fs;
	ml_unit_t	*ul;
	struct lockfs	lf;
	struct ulockfs	*ulp;
	vfs_t		*vfsp = ufsvfsp->vfs_vfs;
	uint64_t	tmp_nbytes_actual;
	uint64_t	cg_minlogsize;
	uint32_t	cgsize;
	static int	minlogsizewarn = 0;
	static int	maxlogsizewarn = 0;

	/*
	 * Check if logging is already enabled
	 */
	if (ufsvfsp->vfs_log) {
		flp->error = FIOLOG_ETRANS;
		/* for root ensure logging option is set */
		vfs_setmntopt(vfsp, MNTOPT_LOGGING, NULL, 0);
		return (0);
	}
	fs = ufsvfsp->vfs_fs;

	/*
	 * Come back here to recheck if we had to disable the log.
	 */
recheck:
	error = 0;
	reclaim = 0;
	flp->error = FIOLOG_ENONE;

	/*
	 * The size of the ufs log is determined using the following rules:
	 *
	 * 1) If no size is requested the log size is calculated as a
	 *    ratio of the total file system size.  By default this is
	 *    1MB of log per 1GB of file system.  This calculation is then
	 *    capped at the log size specified by ldl_softlogcap.
	 * 2) The log size requested may then be increased based on the
	 *    number of cylinder groups contained in the file system.
	 *    To prevent a hang the log has to be large enough to contain a
	 *    single transaction that alters every cylinder group in the
	 *    file system.  This is calculated as cg_minlogsize.
	 * 3) Finally a check is made that the log size requested is within
	 *    the limits of ldl_minlogsize and ldl_maxlogsize.
	 */

	/*
	 * Adjust requested log size
	 */
	flp->nbytes_actual = flp->nbytes_requested;
	if (flp->nbytes_actual == 0) {
		tmp_nbytes_actual =
		    (((uint64_t)fs->fs_size) / ldl_divisor) << fs->fs_fshift;
		flp->nbytes_actual = (uint_t)MIN(tmp_nbytes_actual, INT_MAX);
		/*
		 * The 1MB per 1GB log size allocation only applies up to
		 * ldl_softlogcap size of log.
		 */
		flp->nbytes_actual = MIN(flp->nbytes_actual, ldl_softlogcap);
	}

	cgsize = ldl_cgsizereq ? ldl_cgsizereq : LDL_CGSIZEREQ(fs);

	/*
	 * Determine the log size required based on the number of cylinder
	 * groups in the file system.  The log has to be at least this size
	 * to prevent possible hangs due to log space exhaustion.
	 */
	cg_minlogsize = cgsize * fs->fs_ncg;

	/*
	 * Ensure that the minimum log size isn't so small that it could lead
	 * to a full log hang.
	 */
	if (ldl_minlogsize < LDL_MINLOGSIZE) {
		ldl_minlogsize = LDL_MINLOGSIZE;
		if (!minlogsizewarn) {
			cmn_err(CE_WARN, "ldl_minlogsize too small, "
			    "increasing to 0x%x", LDL_MINLOGSIZE);
			minlogsizewarn = 1;
		}
	}

	/*
	 * Ensure that the maximum log size isn't greater than INT_MAX as the
	 * logical log offset fields would overflow.
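	 * (The log offset, or "lof", fields in ml_odunit_t are 32-bit
	 * quantities.)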
	 */
	if (ldl_maxlogsize > INT_MAX) {
		ldl_maxlogsize = INT_MAX;
		if (!maxlogsizewarn) {
			cmn_err(CE_WARN, "ldl_maxlogsize too large, "
			    "reducing to 0x%x", INT_MAX);
			maxlogsizewarn = 1;
		}
	}

	if (cg_minlogsize > ldl_maxlogsize) {
		cmn_err(CE_WARN,
		    "%s: reducing calculated log size from 0x%x to "
		    "ldl_maxlogsize (0x%x).", fs->fs_fsmnt, (int)cg_minlogsize,
		    ldl_maxlogsize);
	}

	cg_minlogsize = MAX(cg_minlogsize, ldl_minlogsize);
	cg_minlogsize = MIN(cg_minlogsize, ldl_maxlogsize);

	flp->nbytes_actual = MAX(flp->nbytes_actual, cg_minlogsize);
	flp->nbytes_actual = MAX(flp->nbytes_actual, ldl_minlogsize);
	flp->nbytes_actual = MIN(flp->nbytes_actual, ldl_maxlogsize);
	flp->nbytes_actual = blkroundup(fs, flp->nbytes_actual);

	/*
	 * logging is enabled and the log is the right size; done
	 */
	ul = ufsvfsp->vfs_log;
	if (ul && fs->fs_logbno && (flp->nbytes_actual == ul->un_requestsize))
		return (0);

	/*
	 * Readonly file system
	 */
	if (fs->fs_ronly) {
		flp->error = FIOLOG_EROFS;
		return (0);
	}

	/*
	 * File system must be write locked to enable logging
	 */
	error = ufs_fiolfss(vp, &lf);
	if (error) {
		return (error);
	}
	if (!LOCKFS_IS_ULOCK(&lf)) {
		flp->error = FIOLOG_EULOCK;
		return (0);
	}
	lf.lf_lock = LOCKFS_WLOCK;
	lf.lf_flags = 0;
	lf.lf_comment = NULL;
	error = ufs_fiolfs(vp, &lf, 1);
	if (error) {
		flp->error = FIOLOG_EWLOCK;
		return (0);
	}

	/*
	 * Grab appropriate locks to synchronize with the rest
	 * of the system
	 */
	vfs_lock_wait(vfsp);
	ulp = &ufsvfsp->vfs_ulockfs;
	mutex_enter(&ulp->ul_lock);

	/*
	 * File system must be fairly consistent to enable logging
	 */
	if (fs->fs_clean != FSLOG &&
	    fs->fs_clean != FSACTIVE &&
	    fs->fs_clean != FSSTABLE &&
	    fs->fs_clean != FSCLEAN) {
		flp->error = FIOLOG_ECLEAN;
		goto unlockout;
	}

	/*
	 * A write-locked file system is only active if there are
	 * open deleted files; so remember to set FS_RECLAIM later.
	 */
	if (fs->fs_clean == FSACTIVE)
		reclaim = FS_RECLAIM;

	/*
	 * Logging is already enabled; must be changing the log's size
	 */
	if (fs->fs_logbno && ufsvfsp->vfs_log) {
		/*
		 * Before we can disable logging, we must give up our
		 * lock.  As a consequence of unlocking and disabling the
		 * log, the fs structure may change.  Because of this, when
		 * disabling is complete, we will go back to recheck to
		 * repeat all of the checks that we performed to get to
		 * this point.  Disabling sets fs->fs_logbno to 0, so this
		 * will not put us into an infinite loop.
		 */
		mutex_exit(&ulp->ul_lock);
		vfs_unlock(vfsp);

		lf.lf_lock = LOCKFS_ULOCK;
		lf.lf_flags = 0;
		error = ufs_fiolfs(vp, &lf, 1);
		if (error) {
			flp->error = FIOLOG_ENOULOCK;
			return (0);
		}
		error = lufs_disable(vp, flp);
		if (error || (flp->error != FIOLOG_ENONE))
			return (0);
		goto recheck;
	}

	error = lufs_alloc(ufsvfsp, flp, cg_minlogsize, cr);
	if (error)
		goto errout;

	/*
	 * Create all of the incore structs
	 */
	error = lufs_snarf(ufsvfsp, fs, 0);
	if (error)
		goto errout;

	/*
	 * DON'T ``GOTO ERROUT'' PAST THIS POINT
	 */

	/*
	 * Pretend we were just mounted with logging enabled
	 *	Get the ops vector
	 *	If debug, record metadata locations with log subsystem
	 *	Start the delete thread
	 *	Start the reclaim thread, if necessary
	 */
	vfs_setmntopt(vfsp, MNTOPT_LOGGING, NULL, 0);

	TRANS_DOMATAMAP(ufsvfsp);
	TRANS_MATA_MOUNT(ufsvfsp);
	TRANS_MATA_SI(ufsvfsp, fs);
	ufs_thread_start(&ufsvfsp->vfs_delete, ufs_thread_delete, vfsp);
	if (fs->fs_reclaim & (FS_RECLAIM|FS_RECLAIMING)) {
		fs->fs_reclaim &= ~FS_RECLAIM;
		fs->fs_reclaim |= FS_RECLAIMING;
		ufs_thread_start(&ufsvfsp->vfs_reclaim,
		    ufs_thread_reclaim, vfsp);
	} else
		fs->fs_reclaim |= reclaim;

	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	/*
	 * Unlock the file system
	 */
	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	error = ufs_fiolfs(vp, &lf, 1);
	if (error) {
		flp->error = FIOLOG_ENOULOCK;
		return (0);
	}

	/*
	 * There's nothing in the log yet (we've just allocated it)
	 * so directly write out the super block.
	 * Note, we have to force this sb out to disk
	 * (not just to the log) so that if we crash we know we are logging
	 */
	mutex_enter(&ufsvfsp->vfs_lock);
	fs->fs_clean = FSLOG;
	fs->fs_rolled = FS_NEED_ROLL;	/* Mark the fs as unrolled */
	UFS_BWRITE2(NULL, ufsvfsp->vfs_bufp);
	mutex_exit(&ufsvfsp->vfs_lock);

	return (0);

errout:
	/*
	 * Acquire the ufs_scan_lock before de-linking the mtm data
	 * structure so that we keep ufs_sync() and ufs_update() away
	 * when they execute the ufs_scan_inodes() run while we're in
	 * progress of enabling/disabling logging.
	 */
	mutex_enter(&ufs_scan_lock);
	(void) lufs_unsnarf(ufsvfsp);
	mutex_exit(&ufs_scan_lock);

	(void) lufs_free(ufsvfsp);
unlockout:
	mutex_exit(&ulp->ul_lock);
	vfs_unlock(vfsp);

	lf.lf_lock = LOCKFS_ULOCK;
	lf.lf_flags = 0;
	(void) ufs_fiolfs(vp, &lf, 1);
	return (error);
}

void
lufs_read_strategy(ml_unit_t *ul, buf_t *bp)
{
	mt_map_t	*logmap	= ul->un_logmap;
	offset_t	mof	= ldbtob(bp->b_blkno);
	off_t		nb	= bp->b_bcount;
	mapentry_t	*age;
	char		*va;
	int		(*saviodone)();
	int		entire_range;

	/*
	 * get a linked list of overlapping deltas
	 * returns with &mtm->mtm_rwlock held
	 */
	entire_range = logmap_list_get(logmap, mof, nb, &age);

	/*
	 * no overlapping deltas were found; read master
	 */
	if (age == NULL) {
		rw_exit(&logmap->mtm_rwlock);
		if (ul->un_flags & LDL_ERROR) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EIO;
			biodone(bp);
		} else {
			ul->un_ufsvfs->vfs_iotstamp = lbolt;
			logstats.ls_lreads.value.ui64++;
			(void) bdev_strategy(bp);
			lwp_stat_update(LWP_STAT_INBLK, 1);
		}
		return;
	}

	va = bp_mapin_common(bp, VM_SLEEP);
	/*
	 * if necessary, sync read the data from master
	 *	errors are returned in bp
	 */
	if (!entire_range) {
		saviodone = bp->b_iodone;
		bp->b_iodone = trans_not_done;
		logstats.ls_mreads.value.ui64++;
		(void) bdev_strategy(bp);
		lwp_stat_update(LWP_STAT_INBLK, 1);
		if (trans_not_wait(bp))
			ldl_seterror(ul, "Error reading master");
		bp->b_iodone = saviodone;
	}

	/*
	 * sync read the data from the log
	 *	errors are returned inline
	 */
	if (ldl_read(ul, va, mof, nb, age)) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}

	/*
	 * unlist the deltas
	 */
	logmap_list_put(logmap, age);

	/*
	 * all done
	 */
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	biodone(bp);
}

void
lufs_write_strategy(ml_unit_t *ul, buf_t *bp)
{
	offset_t	mof	= ldbtob(bp->b_blkno);
	off_t		nb	= bp->b_bcount;
	char		*va;
	mapentry_t	*me;

	ASSERT((nb & DEV_BMASK) == 0);
	ul->un_logmap->mtm_ref = 1;

	/*
	 * if there are deltas, move into log
	 */
	me = deltamap_remove(ul->un_deltamap, mof, nb);
	if (me) {

		va = bp_mapin_common(bp, VM_SLEEP);

		ASSERT(((ul->un_debug & MT_WRITE_CHECK) == 0) ||
		    (ul->un_matamap == NULL) ||
		    matamap_within(ul->un_matamap, mof, nb));

		/*
		 * move to logmap
		 */
		if (ufs_crb_enable) {
			logmap_add_buf(ul, va, mof, me,
			    bp->b_un.b_addr, nb);
		} else {
			logmap_add(ul, va, mof, me);
		}

		if (ul->un_flags & LDL_ERROR) {
			bp->b_flags |= B_ERROR;
			bp->b_error = EIO;
		}
		biodone(bp);
		return;
	}
	if (ul->un_flags & LDL_ERROR) {
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bp);
		return;
	}

	/*
	 * Check that we are not updating metadata, or if so then via B_PHYS.
	 */
	ASSERT((ul->un_matamap == NULL) ||
	    !(matamap_overlap(ul->un_matamap, mof, nb) &&
	    ((bp->b_flags & B_PHYS) == 0)));

	ul->un_ufsvfs->vfs_iotstamp = lbolt;
	logstats.ls_lwrites.value.ui64++;

	/* If snapshots are enabled, write through the snapshot driver */
	if (ul->un_ufsvfs->vfs_snapshot)
		fssnap_strategy(&ul->un_ufsvfs->vfs_snapshot, bp);
	else
		(void) bdev_strategy(bp);

	lwp_stat_update(LWP_STAT_OUBLK, 1);
}

void
lufs_strategy(ml_unit_t *ul, buf_t *bp)
{
	if (bp->b_flags & B_READ)
		lufs_read_strategy(ul, bp);
	else
		lufs_write_strategy(ul, bp);
}

/* ARGSUSED */
static int
delta_stats_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE) {
		delta_stats[DT_SB] = dkstats.ds_superblock_deltas.value.ui64;
		delta_stats[DT_CG] = dkstats.ds_bitmap_deltas.value.ui64;
		delta_stats[DT_SI] = dkstats.ds_suminfo_deltas.value.ui64;
		delta_stats[DT_AB] = dkstats.ds_allocblk_deltas.value.ui64;
		delta_stats[DT_ABZERO] = dkstats.ds_ab0_deltas.value.ui64;
		delta_stats[DT_DIR] = dkstats.ds_dir_deltas.value.ui64;
		delta_stats[DT_INODE] = dkstats.ds_inode_deltas.value.ui64;
		delta_stats[DT_FBI] = dkstats.ds_fbiwrite_deltas.value.ui64;
		delta_stats[DT_QR] = dkstats.ds_quota_deltas.value.ui64;
		delta_stats[DT_SHAD] = dkstats.ds_shadow_deltas.value.ui64;

		roll_stats[DT_SB] = dkstats.ds_superblock_rolled.value.ui64;
		roll_stats[DT_CG] = dkstats.ds_bitmap_rolled.value.ui64;
		roll_stats[DT_SI] = dkstats.ds_suminfo_rolled.value.ui64;
		roll_stats[DT_AB] = dkstats.ds_allocblk_rolled.value.ui64;
		roll_stats[DT_ABZERO] = dkstats.ds_ab0_rolled.value.ui64;
		roll_stats[DT_DIR] = dkstats.ds_dir_rolled.value.ui64;
		roll_stats[DT_INODE] = dkstats.ds_inode_rolled.value.ui64;
		roll_stats[DT_FBI] = dkstats.ds_fbiwrite_rolled.value.ui64;
		roll_stats[DT_QR] = dkstats.ds_quota_rolled.value.ui64;
		roll_stats[DT_SHAD] = dkstats.ds_shadow_rolled.value.ui64;
	} else {
		dkstats.ds_superblock_deltas.value.ui64 = delta_stats[DT_SB];
		dkstats.ds_bitmap_deltas.value.ui64 = delta_stats[DT_CG];
		dkstats.ds_suminfo_deltas.value.ui64 = delta_stats[DT_SI];
		dkstats.ds_allocblk_deltas.value.ui64 = delta_stats[DT_AB];
		dkstats.ds_ab0_deltas.value.ui64 = delta_stats[DT_ABZERO];
		dkstats.ds_dir_deltas.value.ui64 = delta_stats[DT_DIR];
		dkstats.ds_inode_deltas.value.ui64 = delta_stats[DT_INODE];
		dkstats.ds_fbiwrite_deltas.value.ui64 = delta_stats[DT_FBI];
		dkstats.ds_quota_deltas.value.ui64 = delta_stats[DT_QR];
		dkstats.ds_shadow_deltas.value.ui64 = delta_stats[DT_SHAD];

		dkstats.ds_superblock_rolled.value.ui64 = roll_stats[DT_SB];
		dkstats.ds_bitmap_rolled.value.ui64 = roll_stats[DT_CG];
		dkstats.ds_suminfo_rolled.value.ui64 = roll_stats[DT_SI];
		dkstats.ds_allocblk_rolled.value.ui64 = roll_stats[DT_AB];
		dkstats.ds_ab0_rolled.value.ui64 = roll_stats[DT_ABZERO];
		dkstats.ds_dir_rolled.value.ui64 = roll_stats[DT_DIR];
		dkstats.ds_inode_rolled.value.ui64 = roll_stats[DT_INODE];
		dkstats.ds_fbiwrite_rolled.value.ui64 = roll_stats[DT_FBI];
		dkstats.ds_quota_rolled.value.ui64 = roll_stats[DT_QR];
		dkstats.ds_shadow_rolled.value.ui64 = roll_stats[DT_SHAD];
	}
	return (0);
}

extern size_t ufs_crb_limit;
extern int ufs_max_crb_divisor;

void
lufs_init(void)
{
	kstat_t *ksp;

	/* Create kmem caches */
	lufs_sv = kmem_cache_create("lufs_save", sizeof (lufs_save_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);
	lufs_bp = kmem_cache_create("lufs_bufs", sizeof (lufs_buf_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&log_mutex, NULL, MUTEX_DEFAULT, NULL);

	_init_top();

	if (&bio_lufs_strategy != NULL)
		bio_lufs_strategy = (void (*)(void *, buf_t *))lufs_strategy;

	/*
	 * Initialize general logging and delta kstats
	 */
	ksp = kstat_create("ufs_log", 0, "logstats", "ufs", KSTAT_TYPE_NAMED,
	    sizeof (logstats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &logstats;
		kstat_install(ksp);
	}

	ksp = kstat_create("ufs_log", 0, "deltastats", "ufs", KSTAT_TYPE_NAMED,
	    sizeof (dkstats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (ksp) {
		ksp->ks_data = (void *) &dkstats;
		ksp->ks_update = delta_stats_update;
		kstat_install(ksp);
	}

	/* Initialize generation of logging ids */
	lufs_genid_init();

	/*
	 * Set up the maximum amount of kmem that the crbs (system wide)
	 * can use.
	 */
	ufs_crb_limit = kmem_maxavail() / ufs_max_crb_divisor;
}
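/*
 * The kstats installed above are visible from userland via kstat(1M),
 * for example:
 *
 *	kstat -p ufs_log:0:logstats
 *	kstat -p ufs_log:0:deltastats
 */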