/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */

#define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
extern int		maxphys;

static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_timedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    lbolt + trans_roll_tics);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers()
{
	/*
	 * sanity validate the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}
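
/*
 * For instance, the roll-buffer tunables above can be overridden at
 * boot via /etc/system, e.g.
 *
 *	set ufs:lufs_num_roll_bufs = 32
 *
 * (assuming, as in this file, that the variables live in the "ufs"
 * module).  log_roll_buffers() above clamps any such setting to the
 * range [lufs_min_roll_bufs, lufs_max_roll_bufs].
 */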

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to an already
		 * set up buffer
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}
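
/*
 * The caller (trans_roll() below) treats these return codes as:
 * 1 - back off in trans_roll_wait() and then retry the main loop;
 * 2 - retry the main loop immediately; 0 - hand the nbuf filled
 * buffers to log_roll_write().
 */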

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t *crb = rbp->rb_crb;
	buf_t *bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, write through the snapshot driver */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, write through the snapshot driver */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK.
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}
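
/*
 * Worked example of the secmap scan above: for a (hypothetical)
 * rb_secmap of 0x0076 (binary 0000000001110110, i.e. sectors 1-2 and
 * 4-6 of the MAPBLOCK dirty), the loop issues two writes: 2 sectors
 * starting at b_blkno + 1, then, via a chained buf_t, 3 sectors
 * starting at b_blkno + 4.  Only the dirty sectors are written; the
 * clean gaps in the map are skipped.
 */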

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}
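
/*
 * trans_roll() below is the body of the per-log roll thread.  It is
 * started elsewhere in lufs (logmap_start_roll()), along the lines of
 *
 *	(void) thread_create(NULL, 0, trans_roll, ul, 0, &p0,
 *	    TS_RUN, minclsyspri);
 *
 * (a sketch; see the logmap code for the authoritative call), and it
 * runs until MTM_ROLL_EXIT is set or the log enters the LDL_ERROR
 * state.
 */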

void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * set up some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 * don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1:	trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2:	goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
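
/*
 * Summary of the roll triggers checked in the main loop above, in
 * order: an explicit MTM_FORCE_ROLL request, a logmap holding more
 * than logmap_maxnme map entries, an idle but non-empty log, and an
 * on-disk log that ldl_need_roll() reports as filling up.  When none
 * applies, the thread naps in trans_roll_wait() for trans_roll_tics
 * (by default 5 seconds) and re-evaluates.
 */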