/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */

#define	LUFS_DEFAULT_NUM_ROLL_BUFS	16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS	4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS	64

/*
 * Macros
 */
#define	logmap_need_roll(logmap)	((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul)		((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t	lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t	lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t	lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long		logmap_maxnme = 1536;
int		trans_roll_tics = 0;
uint64_t	trans_roll_new_delta = 0;
uint64_t	lrr_wait = 0;
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t	bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
extern int		maxphys;

static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_timedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    lbolt + trans_roll_tics);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers()
{
	/*
	 * sanity validate the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}
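
/*
 * Worked example of the clamping above (added commentary, assuming
 * MAPBLOCKSIZE is the 8K the comment refers to): with the defaults,
 * a roll pass stages 16 x 8K = 128K of master-file data, and a
 * lufs_num_roll_bufs tuned below 4 or above 64 is silently clamped
 * back into that [4, 64] range rather than rejected.
 */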

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a buffer
		 * we have already set up
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}
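
/*
 * Illustrative note on the contract above (added commentary): the only
 * caller in this file, trans_roll(), treats a return of 1 (nothing to
 * roll) as "wait trans_roll_tics and retry", a return of 2 (error) as
 * "restart the whole cycle immediately", and 0 as "go on and write out
 * *retnbuf staged buffers via log_roll_write()".
 */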

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t		*crb = rbp->rb_crb;
	buf_t		*bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, call it */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, call it */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}
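
/*
 * Worked example of the rb_secmap encoding consumed above (added
 * commentary; one bit per DEV_BSIZE sector of the 16-sector MAPBLOCK,
 * bit 0 = first sector): for secmap = 0x00f0, the first scan skips
 * four clear bits (j = 4), the second counts four set bits (k = 4),
 * so a single 4-sector write is issued at b_blkno + 4 and the loop
 * terminates with secmap == 0.  A map like 0x0303 instead issues two
 * 2-sector writes, the second through a freshly allocated buf_t
 * chained on b_list so both can complete asynchronously.
 */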

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}
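
/*
 * A note on the ordering pass in log_roll_write() above (added
 * commentary): it is a simple insertion sort into a singly linked
 * list threaded through rb_next, so issuing the writes in ascending
 * b_blkno order costs O(nbuf^2) comparisons in the worst case.  With
 * nbuf bounded by lufs_max_roll_bufs (64 by default) that is cheap,
 * and sorted issue presumably hands the device driver a mostly
 * sequential stream of writes.
 */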

void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * the roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */

	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * set up some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 * don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1:	trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2:	goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
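
/*
 * Summary of the roll cycle implemented above (added commentary):
 * each pass of the roll thread either exits (on LDL_ERROR or
 * MTM_ROLL_EXIT), sleeps for trans_roll_tics (5 * hz ticks, i.e. five
 * seconds, by default) when nothing needs rolling, or rolls one
 * batch: log_roll_read() stages up to nmblk MAPBLOCKs of master data
 * with deltas overlaid, log_roll_write() writes them back in block
 * order, and logmap_remove_roll()/logmap_sethead() then retire the
 * rolled deltas and advance the log head to reclaim log space.
 */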