/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>
#include <sys/tnf_probe.h>

/*
 * Kernel threads for logging
 * Currently only one for rolling the log (one per log).
 */

#define	LUFS_DEFAULT_NUM_ROLL_BUFS 16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS 4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS 64

/*
 * Macros
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;
/*
 * Key for thread specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t ml_scan;
extern kcondvar_t ml_scan_cv;
extern int maxphys;

static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers()
{
	/*
	 * sanity validate the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}
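
/*
 * For example, with the defaults above, a setting of
 * lufs_num_roll_bufs = 128 (e.g. via /etc/system) would be clamped
 * here to lufs_max_roll_bufs (64), and a setting of 2 would be
 * raised to lufs_min_roll_bufs (4).
 */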

/*
 * Find something to roll; if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK, read the master
 * and overlay the deltas.
 * returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to a buffer
		 * that is already set up
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				TNF_PROBE_0(trans_roll_new_delta, "lufs",
				    /* CSTYLED */);
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On first buffer wait for the logmap user
				 * to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}
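
	/*
	 * At this point each rollbuf_t either carries a cached roll
	 * buffer (rb_crb set, deltas already assembled in memory) or
	 * an asynchronous master read issued above whose deltas still
	 * need to be overlaid from the log (see the ldl_read() below).
	 */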

flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}
			/*
			 * sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t *crb = rbp->rb_crb;
	buf_t *bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, call it */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

	do { /* for each contiguous block of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, call it */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);
		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}
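
/*
 * Illustration of the secmap traversal above: each of the 16 bits
 * marks one DEV_BSIZE (512-byte) sector of the 8K MAPBLOCK. A secmap
 * of 0x0033 (sectors 0-1 and 4-5 dirty) produces two 1K writes: one
 * at b_blkno and a second, issued through a chained buf_t, at
 * b_blkno + 4.
 */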

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{

	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}

void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 * roll thread wants the throttling semaphore and
	 * the snapshot taskq thread cannot release the semaphore
	 * because it is writing to the log and the log is full.
	 */
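
	/*
	 * Marking this thread via the TSD key below exempts its writes
	 * from that throttle; the snapshot strategy code is expected
	 * to check this key before blocking on the semaphore.
	 */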

	(void) tsd_set(bypass_snapshot_throttle_key, (void*)1);

	/*
	 * setup some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode
	 * don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy, check if logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and is not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy, check if it's getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */

rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1: trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2: goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}