/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/cmn_err.h>
#include <sys/fssnap_if.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_log.h>
#include <sys/fs/ufs_bio.h>
#include <sys/inttypes.h>
#include <sys/callb.h>

/*
 * Kernel threads for logging.
 * Currently only one for rolling the log (one per log).
 */

#define	LUFS_DEFAULT_NUM_ROLL_BUFS	16
#define	LUFS_DEFAULT_MIN_ROLL_BUFS	4
#define	LUFS_DEFAULT_MAX_ROLL_BUFS	64

/*
 * Macros
 */
#define	logmap_need_roll(logmap) ((logmap)->mtm_nme > logmap_maxnme)
#define	ldl_empty(ul) ((ul)->un_head_lof == (ul)->un_tail_lof)

/*
 * Tunables
 */
uint32_t lufs_num_roll_bufs = LUFS_DEFAULT_NUM_ROLL_BUFS;
uint32_t lufs_min_roll_bufs = LUFS_DEFAULT_MIN_ROLL_BUFS;
uint32_t lufs_max_roll_bufs = LUFS_DEFAULT_MAX_ROLL_BUFS;
long logmap_maxnme = 1536;
int trans_roll_tics = 0;
uint64_t trans_roll_new_delta = 0;
uint64_t lrr_wait = 0;

/*
 * Key for thread-specific data for the roll thread to
 * bypass snapshot throttling
 */
uint_t bypass_snapshot_throttle_key;

/*
 * externs
 */
extern kmutex_t		ml_scan;
extern kcondvar_t	ml_scan_cv;
extern int		maxphys;

/*
 * Wait for something to roll: drop the MTM_FORCE_ROLL and MTM_ROLLING
 * flags, wake any force-roll waiters, and nap for up to trans_roll_tics
 * ticks (CPR-safe).
 */
static void
trans_roll_wait(mt_map_t *logmap, callb_cpr_t *cprinfop)
{
	mutex_enter(&logmap->mtm_mutex);
	logmap->mtm_ref = 0;
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		cv_broadcast(&logmap->mtm_from_roll_cv);
	}
	logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLLING);
	CALLB_CPR_SAFE_BEGIN(cprinfop);
	(void) cv_reltimedwait(&logmap->mtm_to_roll_cv, &logmap->mtm_mutex,
	    trans_roll_tics, TR_CLOCK_TICK);
	CALLB_CPR_SAFE_END(cprinfop, &logmap->mtm_mutex);
	logmap->mtm_flags |= MTM_ROLLING;
	mutex_exit(&logmap->mtm_mutex);
}

/*
 * Returns the number of 8K buffers to use for rolling the log
 */
static uint32_t
log_roll_buffers(void)
{
	/*
	 * Sanity-check the tunable lufs_num_roll_bufs
	 */
	if (lufs_num_roll_bufs < lufs_min_roll_bufs) {
		return (lufs_min_roll_bufs);
	}
	if (lufs_num_roll_bufs > lufs_max_roll_bufs) {
		return (lufs_max_roll_bufs);
	}
	return (lufs_num_roll_bufs);
}
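
/*
 * Example (illustrative, not part of the original source): the tunables
 * above are plain kernel variables, so they can be set at boot time from
 * /etc/system; out-of-range values are clamped by log_roll_buffers(),
 * e.g.:
 *
 *	set lufs_num_roll_bufs = 32	-> used as-is (within bounds)
 *	set lufs_num_roll_bufs = 128	-> clamped to lufs_max_roll_bufs (64)
 */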

/*
 * Find something to roll, then if we don't have cached roll buffers
 * covering all the deltas in that MAPBLOCK then read the master
 * and overlay the deltas.
 * Returns:
 *	0 if successful
 *	1 on finding nothing to roll
 *	2 on error
 */
int
log_roll_read(ml_unit_t *ul, rollbuf_t *rbs, int nmblk, caddr_t roll_bufs,
    int *retnbuf)
{
	offset_t	mof;
	buf_t		*bp;
	rollbuf_t	*rbp;
	mt_map_t	*logmap = ul->un_logmap;
	daddr_t		mblkno;
	int		i;
	int		error;
	int		nbuf;

	/*
	 * Make sure there is really something to roll
	 */
	mof = 0;
	if (!logmap_next_roll(logmap, &mof)) {
		return (1);
	}

	/*
	 * Build some master blocks + deltas to roll forward
	 */
	rw_enter(&logmap->mtm_rwlock, RW_READER);
	nbuf = 0;
	do {
		mof = mof & (offset_t)MAPBLOCKMASK;
		mblkno = lbtodb(mof);

		/*
		 * Check for the case of a new delta to an already
		 * set-up buffer
		 */
		for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
			if (P2ALIGN(rbp->rb_bh.b_blkno,
			    MAPBLOCKSIZE / DEV_BSIZE) == mblkno) {
				trans_roll_new_delta++;
				/* Flush out the current set of buffers */
				goto flush_bufs;
			}
		}

		/*
		 * Work out what to roll next. If it isn't cached then read
		 * it asynchronously from the master.
		 */
		bp = &rbp->rb_bh;
		bp->b_blkno = mblkno;
		bp->b_flags = B_READ;
		bp->b_un.b_addr = roll_bufs + (nbuf << MAPBLOCKSHIFT);
		bp->b_bufsize = MAPBLOCKSIZE;
		if (top_read_roll(rbp, ul)) {
			/* logmap deltas were in use */
			if (nbuf == 0) {
				/*
				 * On the first buffer, wait for the logmap
				 * user to finish by grabbing the logmap lock
				 * exclusively rather than spinning
				 */
				rw_exit(&logmap->mtm_rwlock);
				lrr_wait++;
				rw_enter(&logmap->mtm_rwlock, RW_WRITER);
				rw_exit(&logmap->mtm_rwlock);
				return (1);
			}
			/* we have at least one buffer - flush it */
			goto flush_bufs;
		}
		if ((bp->b_flags & B_INVAL) == 0) {
			nbuf++;
		}
		mof += MAPBLOCKSIZE;
	} while ((nbuf < nmblk) && logmap_next_roll(logmap, &mof));

	/*
	 * If there was nothing to roll, cycle back
	 */
	if (nbuf == 0) {
		rw_exit(&logmap->mtm_rwlock);
		return (1);
	}
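
	/*
	 * Note: every path into flush_bufs below arrives with mtm_rwlock
	 * still held as RW_READER; it is released once the deltas have
	 * been overlaid.
	 */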
flush_bufs:
	/*
	 * For each buffer, if it isn't cached then wait for the read to
	 * finish and overlay the deltas.
	 */
	for (error = 0, i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		if (!rbp->rb_crb) {
			bp = &rbp->rb_bh;
			if (trans_not_wait(bp)) {
				ldl_seterror(ul,
				    "Error reading master during ufs log roll");
				error = 1;
			}

			/*
			 * Sync read the data from the log
			 */
			if (ldl_read(ul, bp->b_un.b_addr,
			    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK,
			    MAPBLOCKSIZE, rbp->rb_age)) {
				error = 1;
			}
		}

		/*
		 * Reset the age bit in the age list
		 */
		logmap_list_put_roll(logmap, rbp->rb_age);

		if (ul->un_flags & LDL_ERROR) {
			error = 1;
		}
	}
	rw_exit(&logmap->mtm_rwlock);
	if (error)
		return (2);
	*retnbuf = nbuf;
	return (0);
}

/*
 * Write out a cached roll buffer
 */
void
log_roll_write_crb(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	crb_t	*crb = rbp->rb_crb;
	buf_t	*bp = &rbp->rb_bh;

	bp->b_blkno = lbtodb(crb->c_mof);
	bp->b_un.b_addr = crb->c_buf;
	bp->b_bcount = crb->c_nb;
	bp->b_bufsize = crb->c_nb;
	ASSERT((crb->c_nb & DEV_BMASK) == 0);
	bp->b_flags = B_WRITE;
	logstats.ls_rwrites.value.ui64++;

	/* if snapshots are enabled, route the write through fssnap */
	if (ufsvfsp->vfs_snapshot) {
		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
	} else {
		(void) bdev_strategy(bp);
	}
}

/*
 * Write out a set of non-cached roll buffers
 */
void
log_roll_write_bufs(ufsvfs_t *ufsvfsp, rollbuf_t *rbp)
{
	buf_t		*bp = &rbp->rb_bh;
	buf_t		*bp2;
	rbsecmap_t	secmap = rbp->rb_secmap;
	int		j, k;

	ASSERT(secmap);
	ASSERT((bp->b_flags & B_INVAL) == 0);

	do { /* for each contiguous run of sectors */
		/* find start of next sector to write */
		for (j = 0; j < 16; ++j) {
			if (secmap & UINT16_C(1))
				break;
			secmap >>= 1;
		}
		bp->b_un.b_addr += (j << DEV_BSHIFT);
		bp->b_blkno += j;

		/* calculate number of sectors */
		secmap >>= 1;
		j++;
		for (k = 1; j < 16; ++j) {
			if ((secmap & UINT16_C(1)) == 0)
				break;
			secmap >>= 1;
			k++;
		}
		bp->b_bcount = k << DEV_BSHIFT;
		bp->b_flags = B_WRITE;
		logstats.ls_rwrites.value.ui64++;

		/* if snapshots are enabled, route the write through fssnap */
		if (ufsvfsp->vfs_snapshot)
			fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
		else
			(void) bdev_strategy(bp);

		if (secmap) {
			/*
			 * Allocate another buf_t to handle
			 * the next write in this MAPBLOCK.
			 * Chain them via b_list.
			 */
			bp2 = kmem_alloc(sizeof (buf_t), KM_SLEEP);
			bp->b_list = bp2;
			bioinit(bp2);
			bp2->b_iodone = trans_not_done;
			bp2->b_bufsize = MAPBLOCKSIZE;
			bp2->b_edev = bp->b_edev;
			bp2->b_un.b_addr =
			    bp->b_un.b_addr + bp->b_bcount;
			bp2->b_blkno = bp->b_blkno + k;
			bp = bp2;
		}
	} while (secmap);
}
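
/*
 * Worked example for the secmap scan above (illustrative): with
 * secmap = 0x00f8 (sectors 3..7 of the MAPBLOCK dirty), the first
 * scan stops at j = 3, so b_un.b_addr and b_blkno advance by three
 * sectors; the second scan counts the run of set bits, giving k = 5,
 * so a single write of b_bcount = 5 << DEV_BSHIFT bytes is issued,
 * secmap is exhausted, and the loop ends without chaining a second
 * buf_t.
 */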

/*
 * Asynchronously roll the deltas, using the sector map
 * in each rollbuf_t.
 */
int
log_roll_write(ml_unit_t *ul, rollbuf_t *rbs, int nbuf)
{
	ufsvfs_t	*ufsvfsp = ul->un_ufsvfs;
	rollbuf_t	*rbp;
	buf_t		*bp, *bp2;
	rollbuf_t	*head, *prev, *rbp2;

	/*
	 * Order the buffers by blkno: a simple insertion sort into a
	 * singly linked list (nbuf is at most lufs_max_roll_bufs, so
	 * the quadratic cost is negligible).
	 */
	ASSERT(nbuf > 0);
#ifdef lint
	prev = rbs;
#endif
	for (head = rbs, rbp = rbs + 1; rbp < rbs + nbuf; rbp++) {
		for (rbp2 = head; rbp2; prev = rbp2, rbp2 = rbp2->rb_next) {
			if (rbp->rb_bh.b_blkno < rbp2->rb_bh.b_blkno) {
				if (rbp2 == head) {
					rbp->rb_next = head;
					head = rbp;
				} else {
					prev->rb_next = rbp;
					rbp->rb_next = rbp2;
				}
				break;
			}
		}
		if (rbp2 == NULL) {
			prev->rb_next = rbp;
			rbp->rb_next = NULL;
		}
	}

	/*
	 * Issue the in-order writes
	 */
	for (rbp = head; rbp; rbp = rbp2) {
		if (rbp->rb_crb) {
			log_roll_write_crb(ufsvfsp, rbp);
		} else {
			log_roll_write_bufs(ufsvfsp, rbp);
		}
		/* null out the rb_next link for the next set of rolling */
		rbp2 = rbp->rb_next;
		rbp->rb_next = NULL;
	}

	/*
	 * Wait for all the writes to finish
	 */
	for (rbp = rbs; rbp < rbs + nbuf; rbp++) {
		bp = &rbp->rb_bh;
		if (trans_not_wait(bp)) {
			ldl_seterror(ul,
			    "Error writing master during ufs log roll");
		}

		/*
		 * Now wait for all the "cloned" buffer writes (if any)
		 * and free those headers
		 */
		bp2 = bp->b_list;
		bp->b_list = NULL;
		while (bp2) {
			if (trans_not_wait(bp2)) {
				ldl_seterror(ul,
				    "Error writing master during ufs log roll");
			}
			bp = bp2;
			bp2 = bp2->b_list;
			kmem_free(bp, sizeof (buf_t));
		}
	}

	if (ul->un_flags & LDL_ERROR)
		return (1);
	return (0);
}
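
/*
 * trans_roll() is the main loop of the log-roll kernel thread (one per
 * logging file system).  On each pass it exits if asked to, rolls
 * deltas when a force roll is requested, when the logmap or the log
 * is getting full, or when the log is idle but not empty; otherwise
 * it naps in trans_roll_wait() and tries again.
 */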
void
trans_roll(ml_unit_t *ul)
{
	callb_cpr_t	cprinfo;
	mt_map_t	*logmap = ul->un_logmap;
	rollbuf_t	*rbs;
	rollbuf_t	*rbp;
	buf_t		*bp;
	caddr_t		roll_bufs;
	uint32_t	nmblk;
	int		i;
	int		doingforceroll;
	int		nbuf;

	CALLB_CPR_INIT(&cprinfo, &logmap->mtm_mutex, callb_generic_cpr,
	    "trans_roll");

	/*
	 * We do not want the roll thread's writes to be
	 * throttled by the snapshot.
	 * If they are throttled then we can have a deadlock
	 * between the roll thread and the snapshot taskq thread:
	 *	the roll thread wants the throttling semaphore and
	 *	the snapshot taskq thread cannot release the semaphore
	 *	because it is writing to the log and the log is full.
	 */
	(void) tsd_set(bypass_snapshot_throttle_key, (void *)1);

	/*
	 * Set up some roll parameters
	 */
	if (trans_roll_tics == 0)
		trans_roll_tics = 5 * hz;
	nmblk = log_roll_buffers();

	/*
	 * Allocate the buffers and buffer headers
	 */
	roll_bufs = kmem_alloc(nmblk * MAPBLOCKSIZE, KM_SLEEP);
	rbs = kmem_alloc(nmblk * sizeof (rollbuf_t), KM_SLEEP);

	/*
	 * Initialize the buffer headers
	 */
	for (i = 0, rbp = rbs; i < nmblk; ++i, ++rbp) {
		rbp->rb_next = NULL;
		bp = &rbp->rb_bh;
		bioinit(bp);
		bp->b_edev = ul->un_dev;
		bp->b_iodone = trans_not_done;
		bp->b_bufsize = MAPBLOCKSIZE;
	}

	doingforceroll = 0;

again:
	/*
	 * LOOP FOREVER
	 */

	/*
	 * Exit on demand
	 */
	mutex_enter(&logmap->mtm_mutex);
	if ((ul->un_flags & LDL_ERROR) || (logmap->mtm_flags & MTM_ROLL_EXIT)) {
		kmem_free(rbs, nmblk * sizeof (rollbuf_t));
		kmem_free(roll_bufs, nmblk * MAPBLOCKSIZE);
		logmap->mtm_flags &= ~(MTM_FORCE_ROLL | MTM_ROLL_RUNNING |
		    MTM_ROLL_EXIT | MTM_ROLLING);
		cv_broadcast(&logmap->mtm_from_roll_cv);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
		/* NOTREACHED */
	}

	/*
	 * MT_SCAN debug mode:
	 * don't roll except in FORCEROLL situations
	 */
	if (logmap->mtm_debug & MT_SCAN)
		if ((logmap->mtm_flags & MTM_FORCE_ROLL) == 0) {
			mutex_exit(&logmap->mtm_mutex);
			trans_roll_wait(logmap, &cprinfo);
			goto again;
		}
	ASSERT(logmap->mtm_trimlof == 0);

	/*
	 * If we've finished a force-roll cycle then wake up any
	 * waiters.
	 */
	if (doingforceroll) {
		doingforceroll = 0;
		logmap->mtm_flags &= ~MTM_FORCE_ROLL;
		mutex_exit(&logmap->mtm_mutex);
		cv_broadcast(&logmap->mtm_from_roll_cv);
	} else {
		mutex_exit(&logmap->mtm_mutex);
	}

	/*
	 * If someone wants us to roll something, then do it
	 */
	if (logmap->mtm_flags & MTM_FORCE_ROLL) {
		doingforceroll = 1;
		goto rollsomething;
	}

	/*
	 * Log is busy; check if the logmap is getting full.
	 */
	if (logmap_need_roll(logmap)) {
		goto rollsomething;
	}

	/*
	 * Check if the log is idle and not empty
	 */
	if (!logmap->mtm_ref && !ldl_empty(ul)) {
		goto rollsomething;
	}

	/*
	 * Log is busy; check if the log itself is getting full
	 */
	if (ldl_need_roll(ul)) {
		goto rollsomething;
	}

	/*
	 * Nothing to do; wait a bit and then start over
	 */
	trans_roll_wait(logmap, &cprinfo);
	goto again;

	/*
	 * ROLL SOMETHING
	 */
rollsomething:
	/*
	 * Use the cached roll buffers, or read the master
	 * and overlay the deltas
	 */
	switch (log_roll_read(ul, rbs, nmblk, roll_bufs, &nbuf)) {
	case 1:	trans_roll_wait(logmap, &cprinfo);
		/* FALLTHROUGH */
	case 2:	goto again;
	/* default case is success */
	}

	/*
	 * Asynchronously write out the deltas
	 */
	if (log_roll_write(ul, rbs, nbuf))
		goto again;

	/*
	 * Free up the deltas in the logmap
	 */
	for (i = 0, rbp = rbs; i < nbuf; ++i, ++rbp) {
		bp = &rbp->rb_bh;
		logmap_remove_roll(logmap,
		    ldbtob(bp->b_blkno) & (offset_t)MAPBLOCKMASK, MAPBLOCKSIZE);
	}

	/*
	 * Free up log space, if possible
	 */
	logmap_sethead(logmap, ul);

	/*
	 * LOOP
	 */
	goto again;
}
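
/*
 * Note (assumption, for context; not part of the original file): this
 * thread is not created here.  The logmap code is expected to start it,
 * e.g. via thread_create() with trans_roll as the entry point (see
 * logmap_start_roll() in lufs_map.c), and to request termination by
 * setting MTM_ROLL_EXIT in mtm_flags, which the exit-on-demand check
 * at the top of the loop honors.
 */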