1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/spa.h> 28 #include <sys/dmu.h> 29 #include <sys/zap.h> 30 #include <sys/arc.h> 31 #include <sys/stat.h> 32 #include <sys/resource.h> 33 #include <sys/zil.h> 34 #include <sys/zil_impl.h> 35 #include <sys/dsl_dataset.h> 36 #include <sys/vdev.h> 37 #include <sys/dmu_tx.h> 38 39 /* 40 * The zfs intent log (ZIL) saves transaction records of system calls 41 * that change the file system in memory with enough information 42 * to be able to replay them. These are stored in memory until 43 * either the DMU transaction group (txg) commits them to the stable pool 44 * and they can be discarded, or they are flushed to the stable log 45 * (also in the pool) due to an fsync, O_DSYNC or other synchronous 46 * requirement. In the event of a panic or power failure, those log 47 * records (transactions) are replayed. 48 * 49 * There is one ZIL per file system. Its on-disk (pool) format consists 50 * of 3 parts: 51 * 52 * - ZIL header 53 * - ZIL blocks 54 * - ZIL records 55 * 56 * A log record holds a system call transaction. Log blocks can 57 * hold many log records and the blocks are chained together. 58 * Each ZIL block contains a block pointer (blkptr_t) to the next 59 * ZIL block in the chain. The ZIL header points to the first 60 * block in the chain. Note there is not a fixed place in the pool 61 * to hold blocks. They are dynamically allocated and freed as 62 * needed from the blocks available. Figure X shows the ZIL structure: 63 */ 64 65 /* 66 * This global ZIL switch affects all pools 67 */ 68 int zil_disable = 0; /* disable intent logging */ 69 70 /* 71 * Tunable parameter for debugging or performance analysis. Setting 72 * zfs_nocacheflush will cause corruption on power loss if a volatile 73 * out-of-order write cache is enabled. 
74 */ 75 boolean_t zfs_nocacheflush = B_FALSE; 76 77 static kmem_cache_t *zil_lwb_cache; 78 79 static boolean_t zil_empty(zilog_t *zilog); 80 81 #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ 82 sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) 83 84 85 static int 86 zil_bp_compare(const void *x1, const void *x2) 87 { 88 const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; 89 const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; 90 91 if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) 92 return (-1); 93 if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) 94 return (1); 95 96 if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) 97 return (-1); 98 if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) 99 return (1); 100 101 return (0); 102 } 103 104 static void 105 zil_bp_tree_init(zilog_t *zilog) 106 { 107 avl_create(&zilog->zl_bp_tree, zil_bp_compare, 108 sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); 109 } 110 111 static void 112 zil_bp_tree_fini(zilog_t *zilog) 113 { 114 avl_tree_t *t = &zilog->zl_bp_tree; 115 zil_bp_node_t *zn; 116 void *cookie = NULL; 117 118 while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) 119 kmem_free(zn, sizeof (zil_bp_node_t)); 120 121 avl_destroy(t); 122 } 123 124 int 125 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) 126 { 127 avl_tree_t *t = &zilog->zl_bp_tree; 128 const dva_t *dva = BP_IDENTITY(bp); 129 zil_bp_node_t *zn; 130 avl_index_t where; 131 132 if (avl_find(t, dva, &where) != NULL) 133 return (EEXIST); 134 135 zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); 136 zn->zn_dva = *dva; 137 avl_insert(t, zn, where); 138 139 return (0); 140 } 141 142 static zil_header_t * 143 zil_header_in_syncing_context(zilog_t *zilog) 144 { 145 return ((zil_header_t *)zilog->zl_header); 146 } 147 148 static void 149 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) 150 { 151 zio_cksum_t *zc = &bp->blk_cksum; 152 153 zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); 154 zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); 155 zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); 156 zc->zc_word[ZIL_ZC_SEQ] = 1ULL; 157 } 158 159 /* 160 * Read a log block and make sure it's valid. 161 */ 162 static int 163 zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, 164 char **end) 165 { 166 enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 167 uint32_t aflags = ARC_WAIT; 168 arc_buf_t *abuf = NULL; 169 zbookmark_t zb; 170 int error; 171 172 if (zilog->zl_header->zh_claim_txg == 0) 173 zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 174 175 if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 176 zio_flags |= ZIO_FLAG_SPECULATIVE; 177 178 SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], 179 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 180 181 error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 182 ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 183 184 if (error == 0) { 185 zio_cksum_t cksum = bp->blk_cksum; 186 187 /* 188 * Validate the checksummed log block. 189 * 190 * Sequence numbers should be... sequential. The checksum 191 * verifier for the next block should be bp's checksum plus 1. 192 * 193 * Also check the log chain linkage and size used. 
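 * That is, the zil_chain_t embedded in this block carries the blkptr of
 * the next block, and that blkptr's checksum words must equal this
 * block's checksum with the ZIL_ZC_SEQ word incremented by one; any
 * mismatch (or a hole) is treated as the end of the chain.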
194 */ 195 cksum.zc_word[ZIL_ZC_SEQ]++; 196 197 if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 198 zil_chain_t *zilc = abuf->b_data; 199 char *lr = (char *)(zilc + 1); 200 uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); 201 202 if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 203 sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { 204 error = ECKSUM; 205 } else { 206 bcopy(lr, dst, len); 207 *end = (char *)dst + len; 208 *nbp = zilc->zc_next_blk; 209 } 210 } else { 211 char *lr = abuf->b_data; 212 uint64_t size = BP_GET_LSIZE(bp); 213 zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; 214 215 if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 216 sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || 217 (zilc->zc_nused > (size - sizeof (*zilc)))) { 218 error = ECKSUM; 219 } else { 220 bcopy(lr, dst, zilc->zc_nused); 221 *end = (char *)dst + zilc->zc_nused; 222 *nbp = zilc->zc_next_blk; 223 } 224 } 225 226 VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 227 } 228 229 return (error); 230 } 231 232 /* 233 * Read a TX_WRITE log data block. 234 */ 235 static int 236 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) 237 { 238 enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 239 const blkptr_t *bp = &lr->lr_blkptr; 240 uint32_t aflags = ARC_WAIT; 241 arc_buf_t *abuf = NULL; 242 zbookmark_t zb; 243 int error; 244 245 if (BP_IS_HOLE(bp)) { 246 if (wbuf != NULL) 247 bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); 248 return (0); 249 } 250 251 if (zilog->zl_header->zh_claim_txg == 0) 252 zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 253 254 SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, 255 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 256 257 error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 258 ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 259 260 if (error == 0) { 261 if (wbuf != NULL) 262 bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); 263 (void) arc_buf_remove_ref(abuf, &abuf); 264 } 265 266 return (error); 267 } 268 269 /* 270 * Parse the intent log, and call parse_func for each valid record within. 271 */ 272 int 273 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, 274 zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) 275 { 276 const zil_header_t *zh = zilog->zl_header; 277 boolean_t claimed = !!zh->zh_claim_txg; 278 uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; 279 uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; 280 uint64_t max_blk_seq = 0; 281 uint64_t max_lr_seq = 0; 282 uint64_t blk_count = 0; 283 uint64_t lr_count = 0; 284 blkptr_t blk, next_blk; 285 char *lrbuf, *lrp; 286 int error = 0; 287 288 /* 289 * Old logs didn't record the maximum zh_claim_lr_seq. 290 */ 291 if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 292 claim_lr_seq = UINT64_MAX; 293 294 /* 295 * Starting at the block pointed to by zh_log we read the log chain. 296 * For each block in the chain we strongly check that block to 297 * ensure its validity. We stop when an invalid block is found. 298 * For each block pointer in the chain we call parse_blk_func(). 299 * For each record in each valid block we call parse_lr_func(). 300 * If the log has been claimed, stop if we encounter a sequence 301 * number greater than the highest claimed sequence number. 
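 * Both limits are enforced below: each block's ZIL_ZC_SEQ checksum word
 * is compared against the claimed block sequence before parse_blk_func()
 * runs, and each record's lrc_seq is compared against the claimed record
 * sequence before parse_lr_func() runs, so we never walk past the claim.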
302 */ 303 lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); 304 zil_bp_tree_init(zilog); 305 306 for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { 307 uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; 308 int reclen; 309 char *end; 310 311 if (blk_seq > claim_blk_seq) 312 break; 313 if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) 314 break; 315 ASSERT3U(max_blk_seq, <, blk_seq); 316 max_blk_seq = blk_seq; 317 blk_count++; 318 319 if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) 320 break; 321 322 error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); 323 if (error) 324 break; 325 326 for (lrp = lrbuf; lrp < end; lrp += reclen) { 327 lr_t *lr = (lr_t *)lrp; 328 reclen = lr->lrc_reclen; 329 ASSERT3U(reclen, >=, sizeof (lr_t)); 330 if (lr->lrc_seq > claim_lr_seq) 331 goto done; 332 if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) 333 goto done; 334 ASSERT3U(max_lr_seq, <, lr->lrc_seq); 335 max_lr_seq = lr->lrc_seq; 336 lr_count++; 337 } 338 } 339 done: 340 zilog->zl_parse_error = error; 341 zilog->zl_parse_blk_seq = max_blk_seq; 342 zilog->zl_parse_lr_seq = max_lr_seq; 343 zilog->zl_parse_blk_count = blk_count; 344 zilog->zl_parse_lr_count = lr_count; 345 346 ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || 347 (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); 348 349 zil_bp_tree_fini(zilog); 350 zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); 351 352 return (error); 353 } 354 355 static int 356 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) 357 { 358 /* 359 * Claim log block if not already committed and not already claimed. 360 * If tx == NULL, just verify that the block is claimable. 361 */ 362 if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) 363 return (0); 364 365 return (zio_wait(zio_claim(NULL, zilog->zl_spa, 366 tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, 367 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); 368 } 369 370 static int 371 zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) 372 { 373 lr_write_t *lr = (lr_write_t *)lrc; 374 int error; 375 376 if (lrc->lrc_txtype != TX_WRITE) 377 return (0); 378 379 /* 380 * If the block is not readable, don't claim it. This can happen 381 * in normal operation when a log block is written to disk before 382 * some of the dmu_sync() blocks it points to. In this case, the 383 * transaction cannot have been committed to anyone (we would have 384 * waited for all writes to be stable first), so it is semantically 385 * correct to declare this the end of the log. 386 */ 387 if (lr->lr_blkptr.blk_birth >= first_txg && 388 (error = zil_read_log_data(zilog, lr, NULL)) != 0) 389 return (error); 390 return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); 391 } 392 393 /* ARGSUSED */ 394 static int 395 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) 396 { 397 zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 398 399 return (0); 400 } 401 402 static int 403 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) 404 { 405 lr_write_t *lr = (lr_write_t *)lrc; 406 blkptr_t *bp = &lr->lr_blkptr; 407 408 /* 409 * If we previously claimed it, we need to free it. 
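 * A TX_WRITE block is only ours to free if it was born at or after the
 * claim txg, and zil_bp_tree_add() ensures each block is freed at most once.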
410 */ 411 if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && 412 bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0) 413 zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 414 415 return (0); 416 } 417 418 static lwb_t * 419 zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) 420 { 421 lwb_t *lwb; 422 423 lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); 424 lwb->lwb_zilog = zilog; 425 lwb->lwb_blk = *bp; 426 lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); 427 lwb->lwb_max_txg = txg; 428 lwb->lwb_zio = NULL; 429 lwb->lwb_tx = NULL; 430 if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 431 lwb->lwb_nused = sizeof (zil_chain_t); 432 lwb->lwb_sz = BP_GET_LSIZE(bp); 433 } else { 434 lwb->lwb_nused = 0; 435 lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); 436 } 437 438 mutex_enter(&zilog->zl_lock); 439 list_insert_tail(&zilog->zl_lwb_list, lwb); 440 mutex_exit(&zilog->zl_lock); 441 442 return (lwb); 443 } 444 445 /* 446 * Create an on-disk intent log. 447 */ 448 static lwb_t * 449 zil_create(zilog_t *zilog) 450 { 451 const zil_header_t *zh = zilog->zl_header; 452 lwb_t *lwb = NULL; 453 uint64_t txg = 0; 454 dmu_tx_t *tx = NULL; 455 blkptr_t blk; 456 int error = 0; 457 458 /* 459 * Wait for any previous destroy to complete. 460 */ 461 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 462 463 ASSERT(zh->zh_claim_txg == 0); 464 ASSERT(zh->zh_replay_seq == 0); 465 466 blk = zh->zh_log; 467 468 /* 469 * Allocate an initial log block if: 470 * - there isn't one already 471 * - the existing block is the wrong endianness 472 */ 473 if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { 474 tx = dmu_tx_create(zilog->zl_os); 475 VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 476 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 477 txg = dmu_tx_get_txg(tx); 478 479 if (!BP_IS_HOLE(&blk)) { 480 zio_free_zil(zilog->zl_spa, txg, &blk); 481 BP_ZERO(&blk); 482 } 483 484 error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, 485 ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); 486 487 if (error == 0) 488 zil_init_log_chain(zilog, &blk); 489 } 490 491 /* 492 * Allocate a log write buffer (lwb) for the first log block. 493 */ 494 if (error == 0) 495 lwb = zil_alloc_lwb(zilog, &blk, txg); 496 497 /* 498 * If we just allocated the first log block, commit our transaction 499 * and wait for zil_sync() to stuff the block pointer into zh_log. 500 * (zh is part of the MOS, so we cannot modify it in open context.) 501 */ 502 if (tx != NULL) { 503 dmu_tx_commit(tx); 504 txg_wait_synced(zilog->zl_dmu_pool, txg); 505 } 506 507 ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); 508 509 return (lwb); 510 } 511 512 /* 513 * In one tx, free all log blocks and clear the log header. 514 * If keep_first is set, then we're replaying a log with no content. 515 * We want to keep the first block, however, so that the first 516 * synchronous transaction doesn't require a txg_wait_synced() 517 * in zil_create(). We don't need to txg_wait_synced() here either 518 * when keep_first is set, because both zil_create() and zil_destroy() 519 * will wait for any in-progress destroys to complete. 520 */ 521 void 522 zil_destroy(zilog_t *zilog, boolean_t keep_first) 523 { 524 const zil_header_t *zh = zilog->zl_header; 525 lwb_t *lwb; 526 dmu_tx_t *tx; 527 uint64_t txg; 528 529 /* 530 * Wait for any previous destroy to complete. 
531 */ 532 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 533 534 zilog->zl_old_header = *zh; /* debugging aid */ 535 536 if (BP_IS_HOLE(&zh->zh_log)) 537 return; 538 539 tx = dmu_tx_create(zilog->zl_os); 540 VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 541 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 542 txg = dmu_tx_get_txg(tx); 543 544 mutex_enter(&zilog->zl_lock); 545 546 ASSERT3U(zilog->zl_destroy_txg, <, txg); 547 zilog->zl_destroy_txg = txg; 548 zilog->zl_keep_first = keep_first; 549 550 if (!list_is_empty(&zilog->zl_lwb_list)) { 551 ASSERT(zh->zh_claim_txg == 0); 552 ASSERT(!keep_first); 553 while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 554 list_remove(&zilog->zl_lwb_list, lwb); 555 if (lwb->lwb_buf != NULL) 556 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 557 zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk); 558 kmem_cache_free(zil_lwb_cache, lwb); 559 } 560 } else if (!keep_first) { 561 (void) zil_parse(zilog, zil_free_log_block, 562 zil_free_log_record, tx, zh->zh_claim_txg); 563 } 564 mutex_exit(&zilog->zl_lock); 565 566 dmu_tx_commit(tx); 567 } 568 569 int 570 zil_claim(const char *osname, void *txarg) 571 { 572 dmu_tx_t *tx = txarg; 573 uint64_t first_txg = dmu_tx_get_txg(tx); 574 zilog_t *zilog; 575 zil_header_t *zh; 576 objset_t *os; 577 int error; 578 579 error = dmu_objset_hold(osname, FTAG, &os); 580 if (error) { 581 cmn_err(CE_WARN, "can't open objset for %s", osname); 582 return (0); 583 } 584 585 zilog = dmu_objset_zil(os); 586 zh = zil_header_in_syncing_context(zilog); 587 588 if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { 589 if (!BP_IS_HOLE(&zh->zh_log)) 590 zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); 591 BP_ZERO(&zh->zh_log); 592 dsl_dataset_dirty(dmu_objset_ds(os), tx); 593 dmu_objset_rele(os, FTAG); 594 return (0); 595 } 596 597 /* 598 * Claim all log blocks if we haven't already done so, and remember 599 * the highest claimed sequence number. This ensures that if we can 600 * read only part of the log now (e.g. due to a missing device), 601 * but we can read the entire log later, we will not try to replay 602 * or destroy beyond the last block we successfully claimed. 603 */ 604 ASSERT3U(zh->zh_claim_txg, <=, first_txg); 605 if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { 606 (void) zil_parse(zilog, zil_claim_log_block, 607 zil_claim_log_record, tx, first_txg); 608 zh->zh_claim_txg = first_txg; 609 zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; 610 zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; 611 if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) 612 zh->zh_flags |= ZIL_REPLAY_NEEDED; 613 zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; 614 dsl_dataset_dirty(dmu_objset_ds(os), tx); 615 } 616 617 ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); 618 dmu_objset_rele(os, FTAG); 619 return (0); 620 } 621 622 /* 623 * Check the log by walking the log chain. 624 * Checksum errors are ok as they indicate the end of the chain. 625 * Any other error (no device or read failure) returns an error. 626 */ 627 int 628 zil_check_log_chain(const char *osname, void *tx) 629 { 630 zilog_t *zilog; 631 objset_t *os; 632 int error; 633 634 ASSERT(tx == NULL); 635 636 error = dmu_objset_hold(osname, FTAG, &os); 637 if (error) { 638 cmn_err(CE_WARN, "can't open objset for %s", osname); 639 return (0); 640 } 641 642 zilog = dmu_objset_zil(os); 643 644 /* 645 * Because tx == NULL, zil_claim_log_block() will not actually claim 646 * any blocks, but just determine whether it is possible to do so. 
647 * In addition to checking the log chain, zil_claim_log_block() 648 * will invoke zio_claim() with a done func of spa_claim_notify(), 649 * which will update spa_max_claim_txg. See spa_load() for details. 650 */ 651 error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, 652 zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa)); 653 654 dmu_objset_rele(os, FTAG); 655 656 return ((error == ECKSUM || error == ENOENT) ? 0 : error); 657 } 658 659 static int 660 zil_vdev_compare(const void *x1, const void *x2) 661 { 662 uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; 663 uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; 664 665 if (v1 < v2) 666 return (-1); 667 if (v1 > v2) 668 return (1); 669 670 return (0); 671 } 672 673 void 674 zil_add_block(zilog_t *zilog, const blkptr_t *bp) 675 { 676 avl_tree_t *t = &zilog->zl_vdev_tree; 677 avl_index_t where; 678 zil_vdev_node_t *zv, zvsearch; 679 int ndvas = BP_GET_NDVAS(bp); 680 int i; 681 682 if (zfs_nocacheflush) 683 return; 684 685 ASSERT(zilog->zl_writer); 686 687 /* 688 * Even though we're zl_writer, we still need a lock because the 689 * zl_get_data() callbacks may have dmu_sync() done callbacks 690 * that will run concurrently. 691 */ 692 mutex_enter(&zilog->zl_vdev_lock); 693 for (i = 0; i < ndvas; i++) { 694 zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); 695 if (avl_find(t, &zvsearch, &where) == NULL) { 696 zv = kmem_alloc(sizeof (*zv), KM_SLEEP); 697 zv->zv_vdev = zvsearch.zv_vdev; 698 avl_insert(t, zv, where); 699 } 700 } 701 mutex_exit(&zilog->zl_vdev_lock); 702 } 703 704 void 705 zil_flush_vdevs(zilog_t *zilog) 706 { 707 spa_t *spa = zilog->zl_spa; 708 avl_tree_t *t = &zilog->zl_vdev_tree; 709 void *cookie = NULL; 710 zil_vdev_node_t *zv; 711 zio_t *zio; 712 713 ASSERT(zilog->zl_writer); 714 715 /* 716 * We don't need zl_vdev_lock here because we're the zl_writer, 717 * and all zl_get_data() callbacks are done. 718 */ 719 if (avl_numnodes(t) == 0) 720 return; 721 722 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 723 724 zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); 725 726 while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { 727 vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); 728 if (vd != NULL) 729 zio_flush(zio, vd); 730 kmem_free(zv, sizeof (*zv)); 731 } 732 733 /* 734 * Wait for all the flushes to complete. Not all devices actually 735 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. 736 */ 737 (void) zio_wait(zio); 738 739 spa_config_exit(spa, SCL_STATE, FTAG); 740 } 741 742 /* 743 * Function called when a log block write completes 744 */ 745 static void 746 zil_lwb_write_done(zio_t *zio) 747 { 748 lwb_t *lwb = zio->io_private; 749 zilog_t *zilog = lwb->lwb_zilog; 750 dmu_tx_t *tx = lwb->lwb_tx; 751 752 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); 753 ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); 754 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 755 ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); 756 ASSERT(!BP_IS_GANG(zio->io_bp)); 757 ASSERT(!BP_IS_HOLE(zio->io_bp)); 758 ASSERT(zio->io_bp->blk_fill == 0); 759 760 /* 761 * Ensure the lwb buffer pointer is cleared before releasing 762 * the txg. If we have had an allocation failure and 763 * the txg is waiting to sync then we want zil_sync() 764 * to remove the lwb so that it's not picked up as the next new 765 * one in zil_commit_writer(). zil_sync() will only remove 766 * the lwb if lwb_buf is null. 
767 */ 768 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 769 mutex_enter(&zilog->zl_lock); 770 lwb->lwb_buf = NULL; 771 lwb->lwb_tx = NULL; 772 mutex_exit(&zilog->zl_lock); 773 774 /* 775 * Now that we've written this log block, we have a stable pointer 776 * to the next block in the chain, so it's OK to let the txg in 777 * which we allocated the next block sync. 778 */ 779 dmu_tx_commit(tx); 780 } 781 782 /* 783 * Initialize the io for a log block. 784 */ 785 static void 786 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) 787 { 788 zbookmark_t zb; 789 790 SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], 791 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 792 lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); 793 794 if (zilog->zl_root_zio == NULL) { 795 zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, 796 ZIO_FLAG_CANFAIL); 797 } 798 if (lwb->lwb_zio == NULL) { 799 lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 800 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), 801 zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, 802 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); 803 } 804 } 805 806 /* 807 * Define a limited set of intent log block sizes. 808 * These must be a multiple of 4KB. Note only the amount used (again 809 * aligned to 4KB) actually gets written. However, we can't always just 810 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. 811 */ 812 uint64_t zil_block_buckets[] = { 813 4096, /* non TX_WRITE */ 814 8192+4096, /* data base */ 815 32*1024 + 4096, /* NFS writes */ 816 UINT64_MAX 817 }; 818 819 /* 820 * Use the slog as long as the logbias is 'latency' and the current commit size 821 * is less than the limit or the total list size is less than 2X the limit. 822 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. 823 */ 824 uint64_t zil_slog_limit = 1024 * 1024; 825 #define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ 826 (((zilog)->zl_cur_used < zil_slog_limit) || \ 827 ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) 828 829 /* 830 * Start a log block write and advance to the next log block. 831 * Calls are serialized. 832 */ 833 static lwb_t * 834 zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) 835 { 836 lwb_t *nlwb = NULL; 837 zil_chain_t *zilc; 838 spa_t *spa = zilog->zl_spa; 839 blkptr_t *bp; 840 dmu_tx_t *tx; 841 uint64_t txg; 842 uint64_t zil_blksz, wsz; 843 int i, error; 844 845 if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 846 zilc = (zil_chain_t *)lwb->lwb_buf; 847 bp = &zilc->zc_next_blk; 848 } else { 849 zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); 850 bp = &zilc->zc_next_blk; 851 } 852 853 ASSERT(lwb->lwb_nused <= lwb->lwb_sz); 854 855 /* 856 * Allocate the next block and save its address in this block 857 * before writing it in order to establish the log chain. 858 * Note that if the allocation of nlwb synced before we wrote 859 * the block that points at it (lwb), we'd leak it if we crashed. 860 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 861 * We dirty the dataset to ensure that zil_sync() will be called 862 * to clean up in the event of allocation failure or I/O failure. 863 */ 864 tx = dmu_tx_create(zilog->zl_os); 865 VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 866 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 867 txg = dmu_tx_get_txg(tx); 868 869 lwb->lwb_tx = tx; 870 871 /* 872 * Log blocks are pre-allocated. Here we select the size of the next 873 * block, based on size used in the last block. 
874 * - first find the smallest bucket that will fit the block from a 875 * limited set of block sizes. This is because it's faster to write 876 * blocks allocated from the same metaslab as they are adjacent or 877 * close. 878 * - next find the maximum from the new suggested size and an array of 879 * previous sizes. This lessens a picket fence effect of wrongly 880 * guessing the size if we have a stream of say 2k, 64k, 2k, 64k 881 * requests. 882 * 883 * Note we only write what is used, but we can't just allocate 884 * the maximum block size because we can exhaust the available 885 * pool log space. 886 */ 887 zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); 888 for (i = 0; zil_blksz > zil_block_buckets[i]; i++) 889 continue; 890 zil_blksz = zil_block_buckets[i]; 891 if (zil_blksz == UINT64_MAX) 892 zil_blksz = SPA_MAXBLOCKSIZE; 893 zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; 894 for (i = 0; i < ZIL_PREV_BLKS; i++) 895 zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); 896 zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); 897 898 BP_ZERO(bp); 899 /* pass the old blkptr in order to spread log blocks across devs */ 900 error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, 901 USE_SLOG(zilog)); 902 if (!error) { 903 ASSERT3U(bp->blk_birth, ==, txg); 904 bp->blk_cksum = lwb->lwb_blk.blk_cksum; 905 bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; 906 907 /* 908 * Allocate a new log write buffer (lwb). 909 */ 910 nlwb = zil_alloc_lwb(zilog, bp, txg); 911 912 /* Record the block for later vdev flushing */ 913 zil_add_block(zilog, &lwb->lwb_blk); 914 } 915 916 if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 917 /* For Slim ZIL only write what is used. */ 918 wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); 919 ASSERT3U(wsz, <=, lwb->lwb_sz); 920 zio_shrink(lwb->lwb_zio, wsz); 921 922 } else { 923 wsz = lwb->lwb_sz; 924 } 925 926 zilc->zc_pad = 0; 927 zilc->zc_nused = lwb->lwb_nused; 928 zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; 929 930 /* 931 * clear unused data for security 932 */ 933 bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); 934 935 zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */ 936 937 /* 938 * If there was an allocation failure then nlwb will be null which 939 * forces a txg_wait_synced(). 940 */ 941 return (nlwb); 942 } 943 944 static lwb_t * 945 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) 946 { 947 lr_t *lrc = &itx->itx_lr; /* common log record */ 948 lr_write_t *lrw = (lr_write_t *)lrc; 949 char *lr_buf; 950 uint64_t txg = lrc->lrc_txg; 951 uint64_t reclen = lrc->lrc_reclen; 952 uint64_t dlen = 0; 953 954 if (lwb == NULL) 955 return (NULL); 956 957 ASSERT(lwb->lwb_buf != NULL); 958 959 if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) 960 dlen = P2ROUNDUP_TYPED( 961 lrw->lr_length, sizeof (uint64_t), uint64_t); 962 963 zilog->zl_cur_used += (reclen + dlen); 964 965 zil_lwb_write_init(zilog, lwb); 966 967 /* 968 * If this record won't fit in the current log block, start a new one. 
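 * (If the record does not fit even in the freshly started block, we fall
 * back to txg_wait_synced() below and let the DMU commit it instead.)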
969 */ 970 if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { 971 lwb = zil_lwb_write_start(zilog, lwb); 972 if (lwb == NULL) 973 return (NULL); 974 zil_lwb_write_init(zilog, lwb); 975 ASSERT(LWB_EMPTY(lwb)); 976 if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { 977 txg_wait_synced(zilog->zl_dmu_pool, txg); 978 return (lwb); 979 } 980 } 981 982 lr_buf = lwb->lwb_buf + lwb->lwb_nused; 983 bcopy(lrc, lr_buf, reclen); 984 lrc = (lr_t *)lr_buf; 985 lrw = (lr_write_t *)lrc; 986 987 /* 988 * If it's a write, fetch the data or get its blkptr as appropriate. 989 */ 990 if (lrc->lrc_txtype == TX_WRITE) { 991 if (txg > spa_freeze_txg(zilog->zl_spa)) 992 txg_wait_synced(zilog->zl_dmu_pool, txg); 993 if (itx->itx_wr_state != WR_COPIED) { 994 char *dbuf; 995 int error; 996 997 if (dlen) { 998 ASSERT(itx->itx_wr_state == WR_NEED_COPY); 999 dbuf = lr_buf + reclen; 1000 lrw->lr_common.lrc_reclen += dlen; 1001 } else { 1002 ASSERT(itx->itx_wr_state == WR_INDIRECT); 1003 dbuf = NULL; 1004 } 1005 error = zilog->zl_get_data( 1006 itx->itx_private, lrw, dbuf, lwb->lwb_zio); 1007 if (error == EIO) { 1008 txg_wait_synced(zilog->zl_dmu_pool, txg); 1009 return (lwb); 1010 } 1011 if (error) { 1012 ASSERT(error == ENOENT || error == EEXIST || 1013 error == EALREADY); 1014 return (lwb); 1015 } 1016 } 1017 } 1018 1019 /* 1020 * We're actually making an entry, so update lrc_seq to be the 1021 * log record sequence number. Note that this is generally not 1022 * equal to the itx sequence number because not all transactions 1023 * are synchronous, and sometimes spa_sync() gets there first. 1024 */ 1025 lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ 1026 lwb->lwb_nused += reclen + dlen; 1027 lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); 1028 ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); 1029 ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); 1030 1031 return (lwb); 1032 } 1033 1034 itx_t * 1035 zil_itx_create(uint64_t txtype, size_t lrsize) 1036 { 1037 itx_t *itx; 1038 1039 lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); 1040 1041 itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); 1042 itx->itx_lr.lrc_txtype = txtype; 1043 itx->itx_lr.lrc_reclen = lrsize; 1044 itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ 1045 itx->itx_lr.lrc_seq = 0; /* defensive */ 1046 1047 return (itx); 1048 } 1049 1050 void 1051 zil_itx_destroy(itx_t *itx) 1052 { 1053 kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); 1054 } 1055 1056 uint64_t 1057 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 1058 { 1059 uint64_t seq; 1060 1061 ASSERT(itx->itx_lr.lrc_seq == 0); 1062 ASSERT(!zilog->zl_replay); 1063 1064 mutex_enter(&zilog->zl_lock); 1065 list_insert_tail(&zilog->zl_itx_list, itx); 1066 zilog->zl_itx_list_sz += itx->itx_sod; 1067 itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); 1068 itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; 1069 mutex_exit(&zilog->zl_lock); 1070 1071 return (seq); 1072 } 1073 1074 /* 1075 * Free up all in-memory intent log transactions that have now been synced. 
1076 */ 1077 static void 1078 zil_itx_clean(zilog_t *zilog) 1079 { 1080 uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa); 1081 uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa); 1082 list_t clean_list; 1083 itx_t *itx; 1084 1085 list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); 1086 1087 mutex_enter(&zilog->zl_lock); 1088 /* wait for a log writer to finish walking list */ 1089 while (zilog->zl_writer) { 1090 cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1091 } 1092 1093 /* 1094 * Move the sync'd log transactions to a separate list so we can call 1095 * kmem_free without holding the zl_lock. 1096 * 1097 * There is no need to set zl_writer as we don't drop zl_lock here 1098 */ 1099 while ((itx = list_head(&zilog->zl_itx_list)) != NULL && 1100 itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { 1101 list_remove(&zilog->zl_itx_list, itx); 1102 zilog->zl_itx_list_sz -= itx->itx_sod; 1103 list_insert_tail(&clean_list, itx); 1104 } 1105 cv_broadcast(&zilog->zl_cv_writer); 1106 mutex_exit(&zilog->zl_lock); 1107 1108 /* destroy sync'd log transactions */ 1109 while ((itx = list_head(&clean_list)) != NULL) { 1110 list_remove(&clean_list, itx); 1111 zil_itx_destroy(itx); 1112 } 1113 list_destroy(&clean_list); 1114 } 1115 1116 /* 1117 * If there are any in-memory intent log transactions which have now been 1118 * synced then start up a taskq to free them. 1119 */ 1120 void 1121 zil_clean(zilog_t *zilog) 1122 { 1123 itx_t *itx; 1124 1125 mutex_enter(&zilog->zl_lock); 1126 itx = list_head(&zilog->zl_itx_list); 1127 if ((itx != NULL) && 1128 (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { 1129 (void) taskq_dispatch(zilog->zl_clean_taskq, 1130 (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP); 1131 } 1132 mutex_exit(&zilog->zl_lock); 1133 } 1134 1135 static void 1136 zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) 1137 { 1138 uint64_t txg; 1139 uint64_t commit_seq = 0; 1140 itx_t *itx, *itx_next; 1141 lwb_t *lwb; 1142 spa_t *spa; 1143 int error = 0; 1144 1145 zilog->zl_writer = B_TRUE; 1146 ASSERT(zilog->zl_root_zio == NULL); 1147 spa = zilog->zl_spa; 1148 1149 if (zilog->zl_suspend) { 1150 lwb = NULL; 1151 } else { 1152 lwb = list_tail(&zilog->zl_lwb_list); 1153 if (lwb == NULL) { 1154 /* 1155 * Return if there's nothing to flush before we 1156 * dirty the fs by calling zil_create() 1157 */ 1158 if (list_is_empty(&zilog->zl_itx_list)) { 1159 zilog->zl_writer = B_FALSE; 1160 return; 1161 } 1162 mutex_exit(&zilog->zl_lock); 1163 lwb = zil_create(zilog); 1164 mutex_enter(&zilog->zl_lock); 1165 } 1166 } 1167 ASSERT(lwb == NULL || lwb->lwb_zio == NULL); 1168 1169 /* Loop through in-memory log transactions filling log blocks. */ 1170 DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); 1171 1172 for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) { 1173 /* 1174 * Save the next pointer. Even though we drop zl_lock below, 1175 * all threads that can remove itx list entries (other writers 1176 * and zil_itx_clean()) can't do so until they have zl_writer. 1177 */ 1178 itx_next = list_next(&zilog->zl_itx_list, itx); 1179 1180 /* 1181 * Determine whether to push this itx. 1182 * Push all transactions related to specified foid and 1183 * all other transactions except those that can be logged 1184 * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL) 1185 * for all other files. 1186 * 1187 * If foid == 0 (meaning "push all foids") or 1188 * itx->itx_sync is set (meaning O_[D]SYNC), push regardless. 
1189 */ 1190 if (foid != 0 && !itx->itx_sync && 1191 TX_OOO(itx->itx_lr.lrc_txtype) && 1192 ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid) 1193 continue; /* skip this record */ 1194 1195 if ((itx->itx_lr.lrc_seq > seq) && 1196 ((lwb == NULL) || (LWB_EMPTY(lwb)) || 1197 (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz))) 1198 break; 1199 1200 list_remove(&zilog->zl_itx_list, itx); 1201 zilog->zl_itx_list_sz -= itx->itx_sod; 1202 1203 mutex_exit(&zilog->zl_lock); 1204 1205 txg = itx->itx_lr.lrc_txg; 1206 ASSERT(txg); 1207 1208 if (txg > spa_last_synced_txg(spa) || 1209 txg > spa_freeze_txg(spa)) 1210 lwb = zil_lwb_commit(zilog, itx, lwb); 1211 1212 zil_itx_destroy(itx); 1213 1214 mutex_enter(&zilog->zl_lock); 1215 } 1216 DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); 1217 /* determine commit sequence number */ 1218 itx = list_head(&zilog->zl_itx_list); 1219 if (itx) 1220 commit_seq = itx->itx_lr.lrc_seq - 1; 1221 else 1222 commit_seq = zilog->zl_itx_seq; 1223 mutex_exit(&zilog->zl_lock); 1224 1225 /* write the last block out */ 1226 if (lwb != NULL && lwb->lwb_zio != NULL) 1227 lwb = zil_lwb_write_start(zilog, lwb); 1228 1229 zilog->zl_prev_used = zilog->zl_cur_used; 1230 zilog->zl_cur_used = 0; 1231 1232 /* 1233 * Wait if necessary for the log blocks to be on stable storage. 1234 */ 1235 if (zilog->zl_root_zio) { 1236 DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); 1237 error = zio_wait(zilog->zl_root_zio); 1238 zilog->zl_root_zio = NULL; 1239 DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); 1240 zil_flush_vdevs(zilog); 1241 } 1242 1243 if (error || lwb == NULL) 1244 txg_wait_synced(zilog->zl_dmu_pool, 0); 1245 1246 mutex_enter(&zilog->zl_lock); 1247 zilog->zl_writer = B_FALSE; 1248 1249 ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); 1250 zilog->zl_commit_seq = commit_seq; 1251 1252 /* 1253 * Remember the highest committed log sequence number for ztest. 1254 * We only update this value when all the log writes succeeded, 1255 * because ztest wants to ASSERT that it got the whole log chain. 1256 */ 1257 if (error == 0 && lwb != NULL) 1258 zilog->zl_commit_lr_seq = zilog->zl_lr_seq; 1259 } 1260 1261 /* 1262 * Push zfs transactions to stable storage up to the supplied sequence number. 1263 * If foid is 0 push out all transactions, otherwise push only those 1264 * for that file or those that might have been used to create that file. 1265 */ 1266 void 1267 zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) 1268 { 1269 if (zilog == NULL || seq == 0) 1270 return; 1271 1272 mutex_enter(&zilog->zl_lock); 1273 1274 seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */ 1275 1276 while (zilog->zl_writer) { 1277 cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1278 if (seq <= zilog->zl_commit_seq) { 1279 mutex_exit(&zilog->zl_lock); 1280 return; 1281 } 1282 } 1283 zil_commit_writer(zilog, seq, foid); /* drops zl_lock */ 1284 /* wake up others waiting on the commit */ 1285 cv_broadcast(&zilog->zl_cv_writer); 1286 mutex_exit(&zilog->zl_lock); 1287 } 1288 1289 /* 1290 * Report whether all transactions are committed. 
1291 */ 1292 static boolean_t 1293 zil_is_committed(zilog_t *zilog) 1294 { 1295 lwb_t *lwb; 1296 boolean_t committed; 1297 1298 mutex_enter(&zilog->zl_lock); 1299 1300 while (zilog->zl_writer) 1301 cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1302 1303 if (!list_is_empty(&zilog->zl_itx_list)) 1304 committed = B_FALSE; /* unpushed transactions */ 1305 else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL) 1306 committed = B_TRUE; /* intent log never used */ 1307 else if (list_next(&zilog->zl_lwb_list, lwb) != NULL) 1308 committed = B_FALSE; /* zil_sync() not done yet */ 1309 else 1310 committed = B_TRUE; /* everything synced */ 1311 1312 mutex_exit(&zilog->zl_lock); 1313 return (committed); 1314 } 1315 1316 /* 1317 * Called in syncing context to free committed log blocks and update log header. 1318 */ 1319 void 1320 zil_sync(zilog_t *zilog, dmu_tx_t *tx) 1321 { 1322 zil_header_t *zh = zil_header_in_syncing_context(zilog); 1323 uint64_t txg = dmu_tx_get_txg(tx); 1324 spa_t *spa = zilog->zl_spa; 1325 uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; 1326 lwb_t *lwb; 1327 1328 /* 1329 * We don't zero out zl_destroy_txg, so make sure we don't try 1330 * to destroy it twice. 1331 */ 1332 if (spa_sync_pass(spa) != 1) 1333 return; 1334 1335 mutex_enter(&zilog->zl_lock); 1336 1337 ASSERT(zilog->zl_stop_sync == 0); 1338 1339 if (*replayed_seq != 0) { 1340 ASSERT(zh->zh_replay_seq < *replayed_seq); 1341 zh->zh_replay_seq = *replayed_seq; 1342 *replayed_seq = 0; 1343 } 1344 1345 if (zilog->zl_destroy_txg == txg) { 1346 blkptr_t blk = zh->zh_log; 1347 1348 ASSERT(list_head(&zilog->zl_lwb_list) == NULL); 1349 1350 bzero(zh, sizeof (zil_header_t)); 1351 bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); 1352 1353 if (zilog->zl_keep_first) { 1354 /* 1355 * If this block was part of log chain that couldn't 1356 * be claimed because a device was missing during 1357 * zil_claim(), but that device later returns, 1358 * then this block could erroneously appear valid. 1359 * To guard against this, assign a new GUID to the new 1360 * log chain so it doesn't matter what blk points to. 1361 */ 1362 zil_init_log_chain(zilog, &blk); 1363 zh->zh_log = blk; 1364 } 1365 } 1366 1367 while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 1368 zh->zh_log = lwb->lwb_blk; 1369 if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 1370 break; 1371 list_remove(&zilog->zl_lwb_list, lwb); 1372 zio_free_zil(spa, txg, &lwb->lwb_blk); 1373 kmem_cache_free(zil_lwb_cache, lwb); 1374 1375 /* 1376 * If we don't have anything left in the lwb list then 1377 * we've had an allocation failure and we need to zero 1378 * out the zil_header blkptr so that we don't end 1379 * up freeing the same block twice. 
1380 */ 1381 if (list_head(&zilog->zl_lwb_list) == NULL) 1382 BP_ZERO(&zh->zh_log); 1383 } 1384 mutex_exit(&zilog->zl_lock); 1385 } 1386 1387 void 1388 zil_init(void) 1389 { 1390 zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 1391 sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); 1392 } 1393 1394 void 1395 zil_fini(void) 1396 { 1397 kmem_cache_destroy(zil_lwb_cache); 1398 } 1399 1400 void 1401 zil_set_logbias(zilog_t *zilog, uint64_t logbias) 1402 { 1403 zilog->zl_logbias = logbias; 1404 } 1405 1406 zilog_t * 1407 zil_alloc(objset_t *os, zil_header_t *zh_phys) 1408 { 1409 zilog_t *zilog; 1410 1411 zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 1412 1413 zilog->zl_header = zh_phys; 1414 zilog->zl_os = os; 1415 zilog->zl_spa = dmu_objset_spa(os); 1416 zilog->zl_dmu_pool = dmu_objset_pool(os); 1417 zilog->zl_destroy_txg = TXG_INITIAL - 1; 1418 zilog->zl_logbias = dmu_objset_logbias(os); 1419 1420 mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 1421 1422 list_create(&zilog->zl_itx_list, sizeof (itx_t), 1423 offsetof(itx_t, itx_node)); 1424 1425 list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 1426 offsetof(lwb_t, lwb_node)); 1427 1428 mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 1429 1430 avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, 1431 sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 1432 1433 cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); 1434 cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 1435 1436 return (zilog); 1437 } 1438 1439 void 1440 zil_free(zilog_t *zilog) 1441 { 1442 lwb_t *lwb; 1443 1444 zilog->zl_stop_sync = 1; 1445 1446 while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 1447 list_remove(&zilog->zl_lwb_list, lwb); 1448 if (lwb->lwb_buf != NULL) 1449 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 1450 kmem_cache_free(zil_lwb_cache, lwb); 1451 } 1452 list_destroy(&zilog->zl_lwb_list); 1453 1454 avl_destroy(&zilog->zl_vdev_tree); 1455 mutex_destroy(&zilog->zl_vdev_lock); 1456 1457 ASSERT(list_head(&zilog->zl_itx_list) == NULL); 1458 list_destroy(&zilog->zl_itx_list); 1459 mutex_destroy(&zilog->zl_lock); 1460 1461 cv_destroy(&zilog->zl_cv_writer); 1462 cv_destroy(&zilog->zl_cv_suspend); 1463 1464 kmem_free(zilog, sizeof (zilog_t)); 1465 } 1466 1467 /* 1468 * Open an intent log. 1469 */ 1470 zilog_t * 1471 zil_open(objset_t *os, zil_get_data_t *get_data) 1472 { 1473 zilog_t *zilog = dmu_objset_zil(os); 1474 1475 zilog->zl_get_data = get_data; 1476 zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 1477 2, 2, TASKQ_PREPOPULATE); 1478 1479 return (zilog); 1480 } 1481 1482 /* 1483 * Close an intent log. 1484 */ 1485 void 1486 zil_close(zilog_t *zilog) 1487 { 1488 /* 1489 * If the log isn't already committed, mark the objset dirty 1490 * (so zil_sync() will be called) and wait for that txg to sync. 1491 */ 1492 if (!zil_is_committed(zilog)) { 1493 uint64_t txg; 1494 dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); 1495 VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 1496 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1497 txg = dmu_tx_get_txg(tx); 1498 dmu_tx_commit(tx); 1499 txg_wait_synced(zilog->zl_dmu_pool, txg); 1500 } 1501 1502 taskq_destroy(zilog->zl_clean_taskq); 1503 zilog->zl_clean_taskq = NULL; 1504 zilog->zl_get_data = NULL; 1505 1506 zil_itx_clean(zilog); 1507 ASSERT(list_head(&zilog->zl_itx_list) == NULL); 1508 } 1509 1510 /* 1511 * Suspend an intent log. While in suspended mode, we still honor 1512 * synchronous semantics, but we rely on txg_wait_synced() to do it. 
1513 * We suspend the log briefly when taking a snapshot so that the snapshot 1514 * contains all the data it's supposed to, and has an empty intent log. 1515 */ 1516 int 1517 zil_suspend(zilog_t *zilog) 1518 { 1519 const zil_header_t *zh = zilog->zl_header; 1520 1521 mutex_enter(&zilog->zl_lock); 1522 if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 1523 mutex_exit(&zilog->zl_lock); 1524 return (EBUSY); 1525 } 1526 if (zilog->zl_suspend++ != 0) { 1527 /* 1528 * Someone else already began a suspend. 1529 * Just wait for them to finish. 1530 */ 1531 while (zilog->zl_suspending) 1532 cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 1533 mutex_exit(&zilog->zl_lock); 1534 return (0); 1535 } 1536 zilog->zl_suspending = B_TRUE; 1537 mutex_exit(&zilog->zl_lock); 1538 1539 zil_commit(zilog, UINT64_MAX, 0); 1540 1541 /* 1542 * Wait for any in-flight log writes to complete. 1543 */ 1544 mutex_enter(&zilog->zl_lock); 1545 while (zilog->zl_writer) 1546 cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1547 mutex_exit(&zilog->zl_lock); 1548 1549 zil_destroy(zilog, B_FALSE); 1550 1551 mutex_enter(&zilog->zl_lock); 1552 zilog->zl_suspending = B_FALSE; 1553 cv_broadcast(&zilog->zl_cv_suspend); 1554 mutex_exit(&zilog->zl_lock); 1555 1556 return (0); 1557 } 1558 1559 void 1560 zil_resume(zilog_t *zilog) 1561 { 1562 mutex_enter(&zilog->zl_lock); 1563 ASSERT(zilog->zl_suspend != 0); 1564 zilog->zl_suspend--; 1565 mutex_exit(&zilog->zl_lock); 1566 } 1567 1568 typedef struct zil_replay_arg { 1569 zil_replay_func_t **zr_replay; 1570 void *zr_arg; 1571 boolean_t zr_byteswap; 1572 char *zr_lr; 1573 } zil_replay_arg_t; 1574 1575 static int 1576 zil_replay_error(zilog_t *zilog, lr_t *lr, int error) 1577 { 1578 char name[MAXNAMELEN]; 1579 1580 zilog->zl_replaying_seq--; /* didn't actually replay this one */ 1581 1582 dmu_objset_name(zilog->zl_os, name); 1583 1584 cmn_err(CE_WARN, "ZFS replay transaction error %d, " 1585 "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 1586 (u_longlong_t)lr->lrc_seq, 1587 (u_longlong_t)(lr->lrc_txtype & ~TX_CI), 1588 (lr->lrc_txtype & TX_CI) ? "CI" : ""); 1589 1590 return (error); 1591 } 1592 1593 static int 1594 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 1595 { 1596 zil_replay_arg_t *zr = zra; 1597 const zil_header_t *zh = zilog->zl_header; 1598 uint64_t reclen = lr->lrc_reclen; 1599 uint64_t txtype = lr->lrc_txtype; 1600 int error = 0; 1601 1602 zilog->zl_replaying_seq = lr->lrc_seq; 1603 1604 if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 1605 return (0); 1606 1607 if (lr->lrc_txg < claim_txg) /* already committed */ 1608 return (0); 1609 1610 /* Strip case-insensitive bit, still present in log record */ 1611 txtype &= ~TX_CI; 1612 1613 if (txtype == 0 || txtype >= TX_MAX_TYPE) 1614 return (zil_replay_error(zilog, lr, EINVAL)); 1615 1616 /* 1617 * If this record type can be logged out of order, the object 1618 * (lr_foid) may no longer exist. That's legitimate, not an error. 1619 */ 1620 if (TX_OOO(txtype)) { 1621 error = dmu_object_info(zilog->zl_os, 1622 ((lr_ooo_t *)lr)->lr_foid, NULL); 1623 if (error == ENOENT || error == EEXIST) 1624 return (0); 1625 } 1626 1627 /* 1628 * Make a copy of the data so we can revise and extend it. 1629 */ 1630 bcopy(lr, zr->zr_lr, reclen); 1631 1632 /* 1633 * If this is a TX_WRITE with a blkptr, suck in the data. 
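 * A record of exactly sizeof (lr_write_t) carries no inline payload, so
 * the data must be read from the block referenced by lr_blkptr; zr_lr is
 * sized at 2 * SPA_MAXBLOCKSIZE in zil_replay() to leave room for it.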
1634 */ 1635 if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 1636 error = zil_read_log_data(zilog, (lr_write_t *)lr, 1637 zr->zr_lr + reclen); 1638 if (error) 1639 return (zil_replay_error(zilog, lr, error)); 1640 } 1641 1642 /* 1643 * The log block containing this lr may have been byteswapped 1644 * so that we can easily examine common fields like lrc_txtype. 1645 * However, the log is a mix of different record types, and only the 1646 * replay vectors know how to byteswap their records. Therefore, if 1647 * the lr was byteswapped, undo it before invoking the replay vector. 1648 */ 1649 if (zr->zr_byteswap) 1650 byteswap_uint64_array(zr->zr_lr, reclen); 1651 1652 /* 1653 * We must now do two things atomically: replay this log record, 1654 * and update the log header sequence number to reflect the fact that 1655 * we did so. At the end of each replay function the sequence number 1656 * is updated if we are in replay mode. 1657 */ 1658 error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 1659 if (error) { 1660 /* 1661 * The DMU's dnode layer doesn't see removes until the txg 1662 * commits, so a subsequent claim can spuriously fail with 1663 * EEXIST. So if we receive any error we try syncing out 1664 * any removes then retry the transaction. Note that we 1665 * specify B_FALSE for byteswap now, so we don't do it twice. 1666 */ 1667 txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 1668 error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 1669 if (error) 1670 return (zil_replay_error(zilog, lr, error)); 1671 } 1672 return (0); 1673 } 1674 1675 /* ARGSUSED */ 1676 static int 1677 zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 1678 { 1679 zilog->zl_replay_blks++; 1680 1681 return (0); 1682 } 1683 1684 /* 1685 * If this dataset has a non-empty intent log, replay it and destroy it. 1686 */ 1687 void 1688 zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 1689 { 1690 zilog_t *zilog = dmu_objset_zil(os); 1691 const zil_header_t *zh = zilog->zl_header; 1692 zil_replay_arg_t zr; 1693 1694 if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 1695 zil_destroy(zilog, B_TRUE); 1696 return; 1697 } 1698 1699 zr.zr_replay = replay_func; 1700 zr.zr_arg = arg; 1701 zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 1702 zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 1703 1704 /* 1705 * Wait for in-progress removes to sync before starting replay. 
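 * (Otherwise a replayed create could hit the spurious EEXIST case
 * described in zil_replay_log_record() above.)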
1706 */ 1707 txg_wait_synced(zilog->zl_dmu_pool, 0); 1708 1709 zilog->zl_replay = B_TRUE; 1710 zilog->zl_replay_time = ddi_get_lbolt(); 1711 ASSERT(zilog->zl_replay_blks == 0); 1712 (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 1713 zh->zh_claim_txg); 1714 kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 1715 1716 zil_destroy(zilog, B_FALSE); 1717 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 1718 zilog->zl_replay = B_FALSE; 1719 } 1720 1721 boolean_t 1722 zil_replaying(zilog_t *zilog, dmu_tx_t *tx) 1723 { 1724 if (zilog == NULL) 1725 return (B_TRUE); 1726 1727 if (zilog->zl_replay) { 1728 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1729 zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 1730 zilog->zl_replaying_seq; 1731 return (B_TRUE); 1732 } 1733 1734 return (B_FALSE); 1735 } 1736 1737 /* ARGSUSED */ 1738 int 1739 zil_vdev_offline(const char *osname, void *arg) 1740 { 1741 objset_t *os; 1742 zilog_t *zilog; 1743 int error; 1744 1745 error = dmu_objset_hold(osname, FTAG, &os); 1746 if (error) 1747 return (error); 1748 1749 zilog = dmu_objset_zil(os); 1750 if (zil_suspend(zilog) != 0) 1751 error = EEXIST; 1752 else 1753 zil_resume(zilog); 1754 dmu_objset_rele(os, FTAG); 1755 return (error); 1756 } 1757
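
/*
 * Illustrative sketch only, kept under #if 0 so it is never compiled:
 * roughly how a hypothetical consumer of the interfaces above might log
 * a synchronous operation and force it to stable storage. The function
 * name and the choice of TX_SETATTR are assumptions for illustration;
 * the real callers live in zfs_log.c, zfs_vnops.c and zvol.c.
 */
#if 0
static void
example_zil_usage(zilog_t *zilog, dmu_tx_t *tx, uint64_t foid)
{
	itx_t *itx;
	uint64_t seq;

	/*
	 * Build an in-memory intent log record for the operation.
	 * (Filling in the lr_setattr_t body is omitted in this sketch.)
	 */
	itx = zil_itx_create(TX_SETATTR, sizeof (lr_setattr_t));
	itx->itx_sync = B_TRUE;		/* caller wants O_[D]SYNC semantics */

	/* Queue the record; it is stamped with this tx's txg and a seq. */
	seq = zil_itx_assign(zilog, itx, tx);

	/*
	 * After dmu_tx_commit(), zil_commit() pushes records up to seq
	 * for this foid into log blocks and flushes the relevant vdev
	 * write caches, providing the synchronous guarantee.
	 */
	zil_commit(zilog, seq, foid);
}
#endif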