/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zap.h>
#include <sys/arc.h>
#include <sys/stat.h>
#include <sys/resource.h>
#include <sys/zil.h>
#include <sys/zil_impl.h>
#include <sys/dsl_dataset.h>
#include <sys/vdev.h>
#include <sys/dmu_tx.h>

/*
 * The zfs intent log (ZIL) saves transaction records of system calls
 * that change the file system in memory with enough information
 * to be able to replay them. These are stored in memory until
 * either the DMU transaction group (txg) commits them to the stable pool
 * and they can be discarded, or they are flushed to the stable log
 * (also in the pool) due to an fsync, O_DSYNC or other synchronous
 * requirement. In the event of a panic or power failure, those log
 * records (transactions) are replayed.
 *
 * There is one ZIL per file system. Its on-disk (pool) format consists
 * of 3 parts:
 *
 *	- ZIL header
 *	- ZIL blocks
 *	- ZIL records
 *
 * A log record holds a system call transaction. Log blocks can
 * hold many log records and the blocks are chained together.
 * Each ZIL block contains a block pointer (blkptr_t) to the next
 * ZIL block in the chain. The ZIL header points to the first
 * block in the chain. Note there is not a fixed place in the pool
 * to hold blocks. They are dynamically allocated and freed as
 * needed from the blocks available.
 */

/*
 * This global ZIL switch affects all pools
 */
int zil_disable = 0;	/* disable intent logging */

/*
 * Tunable parameter for debugging or performance analysis. Setting
 * zfs_nocacheflush will cause corruption on power loss if a volatile
 * out-of-order write cache is enabled.
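 *
 * For example, on Solaris-derived systems this tunable can typically be
 * set at boot time in /etc/system:
 *
 *	set zfs:zfs_nocacheflush = 1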
74 */ 75 boolean_t zfs_nocacheflush = B_FALSE; 76 77 static kmem_cache_t *zil_lwb_cache; 78 79 static boolean_t zil_empty(zilog_t *zilog); 80 81 #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ 82 sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) 83 84 85 static int 86 zil_bp_compare(const void *x1, const void *x2) 87 { 88 const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; 89 const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; 90 91 if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) 92 return (-1); 93 if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) 94 return (1); 95 96 if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) 97 return (-1); 98 if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) 99 return (1); 100 101 return (0); 102 } 103 104 static void 105 zil_bp_tree_init(zilog_t *zilog) 106 { 107 avl_create(&zilog->zl_bp_tree, zil_bp_compare, 108 sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); 109 } 110 111 static void 112 zil_bp_tree_fini(zilog_t *zilog) 113 { 114 avl_tree_t *t = &zilog->zl_bp_tree; 115 zil_bp_node_t *zn; 116 void *cookie = NULL; 117 118 while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) 119 kmem_free(zn, sizeof (zil_bp_node_t)); 120 121 avl_destroy(t); 122 } 123 124 int 125 zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) 126 { 127 avl_tree_t *t = &zilog->zl_bp_tree; 128 const dva_t *dva = BP_IDENTITY(bp); 129 zil_bp_node_t *zn; 130 avl_index_t where; 131 132 if (avl_find(t, dva, &where) != NULL) 133 return (EEXIST); 134 135 zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); 136 zn->zn_dva = *dva; 137 avl_insert(t, zn, where); 138 139 return (0); 140 } 141 142 static zil_header_t * 143 zil_header_in_syncing_context(zilog_t *zilog) 144 { 145 return ((zil_header_t *)zilog->zl_header); 146 } 147 148 static void 149 zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) 150 { 151 zio_cksum_t *zc = &bp->blk_cksum; 152 153 zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); 154 zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); 155 zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); 156 zc->zc_word[ZIL_ZC_SEQ] = 1ULL; 157 } 158 159 /* 160 * Read a log block and make sure it's valid. 161 */ 162 static int 163 zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, 164 char **end) 165 { 166 enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 167 uint32_t aflags = ARC_WAIT; 168 arc_buf_t *abuf = NULL; 169 zbookmark_t zb; 170 int error; 171 172 if (zilog->zl_header->zh_claim_txg == 0) 173 zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 174 175 if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 176 zio_flags |= ZIO_FLAG_SPECULATIVE; 177 178 SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], 179 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 180 181 error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 182 ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 183 184 if (error == 0) { 185 zio_cksum_t cksum = bp->blk_cksum; 186 187 /* 188 * Validate the checksummed log block. 189 * 190 * Sequence numbers should be... sequential. The checksum 191 * verifier for the next block should be bp's checksum plus 1. 192 * 193 * Also check the log chain linkage and size used. 
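		 *
		 * Concretely: if bp's embedded checksum carries sequence
		 * number N in zc_word[ZIL_ZC_SEQ], the zil_chain_t stored
		 * with this block must describe a next block whose checksum
		 * carries N + 1. A mismatch, a hole, or an implausible
		 * zc_nused is treated as the end of the chain and reported
		 * as ECKSUM.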
194 */ 195 cksum.zc_word[ZIL_ZC_SEQ]++; 196 197 if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { 198 zil_chain_t *zilc = abuf->b_data; 199 char *lr = (char *)(zilc + 1); 200 uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); 201 202 if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 203 sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { 204 error = ECKSUM; 205 } else { 206 bcopy(lr, dst, len); 207 *end = (char *)dst + len; 208 *nbp = zilc->zc_next_blk; 209 } 210 } else { 211 char *lr = abuf->b_data; 212 uint64_t size = BP_GET_LSIZE(bp); 213 zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; 214 215 if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, 216 sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || 217 (zilc->zc_nused > (size - sizeof (*zilc)))) { 218 error = ECKSUM; 219 } else { 220 bcopy(lr, dst, zilc->zc_nused); 221 *end = (char *)dst + zilc->zc_nused; 222 *nbp = zilc->zc_next_blk; 223 } 224 } 225 226 VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); 227 } 228 229 return (error); 230 } 231 232 /* 233 * Read a TX_WRITE log data block. 234 */ 235 static int 236 zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) 237 { 238 enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; 239 const blkptr_t *bp = &lr->lr_blkptr; 240 uint32_t aflags = ARC_WAIT; 241 arc_buf_t *abuf = NULL; 242 zbookmark_t zb; 243 int error; 244 245 if (BP_IS_HOLE(bp)) { 246 if (wbuf != NULL) 247 bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); 248 return (0); 249 } 250 251 if (zilog->zl_header->zh_claim_txg == 0) 252 zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; 253 254 SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, 255 ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); 256 257 error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, 258 ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); 259 260 if (error == 0) { 261 if (wbuf != NULL) 262 bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); 263 (void) arc_buf_remove_ref(abuf, &abuf); 264 } 265 266 return (error); 267 } 268 269 /* 270 * Parse the intent log, and call parse_func for each valid record within. 271 */ 272 int 273 zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, 274 zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) 275 { 276 const zil_header_t *zh = zilog->zl_header; 277 boolean_t claimed = !!zh->zh_claim_txg; 278 uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; 279 uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; 280 uint64_t max_blk_seq = 0; 281 uint64_t max_lr_seq = 0; 282 uint64_t blk_count = 0; 283 uint64_t lr_count = 0; 284 blkptr_t blk, next_blk; 285 char *lrbuf, *lrp; 286 int error = 0; 287 288 /* 289 * Old logs didn't record the maximum zh_claim_lr_seq. 290 */ 291 if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) 292 claim_lr_seq = UINT64_MAX; 293 294 /* 295 * Starting at the block pointed to by zh_log we read the log chain. 296 * For each block in the chain we strongly check that block to 297 * ensure its validity. We stop when an invalid block is found. 298 * For each block pointer in the chain we call parse_blk_func(). 299 * For each record in each valid block we call parse_lr_func(). 300 * If the log has been claimed, stop if we encounter a sequence 301 * number greater than the highest claimed sequence number. 
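	 *
	 * On return, zl_parse_error holds the error that ended the walk
	 * (for an intact chain this is simply the checksum mismatch that
	 * marks the end of the log), and zl_parse_blk_seq, zl_parse_lr_seq,
	 * zl_parse_blk_count and zl_parse_lr_count record how far we got;
	 * zil_claim() later stores these in the ZIL header.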
302 */ 303 lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); 304 zil_bp_tree_init(zilog); 305 306 for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { 307 uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; 308 int reclen; 309 char *end; 310 311 if (blk_seq > claim_blk_seq) 312 break; 313 if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) 314 break; 315 ASSERT3U(max_blk_seq, <, blk_seq); 316 max_blk_seq = blk_seq; 317 blk_count++; 318 319 if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) 320 break; 321 322 error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); 323 if (error) 324 break; 325 326 for (lrp = lrbuf; lrp < end; lrp += reclen) { 327 lr_t *lr = (lr_t *)lrp; 328 reclen = lr->lrc_reclen; 329 ASSERT3U(reclen, >=, sizeof (lr_t)); 330 if (lr->lrc_seq > claim_lr_seq) 331 goto done; 332 if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) 333 goto done; 334 ASSERT3U(max_lr_seq, <, lr->lrc_seq); 335 max_lr_seq = lr->lrc_seq; 336 lr_count++; 337 } 338 } 339 done: 340 zilog->zl_parse_error = error; 341 zilog->zl_parse_blk_seq = max_blk_seq; 342 zilog->zl_parse_lr_seq = max_lr_seq; 343 zilog->zl_parse_blk_count = blk_count; 344 zilog->zl_parse_lr_count = lr_count; 345 346 ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || 347 (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); 348 349 zil_bp_tree_fini(zilog); 350 zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); 351 352 return (error); 353 } 354 355 static int 356 zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) 357 { 358 /* 359 * Claim log block if not already committed and not already claimed. 360 * If tx == NULL, just verify that the block is claimable. 361 */ 362 if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) 363 return (0); 364 365 return (zio_wait(zio_claim(NULL, zilog->zl_spa, 366 tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, 367 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); 368 } 369 370 static int 371 zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) 372 { 373 lr_write_t *lr = (lr_write_t *)lrc; 374 int error; 375 376 if (lrc->lrc_txtype != TX_WRITE) 377 return (0); 378 379 /* 380 * If the block is not readable, don't claim it. This can happen 381 * in normal operation when a log block is written to disk before 382 * some of the dmu_sync() blocks it points to. In this case, the 383 * transaction cannot have been committed to anyone (we would have 384 * waited for all writes to be stable first), so it is semantically 385 * correct to declare this the end of the log. 386 */ 387 if (lr->lr_blkptr.blk_birth >= first_txg && 388 (error = zil_read_log_data(zilog, lr, NULL)) != 0) 389 return (error); 390 return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); 391 } 392 393 /* ARGSUSED */ 394 static int 395 zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) 396 { 397 zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); 398 399 return (0); 400 } 401 402 static int 403 zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) 404 { 405 lr_write_t *lr = (lr_write_t *)lrc; 406 blkptr_t *bp = &lr->lr_blkptr; 407 408 /* 409 * If we previously claimed it, we need to free it. 
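	 * The zl_bp_tree insertion below doubles as a "seen" filter, so a
	 * block that appears more than once in the walk is only freed once.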
	 */
	if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
	    bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
		zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);

	return (0);
}

static lwb_t *
zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
{
	lwb_t *lwb;

	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
	lwb->lwb_zilog = zilog;
	lwb->lwb_blk = *bp;
	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
	lwb->lwb_max_txg = txg;
	lwb->lwb_zio = NULL;
	lwb->lwb_tx = NULL;
	if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) {
		lwb->lwb_nused = sizeof (zil_chain_t);
		lwb->lwb_sz = BP_GET_LSIZE(bp);
	} else {
		lwb->lwb_nused = 0;
		lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t);
	}

	mutex_enter(&zilog->zl_lock);
	list_insert_tail(&zilog->zl_lwb_list, lwb);
	mutex_exit(&zilog->zl_lock);

	return (lwb);
}

/*
 * Create an on-disk intent log.
 */
static lwb_t *
zil_create(zilog_t *zilog)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb = NULL;
	uint64_t txg = 0;
	dmu_tx_t *tx = NULL;
	blkptr_t blk;
	int error = 0;

	/*
	 * Wait for any previous destroy to complete.
	 */
	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);

	ASSERT(zh->zh_claim_txg == 0);
	ASSERT(zh->zh_replay_seq == 0);

	blk = zh->zh_log;

	/*
	 * Allocate an initial log block if:
	 *	- there isn't one already
	 *	- the existing block is the wrong endianness
	 */
	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
		tx = dmu_tx_create(zilog->zl_os);
		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
		txg = dmu_tx_get_txg(tx);

		if (!BP_IS_HOLE(&blk)) {
			zio_free_zil(zilog->zl_spa, txg, &blk);
			BP_ZERO(&blk);
		}

		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

		if (error == 0)
			zil_init_log_chain(zilog, &blk);
	}

	/*
	 * Allocate a log write buffer (lwb) for the first log block.
	 */
	if (error == 0)
		lwb = zil_alloc_lwb(zilog, &blk, txg);

	/*
	 * If we just allocated the first log block, commit our transaction
	 * and wait for zil_sync() to stuff the block pointer into zh_log.
	 * (zh is part of the MOS, so we cannot modify it in open context.)
	 */
	if (tx != NULL) {
		dmu_tx_commit(tx);
		txg_wait_synced(zilog->zl_dmu_pool, txg);
	}

	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);

	return (lwb);
}

/*
 * In one tx, free all log blocks and clear the log header.
 * If keep_first is set, then we're replaying a log with no content.
 * We want to keep the first block, however, so that the first
 * synchronous transaction doesn't require a txg_wait_synced()
 * in zil_create(). We don't need to txg_wait_synced() here either
 * when keep_first is set, because both zil_create() and zil_destroy()
 * will wait for any in-progress destroys to complete.
 */
void
zil_destroy(zilog_t *zilog, boolean_t keep_first)
{
	const zil_header_t *zh = zilog->zl_header;
	lwb_t *lwb;
	dmu_tx_t *tx;
	uint64_t txg;

	/*
	 * Wait for any previous destroy to complete.
531 */ 532 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 533 534 zilog->zl_old_header = *zh; /* debugging aid */ 535 536 if (BP_IS_HOLE(&zh->zh_log)) 537 return; 538 539 tx = dmu_tx_create(zilog->zl_os); 540 VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 541 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 542 txg = dmu_tx_get_txg(tx); 543 544 mutex_enter(&zilog->zl_lock); 545 546 ASSERT3U(zilog->zl_destroy_txg, <, txg); 547 zilog->zl_destroy_txg = txg; 548 zilog->zl_keep_first = keep_first; 549 550 if (!list_is_empty(&zilog->zl_lwb_list)) { 551 ASSERT(zh->zh_claim_txg == 0); 552 ASSERT(!keep_first); 553 while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 554 list_remove(&zilog->zl_lwb_list, lwb); 555 if (lwb->lwb_buf != NULL) 556 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 557 zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk); 558 kmem_cache_free(zil_lwb_cache, lwb); 559 } 560 } else if (!keep_first) { 561 (void) zil_parse(zilog, zil_free_log_block, 562 zil_free_log_record, tx, zh->zh_claim_txg); 563 } 564 mutex_exit(&zilog->zl_lock); 565 566 dmu_tx_commit(tx); 567 } 568 569 int 570 zil_claim(const char *osname, void *txarg) 571 { 572 dmu_tx_t *tx = txarg; 573 uint64_t first_txg = dmu_tx_get_txg(tx); 574 zilog_t *zilog; 575 zil_header_t *zh; 576 objset_t *os; 577 int error; 578 579 error = dmu_objset_hold(osname, FTAG, &os); 580 if (error) { 581 cmn_err(CE_WARN, "can't open objset for %s", osname); 582 return (0); 583 } 584 585 zilog = dmu_objset_zil(os); 586 zh = zil_header_in_syncing_context(zilog); 587 588 if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { 589 if (!BP_IS_HOLE(&zh->zh_log)) 590 zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); 591 BP_ZERO(&zh->zh_log); 592 dsl_dataset_dirty(dmu_objset_ds(os), tx); 593 dmu_objset_rele(os, FTAG); 594 return (0); 595 } 596 597 /* 598 * Claim all log blocks if we haven't already done so, and remember 599 * the highest claimed sequence number. This ensures that if we can 600 * read only part of the log now (e.g. due to a missing device), 601 * but we can read the entire log later, we will not try to replay 602 * or destroy beyond the last block we successfully claimed. 603 */ 604 ASSERT3U(zh->zh_claim_txg, <=, first_txg); 605 if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { 606 (void) zil_parse(zilog, zil_claim_log_block, 607 zil_claim_log_record, tx, first_txg); 608 zh->zh_claim_txg = first_txg; 609 zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; 610 zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; 611 if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) 612 zh->zh_flags |= ZIL_REPLAY_NEEDED; 613 zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; 614 dsl_dataset_dirty(dmu_objset_ds(os), tx); 615 } 616 617 ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); 618 dmu_objset_rele(os, FTAG); 619 return (0); 620 } 621 622 /* 623 * Check the log by walking the log chain. 624 * Checksum errors are ok as they indicate the end of the chain. 625 * Any other error (no device or read failure) returns an error. 626 */ 627 int 628 zil_check_log_chain(const char *osname, void *tx) 629 { 630 zilog_t *zilog; 631 objset_t *os; 632 int error; 633 634 ASSERT(tx == NULL); 635 636 error = dmu_objset_hold(osname, FTAG, &os); 637 if (error) { 638 cmn_err(CE_WARN, "can't open objset for %s", osname); 639 return (0); 640 } 641 642 zilog = dmu_objset_zil(os); 643 644 /* 645 * Because tx == NULL, zil_claim_log_block() will not actually claim 646 * any blocks, but just determine whether it is possible to do so. 
	 * In addition to checking the log chain, zil_claim_log_block()
	 * will invoke zio_claim() with a done func of spa_claim_notify(),
	 * which will update spa_max_claim_txg. See spa_load() for details.
	 */
	error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
	    zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));

	dmu_objset_rele(os, FTAG);

	return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}

static int
zil_vdev_compare(const void *x1, const void *x2)
{
	uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev;
	uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev;

	if (v1 < v2)
		return (-1);
	if (v1 > v2)
		return (1);

	return (0);
}

void
zil_add_block(zilog_t *zilog, const blkptr_t *bp)
{
	avl_tree_t *t = &zilog->zl_vdev_tree;
	avl_index_t where;
	zil_vdev_node_t *zv, zvsearch;
	int ndvas = BP_GET_NDVAS(bp);
	int i;

	if (zfs_nocacheflush)
		return;

	ASSERT(zilog->zl_writer);

	/*
	 * Even though we're zl_writer, we still need a lock because the
	 * zl_get_data() callbacks may have dmu_sync() done callbacks
	 * that will run concurrently.
	 */
	mutex_enter(&zilog->zl_vdev_lock);
	for (i = 0; i < ndvas; i++) {
		zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
		if (avl_find(t, &zvsearch, &where) == NULL) {
			zv = kmem_alloc(sizeof (*zv), KM_SLEEP);
			zv->zv_vdev = zvsearch.zv_vdev;
			avl_insert(t, zv, where);
		}
	}
	mutex_exit(&zilog->zl_vdev_lock);
}

void
zil_flush_vdevs(zilog_t *zilog)
{
	spa_t *spa = zilog->zl_spa;
	avl_tree_t *t = &zilog->zl_vdev_tree;
	void *cookie = NULL;
	zil_vdev_node_t *zv;
	zio_t *zio;

	ASSERT(zilog->zl_writer);

	/*
	 * We don't need zl_vdev_lock here because we're the zl_writer,
	 * and all zl_get_data() callbacks are done.
	 */
	if (avl_numnodes(t) == 0)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);

	while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
		vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
		if (vd != NULL)
			zio_flush(zio, vd);
		kmem_free(zv, sizeof (*zv));
	}

	/*
	 * Wait for all the flushes to complete. Not all devices actually
	 * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
	 */
	(void) zio_wait(zio);

	spa_config_exit(spa, SCL_STATE, FTAG);
}

/*
 * Function called when a log block write completes.
 */
static void
zil_lwb_write_done(zio_t *zio)
{
	lwb_t *lwb = zio->io_private;
	zilog_t *zilog = lwb->lwb_zilog;
	dmu_tx_t *tx = lwb->lwb_tx;

	ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
	ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG);
	ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
	ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
	ASSERT(!BP_IS_GANG(zio->io_bp));
	ASSERT(!BP_IS_HOLE(zio->io_bp));
	ASSERT(zio->io_bp->blk_fill == 0);

	/*
	 * Ensure the lwb buffer pointer is cleared before releasing
	 * the txg. If we have had an allocation failure and
	 * the txg is waiting to sync then we want zil_sync()
	 * to remove the lwb so that it's not picked up as the next new
	 * one in zil_commit_writer(). zil_sync() will only remove
	 * the lwb if lwb_buf is null.
767 */ 768 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 769 mutex_enter(&zilog->zl_lock); 770 lwb->lwb_buf = NULL; 771 lwb->lwb_tx = NULL; 772 mutex_exit(&zilog->zl_lock); 773 774 /* 775 * Now that we've written this log block, we have a stable pointer 776 * to the next block in the chain, so it's OK to let the txg in 777 * which we allocated the next block sync. 778 */ 779 dmu_tx_commit(tx); 780 } 781 782 /* 783 * Initialize the io for a log block. 784 */ 785 static void 786 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) 787 { 788 zbookmark_t zb; 789 790 SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], 791 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 792 lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); 793 794 if (zilog->zl_root_zio == NULL) { 795 zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, 796 ZIO_FLAG_CANFAIL); 797 } 798 if (lwb->lwb_zio == NULL) { 799 lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 800 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), 801 zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, 802 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); 803 } 804 } 805 806 /* 807 * Define a limited set of intent log block sizes. 808 * These must be a multiple of 4KB. Note only the amount used (again 809 * aligned to 4KB) actually gets written. However, we can't always just 810 * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. 811 */ 812 uint64_t zil_block_buckets[] = { 813 4096, /* non TX_WRITE */ 814 8192+4096, /* data base */ 815 32*1024 + 4096, /* NFS writes */ 816 UINT64_MAX 817 }; 818 819 /* 820 * Use the slog as long as the logbias is 'latency' and the current commit size 821 * is less than the limit or the total list size is less than 2X the limit. 822 * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. 823 */ 824 uint64_t zil_slog_limit = 1024 * 1024; 825 #define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ 826 (((zilog)->zl_cur_used < zil_slog_limit) || \ 827 ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) 828 829 /* 830 * Start a log block write and advance to the next log block. 831 * Calls are serialized. 832 */ 833 static lwb_t * 834 zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) 835 { 836 lwb_t *nlwb = NULL; 837 zil_chain_t *zilc; 838 spa_t *spa = zilog->zl_spa; 839 blkptr_t *bp; 840 dmu_tx_t *tx; 841 uint64_t txg; 842 uint64_t zil_blksz; 843 int i, error; 844 845 if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { 846 zilc = (zil_chain_t *)lwb->lwb_buf; 847 bp = &zilc->zc_next_blk; 848 } else { 849 zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); 850 bp = &zilc->zc_next_blk; 851 } 852 853 ASSERT(lwb->lwb_nused <= lwb->lwb_sz); 854 855 /* 856 * Allocate the next block and save its address in this block 857 * before writing it in order to establish the log chain. 858 * Note that if the allocation of nlwb synced before we wrote 859 * the block that points at it (lwb), we'd leak it if we crashed. 860 * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 861 * We dirty the dataset to ensure that zil_sync() will be called 862 * to clean up in the event of allocation failure or I/O failure. 863 */ 864 tx = dmu_tx_create(zilog->zl_os); 865 VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 866 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 867 txg = dmu_tx_get_txg(tx); 868 869 lwb->lwb_tx = tx; 870 871 /* 872 * Log blocks are pre-allocated. Here we select the size of the next 873 * block, based on size used in the last block. 
	 * - first find the smallest bucket that will fit the block from a
	 *   limited set of block sizes. This is because it's faster to write
	 *   blocks allocated from the same metaslab as they are adjacent or
	 *   close.
	 * - next find the maximum from the new suggested size and an array of
	 *   previous sizes. This lessens a picket fence effect of wrongly
	 *   guessing the size if we have a stream of say 2k, 64k, 2k, 64k
	 *   requests.
	 *
	 * Note we only write what is used, but we can't just allocate
	 * the maximum block size because we can exhaust the available
	 * pool log space.
	 */
	zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t);
	for (i = 0; zil_blksz > zil_block_buckets[i]; i++)
		continue;
	zil_blksz = zil_block_buckets[i];
	if (zil_blksz == UINT64_MAX)
		zil_blksz = SPA_MAXBLOCKSIZE;
	zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
	for (i = 0; i < ZIL_PREV_BLKS; i++)
		zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
	zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1);

	BP_ZERO(bp);
	/* pass the old blkptr in order to spread log blocks across devs */
	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
	    USE_SLOG(zilog));
	if (!error) {
		ASSERT3U(bp->blk_birth, ==, txg);
		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
		bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++;

		/*
		 * Allocate a new log write buffer (lwb).
		 */
		nlwb = zil_alloc_lwb(zilog, bp, txg);

		/* Record the block for later vdev flushing */
		zil_add_block(zilog, &lwb->lwb_blk);
	}

	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
		uint64_t len;

		/* For Slim ZIL only write what is used. */
		len = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t);
		ASSERT3U(len, <=, lwb->lwb_sz);
		zio_shrink(lwb->lwb_zio, len);

	}
	zilc->zc_pad = 0;
	zilc->zc_nused = lwb->lwb_nused;
	zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum;

	zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */

	/*
	 * If there was an allocation failure then nlwb will be null which
	 * forces a txg_wait_synced().
	 */
	return (nlwb);
}

static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
	lr_t *lrc = &itx->itx_lr; /* common log record */
	lr_write_t *lrw = (lr_write_t *)lrc;
	char *lr_buf;
	uint64_t txg = lrc->lrc_txg;
	uint64_t reclen = lrc->lrc_reclen;
	uint64_t dlen = 0;

	if (lwb == NULL)
		return (NULL);

	ASSERT(lwb->lwb_buf != NULL);

	if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
		dlen = P2ROUNDUP_TYPED(
		    lrw->lr_length, sizeof (uint64_t), uint64_t);

	zilog->zl_cur_used += (reclen + dlen);

	zil_lwb_write_init(zilog, lwb);

	/*
	 * If this record won't fit in the current log block, start a new one.
	 */
	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
		lwb = zil_lwb_write_start(zilog, lwb);
		if (lwb == NULL)
			return (NULL);
		zil_lwb_write_init(zilog, lwb);
		ASSERT(LWB_EMPTY(lwb));
		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
			txg_wait_synced(zilog->zl_dmu_pool, txg);
			return (lwb);
		}
	}

	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
	bcopy(lrc, lr_buf, reclen);
	lrc = (lr_t *)lr_buf;
	lrw = (lr_write_t *)lrc;

	/*
	 * If it's a write, fetch the data or get its blkptr as appropriate.
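	 *
	 * WR_COPIED itxs already carry the data inside the log record, so
	 * there is nothing more to fetch. WR_NEED_COPY itxs copy the file
	 * data into this log block via zl_get_data(), and WR_INDIRECT itxs
	 * pass a NULL buffer so that zl_get_data()/dmu_sync() just fills in
	 * lr_blkptr.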
983 */ 984 if (lrc->lrc_txtype == TX_WRITE) { 985 if (txg > spa_freeze_txg(zilog->zl_spa)) 986 txg_wait_synced(zilog->zl_dmu_pool, txg); 987 if (itx->itx_wr_state != WR_COPIED) { 988 char *dbuf; 989 int error; 990 991 if (dlen) { 992 ASSERT(itx->itx_wr_state == WR_NEED_COPY); 993 dbuf = lr_buf + reclen; 994 lrw->lr_common.lrc_reclen += dlen; 995 } else { 996 ASSERT(itx->itx_wr_state == WR_INDIRECT); 997 dbuf = NULL; 998 } 999 error = zilog->zl_get_data( 1000 itx->itx_private, lrw, dbuf, lwb->lwb_zio); 1001 if (error == EIO) { 1002 txg_wait_synced(zilog->zl_dmu_pool, txg); 1003 return (lwb); 1004 } 1005 if (error) { 1006 ASSERT(error == ENOENT || error == EEXIST || 1007 error == EALREADY); 1008 return (lwb); 1009 } 1010 } 1011 } 1012 1013 /* 1014 * We're actually making an entry, so update lrc_seq to be the 1015 * log record sequence number. Note that this is generally not 1016 * equal to the itx sequence number because not all transactions 1017 * are synchronous, and sometimes spa_sync() gets there first. 1018 */ 1019 lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ 1020 lwb->lwb_nused += reclen + dlen; 1021 lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); 1022 ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); 1023 ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); 1024 1025 return (lwb); 1026 } 1027 1028 itx_t * 1029 zil_itx_create(uint64_t txtype, size_t lrsize) 1030 { 1031 itx_t *itx; 1032 1033 lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); 1034 1035 itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); 1036 itx->itx_lr.lrc_txtype = txtype; 1037 itx->itx_lr.lrc_reclen = lrsize; 1038 itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ 1039 itx->itx_lr.lrc_seq = 0; /* defensive */ 1040 1041 return (itx); 1042 } 1043 1044 void 1045 zil_itx_destroy(itx_t *itx) 1046 { 1047 kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); 1048 } 1049 1050 uint64_t 1051 zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) 1052 { 1053 uint64_t seq; 1054 1055 ASSERT(itx->itx_lr.lrc_seq == 0); 1056 ASSERT(!zilog->zl_replay); 1057 1058 mutex_enter(&zilog->zl_lock); 1059 list_insert_tail(&zilog->zl_itx_list, itx); 1060 zilog->zl_itx_list_sz += itx->itx_sod; 1061 itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); 1062 itx->itx_lr.lrc_seq = seq = ++zilog->zl_itx_seq; 1063 mutex_exit(&zilog->zl_lock); 1064 1065 return (seq); 1066 } 1067 1068 /* 1069 * Free up all in-memory intent log transactions that have now been synced. 1070 */ 1071 static void 1072 zil_itx_clean(zilog_t *zilog) 1073 { 1074 uint64_t synced_txg = spa_last_synced_txg(zilog->zl_spa); 1075 uint64_t freeze_txg = spa_freeze_txg(zilog->zl_spa); 1076 list_t clean_list; 1077 itx_t *itx; 1078 1079 list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); 1080 1081 mutex_enter(&zilog->zl_lock); 1082 /* wait for a log writer to finish walking list */ 1083 while (zilog->zl_writer) { 1084 cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1085 } 1086 1087 /* 1088 * Move the sync'd log transactions to a separate list so we can call 1089 * kmem_free without holding the zl_lock. 
1090 * 1091 * There is no need to set zl_writer as we don't drop zl_lock here 1092 */ 1093 while ((itx = list_head(&zilog->zl_itx_list)) != NULL && 1094 itx->itx_lr.lrc_txg <= MIN(synced_txg, freeze_txg)) { 1095 list_remove(&zilog->zl_itx_list, itx); 1096 zilog->zl_itx_list_sz -= itx->itx_sod; 1097 list_insert_tail(&clean_list, itx); 1098 } 1099 cv_broadcast(&zilog->zl_cv_writer); 1100 mutex_exit(&zilog->zl_lock); 1101 1102 /* destroy sync'd log transactions */ 1103 while ((itx = list_head(&clean_list)) != NULL) { 1104 list_remove(&clean_list, itx); 1105 zil_itx_destroy(itx); 1106 } 1107 list_destroy(&clean_list); 1108 } 1109 1110 /* 1111 * If there are any in-memory intent log transactions which have now been 1112 * synced then start up a taskq to free them. 1113 */ 1114 void 1115 zil_clean(zilog_t *zilog) 1116 { 1117 itx_t *itx; 1118 1119 mutex_enter(&zilog->zl_lock); 1120 itx = list_head(&zilog->zl_itx_list); 1121 if ((itx != NULL) && 1122 (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { 1123 (void) taskq_dispatch(zilog->zl_clean_taskq, 1124 (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP); 1125 } 1126 mutex_exit(&zilog->zl_lock); 1127 } 1128 1129 static void 1130 zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) 1131 { 1132 uint64_t txg; 1133 uint64_t commit_seq = 0; 1134 itx_t *itx, *itx_next; 1135 lwb_t *lwb; 1136 spa_t *spa; 1137 int error = 0; 1138 1139 zilog->zl_writer = B_TRUE; 1140 ASSERT(zilog->zl_root_zio == NULL); 1141 spa = zilog->zl_spa; 1142 1143 if (zilog->zl_suspend) { 1144 lwb = NULL; 1145 } else { 1146 lwb = list_tail(&zilog->zl_lwb_list); 1147 if (lwb == NULL) { 1148 /* 1149 * Return if there's nothing to flush before we 1150 * dirty the fs by calling zil_create() 1151 */ 1152 if (list_is_empty(&zilog->zl_itx_list)) { 1153 zilog->zl_writer = B_FALSE; 1154 return; 1155 } 1156 mutex_exit(&zilog->zl_lock); 1157 lwb = zil_create(zilog); 1158 mutex_enter(&zilog->zl_lock); 1159 } 1160 } 1161 ASSERT(lwb == NULL || lwb->lwb_zio == NULL); 1162 1163 /* Loop through in-memory log transactions filling log blocks. */ 1164 DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); 1165 1166 for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) { 1167 /* 1168 * Save the next pointer. Even though we drop zl_lock below, 1169 * all threads that can remove itx list entries (other writers 1170 * and zil_itx_clean()) can't do so until they have zl_writer. 1171 */ 1172 itx_next = list_next(&zilog->zl_itx_list, itx); 1173 1174 /* 1175 * Determine whether to push this itx. 1176 * Push all transactions related to specified foid and 1177 * all other transactions except those that can be logged 1178 * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL) 1179 * for all other files. 1180 * 1181 * If foid == 0 (meaning "push all foids") or 1182 * itx->itx_sync is set (meaning O_[D]SYNC), push regardless. 
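		 *
		 * For example, an fsync() of object 123 pushes every itx
		 * for foid 123, plus every itx that cannot be reordered,
		 * while asynchronous TX_WRITE itxs for other files are left
		 * on the list for a later commit.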
		 */
		if (foid != 0 && !itx->itx_sync &&
		    TX_OOO(itx->itx_lr.lrc_txtype) &&
		    ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid)
			continue; /* skip this record */

		if ((itx->itx_lr.lrc_seq > seq) &&
		    ((lwb == NULL) || (LWB_EMPTY(lwb)) ||
		    (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz)))
			break;

		list_remove(&zilog->zl_itx_list, itx);
		zilog->zl_itx_list_sz -= itx->itx_sod;

		mutex_exit(&zilog->zl_lock);

		txg = itx->itx_lr.lrc_txg;
		ASSERT(txg);

		if (txg > spa_last_synced_txg(spa) ||
		    txg > spa_freeze_txg(spa))
			lwb = zil_lwb_commit(zilog, itx, lwb);

		zil_itx_destroy(itx);

		mutex_enter(&zilog->zl_lock);
	}
	DTRACE_PROBE1(zil__cw2, zilog_t *, zilog);
	/* determine commit sequence number */
	itx = list_head(&zilog->zl_itx_list);
	if (itx)
		commit_seq = itx->itx_lr.lrc_seq - 1;
	else
		commit_seq = zilog->zl_itx_seq;
	mutex_exit(&zilog->zl_lock);

	/* write the last block out */
	if (lwb != NULL && lwb->lwb_zio != NULL)
		lwb = zil_lwb_write_start(zilog, lwb);

	zilog->zl_prev_used = zilog->zl_cur_used;
	zilog->zl_cur_used = 0;

	/*
	 * Wait if necessary for the log blocks to be on stable storage.
	 */
	if (zilog->zl_root_zio) {
		DTRACE_PROBE1(zil__cw3, zilog_t *, zilog);
		error = zio_wait(zilog->zl_root_zio);
		zilog->zl_root_zio = NULL;
		DTRACE_PROBE1(zil__cw4, zilog_t *, zilog);
		zil_flush_vdevs(zilog);
	}

	if (error || lwb == NULL)
		txg_wait_synced(zilog->zl_dmu_pool, 0);

	mutex_enter(&zilog->zl_lock);
	zilog->zl_writer = B_FALSE;

	ASSERT3U(commit_seq, >=, zilog->zl_commit_seq);
	zilog->zl_commit_seq = commit_seq;

	/*
	 * Remember the highest committed log sequence number for ztest.
	 * We only update this value when all the log writes succeeded,
	 * because ztest wants to ASSERT that it got the whole log chain.
	 */
	if (error == 0 && lwb != NULL)
		zilog->zl_commit_lr_seq = zilog->zl_lr_seq;
}

/*
 * Push zfs transactions to stable storage up to the supplied sequence number.
 * If foid is 0 push out all transactions, otherwise push only those
 * for that file or those that might have been used to create that file.
 */
void
zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid)
{
	if (zilog == NULL || seq == 0)
		return;

	mutex_enter(&zilog->zl_lock);

	seq = MIN(seq, zilog->zl_itx_seq); /* cap seq at largest itx seq */

	while (zilog->zl_writer) {
		cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock);
		if (seq <= zilog->zl_commit_seq) {
			mutex_exit(&zilog->zl_lock);
			return;
		}
	}
	zil_commit_writer(zilog, seq, foid); /* drops zl_lock */
	/* wake up others waiting on the commit */
	cv_broadcast(&zilog->zl_cv_writer);
	mutex_exit(&zilog->zl_lock);
}

/*
 * Report whether all transactions are committed.
1285 */ 1286 static boolean_t 1287 zil_is_committed(zilog_t *zilog) 1288 { 1289 lwb_t *lwb; 1290 boolean_t committed; 1291 1292 mutex_enter(&zilog->zl_lock); 1293 1294 while (zilog->zl_writer) 1295 cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1296 1297 if (!list_is_empty(&zilog->zl_itx_list)) 1298 committed = B_FALSE; /* unpushed transactions */ 1299 else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL) 1300 committed = B_TRUE; /* intent log never used */ 1301 else if (list_next(&zilog->zl_lwb_list, lwb) != NULL) 1302 committed = B_FALSE; /* zil_sync() not done yet */ 1303 else 1304 committed = B_TRUE; /* everything synced */ 1305 1306 mutex_exit(&zilog->zl_lock); 1307 return (committed); 1308 } 1309 1310 /* 1311 * Called in syncing context to free committed log blocks and update log header. 1312 */ 1313 void 1314 zil_sync(zilog_t *zilog, dmu_tx_t *tx) 1315 { 1316 zil_header_t *zh = zil_header_in_syncing_context(zilog); 1317 uint64_t txg = dmu_tx_get_txg(tx); 1318 spa_t *spa = zilog->zl_spa; 1319 uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; 1320 lwb_t *lwb; 1321 1322 /* 1323 * We don't zero out zl_destroy_txg, so make sure we don't try 1324 * to destroy it twice. 1325 */ 1326 if (spa_sync_pass(spa) != 1) 1327 return; 1328 1329 mutex_enter(&zilog->zl_lock); 1330 1331 ASSERT(zilog->zl_stop_sync == 0); 1332 1333 if (*replayed_seq != 0) { 1334 ASSERT(zh->zh_replay_seq < *replayed_seq); 1335 zh->zh_replay_seq = *replayed_seq; 1336 *replayed_seq = 0; 1337 } 1338 1339 if (zilog->zl_destroy_txg == txg) { 1340 blkptr_t blk = zh->zh_log; 1341 1342 ASSERT(list_head(&zilog->zl_lwb_list) == NULL); 1343 1344 bzero(zh, sizeof (zil_header_t)); 1345 bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); 1346 1347 if (zilog->zl_keep_first) { 1348 /* 1349 * If this block was part of log chain that couldn't 1350 * be claimed because a device was missing during 1351 * zil_claim(), but that device later returns, 1352 * then this block could erroneously appear valid. 1353 * To guard against this, assign a new GUID to the new 1354 * log chain so it doesn't matter what blk points to. 1355 */ 1356 zil_init_log_chain(zilog, &blk); 1357 zh->zh_log = blk; 1358 } 1359 } 1360 1361 while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 1362 zh->zh_log = lwb->lwb_blk; 1363 if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) 1364 break; 1365 list_remove(&zilog->zl_lwb_list, lwb); 1366 zio_free_zil(spa, txg, &lwb->lwb_blk); 1367 kmem_cache_free(zil_lwb_cache, lwb); 1368 1369 /* 1370 * If we don't have anything left in the lwb list then 1371 * we've had an allocation failure and we need to zero 1372 * out the zil_header blkptr so that we don't end 1373 * up freeing the same block twice. 
1374 */ 1375 if (list_head(&zilog->zl_lwb_list) == NULL) 1376 BP_ZERO(&zh->zh_log); 1377 } 1378 mutex_exit(&zilog->zl_lock); 1379 } 1380 1381 void 1382 zil_init(void) 1383 { 1384 zil_lwb_cache = kmem_cache_create("zil_lwb_cache", 1385 sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); 1386 } 1387 1388 void 1389 zil_fini(void) 1390 { 1391 kmem_cache_destroy(zil_lwb_cache); 1392 } 1393 1394 void 1395 zil_set_logbias(zilog_t *zilog, uint64_t logbias) 1396 { 1397 zilog->zl_logbias = logbias; 1398 } 1399 1400 zilog_t * 1401 zil_alloc(objset_t *os, zil_header_t *zh_phys) 1402 { 1403 zilog_t *zilog; 1404 1405 zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); 1406 1407 zilog->zl_header = zh_phys; 1408 zilog->zl_os = os; 1409 zilog->zl_spa = dmu_objset_spa(os); 1410 zilog->zl_dmu_pool = dmu_objset_pool(os); 1411 zilog->zl_destroy_txg = TXG_INITIAL - 1; 1412 zilog->zl_logbias = dmu_objset_logbias(os); 1413 1414 mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); 1415 1416 list_create(&zilog->zl_itx_list, sizeof (itx_t), 1417 offsetof(itx_t, itx_node)); 1418 1419 list_create(&zilog->zl_lwb_list, sizeof (lwb_t), 1420 offsetof(lwb_t, lwb_node)); 1421 1422 mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); 1423 1424 avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, 1425 sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); 1426 1427 cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); 1428 cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); 1429 1430 return (zilog); 1431 } 1432 1433 void 1434 zil_free(zilog_t *zilog) 1435 { 1436 lwb_t *lwb; 1437 1438 zilog->zl_stop_sync = 1; 1439 1440 while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { 1441 list_remove(&zilog->zl_lwb_list, lwb); 1442 if (lwb->lwb_buf != NULL) 1443 zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); 1444 kmem_cache_free(zil_lwb_cache, lwb); 1445 } 1446 list_destroy(&zilog->zl_lwb_list); 1447 1448 avl_destroy(&zilog->zl_vdev_tree); 1449 mutex_destroy(&zilog->zl_vdev_lock); 1450 1451 ASSERT(list_head(&zilog->zl_itx_list) == NULL); 1452 list_destroy(&zilog->zl_itx_list); 1453 mutex_destroy(&zilog->zl_lock); 1454 1455 cv_destroy(&zilog->zl_cv_writer); 1456 cv_destroy(&zilog->zl_cv_suspend); 1457 1458 kmem_free(zilog, sizeof (zilog_t)); 1459 } 1460 1461 /* 1462 * Open an intent log. 1463 */ 1464 zilog_t * 1465 zil_open(objset_t *os, zil_get_data_t *get_data) 1466 { 1467 zilog_t *zilog = dmu_objset_zil(os); 1468 1469 zilog->zl_get_data = get_data; 1470 zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 1471 2, 2, TASKQ_PREPOPULATE); 1472 1473 return (zilog); 1474 } 1475 1476 /* 1477 * Close an intent log. 1478 */ 1479 void 1480 zil_close(zilog_t *zilog) 1481 { 1482 /* 1483 * If the log isn't already committed, mark the objset dirty 1484 * (so zil_sync() will be called) and wait for that txg to sync. 1485 */ 1486 if (!zil_is_committed(zilog)) { 1487 uint64_t txg; 1488 dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); 1489 VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); 1490 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1491 txg = dmu_tx_get_txg(tx); 1492 dmu_tx_commit(tx); 1493 txg_wait_synced(zilog->zl_dmu_pool, txg); 1494 } 1495 1496 taskq_destroy(zilog->zl_clean_taskq); 1497 zilog->zl_clean_taskq = NULL; 1498 zilog->zl_get_data = NULL; 1499 1500 zil_itx_clean(zilog); 1501 ASSERT(list_head(&zilog->zl_itx_list) == NULL); 1502 } 1503 1504 /* 1505 * Suspend an intent log. While in suspended mode, we still honor 1506 * synchronous semantics, but we rely on txg_wait_synced() to do it. 
1507 * We suspend the log briefly when taking a snapshot so that the snapshot 1508 * contains all the data it's supposed to, and has an empty intent log. 1509 */ 1510 int 1511 zil_suspend(zilog_t *zilog) 1512 { 1513 const zil_header_t *zh = zilog->zl_header; 1514 1515 mutex_enter(&zilog->zl_lock); 1516 if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ 1517 mutex_exit(&zilog->zl_lock); 1518 return (EBUSY); 1519 } 1520 if (zilog->zl_suspend++ != 0) { 1521 /* 1522 * Someone else already began a suspend. 1523 * Just wait for them to finish. 1524 */ 1525 while (zilog->zl_suspending) 1526 cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); 1527 mutex_exit(&zilog->zl_lock); 1528 return (0); 1529 } 1530 zilog->zl_suspending = B_TRUE; 1531 mutex_exit(&zilog->zl_lock); 1532 1533 zil_commit(zilog, UINT64_MAX, 0); 1534 1535 /* 1536 * Wait for any in-flight log writes to complete. 1537 */ 1538 mutex_enter(&zilog->zl_lock); 1539 while (zilog->zl_writer) 1540 cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); 1541 mutex_exit(&zilog->zl_lock); 1542 1543 zil_destroy(zilog, B_FALSE); 1544 1545 mutex_enter(&zilog->zl_lock); 1546 zilog->zl_suspending = B_FALSE; 1547 cv_broadcast(&zilog->zl_cv_suspend); 1548 mutex_exit(&zilog->zl_lock); 1549 1550 return (0); 1551 } 1552 1553 void 1554 zil_resume(zilog_t *zilog) 1555 { 1556 mutex_enter(&zilog->zl_lock); 1557 ASSERT(zilog->zl_suspend != 0); 1558 zilog->zl_suspend--; 1559 mutex_exit(&zilog->zl_lock); 1560 } 1561 1562 typedef struct zil_replay_arg { 1563 zil_replay_func_t **zr_replay; 1564 void *zr_arg; 1565 boolean_t zr_byteswap; 1566 char *zr_lr; 1567 } zil_replay_arg_t; 1568 1569 static int 1570 zil_replay_error(zilog_t *zilog, lr_t *lr, int error) 1571 { 1572 char name[MAXNAMELEN]; 1573 1574 zilog->zl_replaying_seq--; /* didn't actually replay this one */ 1575 1576 dmu_objset_name(zilog->zl_os, name); 1577 1578 cmn_err(CE_WARN, "ZFS replay transaction error %d, " 1579 "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, 1580 (u_longlong_t)lr->lrc_seq, 1581 (u_longlong_t)(lr->lrc_txtype & ~TX_CI), 1582 (lr->lrc_txtype & TX_CI) ? "CI" : ""); 1583 1584 return (error); 1585 } 1586 1587 static int 1588 zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) 1589 { 1590 zil_replay_arg_t *zr = zra; 1591 const zil_header_t *zh = zilog->zl_header; 1592 uint64_t reclen = lr->lrc_reclen; 1593 uint64_t txtype = lr->lrc_txtype; 1594 int error = 0; 1595 1596 zilog->zl_replaying_seq = lr->lrc_seq; 1597 1598 if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ 1599 return (0); 1600 1601 if (lr->lrc_txg < claim_txg) /* already committed */ 1602 return (0); 1603 1604 /* Strip case-insensitive bit, still present in log record */ 1605 txtype &= ~TX_CI; 1606 1607 if (txtype == 0 || txtype >= TX_MAX_TYPE) 1608 return (zil_replay_error(zilog, lr, EINVAL)); 1609 1610 /* 1611 * If this record type can be logged out of order, the object 1612 * (lr_foid) may no longer exist. That's legitimate, not an error. 1613 */ 1614 if (TX_OOO(txtype)) { 1615 error = dmu_object_info(zilog->zl_os, 1616 ((lr_ooo_t *)lr)->lr_foid, NULL); 1617 if (error == ENOENT || error == EEXIST) 1618 return (0); 1619 } 1620 1621 /* 1622 * Make a copy of the data so we can revise and extend it. 1623 */ 1624 bcopy(lr, zr->zr_lr, reclen); 1625 1626 /* 1627 * If this is a TX_WRITE with a blkptr, suck in the data. 
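	 * Only indirect (WR_INDIRECT) writes are logged as a bare
	 * lr_write_t, hence the reclen check below; copied writes already
	 * carry their data in the record itself, appended after the
	 * lr_write_t.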
1628 */ 1629 if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { 1630 error = zil_read_log_data(zilog, (lr_write_t *)lr, 1631 zr->zr_lr + reclen); 1632 if (error) 1633 return (zil_replay_error(zilog, lr, error)); 1634 } 1635 1636 /* 1637 * The log block containing this lr may have been byteswapped 1638 * so that we can easily examine common fields like lrc_txtype. 1639 * However, the log is a mix of different record types, and only the 1640 * replay vectors know how to byteswap their records. Therefore, if 1641 * the lr was byteswapped, undo it before invoking the replay vector. 1642 */ 1643 if (zr->zr_byteswap) 1644 byteswap_uint64_array(zr->zr_lr, reclen); 1645 1646 /* 1647 * We must now do two things atomically: replay this log record, 1648 * and update the log header sequence number to reflect the fact that 1649 * we did so. At the end of each replay function the sequence number 1650 * is updated if we are in replay mode. 1651 */ 1652 error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); 1653 if (error) { 1654 /* 1655 * The DMU's dnode layer doesn't see removes until the txg 1656 * commits, so a subsequent claim can spuriously fail with 1657 * EEXIST. So if we receive any error we try syncing out 1658 * any removes then retry the transaction. Note that we 1659 * specify B_FALSE for byteswap now, so we don't do it twice. 1660 */ 1661 txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); 1662 error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); 1663 if (error) 1664 return (zil_replay_error(zilog, lr, error)); 1665 } 1666 return (0); 1667 } 1668 1669 /* ARGSUSED */ 1670 static int 1671 zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) 1672 { 1673 zilog->zl_replay_blks++; 1674 1675 return (0); 1676 } 1677 1678 /* 1679 * If this dataset has a non-empty intent log, replay it and destroy it. 1680 */ 1681 void 1682 zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) 1683 { 1684 zilog_t *zilog = dmu_objset_zil(os); 1685 const zil_header_t *zh = zilog->zl_header; 1686 zil_replay_arg_t zr; 1687 1688 if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { 1689 zil_destroy(zilog, B_TRUE); 1690 return; 1691 } 1692 1693 zr.zr_replay = replay_func; 1694 zr.zr_arg = arg; 1695 zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); 1696 zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); 1697 1698 /* 1699 * Wait for in-progress removes to sync before starting replay. 
1700 */ 1701 txg_wait_synced(zilog->zl_dmu_pool, 0); 1702 1703 zilog->zl_replay = B_TRUE; 1704 zilog->zl_replay_time = ddi_get_lbolt(); 1705 ASSERT(zilog->zl_replay_blks == 0); 1706 (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, 1707 zh->zh_claim_txg); 1708 kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); 1709 1710 zil_destroy(zilog, B_FALSE); 1711 txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); 1712 zilog->zl_replay = B_FALSE; 1713 } 1714 1715 boolean_t 1716 zil_replaying(zilog_t *zilog, dmu_tx_t *tx) 1717 { 1718 if (zilog == NULL) 1719 return (B_TRUE); 1720 1721 if (zilog->zl_replay) { 1722 dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); 1723 zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = 1724 zilog->zl_replaying_seq; 1725 return (B_TRUE); 1726 } 1727 1728 return (B_FALSE); 1729 } 1730 1731 /* ARGSUSED */ 1732 int 1733 zil_vdev_offline(const char *osname, void *arg) 1734 { 1735 objset_t *os; 1736 zilog_t *zilog; 1737 int error; 1738 1739 error = dmu_objset_hold(osname, FTAG, &os); 1740 if (error) 1741 return (error); 1742 1743 zilog = dmu_objset_zil(os); 1744 if (zil_suspend(zilog) != 0) 1745 error = EEXIST; 1746 else 1747 zil_resume(zilog); 1748 dmu_objset_rele(os, FTAG); 1749 return (error); 1750 } 1751