/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
 * Copyright 2016 Gary Mills
 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
 * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/dsl_scan.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_synctask.h>
#include <sys/dnode.h>
#include <sys/dmu_tx.h>
#include <sys/dmu_objset.h>
#include <sys/arc.h>
#include <sys/arc_impl.h>
#include <sys/zap.h>
#include <sys/zio.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_znode.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zil_impl.h>
#include <sys/zio_checksum.h>
#include <sys/ddt.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
#include <sys/range_tree.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif

/*
 * Grand theory statement on scan queue sorting
 *
 * Scanning is implemented by recursively traversing all indirection levels
 * in an object and reading all blocks referenced from said objects. This
 * results in us approximately traversing the object from lowest logical
 * offset to the highest. For best performance, we would want the logical
 * blocks to be physically contiguous. However, this is frequently not the
 * case with pools given the allocation patterns of copy-on-write filesystems.
 * So instead, we put the I/Os into a reordering queue and issue them in a
 * way that will most benefit physical disks (LBA-order).
 *
 * Queue management:
 *
 * Ideally, we would want to scan all metadata and queue up all block I/O
 * prior to starting to issue it, because that allows us to do an optimal
 * sorting job. This can however consume large amounts of memory. Therefore
 * we continuously monitor the size of the queues and constrain them to 5%
 * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
 * limit, we clear out a few of the largest extents at the head of the queues
 * to make room for more scanning. Hopefully, these extents will be fairly
 * large and contiguous, allowing us to approach sequential I/O throughput
 * even without a fully sorted tree.
 *
 * Metadata scanning takes place in dsl_scan_visit(), which is called from
 * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
 * metadata on the pool, or we need to make room in memory because our
 * queues are too large, dsl_scan_visit() is postponed and
 * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
 * that metadata scanning and queued I/O issuing are mutually exclusive. This
 * allows us to provide maximum sequential I/O throughput for the majority of
 * I/O's issued since sequential I/O performance is significantly negatively
 * impacted if it is interleaved with random I/O.
 *
 * Implementation Notes
 *
 * One side effect of the queued scanning algorithm is that the scanning code
 * needs to be notified whenever a block is freed. This is needed to allow
 * the scanning code to remove these I/Os from the issuing queue. Additionally,
 * we do not attempt to queue gang blocks to be issued sequentially since this
 * is very hard to do and would have an extremely limited performance benefit.
 * Instead, we simply issue gang I/Os as soon as we find them using the legacy
 * algorithm.
 *
 * Backwards compatibility
 *
 * This new algorithm is backwards compatible with the legacy on-disk data
 * structures (and therefore does not require a new feature flag).
 * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
 * will stop scanning metadata (in logical order) and wait for all outstanding
 * sorted I/O to complete. Once this is done, we write out a checkpoint
 * bookmark, indicating that we have scanned everything logically before it.
 * If the pool is imported on a machine without the new sorting algorithm,
 * the scan simply resumes from the last checkpoint using the legacy algorithm.
 */
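
/*
 * Illustrative sketch of the alternation described above; this is not code
 * from this file, and scan_metadata_done()/over_memory_limit() are
 * hypothetical placeholders (the real decision logic lives in
 * dsl_scan_sync(), dsl_scan_visit() and scan_io_queues_run() below):
 *
 *	if (scan_metadata_done(scn) || over_memory_limit(scn))
 *		scan_io_queues_run(scn);	// issue queued, sorted I/O
 *	else
 *		dsl_scan_visit(scn, tx);	// scan metadata, queue I/O
 *
 * A checkpoint (drain the queues, then persist a bookmark) is taken roughly
 * every zfs_scan_checkpoint_intval seconds so that older code can resume
 * the scan from it.
 */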

typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
    const zbookmark_phys_t *);

static scan_cb_t dsl_scan_scrub_cb;

static int scan_ds_queue_compare(const void *a, const void *b);
static int scan_prefetch_queue_compare(const void *a, const void *b);
static void scan_ds_queue_clear(dsl_scan_t *scn);
static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn);
static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
    uint64_t *txg);
static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
static uint64_t dsl_scan_count_data_disks(spa_t *spa);

extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
static int zfs_scan_blkstats = 0;

/*
 * 'zpool status' uses bytes processed per pass to report throughput and
 * estimate time remaining. We define a pass to start when the scanning
 * phase completes for a sequential resilver. Optionally, this value
 * may be used to reset the pass statistics every N txgs to provide an
 * estimated completion time based on currently observed performance.
 */
static uint_t zfs_scan_report_txgs = 0;

/*
 * By default zfs will check to ensure it is not over the hard memory
 * limit before each txg. If finer-grained control of this is needed
 * this value can be set to 1 to enable checking before scanning each
 * block.
 */
static int zfs_scan_strict_mem_lim = B_FALSE;

/*
 * Maximum amount of in-flight scan I/O, in bytes, per leaf vdev. We attempt
 * to strike a balance here between keeping the vdev queues full of I/Os
 * at all times and not overflowing the queues to cause long latency,
 * which would cause long txg sync times. No matter what, we will not
 * overload the drives with I/O, since that is protected by
 * zfs_vdev_scrub_max_active.
 */
static uint64_t zfs_scan_vdev_limit = 16 << 20;

static uint_t zfs_scan_issue_strategy = 0;

/* don't queue & sort zios, go direct */
static int zfs_scan_legacy = B_FALSE;
static uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */

/*
 * fill_weight is non-tunable at runtime, so we copy it at module init from
 * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
 * break queue sorting.
 */
static uint_t zfs_scan_fill_weight = 3;
static uint64_t fill_weight;

/* See dsl_scan_should_clear() for details on the memory limit tunables */
static const uint64_t zfs_scan_mem_lim_min = 16 << 20;	/* bytes */
static const uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;	/* bytes */

/* fraction of physmem */
static uint_t zfs_scan_mem_lim_fact = 20;

/* fraction of mem lim above */
static uint_t zfs_scan_mem_lim_soft_fact = 20;

/* minimum milliseconds to scrub per txg */
static uint_t zfs_scrub_min_time_ms = 1000;

/* minimum milliseconds to obsolete per txg */
static uint_t zfs_obsolete_min_time_ms = 500;

/* minimum milliseconds to free per txg */
static uint_t zfs_free_min_time_ms = 1000;

/* minimum milliseconds to resilver per txg */
static uint_t zfs_resilver_min_time_ms = 3000;

static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */
int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
static const enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
/* max number of blocks to free in a single TXG */
static uint64_t zfs_async_block_max_blocks = UINT64_MAX;
/* max number of dedup blocks to free in a single TXG */
static uint64_t zfs_max_async_dedup_frees = 100000;

/* set to disable resilver deferring */
static int zfs_resilver_disable_defer = B_FALSE;

/*
 * We wait a few txgs after importing a pool to begin scanning so that
 * the import / mounting code isn't held up by scrub / resilver IO.
 * Unfortunately, it is a bit difficult to determine exactly how long
 * this will take since userspace will trigger fs mounts asynchronously
 * and the kernel will create zvol minors asynchronously. As a result,
 * the value provided here is a bit arbitrary, but represents a
 * reasonable estimate of how many txgs it will take to finish fully
 * importing a pool.
 */
#define	SCAN_IMPORT_WAIT_TXGS		5

#define	DSL_SCAN_IS_SCRUB_RESILVER(scn) \
	((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
	(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)

/*
 * Enable/disable the processing of the free_bpobj object.
 */
static int zfs_free_bpobj_enabled = 1;

/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
	NULL,
	dsl_scan_scrub_cb,	/* POOL_SCAN_SCRUB */
	dsl_scan_scrub_cb,	/* POOL_SCAN_RESILVER */
};
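
/*
 * Sketch of how the table above is consumed (this mirrors the call made in
 * dsl_scan_zil_block() further down; objset/object/level/blkid and bp are
 * stand-ins, not variables defined here):
 *
 *	zbookmark_phys_t zb;
 *	SET_BOOKMARK(&zb, objset, object, level, blkid);
 *	VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb));
 *
 * Note that both scrub and resilver dispatch to dsl_scan_scrub_cb(); the
 * two differ only in which blocks that callback decides to read and repair.
 */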

/* In-core node for the scn->scn_queue. Represents a dataset to be scanned. */
typedef struct {
	uint64_t	sds_dsobj;
	uint64_t	sds_txg;
	avl_node_t	sds_node;
} scan_ds_t;

/*
 * This controls what conditions are placed on dsl_scan_sync_state():
 * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
 * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
 * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
 *	write out the scn_phys_cached version.
 * See dsl_scan_sync_state for details.
 */
typedef enum {
	SYNC_OPTIONAL,
	SYNC_MANDATORY,
	SYNC_CACHED
} state_sync_type_t;

/*
 * This struct represents the minimum information needed to reconstruct a
 * zio for sequential scanning. This is useful because many of these will
 * accumulate in the sequential IO queues before being issued, so saving
 * memory matters here.
 */
typedef struct scan_io {
	/* fields from blkptr_t */
	uint64_t		sio_blk_prop;
	uint64_t		sio_phys_birth;
	uint64_t		sio_birth;
	zio_cksum_t		sio_cksum;
	uint32_t		sio_nr_dvas;

	/* fields from zio_t */
	uint32_t		sio_flags;
	zbookmark_phys_t	sio_zb;

	/* members for queue sorting */
	union {
		avl_node_t	sio_addr_node; /* link into issuing queue */
		list_node_t	sio_list_node; /* link for issuing to disk */
	} sio_nodes;

	/*
	 * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
	 * depending on how many were in the original bp. Only the
	 * first DVA is really used for sorting and issuing purposes.
	 * The other DVAs (if provided) simply exist so that the zio
	 * layer can find additional copies to repair from in the
	 * event of an error. This array must go at the end of the
	 * struct to allow for the variable number of elements.
	 */
	dva_t			sio_dva[];
} scan_io_t;

#define	SIO_SET_OFFSET(sio, x)	DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
#define	SIO_SET_ASIZE(sio, x)	DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
#define	SIO_GET_OFFSET(sio)	DVA_GET_OFFSET(&(sio)->sio_dva[0])
#define	SIO_GET_ASIZE(sio)	DVA_GET_ASIZE(&(sio)->sio_dva[0])
#define	SIO_GET_END_OFFSET(sio) \
	(SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
#define	SIO_GET_MUSED(sio) \
	(sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))

struct dsl_scan_io_queue {
	dsl_scan_t	*q_scn; /* associated dsl_scan_t */
	vdev_t		*q_vd; /* top-level vdev that this queue represents */
	zio_t		*q_zio; /* scn_zio_root child for waiting on IO */

	/* trees used for sorting I/Os and extents of I/Os */
	range_tree_t	*q_exts_by_addr;
	zfs_btree_t	q_exts_by_size;
	avl_tree_t	q_sios_by_addr;
	uint64_t	q_sio_memused;
	uint64_t	q_last_ext_addr;

	/* members for zio rate limiting */
	uint64_t	q_maxinflight_bytes;
	uint64_t	q_inflight_bytes;
	kcondvar_t	q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */

	/* per txg statistics */
	uint64_t	q_total_seg_size_this_txg;
	uint64_t	q_segs_this_txg;
	uint64_t	q_total_zio_size_this_txg;
	uint64_t	q_zios_this_txg;
};
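
/*
 * Illustrative example of how the variable-length scan_io_t keeps queue
 * memory small: a sio is allocated from the kmem cache that matches its
 * DVA count, so a single-copy block pays for one dva_t (16 bytes) rather
 * than SPA_DVAS_PER_BP of them.  Conceptually, the queueing code pairs
 * every allocation with a q_sio_memused update:
 *
 *	scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
 *	bp2sio(bp, sio, dva_i);
 *	queue->q_sio_memused += SIO_GET_MUSED(sio);
 *	...
 *	queue->q_sio_memused -= SIO_GET_MUSED(sio);
 *	sio_free(sio);
 *
 * where SIO_GET_MUSED() is sizeof (scan_io_t) plus sio_nr_dvas dva_t's.
 */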

/* private data for dsl_scan_prefetch_cb() */
typedef struct scan_prefetch_ctx {
	zfs_refcount_t spc_refcnt;	/* refcount for memory management */
	dsl_scan_t *spc_scn;		/* dsl_scan_t for the pool */
	boolean_t spc_root;		/* is this prefetch for an objset? */
	uint8_t spc_indblkshift;	/* dn_indblkshift of current dnode */
	uint16_t spc_datablkszsec;	/* dn_datablkszsec of current dnode */
} scan_prefetch_ctx_t;

/* private data for dsl_scan_prefetch() */
typedef struct scan_prefetch_issue_ctx {
	avl_node_t spic_avl_node;	/* link into scn->scn_prefetch_queue */
	scan_prefetch_ctx_t *spic_spc;	/* spc for the callback */
	blkptr_t spic_bp;		/* bp to prefetch */
	zbookmark_phys_t spic_zb;	/* bookmark to prefetch */
} scan_prefetch_issue_ctx_t;

static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
    scan_io_t *sio);

static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);

static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];

/* sio->sio_nr_dvas must be set so we know which cache to free from */
static void
sio_free(scan_io_t *sio)
{
	ASSERT3U(sio->sio_nr_dvas, >, 0);
	ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);

	kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
}

/* It is up to the caller to set sio->sio_nr_dvas for freeing */
static scan_io_t *
sio_alloc(unsigned short nr_dvas)
{
	ASSERT3U(nr_dvas, >, 0);
	ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);

	return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
}

void
scan_init(void)
{
	/*
	 * This is used in ext_size_compare() to weight segments
	 * based on how sparse they are. This cannot be changed
	 * mid-scan and the tree comparison functions don't currently
	 * have a mechanism for passing additional context to the
	 * compare functions. Thus we store this value globally and
Thus we store this value globally and 386 * we only allow it to be set at module initialization time 387 */ 388 fill_weight = zfs_scan_fill_weight; 389 390 for (int i = 0; i < SPA_DVAS_PER_BP; i++) { 391 char name[36]; 392 393 (void) snprintf(name, sizeof (name), "sio_cache_%d", i); 394 sio_cache[i] = kmem_cache_create(name, 395 (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), 396 0, NULL, NULL, NULL, NULL, NULL, 0); 397 } 398 } 399 400 void 401 scan_fini(void) 402 { 403 for (int i = 0; i < SPA_DVAS_PER_BP; i++) { 404 kmem_cache_destroy(sio_cache[i]); 405 } 406 } 407 408 static inline boolean_t 409 dsl_scan_is_running(const dsl_scan_t *scn) 410 { 411 return (scn->scn_phys.scn_state == DSS_SCANNING); 412 } 413 414 boolean_t 415 dsl_scan_resilvering(dsl_pool_t *dp) 416 { 417 return (dsl_scan_is_running(dp->dp_scan) && 418 dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); 419 } 420 421 static inline void 422 sio2bp(const scan_io_t *sio, blkptr_t *bp) 423 { 424 memset(bp, 0, sizeof (*bp)); 425 bp->blk_prop = sio->sio_blk_prop; 426 bp->blk_phys_birth = sio->sio_phys_birth; 427 bp->blk_birth = sio->sio_birth; 428 bp->blk_fill = 1; /* we always only work with data pointers */ 429 bp->blk_cksum = sio->sio_cksum; 430 431 ASSERT3U(sio->sio_nr_dvas, >, 0); 432 ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); 433 434 memcpy(bp->blk_dva, sio->sio_dva, sio->sio_nr_dvas * sizeof (dva_t)); 435 } 436 437 static inline void 438 bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) 439 { 440 sio->sio_blk_prop = bp->blk_prop; 441 sio->sio_phys_birth = bp->blk_phys_birth; 442 sio->sio_birth = bp->blk_birth; 443 sio->sio_cksum = bp->blk_cksum; 444 sio->sio_nr_dvas = BP_GET_NDVAS(bp); 445 446 /* 447 * Copy the DVAs to the sio. We need all copies of the block so 448 * that the self healing code can use the alternate copies if the 449 * first is corrupted. We want the DVA at index dva_i to be first 450 * in the sio since this is the primary one that we want to issue. 451 */ 452 for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { 453 sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; 454 } 455 } 456 457 int 458 dsl_scan_init(dsl_pool_t *dp, uint64_t txg) 459 { 460 int err; 461 dsl_scan_t *scn; 462 spa_t *spa = dp->dp_spa; 463 uint64_t f; 464 465 scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); 466 scn->scn_dp = dp; 467 468 /* 469 * It's possible that we're resuming a scan after a reboot so 470 * make sure that the scan_async_destroying flag is initialized 471 * appropriately. 472 */ 473 ASSERT(!scn->scn_async_destroying); 474 scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, 475 SPA_FEATURE_ASYNC_DESTROY); 476 477 /* 478 * Calculate the max number of in-flight bytes for pool-wide 479 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). 480 * Limits for the issuing phase are done per top-level vdev and 481 * are handled separately. 482 */ 483 scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, 484 zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); 485 486 avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), 487 offsetof(scan_ds_t, sds_node)); 488 avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, 489 sizeof (scan_prefetch_issue_ctx_t), 490 offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); 491 492 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 493 "scrub_func", sizeof (uint64_t), 1, &f); 494 if (err == 0) { 495 /* 496 * There was an old-style scrub in progress. 
Restart a 497 * new-style scrub from the beginning. 498 */ 499 scn->scn_restart_txg = txg; 500 zfs_dbgmsg("old-style scrub was in progress for %s; " 501 "restarting new-style scrub in txg %llu", 502 spa->spa_name, 503 (longlong_t)scn->scn_restart_txg); 504 505 /* 506 * Load the queue obj from the old location so that it 507 * can be freed by dsl_scan_done(). 508 */ 509 (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 510 "scrub_queue", sizeof (uint64_t), 1, 511 &scn->scn_phys.scn_queue_obj); 512 } else { 513 err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 514 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 515 &scn->scn_phys); 516 /* 517 * Detect if the pool contains the signature of #2094. If it 518 * does properly update the scn->scn_phys structure and notify 519 * the administrator by setting an errata for the pool. 520 */ 521 if (err == EOVERFLOW) { 522 uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; 523 VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); 524 VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, 525 (23 * sizeof (uint64_t))); 526 527 err = zap_lookup(dp->dp_meta_objset, 528 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, 529 sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); 530 if (err == 0) { 531 uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; 532 533 if (overflow & ~DSL_SCAN_FLAGS_MASK || 534 scn->scn_async_destroying) { 535 spa->spa_errata = 536 ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; 537 return (EOVERFLOW); 538 } 539 540 memcpy(&scn->scn_phys, zaptmp, 541 SCAN_PHYS_NUMINTS * sizeof (uint64_t)); 542 scn->scn_phys.scn_flags = overflow; 543 544 /* Required scrub already in progress. */ 545 if (scn->scn_phys.scn_state == DSS_FINISHED || 546 scn->scn_phys.scn_state == DSS_CANCELED) 547 spa->spa_errata = 548 ZPOOL_ERRATA_ZOL_2094_SCRUB; 549 } 550 } 551 552 if (err == ENOENT) 553 return (0); 554 else if (err) 555 return (err); 556 557 /* 558 * We might be restarting after a reboot, so jump the issued 559 * counter to how far we've scanned. We know we're consistent 560 * up to here. 561 */ 562 scn->scn_issued_before_pass = scn->scn_phys.scn_examined; 563 564 if (dsl_scan_is_running(scn) && 565 spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { 566 /* 567 * A new-type scrub was in progress on an old 568 * pool, and the pool was accessed by old 569 * software. Restart from the beginning, since 570 * the old software may have changed the pool in 571 * the meantime. 572 */ 573 scn->scn_restart_txg = txg; 574 zfs_dbgmsg("new-style scrub for %s was modified " 575 "by old software; restarting in txg %llu", 576 spa->spa_name, 577 (longlong_t)scn->scn_restart_txg); 578 } else if (dsl_scan_resilvering(dp)) { 579 /* 580 * If a resilver is in progress and there are already 581 * errors, restart it instead of finishing this scan and 582 * then restarting it. If there haven't been any errors 583 * then remember that the incore DTL is valid. 
584 */ 585 if (scn->scn_phys.scn_errors > 0) { 586 scn->scn_restart_txg = txg; 587 zfs_dbgmsg("resilver can't excise DTL_MISSING " 588 "when finished; restarting on %s in txg " 589 "%llu", 590 spa->spa_name, 591 (u_longlong_t)scn->scn_restart_txg); 592 } else { 593 /* it's safe to excise DTL when finished */ 594 spa->spa_scrub_started = B_TRUE; 595 } 596 } 597 } 598 599 memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); 600 601 /* reload the queue into the in-core state */ 602 if (scn->scn_phys.scn_queue_obj != 0) { 603 zap_cursor_t zc; 604 zap_attribute_t za; 605 606 for (zap_cursor_init(&zc, dp->dp_meta_objset, 607 scn->scn_phys.scn_queue_obj); 608 zap_cursor_retrieve(&zc, &za) == 0; 609 (void) zap_cursor_advance(&zc)) { 610 scan_ds_queue_insert(scn, 611 zfs_strtonum(za.za_name, NULL), 612 za.za_first_integer); 613 } 614 zap_cursor_fini(&zc); 615 } 616 617 spa_scan_stat_init(spa); 618 vdev_scan_stat_init(spa->spa_root_vdev); 619 620 return (0); 621 } 622 623 void 624 dsl_scan_fini(dsl_pool_t *dp) 625 { 626 if (dp->dp_scan != NULL) { 627 dsl_scan_t *scn = dp->dp_scan; 628 629 if (scn->scn_taskq != NULL) 630 taskq_destroy(scn->scn_taskq); 631 632 scan_ds_queue_clear(scn); 633 avl_destroy(&scn->scn_queue); 634 scan_ds_prefetch_queue_clear(scn); 635 avl_destroy(&scn->scn_prefetch_queue); 636 637 kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); 638 dp->dp_scan = NULL; 639 } 640 } 641 642 static boolean_t 643 dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) 644 { 645 return (scn->scn_restart_txg != 0 && 646 scn->scn_restart_txg <= tx->tx_txg); 647 } 648 649 boolean_t 650 dsl_scan_resilver_scheduled(dsl_pool_t *dp) 651 { 652 return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || 653 (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); 654 } 655 656 boolean_t 657 dsl_scan_scrubbing(const dsl_pool_t *dp) 658 { 659 dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; 660 661 return (scn_phys->scn_state == DSS_SCANNING && 662 scn_phys->scn_func == POOL_SCAN_SCRUB); 663 } 664 665 boolean_t 666 dsl_scan_is_paused_scrub(const dsl_scan_t *scn) 667 { 668 return (dsl_scan_scrubbing(scn->scn_dp) && 669 scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); 670 } 671 672 /* 673 * Writes out a persistent dsl_scan_phys_t record to the pool directory. 674 * Because we can be running in the block sorting algorithm, we do not always 675 * want to write out the record, only when it is "safe" to do so. This safety 676 * condition is achieved by making sure that the sorting queues are empty 677 * (scn_queues_pending == 0). When this condition is not true, the sync'd state 678 * is inconsistent with how much actual scanning progress has been made. The 679 * kind of sync to be performed is specified by the sync_type argument. If the 680 * sync is optional, we only sync if the queues are empty. If the sync is 681 * mandatory, we do a hard ASSERT to make sure that the queues are empty. The 682 * third possible state is a "cached" sync. This is done in response to: 683 * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been 684 * destroyed, so we wouldn't be able to restart scanning from it. 685 * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been 686 * superseded by a newer snapshot. 687 * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been 688 * swapped with its clone. 689 * In all cases, a cached sync simply rewrites the last record we've written, 690 * just slightly modified. 
For the modifications that are performed to the 691 * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, 692 * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 693 */ 694 static void 695 dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) 696 { 697 int i; 698 spa_t *spa = scn->scn_dp->dp_spa; 699 700 ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0); 701 if (scn->scn_queues_pending == 0) { 702 for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 703 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 704 dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; 705 706 if (q == NULL) 707 continue; 708 709 mutex_enter(&vd->vdev_scan_io_queue_lock); 710 ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); 711 ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, 712 NULL); 713 ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); 714 mutex_exit(&vd->vdev_scan_io_queue_lock); 715 } 716 717 if (scn->scn_phys.scn_queue_obj != 0) 718 scan_ds_queue_sync(scn, tx); 719 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, 720 DMU_POOL_DIRECTORY_OBJECT, 721 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 722 &scn->scn_phys, tx)); 723 memcpy(&scn->scn_phys_cached, &scn->scn_phys, 724 sizeof (scn->scn_phys)); 725 726 if (scn->scn_checkpointing) 727 zfs_dbgmsg("finish scan checkpoint for %s", 728 spa->spa_name); 729 730 scn->scn_checkpointing = B_FALSE; 731 scn->scn_last_checkpoint = ddi_get_lbolt(); 732 } else if (sync_type == SYNC_CACHED) { 733 VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, 734 DMU_POOL_DIRECTORY_OBJECT, 735 DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, 736 &scn->scn_phys_cached, tx)); 737 } 738 } 739 740 int 741 dsl_scan_setup_check(void *arg, dmu_tx_t *tx) 742 { 743 (void) arg; 744 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 745 vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; 746 747 if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) 748 return (SET_ERROR(EBUSY)); 749 750 return (0); 751 } 752 753 void 754 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) 755 { 756 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 757 pool_scan_func_t *funcp = arg; 758 dmu_object_type_t ot = 0; 759 dsl_pool_t *dp = scn->scn_dp; 760 spa_t *spa = dp->dp_spa; 761 762 ASSERT(!dsl_scan_is_running(scn)); 763 ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); 764 memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); 765 scn->scn_phys.scn_func = *funcp; 766 scn->scn_phys.scn_state = DSS_SCANNING; 767 scn->scn_phys.scn_min_txg = 0; 768 scn->scn_phys.scn_max_txg = tx->tx_txg; 769 scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ 770 scn->scn_phys.scn_start_time = gethrestime_sec(); 771 scn->scn_phys.scn_errors = 0; 772 scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; 773 scn->scn_issued_before_pass = 0; 774 scn->scn_restart_txg = 0; 775 scn->scn_done_txg = 0; 776 scn->scn_last_checkpoint = 0; 777 scn->scn_checkpointing = B_FALSE; 778 spa_scan_stat_init(spa); 779 vdev_scan_stat_init(spa->spa_root_vdev); 780 781 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 782 scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; 783 784 /* rewrite all disk labels */ 785 vdev_config_dirty(spa->spa_root_vdev); 786 787 if (vdev_resilver_needed(spa->spa_root_vdev, 788 &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { 789 nvlist_t *aux = fnvlist_alloc(); 790 fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, 791 "healing"); 792 spa_event_notify(spa, NULL, aux, 793 ESC_ZFS_RESILVER_START); 794 nvlist_free(aux); 795 } else 
{ 796 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); 797 } 798 799 spa->spa_scrub_started = B_TRUE; 800 /* 801 * If this is an incremental scrub, limit the DDT scrub phase 802 * to just the auto-ditto class (for correctness); the rest 803 * of the scrub should go faster using top-down pruning. 804 */ 805 if (scn->scn_phys.scn_min_txg > TXG_INITIAL) 806 scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; 807 808 /* 809 * When starting a resilver clear any existing rebuild state. 810 * This is required to prevent stale rebuild status from 811 * being reported when a rebuild is run, then a resilver and 812 * finally a scrub. In which case only the scrub status 813 * should be reported by 'zpool status'. 814 */ 815 if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { 816 vdev_t *rvd = spa->spa_root_vdev; 817 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 818 vdev_t *vd = rvd->vdev_child[i]; 819 vdev_rebuild_clear_sync( 820 (void *)(uintptr_t)vd->vdev_id, tx); 821 } 822 } 823 } 824 825 /* back to the generic stuff */ 826 827 if (zfs_scan_blkstats) { 828 if (dp->dp_blkstats == NULL) { 829 dp->dp_blkstats = 830 vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); 831 } 832 memset(&dp->dp_blkstats->zab_type, 0, 833 sizeof (dp->dp_blkstats->zab_type)); 834 } else { 835 if (dp->dp_blkstats) { 836 vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); 837 dp->dp_blkstats = NULL; 838 } 839 } 840 841 if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) 842 ot = DMU_OT_ZAP_OTHER; 843 844 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, 845 ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); 846 847 memcpy(&scn->scn_phys_cached, &scn->scn_phys, sizeof (scn->scn_phys)); 848 849 dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); 850 851 spa_history_log_internal(spa, "scan setup", tx, 852 "func=%u mintxg=%llu maxtxg=%llu", 853 *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg, 854 (u_longlong_t)scn->scn_phys.scn_max_txg); 855 } 856 857 /* 858 * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. 859 * Can also be called to resume a paused scrub. 860 */ 861 int 862 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) 863 { 864 spa_t *spa = dp->dp_spa; 865 dsl_scan_t *scn = dp->dp_scan; 866 867 /* 868 * Purge all vdev caches and probe all devices. We do this here 869 * rather than in sync context because this requires a writer lock 870 * on the spa_config lock, which we can't do from sync context. The 871 * spa_scrub_reopen flag indicates that vdev_open() should not 872 * attempt to start another scrub. 
873 */ 874 spa_vdev_state_enter(spa, SCL_NONE); 875 spa->spa_scrub_reopen = B_TRUE; 876 vdev_reopen(spa->spa_root_vdev); 877 spa->spa_scrub_reopen = B_FALSE; 878 (void) spa_vdev_state_exit(spa, NULL, 0); 879 880 if (func == POOL_SCAN_RESILVER) { 881 dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); 882 return (0); 883 } 884 885 if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { 886 /* got scrub start cmd, resume paused scrub */ 887 int err = dsl_scrub_set_pause_resume(scn->scn_dp, 888 POOL_SCRUB_NORMAL); 889 if (err == 0) { 890 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); 891 return (SET_ERROR(ECANCELED)); 892 } 893 894 return (SET_ERROR(err)); 895 } 896 897 return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, 898 dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); 899 } 900 901 static void 902 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) 903 { 904 static const char *old_names[] = { 905 "scrub_bookmark", 906 "scrub_ddt_bookmark", 907 "scrub_ddt_class_max", 908 "scrub_queue", 909 "scrub_min_txg", 910 "scrub_max_txg", 911 "scrub_func", 912 "scrub_errors", 913 NULL 914 }; 915 916 dsl_pool_t *dp = scn->scn_dp; 917 spa_t *spa = dp->dp_spa; 918 int i; 919 920 /* Remove any remnants of an old-style scrub. */ 921 for (i = 0; old_names[i]; i++) { 922 (void) zap_remove(dp->dp_meta_objset, 923 DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); 924 } 925 926 if (scn->scn_phys.scn_queue_obj != 0) { 927 VERIFY0(dmu_object_free(dp->dp_meta_objset, 928 scn->scn_phys.scn_queue_obj, tx)); 929 scn->scn_phys.scn_queue_obj = 0; 930 } 931 scan_ds_queue_clear(scn); 932 scan_ds_prefetch_queue_clear(scn); 933 934 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; 935 936 /* 937 * If we were "restarted" from a stopped state, don't bother 938 * with anything else. 939 */ 940 if (!dsl_scan_is_running(scn)) { 941 ASSERT(!scn->scn_is_sorted); 942 return; 943 } 944 945 if (scn->scn_is_sorted) { 946 scan_io_queues_destroy(scn); 947 scn->scn_is_sorted = B_FALSE; 948 949 if (scn->scn_taskq != NULL) { 950 taskq_destroy(scn->scn_taskq); 951 scn->scn_taskq = NULL; 952 } 953 } 954 955 scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED; 956 957 spa_notify_waiters(spa); 958 959 if (dsl_scan_restarting(scn, tx)) 960 spa_history_log_internal(spa, "scan aborted, restarting", tx, 961 "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); 962 else if (!complete) 963 spa_history_log_internal(spa, "scan cancelled", tx, 964 "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); 965 else 966 spa_history_log_internal(spa, "scan done", tx, 967 "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); 968 969 if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { 970 spa->spa_scrub_active = B_FALSE; 971 972 /* 973 * If the scrub/resilver completed, update all DTLs to 974 * reflect this. Whether it succeeded or not, vacate 975 * all temporary scrub DTLs. 976 * 977 * As the scrub does not currently support traversing 978 * data that have been freed but are part of a checkpoint, 979 * we don't mark the scrub as done in the DTLs as faults 980 * may still exist in those vdevs. 
981 */ 982 if (complete && 983 !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 984 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 985 scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); 986 987 if (scn->scn_phys.scn_min_txg) { 988 nvlist_t *aux = fnvlist_alloc(); 989 fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, 990 "healing"); 991 spa_event_notify(spa, NULL, aux, 992 ESC_ZFS_RESILVER_FINISH); 993 nvlist_free(aux); 994 } else { 995 spa_event_notify(spa, NULL, NULL, 996 ESC_ZFS_SCRUB_FINISH); 997 } 998 } else { 999 vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, 1000 0, B_TRUE, B_FALSE); 1001 } 1002 spa_errlog_rotate(spa); 1003 1004 /* 1005 * Don't clear flag until after vdev_dtl_reassess to ensure that 1006 * DTL_MISSING will get updated when possible. 1007 */ 1008 spa->spa_scrub_started = B_FALSE; 1009 1010 /* 1011 * We may have finished replacing a device. 1012 * Let the async thread assess this and handle the detach. 1013 */ 1014 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 1015 1016 /* 1017 * Clear any resilver_deferred flags in the config. 1018 * If there are drives that need resilvering, kick 1019 * off an asynchronous request to start resilver. 1020 * vdev_clear_resilver_deferred() may update the config 1021 * before the resilver can restart. In the event of 1022 * a crash during this period, the spa loading code 1023 * will find the drives that need to be resilvered 1024 * and start the resilver then. 1025 */ 1026 if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && 1027 vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { 1028 spa_history_log_internal(spa, 1029 "starting deferred resilver", tx, "errors=%llu", 1030 (u_longlong_t)spa_approx_errlog_size(spa)); 1031 spa_async_request(spa, SPA_ASYNC_RESILVER); 1032 } 1033 1034 /* Clear recent error events (i.e. 
duplicate events tracking) */ 1035 if (complete) 1036 zfs_ereport_clear(spa, NULL); 1037 } 1038 1039 scn->scn_phys.scn_end_time = gethrestime_sec(); 1040 1041 if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) 1042 spa->spa_errata = 0; 1043 1044 ASSERT(!dsl_scan_is_running(scn)); 1045 } 1046 1047 static int 1048 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) 1049 { 1050 (void) arg; 1051 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 1052 1053 if (!dsl_scan_is_running(scn)) 1054 return (SET_ERROR(ENOENT)); 1055 return (0); 1056 } 1057 1058 static void 1059 dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) 1060 { 1061 (void) arg; 1062 dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; 1063 1064 dsl_scan_done(scn, B_FALSE, tx); 1065 dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); 1066 spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); 1067 } 1068 1069 int 1070 dsl_scan_cancel(dsl_pool_t *dp) 1071 { 1072 return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, 1073 dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); 1074 } 1075 1076 static int 1077 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) 1078 { 1079 pool_scrub_cmd_t *cmd = arg; 1080 dsl_pool_t *dp = dmu_tx_pool(tx); 1081 dsl_scan_t *scn = dp->dp_scan; 1082 1083 if (*cmd == POOL_SCRUB_PAUSE) { 1084 /* can't pause a scrub when there is no in-progress scrub */ 1085 if (!dsl_scan_scrubbing(dp)) 1086 return (SET_ERROR(ENOENT)); 1087 1088 /* can't pause a paused scrub */ 1089 if (dsl_scan_is_paused_scrub(scn)) 1090 return (SET_ERROR(EBUSY)); 1091 } else if (*cmd != POOL_SCRUB_NORMAL) { 1092 return (SET_ERROR(ENOTSUP)); 1093 } 1094 1095 return (0); 1096 } 1097 1098 static void 1099 dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) 1100 { 1101 pool_scrub_cmd_t *cmd = arg; 1102 dsl_pool_t *dp = dmu_tx_pool(tx); 1103 spa_t *spa = dp->dp_spa; 1104 dsl_scan_t *scn = dp->dp_scan; 1105 1106 if (*cmd == POOL_SCRUB_PAUSE) { 1107 /* can't pause a scrub when there is no in-progress scrub */ 1108 spa->spa_scan_pass_scrub_pause = gethrestime_sec(); 1109 scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; 1110 scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; 1111 dsl_scan_sync_state(scn, tx, SYNC_CACHED); 1112 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); 1113 spa_notify_waiters(spa); 1114 } else { 1115 ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); 1116 if (dsl_scan_is_paused_scrub(scn)) { 1117 /* 1118 * We need to keep track of how much time we spend 1119 * paused per pass so that we can adjust the scrub rate 1120 * shown in the output of 'zpool status' 1121 */ 1122 spa->spa_scan_pass_scrub_spent_paused += 1123 gethrestime_sec() - spa->spa_scan_pass_scrub_pause; 1124 spa->spa_scan_pass_scrub_pause = 0; 1125 scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; 1126 scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED; 1127 dsl_scan_sync_state(scn, tx, SYNC_CACHED); 1128 } 1129 } 1130 } 1131 1132 /* 1133 * Set scrub pause/resume state if it makes sense to do so 1134 */ 1135 int 1136 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) 1137 { 1138 return (dsl_sync_task(spa_name(dp->dp_spa), 1139 dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, 1140 ZFS_SPACE_CHECK_RESERVED)); 1141 } 1142 1143 1144 /* start a new scan, or restart an existing one. 
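 * Callers that just want the restart to happen as soon as possible pass a
 * txg of 0, e.g.
 *
 *	dsl_scan_restart_resilver(dp, 0);
 *
 * in which case the function below assigns a throwaway tx only to learn the
 * currently open txg; a nonzero txg is recorded in scn_restart_txg as is.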
*/ 1145 void 1146 dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) 1147 { 1148 if (txg == 0) { 1149 dmu_tx_t *tx; 1150 tx = dmu_tx_create_dd(dp->dp_mos_dir); 1151 VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); 1152 1153 txg = dmu_tx_get_txg(tx); 1154 dp->dp_scan->scn_restart_txg = txg; 1155 dmu_tx_commit(tx); 1156 } else { 1157 dp->dp_scan->scn_restart_txg = txg; 1158 } 1159 zfs_dbgmsg("restarting resilver for %s at txg=%llu", 1160 dp->dp_spa->spa_name, (longlong_t)txg); 1161 } 1162 1163 void 1164 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) 1165 { 1166 zio_free(dp->dp_spa, txg, bp); 1167 } 1168 1169 void 1170 dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) 1171 { 1172 ASSERT(dsl_pool_sync_context(dp)); 1173 zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); 1174 } 1175 1176 static int 1177 scan_ds_queue_compare(const void *a, const void *b) 1178 { 1179 const scan_ds_t *sds_a = a, *sds_b = b; 1180 1181 if (sds_a->sds_dsobj < sds_b->sds_dsobj) 1182 return (-1); 1183 if (sds_a->sds_dsobj == sds_b->sds_dsobj) 1184 return (0); 1185 return (1); 1186 } 1187 1188 static void 1189 scan_ds_queue_clear(dsl_scan_t *scn) 1190 { 1191 void *cookie = NULL; 1192 scan_ds_t *sds; 1193 while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { 1194 kmem_free(sds, sizeof (*sds)); 1195 } 1196 } 1197 1198 static boolean_t 1199 scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) 1200 { 1201 scan_ds_t srch, *sds; 1202 1203 srch.sds_dsobj = dsobj; 1204 sds = avl_find(&scn->scn_queue, &srch, NULL); 1205 if (sds != NULL && txg != NULL) 1206 *txg = sds->sds_txg; 1207 return (sds != NULL); 1208 } 1209 1210 static void 1211 scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) 1212 { 1213 scan_ds_t *sds; 1214 avl_index_t where; 1215 1216 sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); 1217 sds->sds_dsobj = dsobj; 1218 sds->sds_txg = txg; 1219 1220 VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); 1221 avl_insert(&scn->scn_queue, sds, where); 1222 } 1223 1224 static void 1225 scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) 1226 { 1227 scan_ds_t srch, *sds; 1228 1229 srch.sds_dsobj = dsobj; 1230 1231 sds = avl_find(&scn->scn_queue, &srch, NULL); 1232 VERIFY(sds != NULL); 1233 avl_remove(&scn->scn_queue, sds); 1234 kmem_free(sds, sizeof (*sds)); 1235 } 1236 1237 static void 1238 scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) 1239 { 1240 dsl_pool_t *dp = scn->scn_dp; 1241 spa_t *spa = dp->dp_spa; 1242 dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? 1243 DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; 1244 1245 ASSERT0(scn->scn_queues_pending); 1246 ASSERT(scn->scn_phys.scn_queue_obj != 0); 1247 1248 VERIFY0(dmu_object_free(dp->dp_meta_objset, 1249 scn->scn_phys.scn_queue_obj, tx)); 1250 scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, 1251 DMU_OT_NONE, 0, tx); 1252 for (scan_ds_t *sds = avl_first(&scn->scn_queue); 1253 sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { 1254 VERIFY0(zap_add_int_key(dp->dp_meta_objset, 1255 scn->scn_phys.scn_queue_obj, sds->sds_dsobj, 1256 sds->sds_txg, tx)); 1257 } 1258 } 1259 1260 /* 1261 * Computes the memory limit state that we're currently in. A sorted scan 1262 * needs quite a bit of memory to hold the sorting queue, so we need to 1263 * reasonably constrain the size so it doesn't impact overall system 1264 * performance. 
 * We compute two limits:
 * 1) Hard memory limit: if the amount of memory used by the sorting
 *	queues on a pool gets above this value, we stop the metadata
 *	scanning portion and start issuing the queued up and sorted
 *	I/Os to reduce memory usage.
 *	This limit is calculated as a fraction of physmem (by default 5%).
 *	We constrain the lower bound of the hard limit to an absolute
 *	minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
 *	the upper bound to 5% of the pool's allocated space - no chance we'll
 *	ever need that much memory, but just to keep the value in check.
 * 2) Soft memory limit: once we hit the hard memory limit, we start
 *	issuing I/O to reduce queue memory usage, but we don't want to
 *	completely empty out the queues, since we might be able to find I/Os
 *	that will fill in the gaps of our non-sequential IOs at some point
 *	in the future. So we stop the issuing of I/Os once the amount of
 *	memory used drops below the soft limit (at which point we stop issuing
 *	I/O and start scanning metadata again).
 *
 *	This limit is calculated by subtracting a fraction of the hard
 *	limit from the hard limit. By default this fraction is 5%, so
 *	the soft limit is 95% of the hard limit. We cap the size of the
 *	difference between the hard and soft limits at an absolute
 *	maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
 *	sufficient to not cause too frequent switching between the
 *	metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
 *	worth of queues is about 1.2 GiB of on-pool data, so scanning
 *	that should take at least a decent fraction of a second).
 */
static boolean_t
dsl_scan_should_clear(dsl_scan_t *scn)
{
	spa_t *spa = scn->scn_dp->dp_spa;
	vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
	uint64_t alloc, mlim_hard, mlim_soft, mused;

	alloc = metaslab_class_get_alloc(spa_normal_class(spa));
	alloc += metaslab_class_get_alloc(spa_special_class(spa));
	alloc += metaslab_class_get_alloc(spa_dedup_class(spa));

	mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
	    zfs_scan_mem_lim_min);
	mlim_hard = MIN(mlim_hard, alloc / 20);
	mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
	    zfs_scan_mem_lim_soft_max);
	mused = 0;
	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *tvd = rvd->vdev_child[i];
		dsl_scan_io_queue_t *queue;

		mutex_enter(&tvd->vdev_scan_io_queue_lock);
		queue = tvd->vdev_scan_io_queue;
		if (queue != NULL) {
			/*
			 * # of extents in exts_by_addr = # in exts_by_size.
			 * B-tree efficiency is ~75%, but can be as low as 50%.
			 */
			mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
			    ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) *
			    3 / 2) + queue->q_sio_memused;
		}
		mutex_exit(&tvd->vdev_scan_io_queue_lock);
	}

	dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);

	if (mused == 0)
		ASSERT0(scn->scn_queues_pending);

	/*
	 * If we are above our hard limit, we need to clear out memory.
	 * If we are below our soft limit, we need to accumulate sequential IOs.
	 * Otherwise, we should keep doing whatever we are currently doing.
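	 *
	 * As a rough worked example (illustrative numbers only): with 8 GiB
	 * of physmem, default tunables and a pool with plenty of allocated
	 * space, the computation above gives
	 *
	 *	mlim_hard = MAX(8 GiB / 20, 16 MiB)            ~= 410 MiB
	 *	mlim_soft = mlim_hard - MIN(mlim_hard / 20, 128 MiB)
	 *	                                               ~= 389 MiB
	 *
	 * so queue memory may grow to ~410 MiB before we switch to issuing,
	 * and issuing continues until usage drops back below ~389 MiB.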
1336 */ 1337 if (mused >= mlim_hard) 1338 return (B_TRUE); 1339 else if (mused < mlim_soft) 1340 return (B_FALSE); 1341 else 1342 return (scn->scn_clearing); 1343 } 1344 1345 static boolean_t 1346 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) 1347 { 1348 /* we never skip user/group accounting objects */ 1349 if (zb && (int64_t)zb->zb_object < 0) 1350 return (B_FALSE); 1351 1352 if (scn->scn_suspending) 1353 return (B_TRUE); /* we're already suspending */ 1354 1355 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) 1356 return (B_FALSE); /* we're resuming */ 1357 1358 /* We only know how to resume from level-0 and objset blocks. */ 1359 if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL)) 1360 return (B_FALSE); 1361 1362 /* 1363 * We suspend if: 1364 * - we have scanned for at least the minimum time (default 1 sec 1365 * for scrub, 3 sec for resilver), and either we have sufficient 1366 * dirty data that we are starting to write more quickly 1367 * (default 30%), someone is explicitly waiting for this txg 1368 * to complete, or we have used up all of the time in the txg 1369 * timeout (default 5 sec). 1370 * or 1371 * - the spa is shutting down because this pool is being exported 1372 * or the machine is rebooting. 1373 * or 1374 * - the scan queue has reached its memory use limit 1375 */ 1376 uint64_t curr_time_ns = gethrtime(); 1377 uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; 1378 uint64_t sync_time_ns = curr_time_ns - 1379 scn->scn_dp->dp_spa->spa_sync_starttime; 1380 uint64_t dirty_min_bytes = zfs_dirty_data_max * 1381 zfs_vdev_async_write_active_min_dirty_percent / 100; 1382 uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 1383 zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; 1384 1385 if ((NSEC2MSEC(scan_time_ns) > mintime && 1386 (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || 1387 txg_sync_waiting(scn->scn_dp) || 1388 NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || 1389 spa_shutting_down(scn->scn_dp->dp_spa) || 1390 (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { 1391 if (zb && zb->zb_level == ZB_ROOT_LEVEL) { 1392 dprintf("suspending at first available bookmark " 1393 "%llx/%llx/%llx/%llx\n", 1394 (longlong_t)zb->zb_objset, 1395 (longlong_t)zb->zb_object, 1396 (longlong_t)zb->zb_level, 1397 (longlong_t)zb->zb_blkid); 1398 SET_BOOKMARK(&scn->scn_phys.scn_bookmark, 1399 zb->zb_objset, 0, 0, 0); 1400 } else if (zb != NULL) { 1401 dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", 1402 (longlong_t)zb->zb_objset, 1403 (longlong_t)zb->zb_object, 1404 (longlong_t)zb->zb_level, 1405 (longlong_t)zb->zb_blkid); 1406 scn->scn_phys.scn_bookmark = *zb; 1407 } else { 1408 #ifdef ZFS_DEBUG 1409 dsl_scan_phys_t *scnp = &scn->scn_phys; 1410 dprintf("suspending at at DDT bookmark " 1411 "%llx/%llx/%llx/%llx\n", 1412 (longlong_t)scnp->scn_ddt_bookmark.ddb_class, 1413 (longlong_t)scnp->scn_ddt_bookmark.ddb_type, 1414 (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, 1415 (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); 1416 #endif 1417 } 1418 scn->scn_suspending = B_TRUE; 1419 return (B_TRUE); 1420 } 1421 return (B_FALSE); 1422 } 1423 1424 typedef struct zil_scan_arg { 1425 dsl_pool_t *zsa_dp; 1426 zil_header_t *zsa_zh; 1427 } zil_scan_arg_t; 1428 1429 static int 1430 dsl_scan_zil_block(zilog_t *zilog, const blkptr_t *bp, void *arg, 1431 uint64_t claim_txg) 1432 { 1433 (void) zilog; 1434 zil_scan_arg_t *zsa = arg; 1435 dsl_pool_t *dp = zsa->zsa_dp; 1436 dsl_scan_t *scn = dp->dp_scan; 1437 zil_header_t *zh = zsa->zsa_zh; 
1438 zbookmark_phys_t zb; 1439 1440 ASSERT(!BP_IS_REDACTED(bp)); 1441 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) 1442 return (0); 1443 1444 /* 1445 * One block ("stubby") can be allocated a long time ago; we 1446 * want to visit that one because it has been allocated 1447 * (on-disk) even if it hasn't been claimed (even though for 1448 * scrub there's nothing to do to it). 1449 */ 1450 if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) 1451 return (0); 1452 1453 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 1454 ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); 1455 1456 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); 1457 return (0); 1458 } 1459 1460 static int 1461 dsl_scan_zil_record(zilog_t *zilog, const lr_t *lrc, void *arg, 1462 uint64_t claim_txg) 1463 { 1464 (void) zilog; 1465 if (lrc->lrc_txtype == TX_WRITE) { 1466 zil_scan_arg_t *zsa = arg; 1467 dsl_pool_t *dp = zsa->zsa_dp; 1468 dsl_scan_t *scn = dp->dp_scan; 1469 zil_header_t *zh = zsa->zsa_zh; 1470 const lr_write_t *lr = (const lr_write_t *)lrc; 1471 const blkptr_t *bp = &lr->lr_blkptr; 1472 zbookmark_phys_t zb; 1473 1474 ASSERT(!BP_IS_REDACTED(bp)); 1475 if (BP_IS_HOLE(bp) || 1476 bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) 1477 return (0); 1478 1479 /* 1480 * birth can be < claim_txg if this record's txg is 1481 * already txg sync'ed (but this log block contains 1482 * other records that are not synced) 1483 */ 1484 if (claim_txg == 0 || bp->blk_birth < claim_txg) 1485 return (0); 1486 1487 ASSERT3U(BP_GET_LSIZE(bp), !=, 0); 1488 SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], 1489 lr->lr_foid, ZB_ZIL_LEVEL, 1490 lr->lr_offset / BP_GET_LSIZE(bp)); 1491 1492 VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); 1493 } 1494 return (0); 1495 } 1496 1497 static void 1498 dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) 1499 { 1500 uint64_t claim_txg = zh->zh_claim_txg; 1501 zil_scan_arg_t zsa = { dp, zh }; 1502 zilog_t *zilog; 1503 1504 ASSERT(spa_writeable(dp->dp_spa)); 1505 1506 /* 1507 * We only want to visit blocks that have been claimed but not yet 1508 * replayed (or, in read-only mode, blocks that *would* be claimed). 1509 */ 1510 if (claim_txg == 0) 1511 return; 1512 1513 zilog = zil_alloc(dp->dp_meta_objset, zh); 1514 1515 (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, 1516 claim_txg, B_FALSE); 1517 1518 zil_free(zilog); 1519 } 1520 1521 /* 1522 * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea 1523 * here is to sort the AVL tree by the order each block will be needed. 
1524 */ 1525 static int 1526 scan_prefetch_queue_compare(const void *a, const void *b) 1527 { 1528 const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; 1529 const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; 1530 const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; 1531 1532 return (zbookmark_compare(spc_a->spc_datablkszsec, 1533 spc_a->spc_indblkshift, spc_b->spc_datablkszsec, 1534 spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); 1535 } 1536 1537 static void 1538 scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, const void *tag) 1539 { 1540 if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { 1541 zfs_refcount_destroy(&spc->spc_refcnt); 1542 kmem_free(spc, sizeof (scan_prefetch_ctx_t)); 1543 } 1544 } 1545 1546 static scan_prefetch_ctx_t * 1547 scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, const void *tag) 1548 { 1549 scan_prefetch_ctx_t *spc; 1550 1551 spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); 1552 zfs_refcount_create(&spc->spc_refcnt); 1553 zfs_refcount_add(&spc->spc_refcnt, tag); 1554 spc->spc_scn = scn; 1555 if (dnp != NULL) { 1556 spc->spc_datablkszsec = dnp->dn_datablkszsec; 1557 spc->spc_indblkshift = dnp->dn_indblkshift; 1558 spc->spc_root = B_FALSE; 1559 } else { 1560 spc->spc_datablkszsec = 0; 1561 spc->spc_indblkshift = 0; 1562 spc->spc_root = B_TRUE; 1563 } 1564 1565 return (spc); 1566 } 1567 1568 static void 1569 scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, const void *tag) 1570 { 1571 zfs_refcount_add(&spc->spc_refcnt, tag); 1572 } 1573 1574 static void 1575 scan_ds_prefetch_queue_clear(dsl_scan_t *scn) 1576 { 1577 spa_t *spa = scn->scn_dp->dp_spa; 1578 void *cookie = NULL; 1579 scan_prefetch_issue_ctx_t *spic = NULL; 1580 1581 mutex_enter(&spa->spa_scrub_lock); 1582 while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue, 1583 &cookie)) != NULL) { 1584 scan_prefetch_ctx_rele(spic->spic_spc, scn); 1585 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); 1586 } 1587 mutex_exit(&spa->spa_scrub_lock); 1588 } 1589 1590 static boolean_t 1591 dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, 1592 const zbookmark_phys_t *zb) 1593 { 1594 zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; 1595 dnode_phys_t tmp_dnp; 1596 dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp; 1597 1598 if (zb->zb_objset != last_zb->zb_objset) 1599 return (B_TRUE); 1600 if ((int64_t)zb->zb_object < 0) 1601 return (B_FALSE); 1602 1603 tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; 1604 tmp_dnp.dn_indblkshift = spc->spc_indblkshift; 1605 1606 if (zbookmark_subtree_completed(dnp, zb, last_zb)) 1607 return (B_TRUE); 1608 1609 return (B_FALSE); 1610 } 1611 1612 static void 1613 dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) 1614 { 1615 avl_index_t idx; 1616 dsl_scan_t *scn = spc->spc_scn; 1617 spa_t *spa = scn->scn_dp->dp_spa; 1618 scan_prefetch_issue_ctx_t *spic; 1619 1620 if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) 1621 return; 1622 1623 if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || 1624 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && 1625 BP_GET_TYPE(bp) != DMU_OT_OBJSET)) 1626 return; 1627 1628 if (dsl_scan_check_prefetch_resume(spc, zb)) 1629 return; 1630 1631 scan_prefetch_ctx_add_ref(spc, scn); 1632 spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); 1633 spic->spic_spc = spc; 1634 spic->spic_bp = *bp; 1635 spic->spic_zb = *zb; 1636 1637 /* 1638 * Add the IO to the queue of blocks to prefetch. 
This allows us to 1639 * prioritize blocks that we will need first for the main traversal 1640 * thread. 1641 */ 1642 mutex_enter(&spa->spa_scrub_lock); 1643 if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { 1644 /* this block is already queued for prefetch */ 1645 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); 1646 scan_prefetch_ctx_rele(spc, scn); 1647 mutex_exit(&spa->spa_scrub_lock); 1648 return; 1649 } 1650 1651 avl_insert(&scn->scn_prefetch_queue, spic, idx); 1652 cv_broadcast(&spa->spa_scrub_io_cv); 1653 mutex_exit(&spa->spa_scrub_lock); 1654 } 1655 1656 static void 1657 dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, 1658 uint64_t objset, uint64_t object) 1659 { 1660 int i; 1661 zbookmark_phys_t zb; 1662 scan_prefetch_ctx_t *spc; 1663 1664 if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) 1665 return; 1666 1667 SET_BOOKMARK(&zb, objset, object, 0, 0); 1668 1669 spc = scan_prefetch_ctx_create(scn, dnp, FTAG); 1670 1671 for (i = 0; i < dnp->dn_nblkptr; i++) { 1672 zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); 1673 zb.zb_blkid = i; 1674 dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); 1675 } 1676 1677 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 1678 zb.zb_level = 0; 1679 zb.zb_blkid = DMU_SPILL_BLKID; 1680 dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb); 1681 } 1682 1683 scan_prefetch_ctx_rele(spc, FTAG); 1684 } 1685 1686 static void 1687 dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 1688 arc_buf_t *buf, void *private) 1689 { 1690 (void) zio; 1691 scan_prefetch_ctx_t *spc = private; 1692 dsl_scan_t *scn = spc->spc_scn; 1693 spa_t *spa = scn->scn_dp->dp_spa; 1694 1695 /* broadcast that the IO has completed for rate limiting purposes */ 1696 mutex_enter(&spa->spa_scrub_lock); 1697 ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); 1698 spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); 1699 cv_broadcast(&spa->spa_scrub_io_cv); 1700 mutex_exit(&spa->spa_scrub_lock); 1701 1702 /* if there was an error or we are done prefetching, just cleanup */ 1703 if (buf == NULL || scn->scn_prefetch_stop) 1704 goto out; 1705 1706 if (BP_GET_LEVEL(bp) > 0) { 1707 int i; 1708 blkptr_t *cbp; 1709 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 1710 zbookmark_phys_t czb; 1711 1712 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { 1713 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 1714 zb->zb_level - 1, zb->zb_blkid * epb + i); 1715 dsl_scan_prefetch(spc, cbp, &czb); 1716 } 1717 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 1718 dnode_phys_t *cdnp; 1719 int i; 1720 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 1721 1722 for (i = 0, cdnp = buf->b_data; i < epb; 1723 i += cdnp->dn_extra_slots + 1, 1724 cdnp += cdnp->dn_extra_slots + 1) { 1725 dsl_scan_prefetch_dnode(scn, cdnp, 1726 zb->zb_objset, zb->zb_blkid * epb + i); 1727 } 1728 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 1729 objset_phys_t *osp = buf->b_data; 1730 1731 dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, 1732 zb->zb_objset, DMU_META_DNODE_OBJECT); 1733 1734 if (OBJSET_BUF_HAS_USERUSED(buf)) { 1735 dsl_scan_prefetch_dnode(scn, 1736 &osp->os_groupused_dnode, zb->zb_objset, 1737 DMU_GROUPUSED_OBJECT); 1738 dsl_scan_prefetch_dnode(scn, 1739 &osp->os_userused_dnode, zb->zb_objset, 1740 DMU_USERUSED_OBJECT); 1741 } 1742 } 1743 1744 out: 1745 if (buf != NULL) 1746 arc_buf_destroy(buf, private); 1747 scan_prefetch_ctx_rele(spc, scn); 1748 } 1749 1750 static void 1751 dsl_scan_prefetch_thread(void *arg) 1752 { 1753 dsl_scan_t *scn = arg; 1754 spa_t 
*spa = scn->scn_dp->dp_spa; 1755 scan_prefetch_issue_ctx_t *spic; 1756 1757 /* loop until we are told to stop */ 1758 while (!scn->scn_prefetch_stop) { 1759 arc_flags_t flags = ARC_FLAG_NOWAIT | 1760 ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; 1761 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; 1762 1763 mutex_enter(&spa->spa_scrub_lock); 1764 1765 /* 1766 * Wait until we have an IO to issue and are not above our 1767 * maximum in flight limit. 1768 */ 1769 while (!scn->scn_prefetch_stop && 1770 (avl_numnodes(&scn->scn_prefetch_queue) == 0 || 1771 spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { 1772 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 1773 } 1774 1775 /* recheck if we should stop since we waited for the cv */ 1776 if (scn->scn_prefetch_stop) { 1777 mutex_exit(&spa->spa_scrub_lock); 1778 break; 1779 } 1780 1781 /* remove the prefetch IO from the tree */ 1782 spic = avl_first(&scn->scn_prefetch_queue); 1783 spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); 1784 avl_remove(&scn->scn_prefetch_queue, spic); 1785 1786 mutex_exit(&spa->spa_scrub_lock); 1787 1788 if (BP_IS_PROTECTED(&spic->spic_bp)) { 1789 ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE || 1790 BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET); 1791 ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0); 1792 zio_flags |= ZIO_FLAG_RAW; 1793 } 1794 1795 /* issue the prefetch asynchronously */ 1796 (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, 1797 &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, 1798 ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); 1799 1800 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); 1801 } 1802 1803 ASSERT(scn->scn_prefetch_stop); 1804 1805 /* free any prefetches we didn't get to complete */ 1806 mutex_enter(&spa->spa_scrub_lock); 1807 while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { 1808 avl_remove(&scn->scn_prefetch_queue, spic); 1809 scan_prefetch_ctx_rele(spic->spic_spc, scn); 1810 kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); 1811 } 1812 ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); 1813 mutex_exit(&spa->spa_scrub_lock); 1814 } 1815 1816 static boolean_t 1817 dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, 1818 const zbookmark_phys_t *zb) 1819 { 1820 /* 1821 * We never skip over user/group accounting objects (obj<0) 1822 */ 1823 if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && 1824 (int64_t)zb->zb_object >= 0) { 1825 /* 1826 * If we already visited this bp & everything below (in 1827 * a prior txg sync), don't bother doing it again. 1828 */ 1829 if (zbookmark_subtree_completed(dnp, zb, 1830 &scn->scn_phys.scn_bookmark)) 1831 return (B_TRUE); 1832 1833 /* 1834 * If we found the block we're trying to resume from, or 1835 * we went past it, zero it out to indicate that it's OK 1836 * to start checking for suspending again. 
1837 */ 1838 if (zbookmark_subtree_tbd(dnp, zb, 1839 &scn->scn_phys.scn_bookmark)) { 1840 dprintf("resuming at %llx/%llx/%llx/%llx\n", 1841 (longlong_t)zb->zb_objset, 1842 (longlong_t)zb->zb_object, 1843 (longlong_t)zb->zb_level, 1844 (longlong_t)zb->zb_blkid); 1845 memset(&scn->scn_phys.scn_bookmark, 0, sizeof (*zb)); 1846 } 1847 } 1848 return (B_FALSE); 1849 } 1850 1851 static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, 1852 dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, 1853 dmu_objset_type_t ostype, dmu_tx_t *tx); 1854 inline __attribute__((always_inline)) static void dsl_scan_visitdnode( 1855 dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, 1856 dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); 1857 1858 /* 1859 * Return nonzero on i/o error. 1860 * Return new buf to write out in *bufp. 1861 */ 1862 inline __attribute__((always_inline)) static int 1863 dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, 1864 dnode_phys_t *dnp, const blkptr_t *bp, 1865 const zbookmark_phys_t *zb, dmu_tx_t *tx) 1866 { 1867 dsl_pool_t *dp = scn->scn_dp; 1868 spa_t *spa = dp->dp_spa; 1869 int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; 1870 int err; 1871 1872 ASSERT(!BP_IS_REDACTED(bp)); 1873 1874 /* 1875 * There is an unlikely case of encountering dnodes with contradicting 1876 * dn_bonuslen and DNODE_FLAG_SPILL_BLKPTR flag before in files created 1877 * or modified before commit 4254acb was merged. As it is not possible 1878 * to know which of the two is correct, report an error. 1879 */ 1880 if (dnp != NULL && 1881 dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { 1882 scn->scn_phys.scn_errors++; 1883 spa_log_error(spa, zb); 1884 return (SET_ERROR(EINVAL)); 1885 } 1886 1887 if (BP_GET_LEVEL(bp) > 0) { 1888 arc_flags_t flags = ARC_FLAG_WAIT; 1889 int i; 1890 blkptr_t *cbp; 1891 int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; 1892 arc_buf_t *buf; 1893 1894 err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, 1895 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); 1896 if (err) { 1897 scn->scn_phys.scn_errors++; 1898 return (err); 1899 } 1900 for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { 1901 zbookmark_phys_t czb; 1902 1903 SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, 1904 zb->zb_level - 1, 1905 zb->zb_blkid * epb + i); 1906 dsl_scan_visitbp(cbp, &czb, dnp, 1907 ds, scn, ostype, tx); 1908 } 1909 arc_buf_destroy(buf, &buf); 1910 } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { 1911 arc_flags_t flags = ARC_FLAG_WAIT; 1912 dnode_phys_t *cdnp; 1913 int i; 1914 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; 1915 arc_buf_t *buf; 1916 1917 if (BP_IS_PROTECTED(bp)) { 1918 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); 1919 zio_flags |= ZIO_FLAG_RAW; 1920 } 1921 1922 err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, 1923 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); 1924 if (err) { 1925 scn->scn_phys.scn_errors++; 1926 return (err); 1927 } 1928 for (i = 0, cdnp = buf->b_data; i < epb; 1929 i += cdnp->dn_extra_slots + 1, 1930 cdnp += cdnp->dn_extra_slots + 1) { 1931 dsl_scan_visitdnode(scn, ds, ostype, 1932 cdnp, zb->zb_blkid * epb + i, tx); 1933 } 1934 1935 arc_buf_destroy(buf, &buf); 1936 } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { 1937 arc_flags_t flags = ARC_FLAG_WAIT; 1938 objset_phys_t *osp; 1939 arc_buf_t *buf; 1940 1941 err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, 1942 ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); 1943 if (err) { 1944 scn->scn_phys.scn_errors++; 1945 return (err); 1946 } 1947 1948 osp = buf->b_data; 1949 1950 
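/* Visit the objset's meta-dnode first; the user/group/project accounting dnodes, when present, are handled just below. */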
dsl_scan_visitdnode(scn, ds, osp->os_type, 1951 &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); 1952 1953 if (OBJSET_BUF_HAS_USERUSED(buf)) { 1954 /* 1955 * We also always visit user/group/project accounting 1956 * objects, and never skip them, even if we are 1957 * suspending. This is necessary so that the 1958 * space deltas from this txg get integrated. 1959 */ 1960 if (OBJSET_BUF_HAS_PROJECTUSED(buf)) 1961 dsl_scan_visitdnode(scn, ds, osp->os_type, 1962 &osp->os_projectused_dnode, 1963 DMU_PROJECTUSED_OBJECT, tx); 1964 dsl_scan_visitdnode(scn, ds, osp->os_type, 1965 &osp->os_groupused_dnode, 1966 DMU_GROUPUSED_OBJECT, tx); 1967 dsl_scan_visitdnode(scn, ds, osp->os_type, 1968 &osp->os_userused_dnode, 1969 DMU_USERUSED_OBJECT, tx); 1970 } 1971 arc_buf_destroy(buf, &buf); 1972 } else if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { 1973 /* 1974 * Sanity check the block pointer contents, this is handled 1975 * by arc_read() for the cases above. 1976 */ 1977 scn->scn_phys.scn_errors++; 1978 spa_log_error(spa, zb); 1979 return (SET_ERROR(EINVAL)); 1980 } 1981 1982 return (0); 1983 } 1984 1985 inline __attribute__((always_inline)) static void 1986 dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, 1987 dmu_objset_type_t ostype, dnode_phys_t *dnp, 1988 uint64_t object, dmu_tx_t *tx) 1989 { 1990 int j; 1991 1992 for (j = 0; j < dnp->dn_nblkptr; j++) { 1993 zbookmark_phys_t czb; 1994 1995 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 1996 dnp->dn_nlevels - 1, j); 1997 dsl_scan_visitbp(&dnp->dn_blkptr[j], 1998 &czb, dnp, ds, scn, ostype, tx); 1999 } 2000 2001 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { 2002 zbookmark_phys_t czb; 2003 SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 2004 0, DMU_SPILL_BLKID); 2005 dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), 2006 &czb, dnp, ds, scn, ostype, tx); 2007 } 2008 } 2009 2010 /* 2011 * The arguments are in this order because mdb can only print the 2012 * first 5; we want them to be useful. 2013 */ 2014 static void 2015 dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, 2016 dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, 2017 dmu_objset_type_t ostype, dmu_tx_t *tx) 2018 { 2019 dsl_pool_t *dp = scn->scn_dp; 2020 blkptr_t *bp_toread = NULL; 2021 2022 if (dsl_scan_check_suspend(scn, zb)) 2023 return; 2024 2025 if (dsl_scan_check_resume(scn, dnp, zb)) 2026 return; 2027 2028 scn->scn_visited_this_txg++; 2029 2030 if (BP_IS_HOLE(bp)) { 2031 scn->scn_holes_this_txg++; 2032 return; 2033 } 2034 2035 if (BP_IS_REDACTED(bp)) { 2036 ASSERT(dsl_dataset_feature_is_active(ds, 2037 SPA_FEATURE_REDACTED_DATASETS)); 2038 return; 2039 } 2040 2041 /* 2042 * Check if this block contradicts any filesystem flags. 
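* For example, a block with LSIZE above SPA_OLD_MAXBLOCKSIZE can only exist if the large_blocks feature is active on this dataset; the asserts below check that each feature implied by the bp's size, checksum and compression is in fact active.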
2043 */ 2044 spa_feature_t f = SPA_FEATURE_LARGE_BLOCKS; 2045 if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) 2046 ASSERT(dsl_dataset_feature_is_active(ds, f)); 2047 2048 f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); 2049 if (f != SPA_FEATURE_NONE) 2050 ASSERT(dsl_dataset_feature_is_active(ds, f)); 2051 2052 f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); 2053 if (f != SPA_FEATURE_NONE) 2054 ASSERT(dsl_dataset_feature_is_active(ds, f)); 2055 2056 if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { 2057 scn->scn_lt_min_this_txg++; 2058 return; 2059 } 2060 2066 bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); 2067 *bp_toread = *bp; 2068 2069 if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) 2070 goto out; 2071 2072 /* 2073 * If dsl_scan_ddt() has already visited this block, it will have 2074 * already done any translations or scrubbing, so don't call the 2075 * callback again. 2076 */ 2077 if (ddt_class_contains(dp->dp_spa, 2078 scn->scn_phys.scn_ddt_class_max, bp)) { 2079 scn->scn_ddt_contained_this_txg++; 2080 goto out; 2081 } 2082 2083 /* 2084 * If this block is from the future (after cur_max_txg), then we 2085 * are doing this on behalf of a deleted snapshot, and we will 2086 * revisit the future block on the next pass of this dataset. 2087 * Don't scan it now unless we need to because something 2088 * under it was modified. 2089 */ 2090 if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { 2091 scn->scn_gt_max_this_txg++; 2092 goto out; 2093 } 2094 2095 scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); 2096 2097 out: 2098 kmem_free(bp_toread, sizeof (blkptr_t)); 2099 } 2100 2101 static void 2102 dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, 2103 dmu_tx_t *tx) 2104 { 2105 zbookmark_phys_t zb; 2106 scan_prefetch_ctx_t *spc; 2107 2108 SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, 2109 ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); 2110 2111 if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { 2112 SET_BOOKMARK(&scn->scn_prefetch_bookmark, 2113 zb.zb_objset, 0, 0, 0); 2114 } else { 2115 scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; 2116 } 2117 2118 scn->scn_objsets_visited_this_txg++; 2119 2120 spc = scan_prefetch_ctx_create(scn, NULL, FTAG); 2121 dsl_scan_prefetch(spc, bp, &zb); 2122 scan_prefetch_ctx_rele(spc, FTAG); 2123 2124 dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); 2125 2126 dprintf_ds(ds, "finished scan%s", ""); 2127 } 2128 2129 static void 2130 ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) 2131 { 2132 if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { 2133 if (ds->ds_is_snapshot) { 2134 /* 2135 * Note: 2136 * - scn_cur_{min,max}_txg stays the same. 2137 * - Setting the flag is not really necessary if 2138 * scn_cur_max_txg == scn_max_txg, because there 2139 * is nothing after this snapshot that we care 2140 * about. However, we set it anyway and then 2141 * ignore it when we retraverse it in 2142 * dsl_scan_visitds().
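* (See the DSF_VISIT_DS_AGAIN handling in dsl_scan_visitds(), which clears the flag and re-queues the dataset for another pass.)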
2143 */ 2144 scn_phys->scn_bookmark.zb_objset = 2145 dsl_dataset_phys(ds)->ds_next_snap_obj; 2146 zfs_dbgmsg("destroying ds %llu on %s; currently " 2147 "traversing; reset zb_objset to %llu", 2148 (u_longlong_t)ds->ds_object, 2149 ds->ds_dir->dd_pool->dp_spa->spa_name, 2150 (u_longlong_t)dsl_dataset_phys(ds)-> 2151 ds_next_snap_obj); 2152 scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; 2153 } else { 2154 SET_BOOKMARK(&scn_phys->scn_bookmark, 2155 ZB_DESTROYED_OBJSET, 0, 0, 0); 2156 zfs_dbgmsg("destroying ds %llu on %s; currently " 2157 "traversing; reset bookmark to -1,0,0,0", 2158 (u_longlong_t)ds->ds_object, 2159 ds->ds_dir->dd_pool->dp_spa->spa_name); 2160 } 2161 } 2162 } 2163 2164 /* 2165 * Invoked when a dataset is destroyed. We need to make sure that: 2166 * 2167 * 1) If it is the dataset that was currently being scanned, we write 2168 * a new dsl_scan_phys_t and marking the objset reference in it 2169 * as destroyed. 2170 * 2) Remove it from the work queue, if it was present. 2171 * 2172 * If the dataset was actually a snapshot, instead of marking the dataset 2173 * as destroyed, we instead substitute the next snapshot in line. 2174 */ 2175 void 2176 dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) 2177 { 2178 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2179 dsl_scan_t *scn = dp->dp_scan; 2180 uint64_t mintxg; 2181 2182 if (!dsl_scan_is_running(scn)) 2183 return; 2184 2185 ds_destroyed_scn_phys(ds, &scn->scn_phys); 2186 ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); 2187 2188 if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { 2189 scan_ds_queue_remove(scn, ds->ds_object); 2190 if (ds->ds_is_snapshot) 2191 scan_ds_queue_insert(scn, 2192 dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); 2193 } 2194 2195 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 2196 ds->ds_object, &mintxg) == 0) { 2197 ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); 2198 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2199 scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); 2200 if (ds->ds_is_snapshot) { 2201 /* 2202 * We keep the same mintxg; it could be > 2203 * ds_creation_txg if the previous snapshot was 2204 * deleted too. 2205 */ 2206 VERIFY(zap_add_int_key(dp->dp_meta_objset, 2207 scn->scn_phys.scn_queue_obj, 2208 dsl_dataset_phys(ds)->ds_next_snap_obj, 2209 mintxg, tx) == 0); 2210 zfs_dbgmsg("destroying ds %llu on %s; in queue; " 2211 "replacing with %llu", 2212 (u_longlong_t)ds->ds_object, 2213 dp->dp_spa->spa_name, 2214 (u_longlong_t)dsl_dataset_phys(ds)-> 2215 ds_next_snap_obj); 2216 } else { 2217 zfs_dbgmsg("destroying ds %llu on %s; in queue; " 2218 "removing", 2219 (u_longlong_t)ds->ds_object, 2220 dp->dp_spa->spa_name); 2221 } 2222 } 2223 2224 /* 2225 * dsl_scan_sync() should be called after this, and should sync 2226 * out our changed state, but just to be safe, do it here. 2227 */ 2228 dsl_scan_sync_state(scn, tx, SYNC_CACHED); 2229 } 2230 2231 static void 2232 ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) 2233 { 2234 if (scn_bookmark->zb_objset == ds->ds_object) { 2235 scn_bookmark->zb_objset = 2236 dsl_dataset_phys(ds)->ds_prev_snap_obj; 2237 zfs_dbgmsg("snapshotting ds %llu on %s; currently traversing; " 2238 "reset zb_objset to %llu", 2239 (u_longlong_t)ds->ds_object, 2240 ds->ds_dir->dd_pool->dp_spa->spa_name, 2241 (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); 2242 } 2243 } 2244 2245 /* 2246 * Called when a dataset is snapshotted. 
If we were currently traversing 2247 * this snapshot, we reset our bookmark to point at the newly created 2248 * snapshot. We also modify our work queue to remove the old snapshot and 2249 * replace with the new one. 2250 */ 2251 void 2252 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) 2253 { 2254 dsl_pool_t *dp = ds->ds_dir->dd_pool; 2255 dsl_scan_t *scn = dp->dp_scan; 2256 uint64_t mintxg; 2257 2258 if (!dsl_scan_is_running(scn)) 2259 return; 2260 2261 ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); 2262 2263 ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); 2264 ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); 2265 2266 if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { 2267 scan_ds_queue_remove(scn, ds->ds_object); 2268 scan_ds_queue_insert(scn, 2269 dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); 2270 } 2271 2272 if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, 2273 ds->ds_object, &mintxg) == 0) { 2274 VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, 2275 scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); 2276 VERIFY(zap_add_int_key(dp->dp_meta_objset, 2277 scn->scn_phys.scn_queue_obj, 2278 dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); 2279 zfs_dbgmsg("snapshotting ds %llu on %s; in queue; " 2280 "replacing with %llu", 2281 (u_longlong_t)ds->ds_object, 2282 dp->dp_spa->spa_name, 2283 (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); 2284 } 2285 2286 dsl_scan_sync_state(scn, tx, SYNC_CACHED); 2287 } 2288 2289 static void 2290 ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, 2291 zbookmark_phys_t *scn_bookmark) 2292 { 2293 if (scn_bookmark->zb_objset == ds1->ds_object) { 2294 scn_bookmark->zb_objset = ds2->ds_object; 2295 zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " 2296 "reset zb_objset to %llu", 2297 (u_longlong_t)ds1->ds_object, 2298 ds1->ds_dir->dd_pool->dp_spa->spa_name, 2299 (u_longlong_t)ds2->ds_object); 2300 } else if (scn_bookmark->zb_objset == ds2->ds_object) { 2301 scn_bookmark->zb_objset = ds1->ds_object; 2302 zfs_dbgmsg("clone_swap ds %llu on %s; currently traversing; " 2303 "reset zb_objset to %llu", 2304 (u_longlong_t)ds2->ds_object, 2305 ds2->ds_dir->dd_pool->dp_spa->spa_name, 2306 (u_longlong_t)ds1->ds_object); 2307 } 2308 } 2309 2310 /* 2311 * Called when an origin dataset and its clone are swapped. If we were 2312 * currently traversing the dataset, we need to switch to traversing the 2313 * newly promoted clone. 2314 */ 2315 void 2316 dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) 2317 { 2318 dsl_pool_t *dp = ds1->ds_dir->dd_pool; 2319 dsl_scan_t *scn = dp->dp_scan; 2320 uint64_t mintxg1, mintxg2; 2321 boolean_t ds1_queued, ds2_queued; 2322 2323 if (!dsl_scan_is_running(scn)) 2324 return; 2325 2326 ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); 2327 ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); 2328 2329 /* 2330 * Handle the in-memory scan queue. 2331 */ 2332 ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); 2333 ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); 2334 2335 /* Sanity checking. 
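A queued dataset's recorded mintxg should match the prev_snap_txg shared by the two swapped datasets.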
*/ 2336 if (ds1_queued) { 2337 ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 2338 ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 2339 } 2340 if (ds2_queued) { 2341 ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 2342 ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 2343 } 2344 2345 if (ds1_queued && ds2_queued) { 2346 /* 2347 * If both are queued, we don't need to do anything. 2348 * The swapping code below would not handle this case correctly, 2349 * since we can't insert ds2 if it is already there. That's 2350 * because scan_ds_queue_insert() prohibits a duplicate insert 2351 * and panics. 2352 */ 2353 } else if (ds1_queued) { 2354 scan_ds_queue_remove(scn, ds1->ds_object); 2355 scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); 2356 } else if (ds2_queued) { 2357 scan_ds_queue_remove(scn, ds2->ds_object); 2358 scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); 2359 } 2360 2361 /* 2362 * Handle the on-disk scan queue. 2363 * The on-disk state is an out-of-date version of the in-memory state, 2364 * so the in-memory and on-disk values for ds1_queued and ds2_queued may 2365 * be different. Therefore we need to apply the swap logic to the 2366 * on-disk state independently of the in-memory state. 2367 */ 2368 ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, 2369 scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; 2370 ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, 2371 scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; 2372 2373 /* Sanity checking. */ 2374 if (ds1_queued) { 2375 ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 2376 ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 2377 } 2378 if (ds2_queued) { 2379 ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); 2380 ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); 2381 } 2382 2383 if (ds1_queued && ds2_queued) { 2384 /* 2385 * If both are queued, we don't need to do anything. 2386 * Alternatively, we could check for EEXIST from 2387 * zap_add_int_key() and back out to the original state, but 2388 * that would be more work than checking for this case upfront. 
2389 */ 2390 } else if (ds1_queued) { 2391 VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, 2392 scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); 2393 VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, 2394 scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); 2395 zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " 2396 "replacing with %llu", 2397 (u_longlong_t)ds1->ds_object, 2398 dp->dp_spa->spa_name, 2399 (u_longlong_t)ds2->ds_object); 2400 } else if (ds2_queued) { 2401 VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, 2402 scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); 2403 VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, 2404 scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); 2405 zfs_dbgmsg("clone_swap ds %llu on %s; in queue; " 2406 "replacing with %llu", 2407 (u_longlong_t)ds2->ds_object, 2408 dp->dp_spa->spa_name, 2409 (u_longlong_t)ds1->ds_object); 2410 } 2411 2412 dsl_scan_sync_state(scn, tx, SYNC_CACHED); 2413 } 2414 2415 static int 2416 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 2417 { 2418 uint64_t originobj = *(uint64_t *)arg; 2419 dsl_dataset_t *ds; 2420 int err; 2421 dsl_scan_t *scn = dp->dp_scan; 2422 2423 if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) 2424 return (0); 2425 2426 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 2427 if (err) 2428 return (err); 2429 2430 while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { 2431 dsl_dataset_t *prev; 2432 err = dsl_dataset_hold_obj(dp, 2433 dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); 2434 2435 dsl_dataset_rele(ds, FTAG); 2436 if (err) 2437 return (err); 2438 ds = prev; 2439 } 2440 scan_ds_queue_insert(scn, ds->ds_object, 2441 dsl_dataset_phys(ds)->ds_prev_snap_txg); 2442 dsl_dataset_rele(ds, FTAG); 2443 return (0); 2444 } 2445 2446 static void 2447 dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) 2448 { 2449 dsl_pool_t *dp = scn->scn_dp; 2450 dsl_dataset_t *ds; 2451 2452 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 2453 2454 if (scn->scn_phys.scn_cur_min_txg >= 2455 scn->scn_phys.scn_max_txg) { 2456 /* 2457 * This can happen if this snapshot was created after the 2458 * scan started, and we already completed a previous snapshot 2459 * that was created after the scan started. This snapshot 2460 * only references blocks with: 2461 * 2462 * birth < our ds_creation_txg 2463 * cur_min_txg is no less than ds_creation_txg. 2464 * We have already visited these blocks. 2465 * or 2466 * birth > scn_max_txg 2467 * The scan requested not to visit these blocks. 2468 * 2469 * Subsequent snapshots (and clones) can reference our 2470 * blocks, or blocks with even higher birth times. 2471 * Therefore we do not need to visit them either, 2472 * so we do not add them to the work queue. 2473 * 2474 * Note that checking for cur_min_txg >= cur_max_txg 2475 * is not sufficient, because in that case we may need to 2476 * visit subsequent snapshots. This happens when min_txg > 0, 2477 * which raises cur_min_txg. In this case we will visit 2478 * this dataset but skip all of its blocks, because the 2479 * rootbp's birth time is < cur_min_txg. Then we will 2480 * add the next snapshots/clones to the work queue. 
2481 */ 2482 char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 2483 dsl_dataset_name(ds, dsname); 2484 zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " 2485 "cur_min_txg (%llu) >= max_txg (%llu)", 2486 (longlong_t)dsobj, dsname, 2487 (longlong_t)scn->scn_phys.scn_cur_min_txg, 2488 (longlong_t)scn->scn_phys.scn_max_txg); 2489 kmem_free(dsname, MAXNAMELEN); 2490 2491 goto out; 2492 } 2493 2494 /* 2495 * Only the ZIL in the head (non-snapshot) is valid. Even though 2496 * snapshots can have ZIL block pointers (which may be the same 2497 * BP as in the head), they must be ignored. In addition, $ORIGIN 2498 * doesn't have a objset (i.e. its ds_bp is a hole) so we don't 2499 * need to look for a ZIL in it either. So we traverse the ZIL here, 2500 * rather than in scan_recurse(), because the regular snapshot 2501 * block-sharing rules don't apply to it. 2502 */ 2503 if (!dsl_dataset_is_snapshot(ds) && 2504 (dp->dp_origin_snap == NULL || 2505 ds->ds_dir != dp->dp_origin_snap->ds_dir)) { 2506 objset_t *os; 2507 if (dmu_objset_from_ds(ds, &os) != 0) { 2508 goto out; 2509 } 2510 dsl_scan_zil(dp, &os->os_zil_header); 2511 } 2512 2513 /* 2514 * Iterate over the bps in this ds. 2515 */ 2516 dmu_buf_will_dirty(ds->ds_dbuf, tx); 2517 rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); 2518 dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); 2519 rrw_exit(&ds->ds_bp_rwlock, FTAG); 2520 2521 char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); 2522 dsl_dataset_name(ds, dsname); 2523 zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " 2524 "suspending=%u", 2525 (longlong_t)dsobj, dsname, 2526 (longlong_t)scn->scn_phys.scn_cur_min_txg, 2527 (longlong_t)scn->scn_phys.scn_cur_max_txg, 2528 (int)scn->scn_suspending); 2529 kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); 2530 2531 if (scn->scn_suspending) 2532 goto out; 2533 2534 /* 2535 * We've finished this pass over this dataset. 2536 */ 2537 2538 /* 2539 * If we did not completely visit this dataset, do another pass. 2540 */ 2541 if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { 2542 zfs_dbgmsg("incomplete pass on %s; visiting again", 2543 dp->dp_spa->spa_name); 2544 scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; 2545 scan_ds_queue_insert(scn, ds->ds_object, 2546 scn->scn_phys.scn_cur_max_txg); 2547 goto out; 2548 } 2549 2550 /* 2551 * Add descendant datasets to work queue. 2552 */ 2553 if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { 2554 scan_ds_queue_insert(scn, 2555 dsl_dataset_phys(ds)->ds_next_snap_obj, 2556 dsl_dataset_phys(ds)->ds_creation_txg); 2557 } 2558 if (dsl_dataset_phys(ds)->ds_num_children > 1) { 2559 boolean_t usenext = B_FALSE; 2560 if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { 2561 uint64_t count; 2562 /* 2563 * A bug in a previous version of the code could 2564 * cause upgrade_clones_cb() to not set 2565 * ds_next_snap_obj when it should, leading to a 2566 * missing entry. Therefore we can only use the 2567 * next_clones_obj when its count is correct. 
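* (That is, when zap_count() below reports exactly ds_num_children - 1 entries.)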
2568 */ 2569 int err = zap_count(dp->dp_meta_objset, 2570 dsl_dataset_phys(ds)->ds_next_clones_obj, &count); 2571 if (err == 0 && 2572 count == dsl_dataset_phys(ds)->ds_num_children - 1) 2573 usenext = B_TRUE; 2574 } 2575 2576 if (usenext) { 2577 zap_cursor_t zc; 2578 zap_attribute_t za; 2579 for (zap_cursor_init(&zc, dp->dp_meta_objset, 2580 dsl_dataset_phys(ds)->ds_next_clones_obj); 2581 zap_cursor_retrieve(&zc, &za) == 0; 2582 (void) zap_cursor_advance(&zc)) { 2583 scan_ds_queue_insert(scn, 2584 zfs_strtonum(za.za_name, NULL), 2585 dsl_dataset_phys(ds)->ds_creation_txg); 2586 } 2587 zap_cursor_fini(&zc); 2588 } else { 2589 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2590 enqueue_clones_cb, &ds->ds_object, 2591 DS_FIND_CHILDREN)); 2592 } 2593 } 2594 2595 out: 2596 dsl_dataset_rele(ds, FTAG); 2597 } 2598 2599 static int 2600 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) 2601 { 2602 (void) arg; 2603 dsl_dataset_t *ds; 2604 int err; 2605 dsl_scan_t *scn = dp->dp_scan; 2606 2607 err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); 2608 if (err) 2609 return (err); 2610 2611 while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { 2612 dsl_dataset_t *prev; 2613 err = dsl_dataset_hold_obj(dp, 2614 dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); 2615 if (err) { 2616 dsl_dataset_rele(ds, FTAG); 2617 return (err); 2618 } 2619 2620 /* 2621 * If this is a clone, we don't need to worry about it for now. 2622 */ 2623 if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { 2624 dsl_dataset_rele(ds, FTAG); 2625 dsl_dataset_rele(prev, FTAG); 2626 return (0); 2627 } 2628 dsl_dataset_rele(ds, FTAG); 2629 ds = prev; 2630 } 2631 2632 scan_ds_queue_insert(scn, ds->ds_object, 2633 dsl_dataset_phys(ds)->ds_prev_snap_txg); 2634 dsl_dataset_rele(ds, FTAG); 2635 return (0); 2636 } 2637 2638 void 2639 dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, 2640 ddt_entry_t *dde, dmu_tx_t *tx) 2641 { 2642 (void) tx; 2643 const ddt_key_t *ddk = &dde->dde_key; 2644 ddt_phys_t *ddp = dde->dde_phys; 2645 blkptr_t bp; 2646 zbookmark_phys_t zb = { 0 }; 2647 2648 if (!dsl_scan_is_running(scn)) 2649 return; 2650 2651 /* 2652 * This function is special because it is the only thing 2653 * that can add scan_io_t's to the vdev scan queues from 2654 * outside dsl_scan_sync(). For the most part this is ok 2655 * as long as it is called from within syncing context. 2656 * However, dsl_scan_sync() expects that no new sio's will 2657 * be added between when all the work for a scan is done 2658 * and the next txg when the scan is actually marked as 2659 * completed. This check ensures we do not issue new sio's 2660 * during this period. 2661 */ 2662 if (scn->scn_done_txg != 0) 2663 return; 2664 2665 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 2666 if (ddp->ddp_phys_birth == 0 || 2667 ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) 2668 continue; 2669 ddt_bp_create(checksum, ddk, ddp, &bp); 2670 2671 scn->scn_visited_this_txg++; 2672 scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); 2673 } 2674 } 2675 2676 /* 2677 * Scrub/dedup interaction. 2678 * 2679 * If there are N references to a deduped block, we don't want to scrub it 2680 * N times -- ideally, we should scrub it exactly once. 2681 * 2682 * We leverage the fact that the dde's replication class (enum ddt_class) 2683 * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest 2684 * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. 
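* (dsl_scan_ddt() below performs that walk; the later top-down traversal skips blocks in the already-scanned classes via the ddt_class_contains() check in dsl_scan_visitbp().)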
2685 * 2686 * To prevent excess scrubbing, the scrub begins by walking the DDT 2687 * to find all blocks with refcnt > 1, and scrubs each of these once. 2688 * Since there are two replication classes which contain blocks with 2689 * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. 2690 * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. 2691 * 2692 * There would be nothing more to say if a block's refcnt couldn't change 2693 * during a scrub, but of course it can so we must account for changes 2694 * in a block's replication class. 2695 * 2696 * Here's an example of what can occur: 2697 * 2698 * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 2699 * when visited during the top-down scrub phase, it will be scrubbed twice. 2700 * This negates our scrub optimization, but is otherwise harmless. 2701 * 2702 * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 2703 * on each visit during the top-down scrub phase, it will never be scrubbed. 2704 * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's 2705 * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to 2706 * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 2707 * while a scrub is in progress, it scrubs the block right then. 2708 */ 2709 static void 2710 dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) 2711 { 2712 ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; 2713 ddt_entry_t dde = {{{{0}}}}; 2714 int error; 2715 uint64_t n = 0; 2716 2717 while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { 2718 ddt_t *ddt; 2719 2720 if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) 2721 break; 2722 dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", 2723 (longlong_t)ddb->ddb_class, 2724 (longlong_t)ddb->ddb_type, 2725 (longlong_t)ddb->ddb_checksum, 2726 (longlong_t)ddb->ddb_cursor); 2727 2728 /* There should be no pending changes to the dedup table */ 2729 ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; 2730 ASSERT(avl_first(&ddt->ddt_tree) == NULL); 2731 2732 dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); 2733 n++; 2734 2735 if (dsl_scan_check_suspend(scn, NULL)) 2736 break; 2737 } 2738 2739 zfs_dbgmsg("scanned %llu ddt entries on %s with class_max = %u; " 2740 "suspending=%u", (longlong_t)n, scn->scn_dp->dp_spa->spa_name, 2741 (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending); 2742 2743 ASSERT(error == 0 || error == ENOENT); 2744 ASSERT(error != ENOENT || 2745 ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); 2746 } 2747 2748 static uint64_t 2749 dsl_scan_ds_maxtxg(dsl_dataset_t *ds) 2750 { 2751 uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; 2752 if (ds->ds_is_snapshot) 2753 return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); 2754 return (smt); 2755 } 2756 2757 static void 2758 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) 2759 { 2760 scan_ds_t *sds; 2761 dsl_pool_t *dp = scn->scn_dp; 2762 2763 if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= 2764 scn->scn_phys.scn_ddt_class_max) { 2765 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 2766 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 2767 dsl_scan_ddt(scn, tx); 2768 if (scn->scn_suspending) 2769 return; 2770 } 2771 2772 if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { 2773 /* First do the MOS & ORIGIN */ 2774 2775 scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; 2776 scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; 2777 
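/* A NULL dataset here means the MOS (meta-objset). */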
dsl_scan_visit_rootbp(scn, NULL, 2778 &dp->dp_meta_rootbp, tx); 2779 spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); 2780 if (scn->scn_suspending) 2781 return; 2782 2783 if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { 2784 VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2785 enqueue_cb, NULL, DS_FIND_CHILDREN)); 2786 } else { 2787 dsl_scan_visitds(scn, 2788 dp->dp_origin_snap->ds_object, tx); 2789 } 2790 ASSERT(!scn->scn_suspending); 2791 } else if (scn->scn_phys.scn_bookmark.zb_objset != 2792 ZB_DESTROYED_OBJSET) { 2793 uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; 2794 /* 2795 * If we were suspended, continue from here. Note if the 2796 * ds we were suspended on was deleted, the zb_objset may 2797 * be -1, so we will skip this and find a new objset 2798 * below. 2799 */ 2800 dsl_scan_visitds(scn, dsobj, tx); 2801 if (scn->scn_suspending) 2802 return; 2803 } 2804 2805 /* 2806 * In case we suspended right at the end of the ds, zero the 2807 * bookmark so we don't think that we're still trying to resume. 2808 */ 2809 memset(&scn->scn_phys.scn_bookmark, 0, sizeof (zbookmark_phys_t)); 2810 2811 /* 2812 * Keep pulling things out of the dataset avl queue. Updates to the 2813 * persistent zap-object-as-queue happen only at checkpoints. 2814 */ 2815 while ((sds = avl_first(&scn->scn_queue)) != NULL) { 2816 dsl_dataset_t *ds; 2817 uint64_t dsobj = sds->sds_dsobj; 2818 uint64_t txg = sds->sds_txg; 2819 2820 /* dequeue and free the ds from the queue */ 2821 scan_ds_queue_remove(scn, dsobj); 2822 sds = NULL; 2823 2824 /* set up min / max txg */ 2825 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); 2826 if (txg != 0) { 2827 scn->scn_phys.scn_cur_min_txg = 2828 MAX(scn->scn_phys.scn_min_txg, txg); 2829 } else { 2830 scn->scn_phys.scn_cur_min_txg = 2831 MAX(scn->scn_phys.scn_min_txg, 2832 dsl_dataset_phys(ds)->ds_prev_snap_txg); 2833 } 2834 scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); 2835 dsl_dataset_rele(ds, FTAG); 2836 2837 dsl_scan_visitds(scn, dsobj, tx); 2838 if (scn->scn_suspending) 2839 return; 2840 } 2841 2842 /* No more objsets to fetch, we're done */ 2843 scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; 2844 ASSERT0(scn->scn_suspending); 2845 } 2846 2847 static uint64_t 2848 dsl_scan_count_data_disks(spa_t *spa) 2849 { 2850 vdev_t *rvd = spa->spa_root_vdev; 2851 uint64_t i, leaves = 0; 2852 2853 for (i = 0; i < rvd->vdev_children; i++) { 2854 vdev_t *vd = rvd->vdev_child[i]; 2855 if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache) 2856 continue; 2857 leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd); 2858 } 2859 return (leaves); 2860 } 2861 2862 static void 2863 scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) 2864 { 2865 int i; 2866 uint64_t cur_size = 0; 2867 2868 for (i = 0; i < BP_GET_NDVAS(bp); i++) { 2869 cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); 2870 } 2871 2872 q->q_total_zio_size_this_txg += cur_size; 2873 q->q_zios_this_txg++; 2874 } 2875 2876 static void 2877 scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, 2878 uint64_t end) 2879 { 2880 q->q_total_seg_size_this_txg += end - start; 2881 q->q_segs_this_txg++; 2882 } 2883 2884 static boolean_t 2885 scan_io_queue_check_suspend(dsl_scan_t *scn) 2886 { 2887 /* See comment in dsl_scan_check_suspend() */ 2888 uint64_t curr_time_ns = gethrtime(); 2889 uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; 2890 uint64_t sync_time_ns = curr_time_ns - 2891 scn->scn_dp->dp_spa->spa_sync_starttime; 2892 uint64_t 
dirty_min_bytes = zfs_dirty_data_max * 2893 zfs_vdev_async_write_active_min_dirty_percent / 100; 2894 uint_t mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 2895 zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; 2896 2897 return ((NSEC2MSEC(scan_time_ns) > mintime && 2898 (scn->scn_dp->dp_dirty_total >= dirty_min_bytes || 2899 txg_sync_waiting(scn->scn_dp) || 2900 NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || 2901 spa_shutting_down(scn->scn_dp->dp_spa)); 2902 } 2903 2904 /* 2905 * Given a list of scan_io_t's in io_list, this issues the I/Os out to 2906 * disk. This consumes the io_list and frees the scan_io_t's. This is 2907 * called when emptying queues, either when we're up against the memory 2908 * limit or when we have finished scanning. Returns B_TRUE if we stopped 2909 * processing the list before we finished. Any sios that were not issued 2910 * will remain in the io_list. 2911 */ 2912 static boolean_t 2913 scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) 2914 { 2915 dsl_scan_t *scn = queue->q_scn; 2916 scan_io_t *sio; 2917 boolean_t suspended = B_FALSE; 2918 2919 while ((sio = list_head(io_list)) != NULL) { 2920 blkptr_t bp; 2921 2922 if (scan_io_queue_check_suspend(scn)) { 2923 suspended = B_TRUE; 2924 break; 2925 } 2926 2927 sio2bp(sio, &bp); 2928 scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, 2929 &sio->sio_zb, queue); 2930 (void) list_remove_head(io_list); 2931 scan_io_queues_update_zio_stats(queue, &bp); 2932 sio_free(sio); 2933 } 2934 return (suspended); 2935 } 2936 2937 /* 2938 * This function removes sios from an IO queue which reside within a given 2939 * range_seg_t and inserts them (in offset order) into a list. Note that 2940 * we only ever return a maximum of 32 sios at once. If there are more sios 2941 * to process within this segment that did not make it onto the list we 2942 * return B_TRUE and otherwise B_FALSE. 2943 */ 2944 static boolean_t 2945 scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) 2946 { 2947 scan_io_t *srch_sio, *sio, *next_sio; 2948 avl_index_t idx; 2949 uint_t num_sios = 0; 2950 int64_t bytes_issued = 0; 2951 2952 ASSERT(rs != NULL); 2953 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 2954 2955 srch_sio = sio_alloc(1); 2956 srch_sio->sio_nr_dvas = 1; 2957 SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); 2958 2959 /* 2960 * The exact start of the extent might not contain any matching zios, 2961 * so if that's the case, examine the next one in the tree. 2962 */ 2963 sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); 2964 sio_free(srch_sio); 2965 2966 if (sio == NULL) 2967 sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); 2968 2969 while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, 2970 queue->q_exts_by_addr) && num_sios <= 32) { 2971 ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, 2972 queue->q_exts_by_addr)); 2973 ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, 2974 queue->q_exts_by_addr)); 2975 2976 next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); 2977 avl_remove(&queue->q_sios_by_addr, sio); 2978 if (avl_is_empty(&queue->q_sios_by_addr)) 2979 atomic_add_64(&queue->q_scn->scn_queues_pending, -1); 2980 queue->q_sio_memused -= SIO_GET_MUSED(sio); 2981 2982 bytes_issued += SIO_GET_ASIZE(sio); 2983 num_sios++; 2984 list_insert_tail(list, sio); 2985 sio = next_sio; 2986 } 2987 2988 /* 2989 * We limit the number of sios we process at once to 32 to avoid 2990 * biting off more than we can chew. 
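Any sios beyond that limit are left in the q_sios_by_addr tree for a later gather pass.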
If we didn't take everything 2991 * in the segment we update it to reflect the work we were able to 2992 * complete. Otherwise, we remove it from the range tree entirely. 2993 */ 2994 if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, 2995 queue->q_exts_by_addr)) { 2996 range_tree_adjust_fill(queue->q_exts_by_addr, rs, 2997 -bytes_issued); 2998 range_tree_resize_segment(queue->q_exts_by_addr, rs, 2999 SIO_GET_OFFSET(sio), rs_get_end(rs, 3000 queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); 3001 queue->q_last_ext_addr = SIO_GET_OFFSET(sio); 3002 return (B_TRUE); 3003 } else { 3004 uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); 3005 uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); 3006 range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); 3007 queue->q_last_ext_addr = -1; 3008 return (B_FALSE); 3009 } 3010 } 3011 3012 /* 3013 * This is called from the queue emptying thread and selects the next 3014 * extent from which we are to issue I/Os. The behavior of this function 3015 * depends on the state of the scan, the current memory consumption and 3016 * whether or not we are performing a scan shutdown. 3017 * 1) We select extents in an elevator algorithm (LBA-order) if the scan 3018 * needs to perform a checkpoint 3019 * 2) We select the largest available extent if we are up against the 3020 * memory limit. 3021 * 3) Otherwise we don't select any extents. 3022 */ 3023 static range_seg_t * 3024 scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) 3025 { 3026 dsl_scan_t *scn = queue->q_scn; 3027 range_tree_t *rt = queue->q_exts_by_addr; 3028 3029 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 3030 ASSERT(scn->scn_is_sorted); 3031 3032 if (!scn->scn_checkpointing && !scn->scn_clearing) 3033 return (NULL); 3034 3035 /* 3036 * During normal clearing, we want to issue our largest segments 3037 * first, keeping IO as sequential as possible, and leaving the 3038 * smaller extents for later with the hope that they might eventually 3039 * grow to larger sequential segments. However, when the scan is 3040 * checkpointing, no new extents will be added to the sorting queue, 3041 * so the way we are sorted now is as good as it will ever get. 3042 * In this case, we instead switch to issuing extents in LBA order. 3043 */ 3044 if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) || 3045 zfs_scan_issue_strategy == 1) 3046 return (range_tree_first(rt)); 3047 3048 /* 3049 * Try to continue previous extent if it is not completed yet. After 3050 * shrink in scan_io_queue_gather() it may no longer be the best, but 3051 * otherwise we leave shorter remnant every txg. 3052 */ 3053 uint64_t start; 3054 uint64_t size = 1ULL << rt->rt_shift; 3055 range_seg_t *addr_rs; 3056 if (queue->q_last_ext_addr != -1) { 3057 start = queue->q_last_ext_addr; 3058 addr_rs = range_tree_find(rt, start, size); 3059 if (addr_rs != NULL) 3060 return (addr_rs); 3061 } 3062 3063 /* 3064 * Nothing to continue, so find new best extent. 3065 */ 3066 uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL); 3067 if (v == NULL) 3068 return (NULL); 3069 queue->q_last_ext_addr = start = *v << rt->rt_shift; 3070 3071 /* 3072 * We need to get the original entry in the by_addr tree so we can 3073 * modify it. 
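* (The by_size btree stores only shifted offsets, so we translate the chosen offset back into a range_seg_t with range_tree_find().)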
3074 */ 3075 addr_rs = range_tree_find(rt, start, size); 3076 ASSERT3P(addr_rs, !=, NULL); 3077 ASSERT3U(rs_get_start(addr_rs, rt), ==, start); 3078 ASSERT3U(rs_get_end(addr_rs, rt), >, start); 3079 return (addr_rs); 3080 } 3081 3082 static void 3083 scan_io_queues_run_one(void *arg) 3084 { 3085 dsl_scan_io_queue_t *queue = arg; 3086 kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; 3087 boolean_t suspended = B_FALSE; 3088 range_seg_t *rs; 3089 scan_io_t *sio; 3090 zio_t *zio; 3091 list_t sio_list; 3092 3093 ASSERT(queue->q_scn->scn_is_sorted); 3094 3095 list_create(&sio_list, sizeof (scan_io_t), 3096 offsetof(scan_io_t, sio_nodes.sio_list_node)); 3097 zio = zio_null(queue->q_scn->scn_zio_root, queue->q_scn->scn_dp->dp_spa, 3098 NULL, NULL, NULL, ZIO_FLAG_CANFAIL); 3099 mutex_enter(q_lock); 3100 queue->q_zio = zio; 3101 3102 /* Calculate maximum in-flight bytes for this vdev. */ 3103 queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit * 3104 (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd))); 3105 3106 /* reset per-queue scan statistics for this txg */ 3107 queue->q_total_seg_size_this_txg = 0; 3108 queue->q_segs_this_txg = 0; 3109 queue->q_total_zio_size_this_txg = 0; 3110 queue->q_zios_this_txg = 0; 3111 3112 /* loop until we run out of time or sios */ 3113 while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { 3114 uint64_t seg_start = 0, seg_end = 0; 3115 boolean_t more_left; 3116 3117 ASSERT(list_is_empty(&sio_list)); 3118 3119 /* loop while we still have sios left to process in this rs */ 3120 do { 3121 scan_io_t *first_sio, *last_sio; 3122 3123 /* 3124 * We have selected which extent needs to be 3125 * processed next. Gather up the corresponding sios. 3126 */ 3127 more_left = scan_io_queue_gather(queue, rs, &sio_list); 3128 ASSERT(!list_is_empty(&sio_list)); 3129 first_sio = list_head(&sio_list); 3130 last_sio = list_tail(&sio_list); 3131 3132 seg_end = SIO_GET_END_OFFSET(last_sio); 3133 if (seg_start == 0) 3134 seg_start = SIO_GET_OFFSET(first_sio); 3135 3136 /* 3137 * Issuing sios can take a long time so drop the 3138 * queue lock. The sio queue won't be updated by 3139 * other threads since we're in syncing context so 3140 * we can be sure that our trees will remain exactly 3141 * as we left them. 3142 */ 3143 mutex_exit(q_lock); 3144 suspended = scan_io_queue_issue(queue, &sio_list); 3145 mutex_enter(q_lock); 3146 3147 if (suspended) 3148 break; 3149 } while (more_left); 3150 3151 /* update statistics for debugging purposes */ 3152 scan_io_queues_update_seg_stats(queue, seg_start, seg_end); 3153 3154 if (suspended) 3155 break; 3156 } 3157 3158 /* 3159 * If we were suspended in the middle of processing, 3160 * requeue any unfinished sios and exit. 3161 */ 3162 while ((sio = list_head(&sio_list)) != NULL) { 3163 list_remove(&sio_list, sio); 3164 scan_io_queue_insert_impl(queue, sio); 3165 } 3166 3167 queue->q_zio = NULL; 3168 mutex_exit(q_lock); 3169 zio_nowait(zio); 3170 list_destroy(&sio_list); 3171 } 3172 3173 /* 3174 * Performs an emptying run on all scan queues in the pool. This just 3175 * punches out one thread per top-level vdev, each of which processes 3176 * only that vdev's scan queue. We can parallelize the I/O here because 3177 * we know that each queue's I/Os only affect its own top-level vdev. 3178 * 3179 * This function waits for the queue runs to complete, and must be 3180 * called from dsl_scan_sync (or in general, syncing context). 
3181 */ 3182 static void 3183 scan_io_queues_run(dsl_scan_t *scn) 3184 { 3185 spa_t *spa = scn->scn_dp->dp_spa; 3186 3187 ASSERT(scn->scn_is_sorted); 3188 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 3189 3190 if (scn->scn_queues_pending == 0) 3191 return; 3192 3193 if (scn->scn_taskq == NULL) { 3194 int nthreads = spa->spa_root_vdev->vdev_children; 3195 3196 /* 3197 * We need to make this taskq *always* execute as many 3198 * threads in parallel as we have top-level vdevs and no 3199 * less, otherwise strange serialization of the calls to 3200 * scan_io_queues_run_one can occur during spa_sync runs 3201 * and that significantly impacts performance. 3202 */ 3203 scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads, 3204 minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE); 3205 } 3206 3207 for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 3208 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 3209 3210 mutex_enter(&vd->vdev_scan_io_queue_lock); 3211 if (vd->vdev_scan_io_queue != NULL) { 3212 VERIFY(taskq_dispatch(scn->scn_taskq, 3213 scan_io_queues_run_one, vd->vdev_scan_io_queue, 3214 TQ_SLEEP) != TASKQID_INVALID); 3215 } 3216 mutex_exit(&vd->vdev_scan_io_queue_lock); 3217 } 3218 3219 /* 3220 * Wait for the queues to finish issuing their IOs for this run 3221 * before we return. There may still be IOs in flight at this 3222 * point. 3223 */ 3224 taskq_wait(scn->scn_taskq); 3225 } 3226 3227 static boolean_t 3228 dsl_scan_async_block_should_pause(dsl_scan_t *scn) 3229 { 3230 uint64_t elapsed_nanosecs; 3231 3232 if (zfs_recover) 3233 return (B_FALSE); 3234 3235 if (zfs_async_block_max_blocks != 0 && 3236 scn->scn_visited_this_txg >= zfs_async_block_max_blocks) { 3237 return (B_TRUE); 3238 } 3239 3240 if (zfs_max_async_dedup_frees != 0 && 3241 scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) { 3242 return (B_TRUE); 3243 } 3244 3245 elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; 3246 return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || 3247 (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && 3248 txg_sync_waiting(scn->scn_dp)) || 3249 spa_shutting_down(scn->scn_dp->dp_spa)); 3250 } 3251 3252 static int 3253 dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 3254 { 3255 dsl_scan_t *scn = arg; 3256 3257 if (!scn->scn_is_bptree || 3258 (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { 3259 if (dsl_scan_async_block_should_pause(scn)) 3260 return (SET_ERROR(ERESTART)); 3261 } 3262 3263 zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, 3264 dmu_tx_get_txg(tx), bp, 0)); 3265 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 3266 -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), 3267 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 3268 scn->scn_visited_this_txg++; 3269 if (BP_GET_DEDUP(bp)) 3270 scn->scn_dedup_frees_this_txg++; 3271 return (0); 3272 } 3273 3274 static void 3275 dsl_scan_update_stats(dsl_scan_t *scn) 3276 { 3277 spa_t *spa = scn->scn_dp->dp_spa; 3278 uint64_t i; 3279 uint64_t seg_size_total = 0, zio_size_total = 0; 3280 uint64_t seg_count_total = 0, zio_count_total = 0; 3281 3282 for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { 3283 vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; 3284 dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; 3285 3286 if (queue == NULL) 3287 continue; 3288 3289 seg_size_total += queue->q_total_seg_size_this_txg; 3290 zio_size_total += queue->q_total_zio_size_this_txg; 3291 seg_count_total += queue->q_segs_this_txg; 3292 zio_count_total += 
queue->q_zios_this_txg; 3293 } 3294 3295 if (seg_count_total == 0 || zio_count_total == 0) { 3296 scn->scn_avg_seg_size_this_txg = 0; 3297 scn->scn_avg_zio_size_this_txg = 0; 3298 scn->scn_segs_this_txg = 0; 3299 scn->scn_zios_this_txg = 0; 3300 return; 3301 } 3302 3303 scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; 3304 scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; 3305 scn->scn_segs_this_txg = seg_count_total; 3306 scn->scn_zios_this_txg = zio_count_total; 3307 } 3308 3309 static int 3310 bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3311 dmu_tx_t *tx) 3312 { 3313 ASSERT(!bp_freed); 3314 return (dsl_scan_free_block_cb(arg, bp, tx)); 3315 } 3316 3317 static int 3318 dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3319 dmu_tx_t *tx) 3320 { 3321 ASSERT(!bp_freed); 3322 dsl_scan_t *scn = arg; 3323 const dva_t *dva = &bp->blk_dva[0]; 3324 3325 if (dsl_scan_async_block_should_pause(scn)) 3326 return (SET_ERROR(ERESTART)); 3327 3328 spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, 3329 DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), 3330 DVA_GET_ASIZE(dva), tx); 3331 scn->scn_visited_this_txg++; 3332 return (0); 3333 } 3334 3335 boolean_t 3336 dsl_scan_active(dsl_scan_t *scn) 3337 { 3338 spa_t *spa = scn->scn_dp->dp_spa; 3339 uint64_t used = 0, comp, uncomp; 3340 boolean_t clones_left; 3341 3342 if (spa->spa_load_state != SPA_LOAD_NONE) 3343 return (B_FALSE); 3344 if (spa_shutting_down(spa)) 3345 return (B_FALSE); 3346 if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || 3347 (scn->scn_async_destroying && !scn->scn_async_stalled)) 3348 return (B_TRUE); 3349 3350 if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { 3351 (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, 3352 &used, &comp, &uncomp); 3353 } 3354 clones_left = spa_livelist_delete_check(spa); 3355 return ((used != 0) || (clones_left)); 3356 } 3357 3358 static boolean_t 3359 dsl_scan_check_deferred(vdev_t *vd) 3360 { 3361 boolean_t need_resilver = B_FALSE; 3362 3363 for (int c = 0; c < vd->vdev_children; c++) { 3364 need_resilver |= 3365 dsl_scan_check_deferred(vd->vdev_child[c]); 3366 } 3367 3368 if (!vdev_is_concrete(vd) || vd->vdev_aux || 3369 !vd->vdev_ops->vdev_op_leaf) 3370 return (need_resilver); 3371 3372 if (!vd->vdev_resilver_deferred) 3373 need_resilver = B_TRUE; 3374 3375 return (need_resilver); 3376 } 3377 3378 static boolean_t 3379 dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, 3380 uint64_t phys_birth) 3381 { 3382 vdev_t *vd; 3383 3384 vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 3385 3386 if (vd->vdev_ops == &vdev_indirect_ops) { 3387 /* 3388 * The indirect vdev can point to multiple 3389 * vdevs. For simplicity, always create 3390 * the resilver zio_t. zio_vdev_io_start() 3391 * will bypass the child resilver i/o's if 3392 * they are on vdevs that don't have DTL's. 3393 */ 3394 return (B_TRUE); 3395 } 3396 3397 if (DVA_GET_GANG(dva)) { 3398 /* 3399 * Gang members may be spread across multiple 3400 * vdevs, so the best estimate we have is the 3401 * scrub range, which has already been checked. 3402 * XXX -- it would be better to change our 3403 * allocation policy to ensure that all 3404 * gang members reside on the same vdev. 3405 */ 3406 return (B_TRUE); 3407 } 3408 3409 /* 3410 * Check if the top-level vdev must resilver this offset. 3411 * When the offset does not intersect with a dirty leaf DTL 3412 * then it may be possible to skip the resilver IO. 
The psize 3413 * is provided instead of asize to simplify the check for RAIDZ. 3414 */ 3415 if (!vdev_dtl_need_resilver(vd, dva, psize, phys_birth)) 3416 return (B_FALSE); 3417 3418 /* 3419 * Check that this top-level vdev has a device under it which 3420 * is resilvering and is not deferred. 3421 */ 3422 if (!dsl_scan_check_deferred(vd)) 3423 return (B_FALSE); 3424 3425 return (B_TRUE); 3426 } 3427 3428 static int 3429 dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) 3430 { 3431 dsl_scan_t *scn = dp->dp_scan; 3432 spa_t *spa = dp->dp_spa; 3433 int err = 0; 3434 3435 if (spa_suspend_async_destroy(spa)) 3436 return (0); 3437 3438 if (zfs_free_bpobj_enabled && 3439 spa_version(spa) >= SPA_VERSION_DEADLISTS) { 3440 scn->scn_is_bptree = B_FALSE; 3441 scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; 3442 scn->scn_zio_root = zio_root(spa, NULL, 3443 NULL, ZIO_FLAG_MUSTSUCCEED); 3444 err = bpobj_iterate(&dp->dp_free_bpobj, 3445 bpobj_dsl_scan_free_block_cb, scn, tx); 3446 VERIFY0(zio_wait(scn->scn_zio_root)); 3447 scn->scn_zio_root = NULL; 3448 3449 if (err != 0 && err != ERESTART) 3450 zfs_panic_recover("error %u from bpobj_iterate()", err); 3451 } 3452 3453 if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { 3454 ASSERT(scn->scn_async_destroying); 3455 scn->scn_is_bptree = B_TRUE; 3456 scn->scn_zio_root = zio_root(spa, NULL, 3457 NULL, ZIO_FLAG_MUSTSUCCEED); 3458 err = bptree_iterate(dp->dp_meta_objset, 3459 dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); 3460 VERIFY0(zio_wait(scn->scn_zio_root)); 3461 scn->scn_zio_root = NULL; 3462 3463 if (err == EIO || err == ECKSUM) { 3464 err = 0; 3465 } else if (err != 0 && err != ERESTART) { 3466 zfs_panic_recover("error %u from " 3467 "traverse_dataset_destroyed()", err); 3468 } 3469 3470 if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { 3471 /* finished; deactivate async destroy feature */ 3472 spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); 3473 ASSERT(!spa_feature_is_active(spa, 3474 SPA_FEATURE_ASYNC_DESTROY)); 3475 VERIFY0(zap_remove(dp->dp_meta_objset, 3476 DMU_POOL_DIRECTORY_OBJECT, 3477 DMU_POOL_BPTREE_OBJ, tx)); 3478 VERIFY0(bptree_free(dp->dp_meta_objset, 3479 dp->dp_bptree_obj, tx)); 3480 dp->dp_bptree_obj = 0; 3481 scn->scn_async_destroying = B_FALSE; 3482 scn->scn_async_stalled = B_FALSE; 3483 } else { 3484 /* 3485 * If we didn't make progress, mark the async 3486 * destroy as stalled, so that we will not initiate 3487 * a spa_sync() on its behalf. Note that we only 3488 * check this if we are not finished, because if the 3489 * bptree had no blocks for us to visit, we can 3490 * finish without "making progress". 3491 */ 3492 scn->scn_async_stalled = 3493 (scn->scn_visited_this_txg == 0); 3494 } 3495 } 3496 if (scn->scn_visited_this_txg) { 3497 zfs_dbgmsg("freed %llu blocks in %llums from " 3498 "free_bpobj/bptree on %s in txg %llu; err=%u", 3499 (longlong_t)scn->scn_visited_this_txg, 3500 (longlong_t) 3501 NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), 3502 spa->spa_name, (longlong_t)tx->tx_txg, err); 3503 scn->scn_visited_this_txg = 0; 3504 scn->scn_dedup_frees_this_txg = 0; 3505 3506 /* 3507 * Write out changes to the DDT that may be required as a 3508 * result of the blocks freed. This ensures that the DDT 3509 * is clean when a scrub/resilver runs. 
3510 */ 3511 ddt_sync(spa, tx->tx_txg); 3512 } 3513 if (err != 0) 3514 return (err); 3515 if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && 3516 zfs_free_leak_on_eio && 3517 (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || 3518 dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || 3519 dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { 3520 /* 3521 * We have finished background destroying, but there is still 3522 * some space left in the dp_free_dir. Transfer this leaked 3523 * space to the dp_leak_dir. 3524 */ 3525 if (dp->dp_leak_dir == NULL) { 3526 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 3527 (void) dsl_dir_create_sync(dp, dp->dp_root_dir, 3528 LEAK_DIR_NAME, tx); 3529 VERIFY0(dsl_pool_open_special_dir(dp, 3530 LEAK_DIR_NAME, &dp->dp_leak_dir)); 3531 rrw_exit(&dp->dp_config_rwlock, FTAG); 3532 } 3533 dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, 3534 dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 3535 dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 3536 dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 3537 dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, 3538 -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, 3539 -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, 3540 -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); 3541 } 3542 3543 if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && 3544 !spa_livelist_delete_check(spa)) { 3545 /* finished; verify that space accounting went to zero */ 3546 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); 3547 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); 3548 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); 3549 } 3550 3551 spa_notify_waiters(spa); 3552 3553 EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), 3554 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3555 DMU_POOL_OBSOLETE_BPOBJ)); 3556 if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { 3557 ASSERT(spa_feature_is_active(dp->dp_spa, 3558 SPA_FEATURE_OBSOLETE_COUNTS)); 3559 3560 scn->scn_is_bptree = B_FALSE; 3561 scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; 3562 err = bpobj_iterate(&dp->dp_obsolete_bpobj, 3563 dsl_scan_obsolete_block_cb, scn, tx); 3564 if (err != 0 && err != ERESTART) 3565 zfs_panic_recover("error %u from bpobj_iterate()", err); 3566 3567 if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) 3568 dsl_pool_destroy_obsolete_bpobj(dp, tx); 3569 } 3570 return (0); 3571 } 3572 3573 /* 3574 * This is the primary entry point for scans that is called from syncing 3575 * context. Scans must happen entirely during syncing context so that we 3576 * can guarantee that blocks we are currently scanning will not change out 3577 * from under us. While a scan is active, this function controls how quickly 3578 * transaction groups proceed, instead of the normal handling provided by 3579 * txg_sync_thread(). 3580 */ 3581 void 3582 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) 3583 { 3584 int err = 0; 3585 dsl_scan_t *scn = dp->dp_scan; 3586 spa_t *spa = dp->dp_spa; 3587 state_sync_type_t sync_type = SYNC_OPTIONAL; 3588 3589 if (spa->spa_resilver_deferred && 3590 !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) 3591 spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx); 3592 3593 /* 3594 * Check for scn_restart_txg before checking spa_load_state, so 3595 * that we can restart an old-style scan while the pool is being 3596 * imported (see dsl_scan_init). 
We also restart scans if there 3597 * is a deferred resilver and the user has manually disabled 3598 * deferred resilvers via the tunable. 3599 */ 3600 if (dsl_scan_restarting(scn, tx) || 3601 (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) { 3602 pool_scan_func_t func = POOL_SCAN_SCRUB; 3603 dsl_scan_done(scn, B_FALSE, tx); 3604 if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) 3605 func = POOL_SCAN_RESILVER; 3606 zfs_dbgmsg("restarting scan func=%u on %s txg=%llu", 3607 func, dp->dp_spa->spa_name, (longlong_t)tx->tx_txg); 3608 dsl_scan_setup_sync(&func, tx); 3609 } 3610 3611 /* 3612 * Only process scans in sync pass 1. 3613 */ 3614 if (spa_sync_pass(spa) > 1) 3615 return; 3616 3617 /* 3618 * If the spa is shutting down, then stop scanning. This will 3619 * ensure that the scan does not dirty any new data during the 3620 * shutdown phase. 3621 */ 3622 if (spa_shutting_down(spa)) 3623 return; 3624 3625 /* 3626 * If the scan is inactive due to a stalled async destroy, try again. 3627 */ 3628 if (!scn->scn_async_stalled && !dsl_scan_active(scn)) 3629 return; 3630 3631 /* reset scan statistics */ 3632 scn->scn_visited_this_txg = 0; 3633 scn->scn_dedup_frees_this_txg = 0; 3634 scn->scn_holes_this_txg = 0; 3635 scn->scn_lt_min_this_txg = 0; 3636 scn->scn_gt_max_this_txg = 0; 3637 scn->scn_ddt_contained_this_txg = 0; 3638 scn->scn_objsets_visited_this_txg = 0; 3639 scn->scn_avg_seg_size_this_txg = 0; 3640 scn->scn_segs_this_txg = 0; 3641 scn->scn_avg_zio_size_this_txg = 0; 3642 scn->scn_zios_this_txg = 0; 3643 scn->scn_suspending = B_FALSE; 3644 scn->scn_sync_start_time = gethrtime(); 3645 spa->spa_scrub_active = B_TRUE; 3646 3647 /* 3648 * First process the async destroys. If we suspend, don't do 3649 * any scrubbing or resilvering. This ensures that there are no 3650 * async destroys while we are scanning, so the scan code doesn't 3651 * have to worry about traversing it. It is also faster to free the 3652 * blocks than to scrub them. 3653 */ 3654 err = dsl_process_async_destroys(dp, tx); 3655 if (err != 0) 3656 return; 3657 3658 if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) 3659 return; 3660 3661 /* 3662 * Wait a few txgs after importing to begin scanning so that 3663 * we can get the pool imported quickly. 3664 */ 3665 if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) 3666 return; 3667 3668 /* 3669 * zfs_scan_suspend_progress can be set to disable scan progress. 3670 * We don't want to spin the txg_sync thread, so we add a delay 3671 * here to simulate the time spent doing a scan. This is mostly 3672 * useful for testing and debugging. 3673 */ 3674 if (zfs_scan_suspend_progress) { 3675 uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; 3676 uint_t mintime = (scn->scn_phys.scn_func == 3677 POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : 3678 zfs_scrub_min_time_ms; 3679 3680 while (zfs_scan_suspend_progress && 3681 !txg_sync_waiting(scn->scn_dp) && 3682 !spa_shutting_down(scn->scn_dp->dp_spa) && 3683 NSEC2MSEC(scan_time_ns) < mintime) { 3684 delay(hz); 3685 scan_time_ns = gethrtime() - scn->scn_sync_start_time; 3686 } 3687 return; 3688 } 3689 3690 /* 3691 * Disabled by default, set zfs_scan_report_txgs to report 3692 * average performance over the last zfs_scan_report_txgs TXGs. 
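 *
 * Illustrative example, using a hypothetical setting rather than the
 * default: with zfs_scan_report_txgs = 100, the check below fires whenever
 * tx->tx_txg is a multiple of 100 (and the scrub is not paused).  At that
 * point the bytes issued so far are folded into scn_issued_before_pass and
 * spa_scan_stat_init() starts a new pass, so the reported average
 * performance covers at most the last 100 TXGs instead of the entire scan.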
3693 */ 3694 if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && 3695 tx->tx_txg % zfs_scan_report_txgs == 0) { 3696 scn->scn_issued_before_pass += spa->spa_scan_pass_issued; 3697 spa_scan_stat_init(spa); 3698 } 3699 3700 /* 3701 * It is possible to switch from unsorted to sorted at any time, 3702 * but afterwards the scan will remain sorted unless reloaded from 3703 * a checkpoint after a reboot. 3704 */ 3705 if (!zfs_scan_legacy) { 3706 scn->scn_is_sorted = B_TRUE; 3707 if (scn->scn_last_checkpoint == 0) 3708 scn->scn_last_checkpoint = ddi_get_lbolt(); 3709 } 3710 3711 /* 3712 * For sorted scans, determine what kind of work we will be doing 3713 * this txg based on our memory limitations and whether or not we 3714 * need to perform a checkpoint. 3715 */ 3716 if (scn->scn_is_sorted) { 3717 /* 3718 * If we are over our checkpoint interval, set scn_clearing 3719 * so that we can begin checkpointing immediately. The 3720 * checkpoint allows us to save a consistent bookmark 3721 * representing how much data we have scrubbed so far. 3722 * Otherwise, use the memory limit to determine if we should 3723 * scan for metadata or start issue scrub IOs. We accumulate 3724 * metadata until we hit our hard memory limit at which point 3725 * we issue scrub IOs until we are at our soft memory limit. 3726 */ 3727 if (scn->scn_checkpointing || 3728 ddi_get_lbolt() - scn->scn_last_checkpoint > 3729 SEC_TO_TICK(zfs_scan_checkpoint_intval)) { 3730 if (!scn->scn_checkpointing) 3731 zfs_dbgmsg("begin scan checkpoint for %s", 3732 spa->spa_name); 3733 3734 scn->scn_checkpointing = B_TRUE; 3735 scn->scn_clearing = B_TRUE; 3736 } else { 3737 boolean_t should_clear = dsl_scan_should_clear(scn); 3738 if (should_clear && !scn->scn_clearing) { 3739 zfs_dbgmsg("begin scan clearing for %s", 3740 spa->spa_name); 3741 scn->scn_clearing = B_TRUE; 3742 } else if (!should_clear && scn->scn_clearing) { 3743 zfs_dbgmsg("finish scan clearing for %s", 3744 spa->spa_name); 3745 scn->scn_clearing = B_FALSE; 3746 } 3747 } 3748 } else { 3749 ASSERT0(scn->scn_checkpointing); 3750 ASSERT0(scn->scn_clearing); 3751 } 3752 3753 if (!scn->scn_clearing && scn->scn_done_txg == 0) { 3754 /* Need to scan metadata for more blocks to scrub */ 3755 dsl_scan_phys_t *scnp = &scn->scn_phys; 3756 taskqid_t prefetch_tqid; 3757 3758 /* 3759 * Calculate the max number of in-flight bytes for pool-wide 3760 * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). 3761 * Limits for the issuing phase are done per top-level vdev and 3762 * are handled separately. 
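 *
 * A short worked example with purely hypothetical values (not defaults):
 * given zfs_scan_vdev_limit = 4 MiB, 10 data disks and arc_c_max = 16 GiB,
 * the assignment below computes
 *
 *	MIN(16 GiB / 4, MAX(1 MiB, 4 MiB * 10)) = MIN(4 GiB, 40 MiB) = 40 MiB
 *
 * so at most 40 MiB of scan reads are kept in flight pool-wide while
 * scanning metadata.  The 1 MiB floor only matters when the per-disk limit
 * is very small, and the arc_c_max / 4 term caps the limit relative to the
 * configured maximum ARC size.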
3763 */ 3764 scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, 3765 zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); 3766 3767 if (scnp->scn_ddt_bookmark.ddb_class <= 3768 scnp->scn_ddt_class_max) { 3769 ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); 3770 zfs_dbgmsg("doing scan sync for %s txg %llu; " 3771 "ddt bm=%llu/%llu/%llu/%llx", 3772 spa->spa_name, 3773 (longlong_t)tx->tx_txg, 3774 (longlong_t)scnp->scn_ddt_bookmark.ddb_class, 3775 (longlong_t)scnp->scn_ddt_bookmark.ddb_type, 3776 (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, 3777 (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); 3778 } else { 3779 zfs_dbgmsg("doing scan sync for %s txg %llu; " 3780 "bm=%llu/%llu/%llu/%llu", 3781 spa->spa_name, 3782 (longlong_t)tx->tx_txg, 3783 (longlong_t)scnp->scn_bookmark.zb_objset, 3784 (longlong_t)scnp->scn_bookmark.zb_object, 3785 (longlong_t)scnp->scn_bookmark.zb_level, 3786 (longlong_t)scnp->scn_bookmark.zb_blkid); 3787 } 3788 3789 scn->scn_zio_root = zio_root(dp->dp_spa, NULL, 3790 NULL, ZIO_FLAG_CANFAIL); 3791 3792 scn->scn_prefetch_stop = B_FALSE; 3793 prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, 3794 dsl_scan_prefetch_thread, scn, TQ_SLEEP); 3795 ASSERT(prefetch_tqid != TASKQID_INVALID); 3796 3797 dsl_pool_config_enter(dp, FTAG); 3798 dsl_scan_visit(scn, tx); 3799 dsl_pool_config_exit(dp, FTAG); 3800 3801 mutex_enter(&dp->dp_spa->spa_scrub_lock); 3802 scn->scn_prefetch_stop = B_TRUE; 3803 cv_broadcast(&spa->spa_scrub_io_cv); 3804 mutex_exit(&dp->dp_spa->spa_scrub_lock); 3805 3806 taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); 3807 (void) zio_wait(scn->scn_zio_root); 3808 scn->scn_zio_root = NULL; 3809 3810 zfs_dbgmsg("scan visited %llu blocks of %s in %llums " 3811 "(%llu os's, %llu holes, %llu < mintxg, " 3812 "%llu in ddt, %llu > maxtxg)", 3813 (longlong_t)scn->scn_visited_this_txg, 3814 spa->spa_name, 3815 (longlong_t)NSEC2MSEC(gethrtime() - 3816 scn->scn_sync_start_time), 3817 (longlong_t)scn->scn_objsets_visited_this_txg, 3818 (longlong_t)scn->scn_holes_this_txg, 3819 (longlong_t)scn->scn_lt_min_this_txg, 3820 (longlong_t)scn->scn_ddt_contained_this_txg, 3821 (longlong_t)scn->scn_gt_max_this_txg); 3822 3823 if (!scn->scn_suspending) { 3824 ASSERT0(avl_numnodes(&scn->scn_queue)); 3825 scn->scn_done_txg = tx->tx_txg + 1; 3826 if (scn->scn_is_sorted) { 3827 scn->scn_checkpointing = B_TRUE; 3828 scn->scn_clearing = B_TRUE; 3829 scn->scn_issued_before_pass += 3830 spa->spa_scan_pass_issued; 3831 spa_scan_stat_init(spa); 3832 } 3833 zfs_dbgmsg("scan complete for %s txg %llu", 3834 spa->spa_name, 3835 (longlong_t)tx->tx_txg); 3836 } 3837 } else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) { 3838 ASSERT(scn->scn_clearing); 3839 3840 /* need to issue scrubbing IOs from per-vdev queues */ 3841 scn->scn_zio_root = zio_root(dp->dp_spa, NULL, 3842 NULL, ZIO_FLAG_CANFAIL); 3843 scan_io_queues_run(scn); 3844 (void) zio_wait(scn->scn_zio_root); 3845 scn->scn_zio_root = NULL; 3846 3847 /* calculate and dprintf the current memory usage */ 3848 (void) dsl_scan_should_clear(scn); 3849 dsl_scan_update_stats(scn); 3850 3851 zfs_dbgmsg("scan issued %llu blocks for %s (%llu segs) " 3852 "in %llums (avg_block_size = %llu, avg_seg_size = %llu)", 3853 (longlong_t)scn->scn_zios_this_txg, 3854 spa->spa_name, 3855 (longlong_t)scn->scn_segs_this_txg, 3856 (longlong_t)NSEC2MSEC(gethrtime() - 3857 scn->scn_sync_start_time), 3858 (longlong_t)scn->scn_avg_zio_size_this_txg, 3859 (longlong_t)scn->scn_avg_seg_size_this_txg); 3860 } else if (scn->scn_done_txg != 0 && scn->scn_done_txg 
<= tx->tx_txg) { 3861 /* Finished with everything. Mark the scrub as complete */ 3862 zfs_dbgmsg("scan issuing complete txg %llu for %s", 3863 (longlong_t)tx->tx_txg, 3864 spa->spa_name); 3865 ASSERT3U(scn->scn_done_txg, !=, 0); 3866 ASSERT0(spa->spa_scrub_inflight); 3867 ASSERT0(scn->scn_queues_pending); 3868 dsl_scan_done(scn, B_TRUE, tx); 3869 sync_type = SYNC_MANDATORY; 3870 } 3871 3872 dsl_scan_sync_state(scn, tx, sync_type); 3873 } 3874 3875 static void 3876 count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all) 3877 { 3878 /* 3879 * Don't count embedded bp's, since we already did the work of 3880 * scanning these when we scanned the containing block. 3881 */ 3882 if (BP_IS_EMBEDDED(bp)) 3883 return; 3884 3885 /* 3886 * Update the spa's stats on how many bytes we have issued. 3887 * Sequential scrubs create a zio for each DVA of the bp. Each 3888 * of these will include all DVAs for repair purposes, but the 3889 * zio code will only try the first one unless there is an issue. 3890 * Therefore, we should only count the first DVA for these IOs. 3891 */ 3892 atomic_add_64(&spa->spa_scan_pass_issued, 3893 all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); 3894 } 3895 3896 static void 3897 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) 3898 { 3899 /* 3900 * If we resume after a reboot, zab will be NULL; don't record 3901 * incomplete stats in that case. 3902 */ 3903 if (zab == NULL) 3904 return; 3905 3906 for (int i = 0; i < 4; i++) { 3907 int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; 3908 int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; 3909 3910 if (t & DMU_OT_NEWTYPE) 3911 t = DMU_OT_OTHER; 3912 zfs_blkstat_t *zb = &zab->zab_type[l][t]; 3913 int equal; 3914 3915 zb->zb_count++; 3916 zb->zb_asize += BP_GET_ASIZE(bp); 3917 zb->zb_lsize += BP_GET_LSIZE(bp); 3918 zb->zb_psize += BP_GET_PSIZE(bp); 3919 zb->zb_gangs += BP_COUNT_GANG(bp); 3920 3921 switch (BP_GET_NDVAS(bp)) { 3922 case 2: 3923 if (DVA_GET_VDEV(&bp->blk_dva[0]) == 3924 DVA_GET_VDEV(&bp->blk_dva[1])) 3925 zb->zb_ditto_2_of_2_samevdev++; 3926 break; 3927 case 3: 3928 equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == 3929 DVA_GET_VDEV(&bp->blk_dva[1])) + 3930 (DVA_GET_VDEV(&bp->blk_dva[0]) == 3931 DVA_GET_VDEV(&bp->blk_dva[2])) + 3932 (DVA_GET_VDEV(&bp->blk_dva[1]) == 3933 DVA_GET_VDEV(&bp->blk_dva[2])); 3934 if (equal == 1) 3935 zb->zb_ditto_2_of_3_samevdev++; 3936 else if (equal == 3) 3937 zb->zb_ditto_3_of_3_samevdev++; 3938 break; 3939 } 3940 } 3941 } 3942 3943 static void 3944 scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) 3945 { 3946 avl_index_t idx; 3947 dsl_scan_t *scn = queue->q_scn; 3948 3949 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 3950 3951 if (unlikely(avl_is_empty(&queue->q_sios_by_addr))) 3952 atomic_add_64(&scn->scn_queues_pending, 1); 3953 if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { 3954 /* block is already scheduled for reading */ 3955 sio_free(sio); 3956 return; 3957 } 3958 avl_insert(&queue->q_sios_by_addr, sio, idx); 3959 queue->q_sio_memused += SIO_GET_MUSED(sio); 3960 range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), 3961 SIO_GET_ASIZE(sio)); 3962 } 3963 3964 /* 3965 * Given all the info we got from our metadata scanning process, we 3966 * construct a scan_io_t and insert it into the scan sorting queue. The 3967 * I/O must already be suitable for us to process. This is controlled 3968 * by dsl_scan_enqueue(). 
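 *
 * A sketch of the effect of an insert, with hypothetical numbers: a
 * 128 KiB sio at offset 100 MiB and another at offset 100.5 MiB each get
 * their own node in the LBA-sorted q_sios_by_addr AVL tree, but because
 * the gap between them is within zfs_scan_max_ext_gap they are accounted
 * to a single extent in q_exts_by_addr, whose fill grows by the asize of
 * each sio.  Larger and better-filled extents score higher (see the extent
 * sorting comment further down) and are generally drained first when the
 * issuing phase later runs the queue.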
3969 */ 3970 static void 3971 scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, 3972 int zio_flags, const zbookmark_phys_t *zb) 3973 { 3974 scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); 3975 3976 ASSERT0(BP_IS_GANG(bp)); 3977 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 3978 3979 bp2sio(bp, sio, dva_i); 3980 sio->sio_flags = zio_flags; 3981 sio->sio_zb = *zb; 3982 3983 queue->q_last_ext_addr = -1; 3984 scan_io_queue_insert_impl(queue, sio); 3985 } 3986 3987 /* 3988 * Given a set of I/O parameters as discovered by the metadata traversal 3989 * process, attempts to place the I/O into the sorted queues (if allowed), 3990 * or immediately executes the I/O. 3991 */ 3992 static void 3993 dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, 3994 const zbookmark_phys_t *zb) 3995 { 3996 spa_t *spa = dp->dp_spa; 3997 3998 ASSERT(!BP_IS_EMBEDDED(bp)); 3999 4000 /* 4001 * Gang blocks are hard to issue sequentially, so we just issue them 4002 * here immediately instead of queuing them. 4003 */ 4004 if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { 4005 scan_exec_io(dp, bp, zio_flags, zb, NULL); 4006 return; 4007 } 4008 4009 for (int i = 0; i < BP_GET_NDVAS(bp); i++) { 4010 dva_t dva; 4011 vdev_t *vdev; 4012 4013 dva = bp->blk_dva[i]; 4014 vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); 4015 ASSERT(vdev != NULL); 4016 4017 mutex_enter(&vdev->vdev_scan_io_queue_lock); 4018 if (vdev->vdev_scan_io_queue == NULL) 4019 vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); 4020 ASSERT(dp->dp_scan != NULL); 4021 scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, 4022 i, zio_flags, zb); 4023 mutex_exit(&vdev->vdev_scan_io_queue_lock); 4024 } 4025 } 4026 4027 static int 4028 dsl_scan_scrub_cb(dsl_pool_t *dp, 4029 const blkptr_t *bp, const zbookmark_phys_t *zb) 4030 { 4031 dsl_scan_t *scn = dp->dp_scan; 4032 spa_t *spa = dp->dp_spa; 4033 uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); 4034 size_t psize = BP_GET_PSIZE(bp); 4035 boolean_t needs_io = B_FALSE; 4036 int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; 4037 4038 count_block(dp->dp_blkstats, bp); 4039 if (phys_birth <= scn->scn_phys.scn_min_txg || 4040 phys_birth >= scn->scn_phys.scn_max_txg) { 4041 count_block_issued(spa, bp, B_TRUE); 4042 return (0); 4043 } 4044 4045 /* Embedded BP's have phys_birth==0, so we reject them above. */ 4046 ASSERT(!BP_IS_EMBEDDED(bp)); 4047 4048 ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); 4049 if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { 4050 zio_flags |= ZIO_FLAG_SCRUB; 4051 needs_io = B_TRUE; 4052 } else { 4053 ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); 4054 zio_flags |= ZIO_FLAG_RESILVER; 4055 needs_io = B_FALSE; 4056 } 4057 4058 /* If it's an intent log block, failure is expected. */ 4059 if (zb->zb_level == ZB_ZIL_LEVEL) 4060 zio_flags |= ZIO_FLAG_SPECULATIVE; 4061 4062 for (int d = 0; d < BP_GET_NDVAS(bp); d++) { 4063 const dva_t *dva = &bp->blk_dva[d]; 4064 4065 /* 4066 * Keep track of how much data we've examined so that 4067 * zpool(8) status can make useful progress reports. 
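 *
 * Note that this sums the allocated size of every DVA: for example, a
 * block written with copies=2 contributes the asize of both copies, and
 * on raidz the asize includes parity and padding.  The examined totals
 * therefore track allocated space on disk rather than logical data size.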
4068 */ 4069 uint64_t asize = DVA_GET_ASIZE(dva); 4070 scn->scn_phys.scn_examined += asize; 4071 spa->spa_scan_pass_exam += asize; 4072 4073 /* if it's a resilver, this may not be in the target range */ 4074 if (!needs_io) 4075 needs_io = dsl_scan_need_resilver(spa, dva, psize, 4076 phys_birth); 4077 } 4078 4079 if (needs_io && !zfs_no_scrub_io) { 4080 dsl_scan_enqueue(dp, bp, zio_flags, zb); 4081 } else { 4082 count_block_issued(spa, bp, B_TRUE); 4083 } 4084 4085 /* do not relocate this block */ 4086 return (0); 4087 } 4088 4089 static void 4090 dsl_scan_scrub_done(zio_t *zio) 4091 { 4092 spa_t *spa = zio->io_spa; 4093 blkptr_t *bp = zio->io_bp; 4094 dsl_scan_io_queue_t *queue = zio->io_private; 4095 4096 abd_free(zio->io_abd); 4097 4098 if (queue == NULL) { 4099 mutex_enter(&spa->spa_scrub_lock); 4100 ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); 4101 spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); 4102 cv_broadcast(&spa->spa_scrub_io_cv); 4103 mutex_exit(&spa->spa_scrub_lock); 4104 } else { 4105 mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); 4106 ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); 4107 queue->q_inflight_bytes -= BP_GET_PSIZE(bp); 4108 cv_broadcast(&queue->q_zio_cv); 4109 mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); 4110 } 4111 4112 if (zio->io_error && (zio->io_error != ECKSUM || 4113 !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { 4114 atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); 4115 } 4116 } 4117 4118 /* 4119 * Given a scanning zio's information, executes the zio. The zio need 4120 * not necessarily be only sortable, this function simply executes the 4121 * zio, no matter what it is. The optional queue argument allows the 4122 * caller to specify that they want per top level vdev IO rate limiting 4123 * instead of the legacy global limiting. 4124 */ 4125 static void 4126 scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, 4127 const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) 4128 { 4129 spa_t *spa = dp->dp_spa; 4130 dsl_scan_t *scn = dp->dp_scan; 4131 size_t size = BP_GET_PSIZE(bp); 4132 abd_t *data = abd_alloc_for_io(size, B_FALSE); 4133 zio_t *pio; 4134 4135 if (queue == NULL) { 4136 ASSERT3U(scn->scn_maxinflight_bytes, >, 0); 4137 mutex_enter(&spa->spa_scrub_lock); 4138 while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) 4139 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 4140 spa->spa_scrub_inflight += BP_GET_PSIZE(bp); 4141 mutex_exit(&spa->spa_scrub_lock); 4142 pio = scn->scn_zio_root; 4143 } else { 4144 kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; 4145 4146 ASSERT3U(queue->q_maxinflight_bytes, >, 0); 4147 mutex_enter(q_lock); 4148 while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) 4149 cv_wait(&queue->q_zio_cv, q_lock); 4150 queue->q_inflight_bytes += BP_GET_PSIZE(bp); 4151 pio = queue->q_zio; 4152 mutex_exit(q_lock); 4153 } 4154 4155 ASSERT(pio != NULL); 4156 count_block_issued(spa, bp, queue == NULL); 4157 zio_nowait(zio_read(pio, spa, bp, data, size, dsl_scan_scrub_done, 4158 queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); 4159 } 4160 4161 /* 4162 * This is the primary extent sorting algorithm. 
We balance two parameters:
 * 1) how many bytes of I/O are in an extent
 * 2) how well the extent is filled with I/O (as a fraction of its total size)
 * Since we allow extents to have gaps between their constituent I/Os, it's
 * possible to have a fairly large extent that contains the same amount of
 * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
 * The algorithm sorts based on a score calculated from the extent's size,
 * the relative fill volume (in %) and a "fill weight" parameter that controls
 * the split between whether we prefer larger extents or more well populated
 * extents:
 *
 * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
 *
 * Example:
 * 1) assume extsz = 64 MiB
 * 2) assume fill = 32 MiB (extent is half full)
 * 3) assume fill_weight = 3
 * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
 *    SCORE = 32M + (50 * 3 * 32M) / 100
 *    SCORE = 32M + (4800M / 100)
 *    SCORE = 32M + 48M
 *            ^     ^
 *            |     +--- final total relative fill-based score
 *            +--------- final total fill-based score
 *    SCORE = 80M
 *
 * As can be seen, at fill_weight=3 the algorithm is slightly biased towards
 * extents that are more completely filled (in a 3:2 ratio) vs just larger.
 * Note that as an optimization, we replace multiplication and division by
 * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
 *
 * Since we do not care whether one extent is only a few percent better than
 * another, we compress the score into 6 bits via a binary logarithm
 * (highbit64()) and store it in the high bits of the offset, which are
 * otherwise unused due to ashift.  This allows us to reduce q_exts_by_size
 * B-tree elements to only 64 bits and compare them with a single operation.
 * It also makes scrubs more sequential and reduces the chance that a minor
 * change to an extent will move it within the B-tree.
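 *
 * A worked instance of the packing, reusing the 80M score from the example
 * above: highbit64(80M) == 27, so the key's top bits hold 64 - 27 = 37,
 * while an extent scoring 40M would store 64 - 26 = 38.  The B-tree
 * comparator simply compares the packed 64-bit keys in ascending order, so
 * the higher-scoring extent gets the numerically smaller key and sorts
 * ahead of the lower-scoring one; extents whose compressed scores are equal
 * fall back to comparing the start offsets stored in the low bits.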
4199 */ 4200 static int 4201 ext_size_compare(const void *x, const void *y) 4202 { 4203 const uint64_t *a = x, *b = y; 4204 4205 return (TREE_CMP(*a, *b)); 4206 } 4207 4208 static void 4209 ext_size_create(range_tree_t *rt, void *arg) 4210 { 4211 (void) rt; 4212 zfs_btree_t *size_tree = arg; 4213 4214 zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t)); 4215 } 4216 4217 static void 4218 ext_size_destroy(range_tree_t *rt, void *arg) 4219 { 4220 (void) rt; 4221 zfs_btree_t *size_tree = arg; 4222 ASSERT0(zfs_btree_numnodes(size_tree)); 4223 4224 zfs_btree_destroy(size_tree); 4225 } 4226 4227 static uint64_t 4228 ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg) 4229 { 4230 (void) rt; 4231 uint64_t size = rsg->rs_end - rsg->rs_start; 4232 uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) * 4233 fill_weight * rsg->rs_fill) >> 7); 4234 ASSERT3U(rt->rt_shift, >=, 8); 4235 return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start); 4236 } 4237 4238 static void 4239 ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg) 4240 { 4241 zfs_btree_t *size_tree = arg; 4242 ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); 4243 uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); 4244 zfs_btree_add(size_tree, &v); 4245 } 4246 4247 static void 4248 ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg) 4249 { 4250 zfs_btree_t *size_tree = arg; 4251 ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP); 4252 uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs); 4253 zfs_btree_remove(size_tree, &v); 4254 } 4255 4256 static void 4257 ext_size_vacate(range_tree_t *rt, void *arg) 4258 { 4259 zfs_btree_t *size_tree = arg; 4260 zfs_btree_clear(size_tree); 4261 zfs_btree_destroy(size_tree); 4262 4263 ext_size_create(rt, arg); 4264 } 4265 4266 static const range_tree_ops_t ext_size_ops = { 4267 .rtop_create = ext_size_create, 4268 .rtop_destroy = ext_size_destroy, 4269 .rtop_add = ext_size_add, 4270 .rtop_remove = ext_size_remove, 4271 .rtop_vacate = ext_size_vacate 4272 }; 4273 4274 /* 4275 * Comparator for the q_sios_by_addr tree. Sorting is simply performed 4276 * based on LBA-order (from lowest to highest). 4277 */ 4278 static int 4279 sio_addr_compare(const void *x, const void *y) 4280 { 4281 const scan_io_t *a = x, *b = y; 4282 4283 return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b))); 4284 } 4285 4286 /* IO queues are created on demand when they are needed. */ 4287 static dsl_scan_io_queue_t * 4288 scan_io_queue_create(vdev_t *vd) 4289 { 4290 dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; 4291 dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); 4292 4293 q->q_scn = scn; 4294 q->q_vd = vd; 4295 q->q_sio_memused = 0; 4296 q->q_last_ext_addr = -1; 4297 cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); 4298 q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP, 4299 &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap); 4300 avl_create(&q->q_sios_by_addr, sio_addr_compare, 4301 sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); 4302 4303 return (q); 4304 } 4305 4306 /* 4307 * Destroys a scan queue and all segments and scan_io_t's contained in it. 4308 * No further execution of I/O occurs, anything pending in the queue is 4309 * simply freed without being executed. 
4310 */ 4311 void 4312 dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) 4313 { 4314 dsl_scan_t *scn = queue->q_scn; 4315 scan_io_t *sio; 4316 void *cookie = NULL; 4317 4318 ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); 4319 4320 if (!avl_is_empty(&queue->q_sios_by_addr)) 4321 atomic_add_64(&scn->scn_queues_pending, -1); 4322 while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != 4323 NULL) { 4324 ASSERT(range_tree_contains(queue->q_exts_by_addr, 4325 SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); 4326 queue->q_sio_memused -= SIO_GET_MUSED(sio); 4327 sio_free(sio); 4328 } 4329 4330 ASSERT0(queue->q_sio_memused); 4331 range_tree_vacate(queue->q_exts_by_addr, NULL, queue); 4332 range_tree_destroy(queue->q_exts_by_addr); 4333 avl_destroy(&queue->q_sios_by_addr); 4334 cv_destroy(&queue->q_zio_cv); 4335 4336 kmem_free(queue, sizeof (*queue)); 4337 } 4338 4339 /* 4340 * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is 4341 * called on behalf of vdev_top_transfer when creating or destroying 4342 * a mirror vdev due to zpool attach/detach. 4343 */ 4344 void 4345 dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) 4346 { 4347 mutex_enter(&svd->vdev_scan_io_queue_lock); 4348 mutex_enter(&tvd->vdev_scan_io_queue_lock); 4349 4350 VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); 4351 tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; 4352 svd->vdev_scan_io_queue = NULL; 4353 if (tvd->vdev_scan_io_queue != NULL) 4354 tvd->vdev_scan_io_queue->q_vd = tvd; 4355 4356 mutex_exit(&tvd->vdev_scan_io_queue_lock); 4357 mutex_exit(&svd->vdev_scan_io_queue_lock); 4358 } 4359 4360 static void 4361 scan_io_queues_destroy(dsl_scan_t *scn) 4362 { 4363 vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; 4364 4365 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 4366 vdev_t *tvd = rvd->vdev_child[i]; 4367 4368 mutex_enter(&tvd->vdev_scan_io_queue_lock); 4369 if (tvd->vdev_scan_io_queue != NULL) 4370 dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); 4371 tvd->vdev_scan_io_queue = NULL; 4372 mutex_exit(&tvd->vdev_scan_io_queue_lock); 4373 } 4374 } 4375 4376 static void 4377 dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) 4378 { 4379 dsl_pool_t *dp = spa->spa_dsl_pool; 4380 dsl_scan_t *scn = dp->dp_scan; 4381 vdev_t *vdev; 4382 kmutex_t *q_lock; 4383 dsl_scan_io_queue_t *queue; 4384 scan_io_t *srch_sio, *sio; 4385 avl_index_t idx; 4386 uint64_t start, size; 4387 4388 vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); 4389 ASSERT(vdev != NULL); 4390 q_lock = &vdev->vdev_scan_io_queue_lock; 4391 queue = vdev->vdev_scan_io_queue; 4392 4393 mutex_enter(q_lock); 4394 if (queue == NULL) { 4395 mutex_exit(q_lock); 4396 return; 4397 } 4398 4399 srch_sio = sio_alloc(BP_GET_NDVAS(bp)); 4400 bp2sio(bp, srch_sio, dva_i); 4401 start = SIO_GET_OFFSET(srch_sio); 4402 size = SIO_GET_ASIZE(srch_sio); 4403 4404 /* 4405 * We can find the zio in two states: 4406 * 1) Cold, just sitting in the queue of zio's to be issued at 4407 * some point in the future. In this case, all we do is 4408 * remove the zio from the q_sios_by_addr tree, decrement 4409 * its data volume from the containing range_seg_t and 4410 * resort the q_exts_by_size tree to reflect that the 4411 * range_seg_t has lost some of its 'fill'. We don't shorten 4412 * the range_seg_t - this is usually rare enough not to be 4413 * worth the extra hassle of trying keep track of precise 4414 * extent boundaries. 4415 * 2) Hot, where the zio is currently in-flight in 4416 * dsl_scan_issue_ios. 
In this case, we can't simply 4417 * reach in and stop the in-flight zio's, so we instead 4418 * block the caller. Eventually, dsl_scan_issue_ios will 4419 * be done with issuing the zio's it gathered and will 4420 * signal us. 4421 */ 4422 sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); 4423 sio_free(srch_sio); 4424 4425 if (sio != NULL) { 4426 blkptr_t tmpbp; 4427 4428 /* Got it while it was cold in the queue */ 4429 ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); 4430 ASSERT3U(size, ==, SIO_GET_ASIZE(sio)); 4431 avl_remove(&queue->q_sios_by_addr, sio); 4432 if (avl_is_empty(&queue->q_sios_by_addr)) 4433 atomic_add_64(&scn->scn_queues_pending, -1); 4434 queue->q_sio_memused -= SIO_GET_MUSED(sio); 4435 4436 ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); 4437 range_tree_remove_fill(queue->q_exts_by_addr, start, size); 4438 4439 /* count the block as though we issued it */ 4440 sio2bp(sio, &tmpbp); 4441 count_block_issued(spa, &tmpbp, B_FALSE); 4442 4443 sio_free(sio); 4444 } 4445 mutex_exit(q_lock); 4446 } 4447 4448 /* 4449 * Callback invoked when a zio_free() zio is executing. This needs to be 4450 * intercepted to prevent the zio from deallocating a particular portion 4451 * of disk space and it then getting reallocated and written to, while we 4452 * still have it queued up for processing. 4453 */ 4454 void 4455 dsl_scan_freed(spa_t *spa, const blkptr_t *bp) 4456 { 4457 dsl_pool_t *dp = spa->spa_dsl_pool; 4458 dsl_scan_t *scn = dp->dp_scan; 4459 4460 ASSERT(!BP_IS_EMBEDDED(bp)); 4461 ASSERT(scn != NULL); 4462 if (!dsl_scan_is_running(scn)) 4463 return; 4464 4465 for (int i = 0; i < BP_GET_NDVAS(bp); i++) 4466 dsl_scan_freed_dva(spa, bp, i); 4467 } 4468 4469 /* 4470 * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has 4471 * not started, start it. Otherwise, only restart if max txg in DTL range is 4472 * greater than the max txg in the current scan. If the DTL max is less than 4473 * the scan max, then the vdev has not missed any new data since the resilver 4474 * started, so a restart is not needed. 
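 *
 * For example, with hypothetical txg numbers: if the running resilver has
 * scn_max_txg = 1000 and a returning vdev's DTL spans txgs [850, 990], no
 * restart is requested, since everything the device missed is already
 * covered by the current scan.  If instead the DTL reaches txg 1040, the
 * device missed writes made after the scan's cutoff, so a new resilver is
 * requested: deferred via vdev_defer_resilver() when the RESILVER_DEFER
 * feature is enabled, or started immediately via SPA_ASYNC_RESILVER
 * otherwise.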
4475 */ 4476 void 4477 dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) 4478 { 4479 uint64_t min, max; 4480 4481 if (!vdev_resilver_needed(vd, &min, &max)) 4482 return; 4483 4484 if (!dsl_scan_resilvering(dp)) { 4485 spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); 4486 return; 4487 } 4488 4489 if (max <= dp->dp_scan->scn_phys.scn_max_txg) 4490 return; 4491 4492 /* restart is needed, check if it can be deferred */ 4493 if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) 4494 vdev_defer_resilver(vd); 4495 else 4496 spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); 4497 } 4498 4499 ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, U64, ZMOD_RW, 4500 "Max bytes in flight per leaf vdev for scrubs and resilvers"); 4501 4502 ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, UINT, ZMOD_RW, 4503 "Min millisecs to scrub per txg"); 4504 4505 ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, UINT, ZMOD_RW, 4506 "Min millisecs to obsolete per txg"); 4507 4508 ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, UINT, ZMOD_RW, 4509 "Min millisecs to free per txg"); 4510 4511 ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, UINT, ZMOD_RW, 4512 "Min millisecs to resilver per txg"); 4513 4514 ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, 4515 "Set to prevent scans from progressing"); 4516 4517 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, 4518 "Set to disable scrub I/O"); 4519 4520 ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, 4521 "Set to disable scrub prefetching"); 4522 4523 ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, U64, ZMOD_RW, 4524 "Max number of blocks freed in one txg"); 4525 4526 ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, U64, ZMOD_RW, 4527 "Max number of dedup blocks freed in one txg"); 4528 4529 ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, 4530 "Enable processing of the free_bpobj"); 4531 4532 ZFS_MODULE_PARAM(zfs, zfs_, scan_blkstats, INT, ZMOD_RW, 4533 "Enable block statistics calculation during scrub"); 4534 4535 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, UINT, ZMOD_RW, 4536 "Fraction of RAM for scan hard limit"); 4537 4538 ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, UINT, ZMOD_RW, 4539 "IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size"); 4540 4541 ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, 4542 "Scrub using legacy non-sequential method"); 4543 4544 ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, UINT, ZMOD_RW, 4545 "Scan progress on-disk checkpointing interval"); 4546 4547 ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, U64, ZMOD_RW, 4548 "Max gap in bytes between sequential scrub / resilver I/Os"); 4549 4550 ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, UINT, ZMOD_RW, 4551 "Fraction of hard limit used as soft limit"); 4552 4553 ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, 4554 "Tunable to attempt to reduce lock contention"); 4555 4556 ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, UINT, ZMOD_RW, 4557 "Tunable to adjust bias towards more filled segments during scans"); 4558 4559 ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, 4560 "Tunable to report resilver performance over the last N txgs"); 4561 4562 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, 4563 "Process all resilvers immediately"); 4564
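
/*
 * Usage note: on Linux builds these tunables are exposed as ZFS module
 * parameters, typically under /sys/module/zfs/parameters/ (for example
 * .../zfs_scan_legacy to fall back to the legacy scrub code path).  The
 * exact set of parameters and their defaults vary between releases; see
 * the module parameters man page (zfs(4) on recent OpenZFS releases).
 *
 * Below is an illustrative, stand-alone sketch (not compiled into the
 * module) of the extent scoring and key packing used by the sorted scan
 * queues, mirroring the computation in ext_size_value().  The demo_*
 * names and sample extent sizes are hypothetical, and the real code
 * operates on ashift-shifted offsets and sizes rather than raw bytes.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* Position of the highest set bit, 1-based; 0 for an input of 0. */
static int
demo_highbit64(uint64_t v)
{
	int bit = 0;

	while (v != 0) {
		bit++;
		v >>= 1;
	}
	return (bit);
}

/* Score an extent: bytes of fill plus a fill-fraction weighted bonus. */
static uint64_t
demo_ext_score(uint64_t size, uint64_t fill, uint64_t weight)
{
	return (fill + ((((fill << 7) / size) * weight * fill) >> 7));
}

/* Pack the 6-bit compressed score above the extent's start offset. */
static uint64_t
demo_ext_key(uint64_t score, uint64_t start)
{
	return (((uint64_t)(64 - demo_highbit64(score)) << 56) | start);
}

int
main(void)
{
	const uint64_t MiB = 1ULL << 20;
	/* 64 MiB extent, half filled, fill weight 3: scores 80 MiB. */
	uint64_t score_a = demo_ext_score(64 * MiB, 32 * MiB, 3);
	/* 16 MiB extent, completely filled: scores 64 MiB. */
	uint64_t score_b = demo_ext_score(16 * MiB, 16 * MiB, 3);

	printf("score_a=%llu key_a=%016llx\n", (unsigned long long)score_a,
	    (unsigned long long)demo_ext_key(score_a, 100 * MiB));
	printf("score_b=%llu key_b=%016llx\n", (unsigned long long)score_b,
	    (unsigned long long)demo_ext_key(score_b, 500 * MiB));
	return (0);
}
#endif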