// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 *
 * Copyright (c) 2018, Intel Corporation.
 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
 * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
 * Copyright (c) 2024 by Delphix. All rights reserved.
 */

#include <sys/vdev_impl.h>
#include <sys/vdev_draid.h>
#include <sys/dsl_scan.h>
#include <sys/spa_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_rebuild.h>
#include <sys/zio.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/arc_impl.h>
#include <sys/zap.h>

/*
 * This file contains the sequential reconstruction implementation for
 * resilvering.  This form of resilvering is internally referred to as device
 * rebuild to avoid conflating it with the traditional healing reconstruction
 * performed by the dsl scan code.
 *
 * When replacing a device, or scrubbing the pool, ZFS has historically used
 * a process called resilvering which is a form of healing reconstruction.
 * This approach has the advantage that as blocks are read from disk their
 * checksums can be immediately verified and the data repaired.  Unfortunately,
 * it also results in a random IO pattern to the disk even when extra care
 * is taken to sequentialize the IO as much as possible.  This substantially
 * increases the time required to resilver the pool and restore redundancy.
 *
 * For mirrored devices it's possible to implement an alternate sequential
 * reconstruction strategy when resilvering.  Sequential reconstruction
 * behaves like a traditional RAID rebuild and reconstructs a device in LBA
 * order without verifying the checksum.  After this phase completes a second
 * scrub phase is started to verify all of the checksums.  This two phase
 * process will take longer than the healing reconstruction described above.
 * However, it has the advantage that after the reconstruction first phase
 * completes redundancy has been restored.  At this point the pool can incur
 * another device failure without risking data loss.
 *
 * There are a few noteworthy limitations and other advantages of resilvering
 * using sequential reconstruction vs healing reconstruction.
 *
 * Limitations:
 *
 *   - Sequential reconstruction is not possible on RAIDZ due to its
 *     variable stripe width.  Note dRAID uses a fixed stripe width which
 *     avoids this issue, but comes at the expense of some usable capacity.
 *
 *   - Block checksums are not verified during sequential reconstruction.
 *     Similar to traditional RAID the parity/mirror data is reconstructed
 *     but cannot be immediately double checked.  For this reason when the
 *     last active resilver completes the pool is automatically scrubbed
 *     by default.
 *
 *   - Deferred resilvers using sequential reconstruction are not currently
 *     supported.  When adding another vdev to an active top-level resilver
 *     it must be restarted.
 *
 * Advantages:
 *
 *   - Sequential reconstruction is performed in LBA order which may be faster
 *     than healing reconstruction particularly when using HDDs (or
 *     especially with SMR devices).  Only allocated capacity is resilvered.
 *
 *   - Sequential reconstruction is not constrained by ZFS block boundaries.
 *     This allows it to issue larger IOs to disk which span multiple blocks
 *     allowing all of these logical blocks to be repaired with a single IO.
 *
 *   - Unlike a healing resilver or scrub which are pool wide operations,
 *     sequential reconstruction is handled by the top-level vdevs.  This
 *     allows for it to be started or canceled on a top-level vdev without
 *     impacting any other top-level vdevs in the pool.
 *
 *   - Data only referenced by a pool checkpoint will be repaired because
 *     that space is reflected in the space maps.  This differs for a
 *     healing resilver or scrub which will not repair that data.
 */
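
/*
 * Illustrative usage sketch (not part of the implementation): a sequential
 * reconstruction is requested from user space by passing the -s flag to
 * zpool attach or zpool replace, for example:
 *
 *	zpool replace -s tank sdb sdc	# rebuild a failing disk onto sdc
 *	zpool attach -s tank sda sdd	# add a mirror child and rebuild it
 *
 * Pool and device names above are placeholders.  When the last active
 * rebuild completes, a verifying scrub is started automatically if
 * zfs_rebuild_scrub_enabled (below) is set.
 */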


/*
 * Size of rebuild reads; defaults to 1MiB per data disk and is capped at
 * SPA_MAXBLOCKSIZE.
 */
static uint64_t zfs_rebuild_max_segment = 1024 * 1024;

/*
 * Maximum number of bytes a sequential resilver will keep in flight per
 * leaf vdev.  We attempt to strike a balance here between keeping the vdev
 * queues full of I/Os at all times and not overflowing the queues, which
 * would cause long latency and therefore long txg sync times.
 *
 * A large default value can be safely used here because the default target
 * segment size is also large (zfs_rebuild_max_segment=1M).  This helps keep
 * the queue depth short.
 *
 * 64MB was observed to deliver the best performance and was set as the
 * default.  Testing was performed with a 106-drive dRAID HDD pool
 * (draid2:11d:106c) and a rebuild rate of 1.2GB/s was measured to the
 * distributed spare.  Smaller values were unable to fully saturate the
 * available pool I/O.
 */
static uint64_t zfs_rebuild_vdev_limit = 64 << 20;

/*
 * Automatically start a pool scrub when the last active sequential resilver
 * completes in order to verify the checksums of all blocks which have been
 * resilvered.  This option is enabled by default and is strongly recommended.
 */
static int zfs_rebuild_scrub_enabled = 1;
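
/*
 * Usage note (illustrative): the three tunables above are exported as
 * module parameters by the ZFS_MODULE_PARAM() declarations at the bottom
 * of this file.  On Linux they can typically be inspected or changed at
 * runtime through sysfs, e.g.:
 *
 *	cat /sys/module/zfs/parameters/zfs_rebuild_max_segment
 *	echo 0 > /sys/module/zfs/parameters/zfs_rebuild_scrub_enabled
 *
 * The paths and values shown are an example only.
 */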

/*
 * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
 */
static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg);
static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx);

/*
 * Clear the per-vdev rebuild bytes value for a vdev tree.
 */
static void
clear_rebuild_bytes(vdev_t *vd)
{
	vdev_stat_t *vs = &vd->vdev_stat;

	for (uint64_t i = 0; i < vd->vdev_children; i++)
		clear_rebuild_bytes(vd->vdev_child[i]);

	mutex_enter(&vd->vdev_stat_lock);
	vs->vs_rebuild_processed = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Determines whether a vdev_rebuild_thread() should be stopped.
 */
static boolean_t
vdev_rebuild_should_stop(vdev_t *vd)
{
	return (!vdev_writeable(vd) || vd->vdev_removing ||
	    vd->vdev_rebuild_exit_wanted ||
	    vd->vdev_rebuild_cancel_wanted ||
	    vd->vdev_rebuild_reset_wanted);
}

/*
 * Determine if the rebuild should be canceled.  This may happen when all
 * vdevs with MISSING DTLs are detached.
 */
static boolean_t
vdev_rebuild_should_cancel(vdev_t *vd)
{
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg))
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * The sync task for updating the on-disk state of a rebuild.  This is
 * scheduled by vdev_rebuild_range().
 */
static void
vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	uint64_t txg = dmu_tx_get_txg(tx);

	mutex_enter(&vd->vdev_rebuild_lock);

	if (vr->vr_scan_offset[txg & TXG_MASK] > 0) {
		vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK];
		vr->vr_scan_offset[txg & TXG_MASK] = 0;
	}

	vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms +
	    NSEC2MSEC(gethrtime() - vr->vr_pass_start_time);

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	mutex_exit(&vd->vdev_rebuild_lock);
}

/*
 * Initialize the on-disk state for a new rebuild, start the rebuild thread.
 */
static void
vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	ASSERT(vd->vdev_rebuilding);

	spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);

	mutex_enter(&vd->vdev_rebuild_lock);
	memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
	vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
	vrp->vrp_min_txg = 0;
	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
	vrp->vrp_start_time = gethrestime_sec();
	vrp->vrp_scan_time_ms = 0;
	vr->vr_prev_scan_time_ms = 0;

	/*
	 * Rebuilds are currently only used when replacing a device, in which
	 * case there must be DTL_MISSING entries.  In the future, we could
	 * allow rebuilds to be used in a way similar to a scrub.  This would
	 * be useful because it would allow us to rebuild the space used by
	 * pool checkpoints.
	 */
	VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu started",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);

	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
	vd->vdev_rebuild_thread = thread_create(NULL, 0,
	    vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);

	mutex_exit(&vd->vdev_rebuild_lock);
}

static void
vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, const char *name)
{
	nvlist_t *aux = fnvlist_alloc();

	fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential");
	spa_event_notify(spa, vd, aux, name);
	nvlist_free(aux);
}

/*
 * Called to request that a new rebuild be started.  The feature will remain
 * active for the duration of the rebuild, then revert to the enabled state.
 */
static void
vdev_rebuild_initiate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(vd->vdev_top == vd);
	ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock));
	ASSERT(!vd->vdev_rebuilding);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));

	vd->vdev_rebuilding = B_TRUE;

	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
	    (void *)(uintptr_t)vd->vdev_id, tx);
	dmu_tx_commit(tx);

	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
}

/*
 * Update the on-disk state to completed when a rebuild finishes.
 */
static void
vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	mutex_enter(&vd->vdev_rebuild_lock);

	/*
	 * Handle a second device failure if it occurs after all rebuild I/O
	 * has completed but before this sync task has been executed.
	 */
	if (vd->vdev_rebuild_reset_wanted) {
		mutex_exit(&vd->vdev_rebuild_lock);
		vdev_rebuild_reset_sync(arg, tx);
		return;
	}

	vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
	vrp->vrp_end_time = gethrestime_sec();

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
	spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu complete",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);

	/* Handles detaching of spares */
	spa_async_request(spa, SPA_ASYNC_REBUILD_DONE);
	vd->vdev_rebuilding = B_FALSE;
	mutex_exit(&vd->vdev_rebuild_lock);

	/*
	 * While we're in syncing context take the opportunity to
	 * setup the scrub when there are no more active rebuilds.
	 */
	setup_sync_arg_t setup_sync_arg = {
		.func = POOL_SCAN_SCRUB,
		.txgstart = 0,
		.txgend = 0,
	};
	if (dsl_scan_setup_check(&setup_sync_arg.func, tx) == 0 &&
	    zfs_rebuild_scrub_enabled) {
		dsl_scan_setup_sync(&setup_sync_arg, tx);
	}

	cv_broadcast(&vd->vdev_rebuild_cv);

	/* Clear recent error events (i.e. duplicate events tracking) */
	zfs_ereport_clear(spa, NULL);
}

/*
 * Update the on-disk state to canceled when a rebuild finishes.
 */
static void
vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	mutex_enter(&vd->vdev_rebuild_lock);
	vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED;
	vrp->vrp_end_time = gethrestime_sec();

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu canceled",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);

	vd->vdev_rebuild_cancel_wanted = B_FALSE;
	vd->vdev_rebuilding = B_FALSE;
	mutex_exit(&vd->vdev_rebuild_lock);

	spa_notify_waiters(spa);
	cv_broadcast(&vd->vdev_rebuild_cv);
}

/*
 * Resets the progress of a running rebuild.  This will occur when a new
 * vdev is added to rebuild.
 */
static void
vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	mutex_enter(&vd->vdev_rebuild_lock);

	ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);

	vrp->vrp_last_offset = 0;
	vrp->vrp_min_txg = 0;
	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
	vrp->vrp_bytes_scanned = 0;
	vrp->vrp_bytes_issued = 0;
	vrp->vrp_bytes_rebuilt = 0;
	vrp->vrp_bytes_est = 0;
	vrp->vrp_scan_time_ms = 0;
	vr->vr_prev_scan_time_ms = 0;

	/* See vdev_rebuild_initiate_sync comment */
	VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu reset",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);

	vd->vdev_rebuild_reset_wanted = B_FALSE;
	ASSERT(vd->vdev_rebuilding);

	vd->vdev_rebuild_thread = thread_create(NULL, 0,
	    vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);

	mutex_exit(&vd->vdev_rebuild_lock);
}

/*
 * Clear the last rebuild status.
 */
void
vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	objset_t *mos = spa_meta_objset(spa);

	mutex_enter(&vd->vdev_rebuild_lock);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) ||
	    vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) {
		mutex_exit(&vd->vdev_rebuild_lock);
		return;
	}

	clear_rebuild_bytes(vd);
	memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);

	if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
		VERIFY0(zap_update(mos, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
		    REBUILD_PHYS_ENTRIES, vrp, tx));
	}

	mutex_exit(&vd->vdev_rebuild_lock);
}

/*
 * The zio_done_func_t callback for each rebuild I/O issued.  It's responsible
 * for updating the rebuild stats and limiting the number of in flight I/Os.
 */
static void
vdev_rebuild_cb(zio_t *zio)
{
	vdev_rebuild_t *vr = zio->io_private;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	vdev_t *vd = vr->vr_top_vdev;

	mutex_enter(&vr->vr_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the top-level vdev was unavailable.
		 * Attempt to roll back to the last completed offset, in order
		 * to resume from the correct location if the pool is resumed.
		 * (This works because spa_sync waits on spa_txg_zio before
		 * it runs sync tasks.)
		 */
		uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else if (zio->io_error) {
		vrp->vrp_errors++;
	}

	abd_free(zio->io_abd);

	ASSERT3U(vr->vr_bytes_inflight, >, 0);
	vr->vr_bytes_inflight -= zio->io_size;
	cv_broadcast(&vr->vr_io_cv);
	mutex_exit(&vr->vr_io_lock);

	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/*
 * Initialize a block pointer that can be used to read the given segment
 * for sequential rebuild.
 */
static void
vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
    uint64_t asize)
{
	ASSERT(vd->vdev_ops == &vdev_draid_ops ||
	    vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops);

	uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
	    vdev_draid_asize_to_psize(vd, asize) : asize;

	BP_ZERO(bp);

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], start);
	DVA_SET_GANG(&bp->blk_dva[0], 0);
	DVA_SET_ASIZE(&bp->blk_dva[0], asize);

	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
	BP_SET_LSIZE(bp, psize);
	BP_SET_PSIZE(bp, psize);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	BP_SET_TYPE(bp, DMU_OT_NONE);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
}
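
/*
 * Example for clarity (assumed values, not derived from any real pool):
 * for a mirror top-level vdev asize equals psize, so a 1MiB segment at
 * offset 0x40000000 produces a block pointer with a single DVA of
 * <vd->vdev_id, offset 0x40000000, asize 1MiB>, no checksum, and no
 * compression.  Only dRAID converts asize to a smaller psize, since its
 * allocated size also covers parity and padding sectors.
 */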

/*
 * Issues a rebuild I/O and takes care of rate limiting the number of queued
 * rebuild I/Os.  The provided start and size must be properly aligned for the
 * top-level vdev type being rebuilt.
 */
static int
vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
{
	uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
	vdev_t *vd = vr->vr_top_vdev;
	spa_t *spa = vd->vdev_spa;
	blkptr_t blk;

	ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
	ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);

	vr->vr_pass_bytes_scanned += size;
	vr->vr_rebuild_phys.vrp_bytes_scanned += size;

	/*
	 * Rebuild the data in this range by constructing a special block
	 * pointer.  It has no relation to any existing blocks in the pool.
	 * However, by disabling checksum verification and issuing a scrub IO
	 * we can reconstruct and repair any children with missing data.
	 */
	vdev_rebuild_blkptr_init(&blk, vd, start, size);
	uint64_t psize = BP_GET_PSIZE(&blk);

	if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
		vr->vr_pass_bytes_skipped += size;
		return (0);
	}

	mutex_enter(&vr->vr_io_lock);

	/* Limit in flight rebuild I/Os */
	while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);

	vr->vr_bytes_inflight += psize;
	mutex_exit(&vr->vr_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_rebuild_lock);

	/* This is the first I/O for this txg. */
	if (vr->vr_scan_offset[txg & TXG_MASK] == 0) {
		vr->vr_scan_offset[txg & TXG_MASK] = start;
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_rebuild_update_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	}

	/* When exiting write out our progress. */
	if (vdev_rebuild_should_stop(vd)) {
		mutex_enter(&vr->vr_io_lock);
		vr->vr_bytes_inflight -= psize;
		mutex_exit(&vr->vr_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_rebuild_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_rebuild_lock);
	dmu_tx_commit(tx);

	vr->vr_scan_offset[txg & TXG_MASK] = start + size;
	vr->vr_pass_bytes_issued += size;
	vr->vr_rebuild_phys.vrp_bytes_issued += size;

	zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
	    abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
	    ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_RESILVER, NULL));

	return (0);
}

/*
 * Issues rebuild I/Os for all ranges in the provided vr->vr_scan_tree range
 * tree.
 */
static int
vdev_rebuild_ranges(vdev_rebuild_t *vr)
{
	vdev_t *vd = vr->vr_top_vdev;
	zfs_btree_t *t = &vr->vr_scan_tree->rt_root;
	zfs_btree_index_t idx;
	int error;

	for (zfs_range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
	    rs = zfs_btree_next(t, &idx, &idx)) {
		uint64_t start = zfs_rs_get_start(rs, vr->vr_scan_tree);
		uint64_t size = zfs_rs_get_end(rs, vr->vr_scan_tree) - start;

		/*
		 * zfs_scan_suspend_progress can be set to disable rebuild
		 * progress for testing.  See comment in dsl_scan_sync().
		 */
		while (zfs_scan_suspend_progress &&
		    !vdev_rebuild_should_stop(vd)) {
			delay(hz);
		}

		while (size > 0) {
			uint64_t chunk_size;

			/*
			 * Split range into legally-sized logical chunks
			 * given the constraints of the top-level vdev
			 * being rebuilt (dRAID or mirror).
			 */
			ASSERT3P(vd->vdev_ops, !=, NULL);
			chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
			    start, size, zfs_rebuild_max_segment);

			error = vdev_rebuild_range(vr, start, chunk_size);
			if (error != 0)
				return (error);

			size -= chunk_size;
			start += chunk_size;
		}
	}

	return (0);
}

/*
 * Calculates the estimated capacity which remains to be scanned.  Since
 * we traverse the pool in metaslab order only allocated capacity beyond
 * the vrp_last_offset need be considered.  All lower offsets must have
 * already been rebuilt and are thus already included in vrp_bytes_scanned.
 */
static void
vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id)
{
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	uint64_t bytes_est = vrp->vrp_bytes_scanned;

	if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start)
		return;

	for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_ms[i];

		mutex_enter(&msp->ms_lock);
		bytes_est += metaslab_allocated_space(msp);
		mutex_exit(&msp->ms_lock);
	}

	vrp->vrp_bytes_est = bytes_est;
}

/*
 * Load from disk the top-level vdev's rebuild information.
 */
int
vdev_rebuild_load(vdev_t *vd)
{
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	spa_t *spa = vd->vdev_spa;
	int err = 0;

	mutex_enter(&vd->vdev_rebuild_lock);
	vd->vdev_rebuilding = B_FALSE;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
		memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
		mutex_exit(&vd->vdev_rebuild_lock);
		return (SET_ERROR(ENOTSUP));
	}

	ASSERT(vd->vdev_top == vd);

	err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp);

	/*
	 * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should
	 * not prevent a pool from being imported.  Clear the rebuild
	 * status allowing a new resilver/rebuild to be started.
	 */
	if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) {
		memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
	} else if (err) {
		mutex_exit(&vd->vdev_rebuild_lock);
		return (err);
	}

	vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms;
	vr->vr_top_vdev = vd;

	mutex_exit(&vd->vdev_rebuild_lock);

	return (0);
}

/*
 * Each scan thread is responsible for rebuilding a top-level vdev.  The
 * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
 */
static __attribute__((noreturn)) void
vdev_rebuild_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int error = 0;

	/*
	 * If there's a scrub in process request that it be stopped.  This
	 * is not required for a correct rebuild, but we do want rebuilds to
	 * emulate the resilver behavior as much as possible.
	 */
	dsl_pool_t *dsl = spa_get_dsl(spa);
	if (dsl_scan_scrubbing(dsl))
		dsl_scan_cancel(dsl);

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	mutex_enter(&vd->vdev_rebuild_lock);

	ASSERT3P(vd->vdev_top, ==, vd);
	ASSERT3P(vd->vdev_rebuild_thread, !=, NULL);
	ASSERT(vd->vdev_rebuilding);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
	ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);

	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	vr->vr_top_vdev = vd;
	vr->vr_scan_msp = NULL;
	vr->vr_scan_tree = zfs_range_tree_create(NULL, ZFS_RANGE_SEG64, NULL,
	    0, 0);
	mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);

	vr->vr_pass_start_time = gethrtime();
	vr->vr_pass_bytes_scanned = 0;
	vr->vr_pass_bytes_issued = 0;
	vr->vr_pass_bytes_skipped = 0;

	uint64_t update_est_time = gethrtime();
	vdev_rebuild_update_bytes_est(vd, 0);

	clear_rebuild_bytes(vr->vr_top_vdev);

	mutex_exit(&vd->vdev_rebuild_lock);

	/*
	 * Systematically walk the metaslabs and issue rebuild I/Os for
	 * all ranges in the allocated space map.
	 */
	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_ms[i];
		vr->vr_scan_msp = msp;

		/*
		 * Calculate the max number of in-flight bytes for top-level
		 * vdev scanning operations (minimum 1MB, maximum 1/2 of
		 * arc_c_max shared by all top-level vdevs).  Limits for the
		 * issuing phase are done per top-level vdev and are handled
		 * separately.
		 */
		uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1);
		vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20,
		    zfs_rebuild_vdev_limit * vd->vdev_children));
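
		/*
		 * Illustrative arithmetic (assumed example values): with
		 * arc_c_max = 16GiB and four top-level vdevs, limit is
		 * (16GiB / 2) / 4 = 2GiB.  A 2-way mirror using the default
		 * zfs_rebuild_vdev_limit of 64MB is then granted
		 * MIN(2GiB, MAX(1MB, 64MB * 2)) = 128MB of in-flight scan
		 * bytes, while a very wide dRAID vdev would instead be
		 * capped by the 2GiB limit.
		 */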

		/*
		 * Removal of vdevs from the vdev tree may eliminate the need
		 * for the rebuild, in which case it should be canceled.  The
		 * vdev_rebuild_cancel_wanted flag is set until the sync task
		 * completes.  This may be after the rebuild thread exits.
		 */
		if (vdev_rebuild_should_cancel(vd)) {
			vd->vdev_rebuild_cancel_wanted = B_TRUE;
			error = EINTR;
			break;
		}

		ASSERT0(zfs_range_tree_space(vr->vr_scan_tree));

		/* Disable any new allocations to this metaslab */
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);

		mutex_enter(&msp->ms_sync_lock);
		mutex_enter(&msp->ms_lock);

		/*
		 * If there are outstanding allocations wait for them to be
		 * synced.  This is needed to ensure all allocated ranges are
		 * on disk and therefore will be rebuilt.
		 */
		for (int j = 0; j < TXG_SIZE; j++) {
			if (zfs_range_tree_space(msp->ms_allocating[j])) {
				mutex_exit(&msp->ms_lock);
				mutex_exit(&msp->ms_sync_lock);
				txg_wait_synced(dsl, 0);
				mutex_enter(&msp->ms_sync_lock);
				mutex_enter(&msp->ms_lock);
				break;
			}
		}

		/*
		 * Once the metaslab has been allocated from, read its
		 * allocated ranges from the space map object into the
		 * vr_scan_tree.  Then add inflight / unflushed ranges and
		 * remove inflight / unflushed frees.  This is the minimum
		 * range to be rebuilt.
		 */
		if (msp->ms_sm != NULL) {
			VERIFY0(space_map_load(msp->ms_sm,
			    vr->vr_scan_tree, SM_ALLOC));

			for (int i = 0; i < TXG_SIZE; i++) {
				ASSERT0(zfs_range_tree_space(
				    msp->ms_allocating[i]));
			}

			zfs_range_tree_walk(msp->ms_unflushed_allocs,
			    zfs_range_tree_add, vr->vr_scan_tree);
			zfs_range_tree_walk(msp->ms_unflushed_frees,
			    zfs_range_tree_remove, vr->vr_scan_tree);

			/*
			 * Remove ranges which have already been rebuilt based
			 * on the last offset.  This can happen when restarting
			 * a scan after exporting and re-importing the pool.
			 */
			zfs_range_tree_clear(vr->vr_scan_tree, 0,
			    vrp->vrp_last_offset);
		}

		mutex_exit(&msp->ms_lock);
		mutex_exit(&msp->ms_sync_lock);

		/*
		 * To provide an accurate estimate re-calculate the estimated
		 * size every 5 minutes to account for recent allocations and
		 * frees made to space maps which have not yet been rebuilt.
		 */
		if (gethrtime() > update_est_time + SEC2NSEC(300)) {
			update_est_time = gethrtime();
			vdev_rebuild_update_bytes_est(vd, i);
		}

		/*
		 * Walk the allocated space map and issue the rebuild I/O.
		 */
		error = vdev_rebuild_ranges(vr);
		zfs_range_tree_vacate(vr->vr_scan_tree, NULL, NULL);

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		metaslab_enable(msp, B_FALSE, B_FALSE);

		if (error != 0)
			break;
	}

	zfs_range_tree_destroy(vr->vr_scan_tree);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/* Wait for any remaining rebuild I/O to complete */
	mutex_enter(&vr->vr_io_lock);
	while (vr->vr_bytes_inflight > 0)
		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);

	mutex_exit(&vr->vr_io_lock);

	mutex_destroy(&vr->vr_io_lock);
	cv_destroy(&vr->vr_io_cv);

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	dsl_pool_t *dp = spa_get_dsl(spa);
	dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));

	mutex_enter(&vd->vdev_rebuild_lock);
	if (error == 0) {
		/*
		 * After a successful rebuild clear the DTLs of all ranges
		 * which were missing when the rebuild was started.  These
		 * ranges must have been rebuilt as a consequence of rebuilding
		 * all allocated space.  Note that unlike a scrub or resilver
		 * the rebuild operation will reconstruct data only referenced
		 * by a pool checkpoint.  See the dsl_scan_done() comments.
		 */
		dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	} else if (vd->vdev_rebuild_cancel_wanted) {
		/*
		 * The rebuild operation was canceled.  This will occur when
		 * a device participating in the rebuild is detached.
		 */
		dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	} else if (vd->vdev_rebuild_reset_wanted) {
		/*
		 * Reset the running rebuild without canceling and restarting
		 * it.  This will occur when a new device is attached and must
		 * participate in the rebuild.
		 */
		dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	} else {
		/*
		 * The rebuild operation should be suspended.  This may occur
		 * when detaching a child vdev or when exporting the pool.  The
		 * rebuild is left in the active state so it will be resumed.
		 */
		ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
		vd->vdev_rebuilding = B_FALSE;
	}

	dmu_tx_commit(tx);

	vd->vdev_rebuild_thread = NULL;
	mutex_exit(&vd->vdev_rebuild_lock);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	cv_broadcast(&vd->vdev_rebuild_cv);

	thread_exit();
}

/*
 * Returns B_TRUE if any top-level vdev is rebuilding.
 */
boolean_t
vdev_rebuild_active(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	boolean_t ret = B_FALSE;

	if (vd == spa->spa_root_vdev) {
		for (uint64_t i = 0; i < vd->vdev_children; i++) {
			ret = vdev_rebuild_active(vd->vdev_child[i]);
			if (ret)
				return (ret);
		}
	} else if (vd->vdev_top_zap != 0) {
		vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
		vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

		mutex_enter(&vd->vdev_rebuild_lock);
		ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
		mutex_exit(&vd->vdev_rebuild_lock);
	}

	return (ret);
}

/*
 * Start a rebuild operation.  The rebuild may be restarted when the
 * top-level vdev is currently actively rebuilding.
 */
void
vdev_rebuild(vdev_t *vd)
{
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys;

	ASSERT(vd->vdev_top == vd);
	ASSERT(vdev_is_concrete(vd));
	ASSERT(!vd->vdev_removing);
	ASSERT(spa_feature_is_enabled(vd->vdev_spa,
	    SPA_FEATURE_DEVICE_REBUILD));

	mutex_enter(&vd->vdev_rebuild_lock);
	if (vd->vdev_rebuilding) {
		ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE);

		/*
		 * Signal a running rebuild operation that it should restart
		 * from the beginning because a new device was attached.  The
		 * vdev_rebuild_reset_wanted flag is set until the sync task
		 * completes.  This may be after the rebuild thread exits.
		 */
		if (!vd->vdev_rebuild_reset_wanted)
			vd->vdev_rebuild_reset_wanted = B_TRUE;
	} else {
		vdev_rebuild_initiate(vd);
	}
	mutex_exit(&vd->vdev_rebuild_lock);
}

static void
vdev_rebuild_restart_impl(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	if (vd == spa->spa_root_vdev) {
		for (uint64_t i = 0; i < vd->vdev_children; i++)
			vdev_rebuild_restart_impl(vd->vdev_child[i]);

	} else if (vd->vdev_top_zap != 0) {
		vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
		vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

		mutex_enter(&vd->vdev_rebuild_lock);
		if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE &&
		    vdev_writeable(vd) && !vd->vdev_rebuilding) {
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_DEVICE_REBUILD));
			vd->vdev_rebuilding = B_TRUE;
			vd->vdev_rebuild_thread = thread_create(NULL, 0,
			    vdev_rebuild_thread, vd, 0, &p0, TS_RUN,
			    maxclsyspri);
		}
		mutex_exit(&vd->vdev_rebuild_lock);
	}
}

/*
 * Conditionally restart all of the vdev_rebuild_thread's for a pool.  The
 * feature flag must be active and the rebuild in the active state.  This
 * cannot be used to start a new rebuild.
 */
void
vdev_rebuild_restart(spa_t *spa)
{
	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
	    spa->spa_load_thread == curthread);

	vdev_rebuild_restart_impl(spa->spa_root_vdev);
}

/*
 * Stop and wait for all of the vdev_rebuild_thread's associated with the
 * vdev tree provided to be terminated (canceled or stopped).
 */
void
vdev_rebuild_stop_wait(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
	    spa->spa_export_thread == curthread);

	if (vd == spa->spa_root_vdev) {
		for (uint64_t i = 0; i < vd->vdev_children; i++)
			vdev_rebuild_stop_wait(vd->vdev_child[i]);

	} else if (vd->vdev_top_zap != 0) {
		ASSERT(vd == vd->vdev_top);

		mutex_enter(&vd->vdev_rebuild_lock);
		if (vd->vdev_rebuild_thread != NULL) {
			vd->vdev_rebuild_exit_wanted = B_TRUE;
			while (vd->vdev_rebuilding) {
				cv_wait(&vd->vdev_rebuild_cv,
				    &vd->vdev_rebuild_lock);
			}
			vd->vdev_rebuild_exit_wanted = B_FALSE;
		}
		mutex_exit(&vd->vdev_rebuild_lock);
	}
}

/*
 * Stop all rebuild operations but leave them in the active state so they
 * will be resumed when importing the pool.
 */
void
vdev_rebuild_stop_all(spa_t *spa)
{
	vdev_rebuild_stop_wait(spa->spa_root_vdev);
}

/*
 * Rebuild statistics reported per top-level vdev.
 */
int
vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
{
	spa_t *spa = tvd->vdev_spa;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
		return (SET_ERROR(ENOTSUP));

	if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0)
		return (SET_ERROR(EINVAL));

	int error = zap_contains(spa_meta_objset(spa),
	    tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS);

	if (error == ENOENT) {
		memset(vrs, 0, sizeof (vdev_rebuild_stat_t));
		vrs->vrs_state = VDEV_REBUILD_NONE;
		error = 0;
	} else if (error == 0) {
		vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
		vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

		mutex_enter(&tvd->vdev_rebuild_lock);
		vrs->vrs_state = vrp->vrp_rebuild_state;
		vrs->vrs_start_time = vrp->vrp_start_time;
		vrs->vrs_end_time = vrp->vrp_end_time;
		vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms;
		vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned;
		vrs->vrs_bytes_issued = vrp->vrp_bytes_issued;
		vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt;
		vrs->vrs_bytes_est = vrp->vrp_bytes_est;
		vrs->vrs_errors = vrp->vrp_errors;
		vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() -
		    vr->vr_pass_start_time);
		vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
		vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
		vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
		mutex_exit(&tvd->vdev_rebuild_lock);
	}

	return (error);
}

ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW,
	"Max segment size in bytes of rebuild reads");

ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW,
	"Max bytes in flight per leaf vdev for sequential resilvers");

ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
	"Automatically scrub after sequential resilver completes");
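
/*
 * Usage note (illustrative): the counters copied out by
 * vdev_rebuild_get_stats() are what user space tools such as `zpool status`
 * rely on to report per-top-level-vdev rebuild progress.  The exact output
 * format is produced outside this file and is mentioned here only as an
 * assumed example of how these statistics are consumed.
 */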