/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/refcount.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>

/*
 * Value that is written to disk during initialization.
 */
uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;

/* maximum number of I/Os outstanding per leaf vdev */
int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
uint64_t zfs_initialize_chunk_size = 1024 * 1024;

static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
        return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
            vd->vdev_detached || vd->vdev_top->vdev_removing);
}
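
/*
 * Sync task that records the current initializing state in the vdev's leaf
 * ZAP: the last offset written during this txg (if any), the action time,
 * and the initialize state itself.
 */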
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
        /*
         * We pass in the guid instead of the vdev_t since the vdev may
         * have been freed prior to the sync task being processed. This
         * happens when a vdev is detached as we call spa_config_vdev_exit(),
         * stop the initializing thread, schedule the sync task, and free
         * the vdev. Later when the scheduled sync task is invoked, it would
         * find that the vdev has been freed.
         */
        uint64_t guid = *(uint64_t *)arg;
        uint64_t txg = dmu_tx_get_txg(tx);
        kmem_free(arg, sizeof (uint64_t));

        vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
        if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
                return;

        uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
        vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

        VERIFY(vd->vdev_leaf_zap != 0);

        objset_t *mos = vd->vdev_spa->spa_meta_objset;

        if (last_offset > 0) {
                vd->vdev_initialize_last_offset = last_offset;
                VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
                    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
                    sizeof (last_offset), 1, &last_offset, tx));
        }
        if (vd->vdev_initialize_action_time > 0) {
                uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
                VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
                    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
                    1, &val, tx));
        }

        uint64_t initialize_state = vd->vdev_initialize_state;
        VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
            VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
            &initialize_state, tx));
}

static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
        spa_t *spa = vd->vdev_spa;

        if (new_state == vd->vdev_initialize_state)
                return;

        /*
         * Copy the vd's guid, this will be freed by the sync task.
         */
        uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
        *guid = vd->vdev_guid;

        /*
         * If we're suspending, then preserve the original start time.
         */
        if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
                vd->vdev_initialize_action_time = gethrestime_sec();
        }
        vd->vdev_initialize_state = new_state;

        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
            guid, 2, ZFS_SPACE_CHECK_NONE, tx);

        switch (new_state) {
        case VDEV_INITIALIZE_ACTIVE:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s activated", vd->vdev_path);
                break;
        case VDEV_INITIALIZE_SUSPENDED:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s suspended", vd->vdev_path);
                break;
        case VDEV_INITIALIZE_CANCELED:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s canceled", vd->vdev_path);
                break;
        case VDEV_INITIALIZE_COMPLETE:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s complete", vd->vdev_path);
                break;
        default:
                panic("invalid state %llu", (unsigned long long)new_state);
        }

        dmu_tx_commit(tx);
}
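
/*
 * Done callback for initializing writes. If the I/O failed because the vdev
 * became unwriteable, roll the per-txg offset back so the range is retried;
 * otherwise count any error and credit the bytes written. In all cases drop
 * the in-flight count, wake any waiter in vdev_initialize_write(), and
 * release the SCL_STATE_ALL hold taken there.
 */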
static void
vdev_initialize_cb(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        mutex_enter(&vd->vdev_initialize_io_lock);
        if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
                /*
                 * The I/O failed because the vdev was unavailable; roll the
                 * last offset back. (This works because spa_sync waits on
                 * spa_txg_zio before it runs sync tasks.)
                 */
                uint64_t *off =
                    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
                *off = MIN(*off, zio->io_offset);
        } else {
                /*
                 * Since initializing is best-effort, we ignore I/O errors and
                 * rely on vdev_probe to determine if the errors are more
                 * critical.
                 */
                if (zio->io_error != 0)
                        vd->vdev_stat.vs_initialize_errors++;

                vd->vdev_initialize_bytes_done += zio->io_orig_size;
        }
        ASSERT3U(vd->vdev_initialize_inflight, >, 0);
        vd->vdev_initialize_inflight--;
        cv_broadcast(&vd->vdev_initialize_io_cv);
        mutex_exit(&vd->vdev_initialize_io_lock);

        spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
        spa_t *spa = vd->vdev_spa;

        /* Limit inflight initializing I/Os */
        mutex_enter(&vd->vdev_initialize_io_lock);
        while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
                cv_wait(&vd->vdev_initialize_io_cv,
                    &vd->vdev_initialize_io_lock);
        }
        vd->vdev_initialize_inflight++;
        mutex_exit(&vd->vdev_initialize_io_lock);

        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        uint64_t txg = dmu_tx_get_txg(tx);

        spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
        mutex_enter(&vd->vdev_initialize_lock);

        if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
                uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
                *guid = vd->vdev_guid;

                /* This is the first write of this txg. */
                dsl_sync_task_nowait(spa_get_dsl(spa),
                    vdev_initialize_zap_update_sync, guid, 2,
                    ZFS_SPACE_CHECK_RESERVED, tx);
        }

        /*
         * We know the vdev struct will still be around since all
         * consumers of vdev_free must stop the initialization first.
         */
        if (vdev_initialize_should_stop(vd)) {
                mutex_enter(&vd->vdev_initialize_io_lock);
                ASSERT3U(vd->vdev_initialize_inflight, >, 0);
                vd->vdev_initialize_inflight--;
                mutex_exit(&vd->vdev_initialize_io_lock);
                spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
                mutex_exit(&vd->vdev_initialize_lock);
                dmu_tx_commit(tx);
                return (SET_ERROR(EINTR));
        }
        mutex_exit(&vd->vdev_initialize_lock);

        vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
        zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
            size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
            ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
        /* vdev_initialize_cb releases SCL_STATE_ALL */

        dmu_tx_commit(tx);

        return (0);
}

/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.
 */
/* ARGSUSED */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
        ASSERT0(len % sizeof (uint64_t));
        for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
                *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
        }
        return (0);
}

static abd_t *
vdev_initialize_block_alloc()
{
        /* Allocate ABD for filler data */
        abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

        ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
        (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
            vdev_initialize_block_fill, NULL);

        return (data);
}

static void
vdev_initialize_block_free(abd_t *data)
{
        abd_free(data);
}
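
/*
 * Issue initializing writes for every segment in vdev_initialize_tree,
 * splitting each segment into chunks of at most zfs_initialize_chunk_size.
 * For example, with the default 1MiB chunk size a 3.5MiB segment is issued
 * as four writes: three of 1MiB followed by one of 512KiB.
 */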
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
        range_tree_t *rt = vd->vdev_initialize_tree;
        zfs_btree_t *bt = &rt->rt_root;
        zfs_btree_index_t where;

        for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL;
            rs = zfs_btree_next(bt, &where, &where)) {
                uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt);

                /* Split range into legally-sized physical chunks */
                uint64_t writes_required =
                    ((size - 1) / zfs_initialize_chunk_size) + 1;

                for (uint64_t w = 0; w < writes_required; w++) {
                        int error;

                        error = vdev_initialize_write(vd,
                            VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) +
                            (w * zfs_initialize_chunk_size),
                            MIN(size - (w * zfs_initialize_chunk_size),
                            zfs_initialize_chunk_size), data);
                        if (error != 0)
                                return (error);
                }
        }
        return (0);
}

static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
        ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
            spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
        ASSERT(vd->vdev_leaf_zap != 0);

        vd->vdev_initialize_bytes_est = 0;
        vd->vdev_initialize_bytes_done = 0;

        for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
                metaslab_t *msp = vd->vdev_top->vdev_ms[i];
                mutex_enter(&msp->ms_lock);

                uint64_t ms_free = msp->ms_size -
                    metaslab_allocated_space(msp);

                if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
                        ms_free /= vd->vdev_top->vdev_children;

                /*
                 * Convert the metaslab range to a physical range
                 * on our vdev. We use this to determine if we are
                 * in the middle of this metaslab range.
                 */
                range_seg64_t logical_rs, physical_rs;
                logical_rs.rs_start = msp->ms_start;
                logical_rs.rs_end = msp->ms_start + msp->ms_size;
                vdev_xlate(vd, &logical_rs, &physical_rs);

                if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
                        vd->vdev_initialize_bytes_est += ms_free;
                        mutex_exit(&msp->ms_lock);
                        continue;
                } else if (vd->vdev_initialize_last_offset >
                    physical_rs.rs_end) {
                        vd->vdev_initialize_bytes_done += ms_free;
                        vd->vdev_initialize_bytes_est += ms_free;
                        mutex_exit(&msp->ms_lock);
                        continue;
                }

                /*
                 * If we get here, we're in the middle of initializing this
                 * metaslab. Load it and walk the free tree for more accurate
                 * progress estimation.
                 */
                VERIFY0(metaslab_load(msp));

                zfs_btree_index_t where;
                range_tree_t *rt = msp->ms_allocatable;
                for (range_seg_t *rs =
                    zfs_btree_first(&rt->rt_root, &where); rs;
                    rs = zfs_btree_next(&rt->rt_root, &where,
                    &where)) {
                        logical_rs.rs_start = rs_get_start(rs, rt);
                        logical_rs.rs_end = rs_get_end(rs, rt);
                        vdev_xlate(vd, &logical_rs, &physical_rs);

                        uint64_t size = physical_rs.rs_end -
                            physical_rs.rs_start;
                        vd->vdev_initialize_bytes_est += size;
                        if (vd->vdev_initialize_last_offset >
                            physical_rs.rs_end) {
                                vd->vdev_initialize_bytes_done += size;
                        } else if (vd->vdev_initialize_last_offset >
                            physical_rs.rs_start &&
                            vd->vdev_initialize_last_offset <
                            physical_rs.rs_end) {
                                vd->vdev_initialize_bytes_done +=
                                    vd->vdev_initialize_last_offset -
                                    physical_rs.rs_start;
                        }
                }
                mutex_exit(&msp->ms_lock);
        }
}
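
/*
 * Load the persisted last-initialized offset from the leaf vdev ZAP when an
 * initialization is active or suspended (a missing entry means we start from
 * offset 0), then recompute the progress estimate.
 */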
static int
vdev_initialize_load(vdev_t *vd)
{
        int err = 0;
        ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
            spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
        ASSERT(vd->vdev_leaf_zap != 0);

        if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
            vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
                err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
                    sizeof (vd->vdev_initialize_last_offset), 1,
                    &vd->vdev_initialize_last_offset);
                if (err == ENOENT) {
                        vd->vdev_initialize_last_offset = 0;
                        err = 0;
                }
        }

        vdev_initialize_calculate_progress(vd);
        return (err);
}

/*
 * Convert the logical range into a physical range and add it to our
 * range tree.
 */
void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
        vdev_t *vd = arg;
        range_seg64_t logical_rs, physical_rs;
        logical_rs.rs_start = start;
        logical_rs.rs_end = start + size;

        ASSERT(vd->vdev_ops->vdev_op_leaf);
        vdev_xlate(vd, &logical_rs, &physical_rs);

        IMPLY(vd->vdev_top == vd,
            logical_rs.rs_start == physical_rs.rs_start);
        IMPLY(vd->vdev_top == vd,
            logical_rs.rs_end == physical_rs.rs_end);

        /* Only add segments that we have not visited yet */
        if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
                return;

        /* Pick up where we left off mid-range. */
        if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
                zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
                    "(%llu, %llu)", vd->vdev_path,
                    (u_longlong_t)physical_rs.rs_start,
                    (u_longlong_t)physical_rs.rs_end,
                    (u_longlong_t)vd->vdev_initialize_last_offset,
                    (u_longlong_t)physical_rs.rs_end);
                ASSERT3U(physical_rs.rs_end, >,
                    vd->vdev_initialize_last_offset);
                physical_rs.rs_start = vd->vdev_initialize_last_offset;
        }
        ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

        /*
         * With raidz, it's possible that the logical range does not live on
         * this leaf vdev. We only add the physical range to this vdev's tree
         * if it has a length greater than 0.
         */
        if (physical_rs.rs_end > physical_rs.rs_start) {
                range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
                    physical_rs.rs_end - physical_rs.rs_start);
        } else {
                ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
        }
}
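
/*
 * The initializing thread for a single leaf vdev. For each metaslab of the
 * top-level vdev it loads the metaslab, gathers its allocatable ranges
 * (translated to physical offsets) into vdev_initialize_tree, and overwrites
 * them with the fill pattern. When the pass completes without interruption
 * the state is moved to VDEV_INITIALIZE_COMPLETE.
 */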
static void
vdev_initialize_thread(void *arg)
{
        vdev_t *vd = arg;
        spa_t *spa = vd->vdev_spa;
        int error = 0;
        uint64_t ms_count = 0;

        ASSERT(vdev_is_concrete(vd));
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

        vd->vdev_initialize_last_offset = 0;
        VERIFY0(vdev_initialize_load(vd));

        abd_t *deadbeef = vdev_initialize_block_alloc();

        vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
            0, 0);

        for (uint64_t i = 0; !vd->vdev_detached &&
            i < vd->vdev_top->vdev_ms_count; i++) {
                metaslab_t *msp = vd->vdev_top->vdev_ms[i];
                boolean_t unload_when_done = B_FALSE;

                /*
                 * If we've expanded the top-level vdev or it's our
                 * first pass, calculate our progress.
                 */
                if (vd->vdev_top->vdev_ms_count != ms_count) {
                        vdev_initialize_calculate_progress(vd);
                        ms_count = vd->vdev_top->vdev_ms_count;
                }

                spa_config_exit(spa, SCL_CONFIG, FTAG);
                metaslab_disable(msp);
                mutex_enter(&msp->ms_lock);
                if (!msp->ms_loaded && !msp->ms_loading)
                        unload_when_done = B_TRUE;
                VERIFY0(metaslab_load(msp));

                range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
                    vd);
                mutex_exit(&msp->ms_lock);

                error = vdev_initialize_ranges(vd, deadbeef);
                metaslab_enable(msp, B_TRUE, unload_when_done);
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

                range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
                if (error != 0)
                        break;
        }

        spa_config_exit(spa, SCL_CONFIG, FTAG);
        mutex_enter(&vd->vdev_initialize_io_lock);
        while (vd->vdev_initialize_inflight > 0) {
                cv_wait(&vd->vdev_initialize_io_cv,
                    &vd->vdev_initialize_io_lock);
        }
        mutex_exit(&vd->vdev_initialize_io_lock);

        range_tree_destroy(vd->vdev_initialize_tree);
        vdev_initialize_block_free(deadbeef);
        vd->vdev_initialize_tree = NULL;

        mutex_enter(&vd->vdev_initialize_lock);
        if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
                vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
        }
        ASSERT(vd->vdev_initialize_thread != NULL ||
            vd->vdev_initialize_inflight == 0);

        /*
         * Drop the vdev_initialize_lock while we sync out the
         * txg since it's possible that a device might be trying to
         * come online and must check to see if it needs to restart an
         * initialization. That thread will be holding the spa_config_lock
         * which would prevent the txg_wait_synced from completing.
         */
        mutex_exit(&vd->vdev_initialize_lock);
        txg_wait_synced(spa_get_dsl(spa), 0);
        mutex_enter(&vd->vdev_initialize_lock);

        vd->vdev_initialize_thread = NULL;
        cv_broadcast(&vd->vdev_initialize_cv);
        mutex_exit(&vd->vdev_initialize_lock);
}

/*
 * Initiates initialization of a device. Caller must hold
 * vdev_initialize_lock. Device must be a leaf and not already be
 * initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
        ASSERT(vd->vdev_ops->vdev_op_leaf);
        ASSERT(vdev_is_concrete(vd));
        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
        ASSERT(!vd->vdev_detached);
        ASSERT(!vd->vdev_initialize_exit_wanted);
        ASSERT(!vd->vdev_top->vdev_removing);

        vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
        vd->vdev_initialize_thread = thread_create(NULL, 0,
            vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}

/*
 * Wait for the initialize thread to be terminated (cancelled or stopped).
 */
static void
vdev_initialize_stop_wait_impl(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));

        while (vd->vdev_initialize_thread != NULL)
                cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);

        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
        vd->vdev_initialize_exit_wanted = B_FALSE;
}

/*
 * Wait for vdev initialize threads which were listed to cleanly exit.
 */
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
        vdev_t *vd;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        while ((vd = list_remove_head(vd_list)) != NULL) {
                mutex_enter(&vd->vdev_initialize_lock);
                vdev_initialize_stop_wait_impl(vd);
                mutex_exit(&vd->vdev_initialize_lock);
        }
}

/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
 * a list_t is provided the stopping vdev is inserted into the list. Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit. The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
        ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
        ASSERT(vd->vdev_ops->vdev_op_leaf);
        ASSERT(vdev_is_concrete(vd));

        /*
         * Allow cancel requests to proceed even if the initialize thread
         * has stopped.
         */
        if (vd->vdev_initialize_thread == NULL &&
            tgt_state != VDEV_INITIALIZE_CANCELED) {
                return;
        }

        vdev_initialize_change_state(vd, tgt_state);
        vd->vdev_initialize_exit_wanted = B_TRUE;

        if (vd_list == NULL) {
                vdev_initialize_stop_wait_impl(vd);
        } else {
                ASSERT(MUTEX_HELD(&spa_namespace_lock));
                list_insert_tail(vd_list, vd);
        }
}
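
/*
 * Recursively descend the vdev tree, requesting that every concrete leaf
 * vdev stop initializing with the given target state.
 */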
static void
vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
        if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
                mutex_enter(&vd->vdev_initialize_lock);
                vdev_initialize_stop(vd, tgt_state, vd_list);
                mutex_exit(&vd->vdev_initialize_lock);
                return;
        }

        for (uint64_t i = 0; i < vd->vdev_children; i++) {
                vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
                    vd_list);
        }
}

/*
 * Convenience function to stop initializing a vdev tree and set all
 * initialize thread pointers to NULL.
 */
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
        spa_t *spa = vd->vdev_spa;
        list_t vd_list;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        list_create(&vd_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_initialize_node));

        vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
        vdev_initialize_stop_wait(spa, &vd_list);

        if (vd->vdev_spa->spa_sync_on) {
                /* Make sure that our state has been synced to disk */
                txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
        }

        list_destroy(&vd_list);
}
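
/*
 * Recursively walk the vdev tree and, for each leaf with a leaf ZAP, read
 * back the persisted initialize state and action time. Progress is reloaded
 * for reporting on suspended or offline vdevs; initialization is resumed for
 * vdevs recorded as active that are writeable and not being removed.
 */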
void
vdev_initialize_restart(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

        if (vd->vdev_leaf_zap != 0) {
                mutex_enter(&vd->vdev_initialize_lock);
                uint64_t initialize_state = VDEV_INITIALIZE_NONE;
                int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
                    sizeof (initialize_state), 1, &initialize_state);
                ASSERT(err == 0 || err == ENOENT);
                vd->vdev_initialize_state = initialize_state;

                uint64_t timestamp = 0;
                err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
                    sizeof (timestamp), 1, &timestamp);
                ASSERT(err == 0 || err == ENOENT);
                vd->vdev_initialize_action_time = (time_t)timestamp;

                if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
                    vd->vdev_offline) {
                        /* load progress for reporting, but don't resume */
                        VERIFY0(vdev_initialize_load(vd));
                } else if (vd->vdev_initialize_state ==
                    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
                    !vd->vdev_top->vdev_removing &&
                    vd->vdev_initialize_thread == NULL) {
                        vdev_initialize(vd);
                }

                mutex_exit(&vd->vdev_initialize_lock);
        }

        for (uint64_t i = 0; i < vd->vdev_children; i++) {
                vdev_initialize_restart(vd->vdev_child[i]);
        }
}