/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
 */

#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/txg.h>
#include <sys/vdev_impl.h>
#include <sys/refcount.h>
#include <sys/metaslab_impl.h>
#include <sys/dsl_synctask.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>

/*
 * Value that is written to disk during initialization.
 */
uint64_t zfs_initialize_value = 0xdeadbeefdeadbeefULL;

/* maximum number of I/Os outstanding per leaf vdev */
int zfs_initialize_limit = 1;

/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */
uint64_t zfs_initialize_chunk_size = 1024 * 1024;

static boolean_t
vdev_initialize_should_stop(vdev_t *vd)
{
        return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) ||
            vd->vdev_detached || vd->vdev_top->vdev_removing);
}

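/*
 * Sync task that records the current initializing state in the leaf vdev's
 * ZAP: the last initialized offset for the syncing txg, the time the current
 * action started, and the initializing state itself.
 */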
static void
vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx)
{
        /*
         * We pass in the guid instead of the vdev_t since the vdev may
         * have been freed prior to the sync task being processed. This
         * happens when a vdev is detached as we call spa_config_vdev_exit(),
         * stop the initializing thread, schedule the sync task, and free
         * the vdev. Later when the scheduled sync task is invoked, it would
         * find that the vdev has been freed.
         */
        uint64_t guid = *(uint64_t *)arg;
        uint64_t txg = dmu_tx_get_txg(tx);
        kmem_free(arg, sizeof (uint64_t));

        vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE);
        if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd))
                return;

        uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK];
        vd->vdev_initialize_offset[txg & TXG_MASK] = 0;

        VERIFY(vd->vdev_leaf_zap != 0);

        objset_t *mos = vd->vdev_spa->spa_meta_objset;

        if (last_offset > 0) {
                vd->vdev_initialize_last_offset = last_offset;
                VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
                    VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
                    sizeof (last_offset), 1, &last_offset, tx));
        }
        if (vd->vdev_initialize_action_time > 0) {
                uint64_t val = (uint64_t)vd->vdev_initialize_action_time;
                VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
                    VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val),
                    1, &val, tx));
        }

        uint64_t initialize_state = vd->vdev_initialize_state;
        VERIFY0(zap_update(mos, vd->vdev_leaf_zap,
            VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1,
            &initialize_state, tx));
}

static void
vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state)
{
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
        spa_t *spa = vd->vdev_spa;

        if (new_state == vd->vdev_initialize_state)
                return;

        /*
         * Copy the vd's guid; it will be freed by the sync task.
         */
        uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
        *guid = vd->vdev_guid;

        /*
         * If we're suspending, then preserve the original start time.
         */
        if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) {
                vd->vdev_initialize_action_time = gethrestime_sec();
        }
        vd->vdev_initialize_state = new_state;

        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync,
            guid, 2, ZFS_SPACE_CHECK_NONE, tx);

        switch (new_state) {
        case VDEV_INITIALIZE_ACTIVE:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s activated", vd->vdev_path);
                break;
        case VDEV_INITIALIZE_SUSPENDED:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s suspended", vd->vdev_path);
                break;
        case VDEV_INITIALIZE_CANCELED:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s canceled", vd->vdev_path);
                break;
        case VDEV_INITIALIZE_COMPLETE:
                spa_history_log_internal(spa, "initialize", tx,
                    "vdev=%s complete", vd->vdev_path);
                break;
        default:
                panic("invalid state %llu", (unsigned long long)new_state);
        }

        dmu_tx_commit(tx);
}

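/*
 * Completion callback for the initializing writes issued by
 * vdev_initialize_write(). Accounts for the bytes written (or the error),
 * decrements the inflight count, and drops the SCL_STATE_ALL hold taken
 * when the zio was issued.
 */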
static void
vdev_initialize_cb(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        mutex_enter(&vd->vdev_initialize_io_lock);
        if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
                /*
                 * The I/O failed because the vdev was unavailable; roll the
                 * last offset back. (This works because spa_sync waits on
                 * spa_txg_zio before it runs sync tasks.)
                 */
                uint64_t *off =
                    &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK];
                *off = MIN(*off, zio->io_offset);
        } else {
                /*
                 * Since initializing is best-effort, we ignore I/O errors and
                 * rely on vdev_probe to determine if the errors are more
                 * critical.
                 */
                if (zio->io_error != 0)
                        vd->vdev_stat.vs_initialize_errors++;

                vd->vdev_initialize_bytes_done += zio->io_orig_size;
        }
        ASSERT3U(vd->vdev_initialize_inflight, >, 0);
        vd->vdev_initialize_inflight--;
        cv_broadcast(&vd->vdev_initialize_io_cv);
        mutex_exit(&vd->vdev_initialize_io_lock);

        spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}

/* Takes care of physical writing and limiting # of concurrent ZIOs. */
static int
vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data)
{
        spa_t *spa = vd->vdev_spa;

        /* Limit inflight initializing I/Os */
        mutex_enter(&vd->vdev_initialize_io_lock);
        while (vd->vdev_initialize_inflight >= zfs_initialize_limit) {
                cv_wait(&vd->vdev_initialize_io_cv,
                    &vd->vdev_initialize_io_lock);
        }
        vd->vdev_initialize_inflight++;
        mutex_exit(&vd->vdev_initialize_io_lock);

        dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
        VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
        uint64_t txg = dmu_tx_get_txg(tx);

        spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
        mutex_enter(&vd->vdev_initialize_lock);

        if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) {
                uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
                *guid = vd->vdev_guid;

                /* This is the first write of this txg. */
                dsl_sync_task_nowait(spa_get_dsl(spa),
                    vdev_initialize_zap_update_sync, guid, 2,
                    ZFS_SPACE_CHECK_RESERVED, tx);
        }

        /*
         * We know the vdev struct will still be around since all
         * consumers of vdev_free must stop the initialization first.
         */
        if (vdev_initialize_should_stop(vd)) {
                mutex_enter(&vd->vdev_initialize_io_lock);
                ASSERT3U(vd->vdev_initialize_inflight, >, 0);
                vd->vdev_initialize_inflight--;
                mutex_exit(&vd->vdev_initialize_io_lock);
                spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
                mutex_exit(&vd->vdev_initialize_lock);
                dmu_tx_commit(tx);
                return (SET_ERROR(EINTR));
        }
        mutex_exit(&vd->vdev_initialize_lock);

        vd->vdev_initialize_offset[txg & TXG_MASK] = start + size;
        zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start,
            size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL,
            ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE));
        /* vdev_initialize_cb releases SCL_STATE_ALL */

        dmu_tx_commit(tx);

        return (0);
}

/*
 * Callback to fill each ABD chunk with zfs_initialize_value. len must be
 * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD
 * allocation will guarantee these for us.
 */
/* ARGSUSED */
static int
vdev_initialize_block_fill(void *buf, size_t len, void *unused)
{
        ASSERT0(len % sizeof (uint64_t));
        for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) {
                *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value;
        }
        return (0);
}

static abd_t *
vdev_initialize_block_alloc(void)
{
        /* Allocate ABD for filler data */
        abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE);

        ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t));
        (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size,
            vdev_initialize_block_fill, NULL);

        return (data);
}

static void
vdev_initialize_block_free(abd_t *data)
{
        abd_free(data);
}

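/*
 * Issue initializing writes for every segment in vdev_initialize_tree,
 * splitting each segment into chunks of at most zfs_initialize_chunk_size.
 * For example, with the default 1 MiB chunk size a 2.5 MiB segment is
 * issued as three writes of 1 MiB, 1 MiB, and 0.5 MiB.
 */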
static int
vdev_initialize_ranges(vdev_t *vd, abd_t *data)
{
        avl_tree_t *rt = &vd->vdev_initialize_tree->rt_root;

        for (range_seg_t *rs = avl_first(rt); rs != NULL;
            rs = AVL_NEXT(rt, rs)) {
                uint64_t size = rs->rs_end - rs->rs_start;

                /* Split range into legally-sized physical chunks */
                uint64_t writes_required =
                    ((size - 1) / zfs_initialize_chunk_size) + 1;

                for (uint64_t w = 0; w < writes_required; w++) {
                        int error;

                        error = vdev_initialize_write(vd,
                            VDEV_LABEL_START_SIZE + rs->rs_start +
                            (w * zfs_initialize_chunk_size),
                            MIN(size - (w * zfs_initialize_chunk_size),
                            zfs_initialize_chunk_size), data);
                        if (error != 0)
                                return (error);
                }
        }
        return (0);
}

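/*
 * Estimate how much of this vdev has been initialized. Free space in
 * metaslabs that lie entirely below vdev_initialize_last_offset counts as
 * done; metaslabs entirely above it only add to the estimate; a metaslab
 * straddling the offset is loaded and its free segments are walked for a
 * more precise count.
 */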
static void
vdev_initialize_calculate_progress(vdev_t *vd)
{
        ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
            spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
        ASSERT(vd->vdev_leaf_zap != 0);

        vd->vdev_initialize_bytes_est = 0;
        vd->vdev_initialize_bytes_done = 0;

        for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
                metaslab_t *msp = vd->vdev_top->vdev_ms[i];
                mutex_enter(&msp->ms_lock);

                uint64_t ms_free = msp->ms_size -
                    metaslab_allocated_space(msp);

                if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
                        ms_free /= vd->vdev_top->vdev_children;

                /*
                 * Convert the metaslab range to a physical range
                 * on our vdev. We use this to determine if we are
                 * in the middle of this metaslab range.
                 */
                range_seg_t logical_rs, physical_rs;
                logical_rs.rs_start = msp->ms_start;
                logical_rs.rs_end = msp->ms_start + msp->ms_size;
                vdev_xlate(vd, &logical_rs, &physical_rs);

                if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) {
                        vd->vdev_initialize_bytes_est += ms_free;
                        mutex_exit(&msp->ms_lock);
                        continue;
                } else if (vd->vdev_initialize_last_offset >
                    physical_rs.rs_end) {
                        vd->vdev_initialize_bytes_done += ms_free;
                        vd->vdev_initialize_bytes_est += ms_free;
                        mutex_exit(&msp->ms_lock);
                        continue;
                }

                /*
                 * If we get here, we're in the middle of initializing this
                 * metaslab. Load it and walk the free tree for more accurate
                 * progress estimation.
                 */
                VERIFY0(metaslab_load(msp));

                for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
                    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
                        logical_rs.rs_start = rs->rs_start;
                        logical_rs.rs_end = rs->rs_end;
                        vdev_xlate(vd, &logical_rs, &physical_rs);

                        uint64_t size = physical_rs.rs_end -
                            physical_rs.rs_start;
                        vd->vdev_initialize_bytes_est += size;
                        if (vd->vdev_initialize_last_offset >
                            physical_rs.rs_end) {
                                vd->vdev_initialize_bytes_done += size;
                        } else if (vd->vdev_initialize_last_offset >
                            physical_rs.rs_start &&
                            vd->vdev_initialize_last_offset <
                            physical_rs.rs_end) {
                                vd->vdev_initialize_bytes_done +=
                                    vd->vdev_initialize_last_offset -
                                    physical_rs.rs_start;
                        }
                }
                mutex_exit(&msp->ms_lock);
        }
}

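/*
 * Load the last initialized offset from the leaf vdev ZAP when an
 * initialization is active or suspended, then recompute the progress
 * counters.
 */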
static int
vdev_initialize_load(vdev_t *vd)
{
        int err = 0;
        ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
            spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
        ASSERT(vd->vdev_leaf_zap != 0);

        if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE ||
            vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) {
                err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET,
                    sizeof (vd->vdev_initialize_last_offset), 1,
                    &vd->vdev_initialize_last_offset);
                if (err == ENOENT) {
                        vd->vdev_initialize_last_offset = 0;
                        err = 0;
                }
        }

        vdev_initialize_calculate_progress(vd);
        return (err);
}

/*
 * Convert the logical range into a physical range and add it to our
 * avl tree.
 */
void
vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size)
{
        vdev_t *vd = arg;
        range_seg_t logical_rs, physical_rs;
        logical_rs.rs_start = start;
        logical_rs.rs_end = start + size;

        ASSERT(vd->vdev_ops->vdev_op_leaf);
        vdev_xlate(vd, &logical_rs, &physical_rs);

        IMPLY(vd->vdev_top == vd,
            logical_rs.rs_start == physical_rs.rs_start);
        IMPLY(vd->vdev_top == vd,
            logical_rs.rs_end == physical_rs.rs_end);

        /* Only add segments that we have not visited yet */
        if (physical_rs.rs_end <= vd->vdev_initialize_last_offset)
                return;

        /* Pick up where we left off mid-range. */
        if (vd->vdev_initialize_last_offset > physical_rs.rs_start) {
                zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to "
                    "(%llu, %llu)", vd->vdev_path,
                    (u_longlong_t)physical_rs.rs_start,
                    (u_longlong_t)physical_rs.rs_end,
                    (u_longlong_t)vd->vdev_initialize_last_offset,
                    (u_longlong_t)physical_rs.rs_end);
                ASSERT3U(physical_rs.rs_end, >,
                    vd->vdev_initialize_last_offset);
                physical_rs.rs_start = vd->vdev_initialize_last_offset;
        }
        ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);

        /*
         * With raidz, it's possible that the logical range does not live on
         * this leaf vdev. We only add the physical range to this vdev's tree
         * if it has a length greater than 0.
         */
        if (physical_rs.rs_end > physical_rs.rs_start) {
                range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start,
                    physical_rs.rs_end - physical_rs.rs_start);
        } else {
                ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
        }
}

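/*
 * Body of the initializing thread. For each metaslab of the top-level vdev,
 * gather the allocatable (free) ranges that map to this leaf vdev into
 * vdev_initialize_tree and overwrite them with the initialization pattern,
 * stopping early on error or if the vdev is detached, removed, or asked to
 * exit.
 */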
static void
vdev_initialize_thread(void *arg)
{
        vdev_t *vd = arg;
        spa_t *spa = vd->vdev_spa;
        int error = 0;
        uint64_t ms_count = 0;

        ASSERT(vdev_is_concrete(vd));
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

        vd->vdev_initialize_last_offset = 0;
        VERIFY0(vdev_initialize_load(vd));

        abd_t *deadbeef = vdev_initialize_block_alloc();

        vd->vdev_initialize_tree = range_tree_create(NULL, NULL);

        for (uint64_t i = 0; !vd->vdev_detached &&
            i < vd->vdev_top->vdev_ms_count; i++) {
                metaslab_t *msp = vd->vdev_top->vdev_ms[i];
                boolean_t unload_when_done = B_FALSE;

                /*
                 * If we've expanded the top-level vdev or it's our
                 * first pass, calculate our progress.
                 */
                if (vd->vdev_top->vdev_ms_count != ms_count) {
                        vdev_initialize_calculate_progress(vd);
                        ms_count = vd->vdev_top->vdev_ms_count;
                }

                spa_config_exit(spa, SCL_CONFIG, FTAG);
                metaslab_disable(msp);
                mutex_enter(&msp->ms_lock);
                if (!msp->ms_loaded && !msp->ms_loading)
                        unload_when_done = B_TRUE;
                VERIFY0(metaslab_load(msp));

                range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
                    vd);
                mutex_exit(&msp->ms_lock);

                error = vdev_initialize_ranges(vd, deadbeef);
                metaslab_enable(msp, B_TRUE, unload_when_done);
                spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

                range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);
                if (error != 0)
                        break;
        }

        spa_config_exit(spa, SCL_CONFIG, FTAG);
        mutex_enter(&vd->vdev_initialize_io_lock);
        while (vd->vdev_initialize_inflight > 0) {
                cv_wait(&vd->vdev_initialize_io_cv,
                    &vd->vdev_initialize_io_lock);
        }
        mutex_exit(&vd->vdev_initialize_io_lock);

        range_tree_destroy(vd->vdev_initialize_tree);
        vdev_initialize_block_free(deadbeef);
        vd->vdev_initialize_tree = NULL;

        mutex_enter(&vd->vdev_initialize_lock);
        if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) {
                vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE);
        }
        ASSERT(vd->vdev_initialize_thread != NULL ||
            vd->vdev_initialize_inflight == 0);

        /*
         * Drop the vdev_initialize_lock while we sync out the
         * txg since it's possible that a device might be trying to
         * come online and must check to see if it needs to restart an
         * initialization. That thread will be holding the spa_config_lock
         * which would prevent the txg_wait_synced from completing.
         */
        mutex_exit(&vd->vdev_initialize_lock);
        txg_wait_synced(spa_get_dsl(spa), 0);
        mutex_enter(&vd->vdev_initialize_lock);

        vd->vdev_initialize_thread = NULL;
        cv_broadcast(&vd->vdev_initialize_cv);
        mutex_exit(&vd->vdev_initialize_lock);
}

/*
 * Initiates initialization of a device. Caller must hold
 * vdev_initialize_lock. Device must be a leaf and not already be
 * initializing.
 */
void
vdev_initialize(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
        ASSERT(vd->vdev_ops->vdev_op_leaf);
        ASSERT(vdev_is_concrete(vd));
        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
        ASSERT(!vd->vdev_detached);
        ASSERT(!vd->vdev_initialize_exit_wanted);
        ASSERT(!vd->vdev_top->vdev_removing);

        vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE);
        vd->vdev_initialize_thread = thread_create(NULL, 0,
            vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
}

/*
 * Wait for the initialize thread to be terminated (cancelled or stopped).
 */
static void
vdev_initialize_stop_wait_impl(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));

        while (vd->vdev_initialize_thread != NULL)
                cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock);

        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
        vd->vdev_initialize_exit_wanted = B_FALSE;
}

/*
 * Wait for vdev initialize threads which were listed to cleanly exit.
 */
void
vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list)
{
        vdev_t *vd;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        while ((vd = list_remove_head(vd_list)) != NULL) {
                mutex_enter(&vd->vdev_initialize_lock);
                vdev_initialize_stop_wait_impl(vd);
                mutex_exit(&vd->vdev_initialize_lock);
        }
}

/*
 * Stop initializing a device, with the resultant initializing state being
 * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when
 * a list_t is provided the stopping vdev is inserted into the list. Callers
 * are then required to call vdev_initialize_stop_wait() to block for all the
 * initialization threads to exit. The caller must hold vdev_initialize_lock
 * and must not be writing to the spa config, as the initializing thread may
 * try to enter the config as a reader before exiting.
 */
void
vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
        ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER));
        ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock));
        ASSERT(vd->vdev_ops->vdev_op_leaf);
        ASSERT(vdev_is_concrete(vd));

        /*
         * Allow cancel requests to proceed even if the initialize thread
         * has stopped.
         */
        if (vd->vdev_initialize_thread == NULL &&
            tgt_state != VDEV_INITIALIZE_CANCELED) {
                return;
        }

        vdev_initialize_change_state(vd, tgt_state);
        vd->vdev_initialize_exit_wanted = B_TRUE;

        if (vd_list == NULL) {
                vdev_initialize_stop_wait_impl(vd);
        } else {
                ASSERT(MUTEX_HELD(&spa_namespace_lock));
                list_insert_tail(vd_list, vd);
        }
}

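/*
 * Recursively descend the vdev tree and request that each concrete leaf
 * vdev stop initializing with the given target state.
 */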
static void
vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state,
    list_t *vd_list)
{
        if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) {
                mutex_enter(&vd->vdev_initialize_lock);
                vdev_initialize_stop(vd, tgt_state, vd_list);
                mutex_exit(&vd->vdev_initialize_lock);
                return;
        }

        for (uint64_t i = 0; i < vd->vdev_children; i++) {
                vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state,
                    vd_list);
        }
}

/*
 * Convenience function to stop initializing of a vdev tree and set all
 * initialize thread pointers to NULL.
 */
void
vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state)
{
        spa_t *spa = vd->vdev_spa;
        list_t vd_list;

        ASSERT(MUTEX_HELD(&spa_namespace_lock));

        list_create(&vd_list, sizeof (vdev_t),
            offsetof(vdev_t, vdev_initialize_node));

        vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list);
        vdev_initialize_stop_wait(spa, &vd_list);

        if (vd->vdev_spa->spa_sync_on) {
                /* Make sure that our state has been synced to disk */
                txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
        }

        list_destroy(&vd_list);
}

/*
 * Restore the initializing state recorded in the leaf vdev ZAPs for vd and
 * its children, and restart the initializing thread for any leaf that was
 * previously active.
 */
void
vdev_initialize_restart(vdev_t *vd)
{
        ASSERT(MUTEX_HELD(&spa_namespace_lock));
        ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));

        if (vd->vdev_leaf_zap != 0) {
                mutex_enter(&vd->vdev_initialize_lock);
                uint64_t initialize_state = VDEV_INITIALIZE_NONE;
                int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE,
                    sizeof (initialize_state), 1, &initialize_state);
                ASSERT(err == 0 || err == ENOENT);
                vd->vdev_initialize_state = initialize_state;

                uint64_t timestamp = 0;
                err = zap_lookup(vd->vdev_spa->spa_meta_objset,
                    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME,
                    sizeof (timestamp), 1, &timestamp);
                ASSERT(err == 0 || err == ENOENT);
                vd->vdev_initialize_action_time = (time_t)timestamp;

                if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED ||
                    vd->vdev_offline) {
                        /* load progress for reporting, but don't resume */
                        VERIFY0(vdev_initialize_load(vd));
                } else if (vd->vdev_initialize_state ==
                    VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) &&
                    !vd->vdev_top->vdev_removing &&
                    vd->vdev_initialize_thread == NULL) {
                        vdev_initialize(vd);
                }

                mutex_exit(&vd->vdev_initialize_lock);
        }

        for (uint64_t i = 0; i < vd->vdev_children; i++) {
                vdev_initialize_restart(vd->vdev_child[i]);
        }
}