1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2011, 2021 by Delphix. All rights reserved. 26 * Copyright 2017 Nexenta Systems, Inc. 27 * Copyright (c) 2014 Integros [integros.com] 28 * Copyright 2016 Toomas Soome <tsoome@me.com> 29 * Copyright 2017 Joyent, Inc. 30 * Copyright (c) 2017, Intel Corporation. 31 * Copyright (c) 2019, Datto Inc. All rights reserved. 32 * Copyright (c) 2021, 2025, Klara, Inc. 33 * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. 34 * Copyright (c) 2026, Seagate Technology, LLC. 
35 */ 36 37 #include <sys/zfs_context.h> 38 #include <sys/fm/fs/zfs.h> 39 #include <sys/spa.h> 40 #include <sys/spa_impl.h> 41 #include <sys/bpobj.h> 42 #include <sys/dmu.h> 43 #include <sys/dmu_tx.h> 44 #include <sys/dsl_dir.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/vdev_rebuild.h> 47 #include <sys/vdev_draid.h> 48 #include <sys/uberblock_impl.h> 49 #include <sys/metaslab.h> 50 #include <sys/metaslab_impl.h> 51 #include <sys/space_map.h> 52 #include <sys/space_reftree.h> 53 #include <sys/zio.h> 54 #include <sys/zap.h> 55 #include <sys/fs/zfs.h> 56 #include <sys/arc.h> 57 #include <sys/zil.h> 58 #include <sys/dsl_scan.h> 59 #include <sys/vdev_raidz.h> 60 #include <sys/abd.h> 61 #include <sys/vdev_initialize.h> 62 #include <sys/vdev_trim.h> 63 #include <sys/vdev_raidz.h> 64 #include <sys/zvol.h> 65 #include <sys/zfs_ratelimit.h> 66 #include "zfs_prop.h" 67 68 /* 69 * One metaslab from each (normal-class) vdev is used by the ZIL. These are 70 * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are 71 * part of the spa_embedded_log_class. The metaslab with the most free space 72 * in each vdev is selected for this purpose when the pool is opened (or a 73 * vdev is added). See vdev_metaslab_init(). 74 * 75 * Log blocks can be allocated from the following locations. Each one is tried 76 * in order until the allocation succeeds: 77 * 1. dedicated log vdevs, aka "slog" (spa_log_class) 78 * 2. embedded slog metaslabs (spa_embedded_log_class) 79 * 3. other metaslabs in normal vdevs (spa_normal_class) 80 * 81 * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer 82 * than this number of metaslabs in the vdev. This ensures that we don't set 83 * aside an unreasonable amount of space for the ZIL. If set to less than 84 * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced 85 * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab. 
 */
static uint_t zfs_embedded_slog_min_ms = 64;

/* default target for number of metaslabs per top-level vdev */
static uint_t zfs_vdev_default_ms_count = 200;

/* minimum number of metaslabs per top-level vdev */
static uint_t zfs_vdev_min_ms_count = 16;

/* practical upper limit of total metaslabs per top-level vdev */
static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;

/* lower limit for metaslab size (512M) */
static uint_t zfs_vdev_default_ms_shift = 29;

/* upper limit for metaslab size (16G) */
static uint_t zfs_vdev_max_ms_shift = 34;

/*
 * NOTE(review): presumably disables vdev validation when set to a non-zero
 * value; its consumers are not visible here -- confirm before relying on it.
 */
int vdev_validate_skip = B_FALSE;

/*
 * Since the DTL space map of a vdev is not expected to have a lot of
 * entries, we default its block size to 4K.
 */
int zfs_vdev_dtl_sm_blksz = (1 << 12);

/*
 * Rate limit slow IO (delay) events to this many per second.
 */
static unsigned int zfs_slow_io_events_per_second = 20;

/*
 * Rate limit deadman "hung IO" events to this many per second.
 */
static unsigned int zfs_deadman_events_per_second = 1;

/*
 * Rate limit direct write IO verify failures to this many per second.
 */
static unsigned int zfs_dio_write_verify_events_per_second = 20;

/*
 * Rate limit checksum events after this many checksum errors per second.
 */
static unsigned int zfs_checksum_events_per_second = 20;

/*
 * Ignore errors during scrub/resilver. Allows working around a resilver
 * upon import when there are pool errors.
 */
static int zfs_scan_ignore_errors = 0;

/*
 * vdev-wide space maps that have lots of entries written to them at
 * the end of each transaction can benefit from a higher I/O bandwidth
 * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
 */
int zfs_vdev_standard_sm_blksz = (1 << 17);

/*
 * Tunable parameter for debugging or performance analysis.
Setting this 147 * will cause pool corruption on power loss if a volatile out-of-order 148 * write cache is enabled. 149 */ 150 int zfs_nocacheflush = 0; 151 152 /* 153 * Maximum and minimum ashift values that can be automatically set based on 154 * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX 155 * is higher than the maximum value, it is intentionally limited here to not 156 * excessively impact pool space efficiency. Higher ashift values may still 157 * be forced by vdev logical ashift or by user via ashift property, but won't 158 * be set automatically as a performance optimization. 159 */ 160 uint_t zfs_vdev_max_auto_ashift = 14; 161 uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN; 162 163 /* 164 * VDEV checksum verification for Direct I/O writes. This is neccessary for 165 * Linux, because anonymous pages can not be placed under write protection 166 * during Direct I/O writes. 167 */ 168 #if !defined(__FreeBSD__) 169 uint_t zfs_vdev_direct_write_verify = 1; 170 #else 171 uint_t zfs_vdev_direct_write_verify = 0; 172 #endif 173 174 void 175 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...) 
176 { 177 va_list adx; 178 char buf[256]; 179 180 va_start(adx, fmt); 181 (void) vsnprintf(buf, sizeof (buf), fmt, adx); 182 va_end(adx); 183 184 if (vd->vdev_path != NULL) { 185 zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type, 186 vd->vdev_path, buf); 187 } else { 188 zfs_dbgmsg("%s-%llu vdev (guid %llu): %s", 189 vd->vdev_ops->vdev_op_type, 190 (u_longlong_t)vd->vdev_id, 191 (u_longlong_t)vd->vdev_guid, buf); 192 } 193 } 194 195 void 196 vdev_dbgmsg_print_tree(vdev_t *vd, int indent) 197 { 198 char state[20]; 199 200 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) { 201 zfs_dbgmsg("%*svdev %llu: %s", indent, "", 202 (u_longlong_t)vd->vdev_id, 203 vd->vdev_ops->vdev_op_type); 204 return; 205 } 206 207 switch (vd->vdev_state) { 208 case VDEV_STATE_UNKNOWN: 209 (void) snprintf(state, sizeof (state), "unknown"); 210 break; 211 case VDEV_STATE_CLOSED: 212 (void) snprintf(state, sizeof (state), "closed"); 213 break; 214 case VDEV_STATE_OFFLINE: 215 (void) snprintf(state, sizeof (state), "offline"); 216 break; 217 case VDEV_STATE_REMOVED: 218 (void) snprintf(state, sizeof (state), "removed"); 219 break; 220 case VDEV_STATE_CANT_OPEN: 221 (void) snprintf(state, sizeof (state), "can't open"); 222 break; 223 case VDEV_STATE_FAULTED: 224 (void) snprintf(state, sizeof (state), "faulted"); 225 break; 226 case VDEV_STATE_DEGRADED: 227 (void) snprintf(state, sizeof (state), "degraded"); 228 break; 229 case VDEV_STATE_HEALTHY: 230 (void) snprintf(state, sizeof (state), "healthy"); 231 break; 232 default: 233 (void) snprintf(state, sizeof (state), "<state %u>", 234 (uint_t)vd->vdev_state); 235 } 236 237 zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent, 238 "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type, 239 vd->vdev_islog ? " (log)" : "", 240 (u_longlong_t)vd->vdev_guid, 241 vd->vdev_path ? 
vd->vdev_path : "N/A", state); 242 243 for (uint64_t i = 0; i < vd->vdev_children; i++) 244 vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2); 245 } 246 247 char * 248 vdev_rt_name(vdev_t *vd, const char *name) 249 { 250 return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}", 251 spa_name(vd->vdev_spa), 252 (u_longlong_t)vd->vdev_guid, 253 name)); 254 } 255 256 static char * 257 vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type) 258 { 259 return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}", 260 spa_name(vd->vdev_spa), 261 (u_longlong_t)vd->vdev_guid, 262 name, 263 dtl_type)); 264 } 265 266 /* 267 * Virtual device management. 268 */ 269 270 static vdev_ops_t *const vdev_ops_table[] = { 271 &vdev_root_ops, 272 &vdev_raidz_ops, 273 &vdev_draid_ops, 274 &vdev_draid_spare_ops, 275 &vdev_mirror_ops, 276 &vdev_replacing_ops, 277 &vdev_spare_ops, 278 &vdev_disk_ops, 279 &vdev_file_ops, 280 &vdev_missing_ops, 281 &vdev_hole_ops, 282 &vdev_indirect_ops, 283 NULL 284 }; 285 286 /* 287 * Given a vdev type, return the appropriate ops vector. 288 */ 289 static vdev_ops_t * 290 vdev_getops(const char *type) 291 { 292 vdev_ops_t *ops, *const *opspp; 293 294 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 295 if (strcmp(ops->vdev_op_type, type) == 0) 296 break; 297 298 return (ops); 299 } 300 301 /* 302 * Given a vdev and a metaslab class, find which metaslab group we're 303 * interested in. All vdevs may belong to two different metaslab classes. 304 * Dedicated slog devices use only the primary metaslab group, rather than a 305 * separate log group. For embedded slogs, vdev_log_mg will be non-NULL and 306 * will point to a metaslab group of either embedded_log_class (for normal 307 * vdevs) or special_embedded_log_class (for special vdevs). 
308 */ 309 metaslab_group_t * 310 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) 311 { 312 if ((mc == spa_embedded_log_class(vd->vdev_spa) || 313 mc == spa_special_embedded_log_class(vd->vdev_spa)) && 314 vd->vdev_log_mg != NULL) 315 return (vd->vdev_log_mg); 316 else 317 return (vd->vdev_mg); 318 } 319 320 void 321 vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, 322 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) 323 { 324 (void) vd, (void) remain_rs; 325 326 physical_rs->rs_start = logical_rs->rs_start; 327 physical_rs->rs_end = logical_rs->rs_end; 328 } 329 330 /* 331 * Derive the enumerated allocation bias from string input. 332 * String origin is either the per-vdev zap or zpool(8). 333 */ 334 static vdev_alloc_bias_t 335 vdev_derive_alloc_bias(const char *bias) 336 { 337 vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; 338 339 if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0) 340 alloc_bias = VDEV_BIAS_LOG; 341 else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) 342 alloc_bias = VDEV_BIAS_SPECIAL; 343 else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) 344 alloc_bias = VDEV_BIAS_DEDUP; 345 346 return (alloc_bias); 347 } 348 349 uint64_t 350 vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg) 351 { 352 ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift)); 353 uint64_t csize, psize = asize; 354 for (int c = 0; c < vd->vdev_children; c++) { 355 csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg); 356 psize = MIN(psize, csize); 357 } 358 359 return (psize); 360 } 361 362 /* 363 * Default asize function: return the MAX of psize with the asize of 364 * all children. This is what's used by anything other than RAID-Z. 
365 */ 366 uint64_t 367 vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg) 368 { 369 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 370 uint64_t csize; 371 372 for (int c = 0; c < vd->vdev_children; c++) { 373 csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg); 374 asize = MAX(asize, csize); 375 } 376 377 return (asize); 378 } 379 380 uint64_t 381 vdev_default_min_asize(vdev_t *vd) 382 { 383 return (vd->vdev_min_asize); 384 } 385 386 /* 387 * Get the minimum allocatable size. We define the allocatable size as 388 * the vdev's asize rounded to the nearest metaslab. This allows us to 389 * replace or attach devices which don't have the same physical size but 390 * can still satisfy the same number of allocations. 391 */ 392 uint64_t 393 vdev_get_min_asize(vdev_t *vd) 394 { 395 vdev_t *pvd = vd->vdev_parent; 396 397 /* 398 * If our parent is NULL (inactive spare or cache) or is the root, 399 * just return our own asize. 400 */ 401 if (pvd == NULL) 402 return (vd->vdev_asize); 403 404 /* 405 * The top-level vdev just returns the allocatable size rounded 406 * to the nearest metaslab. 407 */ 408 if (vd == vd->vdev_top) 409 return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift, 410 uint64_t)); 411 412 return (pvd->vdev_ops->vdev_op_min_asize(pvd)); 413 } 414 415 void 416 vdev_set_min_asize(vdev_t *vd) 417 { 418 vd->vdev_min_asize = vdev_get_min_asize(vd); 419 420 for (int c = 0; c < vd->vdev_children; c++) 421 vdev_set_min_asize(vd->vdev_child[c]); 422 } 423 424 /* 425 * Get the minimal allocation size for the top-level vdev. 426 */ 427 uint64_t 428 vdev_get_min_alloc(vdev_t *vd) 429 { 430 uint64_t min_alloc = 1ULL << vd->vdev_ashift; 431 432 if (vd->vdev_ops->vdev_op_min_alloc != NULL) 433 min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd); 434 435 return (min_alloc); 436 } 437 438 /* 439 * Get the parity level for a top-level vdev. 
440 */ 441 uint64_t 442 vdev_get_nparity(vdev_t *vd) 443 { 444 uint64_t nparity = 0; 445 446 if (vd->vdev_ops->vdev_op_nparity != NULL) 447 nparity = vd->vdev_ops->vdev_op_nparity(vd); 448 449 return (nparity); 450 } 451 452 static int 453 vdev_prop_get_objid(vdev_t *vd, uint64_t *objid) 454 { 455 456 if (vd->vdev_root_zap != 0) { 457 *objid = vd->vdev_root_zap; 458 } else if (vd->vdev_top_zap != 0) { 459 *objid = vd->vdev_top_zap; 460 } else if (vd->vdev_leaf_zap != 0) { 461 *objid = vd->vdev_leaf_zap; 462 } else { 463 *objid = 0; 464 return (EINVAL); 465 } 466 467 return (0); 468 } 469 470 static int 471 vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) 472 { 473 spa_t *spa = vd->vdev_spa; 474 objset_t *mos = spa->spa_meta_objset; 475 uint64_t objid; 476 int err; 477 478 if (vdev_prop_get_objid(vd, &objid) != 0) { 479 /* No ZAP: property was never set, return the default. */ 480 *value = vdev_prop_default_numeric(prop); 481 return (ENOENT); 482 } 483 484 err = zap_lookup(mos, objid, vdev_prop_to_name(prop), 485 sizeof (uint64_t), 1, value); 486 if (err == ENOENT) 487 *value = vdev_prop_default_numeric(prop); 488 489 return (err); 490 } 491 492 static int 493 vdev_prop_get_bool(vdev_t *vd, vdev_prop_t prop, boolean_t *bvalue) 494 { 495 int err; 496 uint64_t ivalue; 497 498 err = vdev_prop_get_int(vd, prop, &ivalue); 499 *bvalue = ivalue != 0; 500 501 return (err); 502 } 503 504 /* 505 * Get the number of data disks for a top-level vdev. 
506 */ 507 uint64_t 508 vdev_get_ndisks(vdev_t *vd) 509 { 510 uint64_t ndisks = 1; 511 512 if (vd->vdev_ops->vdev_op_ndisks != NULL) 513 ndisks = vd->vdev_ops->vdev_op_ndisks(vd); 514 515 return (ndisks); 516 } 517 518 vdev_t * 519 vdev_lookup_top(spa_t *spa, uint64_t vdev) 520 { 521 vdev_t *rvd = spa->spa_root_vdev; 522 523 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 524 525 if (vdev < rvd->vdev_children) { 526 ASSERT(rvd->vdev_child[vdev] != NULL); 527 return (rvd->vdev_child[vdev]); 528 } 529 530 return (NULL); 531 } 532 533 vdev_t * 534 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 535 { 536 vdev_t *mvd; 537 538 if (vd->vdev_guid == guid) 539 return (vd); 540 541 for (int c = 0; c < vd->vdev_children; c++) 542 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 543 NULL) 544 return (mvd); 545 546 return (NULL); 547 } 548 549 static int 550 vdev_count_leaves_impl(vdev_t *vd) 551 { 552 int n = 0; 553 554 if (vd->vdev_ops->vdev_op_leaf) 555 return (1); 556 557 for (int c = 0; c < vd->vdev_children; c++) 558 n += vdev_count_leaves_impl(vd->vdev_child[c]); 559 560 return (n); 561 } 562 563 int 564 vdev_count_leaves(spa_t *spa) 565 { 566 int rc; 567 568 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 569 rc = vdev_count_leaves_impl(spa->spa_root_vdev); 570 spa_config_exit(spa, SCL_VDEV, FTAG); 571 572 return (rc); 573 } 574 575 void 576 vdev_add_child(vdev_t *pvd, vdev_t *cvd) 577 { 578 size_t oldsize, newsize; 579 uint64_t id = cvd->vdev_id; 580 vdev_t **newchild; 581 582 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 583 ASSERT0P(cvd->vdev_parent); 584 585 cvd->vdev_parent = pvd; 586 587 if (pvd == NULL) 588 return; 589 590 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 591 592 oldsize = pvd->vdev_children * sizeof (vdev_t *); 593 pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 594 newsize = pvd->vdev_children * sizeof (vdev_t *); 595 596 newchild = kmem_alloc(newsize, KM_SLEEP); 597 if 
(pvd->vdev_child != NULL) { 598 memcpy(newchild, pvd->vdev_child, oldsize); 599 kmem_free(pvd->vdev_child, oldsize); 600 } 601 602 pvd->vdev_child = newchild; 603 pvd->vdev_child[id] = cvd; 604 pvd->vdev_nonrot &= cvd->vdev_nonrot; 605 606 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 607 ASSERT0P(cvd->vdev_top->vdev_parent->vdev_parent); 608 609 /* 610 * Walk up all ancestors to update guid sum. 611 */ 612 for (; pvd != NULL; pvd = pvd->vdev_parent) 613 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 614 615 if (cvd->vdev_ops->vdev_op_leaf) { 616 list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); 617 cvd->vdev_spa->spa_leaf_list_gen++; 618 } 619 } 620 621 void 622 vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 623 { 624 int c; 625 uint_t id = cvd->vdev_id; 626 627 ASSERT(cvd->vdev_parent == pvd); 628 629 if (pvd == NULL) 630 return; 631 632 ASSERT(id < pvd->vdev_children); 633 ASSERT(pvd->vdev_child[id] == cvd); 634 635 pvd->vdev_child[id] = NULL; 636 cvd->vdev_parent = NULL; 637 638 for (c = 0; c < pvd->vdev_children; c++) 639 if (pvd->vdev_child[c]) 640 break; 641 642 if (c == pvd->vdev_children) { 643 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 644 pvd->vdev_child = NULL; 645 pvd->vdev_children = 0; 646 } 647 648 if (cvd->vdev_ops->vdev_op_leaf) { 649 spa_t *spa = cvd->vdev_spa; 650 list_remove(&spa->spa_leaf_list, cvd); 651 spa->spa_leaf_list_gen++; 652 } 653 654 /* 655 * Walk up all ancestors to update guid sum. 656 */ 657 for (; pvd != NULL; pvd = pvd->vdev_parent) 658 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 659 } 660 661 /* 662 * Remove any holes in the child array. 
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* Nothing to compact when there is no child array. */
	if (oldc == 0)
		return;

	/* First pass: count the non-NULL children. */
	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	if (newc > 0) {
		newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);

		/*
		 * Second pass: copy the survivors in order and renumber
		 * each one to its new index.
		 */
		for (int c = newc = 0; c < oldc; c++) {
			if ((cvd = pvd->vdev_child[c]) != NULL) {
				newchild[newc] = cvd;
				cvd->vdev_id = newc++;
			}
		}
	} else {
		/* Every slot was a hole: the parent ends up childless. */
		newchild = NULL;
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 *
 * The vdev is zero-allocated and given the supplied id, guid and ops.  A
 * guid of 0 asks for one to be generated, except for hole vdevs, which keep
 * guid 0.  If the spa has no root vdev yet, the new vdev becomes the root
 * (and must use vdev_root_ops) and a load guid is generated for the spa.
 * The vdev is returned in VDEV_STATE_CLOSED with its locks, condition
 * variables, list links, range trees, txg lists, event rate limits and
 * queue initialized; it is not linked to any parent here.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;
	vdev_indirect_config_t *vic;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
	vic = &vd->vdev_indirect_config;

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_load_guid();
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	/* A vdev with no children yet contributes only its own guid. */
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);
	vic->vic_prev_indirect_vdev = UINT64_MAX;

	rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
	mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
	vd->vdev_obsolete_segments = zfs_range_tree_create_flags(
	    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
	    ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments"));

	/*
	 * Initialize rate limit structs for events.  We rate limit ZIO delay
	 * and checksum events so that we don't overwhelm ZED with thousands
	 * of events when a disk is acting up.
	 */
	zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
	    1);
	zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
	    1);
	zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
	    &zfs_dio_write_verify_events_per_second, 1);
	zfs_ratelimit_init(&vd->vdev_checksum_rl,
	    &zfs_checksum_events_per_second, 1);

	/*
	 * Default Thresholds for tuning ZED
	 */
	vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
	vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);

	vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
	vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);

	vd->vdev_slow_io_events = vdev_prop_default_numeric(
	    VDEV_PROP_SLOW_IO_EVENTS);
	vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
	vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);

	vd->vdev_scheduler = vdev_prop_default_numeric(VDEV_PROP_SCHEDULER);

	list_link_init(&vd->vdev_config_dirty_node);
	list_link_init(&vd->vdev_state_dirty_node);
	list_link_init(&vd->vdev_initialize_node);
	list_link_init(&vd->vdev_leaf_node);
	list_link_init(&vd->vdev_trim_node);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);

	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
	cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);

	/* One range tree per DTL type, each named after its type index. */
	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = zfs_range_tree_create_flags(
		    NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
		    ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t));
	}

	txg_list_create(&vd->vdev_ms_list, spa,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list, spa,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
824 */ 825 int 826 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 827 int alloctype) 828 { 829 vdev_ops_t *ops; 830 const char *type; 831 uint64_t guid = 0, islog; 832 vdev_t *vd; 833 vdev_indirect_config_t *vic; 834 const char *tmp = NULL; 835 int rc; 836 vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; 837 boolean_t top_level = (parent && !parent->vdev_parent); 838 839 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 840 841 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 842 return (SET_ERROR(EINVAL)); 843 844 if ((ops = vdev_getops(type)) == NULL) 845 return (SET_ERROR(EINVAL)); 846 847 /* 848 * If this is a load, get the vdev guid from the nvlist. 849 * Otherwise, vdev_alloc_common() will generate one for us. 850 */ 851 if (alloctype == VDEV_ALLOC_LOAD) { 852 uint64_t label_id; 853 854 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 855 label_id != id) 856 return (SET_ERROR(EINVAL)); 857 858 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 859 return (SET_ERROR(EINVAL)); 860 } else if (alloctype == VDEV_ALLOC_SPARE) { 861 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 862 return (SET_ERROR(EINVAL)); 863 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 864 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 865 return (SET_ERROR(EINVAL)); 866 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { 867 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 868 return (SET_ERROR(EINVAL)); 869 } 870 871 /* 872 * The first allocated vdev must be of type 'root'. 873 */ 874 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 875 return (SET_ERROR(EINVAL)); 876 877 /* 878 * Determine whether we're a log vdev. 
879 */ 880 islog = 0; 881 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 882 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 883 return (SET_ERROR(ENOTSUP)); 884 885 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) 886 return (SET_ERROR(ENOTSUP)); 887 888 if (top_level && alloctype == VDEV_ALLOC_ADD) { 889 const char *bias; 890 891 /* 892 * If creating a top-level vdev, check for allocation 893 * classes input. 894 */ 895 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, 896 &bias) == 0) { 897 alloc_bias = vdev_derive_alloc_bias(bias); 898 899 /* spa_vdev_add() expects feature to be enabled */ 900 if (spa->spa_load_state != SPA_LOAD_CREATE && 901 !spa_feature_is_enabled(spa, 902 SPA_FEATURE_ALLOCATION_CLASSES)) { 903 return (SET_ERROR(ENOTSUP)); 904 } 905 } 906 907 /* spa_vdev_add() expects feature to be enabled */ 908 if (ops == &vdev_draid_ops && 909 spa->spa_load_state != SPA_LOAD_CREATE && 910 !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) { 911 return (SET_ERROR(ENOTSUP)); 912 } 913 } 914 915 /* 916 * Initialize the vdev specific data. This is done before calling 917 * vdev_alloc_common() since it may fail and this simplifies the 918 * error reporting and cleanup code paths. 919 */ 920 void *tsd = NULL; 921 if (ops->vdev_op_init != NULL) { 922 rc = ops->vdev_op_init(spa, nv, &tsd); 923 if (rc != 0) { 924 return (rc); 925 } 926 } 927 928 vd = vdev_alloc_common(spa, id, guid, ops); 929 vd->vdev_tsd = tsd; 930 vd->vdev_islog = islog; 931 932 if (top_level && alloc_bias != VDEV_BIAS_NONE) 933 vd->vdev_alloc_bias = alloc_bias; 934 935 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0) 936 vd->vdev_path = spa_strdup(tmp); 937 938 /* 939 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a 940 * fault on a vdev and want it to persist across imports (like with 941 * zpool offline -f). 
942 */ 943 rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); 944 if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { 945 vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; 946 vd->vdev_faulted = 1; 947 vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 948 } 949 950 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0) 951 vd->vdev_devid = spa_strdup(tmp); 952 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0) 953 vd->vdev_physpath = spa_strdup(tmp); 954 955 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, 956 &tmp) == 0) 957 vd->vdev_enc_sysfs_path = spa_strdup(tmp); 958 959 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0) 960 vd->vdev_fru = spa_strdup(tmp); 961 962 /* 963 * Set the whole_disk property. If it's not specified, leave the value 964 * as -1. 965 */ 966 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 967 &vd->vdev_wholedisk) != 0) 968 vd->vdev_wholedisk = -1ULL; 969 970 /* 971 * Restore the last-known rotational status for leaf vdevs. vdev_open() 972 * will overwrite this with the hardware value when the device is 973 * accessible; the persisted value acts as a fallback for failed or 974 * missing devices so that spare selection can still match on device 975 * type even when the original disk is gone. 
976 */ 977 if (vd->vdev_ops->vdev_op_leaf) { 978 uint64_t rotational = 0; 979 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROTATIONAL, 980 &rotational) == 0) 981 vd->vdev_nonrot = !rotational; 982 } 983 984 vic = &vd->vdev_indirect_config; 985 986 ASSERT0(vic->vic_mapping_object); 987 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, 988 &vic->vic_mapping_object); 989 ASSERT0(vic->vic_births_object); 990 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, 991 &vic->vic_births_object); 992 ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); 993 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, 994 &vic->vic_prev_indirect_vdev); 995 996 /* 997 * Look for the 'not present' flag. This will only be set if the device 998 * was not present at the time of import. 999 */ 1000 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1001 &vd->vdev_not_present); 1002 1003 /* 1004 * Get the alignment requirement. Ignore pool ashift for vdev 1005 * attach case. 1006 */ 1007 if (alloctype != VDEV_ALLOC_ATTACH) { 1008 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, 1009 &vd->vdev_ashift); 1010 } else { 1011 vd->vdev_attaching = B_TRUE; 1012 } 1013 1014 /* 1015 * Retrieve the vdev creation time. 1016 */ 1017 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 1018 &vd->vdev_crtxg); 1019 1020 if (vd->vdev_ops == &vdev_root_ops && 1021 (alloctype == VDEV_ALLOC_LOAD || 1022 alloctype == VDEV_ALLOC_SPLIT || 1023 alloctype == VDEV_ALLOC_ROOTPOOL)) { 1024 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, 1025 &vd->vdev_root_zap); 1026 } 1027 1028 /* 1029 * If we're a top-level vdev, try to load the allocation parameters. 
1030 */ 1031 if (top_level && 1032 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 1033 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 1034 &vd->vdev_ms_array); 1035 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 1036 &vd->vdev_ms_shift); 1037 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 1038 &vd->vdev_asize); 1039 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, 1040 &vd->vdev_noalloc); 1041 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, 1042 &vd->vdev_removing); 1043 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, 1044 &vd->vdev_top_zap); 1045 vd->vdev_rz_expanding = nvlist_exists(nv, 1046 ZPOOL_CONFIG_RAIDZ_EXPANDING); 1047 } else { 1048 ASSERT0(vd->vdev_top_zap); 1049 } 1050 1051 if (top_level && alloctype != VDEV_ALLOC_ATTACH) { 1052 ASSERT(alloctype == VDEV_ALLOC_LOAD || 1053 alloctype == VDEV_ALLOC_ADD || 1054 alloctype == VDEV_ALLOC_SPLIT || 1055 alloctype == VDEV_ALLOC_ROOTPOOL); 1056 /* Note: metaslab_group_create() is now deferred */ 1057 } 1058 1059 if (vd->vdev_ops->vdev_op_leaf && 1060 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { 1061 (void) nvlist_lookup_uint64(nv, 1062 ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); 1063 } else { 1064 ASSERT0(vd->vdev_leaf_zap); 1065 } 1066 1067 /* 1068 * If we're a leaf vdev, try to load the DTL object and other state. 
1069 */ 1070 1071 if (vd->vdev_ops->vdev_op_leaf && 1072 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || 1073 alloctype == VDEV_ALLOC_ROOTPOOL)) { 1074 if (alloctype == VDEV_ALLOC_LOAD) { 1075 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 1076 &vd->vdev_dtl_object); 1077 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 1078 &vd->vdev_unspare); 1079 } 1080 1081 if (alloctype == VDEV_ALLOC_ROOTPOOL) { 1082 uint64_t spare = 0; 1083 1084 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1085 &spare) == 0 && spare) 1086 spa_spare_add(vd); 1087 } 1088 1089 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 1090 &vd->vdev_offline); 1091 1092 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, 1093 &vd->vdev_resilver_txg); 1094 1095 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, 1096 &vd->vdev_rebuild_txg); 1097 1098 if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) 1099 vdev_defer_resilver(vd); 1100 1101 /* 1102 * In general, when importing a pool we want to ignore the 1103 * persistent fault state, as the diagnosis made on another 1104 * system may not be valid in the current context. The only 1105 * exception is if we forced a vdev to a persistently faulted 1106 * state with 'zpool offline -f'. The persistent fault will 1107 * remain across imports until cleared. 1108 * 1109 * Local vdevs will remain in the faulted state. 
1110 */ 1111 if (spa_load_state(spa) == SPA_LOAD_OPEN || 1112 spa_load_state(spa) == SPA_LOAD_IMPORT) { 1113 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 1114 &vd->vdev_faulted); 1115 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 1116 &vd->vdev_degraded); 1117 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 1118 &vd->vdev_removed); 1119 1120 if (vd->vdev_faulted || vd->vdev_degraded) { 1121 const char *aux; 1122 1123 vd->vdev_label_aux = 1124 VDEV_AUX_ERR_EXCEEDED; 1125 if (nvlist_lookup_string(nv, 1126 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && 1127 strcmp(aux, "external") == 0) 1128 vd->vdev_label_aux = VDEV_AUX_EXTERNAL; 1129 else 1130 vd->vdev_faulted = 0ULL; 1131 } 1132 } 1133 } 1134 1135 if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops)) 1136 vd->vdev_autosit = 1137 vdev_prop_default_numeric(VDEV_PROP_AUTOSIT); 1138 if (ops == &vdev_root_ops) 1139 vd->vdev_failfast = 1140 vdev_prop_default_numeric(VDEV_PROP_FAILFAST); 1141 else 1142 vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT; 1143 1144 /* 1145 * Add ourselves to the parent's list of children. 1146 */ 1147 vdev_add_child(parent, vd); 1148 1149 *vdp = vd; 1150 1151 return (0); 1152 } 1153 1154 void 1155 vdev_free(vdev_t *vd) 1156 { 1157 spa_t *spa = vd->vdev_spa; 1158 1159 ASSERT0P(vd->vdev_initialize_thread); 1160 ASSERT0P(vd->vdev_trim_thread); 1161 ASSERT0P(vd->vdev_autotrim_thread); 1162 ASSERT0P(vd->vdev_rebuild_thread); 1163 1164 /* 1165 * Scan queues are normally destroyed at the end of a scan. If the 1166 * queue exists here, that implies the vdev is being removed while 1167 * the scan is still running. 1168 */ 1169 if (vd->vdev_scan_io_queue != NULL) { 1170 mutex_enter(&vd->vdev_scan_io_queue_lock); 1171 dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); 1172 vd->vdev_scan_io_queue = NULL; 1173 mutex_exit(&vd->vdev_scan_io_queue_lock); 1174 } 1175 1176 /* 1177 * vdev_free() implies closing the vdev first. 
This is simpler than 1178 * trying to ensure complicated semantics for all callers. 1179 */ 1180 vdev_close(vd); 1181 1182 ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 1183 ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); 1184 1185 /* 1186 * Free all children. 1187 */ 1188 for (int c = 0; c < vd->vdev_children; c++) 1189 vdev_free(vd->vdev_child[c]); 1190 1191 ASSERT0P(vd->vdev_child); 1192 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 1193 1194 if (vd->vdev_ops->vdev_op_fini != NULL) 1195 vd->vdev_ops->vdev_op_fini(vd); 1196 1197 /* 1198 * Discard allocation state. 1199 */ 1200 if (vd->vdev_mg != NULL) { 1201 vdev_metaslab_fini(vd); 1202 metaslab_group_destroy(vd->vdev_mg); 1203 vd->vdev_mg = NULL; 1204 } 1205 if (vd->vdev_log_mg != NULL) { 1206 ASSERT0(vd->vdev_ms_count); 1207 metaslab_group_destroy(vd->vdev_log_mg); 1208 vd->vdev_log_mg = NULL; 1209 } 1210 1211 ASSERT0(vd->vdev_stat.vs_space); 1212 ASSERT0(vd->vdev_stat.vs_dspace); 1213 ASSERT0(vd->vdev_stat.vs_alloc); 1214 1215 /* 1216 * Remove this vdev from its parent's child list. 1217 */ 1218 vdev_remove_child(vd->vdev_parent, vd); 1219 1220 ASSERT0P(vd->vdev_parent); 1221 ASSERT(!list_link_active(&vd->vdev_leaf_node)); 1222 1223 /* 1224 * Clean up vdev structure. 
1225 */ 1226 vdev_queue_fini(vd); 1227 1228 if (vd->vdev_path) 1229 spa_strfree(vd->vdev_path); 1230 if (vd->vdev_devid) 1231 spa_strfree(vd->vdev_devid); 1232 if (vd->vdev_physpath) 1233 spa_strfree(vd->vdev_physpath); 1234 1235 if (vd->vdev_enc_sysfs_path) 1236 spa_strfree(vd->vdev_enc_sysfs_path); 1237 1238 if (vd->vdev_fru) 1239 spa_strfree(vd->vdev_fru); 1240 1241 if (vd->vdev_isspare) 1242 spa_spare_remove(vd); 1243 if (vd->vdev_isl2cache) 1244 spa_l2cache_remove(vd); 1245 if (vd->vdev_prev_histo) 1246 kmem_free(vd->vdev_prev_histo, 1247 sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS); 1248 1249 txg_list_destroy(&vd->vdev_ms_list); 1250 txg_list_destroy(&vd->vdev_dtl_list); 1251 1252 mutex_enter(&vd->vdev_dtl_lock); 1253 space_map_close(vd->vdev_dtl_sm); 1254 for (int t = 0; t < DTL_TYPES; t++) { 1255 zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); 1256 zfs_range_tree_destroy(vd->vdev_dtl[t]); 1257 } 1258 mutex_exit(&vd->vdev_dtl_lock); 1259 1260 EQUIV(vd->vdev_indirect_births != NULL, 1261 vd->vdev_indirect_mapping != NULL); 1262 if (vd->vdev_indirect_births != NULL) { 1263 vdev_indirect_mapping_close(vd->vdev_indirect_mapping); 1264 vdev_indirect_births_close(vd->vdev_indirect_births); 1265 } 1266 1267 if (vd->vdev_obsolete_sm != NULL) { 1268 ASSERT(vd->vdev_removing || 1269 vd->vdev_ops == &vdev_indirect_ops); 1270 space_map_close(vd->vdev_obsolete_sm); 1271 vd->vdev_obsolete_sm = NULL; 1272 } 1273 zfs_range_tree_destroy(vd->vdev_obsolete_segments); 1274 rw_destroy(&vd->vdev_indirect_rwlock); 1275 mutex_destroy(&vd->vdev_obsolete_lock); 1276 1277 mutex_destroy(&vd->vdev_dtl_lock); 1278 mutex_destroy(&vd->vdev_stat_lock); 1279 mutex_destroy(&vd->vdev_probe_lock); 1280 mutex_destroy(&vd->vdev_scan_io_queue_lock); 1281 1282 mutex_destroy(&vd->vdev_initialize_lock); 1283 mutex_destroy(&vd->vdev_initialize_io_lock); 1284 cv_destroy(&vd->vdev_initialize_io_cv); 1285 cv_destroy(&vd->vdev_initialize_cv); 1286 1287 mutex_destroy(&vd->vdev_trim_lock); 1288 
mutex_destroy(&vd->vdev_autotrim_lock); 1289 mutex_destroy(&vd->vdev_trim_io_lock); 1290 cv_destroy(&vd->vdev_trim_cv); 1291 cv_destroy(&vd->vdev_autotrim_cv); 1292 cv_destroy(&vd->vdev_autotrim_kick_cv); 1293 cv_destroy(&vd->vdev_trim_io_cv); 1294 1295 mutex_destroy(&vd->vdev_rebuild_lock); 1296 cv_destroy(&vd->vdev_rebuild_cv); 1297 1298 zfs_ratelimit_fini(&vd->vdev_delay_rl); 1299 zfs_ratelimit_fini(&vd->vdev_deadman_rl); 1300 zfs_ratelimit_fini(&vd->vdev_dio_verify_rl); 1301 zfs_ratelimit_fini(&vd->vdev_checksum_rl); 1302 1303 if (vd == spa->spa_root_vdev) 1304 spa->spa_root_vdev = NULL; 1305 1306 kmem_free(vd, sizeof (vdev_t)); 1307 } 1308 1309 /* 1310 * Transfer top-level vdev state from svd to tvd. 1311 */ 1312 static void 1313 vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 1314 { 1315 spa_t *spa = svd->vdev_spa; 1316 metaslab_t *msp; 1317 vdev_t *vd; 1318 int t; 1319 1320 ASSERT(tvd == tvd->vdev_top); 1321 1322 tvd->vdev_ms_array = svd->vdev_ms_array; 1323 tvd->vdev_ms_shift = svd->vdev_ms_shift; 1324 tvd->vdev_ms_count = svd->vdev_ms_count; 1325 tvd->vdev_top_zap = svd->vdev_top_zap; 1326 1327 svd->vdev_ms_array = 0; 1328 svd->vdev_ms_shift = 0; 1329 svd->vdev_ms_count = 0; 1330 svd->vdev_top_zap = 0; 1331 1332 if (tvd->vdev_mg) 1333 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); 1334 if (tvd->vdev_log_mg) 1335 ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg); 1336 tvd->vdev_mg = svd->vdev_mg; 1337 tvd->vdev_log_mg = svd->vdev_log_mg; 1338 tvd->vdev_ms = svd->vdev_ms; 1339 1340 svd->vdev_mg = NULL; 1341 svd->vdev_log_mg = NULL; 1342 svd->vdev_ms = NULL; 1343 1344 if (tvd->vdev_mg != NULL) 1345 tvd->vdev_mg->mg_vd = tvd; 1346 if (tvd->vdev_log_mg != NULL) 1347 tvd->vdev_log_mg->mg_vd = tvd; 1348 1349 tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; 1350 svd->vdev_checkpoint_sm = NULL; 1351 1352 tvd->vdev_alloc_bias = svd->vdev_alloc_bias; 1353 svd->vdev_alloc_bias = VDEV_BIAS_NONE; 1354 1355 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 1356 
tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 1357 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 1358 1359 svd->vdev_stat.vs_alloc = 0; 1360 svd->vdev_stat.vs_space = 0; 1361 svd->vdev_stat.vs_dspace = 0; 1362 1363 /* 1364 * State which may be set on a top-level vdev that's in the 1365 * process of being removed. 1366 */ 1367 ASSERT0(tvd->vdev_indirect_config.vic_births_object); 1368 ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); 1369 ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); 1370 ASSERT0P(tvd->vdev_indirect_mapping); 1371 ASSERT0P(tvd->vdev_indirect_births); 1372 ASSERT0P(tvd->vdev_obsolete_sm); 1373 ASSERT0(tvd->vdev_noalloc); 1374 ASSERT0(tvd->vdev_removing); 1375 ASSERT0(tvd->vdev_rebuilding); 1376 tvd->vdev_noalloc = svd->vdev_noalloc; 1377 tvd->vdev_removing = svd->vdev_removing; 1378 tvd->vdev_rebuilding = svd->vdev_rebuilding; 1379 tvd->vdev_rebuild_config = svd->vdev_rebuild_config; 1380 tvd->vdev_indirect_config = svd->vdev_indirect_config; 1381 tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; 1382 tvd->vdev_indirect_births = svd->vdev_indirect_births; 1383 zfs_range_tree_swap(&svd->vdev_obsolete_segments, 1384 &tvd->vdev_obsolete_segments); 1385 tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; 1386 svd->vdev_indirect_config.vic_mapping_object = 0; 1387 svd->vdev_indirect_config.vic_births_object = 0; 1388 svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; 1389 svd->vdev_indirect_mapping = NULL; 1390 svd->vdev_indirect_births = NULL; 1391 svd->vdev_obsolete_sm = NULL; 1392 svd->vdev_noalloc = 0; 1393 svd->vdev_removing = 0; 1394 svd->vdev_rebuilding = 0; 1395 1396 for (t = 0; t < TXG_SIZE; t++) { 1397 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 1398 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 1399 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 1400 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 1401 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, 
t)) 1402 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 1403 } 1404 1405 if (list_link_active(&svd->vdev_config_dirty_node)) { 1406 vdev_config_clean(svd); 1407 vdev_config_dirty(tvd); 1408 } 1409 1410 if (list_link_active(&svd->vdev_state_dirty_node)) { 1411 vdev_state_clean(svd); 1412 vdev_state_dirty(tvd); 1413 } 1414 1415 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 1416 svd->vdev_deflate_ratio = 0; 1417 1418 tvd->vdev_islog = svd->vdev_islog; 1419 svd->vdev_islog = 0; 1420 1421 dsl_scan_io_queue_vdev_xfer(svd, tvd); 1422 } 1423 1424 static void 1425 vdev_top_update(vdev_t *tvd, vdev_t *vd) 1426 { 1427 if (vd == NULL) 1428 return; 1429 1430 vd->vdev_top = tvd; 1431 1432 for (int c = 0; c < vd->vdev_children; c++) 1433 vdev_top_update(tvd, vd->vdev_child[c]); 1434 } 1435 1436 /* 1437 * Add a mirror/replacing vdev above an existing vdev. There is no need to 1438 * call .vdev_op_init() since mirror/replacing vdevs do not have private state. 1439 */ 1440 vdev_t * 1441 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 1442 { 1443 spa_t *spa = cvd->vdev_spa; 1444 vdev_t *pvd = cvd->vdev_parent; 1445 vdev_t *mvd; 1446 1447 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1448 1449 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 1450 1451 mvd->vdev_asize = cvd->vdev_asize; 1452 mvd->vdev_min_asize = cvd->vdev_min_asize; 1453 mvd->vdev_max_asize = cvd->vdev_max_asize; 1454 mvd->vdev_psize = cvd->vdev_psize; 1455 mvd->vdev_ashift = cvd->vdev_ashift; 1456 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; 1457 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; 1458 mvd->vdev_state = cvd->vdev_state; 1459 mvd->vdev_crtxg = cvd->vdev_crtxg; 1460 mvd->vdev_nonrot = cvd->vdev_nonrot; 1461 1462 vdev_remove_child(pvd, cvd); 1463 vdev_add_child(pvd, mvd); 1464 cvd->vdev_id = mvd->vdev_children; 1465 vdev_add_child(mvd, cvd); 1466 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 1467 1468 if (mvd == mvd->vdev_top) 1469 vdev_top_transfer(cvd, mvd); 
1470 1471 return (mvd); 1472 } 1473 1474 /* 1475 * Remove a 1-way mirror/replacing vdev from the tree. 1476 */ 1477 void 1478 vdev_remove_parent(vdev_t *cvd) 1479 { 1480 vdev_t *mvd = cvd->vdev_parent; 1481 vdev_t *pvd = mvd->vdev_parent; 1482 1483 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1484 1485 ASSERT(mvd->vdev_children == 1); 1486 ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 1487 mvd->vdev_ops == &vdev_replacing_ops || 1488 mvd->vdev_ops == &vdev_spare_ops); 1489 cvd->vdev_ashift = mvd->vdev_ashift; 1490 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; 1491 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; 1492 vdev_remove_child(mvd, cvd); 1493 vdev_remove_child(pvd, mvd); 1494 1495 /* 1496 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 1497 * Otherwise, we could have detached an offline device, and when we 1498 * go to import the pool we'll think we have two top-level vdevs, 1499 * instead of a different version of the same top-level vdev. 1500 */ 1501 if (mvd->vdev_top == mvd) { 1502 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 1503 cvd->vdev_orig_guid = cvd->vdev_guid; 1504 cvd->vdev_guid += guid_delta; 1505 cvd->vdev_guid_sum += guid_delta; 1506 1507 /* 1508 * If pool not set for autoexpand, we need to also preserve 1509 * mvd's asize to prevent automatic expansion of cvd. 1510 * Otherwise if we are adjusting the mirror by attaching and 1511 * detaching children of non-uniform sizes, the mirror could 1512 * autoexpand, unexpectedly requiring larger devices to 1513 * re-establish the mirror. 1514 */ 1515 if (!cvd->vdev_spa->spa_autoexpand) 1516 cvd->vdev_asize = mvd->vdev_asize; 1517 } 1518 cvd->vdev_id = mvd->vdev_id; 1519 vdev_add_child(pvd, cvd); 1520 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 1521 1522 if (cvd == cvd->vdev_top) 1523 vdev_top_transfer(mvd, cvd); 1524 1525 ASSERT0(mvd->vdev_children); 1526 vdev_free(mvd); 1527 } 1528 1529 /* 1530 * Choose GCD for spa_gcd_alloc. 
1531 */ 1532 static uint64_t 1533 vdev_gcd(uint64_t a, uint64_t b) 1534 { 1535 while (b != 0) { 1536 uint64_t t = b; 1537 b = a % b; 1538 a = t; 1539 } 1540 return (a); 1541 } 1542 1543 /* 1544 * Set spa_min_alloc and spa_gcd_alloc. 1545 */ 1546 static void 1547 vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc) 1548 { 1549 if (min_alloc < spa->spa_min_alloc) 1550 spa->spa_min_alloc = min_alloc; 1551 1552 if (min_alloc > spa->spa_max_alloc) 1553 spa->spa_max_alloc = min_alloc; 1554 1555 if (spa->spa_gcd_alloc == INT_MAX) 1556 spa->spa_gcd_alloc = min_alloc; 1557 else 1558 spa->spa_gcd_alloc = vdev_gcd(min_alloc, spa->spa_gcd_alloc); 1559 } 1560 1561 void 1562 vdev_metaslab_group_create(vdev_t *vd) 1563 { 1564 spa_t *spa = vd->vdev_spa; 1565 1566 /* 1567 * metaslab_group_create was delayed until allocation bias was available 1568 */ 1569 if (vd->vdev_mg == NULL) { 1570 metaslab_class_t *mc; 1571 1572 if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) 1573 vd->vdev_alloc_bias = VDEV_BIAS_LOG; 1574 1575 ASSERT3U(vd->vdev_islog, ==, 1576 (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); 1577 1578 switch (vd->vdev_alloc_bias) { 1579 case VDEV_BIAS_LOG: 1580 mc = spa_log_class(spa); 1581 break; 1582 case VDEV_BIAS_SPECIAL: 1583 mc = spa_special_class(spa); 1584 break; 1585 case VDEV_BIAS_DEDUP: 1586 mc = spa_dedup_class(spa); 1587 break; 1588 default: 1589 mc = spa_normal_class(spa); 1590 } 1591 1592 vd->vdev_mg = metaslab_group_create(mc, vd); 1593 1594 if (!vd->vdev_islog) { 1595 if (mc == spa_special_class(spa)) { 1596 vd->vdev_log_mg = metaslab_group_create( 1597 spa_special_embedded_log_class(spa), vd); 1598 } else { 1599 vd->vdev_log_mg = metaslab_group_create( 1600 spa_embedded_log_class(spa), vd); 1601 } 1602 } 1603 1604 /* 1605 * The spa ashift min/max only apply for the normal metaslab 1606 * class. Class destination is late binding so ashift boundary 1607 * setting had to wait until now. 
1608 */ 1609 if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 1610 mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { 1611 if (vd->vdev_ashift > spa->spa_max_ashift) 1612 spa->spa_max_ashift = vd->vdev_ashift; 1613 if (vd->vdev_ashift < spa->spa_min_ashift) 1614 spa->spa_min_ashift = vd->vdev_ashift; 1615 1616 vdev_spa_set_alloc(spa, vdev_get_min_alloc(vd)); 1617 } 1618 } 1619 } 1620 1621 void 1622 vdev_update_nonallocating_space(vdev_t *vd, boolean_t add) 1623 { 1624 spa_t *spa = vd->vdev_spa; 1625 1626 if (vd->vdev_mg->mg_class != spa_normal_class(spa)) 1627 return; 1628 1629 uint64_t raw_space = metaslab_group_get_space(vd->vdev_mg); 1630 uint64_t dspace = spa_deflate(spa) ? 1631 vdev_deflated_space(vd, raw_space) : raw_space; 1632 if (add) { 1633 spa->spa_nonallocating_dspace += dspace; 1634 } else { 1635 ASSERT3U(spa->spa_nonallocating_dspace, >=, dspace); 1636 spa->spa_nonallocating_dspace -= dspace; 1637 } 1638 } 1639 1640 int 1641 vdev_metaslab_init(vdev_t *vd, uint64_t txg) 1642 { 1643 spa_t *spa = vd->vdev_spa; 1644 uint64_t oldc = vd->vdev_ms_count; 1645 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 1646 metaslab_t **mspp; 1647 int error; 1648 boolean_t expanding = (oldc != 0); 1649 1650 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1651 1652 /* 1653 * This vdev is not being allocated from yet or is a hole. 1654 */ 1655 if (vd->vdev_ms_shift == 0) 1656 return (0); 1657 1658 ASSERT(!vd->vdev_ishole); 1659 1660 ASSERT(oldc <= newc); 1661 1662 mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 1663 1664 if (expanding) { 1665 memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp)); 1666 vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 1667 } 1668 1669 vd->vdev_ms = mspp; 1670 vd->vdev_ms_count = newc; 1671 1672 /* 1673 * Weighting algorithms can depend on the number of metaslabs in the 1674 * vdev. In order to ensure that all weights are correct at all times, 1675 * we need to recalculate here. 
1676 */ 1677 for (uint64_t m = 0; m < oldc; m++) { 1678 metaslab_t *msp = vd->vdev_ms[m]; 1679 mutex_enter(&msp->ms_lock); 1680 metaslab_recalculate_weight_and_sort(msp); 1681 mutex_exit(&msp->ms_lock); 1682 } 1683 1684 for (uint64_t m = oldc; m < newc; m++) { 1685 uint64_t object = 0; 1686 /* 1687 * vdev_ms_array may be 0 if we are creating the "fake" 1688 * metaslabs for an indirect vdev for zdb's leak detection. 1689 * See zdb_leak_init(). 1690 */ 1691 if (txg == 0 && vd->vdev_ms_array != 0) { 1692 error = dmu_read(spa->spa_meta_objset, 1693 vd->vdev_ms_array, 1694 m * sizeof (uint64_t), sizeof (uint64_t), &object, 1695 DMU_READ_PREFETCH); 1696 if (error != 0) { 1697 vdev_dbgmsg(vd, "unable to read the metaslab " 1698 "array [error=%d]", error); 1699 return (error); 1700 } 1701 } 1702 1703 error = metaslab_init(vd->vdev_mg, m, object, txg, 1704 &(vd->vdev_ms[m])); 1705 if (error != 0) { 1706 vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", 1707 error); 1708 return (error); 1709 } 1710 } 1711 1712 /* 1713 * Find the emptiest metaslab on the vdev and mark it for use for 1714 * embedded slog by moving it from the regular to the log metaslab 1715 * group. This works for normal and special vdevs. 1716 */ 1717 if ((vd->vdev_mg->mg_class == spa_normal_class(spa) || 1718 vd->vdev_mg->mg_class == spa_special_class(spa)) && 1719 vd->vdev_ms_count > zfs_embedded_slog_min_ms && 1720 avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { 1721 uint64_t slog_msid = 0; 1722 uint64_t smallest = UINT64_MAX; 1723 1724 /* 1725 * Note, we only search the new metaslabs, because the old 1726 * (pre-existing) ones may be active (e.g. have non-empty 1727 * range_tree's), and we don't move them to the new 1728 * metaslab_t. 
1729 */ 1730 for (uint64_t m = oldc; m < newc; m++) { 1731 uint64_t alloc = 1732 space_map_allocated(vd->vdev_ms[m]->ms_sm); 1733 if (alloc < smallest) { 1734 slog_msid = m; 1735 smallest = alloc; 1736 } 1737 } 1738 metaslab_t *slog_ms = vd->vdev_ms[slog_msid]; 1739 /* 1740 * The metaslab was marked as dirty at the end of 1741 * metaslab_init(). Remove it from the dirty list so that we 1742 * can uninitialize and reinitialize it to the new class. It 1743 * may be dirty in any txg slot, so clear them all. 1744 */ 1745 for (int t = 0; t < TXG_SIZE; t++) { 1746 (void) txg_list_remove_this(&vd->vdev_ms_list, 1747 slog_ms, t); 1748 } 1749 uint64_t sm_obj = space_map_object(slog_ms->ms_sm); 1750 metaslab_fini(slog_ms); 1751 VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg, 1752 &vd->vdev_ms[slog_msid])); 1753 } 1754 1755 if (txg == 0) 1756 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); 1757 1758 /* 1759 * If the vdev is marked as non-allocating then don't 1760 * activate the metaslabs since we want to ensure that 1761 * no allocations are performed on this device. 1762 */ 1763 if (vd->vdev_noalloc) { 1764 /* track non-allocating vdev space */ 1765 vdev_update_nonallocating_space(vd, B_TRUE); 1766 } else if (!expanding) { 1767 metaslab_group_activate(vd->vdev_mg); 1768 if (vd->vdev_log_mg != NULL) 1769 metaslab_group_activate(vd->vdev_log_mg); 1770 } 1771 1772 if (txg == 0) 1773 spa_config_exit(spa, SCL_ALLOC, FTAG); 1774 1775 return (0); 1776 } 1777 1778 void 1779 vdev_metaslab_fini(vdev_t *vd) 1780 { 1781 if (vd->vdev_checkpoint_sm != NULL) { 1782 ASSERT(spa_feature_is_active(vd->vdev_spa, 1783 SPA_FEATURE_POOL_CHECKPOINT)); 1784 space_map_close(vd->vdev_checkpoint_sm); 1785 /* 1786 * Even though we close the space map, we need to set its 1787 * pointer to NULL. The reason is that vdev_metaslab_fini() 1788 * may be called multiple times for certain operations 1789 * (i.e. 
when destroying a pool) so we need to ensure that 1790 * this clause never executes twice. This logic is similar 1791 * to the one used for the vdev_ms clause below. 1792 */ 1793 vd->vdev_checkpoint_sm = NULL; 1794 } 1795 1796 if (vd->vdev_ms != NULL) { 1797 metaslab_group_t *mg = vd->vdev_mg; 1798 1799 metaslab_group_passivate(mg); 1800 if (vd->vdev_log_mg != NULL) { 1801 ASSERT(!vd->vdev_islog); 1802 metaslab_group_passivate(vd->vdev_log_mg); 1803 } 1804 1805 uint64_t count = vd->vdev_ms_count; 1806 for (uint64_t m = 0; m < count; m++) { 1807 metaslab_t *msp = vd->vdev_ms[m]; 1808 if (msp != NULL) 1809 metaslab_fini(msp); 1810 } 1811 vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 1812 vd->vdev_ms = NULL; 1813 vd->vdev_ms_count = 0; 1814 1815 for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) { 1816 ASSERT0(mg->mg_histogram[i]); 1817 if (vd->vdev_log_mg != NULL) 1818 ASSERT0(vd->vdev_log_mg->mg_histogram[i]); 1819 } 1820 } 1821 ASSERT0(vd->vdev_ms_count); 1822 } 1823 1824 typedef struct vdev_probe_stats { 1825 boolean_t vps_readable; 1826 boolean_t vps_writeable; 1827 boolean_t vps_zio_done_probe; 1828 int vps_flags; 1829 } vdev_probe_stats_t; 1830 1831 static void 1832 vdev_probe_done(zio_t *zio) 1833 { 1834 spa_t *spa = zio->io_spa; 1835 vdev_t *vd = zio->io_vd; 1836 vdev_probe_stats_t *vps = zio->io_private; 1837 1838 ASSERT(vd->vdev_probe_zio != NULL); 1839 1840 if (zio->io_type == ZIO_TYPE_READ) { 1841 if (zio->io_error == 0) 1842 vps->vps_readable = 1; 1843 if (zio->io_error == 0 && spa_writeable(spa)) { 1844 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, 1845 zio->io_offset, zio->io_size, zio->io_abd, 1846 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1847 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 1848 } else { 1849 abd_free(zio->io_abd); 1850 } 1851 } else if (zio->io_type == ZIO_TYPE_WRITE) { 1852 if (zio->io_error == 0) 1853 vps->vps_writeable = 1; 1854 abd_free(zio->io_abd); 1855 } else if (zio->io_type == ZIO_TYPE_NULL) { 1856 zio_t 
*pio; 1857 zio_link_t *zl; 1858 1859 vd->vdev_cant_read |= !vps->vps_readable; 1860 vd->vdev_cant_write |= !vps->vps_writeable; 1861 vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u", 1862 vd->vdev_cant_read, vd->vdev_cant_write); 1863 1864 if (vdev_readable(vd) && 1865 (vdev_writeable(vd) || !spa_writeable(spa))) { 1866 zio->io_error = 0; 1867 } else { 1868 ASSERT(zio->io_error != 0); 1869 vdev_dbgmsg(vd, "failed probe"); 1870 (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 1871 spa, vd, NULL, NULL, 0); 1872 zio->io_error = SET_ERROR(ENXIO); 1873 1874 /* 1875 * If this probe was initiated from zio pipeline, then 1876 * change the state in a spa_async_request. Probes that 1877 * were initiated from a vdev_open can change the state 1878 * as part of the open call. 1879 * Skip fault injection if this vdev is already removed 1880 * or a removal is pending. 1881 */ 1882 if (vps->vps_zio_done_probe && 1883 !vd->vdev_remove_wanted && !vd->vdev_removed) { 1884 vd->vdev_fault_wanted = B_TRUE; 1885 spa_async_request(spa, SPA_ASYNC_FAULT_VDEV); 1886 } 1887 } 1888 1889 mutex_enter(&vd->vdev_probe_lock); 1890 ASSERT(vd->vdev_probe_zio == zio); 1891 vd->vdev_probe_zio = NULL; 1892 mutex_exit(&vd->vdev_probe_lock); 1893 1894 zl = NULL; 1895 while ((pio = zio_walk_parents(zio, &zl)) != NULL) 1896 if (!vdev_accessible(vd, pio)) 1897 pio->io_error = SET_ERROR(ENXIO); 1898 1899 kmem_free(vps, sizeof (*vps)); 1900 } 1901 } 1902 1903 /* 1904 * Determine whether this device is accessible. 1905 * 1906 * Read and write to several known locations: the pad regions of each 1907 * vdev label but the first, which we leave alone in case it contains 1908 * a VTOC. 1909 */ 1910 zio_t * 1911 vdev_probe(vdev_t *vd, zio_t *zio) 1912 { 1913 spa_t *spa = vd->vdev_spa; 1914 vdev_probe_stats_t *vps = NULL; 1915 zio_t *pio; 1916 1917 ASSERT(vd->vdev_ops->vdev_op_leaf); 1918 1919 /* 1920 * Don't probe the probe. 
1921 */ 1922 if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) 1923 return (NULL); 1924 1925 /* 1926 * To prevent 'probe storms' when a device fails, we create 1927 * just one probe i/o at a time. All zios that want to probe 1928 * this vdev will become parents of the probe io. 1929 */ 1930 mutex_enter(&vd->vdev_probe_lock); 1931 1932 if ((pio = vd->vdev_probe_zio) == NULL) { 1933 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 1934 1935 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 1936 ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; 1937 vps->vps_zio_done_probe = (zio != NULL); 1938 1939 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 1940 /* 1941 * vdev_cant_read and vdev_cant_write can only 1942 * transition from TRUE to FALSE when we have the 1943 * SCL_ZIO lock as writer; otherwise they can only 1944 * transition from FALSE to TRUE. This ensures that 1945 * any zio looking at these values can assume that 1946 * failures persist for the life of the I/O. That's 1947 * important because when a device has intermittent 1948 * connectivity problems, we want to ensure that 1949 * they're ascribed to the device (ENXIO) and not 1950 * the zio (EIO). 1951 * 1952 * Since we hold SCL_ZIO as writer here, clear both 1953 * values so the probe can reevaluate from first 1954 * principles. 
1955 */ 1956 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 1957 vd->vdev_cant_read = B_FALSE; 1958 vd->vdev_cant_write = B_FALSE; 1959 } 1960 1961 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, 1962 vdev_probe_done, vps, 1963 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); 1964 } 1965 1966 if (zio != NULL) 1967 zio_add_child(zio, pio); 1968 1969 mutex_exit(&vd->vdev_probe_lock); 1970 1971 if (vps == NULL) { 1972 ASSERT(zio != NULL); 1973 return (NULL); 1974 } 1975 1976 for (int l = 1; l < VDEV_LABELS; l++) { 1977 zio_nowait(zio_read_phys(pio, vd, 1978 vdev_label_offset(vd->vdev_psize, l, 1979 offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, 1980 abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), 1981 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 1982 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 1983 } 1984 1985 if (zio == NULL) 1986 return (pio); 1987 1988 zio_nowait(pio); 1989 return (NULL); 1990 } 1991 1992 static void 1993 vdev_load_child(void *arg) 1994 { 1995 vdev_t *vd = arg; 1996 1997 vd->vdev_load_error = vdev_load(vd); 1998 } 1999 2000 static void 2001 vdev_open_child(void *arg) 2002 { 2003 vdev_t *vd = arg; 2004 2005 vd->vdev_open_thread = curthread; 2006 vd->vdev_open_error = vdev_open(vd); 2007 vd->vdev_open_thread = NULL; 2008 } 2009 2010 static boolean_t 2011 vdev_uses_zvols(vdev_t *vd) 2012 { 2013 #ifdef _KERNEL 2014 if (zvol_is_zvol(vd->vdev_path)) 2015 return (B_TRUE); 2016 #endif 2017 2018 for (int c = 0; c < vd->vdev_children; c++) 2019 if (vdev_uses_zvols(vd->vdev_child[c])) 2020 return (B_TRUE); 2021 2022 return (B_FALSE); 2023 } 2024 2025 /* 2026 * Returns B_TRUE if the passed child should be opened. 2027 */ 2028 static boolean_t 2029 vdev_default_open_children_func(vdev_t *vd) 2030 { 2031 (void) vd; 2032 return (B_TRUE); 2033 } 2034 2035 /* 2036 * Open the requested child vdevs. If any of the leaf vdevs are using 2037 * a ZFS volume then do the opens in a single thread. This avoids a 2038 * deadlock when the current thread is holding the spa_namespace_lock. 
2039 */ 2040 static void 2041 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func) 2042 { 2043 int children = vd->vdev_children; 2044 2045 taskq_t *tq = taskq_create("vdev_open", children, minclsyspri, 2046 children, children, TASKQ_PREPOPULATE); 2047 vd->vdev_nonrot = B_TRUE; 2048 2049 for (int c = 0; c < children; c++) { 2050 vdev_t *cvd = vd->vdev_child[c]; 2051 2052 if (open_func(cvd) == B_FALSE) 2053 continue; 2054 2055 if (tq == NULL || vdev_uses_zvols(vd)) { 2056 cvd->vdev_open_error = vdev_open(cvd); 2057 } else { 2058 VERIFY(taskq_dispatch(tq, vdev_open_child, 2059 cvd, TQ_SLEEP) != TASKQID_INVALID); 2060 } 2061 } 2062 2063 if (tq != NULL) 2064 taskq_wait(tq); 2065 for (int c = 0; c < children; c++) { 2066 vdev_t *cvd = vd->vdev_child[c]; 2067 2068 if (open_func(cvd) == B_FALSE || 2069 cvd->vdev_state <= VDEV_STATE_FAULTED) 2070 continue; 2071 vd->vdev_nonrot &= cvd->vdev_nonrot; 2072 } 2073 2074 if (tq != NULL) 2075 taskq_destroy(tq); 2076 } 2077 2078 /* 2079 * Open all child vdevs. 2080 */ 2081 void 2082 vdev_open_children(vdev_t *vd) 2083 { 2084 vdev_open_children_impl(vd, vdev_default_open_children_func); 2085 } 2086 2087 /* 2088 * Conditionally open a subset of child vdevs. 2089 */ 2090 void 2091 vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func) 2092 { 2093 vdev_open_children_impl(vd, open_func); 2094 } 2095 2096 /* 2097 * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17) 2098 * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE 2099 * changed, this algorithm can not change, otherwise it would inconsistently 2100 * account for existing bp's. We also hard-code txg 0 for the same reason 2101 * since expanded RAIDZ vdevs can use a different asize for different birth 2102 * txg's. 
2103 */ 2104 static void 2105 vdev_set_deflate_ratio(vdev_t *vd) 2106 { 2107 if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { 2108 vd->vdev_deflate_ratio = (1 << 17) / 2109 (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >> 2110 SPA_MINBLOCKSHIFT); 2111 } 2112 } 2113 2114 /* 2115 * Choose the best of two ashifts, preferring one between logical ashift 2116 * (absolute minimum) and administrator defined maximum, otherwise take 2117 * the biggest of the two. 2118 */ 2119 uint64_t 2120 vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b) 2121 { 2122 if (a > logical && a <= zfs_vdev_max_auto_ashift) { 2123 if (b <= logical || b > zfs_vdev_max_auto_ashift) 2124 return (a); 2125 else 2126 return (MAX(a, b)); 2127 } else if (b <= logical || b > zfs_vdev_max_auto_ashift) 2128 return (MAX(a, b)); 2129 return (b); 2130 } 2131 2132 /* 2133 * Maximize performance by inflating the configured ashift for top level 2134 * vdevs to be as close to the physical ashift as possible while maintaining 2135 * administrator defined limits and ensuring it doesn't go below the 2136 * logical ashift. 2137 */ 2138 static void 2139 vdev_ashift_optimize(vdev_t *vd) 2140 { 2141 ASSERT(vd == vd->vdev_top); 2142 2143 if (vd->vdev_ashift < vd->vdev_physical_ashift && 2144 vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) { 2145 vd->vdev_ashift = MIN( 2146 MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift), 2147 MAX(zfs_vdev_min_auto_ashift, 2148 vd->vdev_physical_ashift)); 2149 } else { 2150 /* 2151 * If the logical and physical ashifts are the same, then 2152 * we ensure that the top-level vdev's ashift is not smaller 2153 * than our minimum ashift value. For the unusual case 2154 * where logical ashift > physical ashift, we can't cap 2155 * the calculated ashift based on max ashift as that 2156 * would cause failures. 2157 * We still check if we need to increase it to match 2158 * the min ashift. 
2159 */ 2160 vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift, 2161 vd->vdev_ashift); 2162 } 2163 } 2164 2165 /* 2166 * Prepare a virtual device for access. 2167 */ 2168 int 2169 vdev_open(vdev_t *vd) 2170 { 2171 spa_t *spa = vd->vdev_spa; 2172 int error; 2173 uint64_t osize = 0; 2174 uint64_t max_osize = 0; 2175 uint64_t asize, max_asize, psize; 2176 uint64_t logical_ashift = 0; 2177 uint64_t physical_ashift = 0; 2178 2179 ASSERT(vd->vdev_open_thread == curthread || 2180 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2181 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 2182 vd->vdev_state == VDEV_STATE_CANT_OPEN || 2183 vd->vdev_state == VDEV_STATE_OFFLINE); 2184 2185 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 2186 vd->vdev_cant_read = B_FALSE; 2187 vd->vdev_cant_write = B_FALSE; 2188 vd->vdev_fault_wanted = B_FALSE; 2189 vd->vdev_remove_wanted = B_FALSE; 2190 vd->vdev_min_asize = vdev_get_min_asize(vd); 2191 2192 /* 2193 * If this vdev is not removed, check its fault status. If it's 2194 * faulted, bail out of the open. 
2195 */ 2196 if (!vd->vdev_removed && vd->vdev_faulted) { 2197 ASSERT0(vd->vdev_children); 2198 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 2199 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 2200 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 2201 vd->vdev_label_aux); 2202 return (SET_ERROR(ENXIO)); 2203 } else if (vd->vdev_offline) { 2204 ASSERT0(vd->vdev_children); 2205 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 2206 return (SET_ERROR(ENXIO)); 2207 } 2208 2209 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, 2210 &logical_ashift, &physical_ashift); 2211 2212 /* Keep the device in removed state if unplugged */ 2213 if (error == ENOENT && vd->vdev_removed) { 2214 vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED, 2215 VDEV_AUX_NONE); 2216 return (error); 2217 } 2218 2219 /* 2220 * Physical volume size should never be larger than its max size, unless 2221 * the disk has shrunk while we were reading it or the device is buggy 2222 * or damaged: either way it's not safe for use, bail out of the open. 2223 */ 2224 if (osize > max_osize) { 2225 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2226 VDEV_AUX_OPEN_FAILED); 2227 return (SET_ERROR(ENXIO)); 2228 } 2229 2230 /* 2231 * Reset the vdev_reopening flag so that we actually close 2232 * the vdev on error. 
2233 */ 2234 vd->vdev_reopening = B_FALSE; 2235 if (zio_injection_enabled && error == 0) 2236 error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); 2237 2238 if (error) { 2239 if (vd->vdev_removed && 2240 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 2241 vd->vdev_removed = B_FALSE; 2242 2243 if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { 2244 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, 2245 vd->vdev_stat.vs_aux); 2246 } else { 2247 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2248 vd->vdev_stat.vs_aux); 2249 } 2250 return (error); 2251 } 2252 2253 vd->vdev_removed = B_FALSE; 2254 2255 /* 2256 * Recheck the faulted flag now that we have confirmed that 2257 * the vdev is accessible. If we're faulted, bail. 2258 */ 2259 if (vd->vdev_faulted) { 2260 ASSERT0(vd->vdev_children); 2261 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || 2262 vd->vdev_label_aux == VDEV_AUX_EXTERNAL); 2263 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 2264 vd->vdev_label_aux); 2265 return (SET_ERROR(ENXIO)); 2266 } 2267 2268 if (vd->vdev_degraded) { 2269 ASSERT0(vd->vdev_children); 2270 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 2271 VDEV_AUX_ERR_EXCEEDED); 2272 } else { 2273 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); 2274 } 2275 2276 /* 2277 * For hole or missing vdevs we just return success. 
2278 */ 2279 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) 2280 return (0); 2281 2282 for (int c = 0; c < vd->vdev_children; c++) { 2283 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 2284 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 2285 VDEV_AUX_NONE); 2286 break; 2287 } 2288 } 2289 2290 osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t); 2291 max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t); 2292 2293 if (vd->vdev_children == 0) { 2294 if (osize < SPA_MINDEVSIZE) { 2295 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2296 VDEV_AUX_TOO_SMALL); 2297 return (SET_ERROR(EOVERFLOW)); 2298 } 2299 psize = osize; 2300 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 2301 max_asize = max_osize - (VDEV_LABEL_START_SIZE + 2302 VDEV_LABEL_END_SIZE); 2303 } else { 2304 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 2305 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 2306 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2307 VDEV_AUX_TOO_SMALL); 2308 return (SET_ERROR(EOVERFLOW)); 2309 } 2310 psize = 0; 2311 asize = osize; 2312 max_asize = max_osize; 2313 } 2314 2315 /* 2316 * If the vdev was expanded, record this so that we can re-create the 2317 * uberblock rings in labels {2,3}, during the next sync. 2318 */ 2319 if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) 2320 vd->vdev_copy_uberblocks = B_TRUE; 2321 2322 vd->vdev_psize = psize; 2323 2324 /* 2325 * Make sure the allocatable size hasn't shrunk too much. 2326 */ 2327 if (asize < vd->vdev_min_asize) { 2328 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2329 VDEV_AUX_BAD_LABEL); 2330 return (SET_ERROR(EINVAL)); 2331 } 2332 2333 /* 2334 * We can always set the logical/physical ashift members since 2335 * their values are only used to calculate the vdev_ashift when 2336 * the device is first added to the config. 
These values should 2337 * not be used for anything else since they may change whenever 2338 * the device is reopened and we don't store them in the label. 2339 */ 2340 vd->vdev_physical_ashift = 2341 MAX(physical_ashift, vd->vdev_physical_ashift); 2342 vd->vdev_logical_ashift = MAX(logical_ashift, 2343 vd->vdev_logical_ashift); 2344 2345 if (vd->vdev_asize == 0) { 2346 /* 2347 * This is the first-ever open, so use the computed values. 2348 * For compatibility, a different ashift can be requested. 2349 */ 2350 vd->vdev_asize = asize; 2351 vd->vdev_max_asize = max_asize; 2352 2353 /* 2354 * If the vdev_ashift was not overridden at creation time 2355 * (0) or the override value is impossible for the device, 2356 * then set it the logical ashift and optimize the ashift. 2357 */ 2358 if (vd->vdev_ashift < vd->vdev_logical_ashift) { 2359 vd->vdev_ashift = vd->vdev_logical_ashift; 2360 2361 if (vd->vdev_logical_ashift > ASHIFT_MAX) { 2362 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2363 VDEV_AUX_ASHIFT_TOO_BIG); 2364 return (SET_ERROR(EDOM)); 2365 } 2366 2367 if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE) 2368 vdev_ashift_optimize(vd); 2369 vd->vdev_attaching = B_FALSE; 2370 } 2371 if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || 2372 vd->vdev_ashift > ASHIFT_MAX)) { 2373 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2374 VDEV_AUX_BAD_ASHIFT); 2375 return (SET_ERROR(EDOM)); 2376 } 2377 } else { 2378 /* 2379 * Make sure the alignment required hasn't increased. 
2380 */ 2381 if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && 2382 vd->vdev_ops->vdev_op_leaf) { 2383 (void) zfs_ereport_post( 2384 FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, 2385 spa, vd, NULL, NULL, 0); 2386 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 2387 VDEV_AUX_BAD_LABEL); 2388 return (SET_ERROR(EDOM)); 2389 } 2390 vd->vdev_max_asize = max_asize; 2391 } 2392 2393 /* 2394 * If all children are healthy we update asize if either: 2395 * The asize has increased, due to a device expansion caused by dynamic 2396 * LUN growth or vdev replacement, and automatic expansion is enabled; 2397 * making the additional space available. 2398 * 2399 * The asize has decreased, due to a device shrink usually caused by a 2400 * vdev replace with a smaller device. This ensures that calculations 2401 * based of max_asize and asize e.g. esize are always valid. It's safe 2402 * to do this as we've already validated that asize is greater than 2403 * vdev_min_asize. 2404 */ 2405 if (vd->vdev_state == VDEV_STATE_HEALTHY && 2406 ((asize > vd->vdev_asize && 2407 (vd->vdev_expanding || spa->spa_autoexpand)) || 2408 (asize < vd->vdev_asize))) 2409 vd->vdev_asize = asize; 2410 2411 vdev_set_min_asize(vd); 2412 2413 /* 2414 * Ensure we can issue some IO before declaring the 2415 * vdev open for business. 2416 */ 2417 if (vd->vdev_ops->vdev_op_leaf && 2418 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 2419 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 2420 VDEV_AUX_ERR_EXCEEDED); 2421 return (error); 2422 } 2423 2424 /* 2425 * Track the minimum allocation size. 2426 */ 2427 if (vd->vdev_top == vd && vd->vdev_ashift != 0 && 2428 vd->vdev_islog == 0 && vd->vdev_aux == NULL) { 2429 uint64_t min_alloc = vdev_get_min_alloc(vd); 2430 vdev_spa_set_alloc(spa, min_alloc); 2431 } 2432 2433 /* 2434 * If this is a leaf vdev, assess whether a resilver is needed. 2435 * But don't do this if we are doing a reopen for a scrub, since 2436 * this would just restart the scrub we are already doing. 
2437 */ 2438 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) 2439 dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); 2440 2441 return (0); 2442 } 2443 2444 static void 2445 vdev_validate_child(void *arg) 2446 { 2447 vdev_t *vd = arg; 2448 2449 vd->vdev_validate_thread = curthread; 2450 vd->vdev_validate_error = vdev_validate(vd); 2451 vd->vdev_validate_thread = NULL; 2452 } 2453 2454 /* 2455 * Called once the vdevs are all opened, this routine validates the label 2456 * contents. This needs to be done before vdev_load() so that we don't 2457 * inadvertently do repair I/Os to the wrong device. 2458 * 2459 * This function will only return failure if one of the vdevs indicates that it 2460 * has since been destroyed or exported. This is only possible if 2461 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 2462 * will be updated but the function will return 0. 2463 */ 2464 int 2465 vdev_validate(vdev_t *vd) 2466 { 2467 spa_t *spa = vd->vdev_spa; 2468 taskq_t *tq = NULL; 2469 nvlist_t *label; 2470 uint64_t guid = 0, aux_guid = 0, top_guid; 2471 uint64_t state; 2472 nvlist_t *nvl; 2473 uint64_t txg; 2474 int children = vd->vdev_children; 2475 2476 if (vdev_validate_skip) 2477 return (0); 2478 2479 if (children > 0) { 2480 tq = taskq_create("vdev_validate", children, minclsyspri, 2481 children, children, TASKQ_PREPOPULATE); 2482 } 2483 2484 for (uint64_t c = 0; c < children; c++) { 2485 vdev_t *cvd = vd->vdev_child[c]; 2486 2487 if (tq == NULL || vdev_uses_zvols(cvd)) { 2488 vdev_validate_child(cvd); 2489 } else { 2490 VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd, 2491 TQ_SLEEP) != TASKQID_INVALID); 2492 } 2493 } 2494 if (tq != NULL) { 2495 taskq_wait(tq); 2496 taskq_destroy(tq); 2497 } 2498 for (int c = 0; c < children; c++) { 2499 int error = vd->vdev_child[c]->vdev_validate_error; 2500 2501 if (error != 0) 2502 return (SET_ERROR(EBADF)); 2503 } 2504 2505 2506 /* 2507 * If the device has already failed, or was marked offline, don't 
do 2508 * any further validation. Otherwise, label I/O will fail and we will 2509 * overwrite the previous state. 2510 */ 2511 if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) 2512 return (0); 2513 2514 /* 2515 * If we are performing an extreme rewind, we allow for a label that 2516 * was modified at a point after the current txg. 2517 * If config lock is not held do not check for the txg. spa_sync could 2518 * be updating the vdev's label before updating spa_last_synced_txg. 2519 */ 2520 if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || 2521 spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) 2522 txg = UINT64_MAX; 2523 else 2524 txg = spa_last_synced_txg(spa); 2525 2526 if ((label = vdev_label_read_config(vd, txg)) == NULL) { 2527 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2528 VDEV_AUX_BAD_LABEL); 2529 vdev_dbgmsg(vd, "vdev_validate: failed reading config for " 2530 "txg %llu", (u_longlong_t)txg); 2531 return (0); 2532 } 2533 2534 /* 2535 * Determine if this vdev has been split off into another 2536 * pool. If so, then refuse to open it. 2537 */ 2538 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, 2539 &aux_guid) == 0 && aux_guid == spa_guid(spa)) { 2540 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2541 VDEV_AUX_SPLIT_POOL); 2542 nvlist_free(label); 2543 vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); 2544 return (0); 2545 } 2546 2547 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { 2548 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2549 VDEV_AUX_CORRUPT_DATA); 2550 nvlist_free(label); 2551 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", 2552 ZPOOL_CONFIG_POOL_GUID); 2553 return (0); 2554 } 2555 2556 /* 2557 * If config is not trusted then ignore the spa guid check. This is 2558 * necessary because if the machine crashed during a re-guid the new 2559 * guid might have been written to all of the vdev labels, but not the 2560 * cached config. 
The check will be performed again once we have the 2561 * trusted config from the MOS. 2562 */ 2563 if (spa->spa_trust_config && guid != spa_guid(spa)) { 2564 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2565 VDEV_AUX_CORRUPT_DATA); 2566 nvlist_free(label); 2567 vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " 2568 "match config (%llu != %llu)", (u_longlong_t)guid, 2569 (u_longlong_t)spa_guid(spa)); 2570 return (0); 2571 } 2572 2573 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) 2574 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, 2575 &aux_guid) != 0) 2576 aux_guid = 0; 2577 2578 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { 2579 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2580 VDEV_AUX_CORRUPT_DATA); 2581 nvlist_free(label); 2582 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", 2583 ZPOOL_CONFIG_GUID); 2584 return (0); 2585 } 2586 2587 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) 2588 != 0) { 2589 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2590 VDEV_AUX_CORRUPT_DATA); 2591 nvlist_free(label); 2592 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", 2593 ZPOOL_CONFIG_TOP_GUID); 2594 return (0); 2595 } 2596 2597 /* 2598 * If this vdev just became a top-level vdev because its sibling was 2599 * detached, it will have adopted the parent's vdev guid -- but the 2600 * label may or may not be on disk yet. Fortunately, either version 2601 * of the label will have the same top guid, so if we're a top-level 2602 * vdev, we can safely compare to that instead. 2603 * However, if the config comes from a cachefile that failed to update 2604 * after the detach, a top-level vdev will appear as a non top-level 2605 * vdev in the config. Also relax the constraints if we perform an 2606 * extreme rewind. 2607 * 2608 * If we split this vdev off instead, then we also check the 2609 * original pool's guid. 
We don't want to consider the vdev 2610 * corrupt if it is partway through a split operation. 2611 */ 2612 if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) { 2613 boolean_t mismatch = B_FALSE; 2614 if (spa->spa_trust_config && !spa->spa_extreme_rewind) { 2615 if (vd != vd->vdev_top || vd->vdev_guid != top_guid) 2616 mismatch = B_TRUE; 2617 } else { 2618 if (vd->vdev_guid != top_guid && 2619 vd->vdev_top->vdev_guid != guid) 2620 mismatch = B_TRUE; 2621 } 2622 2623 if (mismatch) { 2624 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2625 VDEV_AUX_CORRUPT_DATA); 2626 nvlist_free(label); 2627 vdev_dbgmsg(vd, "vdev_validate: config guid " 2628 "doesn't match label guid"); 2629 vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu", 2630 (u_longlong_t)vd->vdev_guid, 2631 (u_longlong_t)vd->vdev_top->vdev_guid); 2632 vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, " 2633 "aux_guid %llu", (u_longlong_t)guid, 2634 (u_longlong_t)top_guid, (u_longlong_t)aux_guid); 2635 return (0); 2636 } 2637 } 2638 2639 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 2640 &state) != 0) { 2641 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 2642 VDEV_AUX_CORRUPT_DATA); 2643 nvlist_free(label); 2644 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", 2645 ZPOOL_CONFIG_POOL_STATE); 2646 return (0); 2647 } 2648 2649 nvlist_free(label); 2650 2651 /* 2652 * If this is a verbatim import, no need to check the 2653 * state of the pool. 2654 */ 2655 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) && 2656 spa_load_state(spa) == SPA_LOAD_OPEN && 2657 state != POOL_STATE_ACTIVE) { 2658 vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) " 2659 "for spa %s", (u_longlong_t)state, spa->spa_name); 2660 return (SET_ERROR(EBADF)); 2661 } 2662 2663 /* 2664 * If we were able to open and validate a vdev that was 2665 * previously marked permanently unavailable, clear that state 2666 * now. 
2667 */ 2668 if (vd->vdev_not_present) 2669 vd->vdev_not_present = 0; 2670 2671 return (0); 2672 } 2673 2674 static void 2675 vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) 2676 { 2677 if (svd != NULL && *dvd != NULL) { 2678 if (strcmp(svd, *dvd) != 0) { 2679 zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " 2680 "from '%s' to '%s'", (u_longlong_t)guid, prefix, 2681 *dvd, svd); 2682 spa_strfree(*dvd); 2683 *dvd = spa_strdup(svd); 2684 } 2685 } else if (svd != NULL) { 2686 *dvd = spa_strdup(svd); 2687 zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", 2688 (u_longlong_t)guid, *dvd); 2689 } 2690 } 2691 2692 static void 2693 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) 2694 { 2695 char *old, *new; 2696 2697 vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, 2698 dvd->vdev_guid); 2699 2700 vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, 2701 dvd->vdev_guid); 2702 2703 vdev_update_path("vdev_physpath", svd->vdev_physpath, 2704 &dvd->vdev_physpath, dvd->vdev_guid); 2705 2706 /* 2707 * Our enclosure sysfs path may have changed between imports 2708 */ 2709 old = dvd->vdev_enc_sysfs_path; 2710 new = svd->vdev_enc_sysfs_path; 2711 if ((old != NULL && new == NULL) || 2712 (old == NULL && new != NULL) || 2713 ((old != NULL && new != NULL) && strcmp(new, old) != 0)) { 2714 zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path " 2715 "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, 2716 old, new); 2717 2718 if (dvd->vdev_enc_sysfs_path) 2719 spa_strfree(dvd->vdev_enc_sysfs_path); 2720 2721 if (svd->vdev_enc_sysfs_path) { 2722 dvd->vdev_enc_sysfs_path = spa_strdup( 2723 svd->vdev_enc_sysfs_path); 2724 } else { 2725 dvd->vdev_enc_sysfs_path = NULL; 2726 } 2727 } 2728 } 2729 2730 /* 2731 * Recursively copy vdev paths from one vdev to another. Source and destination 2732 * vdev trees must have same geometry otherwise return error. Intended to copy 2733 * paths from userland config into MOS config. 
2734 */ 2735 int 2736 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) 2737 { 2738 if ((svd->vdev_ops == &vdev_missing_ops) || 2739 (svd->vdev_ishole && dvd->vdev_ishole) || 2740 (dvd->vdev_ops == &vdev_indirect_ops)) 2741 return (0); 2742 2743 if (svd->vdev_ops != dvd->vdev_ops) { 2744 vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", 2745 svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); 2746 return (SET_ERROR(EINVAL)); 2747 } 2748 2749 if (svd->vdev_guid != dvd->vdev_guid) { 2750 vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " 2751 "%llu)", (u_longlong_t)svd->vdev_guid, 2752 (u_longlong_t)dvd->vdev_guid); 2753 return (SET_ERROR(EINVAL)); 2754 } 2755 2756 if (svd->vdev_children != dvd->vdev_children) { 2757 vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " 2758 "%llu != %llu", (u_longlong_t)svd->vdev_children, 2759 (u_longlong_t)dvd->vdev_children); 2760 return (SET_ERROR(EINVAL)); 2761 } 2762 2763 for (uint64_t i = 0; i < svd->vdev_children; i++) { 2764 int error = vdev_copy_path_strict(svd->vdev_child[i], 2765 dvd->vdev_child[i]); 2766 if (error != 0) 2767 return (error); 2768 } 2769 2770 if (svd->vdev_ops->vdev_op_leaf) 2771 vdev_copy_path_impl(svd, dvd); 2772 2773 return (0); 2774 } 2775 2776 static void 2777 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) 2778 { 2779 ASSERT(stvd->vdev_top == stvd); 2780 ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); 2781 2782 for (uint64_t i = 0; i < dvd->vdev_children; i++) { 2783 vdev_copy_path_search(stvd, dvd->vdev_child[i]); 2784 } 2785 2786 if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) 2787 return; 2788 2789 /* 2790 * The idea here is that while a vdev can shift positions within 2791 * a top vdev (when replacing, attaching mirror, etc.) it cannot 2792 * step outside of it. 
2793 */ 2794 vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); 2795 2796 if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) 2797 return; 2798 2799 ASSERT(vd->vdev_ops->vdev_op_leaf); 2800 2801 vdev_copy_path_impl(vd, dvd); 2802 } 2803 2804 /* 2805 * Recursively copy vdev paths from one root vdev to another. Source and 2806 * destination vdev trees may differ in geometry. For each destination leaf 2807 * vdev, search a vdev with the same guid and top vdev id in the source. 2808 * Intended to copy paths from userland config into MOS config. 2809 */ 2810 void 2811 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) 2812 { 2813 uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); 2814 ASSERT(srvd->vdev_ops == &vdev_root_ops); 2815 ASSERT(drvd->vdev_ops == &vdev_root_ops); 2816 2817 for (uint64_t i = 0; i < children; i++) { 2818 vdev_copy_path_search(srvd->vdev_child[i], 2819 drvd->vdev_child[i]); 2820 } 2821 } 2822 2823 /* 2824 * Close a virtual device. 2825 */ 2826 void 2827 vdev_close(vdev_t *vd) 2828 { 2829 vdev_t *pvd = vd->vdev_parent; 2830 spa_t *spa __maybe_unused = vd->vdev_spa; 2831 2832 ASSERT(vd != NULL); 2833 ASSERT(vd->vdev_open_thread == curthread || 2834 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2835 2836 /* 2837 * If our parent is reopening, then we are as well, unless we are 2838 * going offline. 2839 */ 2840 if (pvd != NULL && pvd->vdev_reopening) 2841 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); 2842 2843 vd->vdev_ops->vdev_op_close(vd); 2844 2845 /* 2846 * We record the previous state before we close it, so that if we are 2847 * doing a reopen(), we don't generate FMA ereports if we notice that 2848 * it's still faulted. 
2849 */ 2850 vd->vdev_prevstate = vd->vdev_state; 2851 2852 if (vd->vdev_offline) 2853 vd->vdev_state = VDEV_STATE_OFFLINE; 2854 else 2855 vd->vdev_state = VDEV_STATE_CLOSED; 2856 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 2857 } 2858 2859 void 2860 vdev_hold(vdev_t *vd) 2861 { 2862 spa_t *spa = vd->vdev_spa; 2863 2864 ASSERT(spa_is_root(spa)); 2865 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 2866 return; 2867 2868 for (int c = 0; c < vd->vdev_children; c++) 2869 vdev_hold(vd->vdev_child[c]); 2870 2871 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL) 2872 vd->vdev_ops->vdev_op_hold(vd); 2873 } 2874 2875 void 2876 vdev_rele(vdev_t *vd) 2877 { 2878 ASSERT(spa_is_root(vd->vdev_spa)); 2879 for (int c = 0; c < vd->vdev_children; c++) 2880 vdev_rele(vd->vdev_child[c]); 2881 2882 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL) 2883 vd->vdev_ops->vdev_op_rele(vd); 2884 } 2885 2886 /* 2887 * Reopen all interior vdevs and any unopened leaves. We don't actually 2888 * reopen leaf vdevs which had previously been opened as they might deadlock 2889 * on the spa_config_lock. Instead we only obtain the leaf's physical size. 2890 * If the leaf has never been opened then open it, as usual. 2891 */ 2892 void 2893 vdev_reopen(vdev_t *vd) 2894 { 2895 spa_t *spa = vd->vdev_spa; 2896 2897 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2898 2899 /* set the reopening flag unless we're taking the vdev offline */ 2900 vd->vdev_reopening = !vd->vdev_offline; 2901 vdev_close(vd); 2902 (void) vdev_open(vd); 2903 2904 /* 2905 * Call vdev_validate() here to make sure we have the same device. 2906 * Otherwise, a device with an invalid label could be successfully 2907 * opened in response to vdev_reopen(). 
2908 */ 2909 if (vd->vdev_aux) { 2910 (void) vdev_validate_aux(vd); 2911 if (vdev_readable(vd) && vdev_writeable(vd) && 2912 vd->vdev_aux == &spa->spa_l2cache) { 2913 /* 2914 * In case the vdev is present we should evict all ARC 2915 * buffers and pointers to log blocks and reclaim their 2916 * space before restoring its contents to L2ARC. 2917 */ 2918 if (l2arc_vdev_present(vd)) { 2919 l2arc_rebuild_vdev(vd, B_TRUE); 2920 } else { 2921 l2arc_add_vdev(spa, vd); 2922 } 2923 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 2924 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2925 } 2926 } else { 2927 (void) vdev_validate(vd); 2928 } 2929 2930 /* 2931 * Recheck if resilver is still needed and cancel any 2932 * scheduled resilver if resilver is unneeded. 2933 */ 2934 if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && 2935 spa->spa_async_tasks & SPA_ASYNC_RESILVER) { 2936 mutex_enter(&spa->spa_async_lock); 2937 spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; 2938 mutex_exit(&spa->spa_async_lock); 2939 } 2940 2941 /* 2942 * Reassess parent vdev's health. 2943 */ 2944 vdev_propagate_state(vd); 2945 } 2946 2947 int 2948 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 2949 { 2950 int error; 2951 2952 /* 2953 * Normally, partial opens (e.g. of a mirror) are allowed. 2954 * For a create, however, we want to fail the request if 2955 * there are any components we can't open. 2956 */ 2957 error = vdev_open(vd); 2958 2959 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 2960 vdev_close(vd); 2961 return (error ? error : SET_ERROR(ENXIO)); 2962 } 2963 2964 /* 2965 * Recursively load DTLs and initialize all labels. 2966 */ 2967 if ((error = vdev_dtl_load(vd)) != 0 || 2968 (error = vdev_label_init(vd, txg, isreplacing ? 
2969 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 2970 vdev_close(vd); 2971 return (error); 2972 } 2973 2974 return (0); 2975 } 2976 2977 void 2978 vdev_metaslab_set_size(vdev_t *vd) 2979 { 2980 uint64_t asize = vd->vdev_asize; 2981 uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; 2982 uint64_t ms_shift; 2983 2984 /* 2985 * There are two dimensions to the metaslab sizing calculation: 2986 * the size of the metaslab and the count of metaslabs per vdev. 2987 * 2988 * The default values used below are a good balance between memory 2989 * usage (larger metaslab size means more memory needed for loaded 2990 * metaslabs; more metaslabs means more memory needed for the 2991 * metaslab_t structs), metaslab load time (larger metaslabs take 2992 * longer to load), and metaslab sync time (more metaslabs means 2993 * more time spent syncing all of them). 2994 * 2995 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs. 2996 * The range of the dimensions are as follows: 2997 * 2998 * 2^29 <= ms_size <= 2^34 2999 * 16 <= ms_count <= 131,072 3000 * 3001 * On the lower end of vdev sizes, we aim for metaslabs sizes of 3002 * at least 512MB (2^29) to minimize fragmentation effects when 3003 * testing with smaller devices. However, the count constraint 3004 * of at least 16 metaslabs will override this minimum size goal. 3005 * 3006 * On the upper end of vdev sizes, we aim for a maximum metaslab 3007 * size of 16GB. However, we will cap the total count to 2^17 3008 * metaslabs to keep our memory footprint in check and let the 3009 * metaslab size grow from there if that limit is hit. 3010 * 3011 * The net effect of applying above constrains is summarized below. 
3012 * 3013 * vdev size metaslab count 3014 * --------------|----------------- 3015 * < 8GB ~16 3016 * 8GB - 100GB one per 512MB 3017 * 100GB - 3TB ~200 3018 * 3TB - 2PB one per 16GB 3019 * > 2PB ~131,072 3020 * -------------------------------- 3021 * 3022 * Finally, note that all of the above calculate the initial 3023 * number of metaslabs. Expanding a top-level vdev will result 3024 * in additional metaslabs being allocated making it possible 3025 * to exceed the zfs_vdev_ms_count_limit. 3026 */ 3027 3028 if (ms_count < zfs_vdev_min_ms_count) 3029 ms_shift = highbit64(asize / zfs_vdev_min_ms_count); 3030 else if (ms_count > zfs_vdev_default_ms_count) 3031 ms_shift = highbit64(asize / zfs_vdev_default_ms_count); 3032 else 3033 ms_shift = zfs_vdev_default_ms_shift; 3034 3035 if (ms_shift < SPA_MAXBLOCKSHIFT) { 3036 ms_shift = SPA_MAXBLOCKSHIFT; 3037 } else if (ms_shift > zfs_vdev_max_ms_shift) { 3038 ms_shift = zfs_vdev_max_ms_shift; 3039 /* cap the total count to constrain memory footprint */ 3040 if ((asize >> ms_shift) > zfs_vdev_ms_count_limit) 3041 ms_shift = highbit64(asize / zfs_vdev_ms_count_limit); 3042 } 3043 3044 vd->vdev_ms_shift = ms_shift; 3045 ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT); 3046 } 3047 3048 void 3049 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 3050 { 3051 ASSERT(vd == vd->vdev_top); 3052 /* indirect vdevs don't have metaslabs or dtls */ 3053 ASSERT(vdev_is_concrete(vd) || flags == 0); 3054 ASSERT(ISP2(flags)); 3055 ASSERT(spa_writeable(vd->vdev_spa)); 3056 3057 if (flags & VDD_METASLAB) 3058 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 3059 3060 if (flags & VDD_DTL) 3061 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 3062 3063 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 3064 } 3065 3066 void 3067 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg) 3068 { 3069 for (int c = 0; c < vd->vdev_children; c++) 3070 vdev_dirty_leaves(vd->vdev_child[c], flags, txg); 3071 3072 if 
(vd->vdev_ops->vdev_op_leaf) 3073 vdev_dirty(vd->vdev_top, flags, vd, txg); 3074 } 3075 3076 /* 3077 * DTLs. 3078 * 3079 * A vdev's DTL (dirty time log) is the set of transaction groups for which 3080 * the vdev has less than perfect replication. There are four kinds of DTL: 3081 * 3082 * DTL_MISSING: txgs for which the vdev has no valid copies of the data 3083 * 3084 * DTL_PARTIAL: txgs for which data is available, but not fully replicated 3085 * 3086 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon 3087 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of 3088 * txgs that was scrubbed. 3089 * 3090 * DTL_OUTAGE: txgs which cannot currently be read, whether due to 3091 * persistent errors or just some device being offline. 3092 * Unlike the other three, the DTL_OUTAGE map is not generally 3093 * maintained; it's only computed when needed, typically to 3094 * determine whether a device can be detached. 3095 * 3096 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device 3097 * either has the data or it doesn't. 3098 * 3099 * For interior vdevs such as mirror and RAID-Z the picture is more complex. 3100 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because 3101 * if any child is less than fully replicated, then so is its parent. 3102 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, 3103 * comprising only those txgs which appear in 'maxfaults' or more children; 3104 * those are the txgs we don't have enough replication to read. For example, 3105 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); 3106 * thus, its DTL_MISSING consists of the set of txgs that appear in more than 3107 * two child DTL_MISSING maps. 3108 * 3109 * It should be clear from the above that to compute the DTLs and outage maps 3110 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. 3111 * Therefore, that is all we keep on disk. 
When loading the pool, or after 3112 * a configuration change, we generate all other DTLs from first principles. 3113 */ 3114 void 3115 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 3116 { 3117 zfs_range_tree_t *rt = vd->vdev_dtl[t]; 3118 3119 ASSERT(t < DTL_TYPES); 3120 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 3121 ASSERT(spa_writeable(vd->vdev_spa)); 3122 3123 mutex_enter(&vd->vdev_dtl_lock); 3124 if (!zfs_range_tree_contains(rt, txg, size)) { 3125 /* Clear whatever is there already. */ 3126 zfs_range_tree_clear(rt, txg, size); 3127 zfs_range_tree_add(rt, txg, size); 3128 } 3129 mutex_exit(&vd->vdev_dtl_lock); 3130 } 3131 3132 boolean_t 3133 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) 3134 { 3135 zfs_range_tree_t *rt = vd->vdev_dtl[t]; 3136 boolean_t dirty = B_FALSE; 3137 3138 ASSERT(t < DTL_TYPES); 3139 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 3140 3141 /* 3142 * While we are loading the pool, the DTLs have not been loaded yet. 3143 * This isn't a problem but it can result in devices being tried 3144 * which are known to not have the data. In which case, the import 3145 * is relying on the checksum to ensure that we get the right data. 3146 * Note that while importing we are only reading the MOS, which is 3147 * always checksummed. 3148 */ 3149 mutex_enter(&vd->vdev_dtl_lock); 3150 if (!zfs_range_tree_is_empty(rt)) 3151 dirty = zfs_range_tree_contains(rt, txg, size); 3152 mutex_exit(&vd->vdev_dtl_lock); 3153 3154 return (dirty); 3155 } 3156 3157 boolean_t 3158 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) 3159 { 3160 zfs_range_tree_t *rt = vd->vdev_dtl[t]; 3161 boolean_t empty; 3162 3163 mutex_enter(&vd->vdev_dtl_lock); 3164 empty = zfs_range_tree_is_empty(rt); 3165 mutex_exit(&vd->vdev_dtl_lock); 3166 3167 return (empty); 3168 } 3169 3170 /* 3171 * Check if the txg falls within the range which must be 3172 * resilvered. DVAs outside this range can always be skipped. 
3173 */ 3174 boolean_t 3175 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3176 uint64_t phys_birth) 3177 { 3178 (void) dva, (void) psize; 3179 3180 /* Set by sequential resilver. */ 3181 if (phys_birth == TXG_UNKNOWN) 3182 return (B_TRUE); 3183 3184 return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)); 3185 } 3186 3187 /* 3188 * Returns B_TRUE if the vdev determines the DVA needs to be resilvered. 3189 */ 3190 boolean_t 3191 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, 3192 uint64_t phys_birth) 3193 { 3194 ASSERT(vd != vd->vdev_spa->spa_root_vdev); 3195 3196 if (vd->vdev_ops->vdev_op_need_resilver == NULL || 3197 vd->vdev_ops->vdev_op_leaf) 3198 return (B_TRUE); 3199 3200 return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize, 3201 phys_birth)); 3202 } 3203 3204 /* 3205 * Returns the lowest txg in the DTL range. 3206 */ 3207 static uint64_t 3208 vdev_dtl_min(vdev_t *vd) 3209 { 3210 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 3211 ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 3212 ASSERT0(vd->vdev_children); 3213 3214 return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1); 3215 } 3216 3217 /* 3218 * Returns the highest txg in the DTL. 3219 */ 3220 static uint64_t 3221 vdev_dtl_max(vdev_t *vd) 3222 { 3223 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock)); 3224 ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0); 3225 ASSERT0(vd->vdev_children); 3226 3227 return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING])); 3228 } 3229 3230 /* 3231 * Determine if a resilvering vdev should remove any DTL entries from 3232 * its range. If the vdev was resilvering for the entire duration of the 3233 * scan then it should excise that range from its DTLs. Otherwise, this 3234 * vdev is considered partially resilvered and should leave its DTL 3235 * entries intact. The comment in vdev_dtl_reassess() describes how we 3236 * excise the DTLs. 
3237 */ 3238 static boolean_t 3239 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done) 3240 { 3241 ASSERT0(vd->vdev_children); 3242 3243 if (vd->vdev_state < VDEV_STATE_DEGRADED) 3244 return (B_FALSE); 3245 3246 if (vd->vdev_resilver_deferred) 3247 return (B_FALSE); 3248 3249 if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) 3250 return (B_TRUE); 3251 3252 if (rebuild_done) { 3253 vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; 3254 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; 3255 3256 /* Rebuild not initiated by attach */ 3257 if (vd->vdev_rebuild_txg == 0) 3258 return (B_TRUE); 3259 3260 /* 3261 * When a rebuild completes without error then all missing data 3262 * up to the rebuild max txg has been reconstructed and the DTL 3263 * is eligible for excision. 3264 */ 3265 if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE && 3266 vdev_dtl_max(vd) <= vrp->vrp_max_txg) { 3267 ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd)); 3268 ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg); 3269 ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg); 3270 return (B_TRUE); 3271 } 3272 } else { 3273 dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; 3274 dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys; 3275 3276 /* Resilver not initiated by attach */ 3277 if (vd->vdev_resilver_txg == 0) 3278 return (B_TRUE); 3279 3280 /* 3281 * When a resilver is initiated the scan will assign the 3282 * scn_max_txg value to the highest txg value that exists 3283 * in all DTLs. If this device's max DTL is not part of this 3284 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg] 3285 * then it is not eligible for excision. 
3286 */ 3287 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) { 3288 ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd)); 3289 ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg); 3290 ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg); 3291 return (B_TRUE); 3292 } 3293 } 3294 3295 return (B_FALSE); 3296 } 3297 3298 /* 3299 * Reassess DTLs after a config change or scrub completion. If txg == 0 no 3300 * write operations will be issued to the pool. 3301 */ 3302 static void 3303 vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, 3304 boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting) 3305 { 3306 spa_t *spa = vd->vdev_spa; 3307 avl_tree_t reftree; 3308 int minref; 3309 3310 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 3311 3312 for (int c = 0; c < vd->vdev_children; c++) 3313 vdev_dtl_reassess_impl(vd->vdev_child[c], txg, 3314 scrub_txg, scrub_done, rebuild_done, faulting); 3315 3316 if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux) 3317 return; 3318 3319 if (vd->vdev_ops->vdev_op_leaf) { 3320 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 3321 vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config; 3322 boolean_t check_excise = B_FALSE; 3323 boolean_t wasempty = B_TRUE; 3324 3325 mutex_enter(&vd->vdev_dtl_lock); 3326 3327 /* 3328 * If requested, pretend the scan or rebuild completed cleanly. 3329 */ 3330 if (zfs_scan_ignore_errors) { 3331 if (scn != NULL) 3332 scn->scn_phys.scn_errors = 0; 3333 if (vr != NULL) 3334 vr->vr_rebuild_phys.vrp_errors = 0; 3335 } 3336 3337 if (scrub_txg != 0 && 3338 !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { 3339 wasempty = B_FALSE; 3340 zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d " 3341 "dtl:%llu/%llu errors:%llu", 3342 (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg, 3343 (u_longlong_t)scrub_txg, spa->spa_scrub_started, 3344 (u_longlong_t)vdev_dtl_min(vd), 3345 (u_longlong_t)vdev_dtl_max(vd), 3346 (u_longlong_t)(scn ? 
scn->scn_phys.scn_errors : 0)); 3347 } 3348 3349 /* 3350 * If we've completed a scrub/resilver or a rebuild cleanly 3351 * then determine if this vdev should remove any DTLs. We 3352 * only want to excise regions on vdevs that were available 3353 * during the entire duration of this scan. 3354 */ 3355 if (rebuild_done && 3356 vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) { 3357 check_excise = B_TRUE; 3358 } else { 3359 if (spa->spa_scrub_started || 3360 (scn != NULL && scn->scn_phys.scn_errors == 0)) { 3361 check_excise = B_TRUE; 3362 } 3363 } 3364 3365 if (scrub_txg && check_excise && 3366 vdev_dtl_should_excise(vd, rebuild_done)) { 3367 /* 3368 * We completed a scrub, resilver or rebuild up to 3369 * scrub_txg. If we did it without rebooting, then 3370 * the scrub dtl will be valid, so excise the old 3371 * region and fold in the scrub dtl. Otherwise, 3372 * leave the dtl as-is if there was an error. 3373 * 3374 * There's little trick here: to excise the beginning 3375 * of the DTL_MISSING map, we put it into a reference 3376 * tree and then add a segment with refcnt -1 that 3377 * covers the range [0, scrub_txg). This means 3378 * that each txg in that range has refcnt -1 or 0. 3379 * We then add DTL_SCRUB with a refcnt of 2, so that 3380 * entries in the range [0, scrub_txg) will have a 3381 * positive refcnt -- either 1 or 2. We then convert 3382 * the reference tree into the new DTL_MISSING map. 
3383 */ 3384 space_reftree_create(&reftree); 3385 space_reftree_add_map(&reftree, 3386 vd->vdev_dtl[DTL_MISSING], 1); 3387 space_reftree_add_seg(&reftree, 0, scrub_txg, -1); 3388 space_reftree_add_map(&reftree, 3389 vd->vdev_dtl[DTL_SCRUB], 2); 3390 space_reftree_generate_map(&reftree, 3391 vd->vdev_dtl[DTL_MISSING], 1); 3392 space_reftree_destroy(&reftree); 3393 3394 if (!zfs_range_tree_is_empty( 3395 vd->vdev_dtl[DTL_MISSING])) { 3396 zfs_dbgmsg("update DTL_MISSING:%llu/%llu", 3397 (u_longlong_t)vdev_dtl_min(vd), 3398 (u_longlong_t)vdev_dtl_max(vd)); 3399 } else if (!wasempty) { 3400 zfs_dbgmsg("DTL_MISSING is now empty"); 3401 } 3402 } 3403 zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 3404 zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], 3405 zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); 3406 if (scrub_done) 3407 zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, 3408 NULL); 3409 zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 3410 3411 /* 3412 * For the faulting case, treat members of a replacing vdev 3413 * as if they are not available. It's more likely than not that 3414 * a vdev in a replacing vdev could encounter read errors so 3415 * treat it as not being able to contribute. 3416 */ 3417 if (!vdev_readable(vd) || 3418 (faulting && vd->vdev_parent != NULL && 3419 vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) { 3420 zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 3421 } else { 3422 zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING], 3423 zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); 3424 } 3425 3426 /* 3427 * If the vdev was resilvering or rebuilding and no longer 3428 * has any DTLs then reset the appropriate flag and dirty 3429 * the top level so that we persist the change. 
3430 */ 3431 if (txg != 0 && 3432 zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && 3433 zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { 3434 if (vd->vdev_rebuild_txg != 0) { 3435 vd->vdev_rebuild_txg = 0; 3436 vdev_config_dirty(vd->vdev_top); 3437 } else if (vd->vdev_resilver_txg != 0) { 3438 vd->vdev_resilver_txg = 0; 3439 vdev_config_dirty(vd->vdev_top); 3440 } 3441 } 3442 3443 mutex_exit(&vd->vdev_dtl_lock); 3444 3445 if (txg != 0) 3446 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 3447 } else { 3448 mutex_enter(&vd->vdev_dtl_lock); 3449 for (int t = 0; t < DTL_TYPES; t++) { 3450 /* account for child's outage in parent's missing map */ 3451 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; 3452 if (t == DTL_SCRUB) { 3453 /* leaf vdevs only */ 3454 continue; 3455 } 3456 int children = vd->vdev_children; 3457 int width = children; 3458 if (t == DTL_PARTIAL) { 3459 /* i.e. non-zero */ 3460 minref = 1; 3461 } else if (vdev_get_nparity(vd) != 0) { 3462 /* RAIDZ, DRAID */ 3463 minref = vdev_get_nparity(vd) + 1; 3464 if (vd->vdev_ops == &vdev_draid_ops) { 3465 vdev_draid_config_t *vdc = vd->vdev_tsd; 3466 minref = vdc->vdc_nparity + 1; 3467 children = vdc->vdc_children; 3468 } 3469 } else { 3470 /* any kind of mirror */ 3471 minref = vd->vdev_children; 3472 } 3473 /* 3474 * For dRAID with failure domains, count failures 3475 * only once for any i-th child failure in each failure 3476 * group, but only if the failures threshold is not 3477 * reached in any of the groups. 
3478 */ 3479 boolean_t safe2skip = B_FALSE; 3480 if (width > children && 3481 vdev_draid_fail_domain_allowed(vd)) 3482 safe2skip = B_TRUE; 3483 3484 space_reftree_create(&reftree); 3485 for (int c = 0; c < children; c++) { 3486 for (int i = c; i < width; i += children) { 3487 vdev_t *cvd = vd->vdev_child[i]; 3488 3489 mutex_enter(&cvd->vdev_dtl_lock); 3490 space_reftree_add_map(&reftree, 3491 cvd->vdev_dtl[s], 1); 3492 boolean_t empty = 3493 zfs_range_tree_is_empty( 3494 cvd->vdev_dtl[s]); 3495 mutex_exit(&cvd->vdev_dtl_lock); 3496 3497 if (s == DTL_OUTAGE && !empty && 3498 safe2skip) 3499 break; 3500 } 3501 } 3502 space_reftree_generate_map(&reftree, 3503 vd->vdev_dtl[t], minref); 3504 space_reftree_destroy(&reftree); 3505 } 3506 mutex_exit(&vd->vdev_dtl_lock); 3507 } 3508 3509 if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) { 3510 raidz_dtl_reassessed(vd); 3511 } 3512 } 3513 3514 void 3515 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, 3516 boolean_t scrub_done, boolean_t rebuild_done) 3517 { 3518 return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done, 3519 rebuild_done, B_FALSE)); 3520 } 3521 3522 /* 3523 * Iterate over all the vdevs except spare, and post kobj events 3524 */ 3525 void 3526 vdev_post_kobj_evt(vdev_t *vd) 3527 { 3528 if (vd->vdev_ops->vdev_op_kobj_evt_post && 3529 vd->vdev_kobj_flag == B_FALSE) { 3530 vd->vdev_kobj_flag = B_TRUE; 3531 vd->vdev_ops->vdev_op_kobj_evt_post(vd); 3532 } 3533 3534 for (int c = 0; c < vd->vdev_children; c++) 3535 vdev_post_kobj_evt(vd->vdev_child[c]); 3536 } 3537 3538 /* 3539 * Iterate over all the vdevs except spare, and clear kobj events 3540 */ 3541 void 3542 vdev_clear_kobj_evt(vdev_t *vd) 3543 { 3544 vd->vdev_kobj_flag = B_FALSE; 3545 3546 for (int c = 0; c < vd->vdev_children; c++) 3547 vdev_clear_kobj_evt(vd->vdev_child[c]); 3548 } 3549 3550 int 3551 vdev_dtl_load(vdev_t *vd) 3552 { 3553 spa_t *spa = vd->vdev_spa; 3554 objset_t *mos = spa->spa_meta_objset; 3555 zfs_range_tree_t *rt; 3556 
int error = 0; 3557 3558 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { 3559 ASSERT(vdev_is_concrete(vd)); 3560 3561 /* 3562 * If the dtl cannot be sync'd there is no need to open it. 3563 */ 3564 if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps) 3565 return (0); 3566 3567 error = space_map_open(&vd->vdev_dtl_sm, mos, 3568 vd->vdev_dtl_object, 0, -1ULL, 0); 3569 if (error) 3570 return (error); 3571 ASSERT(vd->vdev_dtl_sm != NULL); 3572 3573 rt = zfs_range_tree_create_flags( 3574 NULL, ZFS_RANGE_SEG64, NULL, 0, 0, 3575 ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt")); 3576 error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC); 3577 if (error == 0) { 3578 mutex_enter(&vd->vdev_dtl_lock); 3579 zfs_range_tree_walk(rt, zfs_range_tree_add, 3580 vd->vdev_dtl[DTL_MISSING]); 3581 mutex_exit(&vd->vdev_dtl_lock); 3582 } 3583 3584 zfs_range_tree_vacate(rt, NULL, NULL); 3585 zfs_range_tree_destroy(rt); 3586 3587 return (error); 3588 } 3589 3590 for (int c = 0; c < vd->vdev_children; c++) { 3591 error = vdev_dtl_load(vd->vdev_child[c]); 3592 if (error != 0) 3593 break; 3594 } 3595 3596 return (error); 3597 } 3598 3599 static void 3600 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) 3601 { 3602 spa_t *spa = vd->vdev_spa; 3603 objset_t *mos = spa->spa_meta_objset; 3604 vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; 3605 const char *string; 3606 3607 ASSERT(alloc_bias != VDEV_BIAS_NONE); 3608 3609 string = 3610 (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG : 3611 (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : 3612 (alloc_bias == VDEV_BIAS_DEDUP) ? 
VDEV_ALLOC_BIAS_DEDUP : NULL; 3613 3614 ASSERT(string != NULL); 3615 VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, 3616 1, strlen(string) + 1, string, tx)); 3617 3618 if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { 3619 spa_activate_allocation_classes(spa, tx); 3620 } 3621 } 3622 3623 void 3624 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) 3625 { 3626 spa_t *spa = vd->vdev_spa; 3627 3628 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); 3629 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 3630 zapobj, tx)); 3631 } 3632 3633 uint64_t 3634 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) 3635 { 3636 spa_t *spa = vd->vdev_spa; 3637 uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, 3638 DMU_OT_NONE, 0, tx); 3639 3640 ASSERT(zap != 0); 3641 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 3642 zap, tx)); 3643 3644 return (zap); 3645 } 3646 3647 void 3648 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) 3649 { 3650 if (vd->vdev_ops != &vdev_hole_ops && 3651 vd->vdev_ops != &vdev_missing_ops && 3652 vd->vdev_ops != &vdev_root_ops && 3653 !vd->vdev_top->vdev_removing) { 3654 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { 3655 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); 3656 } 3657 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { 3658 vd->vdev_top_zap = vdev_create_link_zap(vd, tx); 3659 if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) 3660 vdev_zap_allocation_data(vd, tx); 3661 } 3662 } 3663 if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && 3664 spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { 3665 if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) 3666 spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); 3667 vd->vdev_root_zap = vdev_create_link_zap(vd, tx); 3668 } 3669 3670 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3671 vdev_construct_zaps(vd->vdev_child[i], tx); 3672 } 3673 } 3674 
3675 static void 3676 vdev_dtl_sync(vdev_t *vd, uint64_t txg) 3677 { 3678 spa_t *spa = vd->vdev_spa; 3679 zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; 3680 objset_t *mos = spa->spa_meta_objset; 3681 zfs_range_tree_t *rtsync; 3682 dmu_tx_t *tx; 3683 uint64_t object = space_map_object(vd->vdev_dtl_sm); 3684 3685 ASSERT(vdev_is_concrete(vd)); 3686 ASSERT(vd->vdev_ops->vdev_op_leaf); 3687 3688 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 3689 3690 if (vd->vdev_detached || vd->vdev_top->vdev_removing) { 3691 mutex_enter(&vd->vdev_dtl_lock); 3692 space_map_free(vd->vdev_dtl_sm, tx); 3693 space_map_close(vd->vdev_dtl_sm); 3694 vd->vdev_dtl_sm = NULL; 3695 mutex_exit(&vd->vdev_dtl_lock); 3696 3697 /* 3698 * We only destroy the leaf ZAP for detached leaves or for 3699 * removed log devices. Removed data devices handle leaf ZAP 3700 * cleanup later, once cancellation is no longer possible. 3701 */ 3702 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || 3703 vd->vdev_top->vdev_islog)) { 3704 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); 3705 vd->vdev_leaf_zap = 0; 3706 } 3707 3708 dmu_tx_commit(tx); 3709 return; 3710 } 3711 3712 if (vd->vdev_dtl_sm == NULL) { 3713 uint64_t new_object; 3714 3715 new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); 3716 VERIFY3U(new_object, !=, 0); 3717 3718 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 3719 0, -1ULL, 0)); 3720 ASSERT(vd->vdev_dtl_sm != NULL); 3721 } 3722 3723 rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0, 3724 ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync")); 3725 3726 mutex_enter(&vd->vdev_dtl_lock); 3727 zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync); 3728 mutex_exit(&vd->vdev_dtl_lock); 3729 3730 space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); 3731 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); 3732 zfs_range_tree_vacate(rtsync, NULL, NULL); 3733 3734 zfs_range_tree_destroy(rtsync); 3735 3736 /* 3737 * If the object 
for the space map has changed then dirty 3738 * the top level so that we update the config. 3739 */ 3740 if (object != space_map_object(vd->vdev_dtl_sm)) { 3741 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " 3742 "new object %llu", (u_longlong_t)txg, spa_name(spa), 3743 (u_longlong_t)object, 3744 (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); 3745 vdev_config_dirty(vd->vdev_top); 3746 } 3747 3748 dmu_tx_commit(tx); 3749 } 3750 3751 /* 3752 * Determine whether the specified vdev can be 3753 * - offlined 3754 * - detached 3755 * - removed 3756 * - faulted 3757 * without losing data. 3758 */ 3759 boolean_t 3760 vdev_dtl_required(vdev_t *vd) 3761 { 3762 spa_t *spa = vd->vdev_spa; 3763 vdev_t *tvd = vd->vdev_top; 3764 uint8_t cant_read = vd->vdev_cant_read; 3765 boolean_t required; 3766 boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED; 3767 3768 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3769 3770 if (vd == spa->spa_root_vdev || vd == tvd) 3771 return (B_TRUE); 3772 3773 /* 3774 * Temporarily mark the device as unreadable, and then determine 3775 * whether this results in any DTL outages in the top-level vdev. 3776 * If not, we can safely offline/detach/remove the device. 3777 */ 3778 vd->vdev_cant_read = B_TRUE; 3779 vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); 3780 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 3781 vd->vdev_cant_read = cant_read; 3782 vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting); 3783 3784 if (!required && zio_injection_enabled) { 3785 required = !!zio_handle_device_injection(vd, NULL, 3786 SET_ERROR(ECHILD)); 3787 } 3788 3789 return (required); 3790 } 3791 3792 /* 3793 * Determine if resilver is needed, and if so the txg range. 
3794 */ 3795 boolean_t 3796 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 3797 { 3798 boolean_t needed = B_FALSE; 3799 uint64_t thismin = UINT64_MAX; 3800 uint64_t thismax = 0; 3801 3802 if (vd->vdev_children == 0) { 3803 mutex_enter(&vd->vdev_dtl_lock); 3804 if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && 3805 vdev_writeable(vd)) { 3806 3807 thismin = vdev_dtl_min(vd); 3808 thismax = vdev_dtl_max(vd); 3809 needed = B_TRUE; 3810 } 3811 mutex_exit(&vd->vdev_dtl_lock); 3812 } else { 3813 for (int c = 0; c < vd->vdev_children; c++) { 3814 vdev_t *cvd = vd->vdev_child[c]; 3815 uint64_t cmin, cmax; 3816 3817 if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 3818 thismin = MIN(thismin, cmin); 3819 thismax = MAX(thismax, cmax); 3820 needed = B_TRUE; 3821 } 3822 } 3823 } 3824 3825 if (needed && minp) { 3826 *minp = thismin; 3827 *maxp = thismax; 3828 } 3829 return (needed); 3830 } 3831 3832 /* 3833 * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj 3834 * will contain either the checkpoint spacemap object or zero if none exists. 3835 * All other errors are returned to the caller. 3836 */ 3837 int 3838 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) 3839 { 3840 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); 3841 3842 if (vd->vdev_top_zap == 0) { 3843 *sm_obj = 0; 3844 return (0); 3845 } 3846 3847 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, 3848 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); 3849 if (error == ENOENT) { 3850 *sm_obj = 0; 3851 error = 0; 3852 } 3853 3854 return (error); 3855 } 3856 3857 int 3858 vdev_load(vdev_t *vd) 3859 { 3860 int children = vd->vdev_children; 3861 int error = 0; 3862 taskq_t *tq = NULL; 3863 3864 /* 3865 * It's only worthwhile to use the taskq for the root vdev, because the 3866 * slow part is metaslab_init, and that only happens for top-level 3867 * vdevs. 
3868 */ 3869 if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) { 3870 tq = taskq_create("vdev_load", children, minclsyspri, 3871 children, children, TASKQ_PREPOPULATE); 3872 } 3873 3874 /* 3875 * Recursively load all children. 3876 */ 3877 for (int c = 0; c < vd->vdev_children; c++) { 3878 vdev_t *cvd = vd->vdev_child[c]; 3879 3880 if (tq == NULL || vdev_uses_zvols(cvd)) { 3881 cvd->vdev_load_error = vdev_load(cvd); 3882 } else { 3883 VERIFY(taskq_dispatch(tq, vdev_load_child, 3884 cvd, TQ_SLEEP) != TASKQID_INVALID); 3885 } 3886 } 3887 3888 if (tq != NULL) { 3889 taskq_wait(tq); 3890 taskq_destroy(tq); 3891 } 3892 3893 for (int c = 0; c < vd->vdev_children; c++) { 3894 int error = vd->vdev_child[c]->vdev_load_error; 3895 3896 if (error != 0) 3897 return (error); 3898 } 3899 3900 vdev_set_deflate_ratio(vd); 3901 3902 if (vd->vdev_ops == &vdev_raidz_ops) { 3903 error = vdev_raidz_load(vd); 3904 if (error != 0) 3905 return (error); 3906 } 3907 3908 /* 3909 * On spa_load path, grab the allocation bias from our zap 3910 */ 3911 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { 3912 spa_t *spa = vd->vdev_spa; 3913 char bias_str[64]; 3914 3915 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, 3916 VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str), 3917 bias_str); 3918 if (error == 0) { 3919 ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE); 3920 vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str); 3921 } else if (error != ENOENT) { 3922 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3923 VDEV_AUX_CORRUPT_DATA); 3924 vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) " 3925 "failed [error=%d]", 3926 (u_longlong_t)vd->vdev_top_zap, error); 3927 return (error); 3928 } 3929 } 3930 3931 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { 3932 spa_t *spa = vd->vdev_spa; 3933 uint64_t failfast; 3934 3935 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, 3936 vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast), 3937 1, &failfast); 3938 if 
(error == 0) { 3939 vd->vdev_failfast = failfast; 3940 } else if (error == ENOENT) { 3941 vd->vdev_failfast = ZPROP_BOOLEAN_INHERIT; 3942 } else { 3943 vdev_dbgmsg(vd, 3944 "vdev_load: zap_lookup(top_zap=%llu) " 3945 "failed [error=%d]", 3946 (u_longlong_t)vd->vdev_top_zap, error); 3947 } 3948 } 3949 3950 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { 3951 spa_t *spa = vd->vdev_spa; 3952 uint64_t autosit; 3953 3954 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, 3955 vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit), 3956 1, &autosit); 3957 if (error == 0) { 3958 vd->vdev_autosit = autosit == 1; 3959 } else if (error == ENOENT) { 3960 vd->vdev_autosit = vdev_prop_default_numeric( 3961 VDEV_PROP_AUTOSIT); 3962 } else { 3963 vdev_dbgmsg(vd, 3964 "vdev_load: zap_lookup(top_zap=%llu) " 3965 "failed [error=%d]", 3966 (u_longlong_t)vd->vdev_top_zap, error); 3967 } 3968 } 3969 3970 /* 3971 * Load any rebuild state from the top-level vdev zap. 3972 */ 3973 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) { 3974 error = vdev_rebuild_load(vd); 3975 if (error && error != ENOTSUP) { 3976 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 3977 VDEV_AUX_CORRUPT_DATA); 3978 vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load " 3979 "failed [error=%d]", error); 3980 return (error); 3981 } 3982 } 3983 3984 if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) { 3985 uint64_t zapobj; 3986 3987 if (vd->vdev_top_zap != 0) 3988 zapobj = vd->vdev_top_zap; 3989 else 3990 zapobj = vd->vdev_leaf_zap; 3991 3992 error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N, 3993 &vd->vdev_checksum_n); 3994 if (error && error != ENOENT) 3995 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 3996 "failed [error=%d]", (u_longlong_t)zapobj, error); 3997 3998 error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T, 3999 &vd->vdev_checksum_t); 4000 if (error && error != ENOENT) 4001 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 4002 "failed [error=%d]", (u_longlong_t)zapobj, error); 4003 4004 
error = vdev_prop_get_int(vd, VDEV_PROP_IO_N, 4005 &vd->vdev_io_n); 4006 if (error && error != ENOENT) 4007 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 4008 "failed [error=%d]", (u_longlong_t)zapobj, error); 4009 4010 error = vdev_prop_get_int(vd, VDEV_PROP_IO_T, 4011 &vd->vdev_io_t); 4012 if (error && error != ENOENT) 4013 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 4014 "failed [error=%d]", (u_longlong_t)zapobj, error); 4015 4016 error = vdev_prop_get_bool(vd, VDEV_PROP_SLOW_IO_EVENTS, 4017 &vd->vdev_slow_io_events); 4018 if (error && error != ENOENT) 4019 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 4020 "failed [error=%d]", (u_longlong_t)zapobj, error); 4021 error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N, 4022 &vd->vdev_slow_io_n); 4023 if (error && error != ENOENT) 4024 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 4025 "failed [error=%d]", (u_longlong_t)zapobj, error); 4026 4027 error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T, 4028 &vd->vdev_slow_io_t); 4029 if (error && error != ENOENT) 4030 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 4031 "failed [error=%d]", (u_longlong_t)zapobj, error); 4032 4033 error = vdev_prop_get_int(vd, VDEV_PROP_SCHEDULER, 4034 &vd->vdev_scheduler); 4035 if (error && error != ENOENT) 4036 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) " 4037 "failed [error=%d]", (u_longlong_t)zapobj, error); 4038 } 4039 4040 /* 4041 * If this is a top-level vdev, initialize its metaslabs. 4042 */ 4043 if (vd == vd->vdev_top && vdev_is_concrete(vd)) { 4044 vdev_metaslab_group_create(vd); 4045 4046 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) { 4047 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 4048 VDEV_AUX_CORRUPT_DATA); 4049 vdev_dbgmsg(vd, "vdev_load: invalid size. 
ashift=%llu, " 4050 "asize=%llu", (u_longlong_t)vd->vdev_ashift, 4051 (u_longlong_t)vd->vdev_asize); 4052 return (SET_ERROR(ENXIO)); 4053 } 4054 4055 error = vdev_metaslab_init(vd, 0); 4056 if (error != 0) { 4057 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed " 4058 "[error=%d]", error); 4059 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 4060 VDEV_AUX_CORRUPT_DATA); 4061 return (error); 4062 } 4063 4064 uint64_t checkpoint_sm_obj; 4065 error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj); 4066 if (error == 0 && checkpoint_sm_obj != 0) { 4067 objset_t *mos = spa_meta_objset(vd->vdev_spa); 4068 ASSERT(vd->vdev_asize != 0); 4069 ASSERT0P(vd->vdev_checkpoint_sm); 4070 4071 error = space_map_open(&vd->vdev_checkpoint_sm, 4072 mos, checkpoint_sm_obj, 0, vd->vdev_asize, 4073 vd->vdev_ashift); 4074 if (error != 0) { 4075 vdev_dbgmsg(vd, "vdev_load: space_map_open " 4076 "failed for checkpoint spacemap (obj %llu) " 4077 "[error=%d]", 4078 (u_longlong_t)checkpoint_sm_obj, error); 4079 return (error); 4080 } 4081 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); 4082 4083 /* 4084 * Since the checkpoint_sm contains free entries 4085 * exclusively we can use space_map_allocated() to 4086 * indicate the cumulative checkpointed space that 4087 * has been freed. 4088 */ 4089 vd->vdev_stat.vs_checkpoint_space = 4090 -space_map_allocated(vd->vdev_checkpoint_sm); 4091 vd->vdev_spa->spa_checkpoint_info.sci_dspace += 4092 vd->vdev_stat.vs_checkpoint_space; 4093 } else if (error != 0) { 4094 vdev_dbgmsg(vd, "vdev_load: failed to retrieve " 4095 "checkpoint space map object from vdev ZAP " 4096 "[error=%d]", error); 4097 return (error); 4098 } 4099 } 4100 4101 /* 4102 * If this is a leaf vdev, load its DTL. 
4103 */ 4104 if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) { 4105 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 4106 VDEV_AUX_CORRUPT_DATA); 4107 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed " 4108 "[error=%d]", error); 4109 return (error); 4110 } 4111 4112 uint64_t obsolete_sm_object; 4113 error = vdev_obsolete_sm_object(vd, &obsolete_sm_object); 4114 if (error == 0 && obsolete_sm_object != 0) { 4115 objset_t *mos = vd->vdev_spa->spa_meta_objset; 4116 ASSERT(vd->vdev_asize != 0); 4117 ASSERT0P(vd->vdev_obsolete_sm); 4118 4119 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos, 4120 obsolete_sm_object, 0, vd->vdev_asize, 0))) { 4121 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 4122 VDEV_AUX_CORRUPT_DATA); 4123 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for " 4124 "obsolete spacemap (obj %llu) [error=%d]", 4125 (u_longlong_t)obsolete_sm_object, error); 4126 return (error); 4127 } 4128 } else if (error != 0) { 4129 vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete " 4130 "space map object from vdev ZAP [error=%d]", error); 4131 return (error); 4132 } 4133 4134 return (0); 4135 } 4136 4137 /* 4138 * The special vdev case is used for hot spares and l2cache devices. Its 4139 * sole purpose it to set the vdev state for the associated vdev. To do this, 4140 * we make sure that we can open the underlying device, then try to read the 4141 * label, and make sure that the label is sane and that it hasn't been 4142 * repurposed to another pool. 
4143 */ 4144 int 4145 vdev_validate_aux(vdev_t *vd) 4146 { 4147 nvlist_t *label; 4148 uint64_t guid, version; 4149 uint64_t state; 4150 4151 if (!vdev_readable(vd)) 4152 return (0); 4153 4154 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) { 4155 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 4156 VDEV_AUX_CORRUPT_DATA); 4157 return (-1); 4158 } 4159 4160 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 4161 !SPA_VERSION_IS_SUPPORTED(version) || 4162 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 4163 guid != vd->vdev_guid || 4164 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 4165 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 4166 VDEV_AUX_CORRUPT_DATA); 4167 nvlist_free(label); 4168 return (-1); 4169 } 4170 4171 /* 4172 * We don't actually check the pool state here. If it's in fact in 4173 * use by another pool, we update this fact on the fly when requested. 4174 */ 4175 nvlist_free(label); 4176 return (0); 4177 } 4178 4179 static void 4180 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx) 4181 { 4182 objset_t *mos = spa_meta_objset(vd->vdev_spa); 4183 4184 if (vd->vdev_top_zap == 0) 4185 return; 4186 4187 uint64_t object = 0; 4188 int err = zap_lookup(mos, vd->vdev_top_zap, 4189 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object); 4190 if (err == ENOENT) 4191 return; 4192 VERIFY0(err); 4193 4194 VERIFY0(dmu_object_free(mos, object, tx)); 4195 VERIFY0(zap_remove(mos, vd->vdev_top_zap, 4196 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx)); 4197 } 4198 4199 /* 4200 * Free the objects used to store this vdev's spacemaps, and the array 4201 * that points to them. 
4202 */ 4203 void 4204 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) 4205 { 4206 if (vd->vdev_ms_array == 0) 4207 return; 4208 4209 objset_t *mos = vd->vdev_spa->spa_meta_objset; 4210 uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; 4211 size_t array_bytes = array_count * sizeof (uint64_t); 4212 uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); 4213 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, 4214 array_bytes, smobj_array, 0)); 4215 4216 for (uint64_t i = 0; i < array_count; i++) { 4217 uint64_t smobj = smobj_array[i]; 4218 if (smobj == 0) 4219 continue; 4220 4221 space_map_free_obj(mos, smobj, tx); 4222 } 4223 4224 kmem_free(smobj_array, array_bytes); 4225 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); 4226 vdev_destroy_ms_flush_data(vd, tx); 4227 vd->vdev_ms_array = 0; 4228 } 4229 4230 static void 4231 vdev_remove_empty_log(vdev_t *vd, uint64_t txg) 4232 { 4233 spa_t *spa = vd->vdev_spa; 4234 4235 ASSERT(vd->vdev_islog); 4236 ASSERT(vd == vd->vdev_top); 4237 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 4238 4239 dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 4240 4241 vdev_destroy_spacemaps(vd, tx); 4242 if (vd->vdev_top_zap != 0) { 4243 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); 4244 vd->vdev_top_zap = 0; 4245 } 4246 4247 dmu_tx_commit(tx); 4248 } 4249 4250 static void 4251 metaslab_sync_done_task(void *arg) 4252 { 4253 metaslab_t *msp = arg; 4254 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; 4255 metaslab_sync_done(msp, spa_syncing_txg(spa)); 4256 } 4257 4258 void 4259 vdev_sync_dispatch(vdev_t *vd, uint64_t txg) 4260 { 4261 spa_t *spa = vd->vdev_spa; 4262 4263 ASSERT(vdev_is_concrete(vd)); 4264 4265 for (metaslab_t *msp = txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)); 4266 msp; msp = txg_list_next(&vd->vdev_ms_list, msp, TXG_CLEAN(txg))) { 4267 (void) taskq_dispatch(spa->spa_sync_tq, 4268 metaslab_sync_done_task, msp, TQ_SLEEP); 4269 } 4270 } 4271 4272 void 4273 vdev_sync_done(vdev_t *vd, uint64_t txg) 4274 { 
4275 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 4276 4277 ASSERT(vdev_is_concrete(vd)); 4278 4279 taskq_wait(vd->vdev_spa->spa_sync_tq); 4280 4281 while (txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)) != NULL) 4282 ; 4283 4284 if (reassess) { 4285 metaslab_sync_reassess(vd->vdev_mg); 4286 if (vd->vdev_log_mg != NULL) 4287 metaslab_sync_reassess(vd->vdev_log_mg); 4288 } 4289 } 4290 4291 void 4292 vdev_sync(vdev_t *vd, uint64_t txg) 4293 { 4294 spa_t *spa = vd->vdev_spa; 4295 vdev_t *lvd; 4296 metaslab_t *msp; 4297 4298 ASSERT3U(txg, ==, spa->spa_syncing_txg); 4299 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 4300 if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) { 4301 ASSERT(vd->vdev_removing || 4302 vd->vdev_ops == &vdev_indirect_ops); 4303 4304 vdev_indirect_sync_obsolete(vd, tx); 4305 4306 /* 4307 * If the vdev is indirect, it can't have dirty 4308 * metaslabs or DTLs. 4309 */ 4310 if (vd->vdev_ops == &vdev_indirect_ops) { 4311 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); 4312 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); 4313 dmu_tx_commit(tx); 4314 return; 4315 } 4316 } 4317 4318 ASSERT(vdev_is_concrete(vd)); 4319 4320 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && 4321 !vd->vdev_removing) { 4322 ASSERT(vd == vd->vdev_top); 4323 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); 4324 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 4325 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 4326 ASSERT(vd->vdev_ms_array != 0); 4327 vdev_config_dirty(vd); 4328 } 4329 4330 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 4331 metaslab_sync(msp, txg); 4332 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 4333 } 4334 4335 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 4336 vdev_dtl_sync(lvd, txg); 4337 4338 /* 4339 * If this is an empty log device being removed, destroy the 4340 * metadata associated with it. 
4341 */ 4342 if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 4343 vdev_remove_empty_log(vd, txg); 4344 4345 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 4346 dmu_tx_commit(tx); 4347 } 4348 uint64_t 4349 vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg) 4350 { 4351 return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg)); 4352 } 4353 4354 /* 4355 * Return the amount of space that should be (or was) allocated for the given 4356 * psize (compressed block size) in the given TXG. Note that for expanded 4357 * RAIDZ vdevs, the size allocated for older BP's may be larger. See 4358 * vdev_raidz_psize_to_asize(). 4359 */ 4360 uint64_t 4361 vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg) 4362 { 4363 return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg)); 4364 } 4365 4366 uint64_t 4367 vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 4368 { 4369 return (vdev_psize_to_asize_txg(vd, psize, 0)); 4370 } 4371 4372 /* 4373 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 4374 * not be opened, and no I/O is attempted. 4375 */ 4376 int 4377 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 4378 { 4379 vdev_t *vd, *tvd; 4380 4381 spa_vdev_state_enter(spa, SCL_NONE); 4382 4383 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4384 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); 4385 4386 if (!vd->vdev_ops->vdev_op_leaf) 4387 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); 4388 4389 tvd = vd->vdev_top; 4390 4391 /* 4392 * If user did a 'zpool offline -f' then make the fault persist across 4393 * reboots. 4394 */ 4395 if (aux == VDEV_AUX_EXTERNAL_PERSIST) { 4396 /* 4397 * There are two kinds of forced faults: temporary and 4398 * persistent. Temporary faults go away at pool import, while 4399 * persistent faults stay set. Both types of faults can be 4400 * cleared with a zpool clear. 
4401 * 4402 * We tell if a vdev is persistently faulted by looking at the 4403 * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at 4404 * import then it's a persistent fault. Otherwise, it's 4405 * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" 4406 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This 4407 * tells vdev_config_generate() (which gets run later) to set 4408 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. 4409 */ 4410 vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; 4411 vd->vdev_tmpoffline = B_FALSE; 4412 aux = VDEV_AUX_EXTERNAL; 4413 } else { 4414 vd->vdev_tmpoffline = B_TRUE; 4415 } 4416 4417 /* 4418 * We don't directly use the aux state here, but if we do a 4419 * vdev_reopen(), we need this value to be present to remember why we 4420 * were faulted. 4421 */ 4422 vd->vdev_label_aux = aux; 4423 4424 /* 4425 * Faulted state takes precedence over degraded. 4426 */ 4427 vd->vdev_delayed_close = B_FALSE; 4428 vd->vdev_faulted = 1ULL; 4429 vd->vdev_degraded = 0ULL; 4430 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 4431 4432 /* 4433 * If this device has the only valid copy of the data, then 4434 * back off and simply mark the vdev as degraded instead. 4435 */ 4436 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 4437 vd->vdev_degraded = 1ULL; 4438 vd->vdev_faulted = 0ULL; 4439 4440 /* 4441 * If we reopen the device and it's not dead, only then do we 4442 * mark it degraded. 4443 */ 4444 vdev_reopen(tvd); 4445 4446 if (vdev_readable(vd)) 4447 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 4448 } 4449 4450 return (spa_vdev_state_exit(spa, vd, 0)); 4451 } 4452 4453 /* 4454 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 4455 * user that something is wrong. The vdev continues to operate as normal as far 4456 * as I/O is concerned. 
4457 */ 4458 int 4459 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 4460 { 4461 vdev_t *vd; 4462 4463 spa_vdev_state_enter(spa, SCL_NONE); 4464 4465 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4466 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); 4467 4468 if (!vd->vdev_ops->vdev_op_leaf) 4469 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); 4470 4471 /* 4472 * If the vdev is already faulted, then don't do anything. 4473 */ 4474 if (vd->vdev_faulted || vd->vdev_degraded) 4475 return (spa_vdev_state_exit(spa, NULL, 0)); 4476 4477 vd->vdev_degraded = 1ULL; 4478 if (!vdev_is_dead(vd)) 4479 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 4480 aux); 4481 4482 return (spa_vdev_state_exit(spa, vd, 0)); 4483 } 4484 4485 int 4486 vdev_remove_wanted(spa_t *spa, uint64_t guid) 4487 { 4488 vdev_t *vd; 4489 4490 spa_vdev_state_enter(spa, SCL_NONE); 4491 4492 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4493 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); 4494 4495 /* 4496 * If the vdev is already removed, or expanding which can trigger 4497 * repartition add/remove events, then don't do anything. 4498 */ 4499 if (vd->vdev_removed || vd->vdev_expanding) 4500 return (spa_vdev_state_exit(spa, NULL, 0)); 4501 4502 /* 4503 * Confirm the vdev has been removed, otherwise don't do anything. 4504 */ 4505 if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) 4506 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); 4507 4508 vd->vdev_remove_wanted = B_TRUE; 4509 spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER); 4510 4511 return (spa_vdev_state_exit(spa, vd, 0)); 4512 } 4513 4514 4515 /* 4516 * Online the given vdev. 4517 * 4518 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 4519 * spare device should be detached when the device finishes resilvering. 
4520 * Second, the online should be treated like a 'test' online case, so no FMA 4521 * events are generated if the device fails to open. 4522 */ 4523 int 4524 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 4525 { 4526 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 4527 boolean_t wasoffline; 4528 vdev_state_t oldstate; 4529 4530 spa_vdev_state_enter(spa, SCL_NONE); 4531 4532 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4533 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); 4534 4535 wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); 4536 oldstate = vd->vdev_state; 4537 4538 tvd = vd->vdev_top; 4539 vd->vdev_offline = B_FALSE; 4540 vd->vdev_tmpoffline = B_FALSE; 4541 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 4542 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 4543 4544 /* XXX - L2ARC 1.0 does not support expansion */ 4545 if (!vd->vdev_aux) { 4546 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 4547 pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || 4548 spa->spa_autoexpand); 4549 vd->vdev_expansion_time = gethrestime_sec(); 4550 } 4551 4552 vdev_reopen(tvd); 4553 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 4554 4555 if (!vd->vdev_aux) { 4556 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 4557 pvd->vdev_expanding = B_FALSE; 4558 } 4559 4560 if (newstate) 4561 *newstate = vd->vdev_state; 4562 if ((flags & ZFS_ONLINE_UNSPARE) && 4563 !vdev_is_dead(vd) && vd->vdev_parent && 4564 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 4565 vd->vdev_parent->vdev_child[0] == vd) 4566 vd->vdev_unspare = B_TRUE; 4567 4568 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 4569 4570 /* XXX - L2ARC 1.0 does not support expansion */ 4571 if (vd->vdev_aux) 4572 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 4573 spa->spa_ccw_fail_time = 0; 4574 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4575 } 4576 4577 /* Restart initializing if necessary */ 4578 
mutex_enter(&vd->vdev_initialize_lock); 4579 if (vdev_writeable(vd) && 4580 vd->vdev_initialize_thread == NULL && 4581 vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { 4582 (void) vdev_initialize(vd); 4583 } 4584 mutex_exit(&vd->vdev_initialize_lock); 4585 4586 /* 4587 * Restart trimming if necessary. We do not restart trimming for cache 4588 * devices here. This is triggered by l2arc_rebuild_vdev() 4589 * asynchronously for the whole device or in l2arc_evict() as it evicts 4590 * space for upcoming writes. 4591 */ 4592 mutex_enter(&vd->vdev_trim_lock); 4593 if (vdev_writeable(vd) && !vd->vdev_isl2cache && 4594 vd->vdev_trim_thread == NULL && 4595 vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { 4596 (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, 4597 vd->vdev_trim_secure); 4598 } 4599 mutex_exit(&vd->vdev_trim_lock); 4600 4601 if (wasoffline || 4602 (oldstate < VDEV_STATE_DEGRADED && 4603 vd->vdev_state >= VDEV_STATE_DEGRADED)) { 4604 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); 4605 4606 /* 4607 * Asynchronously detach spare vdev if resilver or 4608 * rebuild is not required 4609 */ 4610 if (vd->vdev_unspare && 4611 !dsl_scan_resilvering(spa->spa_dsl_pool) && 4612 !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && 4613 !vdev_rebuild_active(tvd)) 4614 spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); 4615 } 4616 return (spa_vdev_state_exit(spa, vd, 0)); 4617 } 4618 4619 static int 4620 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 4621 { 4622 vdev_t *vd, *tvd; 4623 int error = 0; 4624 uint64_t generation; 4625 metaslab_group_t *mg; 4626 boolean_t dtl_required; 4627 4628 top: 4629 spa_vdev_state_enter(spa, SCL_ALLOC); 4630 4631 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4632 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); 4633 4634 if (!vd->vdev_ops->vdev_op_leaf) 4635 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); 4636 4637 if (vd->vdev_ops == &vdev_draid_spare_ops) 4638 return 
(spa_vdev_state_exit(spa, NULL, ENOTSUP)); 4639 4640 tvd = vd->vdev_top; 4641 mg = tvd->vdev_mg; 4642 generation = spa->spa_config_generation + 1; 4643 4644 /* 4645 * If the device isn't already offline, try to offline it. 4646 */ 4647 if (!vd->vdev_offline) { 4648 dtl_required = vdev_dtl_required(vd); 4649 4650 /* 4651 * If this device has the only valid copy of some data, 4652 * don't allow it to be offlined. Log devices are always 4653 * expendable. 4654 */ 4655 if (!tvd->vdev_islog && vd->vdev_aux == NULL && dtl_required) 4656 return (spa_vdev_state_exit(spa, NULL, 4657 SET_ERROR(EBUSY))); 4658 4659 /* 4660 * If the top-level is a slog and it has had allocations 4661 * then proceed. We check that the vdev's metaslab group 4662 * is not NULL since it's possible that we may have just 4663 * added this vdev but not yet initialized its metaslabs. 4664 */ 4665 if (tvd->vdev_islog && mg != NULL && dtl_required) { 4666 /* 4667 * Prevent future allocations unless the log device is 4668 * redundant. 4669 */ 4670 ASSERT0P(tvd->vdev_log_mg); 4671 metaslab_group_passivate(mg); 4672 (void) spa_vdev_state_exit(spa, vd, 0); 4673 4674 error = spa_reset_logs(spa); 4675 4676 /* 4677 * If the log device was successfully reset but has 4678 * checkpointed data, do not offline it. 4679 */ 4680 if (error == 0 && 4681 tvd->vdev_checkpoint_sm != NULL) { 4682 ASSERT3U(space_map_allocated( 4683 tvd->vdev_checkpoint_sm), !=, 0); 4684 error = ZFS_ERR_CHECKPOINT_EXISTS; 4685 } 4686 4687 spa_vdev_state_enter(spa, SCL_ALLOC); 4688 4689 /* 4690 * Check to see if the config has changed. 4691 */ 4692 if (error || generation != spa->spa_config_generation) { 4693 metaslab_group_activate(mg); 4694 if (error) 4695 return (spa_vdev_state_exit(spa, 4696 vd, error)); 4697 (void) spa_vdev_state_exit(spa, vd, 0); 4698 goto top; 4699 } 4700 ASSERT0(tvd->vdev_stat.vs_alloc); 4701 } 4702 4703 /* 4704 * Offline this device and reopen its top-level vdev. 
4705 * If the top-level vdev is a log device then just offline 4706 * it. Otherwise, if this action results in the top-level 4707 * vdev becoming unusable, undo it and fail the request. 4708 */ 4709 vd->vdev_offline = B_TRUE; 4710 vdev_reopen(tvd); 4711 4712 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 4713 vdev_is_dead(tvd)) { 4714 vd->vdev_offline = B_FALSE; 4715 vdev_reopen(tvd); 4716 return (spa_vdev_state_exit(spa, NULL, 4717 SET_ERROR(EBUSY))); 4718 } 4719 4720 /* 4721 * Add the device back into the metaslab rotor so that 4722 * once we online the device it's open for business. 4723 */ 4724 if (tvd->vdev_islog && mg != NULL && dtl_required) 4725 metaslab_group_activate(mg); 4726 } 4727 4728 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 4729 4730 return (spa_vdev_state_exit(spa, vd, 0)); 4731 } 4732 4733 int 4734 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 4735 { 4736 int error; 4737 4738 mutex_enter(&spa->spa_vdev_top_lock); 4739 error = vdev_offline_locked(spa, guid, flags); 4740 mutex_exit(&spa->spa_vdev_top_lock); 4741 4742 return (error); 4743 } 4744 4745 /* 4746 * Clear the error counts associated with this vdev. Unlike vdev_online() and 4747 * vdev_offline(), we assume the spa config is locked. We also clear all 4748 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 
4749 */ 4750 void 4751 vdev_clear(spa_t *spa, vdev_t *vd) 4752 { 4753 vdev_t *rvd = spa->spa_root_vdev; 4754 4755 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 4756 4757 if (vd == NULL) 4758 vd = rvd; 4759 4760 vd->vdev_stat.vs_read_errors = 0; 4761 vd->vdev_stat.vs_write_errors = 0; 4762 vd->vdev_stat.vs_checksum_errors = 0; 4763 vd->vdev_stat.vs_dio_verify_errors = 0; 4764 vd->vdev_stat.vs_slow_ios = 0; 4765 atomic_store_64((volatile uint64_t *)&vd->vdev_outlier_count, 0); 4766 vd->vdev_read_sit_out_expire = 0; 4767 4768 for (int c = 0; c < vd->vdev_children; c++) 4769 vdev_clear(spa, vd->vdev_child[c]); 4770 4771 /* 4772 * It makes no sense to "clear" an indirect or removed vdev. 4773 */ 4774 if (!vdev_is_concrete(vd) || vd->vdev_removed) 4775 return; 4776 4777 /* 4778 * If we're in the FAULTED state or have experienced failed I/O, then 4779 * clear the persistent state and attempt to reopen the device. We 4780 * also mark the vdev config dirty, so that the new faulted state is 4781 * written out to disk. 4782 */ 4783 if (vd->vdev_faulted || vd->vdev_degraded || 4784 !vdev_readable(vd) || !vdev_writeable(vd)) { 4785 /* 4786 * When reopening in response to a clear event, it may be due to 4787 * a fmadm repair request. In this case, if the device is 4788 * still broken, we want to still post the ereport again. 4789 */ 4790 vd->vdev_forcefault = B_TRUE; 4791 4792 vd->vdev_faulted = vd->vdev_degraded = 0ULL; 4793 vd->vdev_cant_read = B_FALSE; 4794 vd->vdev_cant_write = B_FALSE; 4795 vd->vdev_stat.vs_aux = 0; 4796 4797 vdev_reopen(vd == rvd ? 
rvd : vd->vdev_top); 4798 4799 vd->vdev_forcefault = B_FALSE; 4800 4801 if (vd != rvd && vdev_writeable(vd->vdev_top)) 4802 vdev_state_dirty(vd->vdev_top); 4803 4804 /* If a resilver isn't required, check if vdevs can be culled */ 4805 if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && 4806 !dsl_scan_resilvering(spa->spa_dsl_pool) && 4807 !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) 4808 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 4809 4810 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); 4811 } 4812 4813 /* 4814 * When clearing a FMA-diagnosed fault, we always want to 4815 * unspare the device, as we assume that the original spare was 4816 * done in response to the FMA fault. 4817 */ 4818 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && 4819 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 4820 vd->vdev_parent->vdev_child[0] == vd) 4821 vd->vdev_unspare = B_TRUE; 4822 4823 /* Clear recent error events cache (i.e. duplicate events tracking) */ 4824 zfs_ereport_clear(spa, vd); 4825 } 4826 4827 boolean_t 4828 vdev_is_dead(vdev_t *vd) 4829 { 4830 /* 4831 * Holes and missing devices are always considered "dead". 4832 * This simplifies the code since we don't have to check for 4833 * these types of devices in the various code paths. 4834 * Instead we rely on the fact that we skip over dead devices 4835 * before issuing I/O to them. 4836 */ 4837 return (vd->vdev_state < VDEV_STATE_DEGRADED || 4838 vd->vdev_ops == &vdev_hole_ops || 4839 vd->vdev_ops == &vdev_missing_ops); 4840 } 4841 4842 boolean_t 4843 vdev_readable(vdev_t *vd) 4844 { 4845 return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 4846 } 4847 4848 boolean_t 4849 vdev_writeable(vdev_t *vd) 4850 { 4851 return (!vdev_is_dead(vd) && !vd->vdev_cant_write && 4852 vdev_is_concrete(vd)); 4853 } 4854 4855 boolean_t 4856 vdev_allocatable(vdev_t *vd) 4857 { 4858 uint64_t state = vd->vdev_state; 4859 4860 /* 4861 * We currently allow allocations from vdevs which may be in the 4862 * process of reopening (i.e. 
VDEV_STATE_CLOSED). If the device 4863 * fails to reopen then we'll catch it later when we're holding 4864 * the proper locks. Note that we have to get the vdev state 4865 * in a local variable because although it changes atomically, 4866 * we're asking two separate questions about it. 4867 */ 4868 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 4869 !vd->vdev_cant_write && vdev_is_concrete(vd) && 4870 vd->vdev_mg->mg_initialized); 4871 } 4872 4873 boolean_t 4874 vdev_accessible(vdev_t *vd, zio_t *zio) 4875 { 4876 ASSERT(zio->io_vd == vd); 4877 4878 if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 4879 return (B_FALSE); 4880 4881 if (zio->io_type == ZIO_TYPE_READ) 4882 return (!vd->vdev_cant_read); 4883 4884 if (zio->io_type == ZIO_TYPE_WRITE) 4885 return (!vd->vdev_cant_write); 4886 4887 return (B_TRUE); 4888 } 4889 4890 static void 4891 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) 4892 { 4893 /* 4894 * Exclude the dRAID spare when aggregating to avoid double counting 4895 * the ops and bytes. These IOs are counted by the physical leaves. 
4896 */ 4897 if (cvd->vdev_ops == &vdev_draid_spare_ops) 4898 return; 4899 4900 for (int t = 0; t < VS_ZIO_TYPES; t++) { 4901 vs->vs_ops[t] += cvs->vs_ops[t]; 4902 vs->vs_bytes[t] += cvs->vs_bytes[t]; 4903 } 4904 4905 cvs->vs_scan_removing = cvd->vdev_removing; 4906 } 4907 4908 /* 4909 * Get extended stats 4910 */ 4911 static void 4912 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) 4913 { 4914 (void) cvd; 4915 4916 int t, b; 4917 for (t = 0; t < ZIO_TYPES; t++) { 4918 for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) 4919 vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; 4920 4921 for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { 4922 vsx->vsx_total_histo[t][b] += 4923 cvsx->vsx_total_histo[t][b]; 4924 } 4925 } 4926 4927 for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { 4928 for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { 4929 vsx->vsx_queue_histo[t][b] += 4930 cvsx->vsx_queue_histo[t][b]; 4931 } 4932 vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; 4933 vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; 4934 4935 for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) 4936 vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; 4937 4938 for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) 4939 vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; 4940 } 4941 4942 } 4943 4944 boolean_t 4945 vdev_is_spacemap_addressable(vdev_t *vd) 4946 { 4947 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) 4948 return (B_TRUE); 4949 4950 /* 4951 * If double-word space map entries are not enabled we assume 4952 * 47 bits of the space map entry are dedicated to the entry's 4953 * offset (see SM_OFFSET_BITS in space_map.h). We then use that 4954 * to calculate the maximum address that can be described by a 4955 * space map entry for the given device. 
4956 */ 4957 uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; 4958 4959 if (shift >= 63) /* detect potential overflow */ 4960 return (B_TRUE); 4961 4962 return (vd->vdev_asize < (1ULL << shift)); 4963 } 4964 4965 /* 4966 * Get statistics for the given vdev. 4967 */ 4968 static void 4969 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) 4970 { 4971 int t; 4972 /* 4973 * If we're getting stats on the root vdev, aggregate the I/O counts 4974 * over all top-level vdevs (i.e. the direct children of the root). 4975 */ 4976 if (!vd->vdev_ops->vdev_op_leaf) { 4977 if (vs) { 4978 memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); 4979 memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); 4980 } 4981 if (vsx) 4982 memset(vsx, 0, sizeof (*vsx)); 4983 4984 for (int c = 0; c < vd->vdev_children; c++) { 4985 vdev_t *cvd = vd->vdev_child[c]; 4986 vdev_stat_t *cvs = &cvd->vdev_stat; 4987 vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; 4988 4989 vdev_get_stats_ex_impl(cvd, cvs, cvsx); 4990 if (vs) 4991 vdev_get_child_stat(cvd, vs, cvs); 4992 if (vsx) 4993 vdev_get_child_stat_ex(cvd, vsx, cvsx); 4994 } 4995 } else { 4996 /* 4997 * We're a leaf. Just copy our ZIO active queue stats in. The 4998 * other leaf stats are updated in vdev_stat_update(). 
4999 */ 5000 if (!vsx) 5001 return; 5002 5003 memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); 5004 5005 for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { 5006 vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; 5007 vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); 5008 } 5009 } 5010 } 5011 5012 void 5013 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) 5014 { 5015 vdev_t *tvd = vd->vdev_top; 5016 mutex_enter(&vd->vdev_stat_lock); 5017 if (vs) { 5018 memcpy(vs, &vd->vdev_stat, sizeof (*vs)); 5019 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 5020 vs->vs_state = vd->vdev_state; 5021 vs->vs_rsize = vdev_get_min_asize(vd); 5022 5023 if (vd->vdev_ops->vdev_op_leaf) { 5024 vs->vs_pspace = vd->vdev_psize; 5025 vs->vs_rsize += VDEV_LABEL_START_SIZE + 5026 VDEV_LABEL_END_SIZE; 5027 /* 5028 * Report initializing progress. Since we don't 5029 * have the initializing locks held, this is only 5030 * an estimate (although a fairly accurate one). 5031 */ 5032 vs->vs_initialize_bytes_done = 5033 vd->vdev_initialize_bytes_done; 5034 vs->vs_initialize_bytes_est = 5035 vd->vdev_initialize_bytes_est; 5036 vs->vs_initialize_state = vd->vdev_initialize_state; 5037 vs->vs_initialize_action_time = 5038 vd->vdev_initialize_action_time; 5039 5040 /* 5041 * Report manual TRIM progress. Since we don't have 5042 * the manual TRIM locks held, this is only an 5043 * estimate (although fairly accurate one). 5044 */ 5045 vs->vs_trim_notsup = !vd->vdev_has_trim; 5046 vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; 5047 vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; 5048 vs->vs_trim_state = vd->vdev_trim_state; 5049 vs->vs_trim_action_time = vd->vdev_trim_action_time; 5050 5051 /* Set when there is a deferred resilver. */ 5052 vs->vs_resilver_deferred = vd->vdev_resilver_deferred; 5053 } 5054 5055 /* 5056 * Report expandable space on top-level, non-auxiliary devices 5057 * only. 
The expandable space is reported in terms of metaslab 5058 * sized units since that determines how much space the pool 5059 * can expand. 5060 */ 5061 if (vd->vdev_aux == NULL && tvd != NULL) { 5062 vs->vs_esize = P2ALIGN_TYPED( 5063 vd->vdev_max_asize - vd->vdev_asize, 5064 1ULL << tvd->vdev_ms_shift, uint64_t); 5065 } 5066 5067 vs->vs_configured_ashift = vd->vdev_top != NULL 5068 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; 5069 vs->vs_logical_ashift = vd->vdev_logical_ashift; 5070 if (vd->vdev_physical_ashift <= ASHIFT_MAX) 5071 vs->vs_physical_ashift = vd->vdev_physical_ashift; 5072 else 5073 vs->vs_physical_ashift = 0; 5074 5075 /* 5076 * Report fragmentation and rebuild progress for top-level, 5077 * non-auxiliary, concrete devices. 5078 */ 5079 if (vd->vdev_aux == NULL && vd == vd->vdev_top && 5080 vdev_is_concrete(vd)) { 5081 /* 5082 * The vdev fragmentation rating doesn't take into 5083 * account the embedded slog metaslab (vdev_log_mg). 5084 * Since it's only one metaslab, it would have a tiny 5085 * impact on the overall fragmentation. 5086 */ 5087 vs->vs_fragmentation = (vd->vdev_mg != NULL) ? 5088 vd->vdev_mg->mg_fragmentation : 0; 5089 } 5090 vs->vs_noalloc = MAX(vd->vdev_noalloc, 5091 tvd ? 
tvd->vdev_noalloc : 0); 5092 } 5093 5094 vdev_get_stats_ex_impl(vd, vs, vsx); 5095 mutex_exit(&vd->vdev_stat_lock); 5096 } 5097 5098 void 5099 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 5100 { 5101 return (vdev_get_stats_ex(vd, vs, NULL)); 5102 } 5103 5104 void 5105 vdev_clear_stats(vdev_t *vd) 5106 { 5107 mutex_enter(&vd->vdev_stat_lock); 5108 vd->vdev_stat.vs_space = 0; 5109 vd->vdev_stat.vs_dspace = 0; 5110 vd->vdev_stat.vs_alloc = 0; 5111 mutex_exit(&vd->vdev_stat_lock); 5112 } 5113 5114 void 5115 vdev_scan_stat_init(vdev_t *vd) 5116 { 5117 vdev_stat_t *vs = &vd->vdev_stat; 5118 5119 for (int c = 0; c < vd->vdev_children; c++) 5120 vdev_scan_stat_init(vd->vdev_child[c]); 5121 5122 mutex_enter(&vd->vdev_stat_lock); 5123 vs->vs_scan_processed = 0; 5124 mutex_exit(&vd->vdev_stat_lock); 5125 } 5126 5127 void 5128 vdev_stat_update(zio_t *zio, uint64_t psize) 5129 { 5130 spa_t *spa = zio->io_spa; 5131 vdev_t *rvd = spa->spa_root_vdev; 5132 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; 5133 vdev_t *pvd; 5134 uint64_t txg = zio->io_txg; 5135 /* Suppress ASAN false positive */ 5136 #ifdef __SANITIZE_ADDRESS__ 5137 vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL; 5138 vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL; 5139 #else 5140 vdev_stat_t *vs = &vd->vdev_stat; 5141 vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; 5142 #endif 5143 zio_type_t type = zio->io_type; 5144 int flags = zio->io_flags; 5145 5146 /* 5147 * If this i/o is a gang leader, it didn't do any actual work. 5148 */ 5149 if (zio->io_gang_tree) 5150 return; 5151 5152 if (zio->io_error == 0) { 5153 /* 5154 * If this is a root i/o, don't count it -- we've already 5155 * counted the top-level vdevs, and vdev_get_stats() will 5156 * aggregate them when asked. This reduces contention on 5157 * the root vdev_stat_lock and implicitly handles blocks 5158 * that compress away to holes, for which there is no i/o. 5159 * (Holes never create vdev children, so all the counters 5160 * remain zero, which is what we want.) 
5161 * 5162 * Note: this only applies to successful i/o (io_error == 0) 5163 * because unlike i/o counts, errors are not additive. 5164 * When reading a ditto block, for example, failure of 5165 * one top-level vdev does not imply a root-level error. 5166 */ 5167 if (vd == rvd) 5168 return; 5169 5170 ASSERT(vd == zio->io_vd); 5171 5172 if (flags & ZIO_FLAG_IO_BYPASS) 5173 return; 5174 5175 mutex_enter(&vd->vdev_stat_lock); 5176 5177 if (flags & ZIO_FLAG_IO_REPAIR) { 5178 /* 5179 * Repair is the result of a resilver issued by the 5180 * scan thread (spa_sync). 5181 */ 5182 if (flags & ZIO_FLAG_SCAN_THREAD) { 5183 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 5184 dsl_scan_phys_t *scn_phys = &scn->scn_phys; 5185 uint64_t *processed = &scn_phys->scn_processed; 5186 5187 if (vd->vdev_ops->vdev_op_leaf) 5188 atomic_add_64(processed, psize); 5189 vs->vs_scan_processed += psize; 5190 } 5191 5192 /* 5193 * Repair is the result of a rebuild issued by the 5194 * rebuild thread (vdev_rebuild_thread). To avoid 5195 * double counting repaired bytes the virtual dRAID 5196 * spare vdev is excluded from the processed bytes. 5197 */ 5198 if (zio->io_priority == ZIO_PRIORITY_REBUILD) { 5199 vdev_t *tvd = vd->vdev_top; 5200 vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; 5201 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; 5202 uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; 5203 5204 if (vd->vdev_ops->vdev_op_leaf && 5205 vd->vdev_ops != &vdev_draid_spare_ops) { 5206 atomic_add_64(rebuilt, psize); 5207 } 5208 vs->vs_rebuild_processed += psize; 5209 } 5210 5211 if (flags & ZIO_FLAG_SELF_HEAL) 5212 vs->vs_self_healed += psize; 5213 } 5214 5215 /* 5216 * The bytes/ops/histograms are recorded at the leaf level and 5217 * aggregated into the higher level vdevs in vdev_get_stats(). 
5218 */ 5219 if (vd->vdev_ops->vdev_op_leaf && 5220 (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { 5221 zio_type_t vs_type = type; 5222 zio_priority_t priority = zio->io_priority; 5223 5224 /* 5225 * TRIM ops and bytes are reported to user space as 5226 * ZIO_TYPE_FLUSH. This is done to preserve the 5227 * vdev_stat_t structure layout for user space. 5228 */ 5229 if (type == ZIO_TYPE_TRIM) 5230 vs_type = ZIO_TYPE_FLUSH; 5231 5232 /* 5233 * Solely for the purposes of 'zpool iostat -lqrw' 5234 * reporting use the priority to categorize the IO. 5235 * Only the following are reported to user space: 5236 * 5237 * ZIO_PRIORITY_SYNC_READ, 5238 * ZIO_PRIORITY_SYNC_WRITE, 5239 * ZIO_PRIORITY_ASYNC_READ, 5240 * ZIO_PRIORITY_ASYNC_WRITE, 5241 * ZIO_PRIORITY_SCRUB, 5242 * ZIO_PRIORITY_TRIM, 5243 * ZIO_PRIORITY_REBUILD. 5244 */ 5245 if (priority == ZIO_PRIORITY_INITIALIZING) { 5246 ASSERT3U(type, ==, ZIO_TYPE_WRITE); 5247 priority = ZIO_PRIORITY_ASYNC_WRITE; 5248 } else if (priority == ZIO_PRIORITY_REMOVAL) { 5249 priority = ((type == ZIO_TYPE_WRITE) ? 5250 ZIO_PRIORITY_ASYNC_WRITE : 5251 ZIO_PRIORITY_ASYNC_READ); 5252 } 5253 5254 vs->vs_ops[vs_type]++; 5255 vs->vs_bytes[vs_type] += psize; 5256 5257 if (flags & ZIO_FLAG_DELEGATED) { 5258 vsx->vsx_agg_histo[priority] 5259 [RQ_HISTO(zio->io_size)]++; 5260 } else { 5261 vsx->vsx_ind_histo[priority] 5262 [RQ_HISTO(zio->io_size)]++; 5263 } 5264 5265 if (zio->io_delta && zio->io_delay) { 5266 vsx->vsx_queue_histo[priority] 5267 [L_HISTO(zio->io_delta - zio->io_delay)]++; 5268 vsx->vsx_disk_histo[type] 5269 [L_HISTO(zio->io_delay)]++; 5270 vsx->vsx_total_histo[type] 5271 [L_HISTO(zio->io_delta)]++; 5272 } 5273 } 5274 5275 mutex_exit(&vd->vdev_stat_lock); 5276 return; 5277 } 5278 5279 if (flags & ZIO_FLAG_SPECULATIVE) 5280 return; 5281 5282 /* 5283 * If this is an I/O error that is going to be retried, then ignore the 5284 * error. 
Otherwise, the user may interpret B_FAILFAST I/O errors as 5285 * hard errors, when in reality they can happen for any number of 5286 * innocuous reasons (bus resets, MPxIO link failure, etc). 5287 */ 5288 if (zio->io_error == EIO && 5289 !(zio->io_flags & ZIO_FLAG_IO_RETRY)) 5290 return; 5291 5292 /* 5293 * Intent logs writes won't propagate their error to the root 5294 * I/O so don't mark these types of failures as pool-level 5295 * errors. 5296 */ 5297 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) 5298 return; 5299 5300 if (type == ZIO_TYPE_WRITE && txg != 0 && 5301 (!(flags & ZIO_FLAG_IO_REPAIR) || 5302 (flags & ZIO_FLAG_SCAN_THREAD) || 5303 zio->io_priority == ZIO_PRIORITY_REBUILD || 5304 spa->spa_claiming)) { 5305 /* 5306 * This is either a normal write (not a repair), or it's 5307 * a repair induced by the scrub thread, or it's a repair 5308 * made by zil_claim() during spa_load() in the first txg, 5309 * or its repair induced by rebuild (sequential resilver). 5310 * In the normal case, we commit the DTL change in the same 5311 * txg as the block was born. In the scrub-induced repair 5312 * case, we know that scrubs run in first-pass syncing context, 5313 * so we commit the DTL change in spa_syncing_txg(spa). 5314 * In the zil_claim() case, we commit in spa_first_txg(spa). 5315 * 5316 * We currently do not make DTL entries for failed spontaneous 5317 * self-healing writes triggered by normal (non-scrubbing) 5318 * reads, because we have no transactional context in which to 5319 * do so -- and it's not clear that it'd be desirable anyway. 5320 * 5321 * For rebuild, since we don't have any information about BPs 5322 * and txgs that are being rebuilt, we need to add all known 5323 * txgs (starting from TXG_INITIAL) to DTL so that during 5324 * healing resilver we would be able to check all txgs at 5325 * vdev_draid_need_resilver(). 
5326 */ 5327 uint64_t size = 1; 5328 if (vd->vdev_ops->vdev_op_leaf) { 5329 uint64_t commit_txg = txg; 5330 if (flags & ZIO_FLAG_SCAN_THREAD) { 5331 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 5332 ASSERT(spa_sync_pass(spa) == 1); 5333 vdev_dtl_dirty(vd, DTL_SCRUB, txg, size); 5334 commit_txg = spa_syncing_txg(spa); 5335 } else if (spa->spa_claiming) { 5336 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 5337 commit_txg = spa_first_txg(spa); 5338 } else if (zio->io_priority == ZIO_PRIORITY_REBUILD) { 5339 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 5340 vdev_rebuild_txgs(vd->vdev_top, &txg, &size); 5341 commit_txg = spa_open_txg(spa); 5342 } 5343 ASSERT(commit_txg >= spa_syncing_txg(spa)); 5344 if (vdev_dtl_contains(vd, DTL_MISSING, txg, size)) 5345 return; 5346 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 5347 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, size); 5348 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); 5349 } 5350 if (vd != rvd) 5351 vdev_dtl_dirty(vd, DTL_MISSING, txg, size); 5352 } 5353 } 5354 5355 int64_t 5356 vdev_deflated_space(vdev_t *vd, int64_t space) 5357 { 5358 ASSERT0((space & (SPA_MINBLOCKSIZE-1))); 5359 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); 5360 5361 return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); 5362 } 5363 5364 /* 5365 * Update the in-core space usage stats for this vdev, its metaslab class, 5366 * and the root vdev. 5367 */ 5368 void 5369 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, 5370 int64_t space_delta) 5371 { 5372 (void) defer_delta; 5373 int64_t dspace_delta; 5374 spa_t *spa = vd->vdev_spa; 5375 vdev_t *rvd = spa->spa_root_vdev; 5376 5377 ASSERT(vd == vd->vdev_top); 5378 5379 /* 5380 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 5381 * factor. We must calculate this here and not at the root vdev 5382 * because the root vdev's psize-to-asize is simply the max of its 5383 * children's, thus not accurate enough for us. 
5384 */ 5385 dspace_delta = vdev_deflated_space(vd, space_delta); 5386 5387 mutex_enter(&vd->vdev_stat_lock); 5388 /* ensure we won't underflow */ 5389 if (alloc_delta < 0) { 5390 ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); 5391 } 5392 5393 vd->vdev_stat.vs_alloc += alloc_delta; 5394 vd->vdev_stat.vs_space += space_delta; 5395 vd->vdev_stat.vs_dspace += dspace_delta; 5396 mutex_exit(&vd->vdev_stat_lock); 5397 5398 /* every class but log contributes to root space stats */ 5399 if (vd->vdev_mg != NULL && !vd->vdev_islog) { 5400 ASSERT(!vd->vdev_isl2cache); 5401 mutex_enter(&rvd->vdev_stat_lock); 5402 rvd->vdev_stat.vs_alloc += alloc_delta; 5403 rvd->vdev_stat.vs_space += space_delta; 5404 rvd->vdev_stat.vs_dspace += dspace_delta; 5405 mutex_exit(&rvd->vdev_stat_lock); 5406 } 5407 /* Note: metaslab_class_space_update moved to metaslab_space_update */ 5408 } 5409 5410 /* 5411 * Mark a top-level vdev's config as dirty, placing it on the dirty list 5412 * so that it will be written out next time the vdev configuration is synced. 5413 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 5414 */ 5415 void 5416 vdev_config_dirty(vdev_t *vd) 5417 { 5418 spa_t *spa = vd->vdev_spa; 5419 vdev_t *rvd = spa->spa_root_vdev; 5420 int c; 5421 5422 ASSERT(spa_writeable(spa)); 5423 5424 /* 5425 * If this is an aux vdev (as with l2cache and spare devices), then we 5426 * update the vdev config manually and set the sync flag. 5427 */ 5428 if (vd->vdev_aux != NULL) { 5429 spa_aux_vdev_t *sav = vd->vdev_aux; 5430 nvlist_t **aux; 5431 uint_t naux; 5432 5433 for (c = 0; c < sav->sav_count; c++) { 5434 if (sav->sav_vdevs[c] == vd) 5435 break; 5436 } 5437 5438 if (c == sav->sav_count) { 5439 /* 5440 * We're being removed. There's nothing more to do. 
5441 */ 5442 ASSERT(sav->sav_sync == B_TRUE); 5443 return; 5444 } 5445 5446 sav->sav_sync = B_TRUE; 5447 5448 if (nvlist_lookup_nvlist_array(sav->sav_config, 5449 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { 5450 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 5451 ZPOOL_CONFIG_SPARES, &aux, &naux)); 5452 } 5453 5454 ASSERT(c < naux); 5455 5456 /* 5457 * Setting the nvlist in the middle if the array is a little 5458 * sketchy, but it will work. 5459 */ 5460 nvlist_free(aux[c]); 5461 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); 5462 5463 return; 5464 } 5465 5466 /* 5467 * The dirty list is protected by the SCL_CONFIG lock. The caller 5468 * must either hold SCL_CONFIG as writer, or must be the sync thread 5469 * (which holds SCL_CONFIG as reader). There's only one sync thread, 5470 * so this is sufficient to ensure mutual exclusion. 5471 */ 5472 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 5473 (dsl_pool_sync_context(spa_get_dsl(spa)) && 5474 spa_config_held(spa, SCL_CONFIG, RW_READER))); 5475 5476 if (vd == rvd) { 5477 for (c = 0; c < rvd->vdev_children; c++) 5478 vdev_config_dirty(rvd->vdev_child[c]); 5479 } else { 5480 ASSERT(vd == vd->vdev_top); 5481 5482 if (!list_link_active(&vd->vdev_config_dirty_node) && 5483 vdev_is_concrete(vd)) { 5484 list_insert_head(&spa->spa_config_dirty_list, vd); 5485 } 5486 } 5487 } 5488 5489 void 5490 vdev_config_clean(vdev_t *vd) 5491 { 5492 spa_t *spa = vd->vdev_spa; 5493 5494 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 5495 (dsl_pool_sync_context(spa_get_dsl(spa)) && 5496 spa_config_held(spa, SCL_CONFIG, RW_READER))); 5497 5498 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 5499 list_remove(&spa->spa_config_dirty_list, vd); 5500 } 5501 5502 /* 5503 * Mark a top-level vdev's state as dirty, so that the next pass of 5504 * spa_sync() can convert this into vdev_config_dirty(). 
 * We distinguish the state changes from larger config changes because
 * they require much less locking, and are often needed for administrative
 * actions.
 *
 * The vdev is only queued on spa_state_dirty_list when it is concrete and
 * not already on the list.
 */
void
vdev_state_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_writeable(spa));
	ASSERT(vd == vd->vdev_top);

	/*
	 * The state list is protected by the SCL_STATE lock.  The caller
	 * must either hold SCL_STATE as writer, or must be the sync thread
	 * (which holds SCL_STATE as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	if (!list_link_active(&vd->vdev_state_dirty_node) &&
	    vdev_is_concrete(vd))
		list_insert_head(&spa->spa_state_dirty_list, vd);
}

/*
 * Remove a vdev from spa_state_dirty_list.  The vdev must currently be on
 * the list; the locking requirement is the one asserted by
 * vdev_state_dirty().
 */
void
vdev_state_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
	    spa_config_held(spa, SCL_STATE, RW_READER)));

	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
	list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 *
 * For an interior vdev the concrete children are tallied as faulted or
 * degraded and the counts are handed to the vdev's vdev_op_state_change
 * callback.  The function then recurses into the parent, if there is one.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (int c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			/*
			 * Don't factor holes or indirect vdevs into the
			 * decision.
			 */
			if (!vdev_is_concrete(child))
				continue;

			/*
			 * A child counts as faulted when it cannot be read,
			 * or cannot be written while the pool is writeable.
			 */
			if (!vdev_readable(child) ||
			    (!vdev_writeable(child) && spa_writeable(spa))) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 *
 * vd     - vdev whose state is being changed
 * isopen - B_TRUE when called as part of an open; suppresses propagation
 *          to the parent
 * state  - new vdev state
 * aux    - auxiliary reason recorded in vdev_stat.vs_aux
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	/* No transition: only the aux reason is refreshed. */
	if (state == vd->vdev_state) {
		/*
		 * Since vdev_offline() code path is already in an offline
		 * state we can miss a statechange event to OFFLINE. Check
		 * the previous state to catch this condition.
		 */
		if (vd->vdev_ops->vdev_op_leaf &&
		    (state == VDEV_STATE_OFFLINE) &&
		    (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
			/* post an offline state change */
			zfs_post_state_change(spa, vd, vd->vdev_prevstate);
		}
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	/* remember the state we are leaving; used for the notifications below */
	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device unless the device has requested
	 * a delayed close (i.e. we're about to remove or fault the device).
	 * Otherwise, we keep accessible but invalid devices open forever.
	 * We don't call vdev_close() itself, because that implies some extra
	 * checks (offline, etc) that we don't want here.  This is limited to
	 * leaf devices, because otherwise closing the device will affect other
	 * children.
	 */
	if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
	    vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also let this be if
		 * it is one of our special test online cases, which is only
		 * attempting to online the device and shouldn't generate an FMA
		 * fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import or recovery, we
		 * mark it as "not available", which signifies that it was
		 * never there to begin with.  Failure to open such a device
		 * is not considered an error.
		 */
		if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
		    spa_load_state(spa) == SPA_LOAD_RECOVER) &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			/* map the aux reason onto an ereport class */
			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			case VDEV_AUX_BAD_ASHIFT:
				class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			(void) zfs_ereport_post(class, spa, vd, NULL, NULL,
			    save_state);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	/*
	 * Notify ZED of any significant state-change on a leaf vdev.
	 *
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		/* preserve original state from a vdev_reopen() */
		if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
		    (vd->vdev_prevstate != vd->vdev_state) &&
		    (save_state <= VDEV_STATE_CLOSED))
			save_state = vd->vdev_prevstate;

		/* filter out state change due to initial vdev_open */
		if (save_state > VDEV_STATE_CLOSED)
			zfs_post_state_change(spa, vd, save_state);
	}

	if (!isopen && vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Return B_TRUE when every direct child of this interior vdev is in
 * VDEV_STATE_OFFLINE (which is trivially the case when it has no
 * children), B_FALSE otherwise.
 */
boolean_t
vdev_children_are_offline(vdev_t *vd)
{
	ASSERT(!vd->vdev_ops->vdev_op_leaf);

	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
			return (B_FALSE);
	}

	return (B_TRUE);
}

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool. We do not support partial configuration.
 *
 * The check is recursive: an interior vdev of type VDEV_TYPE_MISSING
 * anywhere in the subtree makes the whole tree non-bootable.
 */
boolean_t
vdev_is_bootable(vdev_t *vd)
{
	if (!vd->vdev_ops->vdev_op_leaf) {
		const char *vdev_type = vd->vdev_ops->vdev_op_type;

		if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0)
			return (B_FALSE);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		if (!vdev_is_bootable(vd->vdev_child[c]))
			return (B_FALSE);
	}
	return (B_TRUE);
}

/*
 * A vdev is "concrete" unless its ops vector is one of the indirect,
 * hole, missing or root vdev ops.
 */
boolean_t
vdev_is_concrete(vdev_t *vd)
{
	vdev_ops_t *ops = vd->vdev_ops;
	if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
	    ops == &vdev_missing_ops || ops == &vdev_root_ops) {
		return (B_FALSE);
	} else {
		return (B_TRUE);
	}
}

/*
 * Determine if a log device has valid content.  If the vdev was
 * removed or faulted in the MOS config then we know that
 * the content on the log device has already been written to the pool.
5807 */ 5808 boolean_t 5809 vdev_log_state_valid(vdev_t *vd) 5810 { 5811 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 5812 !vd->vdev_removed) 5813 return (B_TRUE); 5814 5815 for (int c = 0; c < vd->vdev_children; c++) 5816 if (vdev_log_state_valid(vd->vdev_child[c])) 5817 return (B_TRUE); 5818 5819 return (B_FALSE); 5820 } 5821 5822 /* 5823 * Expand a vdev if possible. 5824 */ 5825 void 5826 vdev_expand(vdev_t *vd, uint64_t txg) 5827 { 5828 ASSERT(vd->vdev_top == vd); 5829 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5830 ASSERT(vdev_is_concrete(vd)); 5831 5832 vdev_set_deflate_ratio(vd); 5833 5834 if ((vd->vdev_spa->spa_raidz_expand == NULL || 5835 vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) && 5836 (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && 5837 vdev_is_concrete(vd)) { 5838 vdev_metaslab_group_create(vd); 5839 VERIFY0(vdev_metaslab_init(vd, txg)); 5840 vdev_config_dirty(vd); 5841 } 5842 } 5843 5844 /* 5845 * Split a vdev. 
5846 */ 5847 void 5848 vdev_split(vdev_t *vd) 5849 { 5850 vdev_t *cvd, *pvd = vd->vdev_parent; 5851 5852 VERIFY3U(pvd->vdev_children, >, 1); 5853 5854 vdev_remove_child(pvd, vd); 5855 vdev_compact_children(pvd); 5856 5857 ASSERT3P(pvd->vdev_child, !=, NULL); 5858 5859 cvd = pvd->vdev_child[0]; 5860 if (pvd->vdev_children == 1) { 5861 vdev_remove_parent(cvd); 5862 cvd->vdev_splitting = B_TRUE; 5863 } 5864 vdev_propagate_state(cvd); 5865 } 5866 5867 void 5868 vdev_deadman(vdev_t *vd, const char *tag) 5869 { 5870 for (int c = 0; c < vd->vdev_children; c++) { 5871 vdev_t *cvd = vd->vdev_child[c]; 5872 5873 vdev_deadman(cvd, tag); 5874 } 5875 5876 if (vd->vdev_ops->vdev_op_leaf) { 5877 vdev_queue_t *vq = &vd->vdev_queue; 5878 5879 mutex_enter(&vq->vq_lock); 5880 if (vq->vq_active > 0) { 5881 spa_t *spa = vd->vdev_spa; 5882 zio_t *fio; 5883 uint64_t delta; 5884 5885 zfs_dbgmsg("slow vdev: %s has %u active IOs", 5886 vd->vdev_path, vq->vq_active); 5887 5888 /* 5889 * Look at the head of all the pending queues, 5890 * if any I/O has been outstanding for longer than 5891 * the spa_deadman_synctime invoke the deadman logic. 5892 */ 5893 fio = list_head(&vq->vq_active_list); 5894 delta = gethrtime() - fio->io_timestamp; 5895 if (delta > spa_deadman_synctime(spa)) 5896 zio_deadman(fio, tag); 5897 } 5898 mutex_exit(&vq->vq_lock); 5899 } 5900 } 5901 5902 void 5903 vdev_defer_resilver(vdev_t *vd) 5904 { 5905 ASSERT(vd->vdev_ops->vdev_op_leaf); 5906 5907 vd->vdev_resilver_deferred = B_TRUE; 5908 vd->vdev_spa->spa_resilver_deferred = B_TRUE; 5909 } 5910 5911 /* 5912 * Clears the resilver deferred flag on all leaf devs under vd. Returns 5913 * B_TRUE if we have devices that need to be resilvered and are available to 5914 * accept resilver I/Os. 
5915 */ 5916 boolean_t 5917 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) 5918 { 5919 boolean_t resilver_needed = B_FALSE; 5920 spa_t *spa = vd->vdev_spa; 5921 5922 for (int c = 0; c < vd->vdev_children; c++) { 5923 vdev_t *cvd = vd->vdev_child[c]; 5924 resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); 5925 } 5926 5927 if (vd == spa->spa_root_vdev && 5928 spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { 5929 spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); 5930 vdev_config_dirty(vd); 5931 spa->spa_resilver_deferred = B_FALSE; 5932 return (resilver_needed); 5933 } 5934 5935 if (!vdev_is_concrete(vd) || vd->vdev_aux || 5936 !vd->vdev_ops->vdev_op_leaf) 5937 return (resilver_needed); 5938 5939 vd->vdev_resilver_deferred = B_FALSE; 5940 5941 return (!vdev_is_dead(vd) && !vd->vdev_offline && 5942 vdev_resilver_needed(vd, NULL, NULL)); 5943 } 5944 5945 boolean_t 5946 vdev_xlate_is_empty(zfs_range_seg64_t *rs) 5947 { 5948 return (rs->rs_start == rs->rs_end); 5949 } 5950 5951 /* 5952 * Translate a logical range to the first contiguous physical range for the 5953 * specified vdev_t. This function is initially called with a leaf vdev and 5954 * will walk each parent vdev until it reaches a top-level vdev. Once the 5955 * top-level is reached the physical range is initialized and the recursive 5956 * function begins to unwind. As it unwinds it calls the parent's vdev 5957 * specific translation function to do the real conversion. 5958 */ 5959 void 5960 vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs, 5961 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs) 5962 { 5963 /* 5964 * Walk up the vdev tree 5965 */ 5966 if (vd != vd->vdev_top) { 5967 vdev_xlate(vd->vdev_parent, logical_rs, physical_rs, 5968 remain_rs); 5969 } else { 5970 /* 5971 * We've reached the top-level vdev, initialize the physical 5972 * range to the logical range and set an empty remaining 5973 * range then start to unwind. 
5974 */ 5975 physical_rs->rs_start = logical_rs->rs_start; 5976 physical_rs->rs_end = logical_rs->rs_end; 5977 5978 remain_rs->rs_start = logical_rs->rs_start; 5979 remain_rs->rs_end = logical_rs->rs_start; 5980 5981 return; 5982 } 5983 5984 vdev_t *pvd = vd->vdev_parent; 5985 ASSERT3P(pvd, !=, NULL); 5986 ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); 5987 5988 /* 5989 * As this recursive function unwinds, translate the logical 5990 * range into its physical and any remaining components by calling 5991 * the vdev specific translate function. 5992 */ 5993 zfs_range_seg64_t intermediate = { 0 }; 5994 pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs); 5995 5996 physical_rs->rs_start = intermediate.rs_start; 5997 physical_rs->rs_end = intermediate.rs_end; 5998 } 5999 6000 void 6001 vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs, 6002 vdev_xlate_func_t *func, void *arg) 6003 { 6004 zfs_range_seg64_t iter_rs = *logical_rs; 6005 zfs_range_seg64_t physical_rs; 6006 zfs_range_seg64_t remain_rs; 6007 6008 while (!vdev_xlate_is_empty(&iter_rs)) { 6009 6010 vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs); 6011 6012 /* 6013 * With raidz and dRAID, it's possible that the logical range 6014 * does not live on this leaf vdev. Only when there is a non- 6015 * zero physical size call the provided function. 
6016 */ 6017 if (!vdev_xlate_is_empty(&physical_rs)) 6018 func(arg, &physical_rs); 6019 6020 iter_rs = remain_rs; 6021 } 6022 } 6023 6024 static char * 6025 vdev_name(vdev_t *vd, char *buf, int buflen) 6026 { 6027 if (vd->vdev_path == NULL) { 6028 if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) { 6029 strlcpy(buf, vd->vdev_spa->spa_name, buflen); 6030 } else if (!vd->vdev_ops->vdev_op_leaf) { 6031 snprintf(buf, buflen, "%s-%llu", 6032 vd->vdev_ops->vdev_op_type, 6033 (u_longlong_t)vd->vdev_id); 6034 } 6035 } else { 6036 strlcpy(buf, vd->vdev_path, buflen); 6037 } 6038 return (buf); 6039 } 6040 6041 /* 6042 * Look at the vdev tree and determine whether any devices are currently being 6043 * replaced. 6044 */ 6045 boolean_t 6046 vdev_replace_in_progress(vdev_t *vdev) 6047 { 6048 ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); 6049 6050 if (vdev->vdev_ops == &vdev_replacing_ops) 6051 return (B_TRUE); 6052 6053 /* 6054 * A 'spare' vdev indicates that we have a replace in progress, unless 6055 * it has exactly two children, and the second, the hot spare, has 6056 * finished being resilvered. 6057 */ 6058 if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || 6059 !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) 6060 return (B_TRUE); 6061 6062 for (int i = 0; i < vdev->vdev_children; i++) { 6063 if (vdev_replace_in_progress(vdev->vdev_child[i])) 6064 return (B_TRUE); 6065 } 6066 6067 return (B_FALSE); 6068 } 6069 6070 /* 6071 * Add a (source=src, propname=propval) list to an nvlist. 
 */
static void
vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
    uint64_t intval, zprop_source_t src)
{
	nvlist_t *propval;

	propval = fnvlist_alloc();
	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);

	/* a non-NULL string takes precedence over the integer value */
	if (strval != NULL)
		fnvlist_add_string(propval, ZPROP_VALUE, strval);
	else
		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);

	fnvlist_add_nvlist(nvl, propname, propval);
	/* propval is freed here once it has been added to nvl */
	nvlist_free(propval);
}

/*
 * Sync-task callback for vdev_prop_set().  arg is the input nvlist, which
 * carries the target vdev guid and the nvlist of properties to set.
 * Each property is written to (or removed from) the vdev's props ZAP
 * object in the MOS and recorded in the pool history.
 */
static void
vdev_props_set_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *vd;
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;
	uint64_t vdev_guid;
	uint64_t objid;
	nvlist_t *nvprops;

	vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
	nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
	vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);

	/* this vdev could get removed while waiting for this sync task */
	if (vd == NULL)
		return;

	/*
	 * Set vdev property values in the vdev props mos object.
	 */
	if (vdev_prop_get_objid(vd, &objid) != 0)
		panic("unexpected vdev type");

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
		uint64_t intval;
		const char *strval;
		vdev_prop_t prop;
		const char *propname = nvpair_name(elem);
		zprop_type_t proptype;

		switch (prop = vdev_name_to_prop(propname)) {
		case VDEV_PROP_USERPROP:
			/* names that are not valid user properties are skipped */
			if (vdev_prop_user(propname)) {
				strval = fnvpair_value_string(elem);
				if (strlen(strval) == 0) {
					/* remove the property if value == "" */
					(void) zap_remove(mos, objid, propname,
					    tx);
				} else {
					VERIFY0(zap_update(mos, objid, propname,
					    1, strlen(strval) + 1, strval, tx));
				}
				spa_history_log_internal(spa, "vdev set", tx,
				    "vdev_guid=%llu: %s=%s",
				    (u_longlong_t)vdev_guid, nvpair_name(elem),
				    strval);
			}
			break;
		case VDEV_PROP_ALLOC_BIAS: {
			/*
			 * The bias is stored as a string under
			 * VDEV_TOP_ZAP_ALLOCATION_BIAS; any value other than
			 * special or dedup removes the entry.
			 */
			intval = fnvpair_value_uint64(elem);
			ASSERT3U(intval, !=, VDEV_BIAS_LOG);
			const char *bias_str =
			    (intval == VDEV_BIAS_SPECIAL) ?
			    VDEV_ALLOC_BIAS_SPECIAL :
			    (intval == VDEV_BIAS_DEDUP) ?
			    VDEV_ALLOC_BIAS_DEDUP : NULL;
			if (bias_str == NULL) {
				(void) zap_remove(mos, objid,
				    VDEV_TOP_ZAP_ALLOCATION_BIAS, tx);
			} else {
				VERIFY0(zap_update(mos, objid,
				    VDEV_TOP_ZAP_ALLOCATION_BIAS,
				    1, strlen(bias_str) + 1, bias_str, tx));
				spa_activate_allocation_classes(spa, tx);
			}
			spa_history_log_internal(spa, "vdev set", tx,
			    "vdev_guid=%llu: alloc_bias=%s",
			    (u_longlong_t)vdev_guid,
			    bias_str != NULL ? bias_str : "none");
			break;
		}
		default:
			/* normalize the property name */
			propname = vdev_prop_to_name(prop);
			proptype = vdev_prop_get_type(prop);

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				strval = fnvpair_value_string(elem);
				VERIFY0(zap_update(mos, objid, propname,
				    1, strlen(strval) + 1, strval, tx));
				spa_history_log_internal(spa, "vdev set", tx,
				    "vdev_guid=%llu: %s=%s",
				    (u_longlong_t)vdev_guid, nvpair_name(elem),
				    strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				intval = fnvpair_value_uint64(elem);

				/* an index value must map to a known string */
				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY0(vdev_prop_index_to_string(
					    prop, intval, &unused));
				}
				VERIFY0(zap_update(mos, objid, propname,
				    sizeof (uint64_t), 1, &intval, tx));
				spa_history_log_internal(spa, "vdev set", tx,
				    "vdev_guid=%llu: %s=%lld",
				    (u_longlong_t)vdev_guid,
				    nvpair_name(elem), (longlong_t)intval);
			} else {
				panic("invalid vdev property type %u",
				    nvpair_type(elem));
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Set one or more properties on the vdev identified by the guid in innvl.
 *
 * Each property is validated, and those with in-core side effects are
 * applied immediately, while SCL_CONFIG is held as reader.  If every
 * property is accepted, vdev_props_set_sync() is dispatched as a sync
 * task to persist the values.
 *
 * Processing stops at the first property that fails; that property is
 * reported in outnvl and its error is returned.
 *
 * Returns EINVAL if innvl lacks the guid or the property list, or if the
 * vdev has no ZAP object; ENOENT if the vdev cannot be found; otherwise
 * the first per-property error or the result of dsl_sync_task().
 */
int
vdev_prop_set(spa_t *spa, nvlist_t *innvl, nvlist_t *outnvl)
{
	vdev_t *vd;
	nvpair_t *elem = NULL;
	uint64_t vdev_guid;
	nvlist_t *nvprops;
	int error = 0;

	if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
	    &vdev_guid) != 0)
		return (SET_ERROR(EINVAL));

	if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
	    &nvprops) != 0)
		return (SET_ERROR(EINVAL));

	/*
	 * Resolve the vdev by guid and hold SCL_CONFIG as a reader so the
	 * vdev tree can't change beneath us while we touch vd.  The lock is
	 * dropped around the "path" and "allocating" handlers below: those
	 * descend into spa_vdev_enter() -> spa_config_enter(SCL_ALL,
	 * RW_WRITER), and taking SCL_CONFIG as a writer while this same
	 * thread already holds it as a reader is a self-deadlock (the writer
	 * waits for scl_count to drain to 0, but scl_count is this thread's
	 * own reader, which is never released).  Those handlers re-resolve
	 * the vdev by guid under their own locking, so we re-resolve here
	 * after each one in case the tree changed.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		return (SET_ERROR(ENOENT));
	}

	/* Check that vdev has a zap we can use */
	if (vd->vdev_root_zap == 0 &&
	    vd->vdev_top_zap == 0 &&
	    vd->vdev_leaf_zap == 0) {
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		return (SET_ERROR(EINVAL));
	}

	while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
		const char *propname = nvpair_name(elem);
		vdev_prop_t prop = vdev_name_to_prop(propname);
		uint64_t intval = 0;
		const char *strval = NULL;

		/* an unknown name must at least be a valid user property */
		if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
			error = EINVAL;
			goto end;
		}

		if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) {
			error = EROFS;
			goto end;
		}

		/* Special Processing */
		switch (prop) {
		case VDEV_PROP_PATH:
			if (vd->vdev_path == NULL) {
				error = EROFS;
				break;
			}
			if (nvpair_value_string(elem, &strval) != 0) {
				error = EINVAL;
				break;
			}
			/* New path must start with /dev/ */
			if (strncmp(strval, "/dev/", 5)) {
				error = EINVAL;
				break;
			}
			/*
			 * spa_vdev_setpath() takes SCL_ALL as a writer, so we
			 * must not hold SCL_CONFIG across it (see above).  Drop
			 * it, then re-resolve vd in case the tree changed.
			 */
			spa_config_exit(spa, SCL_CONFIG, FTAG);
			error = spa_vdev_setpath(spa, vdev_guid, strval);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
			if (vd == NULL && error == 0)
				error = SET_ERROR(ENOENT);
			break;
		case VDEV_PROP_ALLOCATING:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			/*
			 * "allocating" is the inverse of vdev_noalloc, so a
			 * requested value that differs from vdev_noalloc
			 * leaves nothing to change.
			 */
			if (intval != vd->vdev_noalloc)
				break;
			/*
			 * spa_vdev_noalloc()/spa_vdev_alloc() take SCL_ALL as a
			 * writer; same locking dance as VDEV_PROP_PATH above.
			 */
			spa_config_exit(spa, SCL_CONFIG, FTAG);
			if (intval == 0)
				error = spa_vdev_noalloc(spa, vdev_guid);
			else
				error = spa_vdev_alloc(spa, vdev_guid);
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
			if (vd == NULL && error == 0)
				error = SET_ERROR(ENOENT);
			break;
		case VDEV_PROP_FAILFAST:
			/* "inherit" is rejected on the root vdev */
			if (nvpair_value_uint64(elem, &intval) != 0 ||
			    intval > ZPROP_BOOLEAN_INHERIT ||
			    (intval == ZPROP_BOOLEAN_INHERIT &&
			    vd->vdev_ops == &vdev_root_ops)) {
				error = EINVAL;
				break;
			}
			vd->vdev_failfast = intval;
			break;
		case VDEV_PROP_SIT_OUT:
			/* Only expose this for a draid or raidz leaf */
			if (!vd->vdev_ops->vdev_op_leaf ||
			    vd->vdev_top == NULL ||
			    (vd->vdev_top->vdev_ops != &vdev_raidz_ops &&
			    vd->vdev_top->vdev_ops != &vdev_draid_ops)) {
				error = ENOTSUP;
				break;
			}
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			if (intval == 1) {
				/*
				 * Find the child of the top-level vdev that
				 * contains vd, then count how many of the
				 * other children are already sitting out.
				 */
				vdev_t *ancestor = vd;
				while (ancestor->vdev_parent != vd->vdev_top)
					ancestor = ancestor->vdev_parent;
				vdev_t *pvd = vd->vdev_top;
				uint_t sitouts = 0;
				for (int i = 0; i < pvd->vdev_children; i++) {
					if (pvd->vdev_child[i] == ancestor)
						continue;
					if (vdev_sit_out_reads(
					    pvd->vdev_child[i], 0)) {
						sitouts++;
					}
				}
				/* refuse once the sit-outs reach the parity count */
				if (sitouts >= vdev_get_nparity(pvd)) {
					error = ZFS_ERR_TOO_MANY_SITOUTS;
					break;
				}
				if (error == 0)
					vdev_raidz_sit_child(vd,
					    INT64_MAX - gethrestime_sec());
			} else {
				vdev_raidz_unsit_child(vd);
			}
			break;
		case VDEV_PROP_AUTOSIT:
			/* only valid on a raidz or draid vdev itself */
			if (vd->vdev_ops != &vdev_raidz_ops &&
			    vd->vdev_ops != &vdev_draid_ops) {
				error = ENOTSUP;
				break;
			}
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_autosit = intval == 1;
			break;
		case VDEV_PROP_CHECKSUM_N:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_checksum_n = intval;
			break;
		case VDEV_PROP_CHECKSUM_T:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_checksum_t = intval;
			break;
		case VDEV_PROP_IO_N:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_io_n = intval;
			break;
		case VDEV_PROP_IO_T:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_io_t = intval;
			break;
		case VDEV_PROP_SLOW_IO_EVENTS:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_slow_io_events = intval != 0;
			break;
		case VDEV_PROP_SLOW_IO_N:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_slow_io_n = intval;
			break;
		case VDEV_PROP_SLOW_IO_T:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_slow_io_t = intval;
			break;
		case VDEV_PROP_SCHEDULER:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			vd->vdev_scheduler = intval;
			break;
		case VDEV_PROP_ALLOC_BIAS:
			if (nvpair_value_uint64(elem, &intval) != 0) {
				error = EINVAL;
				break;
			}
			/* only a top-level vdev with a top ZAP can carry a bias */
			if (vd != vd->vdev_top || vd->vdev_top_zap == 0) {
				error = ENOTSUP;
				break;
			}
			/* Log vdevs are not supported: remove and re-add. */
			if (vd->vdev_islog) {
				error = ENOTSUP;
				break;
			}
			/* special/dedup needs allocation_classes feature */
			if (intval != VDEV_BIAS_NONE &&
			    ((intval != VDEV_BIAS_SPECIAL &&
			    intval != VDEV_BIAS_DEDUP) ||
			    !spa_feature_is_enabled(spa,
			    SPA_FEATURE_ALLOCATION_CLASSES))) {
				error = ENOTSUP;
				break;
			}
			/*
			 * Disallow converting the last normal vdev to
			 * avoid pool suspension on failed allocations.
			 */
			if (intval != VDEV_BIAS_NONE &&
			    vd->vdev_alloc_bias == VDEV_BIAS_NONE) {
				vdev_t *rvd = spa->spa_root_vdev;
				int normal = 0;
				/*
				 * Count concrete, allocating top-level vdevs
				 * that have no bias.
				 */
				for (uint64_t c = 0;
				    c < rvd->vdev_children; c++) {
					vdev_t *cvd = rvd->vdev_child[c];
					if (vdev_is_concrete(cvd) &&
					    cvd->vdev_alloc_bias ==
					    VDEV_BIAS_NONE &&
					    !cvd->vdev_noalloc)
						normal++;
				}
				if (normal <= 1) {
					error = ENOTSUP;
					break;
				}
			}
			vd->vdev_alloc_bias = (vdev_alloc_bias_t)intval;
			break;
		default:
			/* Most processing is done in vdev_props_set_sync */
			break;
		}
end:
		/*
		 * On failure, report the offending property in outnvl (its
		 * string value if one was parsed, otherwise the error code)
		 * and stop processing.  This break leaves the while loop, so
		 * the SCL_CONFIG release below always runs.
		 */
		if (error != 0) {
			intval = error;
			vdev_prop_add_list(outnvl, propname, strval, intval, 0);
			break;
		}
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	if (error != 0)
		return (error);

	return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
	    innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/*
 * Return the index within vd's child array of the child whose guid is
 * c_guid.  NOTE(review): 0 is also returned when no child matches, so a
 * miss cannot be told apart from a match at index 0 -- callers presumably
 * only pass the guid of an actual child; confirm.
 */
static int
vdev_get_child_idx(vdev_t *vd, uint64_t c_guid)
{
	for (int c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_guid == c_guid)
			return (c);
	return (0);
}

int
vdev_prop_get(spa_t *spa,
nvlist_t *innvl, nvlist_t *outnvl) 6509 { 6510 objset_t *mos = spa->spa_meta_objset; 6511 vdev_t *vd; 6512 int err = 0; 6513 uint64_t objid = 0; 6514 uint64_t vdev_guid; 6515 nvpair_t *elem = NULL; 6516 nvlist_t *nvprops = NULL; 6517 uint64_t intval = 0; 6518 boolean_t boolval = 0; 6519 char *strval = NULL; 6520 const char *propname = NULL; 6521 vdev_prop_t prop; 6522 6523 ASSERT(mos != NULL); 6524 6525 if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV, 6526 &vdev_guid) != 0) 6527 return (SET_ERROR(EINVAL)); 6528 6529 nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); 6530 6531 /* 6532 * Resolve the vdev by guid and hold SCL_CONFIG as a reader across the 6533 * property fetch so the vdev tree can't change beneath us. This path 6534 * is read-only and never takes SCL_CONFIG as a writer, so holding the 6535 * reader throughout is safe. 6536 */ 6537 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6538 if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL) { 6539 spa_config_exit(spa, SCL_CONFIG, FTAG); 6540 return (SET_ERROR(ENOENT)); 6541 } 6542 6543 /* 6544 * A missing ZAP is normal for spare and L2ARC vdevs, which are 6545 * not part of the main vdev tree and never get ZAPs allocated. 6546 * Many properties are sourced directly from vdev_t fields and 6547 * work fine without one; ZAP-backed properties will return their 6548 * default values. objid is set to 0 when absent and the few 6549 * cases that call zap_lookup directly guard against this below. 
6550 */ 6551 (void) vdev_prop_get_objid(vd, &objid); 6552 6553 mutex_enter(&spa->spa_props_lock); 6554 6555 if (nvprops != NULL) { 6556 char namebuf[64] = { 0 }; 6557 6558 while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) { 6559 intval = 0; 6560 strval = NULL; 6561 propname = nvpair_name(elem); 6562 prop = vdev_name_to_prop(propname); 6563 zprop_source_t src = ZPROP_SRC_DEFAULT; 6564 uint64_t integer_size, num_integers; 6565 6566 switch (prop) { 6567 /* Special Read-only Properties */ 6568 case VDEV_PROP_NAME: 6569 strval = vdev_name(vd, namebuf, 6570 sizeof (namebuf)); 6571 if (strval == NULL) 6572 continue; 6573 vdev_prop_add_list(outnvl, propname, strval, 0, 6574 ZPROP_SRC_NONE); 6575 continue; 6576 case VDEV_PROP_CAPACITY: 6577 /* percent used */ 6578 intval = (vd->vdev_stat.vs_dspace == 0) ? 0 : 6579 (vd->vdev_stat.vs_alloc * 100 / 6580 vd->vdev_stat.vs_dspace); 6581 vdev_prop_add_list(outnvl, propname, NULL, 6582 intval, ZPROP_SRC_NONE); 6583 continue; 6584 case VDEV_PROP_STATE: 6585 vdev_prop_add_list(outnvl, propname, NULL, 6586 vd->vdev_state, ZPROP_SRC_NONE); 6587 continue; 6588 case VDEV_PROP_GUID: 6589 vdev_prop_add_list(outnvl, propname, NULL, 6590 vd->vdev_guid, ZPROP_SRC_NONE); 6591 continue; 6592 case VDEV_PROP_ASIZE: 6593 vdev_prop_add_list(outnvl, propname, NULL, 6594 vd->vdev_asize, ZPROP_SRC_NONE); 6595 continue; 6596 case VDEV_PROP_PSIZE: 6597 vdev_prop_add_list(outnvl, propname, NULL, 6598 vd->vdev_psize, ZPROP_SRC_NONE); 6599 continue; 6600 case VDEV_PROP_ASHIFT: 6601 vdev_prop_add_list(outnvl, propname, NULL, 6602 vd->vdev_ashift, ZPROP_SRC_NONE); 6603 continue; 6604 case VDEV_PROP_SIZE: 6605 vdev_prop_add_list(outnvl, propname, NULL, 6606 vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE); 6607 continue; 6608 case VDEV_PROP_FREE: 6609 vdev_prop_add_list(outnvl, propname, NULL, 6610 vd->vdev_stat.vs_dspace - 6611 vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); 6612 continue; 6613 case VDEV_PROP_ALLOCATED: 6614 vdev_prop_add_list(outnvl, propname, 
NULL, 6615 vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE); 6616 continue; 6617 case VDEV_PROP_EXPANDSZ: 6618 vdev_prop_add_list(outnvl, propname, NULL, 6619 vd->vdev_stat.vs_esize, ZPROP_SRC_NONE); 6620 continue; 6621 case VDEV_PROP_FRAGMENTATION: 6622 vdev_prop_add_list(outnvl, propname, NULL, 6623 vd->vdev_stat.vs_fragmentation, 6624 ZPROP_SRC_NONE); 6625 continue; 6626 case VDEV_PROP_PARITY: 6627 vdev_prop_add_list(outnvl, propname, NULL, 6628 vdev_get_nparity(vd), ZPROP_SRC_NONE); 6629 continue; 6630 case VDEV_PROP_FDOMAIN: 6631 case VDEV_PROP_FGROUP: 6632 if (vd->vdev_ops->vdev_op_leaf && 6633 vd->vdev_top != NULL && 6634 vd->vdev_top->vdev_ops == 6635 &vdev_draid_ops) { 6636 vdev_draid_config_t *vdc = 6637 vd->vdev_top->vdev_tsd; 6638 if (vdc->vdc_width == vdc->vdc_children) 6639 continue; 6640 int c_idx = vdev_get_child_idx( 6641 vd->vdev_top, vd->vdev_guid); 6642 vdev_prop_add_list(outnvl, propname, 6643 NULL, prop == VDEV_PROP_FDOMAIN ? 6644 (c_idx % vdc->vdc_children) : 6645 (c_idx / vdc->vdc_children), 6646 ZPROP_SRC_NONE); 6647 } 6648 continue; 6649 case VDEV_PROP_PATH: 6650 if (vd->vdev_path == NULL) 6651 continue; 6652 vdev_prop_add_list(outnvl, propname, 6653 vd->vdev_path, 0, ZPROP_SRC_NONE); 6654 continue; 6655 case VDEV_PROP_DEVID: 6656 if (vd->vdev_devid == NULL) 6657 continue; 6658 vdev_prop_add_list(outnvl, propname, 6659 vd->vdev_devid, 0, ZPROP_SRC_NONE); 6660 continue; 6661 case VDEV_PROP_PHYS_PATH: 6662 if (vd->vdev_physpath == NULL) 6663 continue; 6664 vdev_prop_add_list(outnvl, propname, 6665 vd->vdev_physpath, 0, ZPROP_SRC_NONE); 6666 continue; 6667 case VDEV_PROP_ENC_PATH: 6668 if (vd->vdev_enc_sysfs_path == NULL) 6669 continue; 6670 vdev_prop_add_list(outnvl, propname, 6671 vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE); 6672 continue; 6673 case VDEV_PROP_FRU: 6674 if (vd->vdev_fru == NULL) 6675 continue; 6676 vdev_prop_add_list(outnvl, propname, 6677 vd->vdev_fru, 0, ZPROP_SRC_NONE); 6678 continue; 6679 case VDEV_PROP_PARENT: 6680 if 
(vd->vdev_parent != NULL) { 6681 strval = vdev_name(vd->vdev_parent, 6682 namebuf, sizeof (namebuf)); 6683 vdev_prop_add_list(outnvl, propname, 6684 strval, 0, ZPROP_SRC_NONE); 6685 } 6686 continue; 6687 case VDEV_PROP_CHILDREN: 6688 if (vd->vdev_children > 0) 6689 strval = kmem_zalloc(ZAP_MAXVALUELEN, 6690 KM_SLEEP); 6691 for (uint64_t i = 0; i < vd->vdev_children; 6692 i++) { 6693 const char *vname; 6694 6695 vname = vdev_name(vd->vdev_child[i], 6696 namebuf, sizeof (namebuf)); 6697 if (vname == NULL) 6698 vname = "(unknown)"; 6699 if (strlen(strval) > 0) 6700 strlcat(strval, ",", 6701 ZAP_MAXVALUELEN); 6702 strlcat(strval, vname, ZAP_MAXVALUELEN); 6703 } 6704 if (strval != NULL) { 6705 vdev_prop_add_list(outnvl, propname, 6706 strval, 0, ZPROP_SRC_NONE); 6707 kmem_free(strval, ZAP_MAXVALUELEN); 6708 } 6709 continue; 6710 case VDEV_PROP_NUMCHILDREN: 6711 vdev_prop_add_list(outnvl, propname, NULL, 6712 vd->vdev_children, ZPROP_SRC_NONE); 6713 continue; 6714 case VDEV_PROP_READ_ERRORS: 6715 vdev_prop_add_list(outnvl, propname, NULL, 6716 vd->vdev_stat.vs_read_errors, 6717 ZPROP_SRC_NONE); 6718 continue; 6719 case VDEV_PROP_WRITE_ERRORS: 6720 vdev_prop_add_list(outnvl, propname, NULL, 6721 vd->vdev_stat.vs_write_errors, 6722 ZPROP_SRC_NONE); 6723 continue; 6724 case VDEV_PROP_CHECKSUM_ERRORS: 6725 vdev_prop_add_list(outnvl, propname, NULL, 6726 vd->vdev_stat.vs_checksum_errors, 6727 ZPROP_SRC_NONE); 6728 continue; 6729 case VDEV_PROP_INITIALIZE_ERRORS: 6730 vdev_prop_add_list(outnvl, propname, NULL, 6731 vd->vdev_stat.vs_initialize_errors, 6732 ZPROP_SRC_NONE); 6733 continue; 6734 case VDEV_PROP_TRIM_ERRORS: 6735 vdev_prop_add_list(outnvl, propname, NULL, 6736 vd->vdev_stat.vs_trim_errors, 6737 ZPROP_SRC_NONE); 6738 continue; 6739 case VDEV_PROP_SLOW_IOS: 6740 vdev_prop_add_list(outnvl, propname, NULL, 6741 vd->vdev_stat.vs_slow_ios, 6742 ZPROP_SRC_NONE); 6743 continue; 6744 case VDEV_PROP_OPS_NULL: 6745 vdev_prop_add_list(outnvl, propname, NULL, 6746 
vd->vdev_stat.vs_ops[ZIO_TYPE_NULL], 6747 ZPROP_SRC_NONE); 6748 continue; 6749 case VDEV_PROP_OPS_READ: 6750 vdev_prop_add_list(outnvl, propname, NULL, 6751 vd->vdev_stat.vs_ops[ZIO_TYPE_READ], 6752 ZPROP_SRC_NONE); 6753 continue; 6754 case VDEV_PROP_OPS_WRITE: 6755 vdev_prop_add_list(outnvl, propname, NULL, 6756 vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE], 6757 ZPROP_SRC_NONE); 6758 continue; 6759 case VDEV_PROP_OPS_FREE: 6760 vdev_prop_add_list(outnvl, propname, NULL, 6761 vd->vdev_stat.vs_ops[ZIO_TYPE_FREE], 6762 ZPROP_SRC_NONE); 6763 continue; 6764 case VDEV_PROP_OPS_CLAIM: 6765 vdev_prop_add_list(outnvl, propname, NULL, 6766 vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM], 6767 ZPROP_SRC_NONE); 6768 continue; 6769 case VDEV_PROP_OPS_TRIM: 6770 /* 6771 * TRIM ops and bytes are reported to user 6772 * space as ZIO_TYPE_FLUSH. This is done to 6773 * preserve the vdev_stat_t structure layout 6774 * for user space. 6775 */ 6776 vdev_prop_add_list(outnvl, propname, NULL, 6777 vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH], 6778 ZPROP_SRC_NONE); 6779 continue; 6780 case VDEV_PROP_BYTES_NULL: 6781 vdev_prop_add_list(outnvl, propname, NULL, 6782 vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL], 6783 ZPROP_SRC_NONE); 6784 continue; 6785 case VDEV_PROP_BYTES_READ: 6786 vdev_prop_add_list(outnvl, propname, NULL, 6787 vd->vdev_stat.vs_bytes[ZIO_TYPE_READ], 6788 ZPROP_SRC_NONE); 6789 continue; 6790 case VDEV_PROP_BYTES_WRITE: 6791 vdev_prop_add_list(outnvl, propname, NULL, 6792 vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE], 6793 ZPROP_SRC_NONE); 6794 continue; 6795 case VDEV_PROP_BYTES_FREE: 6796 vdev_prop_add_list(outnvl, propname, NULL, 6797 vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE], 6798 ZPROP_SRC_NONE); 6799 continue; 6800 case VDEV_PROP_BYTES_CLAIM: 6801 vdev_prop_add_list(outnvl, propname, NULL, 6802 vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM], 6803 ZPROP_SRC_NONE); 6804 continue; 6805 case VDEV_PROP_BYTES_TRIM: 6806 /* 6807 * TRIM ops and bytes are reported to user 6808 * space as ZIO_TYPE_FLUSH. 
This is done to 6809 * preserve the vdev_stat_t structure layout 6810 * for user space. 6811 */ 6812 vdev_prop_add_list(outnvl, propname, NULL, 6813 vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH], 6814 ZPROP_SRC_NONE); 6815 continue; 6816 case VDEV_PROP_REMOVING: 6817 vdev_prop_add_list(outnvl, propname, NULL, 6818 vd->vdev_removing, ZPROP_SRC_NONE); 6819 continue; 6820 case VDEV_PROP_RAIDZ_EXPANDING: 6821 /* Only expose this for raidz */ 6822 if (vd->vdev_ops == &vdev_raidz_ops) { 6823 vdev_prop_add_list(outnvl, propname, 6824 NULL, vd->vdev_rz_expanding, 6825 ZPROP_SRC_NONE); 6826 } 6827 continue; 6828 case VDEV_PROP_SIT_OUT: 6829 /* Only expose this for a draid or raidz leaf */ 6830 if (vd->vdev_ops->vdev_op_leaf && 6831 vd->vdev_top != NULL && 6832 (vd->vdev_top->vdev_ops == 6833 &vdev_raidz_ops || 6834 vd->vdev_top->vdev_ops == 6835 &vdev_draid_ops)) { 6836 vdev_prop_add_list(outnvl, propname, 6837 NULL, vdev_sit_out_reads(vd, 0), 6838 ZPROP_SRC_NONE); 6839 } 6840 continue; 6841 case VDEV_PROP_TRIM_SUPPORT: 6842 /* only valid for leaf vdevs */ 6843 if (vd->vdev_ops->vdev_op_leaf) { 6844 vdev_prop_add_list(outnvl, propname, 6845 NULL, vd->vdev_has_trim, 6846 ZPROP_SRC_NONE); 6847 } 6848 continue; 6849 /* Numeric Properites */ 6850 case VDEV_PROP_ALLOCATING: 6851 /* Leaf vdevs cannot have this property */ 6852 if (vd->vdev_mg == NULL && 6853 vd->vdev_top != NULL) { 6854 src = ZPROP_SRC_NONE; 6855 intval = ZPROP_BOOLEAN_NA; 6856 } else { 6857 err = vdev_prop_get_int(vd, prop, 6858 &intval); 6859 if (err && err != ENOENT) 6860 break; 6861 6862 if (intval == 6863 vdev_prop_default_numeric(prop)) 6864 src = ZPROP_SRC_DEFAULT; 6865 else 6866 src = ZPROP_SRC_LOCAL; 6867 } 6868 6869 vdev_prop_add_list(outnvl, propname, NULL, 6870 intval, src); 6871 break; 6872 case VDEV_PROP_FAILFAST: 6873 src = ZPROP_SRC_LOCAL; 6874 6875 if (objid != 0) { 6876 err = zap_lookup(mos, objid, 6877 nvpair_name(elem), 6878 sizeof (uint64_t), 1, &intval); 6879 } else { 6880 err = ENOENT; 6881 } 6882 
if (err == ENOENT) { 6883 if (vd->vdev_ops == &vdev_root_ops) 6884 intval = 6885 vdev_prop_default_numeric( 6886 prop); 6887 else 6888 intval = ZPROP_BOOLEAN_INHERIT; 6889 err = 0; 6890 } else if (err) { 6891 break; 6892 } 6893 if (intval == ZPROP_BOOLEAN_INHERIT || 6894 (vd->vdev_ops == &vdev_root_ops && 6895 intval == 1)) 6896 src = ZPROP_SRC_DEFAULT; 6897 6898 vdev_prop_add_list(outnvl, propname, strval, 6899 intval, src); 6900 break; 6901 case VDEV_PROP_AUTOSIT: 6902 /* Only raidz vdevs cannot have this property */ 6903 if (vd->vdev_ops != &vdev_raidz_ops && 6904 vd->vdev_ops != &vdev_draid_ops) { 6905 src = ZPROP_SRC_NONE; 6906 intval = ZPROP_BOOLEAN_NA; 6907 } else { 6908 err = vdev_prop_get_int(vd, prop, 6909 &intval); 6910 if (err && err != ENOENT) 6911 break; 6912 6913 if (intval == 6914 vdev_prop_default_numeric(prop)) 6915 src = ZPROP_SRC_DEFAULT; 6916 else 6917 src = ZPROP_SRC_LOCAL; 6918 } 6919 6920 vdev_prop_add_list(outnvl, propname, NULL, 6921 intval, src); 6922 break; 6923 6924 case VDEV_PROP_SLOW_IO_EVENTS: 6925 err = vdev_prop_get_bool(vd, prop, &boolval); 6926 if (err && err != ENOENT) 6927 break; 6928 6929 src = ZPROP_SRC_LOCAL; 6930 if (boolval == vdev_prop_default_numeric(prop)) 6931 src = ZPROP_SRC_DEFAULT; 6932 6933 vdev_prop_add_list(outnvl, propname, NULL, 6934 boolval, src); 6935 break; 6936 case VDEV_PROP_ALLOC_BIAS: 6937 if (vd == vd->vdev_top) { 6938 vdev_prop_add_list(outnvl, propname, 6939 NULL, vd->vdev_alloc_bias, 6940 ZPROP_SRC_NONE); 6941 } 6942 continue; 6943 case VDEV_PROP_ROTATIONAL: 6944 vdev_prop_add_list(outnvl, propname, NULL, 6945 !vd->vdev_nonrot, ZPROP_SRC_NONE); 6946 continue; 6947 case VDEV_PROP_CHECKSUM_N: 6948 case VDEV_PROP_CHECKSUM_T: 6949 case VDEV_PROP_IO_N: 6950 case VDEV_PROP_IO_T: 6951 case VDEV_PROP_SLOW_IO_N: 6952 case VDEV_PROP_SLOW_IO_T: 6953 case VDEV_PROP_SCHEDULER: 6954 err = vdev_prop_get_int(vd, prop, &intval); 6955 if (err && err != ENOENT) 6956 break; 6957 6958 if (intval == 
vdev_prop_default_numeric(prop)) 6959 src = ZPROP_SRC_DEFAULT; 6960 else 6961 src = ZPROP_SRC_LOCAL; 6962 6963 vdev_prop_add_list(outnvl, propname, NULL, 6964 intval, src); 6965 break; 6966 /* Text Properties */ 6967 case VDEV_PROP_COMMENT: 6968 /* Exists in the ZAP below */ 6969 /* FALLTHRU */ 6970 case VDEV_PROP_USERPROP: 6971 /* User Properites */ 6972 if (objid == 0) 6973 continue; 6974 src = ZPROP_SRC_LOCAL; 6975 6976 err = zap_length(mos, objid, nvpair_name(elem), 6977 &integer_size, &num_integers); 6978 if (err) 6979 break; 6980 6981 switch (integer_size) { 6982 case 8: 6983 /* User properties cannot be integers */ 6984 err = EINVAL; 6985 break; 6986 case 1: 6987 /* string property */ 6988 strval = kmem_alloc(num_integers, 6989 KM_SLEEP); 6990 err = zap_lookup(mos, objid, 6991 nvpair_name(elem), 1, 6992 num_integers, strval); 6993 if (err) { 6994 kmem_free(strval, 6995 num_integers); 6996 break; 6997 } 6998 vdev_prop_add_list(outnvl, propname, 6999 strval, 0, src); 7000 kmem_free(strval, num_integers); 7001 break; 7002 } 7003 break; 7004 default: 7005 err = ENOENT; 7006 break; 7007 } 7008 if (err) 7009 break; 7010 } 7011 } else { 7012 /* 7013 * Get all properties from the MOS vdev property object. 
7014 */ 7015 zap_cursor_t zc; 7016 zap_attribute_t *za = zap_attribute_alloc(); 7017 for (zap_cursor_init(&zc, mos, objid); 7018 (err = zap_cursor_retrieve(&zc, za)) == 0; 7019 zap_cursor_advance(&zc)) { 7020 intval = 0; 7021 strval = NULL; 7022 zprop_source_t src = ZPROP_SRC_DEFAULT; 7023 propname = za->za_name; 7024 7025 switch (za->za_integer_length) { 7026 case 8: 7027 /* We do not allow integer user properties */ 7028 /* This is likely an internal value */ 7029 break; 7030 case 1: 7031 /* string property */ 7032 strval = kmem_alloc(za->za_num_integers, 7033 KM_SLEEP); 7034 err = zap_lookup(mos, objid, za->za_name, 1, 7035 za->za_num_integers, strval); 7036 if (err) { 7037 kmem_free(strval, za->za_num_integers); 7038 break; 7039 } 7040 vdev_prop_add_list(outnvl, propname, strval, 0, 7041 src); 7042 kmem_free(strval, za->za_num_integers); 7043 break; 7044 7045 default: 7046 break; 7047 } 7048 } 7049 zap_cursor_fini(&zc); 7050 zap_attribute_free(za); 7051 } 7052 7053 mutex_exit(&spa->spa_props_lock); 7054 spa_config_exit(spa, SCL_CONFIG, FTAG); 7055 7056 if (err && err != ENOENT) { 7057 return (err); 7058 } 7059 7060 return (0); 7061 } 7062 7063 EXPORT_SYMBOL(vdev_fault); 7064 EXPORT_SYMBOL(vdev_degrade); 7065 EXPORT_SYMBOL(vdev_online); 7066 EXPORT_SYMBOL(vdev_offline); 7067 EXPORT_SYMBOL(vdev_clear); 7068 7069 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, 7070 "Target number of metaslabs per top-level vdev"); 7071 7072 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, 7073 "Default lower limit for metaslab size"); 7074 7075 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, 7076 "Default upper limit for metaslab size"); 7077 7078 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, 7079 "Minimum number of metaslabs per top-level vdev"); 7080 7081 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW, 7082 "Practical upper limit of total metaslabs per top-level vdev"); 7083 
7084 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, 7085 "Rate limit slow IO (delay) events to this many per second"); 7086 7087 ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW, 7088 "Rate limit hung IO (deadman) events to this many per second"); 7089 7090 ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW, 7091 "Rate Direct I/O write verify events to this many per second"); 7092 7093 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW, 7094 "Direct I/O writes will perform for checksum verification before " 7095 "commiting write"); 7096 7097 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, 7098 "Rate limit checksum events to this many checksum errors per second " 7099 "(do not set below ZED threshold)."); 7100 7101 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, 7102 "Ignore errors during resilver/scrub"); 7103 7104 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, 7105 "Bypass vdev_validate()"); 7106 7107 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, 7108 "Disable cache flushes"); 7109 7110 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW, 7111 "Minimum number of metaslabs required to dedicate one for log blocks"); 7112 7113 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, 7114 param_set_min_auto_ashift, param_get_uint, ZMOD_RW, 7115 "Minimum ashift used when creating new top-level vdevs"); 7116 7117 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, 7118 param_set_max_auto_ashift, param_get_uint, ZMOD_RW, 7119 "Maximum ashift used when optimizing for logical -> physical sector " 7120 "size on new top-level vdevs"); 7121 7122 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl, 7123 param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW, 7124 "RAIDZ implementation"); 7125