1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* 26 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 27 */ 28 29 /* 30 * This file contains all the routines used when modifying on-disk SPA state. 31 * This includes opening, importing, destroying, exporting a pool, and syncing a 32 * pool. 33 */ 34 35 #include <sys/zfs_context.h> 36 #include <sys/fm/fs/zfs.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zio.h> 39 #include <sys/zio_checksum.h> 40 #include <sys/dmu.h> 41 #include <sys/dmu_tx.h> 42 #include <sys/zap.h> 43 #include <sys/zil.h> 44 #include <sys/ddt.h> 45 #include <sys/vdev_impl.h> 46 #include <sys/metaslab.h> 47 #include <sys/metaslab_impl.h> 48 #include <sys/uberblock_impl.h> 49 #include <sys/txg.h> 50 #include <sys/avl.h> 51 #include <sys/dmu_traverse.h> 52 #include <sys/dmu_objset.h> 53 #include <sys/unique.h> 54 #include <sys/dsl_pool.h> 55 #include <sys/dsl_dataset.h> 56 #include <sys/dsl_dir.h> 57 #include <sys/dsl_prop.h> 58 #include <sys/dsl_synctask.h> 59 #include <sys/fs/zfs.h> 60 #include <sys/arc.h> 61 #include <sys/callb.h> 62 #include <sys/systeminfo.h> 63 #include <sys/spa_boot.h> 64 #include <sys/zfs_ioctl.h> 65 #include <sys/dsl_scan.h> 66 67 #ifdef _KERNEL 68 #include <sys/bootprops.h> 69 #include <sys/callb.h> 70 #include <sys/cpupart.h> 71 #include <sys/pool.h> 72 #include <sys/sysdc.h> 73 #include <sys/zone.h> 74 #endif /* _KERNEL */ 75 76 #include "zfs_prop.h" 77 #include "zfs_comutil.h" 78 79 typedef enum zti_modes { 80 zti_mode_fixed, /* value is # of threads (min 1) */ 81 zti_mode_online_percent, /* value is % of online CPUs */ 82 zti_mode_batch, /* cpu-intensive; value is ignored */ 83 zti_mode_null, /* don't create a taskq */ 84 zti_nmodes 85 } zti_modes_t; 86 87 #define ZTI_FIX(n) { zti_mode_fixed, (n) } 88 #define ZTI_PCT(n) { zti_mode_online_percent, (n) } 89 #define ZTI_BATCH { zti_mode_batch, 0 } 90 #define ZTI_NULL { zti_mode_null, 0 } 91 92 #define ZTI_ONE ZTI_FIX(1) 93 94 typedef struct zio_taskq_info { 95 enum zti_modes zti_mode; 96 uint_t zti_value; 97 } zio_taskq_info_t; 98 99 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 100 "issue", "issue_high", "intr", "intr_high" 101 }; 102 103 /* 104 * Define the taskq threads for the following I/O types: 105 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL 106 */ 107 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 108 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 109 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 110 { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, 111 { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, 112 { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL }, 113 { ZTI_ONE, 
ZTI_NULL, ZTI_ONE, ZTI_NULL }, 114 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, 115 }; 116 117 static dsl_syncfunc_t spa_sync_props; 118 static boolean_t spa_has_active_shared_spare(spa_t *spa); 119 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, 120 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 121 char **ereport); 122 static void spa_vdev_resilver_done(spa_t *spa); 123 124 uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ 125 id_t zio_taskq_psrset_bind = PS_NONE; 126 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 127 uint_t zio_taskq_basedc = 80; /* base duty cycle */ 128 129 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 130 131 /* 132 * This (illegal) pool name is used when temporarily importing a spa_t in order 133 * to get the vdev stats associated with the imported devices. 134 */ 135 #define TRYIMPORT_NAME "$import" 136 137 /* 138 * ========================================================================== 139 * SPA properties routines 140 * ========================================================================== 141 */ 142 143 /* 144 * Add a (source=src, propname=propval) list to an nvlist. 145 */ 146 static void 147 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 148 uint64_t intval, zprop_source_t src) 149 { 150 const char *propname = zpool_prop_to_name(prop); 151 nvlist_t *propval; 152 153 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 154 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 155 156 if (strval != NULL) 157 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 158 else 159 VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); 160 161 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 162 nvlist_free(propval); 163 } 164 165 /* 166 * Get property values from the spa configuration. 167 */ 168 static void 169 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 170 { 171 uint64_t size; 172 uint64_t alloc; 173 uint64_t cap, version; 174 zprop_source_t src = ZPROP_SRC_NONE; 175 spa_config_dirent_t *dp; 176 177 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 178 179 if (spa->spa_root_vdev != NULL) { 180 alloc = metaslab_class_get_alloc(spa_normal_class(spa)); 181 size = metaslab_class_get_space(spa_normal_class(spa)); 182 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 183 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 184 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 185 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 186 size - alloc, src); 187 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 188 (spa_mode(spa) == FREAD), src); 189 190 cap = (size == 0) ? 
0 : (alloc * 100 / size); 191 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 192 193 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 194 ddt_get_pool_dedup_ratio(spa), src); 195 196 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 197 spa->spa_root_vdev->vdev_state, src); 198 199 version = spa_version(spa); 200 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) 201 src = ZPROP_SRC_DEFAULT; 202 else 203 src = ZPROP_SRC_LOCAL; 204 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); 205 } 206 207 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 208 209 if (spa->spa_root != NULL) 210 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 211 0, ZPROP_SRC_LOCAL); 212 213 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 214 if (dp->scd_path == NULL) { 215 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 216 "none", 0, ZPROP_SRC_LOCAL); 217 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 218 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 219 dp->scd_path, 0, ZPROP_SRC_LOCAL); 220 } 221 } 222 } 223 224 /* 225 * Get zpool property values. 226 */ 227 int 228 spa_prop_get(spa_t *spa, nvlist_t **nvp) 229 { 230 objset_t *mos = spa->spa_meta_objset; 231 zap_cursor_t zc; 232 zap_attribute_t za; 233 int err; 234 235 VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); 236 237 mutex_enter(&spa->spa_props_lock); 238 239 /* 240 * Get properties from the spa config. 241 */ 242 spa_prop_get_config(spa, nvp); 243 244 /* If no pool property object, no more prop to get. */ 245 if (mos == NULL || spa->spa_pool_props_object == 0) { 246 mutex_exit(&spa->spa_props_lock); 247 return (0); 248 } 249 250 /* 251 * Get properties from the MOS pool property object. 252 */ 253 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 254 (err = zap_cursor_retrieve(&zc, &za)) == 0; 255 zap_cursor_advance(&zc)) { 256 uint64_t intval = 0; 257 char *strval = NULL; 258 zprop_source_t src = ZPROP_SRC_DEFAULT; 259 zpool_prop_t prop; 260 261 if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) 262 continue; 263 264 switch (za.za_integer_length) { 265 case 8: 266 /* integer property */ 267 if (za.za_first_integer != 268 zpool_prop_default_numeric(prop)) 269 src = ZPROP_SRC_LOCAL; 270 271 if (prop == ZPOOL_PROP_BOOTFS) { 272 dsl_pool_t *dp; 273 dsl_dataset_t *ds = NULL; 274 275 dp = spa_get_dsl(spa); 276 rw_enter(&dp->dp_config_rwlock, RW_READER); 277 if (err = dsl_dataset_hold_obj(dp, 278 za.za_first_integer, FTAG, &ds)) { 279 rw_exit(&dp->dp_config_rwlock); 280 break; 281 } 282 283 strval = kmem_alloc( 284 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, 285 KM_SLEEP); 286 dsl_dataset_name(ds, strval); 287 dsl_dataset_rele(ds, FTAG); 288 rw_exit(&dp->dp_config_rwlock); 289 } else { 290 strval = NULL; 291 intval = za.za_first_integer; 292 } 293 294 spa_prop_add_list(*nvp, prop, strval, intval, src); 295 296 if (strval != NULL) 297 kmem_free(strval, 298 MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); 299 300 break; 301 302 case 1: 303 /* string property */ 304 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 305 err = zap_lookup(mos, spa->spa_pool_props_object, 306 za.za_name, 1, za.za_num_integers, strval); 307 if (err) { 308 kmem_free(strval, za.za_num_integers); 309 break; 310 } 311 spa_prop_add_list(*nvp, prop, strval, 0, src); 312 kmem_free(strval, za.za_num_integers); 313 break; 314 315 default: 316 break; 317 } 318 } 319 zap_cursor_fini(&zc); 320 mutex_exit(&spa->spa_props_lock); 321 out: 322 if (err && err != ENOENT) { 323 nvlist_free(*nvp); 324 *nvp 
= NULL; 325 return (err); 326 } 327 328 return (0); 329 } 330 331 /* 332 * Validate the given pool properties nvlist and modify the list 333 * for the property values to be set. 334 */ 335 static int 336 spa_prop_validate(spa_t *spa, nvlist_t *props) 337 { 338 nvpair_t *elem; 339 int error = 0, reset_bootfs = 0; 340 uint64_t objnum; 341 342 elem = NULL; 343 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 344 zpool_prop_t prop; 345 char *propname, *strval; 346 uint64_t intval; 347 objset_t *os; 348 char *slash; 349 350 propname = nvpair_name(elem); 351 352 if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL) 353 return (EINVAL); 354 355 switch (prop) { 356 case ZPOOL_PROP_VERSION: 357 error = nvpair_value_uint64(elem, &intval); 358 if (!error && 359 (intval < spa_version(spa) || intval > SPA_VERSION)) 360 error = EINVAL; 361 break; 362 363 case ZPOOL_PROP_DELEGATION: 364 case ZPOOL_PROP_AUTOREPLACE: 365 case ZPOOL_PROP_LISTSNAPS: 366 case ZPOOL_PROP_AUTOEXPAND: 367 error = nvpair_value_uint64(elem, &intval); 368 if (!error && intval > 1) 369 error = EINVAL; 370 break; 371 372 case ZPOOL_PROP_BOOTFS: 373 /* 374 * If the pool version is less than SPA_VERSION_BOOTFS, 375 * or the pool is still being created (version == 0), 376 * the bootfs property cannot be set. 377 */ 378 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 379 error = ENOTSUP; 380 break; 381 } 382 383 /* 384 * Make sure the vdev config is bootable 385 */ 386 if (!vdev_is_bootable(spa->spa_root_vdev)) { 387 error = ENOTSUP; 388 break; 389 } 390 391 reset_bootfs = 1; 392 393 error = nvpair_value_string(elem, &strval); 394 395 if (!error) { 396 uint64_t compress; 397 398 if (strval == NULL || strval[0] == '\0') { 399 objnum = zpool_prop_default_numeric( 400 ZPOOL_PROP_BOOTFS); 401 break; 402 } 403 404 if (error = dmu_objset_hold(strval, FTAG, &os)) 405 break; 406 407 /* Must be ZPL and not gzip compressed. */ 408 409 if (dmu_objset_type(os) != DMU_OST_ZFS) { 410 error = ENOTSUP; 411 } else if ((error = dsl_prop_get_integer(strval, 412 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 413 &compress, NULL)) == 0 && 414 !BOOTFS_COMPRESS_VALID(compress)) { 415 error = ENOTSUP; 416 } else { 417 objnum = dmu_objset_id(os); 418 } 419 dmu_objset_rele(os, FTAG); 420 } 421 break; 422 423 case ZPOOL_PROP_FAILUREMODE: 424 error = nvpair_value_uint64(elem, &intval); 425 if (!error && (intval < ZIO_FAILURE_MODE_WAIT || 426 intval > ZIO_FAILURE_MODE_PANIC)) 427 error = EINVAL; 428 429 /* 430 * This is a special case which only occurs when 431 * the pool has completely failed. This allows 432 * the user to change the in-core failmode property 433 * without syncing it out to disk (I/Os might 434 * currently be blocked). We do this by returning 435 * EIO to the caller (spa_prop_set) to trick it 436 * into thinking we encountered a property validation 437 * error. 
438 */ 439 if (!error && spa_suspended(spa)) { 440 spa->spa_failmode = intval; 441 error = EIO; 442 } 443 break; 444 445 case ZPOOL_PROP_CACHEFILE: 446 if ((error = nvpair_value_string(elem, &strval)) != 0) 447 break; 448 449 if (strval[0] == '\0') 450 break; 451 452 if (strcmp(strval, "none") == 0) 453 break; 454 455 if (strval[0] != '/') { 456 error = EINVAL; 457 break; 458 } 459 460 slash = strrchr(strval, '/'); 461 ASSERT(slash != NULL); 462 463 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 464 strcmp(slash, "/..") == 0) 465 error = EINVAL; 466 break; 467 468 case ZPOOL_PROP_DEDUPDITTO: 469 if (spa_version(spa) < SPA_VERSION_DEDUP) 470 error = ENOTSUP; 471 else 472 error = nvpair_value_uint64(elem, &intval); 473 if (error == 0 && 474 intval != 0 && intval < ZIO_DEDUPDITTO_MIN) 475 error = EINVAL; 476 break; 477 } 478 479 if (error) 480 break; 481 } 482 483 if (!error && reset_bootfs) { 484 error = nvlist_remove(props, 485 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 486 487 if (!error) { 488 error = nvlist_add_uint64(props, 489 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 490 } 491 } 492 493 return (error); 494 } 495 496 void 497 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 498 { 499 char *cachefile; 500 spa_config_dirent_t *dp; 501 502 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 503 &cachefile) != 0) 504 return; 505 506 dp = kmem_alloc(sizeof (spa_config_dirent_t), 507 KM_SLEEP); 508 509 if (cachefile[0] == '\0') 510 dp->scd_path = spa_strdup(spa_config_path); 511 else if (strcmp(cachefile, "none") == 0) 512 dp->scd_path = NULL; 513 else 514 dp->scd_path = spa_strdup(cachefile); 515 516 list_insert_head(&spa->spa_config_list, dp); 517 if (need_sync) 518 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 519 } 520 521 int 522 spa_prop_set(spa_t *spa, nvlist_t *nvp) 523 { 524 int error; 525 nvpair_t *elem; 526 boolean_t need_sync = B_FALSE; 527 zpool_prop_t prop; 528 529 if ((error = spa_prop_validate(spa, nvp)) != 0) 530 return (error); 531 532 elem = NULL; 533 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 534 if ((prop = zpool_name_to_prop( 535 nvpair_name(elem))) == ZPROP_INVAL) 536 return (EINVAL); 537 538 if (prop == ZPOOL_PROP_CACHEFILE || 539 prop == ZPOOL_PROP_ALTROOT || 540 prop == ZPOOL_PROP_READONLY) 541 continue; 542 543 need_sync = B_TRUE; 544 break; 545 } 546 547 if (need_sync) 548 return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, 549 spa, nvp, 3)); 550 else 551 return (0); 552 } 553 554 /* 555 * If the bootfs property value is dsobj, clear it. 
556 */ 557 void 558 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 559 { 560 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 561 VERIFY(zap_remove(spa->spa_meta_objset, 562 spa->spa_pool_props_object, 563 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 564 spa->spa_bootfs = 0; 565 } 566 } 567 568 /* 569 * ========================================================================== 570 * SPA state manipulation (open/create/destroy/import/export) 571 * ========================================================================== 572 */ 573 574 static int 575 spa_error_entry_compare(const void *a, const void *b) 576 { 577 spa_error_entry_t *sa = (spa_error_entry_t *)a; 578 spa_error_entry_t *sb = (spa_error_entry_t *)b; 579 int ret; 580 581 ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, 582 sizeof (zbookmark_t)); 583 584 if (ret < 0) 585 return (-1); 586 else if (ret > 0) 587 return (1); 588 else 589 return (0); 590 } 591 592 /* 593 * Utility function which retrieves copies of the current logs and 594 * re-initializes them in the process. 595 */ 596 void 597 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 598 { 599 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 600 601 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 602 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 603 604 avl_create(&spa->spa_errlist_scrub, 605 spa_error_entry_compare, sizeof (spa_error_entry_t), 606 offsetof(spa_error_entry_t, se_avl)); 607 avl_create(&spa->spa_errlist_last, 608 spa_error_entry_compare, sizeof (spa_error_entry_t), 609 offsetof(spa_error_entry_t, se_avl)); 610 } 611 612 static taskq_t * 613 spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, 614 uint_t value) 615 { 616 uint_t flags = 0; 617 boolean_t batch = B_FALSE; 618 619 switch (mode) { 620 case zti_mode_null: 621 return (NULL); /* no taskq needed */ 622 623 case zti_mode_fixed: 624 ASSERT3U(value, >=, 1); 625 value = MAX(value, 1); 626 break; 627 628 case zti_mode_batch: 629 batch = B_TRUE; 630 flags |= TASKQ_THREADS_CPU_PCT; 631 value = zio_taskq_batch_pct; 632 break; 633 634 case zti_mode_online_percent: 635 flags |= TASKQ_THREADS_CPU_PCT; 636 break; 637 638 default: 639 panic("unrecognized mode for %s taskq (%u:%u) in " 640 "spa_activate()", 641 name, mode, value); 642 break; 643 } 644 645 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 646 if (batch) 647 flags |= TASKQ_DC_BATCH; 648 649 return (taskq_create_sysdc(name, value, 50, INT_MAX, 650 spa->spa_proc, zio_taskq_basedc, flags)); 651 } 652 return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, 653 spa->spa_proc, flags)); 654 } 655 656 static void 657 spa_create_zio_taskqs(spa_t *spa) 658 { 659 for (int t = 0; t < ZIO_TYPES; t++) { 660 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 661 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 662 enum zti_modes mode = ztip->zti_mode; 663 uint_t value = ztip->zti_value; 664 char name[32]; 665 666 (void) snprintf(name, sizeof (name), 667 "%s_%s", zio_type_name[t], zio_taskq_types[q]); 668 669 spa->spa_zio_taskq[t][q] = 670 spa_taskq_create(spa, name, mode, value); 671 } 672 } 673 } 674 675 #ifdef _KERNEL 676 static void 677 spa_thread(void *arg) 678 { 679 callb_cpr_t cprinfo; 680 681 spa_t *spa = arg; 682 user_t *pu = PTOU(curproc); 683 684 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 685 spa->spa_name); 686 687 ASSERT(curproc != &p0); 688 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 689 "zpool-%s", spa->spa_name); 690 (void) 
strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 691 692 /* bind this thread to the requested psrset */ 693 if (zio_taskq_psrset_bind != PS_NONE) { 694 pool_lock(); 695 mutex_enter(&cpu_lock); 696 mutex_enter(&pidlock); 697 mutex_enter(&curproc->p_lock); 698 699 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 700 0, NULL, NULL) == 0) { 701 curthread->t_bind_pset = zio_taskq_psrset_bind; 702 } else { 703 cmn_err(CE_WARN, 704 "Couldn't bind process for zfs pool \"%s\" to " 705 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 706 } 707 708 mutex_exit(&curproc->p_lock); 709 mutex_exit(&pidlock); 710 mutex_exit(&cpu_lock); 711 pool_unlock(); 712 } 713 714 if (zio_taskq_sysdc) { 715 sysdc_thread_enter(curthread, 100, 0); 716 } 717 718 spa->spa_proc = curproc; 719 spa->spa_did = curthread->t_did; 720 721 spa_create_zio_taskqs(spa); 722 723 mutex_enter(&spa->spa_proc_lock); 724 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 725 726 spa->spa_proc_state = SPA_PROC_ACTIVE; 727 cv_broadcast(&spa->spa_proc_cv); 728 729 CALLB_CPR_SAFE_BEGIN(&cprinfo); 730 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 731 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 732 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 733 734 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 735 spa->spa_proc_state = SPA_PROC_GONE; 736 spa->spa_proc = &p0; 737 cv_broadcast(&spa->spa_proc_cv); 738 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 739 740 mutex_enter(&curproc->p_lock); 741 lwp_exit(); 742 } 743 #endif 744 745 /* 746 * Activate an uninitialized pool. 747 */ 748 static void 749 spa_activate(spa_t *spa, int mode) 750 { 751 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 752 753 spa->spa_state = POOL_STATE_ACTIVE; 754 spa->spa_mode = mode; 755 756 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 757 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 758 759 /* Try to create a covering process */ 760 mutex_enter(&spa->spa_proc_lock); 761 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 762 ASSERT(spa->spa_proc == &p0); 763 spa->spa_did = 0; 764 765 /* Only create a process if we're going to be around a while. */ 766 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 767 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 768 NULL, 0) == 0) { 769 spa->spa_proc_state = SPA_PROC_CREATED; 770 while (spa->spa_proc_state == SPA_PROC_CREATED) { 771 cv_wait(&spa->spa_proc_cv, 772 &spa->spa_proc_lock); 773 } 774 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 775 ASSERT(spa->spa_proc != &p0); 776 ASSERT(spa->spa_did != 0); 777 } else { 778 #ifdef _KERNEL 779 cmn_err(CE_WARN, 780 "Couldn't create process for zfs pool \"%s\"\n", 781 spa->spa_name); 782 #endif 783 } 784 } 785 mutex_exit(&spa->spa_proc_lock); 786 787 /* If we didn't create a process, we need to create our taskqs. 
*/ 788 if (spa->spa_proc == &p0) { 789 spa_create_zio_taskqs(spa); 790 } 791 792 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 793 offsetof(vdev_t, vdev_config_dirty_node)); 794 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 795 offsetof(vdev_t, vdev_state_dirty_node)); 796 797 txg_list_create(&spa->spa_vdev_txg_list, 798 offsetof(struct vdev, vdev_txg_node)); 799 800 avl_create(&spa->spa_errlist_scrub, 801 spa_error_entry_compare, sizeof (spa_error_entry_t), 802 offsetof(spa_error_entry_t, se_avl)); 803 avl_create(&spa->spa_errlist_last, 804 spa_error_entry_compare, sizeof (spa_error_entry_t), 805 offsetof(spa_error_entry_t, se_avl)); 806 } 807 808 /* 809 * Opposite of spa_activate(). 810 */ 811 static void 812 spa_deactivate(spa_t *spa) 813 { 814 ASSERT(spa->spa_sync_on == B_FALSE); 815 ASSERT(spa->spa_dsl_pool == NULL); 816 ASSERT(spa->spa_root_vdev == NULL); 817 ASSERT(spa->spa_async_zio_root == NULL); 818 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 819 820 txg_list_destroy(&spa->spa_vdev_txg_list); 821 822 list_destroy(&spa->spa_config_dirty_list); 823 list_destroy(&spa->spa_state_dirty_list); 824 825 for (int t = 0; t < ZIO_TYPES; t++) { 826 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 827 if (spa->spa_zio_taskq[t][q] != NULL) 828 taskq_destroy(spa->spa_zio_taskq[t][q]); 829 spa->spa_zio_taskq[t][q] = NULL; 830 } 831 } 832 833 metaslab_class_destroy(spa->spa_normal_class); 834 spa->spa_normal_class = NULL; 835 836 metaslab_class_destroy(spa->spa_log_class); 837 spa->spa_log_class = NULL; 838 839 /* 840 * If this was part of an import or the open otherwise failed, we may 841 * still have errors left in the queues. Empty them just in case. 842 */ 843 spa_errlog_drain(spa); 844 845 avl_destroy(&spa->spa_errlist_scrub); 846 avl_destroy(&spa->spa_errlist_last); 847 848 spa->spa_state = POOL_STATE_UNINITIALIZED; 849 850 mutex_enter(&spa->spa_proc_lock); 851 if (spa->spa_proc_state != SPA_PROC_NONE) { 852 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 853 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 854 cv_broadcast(&spa->spa_proc_cv); 855 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 856 ASSERT(spa->spa_proc != &p0); 857 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 858 } 859 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 860 spa->spa_proc_state = SPA_PROC_NONE; 861 } 862 ASSERT(spa->spa_proc == &p0); 863 mutex_exit(&spa->spa_proc_lock); 864 865 /* 866 * We want to make sure spa_thread() has actually exited the ZFS 867 * module, so that the module can't be unloaded out from underneath 868 * it. 869 */ 870 if (spa->spa_did != 0) { 871 thread_join(spa->spa_did); 872 spa->spa_did = 0; 873 } 874 } 875 876 /* 877 * Verify a pool configuration, and construct the vdev tree appropriately. This 878 * will create all the necessary vdevs in the appropriate layout, with each vdev 879 * in the CLOSED state. This will prep the pool before open/creation/import. 880 * All vdev validation is done by the vdev_alloc() routine. 
881 */ 882 static int 883 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 884 uint_t id, int atype) 885 { 886 nvlist_t **child; 887 uint_t children; 888 int error; 889 890 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 891 return (error); 892 893 if ((*vdp)->vdev_ops->vdev_op_leaf) 894 return (0); 895 896 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 897 &child, &children); 898 899 if (error == ENOENT) 900 return (0); 901 902 if (error) { 903 vdev_free(*vdp); 904 *vdp = NULL; 905 return (EINVAL); 906 } 907 908 for (int c = 0; c < children; c++) { 909 vdev_t *vd; 910 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 911 atype)) != 0) { 912 vdev_free(*vdp); 913 *vdp = NULL; 914 return (error); 915 } 916 } 917 918 ASSERT(*vdp != NULL); 919 920 return (0); 921 } 922 923 /* 924 * Opposite of spa_load(). 925 */ 926 static void 927 spa_unload(spa_t *spa) 928 { 929 int i; 930 931 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 932 933 /* 934 * Stop async tasks. 935 */ 936 spa_async_suspend(spa); 937 938 /* 939 * Stop syncing. 940 */ 941 if (spa->spa_sync_on) { 942 txg_sync_stop(spa->spa_dsl_pool); 943 spa->spa_sync_on = B_FALSE; 944 } 945 946 /* 947 * Wait for any outstanding async I/O to complete. 948 */ 949 if (spa->spa_async_zio_root != NULL) { 950 (void) zio_wait(spa->spa_async_zio_root); 951 spa->spa_async_zio_root = NULL; 952 } 953 954 bpobj_close(&spa->spa_deferred_bpobj); 955 956 /* 957 * Close the dsl pool. 958 */ 959 if (spa->spa_dsl_pool) { 960 dsl_pool_close(spa->spa_dsl_pool); 961 spa->spa_dsl_pool = NULL; 962 spa->spa_meta_objset = NULL; 963 } 964 965 ddt_unload(spa); 966 967 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 968 969 /* 970 * Drop and purge level 2 cache 971 */ 972 spa_l2cache_drop(spa); 973 974 /* 975 * Close all vdevs. 976 */ 977 if (spa->spa_root_vdev) 978 vdev_free(spa->spa_root_vdev); 979 ASSERT(spa->spa_root_vdev == NULL); 980 981 for (i = 0; i < spa->spa_spares.sav_count; i++) 982 vdev_free(spa->spa_spares.sav_vdevs[i]); 983 if (spa->spa_spares.sav_vdevs) { 984 kmem_free(spa->spa_spares.sav_vdevs, 985 spa->spa_spares.sav_count * sizeof (void *)); 986 spa->spa_spares.sav_vdevs = NULL; 987 } 988 if (spa->spa_spares.sav_config) { 989 nvlist_free(spa->spa_spares.sav_config); 990 spa->spa_spares.sav_config = NULL; 991 } 992 spa->spa_spares.sav_count = 0; 993 994 for (i = 0; i < spa->spa_l2cache.sav_count; i++) 995 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 996 if (spa->spa_l2cache.sav_vdevs) { 997 kmem_free(spa->spa_l2cache.sav_vdevs, 998 spa->spa_l2cache.sav_count * sizeof (void *)); 999 spa->spa_l2cache.sav_vdevs = NULL; 1000 } 1001 if (spa->spa_l2cache.sav_config) { 1002 nvlist_free(spa->spa_l2cache.sav_config); 1003 spa->spa_l2cache.sav_config = NULL; 1004 } 1005 spa->spa_l2cache.sav_count = 0; 1006 1007 spa->spa_async_suspended = 0; 1008 1009 spa_config_exit(spa, SCL_ALL, FTAG); 1010 } 1011 1012 /* 1013 * Load (or re-load) the current list of vdevs describing the active spares for 1014 * this pool. When this is called, we have some form of basic information in 1015 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1016 * then re-generate a more complete list including status information. 1017 */ 1018 static void 1019 spa_load_spares(spa_t *spa) 1020 { 1021 nvlist_t **spares; 1022 uint_t nspares; 1023 int i; 1024 vdev_t *vd, *tvd; 1025 1026 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1027 1028 /* 1029 * First, close and free any existing spare vdevs. 
1030 */ 1031 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1032 vd = spa->spa_spares.sav_vdevs[i]; 1033 1034 /* Undo the call to spa_activate() below */ 1035 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1036 B_FALSE)) != NULL && tvd->vdev_isspare) 1037 spa_spare_remove(tvd); 1038 vdev_close(vd); 1039 vdev_free(vd); 1040 } 1041 1042 if (spa->spa_spares.sav_vdevs) 1043 kmem_free(spa->spa_spares.sav_vdevs, 1044 spa->spa_spares.sav_count * sizeof (void *)); 1045 1046 if (spa->spa_spares.sav_config == NULL) 1047 nspares = 0; 1048 else 1049 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1050 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 1051 1052 spa->spa_spares.sav_count = (int)nspares; 1053 spa->spa_spares.sav_vdevs = NULL; 1054 1055 if (nspares == 0) 1056 return; 1057 1058 /* 1059 * Construct the array of vdevs, opening them to get status in the 1060 * process. For each spare, there is potentially two different vdev_t 1061 * structures associated with it: one in the list of spares (used only 1062 * for basic validation purposes) and one in the active vdev 1063 * configuration (if it's spared in). During this phase we open and 1064 * validate each vdev on the spare list. If the vdev also exists in the 1065 * active configuration, then we also mark this vdev as an active spare. 1066 */ 1067 spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), 1068 KM_SLEEP); 1069 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1070 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1071 VDEV_ALLOC_SPARE) == 0); 1072 ASSERT(vd != NULL); 1073 1074 spa->spa_spares.sav_vdevs[i] = vd; 1075 1076 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1077 B_FALSE)) != NULL) { 1078 if (!tvd->vdev_isspare) 1079 spa_spare_add(tvd); 1080 1081 /* 1082 * We only mark the spare active if we were successfully 1083 * able to load the vdev. Otherwise, importing a pool 1084 * with a bad active spare would result in strange 1085 * behavior, because multiple pool would think the spare 1086 * is actively in use. 1087 * 1088 * There is a vulnerability here to an equally bizarre 1089 * circumstance, where a dead active spare is later 1090 * brought back to life (onlined or otherwise). Given 1091 * the rarity of this scenario, and the extra complexity 1092 * it adds, we ignore the possibility. 1093 */ 1094 if (!vdev_is_dead(tvd)) 1095 spa_spare_activate(tvd); 1096 } 1097 1098 vd->vdev_top = vd; 1099 vd->vdev_aux = &spa->spa_spares; 1100 1101 if (vdev_open(vd) != 0) 1102 continue; 1103 1104 if (vdev_validate_aux(vd) == 0) 1105 spa_spare_add(vd); 1106 } 1107 1108 /* 1109 * Recompute the stashed list of spares, with status information 1110 * this time. 1111 */ 1112 VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, 1113 DATA_TYPE_NVLIST_ARRAY) == 0); 1114 1115 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1116 KM_SLEEP); 1117 for (i = 0; i < spa->spa_spares.sav_count; i++) 1118 spares[i] = vdev_config_generate(spa, 1119 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1120 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 1121 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); 1122 for (i = 0; i < spa->spa_spares.sav_count; i++) 1123 nvlist_free(spares[i]); 1124 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1125 } 1126 1127 /* 1128 * Load (or re-load) the current list of vdevs describing the active l2cache for 1129 * this pool. 
When this is called, we have some form of basic information in 1130 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1131 * then re-generate a more complete list including status information. 1132 * Devices which are already active have their details maintained, and are 1133 * not re-opened. 1134 */ 1135 static void 1136 spa_load_l2cache(spa_t *spa) 1137 { 1138 nvlist_t **l2cache; 1139 uint_t nl2cache; 1140 int i, j, oldnvdevs; 1141 uint64_t guid; 1142 vdev_t *vd, **oldvdevs, **newvdevs; 1143 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1144 1145 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1146 1147 if (sav->sav_config != NULL) { 1148 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 1149 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 1150 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1151 } else { 1152 nl2cache = 0; 1153 } 1154 1155 oldvdevs = sav->sav_vdevs; 1156 oldnvdevs = sav->sav_count; 1157 sav->sav_vdevs = NULL; 1158 sav->sav_count = 0; 1159 1160 /* 1161 * Process new nvlist of vdevs. 1162 */ 1163 for (i = 0; i < nl2cache; i++) { 1164 VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, 1165 &guid) == 0); 1166 1167 newvdevs[i] = NULL; 1168 for (j = 0; j < oldnvdevs; j++) { 1169 vd = oldvdevs[j]; 1170 if (vd != NULL && guid == vd->vdev_guid) { 1171 /* 1172 * Retain previous vdev for add/remove ops. 1173 */ 1174 newvdevs[i] = vd; 1175 oldvdevs[j] = NULL; 1176 break; 1177 } 1178 } 1179 1180 if (newvdevs[i] == NULL) { 1181 /* 1182 * Create new vdev 1183 */ 1184 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1185 VDEV_ALLOC_L2CACHE) == 0); 1186 ASSERT(vd != NULL); 1187 newvdevs[i] = vd; 1188 1189 /* 1190 * Commit this vdev as an l2cache device, 1191 * even if it fails to open. 1192 */ 1193 spa_l2cache_add(vd); 1194 1195 vd->vdev_top = vd; 1196 vd->vdev_aux = sav; 1197 1198 spa_l2cache_activate(vd); 1199 1200 if (vdev_open(vd) != 0) 1201 continue; 1202 1203 (void) vdev_validate_aux(vd); 1204 1205 if (!vdev_is_dead(vd)) 1206 l2arc_add_vdev(spa, vd); 1207 } 1208 } 1209 1210 /* 1211 * Purge vdevs that were dropped 1212 */ 1213 for (i = 0; i < oldnvdevs; i++) { 1214 uint64_t pool; 1215 1216 vd = oldvdevs[i]; 1217 if (vd != NULL) { 1218 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1219 pool != 0ULL && l2arc_vdev_present(vd)) 1220 l2arc_remove_vdev(vd); 1221 (void) vdev_close(vd); 1222 spa_l2cache_remove(vd); 1223 } 1224 } 1225 1226 if (oldvdevs) 1227 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 1228 1229 if (sav->sav_config == NULL) 1230 goto out; 1231 1232 sav->sav_vdevs = newvdevs; 1233 sav->sav_count = (int)nl2cache; 1234 1235 /* 1236 * Recompute the stashed list of l2cache devices, with status 1237 * information this time. 
1238 */ 1239 VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1240 DATA_TYPE_NVLIST_ARRAY) == 0); 1241 1242 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 1243 for (i = 0; i < sav->sav_count; i++) 1244 l2cache[i] = vdev_config_generate(spa, 1245 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1246 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 1247 ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); 1248 out: 1249 for (i = 0; i < sav->sav_count; i++) 1250 nvlist_free(l2cache[i]); 1251 if (sav->sav_count) 1252 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 1253 } 1254 1255 static int 1256 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 1257 { 1258 dmu_buf_t *db; 1259 char *packed = NULL; 1260 size_t nvsize = 0; 1261 int error; 1262 *value = NULL; 1263 1264 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 1265 nvsize = *(uint64_t *)db->db_data; 1266 dmu_buf_rele(db, FTAG); 1267 1268 packed = kmem_alloc(nvsize, KM_SLEEP); 1269 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 1270 DMU_READ_PREFETCH); 1271 if (error == 0) 1272 error = nvlist_unpack(packed, nvsize, value, 0); 1273 kmem_free(packed, nvsize); 1274 1275 return (error); 1276 } 1277 1278 /* 1279 * Checks to see if the given vdev could not be opened, in which case we post a 1280 * sysevent to notify the autoreplace code that the device has been removed. 1281 */ 1282 static void 1283 spa_check_removed(vdev_t *vd) 1284 { 1285 for (int c = 0; c < vd->vdev_children; c++) 1286 spa_check_removed(vd->vdev_child[c]); 1287 1288 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { 1289 zfs_post_autoreplace(vd->vdev_spa, vd); 1290 spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); 1291 } 1292 } 1293 1294 /* 1295 * Validate the current config against the MOS config 1296 */ 1297 static boolean_t 1298 spa_config_valid(spa_t *spa, nvlist_t *config) 1299 { 1300 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 1301 nvlist_t *nv; 1302 1303 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); 1304 1305 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1306 VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); 1307 1308 ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); 1309 1310 /* 1311 * If we're doing a normal import, then build up any additional 1312 * diagnostic information about missing devices in this config. 1313 * We'll pass this up to the user for further processing. 
1314 */ 1315 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1316 nvlist_t **child, *nv; 1317 uint64_t idx = 0; 1318 1319 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1320 KM_SLEEP); 1321 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1322 1323 for (int c = 0; c < rvd->vdev_children; c++) { 1324 vdev_t *tvd = rvd->vdev_child[c]; 1325 vdev_t *mtvd = mrvd->vdev_child[c]; 1326 1327 if (tvd->vdev_ops == &vdev_missing_ops && 1328 mtvd->vdev_ops != &vdev_missing_ops && 1329 mtvd->vdev_islog) 1330 child[idx++] = vdev_config_generate(spa, mtvd, 1331 B_FALSE, 0); 1332 } 1333 1334 if (idx) { 1335 VERIFY(nvlist_add_nvlist_array(nv, 1336 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1337 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1338 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1339 1340 for (int i = 0; i < idx; i++) 1341 nvlist_free(child[i]); 1342 } 1343 nvlist_free(nv); 1344 kmem_free(child, rvd->vdev_children * sizeof (char **)); 1345 } 1346 1347 /* 1348 * Compare the root vdev tree with the information we have 1349 * from the MOS config (mrvd). Check each top-level vdev 1350 * with the corresponding MOS config top-level (mtvd). 1351 */ 1352 for (int c = 0; c < rvd->vdev_children; c++) { 1353 vdev_t *tvd = rvd->vdev_child[c]; 1354 vdev_t *mtvd = mrvd->vdev_child[c]; 1355 1356 /* 1357 * Resolve any "missing" vdevs in the current configuration. 1358 * If we find that the MOS config has more accurate information 1359 * about the top-level vdev then use that vdev instead. 1360 */ 1361 if (tvd->vdev_ops == &vdev_missing_ops && 1362 mtvd->vdev_ops != &vdev_missing_ops) { 1363 1364 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1365 continue; 1366 1367 /* 1368 * Device specific actions. 1369 */ 1370 if (mtvd->vdev_islog) { 1371 spa_set_log_state(spa, SPA_LOG_CLEAR); 1372 } else { 1373 /* 1374 * XXX - once we have 'readonly' pool 1375 * support we should be able to handle 1376 * missing data devices by transitioning 1377 * the pool to readonly. 1378 */ 1379 continue; 1380 } 1381 1382 /* 1383 * Swap the missing vdev with the data we were 1384 * able to obtain from the MOS config. 1385 */ 1386 vdev_remove_child(rvd, tvd); 1387 vdev_remove_child(mrvd, mtvd); 1388 1389 vdev_add_child(rvd, mtvd); 1390 vdev_add_child(mrvd, tvd); 1391 1392 spa_config_exit(spa, SCL_ALL, FTAG); 1393 vdev_load(mtvd); 1394 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1395 1396 vdev_reopen(rvd); 1397 } else if (mtvd->vdev_islog) { 1398 /* 1399 * Load the slog device's state from the MOS config 1400 * since it's possible that the label does not 1401 * contain the most up-to-date information. 1402 */ 1403 vdev_load_log_state(tvd, mtvd); 1404 vdev_reopen(tvd); 1405 } 1406 } 1407 vdev_free(mrvd); 1408 spa_config_exit(spa, SCL_ALL, FTAG); 1409 1410 /* 1411 * Ensure we were able to validate the config. 
1412 */ 1413 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1414 } 1415 1416 /* 1417 * Check for missing log devices 1418 */ 1419 static int 1420 spa_check_logs(spa_t *spa) 1421 { 1422 switch (spa->spa_log_state) { 1423 case SPA_LOG_MISSING: 1424 /* need to recheck in case slog has been restored */ 1425 case SPA_LOG_UNKNOWN: 1426 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1427 DS_FIND_CHILDREN)) { 1428 spa_set_log_state(spa, SPA_LOG_MISSING); 1429 return (1); 1430 } 1431 break; 1432 } 1433 return (0); 1434 } 1435 1436 static boolean_t 1437 spa_passivate_log(spa_t *spa) 1438 { 1439 vdev_t *rvd = spa->spa_root_vdev; 1440 boolean_t slog_found = B_FALSE; 1441 1442 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1443 1444 if (!spa_has_slogs(spa)) 1445 return (B_FALSE); 1446 1447 for (int c = 0; c < rvd->vdev_children; c++) { 1448 vdev_t *tvd = rvd->vdev_child[c]; 1449 metaslab_group_t *mg = tvd->vdev_mg; 1450 1451 if (tvd->vdev_islog) { 1452 metaslab_group_passivate(mg); 1453 slog_found = B_TRUE; 1454 } 1455 } 1456 1457 return (slog_found); 1458 } 1459 1460 static void 1461 spa_activate_log(spa_t *spa) 1462 { 1463 vdev_t *rvd = spa->spa_root_vdev; 1464 1465 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1466 1467 for (int c = 0; c < rvd->vdev_children; c++) { 1468 vdev_t *tvd = rvd->vdev_child[c]; 1469 metaslab_group_t *mg = tvd->vdev_mg; 1470 1471 if (tvd->vdev_islog) 1472 metaslab_group_activate(mg); 1473 } 1474 } 1475 1476 int 1477 spa_offline_log(spa_t *spa) 1478 { 1479 int error = 0; 1480 1481 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1482 NULL, DS_FIND_CHILDREN)) == 0) { 1483 1484 /* 1485 * We successfully offlined the log device, sync out the 1486 * current txg so that the "stubby" block can be removed 1487 * by zil_sync(). 
1488 */ 1489 txg_wait_synced(spa->spa_dsl_pool, 0); 1490 } 1491 return (error); 1492 } 1493 1494 static void 1495 spa_aux_check_removed(spa_aux_vdev_t *sav) 1496 { 1497 for (int i = 0; i < sav->sav_count; i++) 1498 spa_check_removed(sav->sav_vdevs[i]); 1499 } 1500 1501 void 1502 spa_claim_notify(zio_t *zio) 1503 { 1504 spa_t *spa = zio->io_spa; 1505 1506 if (zio->io_error) 1507 return; 1508 1509 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1510 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1511 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1512 mutex_exit(&spa->spa_props_lock); 1513 } 1514 1515 typedef struct spa_load_error { 1516 uint64_t sle_meta_count; 1517 uint64_t sle_data_count; 1518 } spa_load_error_t; 1519 1520 static void 1521 spa_load_verify_done(zio_t *zio) 1522 { 1523 blkptr_t *bp = zio->io_bp; 1524 spa_load_error_t *sle = zio->io_private; 1525 dmu_object_type_t type = BP_GET_TYPE(bp); 1526 int error = zio->io_error; 1527 1528 if (error) { 1529 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && 1530 type != DMU_OT_INTENT_LOG) 1531 atomic_add_64(&sle->sle_meta_count, 1); 1532 else 1533 atomic_add_64(&sle->sle_data_count, 1); 1534 } 1535 zio_data_buf_free(zio->io_data, zio->io_size); 1536 } 1537 1538 /*ARGSUSED*/ 1539 static int 1540 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1541 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1542 { 1543 if (bp != NULL) { 1544 zio_t *rio = arg; 1545 size_t size = BP_GET_PSIZE(bp); 1546 void *data = zio_data_buf_alloc(size); 1547 1548 zio_nowait(zio_read(rio, spa, bp, data, size, 1549 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1550 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1551 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1552 } 1553 return (0); 1554 } 1555 1556 static int 1557 spa_load_verify(spa_t *spa) 1558 { 1559 zio_t *rio; 1560 spa_load_error_t sle = { 0 }; 1561 zpool_rewind_policy_t policy; 1562 boolean_t verify_ok = B_FALSE; 1563 int error; 1564 1565 zpool_get_rewind_policy(spa->spa_config, &policy); 1566 1567 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1568 return (0); 1569 1570 rio = zio_root(spa, NULL, &sle, 1571 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1572 1573 error = traverse_pool(spa, spa->spa_verify_min_txg, 1574 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1575 1576 (void) zio_wait(rio); 1577 1578 spa->spa_load_meta_errors = sle.sle_meta_count; 1579 spa->spa_load_data_errors = sle.sle_data_count; 1580 1581 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1582 sle.sle_data_count <= policy.zrp_maxdata) { 1583 int64_t loss = 0; 1584 1585 verify_ok = B_TRUE; 1586 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1587 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1588 1589 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1590 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1591 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1592 VERIFY(nvlist_add_int64(spa->spa_load_info, 1593 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1594 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1595 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1596 } else { 1597 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1598 } 1599 1600 if (error) { 1601 if (error != ENXIO && error != EIO) 1602 error = EIO; 1603 return (error); 1604 } 1605 1606 return (verify_ok ? 0 : EIO); 1607 } 1608 1609 /* 1610 * Find a value in the pool props object. 
1611 */ 1612 static void 1613 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1614 { 1615 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1616 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1617 } 1618 1619 /* 1620 * Find a value in the pool directory object. 1621 */ 1622 static int 1623 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1624 { 1625 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1626 name, sizeof (uint64_t), 1, val)); 1627 } 1628 1629 static int 1630 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1631 { 1632 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1633 return (err); 1634 } 1635 1636 /* 1637 * Fix up config after a partly-completed split. This is done with the 1638 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1639 * pool have that entry in their config, but only the splitting one contains 1640 * a list of all the guids of the vdevs that are being split off. 1641 * 1642 * This function determines what to do with that list: either rejoin 1643 * all the disks to the pool, or complete the splitting process. To attempt 1644 * the rejoin, each disk that is offlined is marked online again, and 1645 * we do a reopen() call. If the vdev label for every disk that was 1646 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1647 * then we call vdev_split() on each disk, and complete the split. 1648 * 1649 * Otherwise we leave the config alone, with all the vdevs in place in 1650 * the original pool. 1651 */ 1652 static void 1653 spa_try_repair(spa_t *spa, nvlist_t *config) 1654 { 1655 uint_t extracted; 1656 uint64_t *glist; 1657 uint_t i, gcount; 1658 nvlist_t *nvl; 1659 vdev_t **vd; 1660 boolean_t attempt_reopen; 1661 1662 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1663 return; 1664 1665 /* check that the config is complete */ 1666 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1667 &glist, &gcount) != 0) 1668 return; 1669 1670 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1671 1672 /* attempt to online all the vdevs & validate */ 1673 attempt_reopen = B_TRUE; 1674 for (i = 0; i < gcount; i++) { 1675 if (glist[i] == 0) /* vdev is hole */ 1676 continue; 1677 1678 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1679 if (vd[i] == NULL) { 1680 /* 1681 * Don't bother attempting to reopen the disks; 1682 * just do the split. 1683 */ 1684 attempt_reopen = B_FALSE; 1685 } else { 1686 /* attempt to re-online it */ 1687 vd[i]->vdev_offline = B_FALSE; 1688 } 1689 } 1690 1691 if (attempt_reopen) { 1692 vdev_reopen(spa->spa_root_vdev); 1693 1694 /* check each device to see what state it's in */ 1695 for (extracted = 0, i = 0; i < gcount; i++) { 1696 if (vd[i] != NULL && 1697 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1698 break; 1699 ++extracted; 1700 } 1701 } 1702 1703 /* 1704 * If every disk has been moved to the new pool, or if we never 1705 * even attempted to look at them, then we split them off for 1706 * good. 
1707 */ 1708 if (!attempt_reopen || gcount == extracted) { 1709 for (i = 0; i < gcount; i++) 1710 if (vd[i] != NULL) 1711 vdev_split(vd[i]); 1712 vdev_reopen(spa->spa_root_vdev); 1713 } 1714 1715 kmem_free(vd, gcount * sizeof (vdev_t *)); 1716 } 1717 1718 static int 1719 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1720 boolean_t mosconfig) 1721 { 1722 nvlist_t *config = spa->spa_config; 1723 char *ereport = FM_EREPORT_ZFS_POOL; 1724 int error; 1725 uint64_t pool_guid; 1726 nvlist_t *nvl; 1727 1728 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1729 return (EINVAL); 1730 1731 /* 1732 * Versioning wasn't explicitly added to the label until later, so if 1733 * it's not present treat it as the initial version. 1734 */ 1735 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1736 &spa->spa_ubsync.ub_version) != 0) 1737 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1738 1739 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1740 &spa->spa_config_txg); 1741 1742 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1743 spa_guid_exists(pool_guid, 0)) { 1744 error = EEXIST; 1745 } else { 1746 spa->spa_load_guid = pool_guid; 1747 1748 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1749 &nvl) == 0) { 1750 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1751 KM_SLEEP) == 0); 1752 } 1753 1754 gethrestime(&spa->spa_loaded_ts); 1755 error = spa_load_impl(spa, pool_guid, config, state, type, 1756 mosconfig, &ereport); 1757 } 1758 1759 spa->spa_minref = refcount_count(&spa->spa_refcount); 1760 if (error) { 1761 if (error != EEXIST) { 1762 spa->spa_loaded_ts.tv_sec = 0; 1763 spa->spa_loaded_ts.tv_nsec = 0; 1764 } 1765 if (error != EBADF) { 1766 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1767 } 1768 } 1769 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1770 spa->spa_ena = 0; 1771 1772 return (error); 1773 } 1774 1775 /* 1776 * Load an existing storage pool, using the pool's builtin spa_config as a 1777 * source of configuration information. 1778 */ 1779 static int 1780 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1781 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1782 char **ereport) 1783 { 1784 int error = 0; 1785 nvlist_t *nvroot = NULL; 1786 vdev_t *rvd; 1787 uberblock_t *ub = &spa->spa_uberblock; 1788 uint64_t children, config_cache_txg = spa->spa_config_txg; 1789 int orig_mode = spa->spa_mode; 1790 int parse; 1791 uint64_t obj; 1792 1793 /* 1794 * If this is an untrusted config, access the pool in read-only mode. 1795 * This prevents things like resilvering recently removed devices. 1796 */ 1797 if (!mosconfig) 1798 spa->spa_mode = FREAD; 1799 1800 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1801 1802 spa->spa_load_state = state; 1803 1804 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1805 return (EINVAL); 1806 1807 parse = (type == SPA_IMPORT_EXISTING ? 1808 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1809 1810 /* 1811 * Create "The Godfather" zio to hold all async IOs 1812 */ 1813 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1814 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1815 1816 /* 1817 * Parse the configuration into a vdev tree. We explicitly set the 1818 * value that will be returned by spa_version() since parsing the 1819 * configuration requires knowing the version number. 
1820 */ 1821 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1822 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 1823 spa_config_exit(spa, SCL_ALL, FTAG); 1824 1825 if (error != 0) 1826 return (error); 1827 1828 ASSERT(spa->spa_root_vdev == rvd); 1829 1830 if (type != SPA_IMPORT_ASSEMBLE) { 1831 ASSERT(spa_guid(spa) == pool_guid); 1832 } 1833 1834 /* 1835 * Try to open all vdevs, loading each label in the process. 1836 */ 1837 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1838 error = vdev_open(rvd); 1839 spa_config_exit(spa, SCL_ALL, FTAG); 1840 if (error != 0) 1841 return (error); 1842 1843 /* 1844 * We need to validate the vdev labels against the configuration that 1845 * we have in hand, which is dependent on the setting of mosconfig. If 1846 * mosconfig is true then we're validating the vdev labels based on 1847 * that config. Otherwise, we're validating against the cached config 1848 * (zpool.cache) that was read when we loaded the zfs module, and then 1849 * later we will recursively call spa_load() and validate against 1850 * the vdev config. 1851 * 1852 * If we're assembling a new pool that's been split off from an 1853 * existing pool, the labels haven't yet been updated so we skip 1854 * validation for now. 1855 */ 1856 if (type != SPA_IMPORT_ASSEMBLE) { 1857 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1858 error = vdev_validate(rvd); 1859 spa_config_exit(spa, SCL_ALL, FTAG); 1860 1861 if (error != 0) 1862 return (error); 1863 1864 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1865 return (ENXIO); 1866 } 1867 1868 /* 1869 * Find the best uberblock. 1870 */ 1871 vdev_uberblock_load(NULL, rvd, ub); 1872 1873 /* 1874 * If we weren't able to find a single valid uberblock, return failure. 1875 */ 1876 if (ub->ub_txg == 0) 1877 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1878 1879 /* 1880 * If the pool is newer than the code, we can't open it. 1881 */ 1882 if (ub->ub_version > SPA_VERSION) 1883 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1884 1885 /* 1886 * If the vdev guid sum doesn't match the uberblock, we have an 1887 * incomplete configuration. We first check to see if the pool 1888 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 1889 * If it is, defer the vdev_guid_sum check till later so we 1890 * can handle missing vdevs. 1891 */ 1892 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 1893 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 1894 rvd->vdev_guid_sum != ub->ub_guid_sum) 1895 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1896 1897 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1898 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1899 spa_try_repair(spa, config); 1900 spa_config_exit(spa, SCL_ALL, FTAG); 1901 nvlist_free(spa->spa_config_splitting); 1902 spa->spa_config_splitting = NULL; 1903 } 1904 1905 /* 1906 * Initialize internal SPA structures. 1907 */ 1908 spa->spa_state = POOL_STATE_ACTIVE; 1909 spa->spa_ubsync = spa->spa_uberblock; 1910 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 1911 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 1912 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
1913 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 1914 spa->spa_claim_max_txg = spa->spa_first_txg; 1915 spa->spa_prev_software_version = ub->ub_software_version; 1916 1917 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1918 if (error) 1919 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1920 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1921 1922 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 1923 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1924 1925 if (!mosconfig) { 1926 uint64_t hostid; 1927 nvlist_t *policy = NULL, *nvconfig; 1928 1929 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 1930 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1931 1932 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 1933 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 1934 char *hostname; 1935 unsigned long myhostid = 0; 1936 1937 VERIFY(nvlist_lookup_string(nvconfig, 1938 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 1939 1940 #ifdef _KERNEL 1941 myhostid = zone_get_hostid(NULL); 1942 #else /* _KERNEL */ 1943 /* 1944 * We're emulating the system's hostid in userland, so 1945 * we can't use zone_get_hostid(). 1946 */ 1947 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 1948 #endif /* _KERNEL */ 1949 if (hostid != 0 && myhostid != 0 && 1950 hostid != myhostid) { 1951 nvlist_free(nvconfig); 1952 cmn_err(CE_WARN, "pool '%s' could not be " 1953 "loaded as it was last accessed by " 1954 "another system (host: %s hostid: 0x%lx). " 1955 "See: http://www.sun.com/msg/ZFS-8000-EY", 1956 spa_name(spa), hostname, 1957 (unsigned long)hostid); 1958 return (EBADF); 1959 } 1960 } 1961 if (nvlist_lookup_nvlist(spa->spa_config, 1962 ZPOOL_REWIND_POLICY, &policy) == 0) 1963 VERIFY(nvlist_add_nvlist(nvconfig, 1964 ZPOOL_REWIND_POLICY, policy) == 0); 1965 1966 spa_config_set(spa, nvconfig); 1967 spa_unload(spa); 1968 spa_deactivate(spa); 1969 spa_activate(spa, orig_mode); 1970 1971 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 1972 } 1973 1974 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 1975 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1976 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 1977 if (error != 0) 1978 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1979 1980 /* 1981 * Load the bit that tells us to use the new accounting function 1982 * (raid-z deflation). If we have an older pool, this will not 1983 * be present. 1984 */ 1985 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 1986 if (error != 0 && error != ENOENT) 1987 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1988 1989 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 1990 &spa->spa_creation_version); 1991 if (error != 0 && error != ENOENT) 1992 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1993 1994 /* 1995 * Load the persistent error log. If we have an older pool, this will 1996 * not be present. 1997 */ 1998 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 1999 if (error != 0 && error != ENOENT) 2000 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2001 2002 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2003 &spa->spa_errlog_scrub); 2004 if (error != 0 && error != ENOENT) 2005 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2006 2007 /* 2008 * Load the history object. If we have an older pool, this 2009 * will not be present. 
2010 */ 2011 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2012 if (error != 0 && error != ENOENT) 2013 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2014 2015 /* 2016 * If we're assembling the pool from the split-off vdevs of 2017 * an existing pool, we don't want to attach the spares & cache 2018 * devices. 2019 */ 2020 2021 /* 2022 * Load any hot spares for this pool. 2023 */ 2024 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2025 if (error != 0 && error != ENOENT) 2026 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2027 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2028 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2029 if (load_nvlist(spa, spa->spa_spares.sav_object, 2030 &spa->spa_spares.sav_config) != 0) 2031 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2032 2033 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2034 spa_load_spares(spa); 2035 spa_config_exit(spa, SCL_ALL, FTAG); 2036 } else if (error == 0) { 2037 spa->spa_spares.sav_sync = B_TRUE; 2038 } 2039 2040 /* 2041 * Load any level 2 ARC devices for this pool. 2042 */ 2043 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2044 &spa->spa_l2cache.sav_object); 2045 if (error != 0 && error != ENOENT) 2046 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2047 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2048 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2049 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2050 &spa->spa_l2cache.sav_config) != 0) 2051 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2052 2053 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2054 spa_load_l2cache(spa); 2055 spa_config_exit(spa, SCL_ALL, FTAG); 2056 } else if (error == 0) { 2057 spa->spa_l2cache.sav_sync = B_TRUE; 2058 } 2059 2060 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2061 2062 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2063 if (error && error != ENOENT) 2064 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2065 2066 if (error == 0) { 2067 uint64_t autoreplace; 2068 2069 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2070 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2071 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2072 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2073 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2074 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2075 &spa->spa_dedup_ditto); 2076 2077 spa->spa_autoreplace = (autoreplace != 0); 2078 } 2079 2080 /* 2081 * If the 'autoreplace' property is set, then post a resource notifying 2082 * the ZFS DE that it should not issue any faults for unopenable 2083 * devices. We also iterate over the vdevs, and post a sysevent for any 2084 * unopenable vdevs so that the normal autoreplace handler can take 2085 * over. 2086 */ 2087 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2088 spa_check_removed(spa->spa_root_vdev); 2089 /* 2090 * For the import case, this is done in spa_import(), because 2091 * at this point we're using the spare definitions from 2092 * the MOS config, not necessarily from the userland config. 2093 */ 2094 if (state != SPA_LOAD_IMPORT) { 2095 spa_aux_check_removed(&spa->spa_spares); 2096 spa_aux_check_removed(&spa->spa_l2cache); 2097 } 2098 } 2099 2100 /* 2101 * Load the vdev state for all toplevel vdevs. 2102 */ 2103 vdev_load(rvd); 2104 2105 /* 2106 * Propagate the leaf DTLs we just loaded all the way up the tree. 
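 * (A DTL is a vdev's dirty time log: the set of txgs for which it may be missing data. vdev_dtl_reassess() recomputes the aggregate DTLs from the leaves upward.)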
2107 */ 2108 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2109 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2110 spa_config_exit(spa, SCL_ALL, FTAG); 2111 2112 /* 2113 * Load the DDTs (dedup tables). 2114 */ 2115 error = ddt_load(spa); 2116 if (error != 0) 2117 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2118 2119 spa_update_dspace(spa); 2120 2121 /* 2122 * Validate the config, using the MOS config to fill in any 2123 * information which might be missing. If we fail to validate 2124 * the config then declare the pool unfit for use. If we're 2125 * assembling a pool from a split, the log is not transferred 2126 * over. 2127 */ 2128 if (type != SPA_IMPORT_ASSEMBLE) { 2129 nvlist_t *nvconfig; 2130 2131 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2132 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2133 2134 if (!spa_config_valid(spa, nvconfig)) { 2135 nvlist_free(nvconfig); 2136 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2137 ENXIO)); 2138 } 2139 nvlist_free(nvconfig); 2140 2141 /* 2142 * Now that we've validated the config, check the state of the 2143 * root vdev. If it can't be opened, it indicates one or 2144 * more toplevel vdevs are faulted. 2145 */ 2146 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2147 return (ENXIO); 2148 2149 if (spa_check_logs(spa)) { 2150 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2151 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2152 } 2153 } 2154 2155 /* 2156 * We've successfully opened the pool; verify that we're ready 2157 * to start pushing transactions. 2158 */ 2159 if (state != SPA_LOAD_TRYIMPORT) { 2160 if (error = spa_load_verify(spa)) 2161 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2162 error)); 2163 } 2164 2165 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2166 spa->spa_load_max_txg == UINT64_MAX)) { 2167 dmu_tx_t *tx; 2168 int need_update = B_FALSE; 2169 2170 ASSERT(state != SPA_LOAD_TRYIMPORT); 2171 2172 /* 2173 * Claim log blocks that haven't been committed yet. 2174 * This must all happen in a single txg. 2175 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2176 * invoked from zil_claim_log_block()'s i/o done callback. 2177 * Price of rollback is that we abandon the log. 2178 */ 2179 spa->spa_claiming = B_TRUE; 2180 2181 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2182 spa_first_txg(spa)); 2183 (void) dmu_objset_find(spa_name(spa), 2184 zil_claim, tx, DS_FIND_CHILDREN); 2185 dmu_tx_commit(tx); 2186 2187 spa->spa_claiming = B_FALSE; 2188 2189 spa_set_log_state(spa, SPA_LOG_GOOD); 2190 spa->spa_sync_on = B_TRUE; 2191 txg_sync_start(spa->spa_dsl_pool); 2192 2193 /* 2194 * Wait for all claims to sync. We sync up to the highest 2195 * claimed log block birth time so that claimed log blocks 2196 * don't appear to be from the future. spa_claim_max_txg 2197 * will have been set for us by either zil_check_log_chain() 2198 * (invoked from spa_check_logs()) or zil_claim() above. 2199 */ 2200 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2201 2202 /* 2203 * If the config cache is stale, or we have uninitialized 2204 * metaslabs (see spa_vdev_add()), then update the config. 2205 * 2206 * If this is a verbatim import, trust the current 2207 * in-core spa_config and update the disk labels.
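 * (A verbatim import is one requested with ZFS_IMPORT_VERBATIM, e.g. the root pool import performed by spa_import_rootpool().)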
2208 */ 2209 if (config_cache_txg != spa->spa_config_txg || 2210 state == SPA_LOAD_IMPORT || 2211 state == SPA_LOAD_RECOVER || 2212 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2213 need_update = B_TRUE; 2214 2215 for (int c = 0; c < rvd->vdev_children; c++) 2216 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2217 need_update = B_TRUE; 2218 2219 /* 2220 * Update the config cache asynchronously in case we're the 2221 * root pool, in which case the config cache isn't writable yet. 2222 */ 2223 if (need_update) 2224 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2225 2226 /* 2227 * Check all DTLs to see if anything needs resilvering. 2228 */ 2229 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2230 vdev_resilver_needed(rvd, NULL, NULL)) 2231 spa_async_request(spa, SPA_ASYNC_RESILVER); 2232 2233 /* 2234 * Delete any inconsistent datasets. 2235 */ 2236 (void) dmu_objset_find(spa_name(spa), 2237 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2238 2239 /* 2240 * Clean up any stale temporary dataset userrefs. 2241 */ 2242 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2243 } 2244 2245 return (0); 2246 } 2247 2248 static int 2249 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2250 { 2251 int mode = spa->spa_mode; 2252 2253 spa_unload(spa); 2254 spa_deactivate(spa); 2255 2256 spa->spa_load_max_txg--; 2257 2258 spa_activate(spa, mode); 2259 spa_async_suspend(spa); 2260 2261 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2262 } 2263 2264 static int 2265 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2266 uint64_t max_request, int rewind_flags) 2267 { 2268 nvlist_t *config = NULL; 2269 int load_error, rewind_error; 2270 uint64_t safe_rewind_txg; 2271 uint64_t min_txg; 2272 2273 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2274 spa->spa_load_max_txg = spa->spa_load_txg; 2275 spa_set_log_state(spa, SPA_LOG_CLEAR); 2276 } else { 2277 spa->spa_load_max_txg = max_request; 2278 } 2279 2280 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2281 mosconfig); 2282 if (load_error == 0) 2283 return (0); 2284 2285 if (spa->spa_root_vdev != NULL) 2286 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2287 2288 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2289 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2290 2291 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2292 nvlist_free(config); 2293 return (load_error); 2294 } 2295 2296 /* Price of rolling back is discarding txgs, including log */ 2297 if (state == SPA_LOAD_RECOVER) 2298 spa_set_log_state(spa, SPA_LOG_CLEAR); 2299 2300 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2301 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2302 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2303 TXG_INITIAL : safe_rewind_txg; 2304 2305 /* 2306 * Continue as long as we're finding errors, we're still within 2307 * the acceptable rewind range, and we're still finding uberblocks. 2308 */ 2309 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2310 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2311 if (spa->spa_load_max_txg < safe_rewind_txg) 2312 spa->spa_extreme_rewind = B_TRUE; 2313 rewind_error = spa_load_retry(spa, state, mosconfig); 2314 } 2315 2316 spa->spa_extreme_rewind = B_FALSE; 2317 spa->spa_load_max_txg = UINT64_MAX; 2318 2319 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2320 spa_config_set(spa, config); 2321 2322 return (state == SPA_LOAD_RECOVER ?
rewind_error : load_error); 2323 } 2324 2325 /* 2326 * Pool Open/Import 2327 * 2328 * The import case is identical to an open except that the configuration is sent 2329 * down from userland, instead of grabbed from the configuration cache. For the 2330 * case of an open, the pool configuration will exist in the 2331 * POOL_STATE_UNINITIALIZED state. 2332 * 2333 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2334 * the same time we open the pool, without having to keep around the spa_t in some 2335 * ambiguous state. 2336 */ 2337 static int 2338 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2339 nvlist_t **config) 2340 { 2341 spa_t *spa; 2342 spa_load_state_t state = SPA_LOAD_OPEN; 2343 int error; 2344 int locked = B_FALSE; 2345 2346 *spapp = NULL; 2347 2348 /* 2349 * As disgusting as this is, we need to support recursive calls to this 2350 * function because dsl_dir_open() is called during spa_load(), and ends 2351 * up calling spa_open() again. The real fix is to figure out how to 2352 * avoid dsl_dir_open() calling this in the first place. 2353 */ 2354 if (mutex_owner(&spa_namespace_lock) != curthread) { 2355 mutex_enter(&spa_namespace_lock); 2356 locked = B_TRUE; 2357 } 2358 2359 if ((spa = spa_lookup(pool)) == NULL) { 2360 if (locked) 2361 mutex_exit(&spa_namespace_lock); 2362 return (ENOENT); 2363 } 2364 2365 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2366 zpool_rewind_policy_t policy; 2367 2368 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2369 &policy); 2370 if (policy.zrp_request & ZPOOL_DO_REWIND) 2371 state = SPA_LOAD_RECOVER; 2372 2373 spa_activate(spa, spa_mode_global); 2374 2375 if (state != SPA_LOAD_RECOVER) 2376 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2377 2378 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2379 policy.zrp_request); 2380 2381 if (error == EBADF) { 2382 /* 2383 * If vdev_validate() returns failure (indicated by 2384 * EBADF), it means that one of the vdevs indicates 2385 * that the pool has been exported or destroyed. If 2386 * this is the case, the config cache is out of sync and 2387 * we should remove the pool from the namespace. 2388 */ 2389 spa_unload(spa); 2390 spa_deactivate(spa); 2391 spa_config_sync(spa, B_TRUE, B_TRUE); 2392 spa_remove(spa); 2393 if (locked) 2394 mutex_exit(&spa_namespace_lock); 2395 return (ENOENT); 2396 } 2397 2398 if (error) { 2399 /* 2400 * We can't open the pool, but we still have useful 2401 * information: the state of each vdev after the 2402 * attempted vdev_open(). Return this to the user. 2403 */ 2404 if (config != NULL && spa->spa_config) { 2405 VERIFY(nvlist_dup(spa->spa_config, config, 2406 KM_SLEEP) == 0); 2407 VERIFY(nvlist_add_nvlist(*config, 2408 ZPOOL_CONFIG_LOAD_INFO, 2409 spa->spa_load_info) == 0); 2410 } 2411 spa_unload(spa); 2412 spa_deactivate(spa); 2413 spa->spa_last_open_failed = error; 2414 if (locked) 2415 mutex_exit(&spa_namespace_lock); 2416 *spapp = NULL; 2417 return (error); 2418 } 2419 } 2420 2421 spa_open_ref(spa, tag); 2422 2423 if (config != NULL) 2424 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2425 2426 /* 2427 * If we've recovered the pool, pass back any information we 2428 * gathered while doing the load.
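 * (The details accumulated in spa->spa_load_info during spa_load() are returned to userland under ZPOOL_CONFIG_LOAD_INFO, e.g. rewind and missing-device information.)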
2429 */ 2430 if (state == SPA_LOAD_RECOVER) { 2431 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2432 spa->spa_load_info) == 0); 2433 } 2434 2435 if (locked) { 2436 spa->spa_last_open_failed = 0; 2437 spa->spa_last_ubsync_txg = 0; 2438 spa->spa_load_txg = 0; 2439 mutex_exit(&spa_namespace_lock); 2440 } 2441 2442 *spapp = spa; 2443 2444 return (0); 2445 } 2446 2447 int 2448 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2449 nvlist_t **config) 2450 { 2451 return (spa_open_common(name, spapp, tag, policy, config)); 2452 } 2453 2454 int 2455 spa_open(const char *name, spa_t **spapp, void *tag) 2456 { 2457 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2458 } 2459 2460 /* 2461 * Lookup the given spa_t, incrementing the inject count in the process, 2462 * preventing it from being exported or destroyed. 2463 */ 2464 spa_t * 2465 spa_inject_addref(char *name) 2466 { 2467 spa_t *spa; 2468 2469 mutex_enter(&spa_namespace_lock); 2470 if ((spa = spa_lookup(name)) == NULL) { 2471 mutex_exit(&spa_namespace_lock); 2472 return (NULL); 2473 } 2474 spa->spa_inject_ref++; 2475 mutex_exit(&spa_namespace_lock); 2476 2477 return (spa); 2478 } 2479 2480 void 2481 spa_inject_delref(spa_t *spa) 2482 { 2483 mutex_enter(&spa_namespace_lock); 2484 spa->spa_inject_ref--; 2485 mutex_exit(&spa_namespace_lock); 2486 } 2487 2488 /* 2489 * Add spares device information to the nvlist. 2490 */ 2491 static void 2492 spa_add_spares(spa_t *spa, nvlist_t *config) 2493 { 2494 nvlist_t **spares; 2495 uint_t i, nspares; 2496 nvlist_t *nvroot; 2497 uint64_t guid; 2498 vdev_stat_t *vs; 2499 uint_t vsc; 2500 uint64_t pool; 2501 2502 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2503 2504 if (spa->spa_spares.sav_count == 0) 2505 return; 2506 2507 VERIFY(nvlist_lookup_nvlist(config, 2508 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2509 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2510 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2511 if (nspares != 0) { 2512 VERIFY(nvlist_add_nvlist_array(nvroot, 2513 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2514 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2515 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2516 2517 /* 2518 * Go through and find any spares which have since been 2519 * repurposed as an active spare. If this is the case, update 2520 * their status appropriately. 2521 */ 2522 for (i = 0; i < nspares; i++) { 2523 VERIFY(nvlist_lookup_uint64(spares[i], 2524 ZPOOL_CONFIG_GUID, &guid) == 0); 2525 if (spa_spare_exists(guid, &pool, NULL) && 2526 pool != 0ULL) { 2527 VERIFY(nvlist_lookup_uint64_array( 2528 spares[i], ZPOOL_CONFIG_VDEV_STATS, 2529 (uint64_t **)&vs, &vsc) == 0); 2530 vs->vs_state = VDEV_STATE_CANT_OPEN; 2531 vs->vs_aux = VDEV_AUX_SPARED; 2532 } 2533 } 2534 } 2535 } 2536 2537 /* 2538 * Add l2cache device information to the nvlist, including vdev stats. 
2539 */ 2540 static void 2541 spa_add_l2cache(spa_t *spa, nvlist_t *config) 2542 { 2543 nvlist_t **l2cache; 2544 uint_t i, j, nl2cache; 2545 nvlist_t *nvroot; 2546 uint64_t guid; 2547 vdev_t *vd; 2548 vdev_stat_t *vs; 2549 uint_t vsc; 2550 2551 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2552 2553 if (spa->spa_l2cache.sav_count == 0) 2554 return; 2555 2556 VERIFY(nvlist_lookup_nvlist(config, 2557 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2558 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2559 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2560 if (nl2cache != 0) { 2561 VERIFY(nvlist_add_nvlist_array(nvroot, 2562 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2563 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2564 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2565 2566 /* 2567 * Update level 2 cache device stats. 2568 */ 2569 2570 for (i = 0; i < nl2cache; i++) { 2571 VERIFY(nvlist_lookup_uint64(l2cache[i], 2572 ZPOOL_CONFIG_GUID, &guid) == 0); 2573 2574 vd = NULL; 2575 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2576 if (guid == 2577 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2578 vd = spa->spa_l2cache.sav_vdevs[j]; 2579 break; 2580 } 2581 } 2582 ASSERT(vd != NULL); 2583 2584 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2585 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 2586 == 0); 2587 vdev_get_stats(vd, vs); 2588 } 2589 } 2590 } 2591 2592 int 2593 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2594 { 2595 int error; 2596 spa_t *spa; 2597 2598 *config = NULL; 2599 error = spa_open_common(name, &spa, FTAG, NULL, config); 2600 2601 if (spa != NULL) { 2602 /* 2603 * This still leaves a window of inconsistency where the spares 2604 * or l2cache devices could change and the config would be 2605 * self-inconsistent. 2606 */ 2607 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2608 2609 if (*config != NULL) { 2610 uint64_t loadtimes[2]; 2611 2612 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 2613 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 2614 VERIFY(nvlist_add_uint64_array(*config, 2615 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 2616 2617 VERIFY(nvlist_add_uint64(*config, 2618 ZPOOL_CONFIG_ERRCOUNT, 2619 spa_get_errlog_size(spa)) == 0); 2620 2621 if (spa_suspended(spa)) 2622 VERIFY(nvlist_add_uint64(*config, 2623 ZPOOL_CONFIG_SUSPENDED, 2624 spa->spa_failmode) == 0); 2625 2626 spa_add_spares(spa, *config); 2627 spa_add_l2cache(spa, *config); 2628 } 2629 } 2630 2631 /* 2632 * We want to get the alternate root even for faulted pools, so we cheat 2633 * and call spa_lookup() directly. 2634 */ 2635 if (altroot) { 2636 if (spa == NULL) { 2637 mutex_enter(&spa_namespace_lock); 2638 spa = spa_lookup(name); 2639 if (spa) 2640 spa_altroot(spa, altroot, buflen); 2641 else 2642 altroot[0] = '\0'; 2643 spa = NULL; 2644 mutex_exit(&spa_namespace_lock); 2645 } else { 2646 spa_altroot(spa, altroot, buflen); 2647 } 2648 } 2649 2650 if (spa != NULL) { 2651 spa_config_exit(spa, SCL_CONFIG, FTAG); 2652 spa_close(spa, FTAG); 2653 } 2654 2655 return (error); 2656 } 2657 2658 /* 2659 * Validate that the auxiliary device array is well formed. We must have an 2660 * array of nvlists, each which describes a valid leaf vdev. If this is an 2661 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2662 * specified, as long as they are well-formed. 
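 * ("Corrupted" refers to the on-disk contents of the device; the nvlist describing it must still parse into a valid leaf vdev.)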
2663 */ 2664 static int 2665 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2666 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2667 vdev_labeltype_t label) 2668 { 2669 nvlist_t **dev; 2670 uint_t i, ndev; 2671 vdev_t *vd; 2672 int error; 2673 2674 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2675 2676 /* 2677 * It's acceptable to have no devs specified. 2678 */ 2679 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2680 return (0); 2681 2682 if (ndev == 0) 2683 return (EINVAL); 2684 2685 /* 2686 * Make sure the pool is formatted with a version that supports this 2687 * device type. 2688 */ 2689 if (spa_version(spa) < version) 2690 return (ENOTSUP); 2691 2692 /* 2693 * Set the pending device list so we correctly handle device in-use 2694 * checking. 2695 */ 2696 sav->sav_pending = dev; 2697 sav->sav_npending = ndev; 2698 2699 for (i = 0; i < ndev; i++) { 2700 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2701 mode)) != 0) 2702 goto out; 2703 2704 if (!vd->vdev_ops->vdev_op_leaf) { 2705 vdev_free(vd); 2706 error = EINVAL; 2707 goto out; 2708 } 2709 2710 /* 2711 * The L2ARC currently only supports disk devices in 2712 * kernel context. For user-level testing, we allow it. 2713 */ 2714 #ifdef _KERNEL 2715 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2716 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2717 error = ENOTBLK; 2718 goto out; 2719 } 2720 #endif 2721 vd->vdev_top = vd; 2722 2723 if ((error = vdev_open(vd)) == 0 && 2724 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2725 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2726 vd->vdev_guid) == 0); 2727 } 2728 2729 vdev_free(vd); 2730 2731 if (error && 2732 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2733 goto out; 2734 else 2735 error = 0; 2736 } 2737 2738 out: 2739 sav->sav_pending = NULL; 2740 sav->sav_npending = 0; 2741 return (error); 2742 } 2743 2744 static int 2745 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2746 { 2747 int error; 2748 2749 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2750 2751 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2752 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2753 VDEV_LABEL_SPARE)) != 0) { 2754 return (error); 2755 } 2756 2757 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2758 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2759 VDEV_LABEL_L2CACHE)); 2760 } 2761 2762 static void 2763 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2764 const char *config) 2765 { 2766 int i; 2767 2768 if (sav->sav_config != NULL) { 2769 nvlist_t **olddevs; 2770 uint_t oldndevs; 2771 nvlist_t **newdevs; 2772 2773 /* 2774 * Generate new dev list by concatenating with the 2775 * current dev list.
2776 */ 2777 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2778 &olddevs, &oldndevs) == 0); 2779 2780 newdevs = kmem_alloc(sizeof (void *) * 2781 (ndevs + oldndevs), KM_SLEEP); 2782 for (i = 0; i < oldndevs; i++) 2783 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2784 KM_SLEEP) == 0); 2785 for (i = 0; i < ndevs; i++) 2786 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2787 KM_SLEEP) == 0); 2788 2789 VERIFY(nvlist_remove(sav->sav_config, config, 2790 DATA_TYPE_NVLIST_ARRAY) == 0); 2791 2792 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2793 config, newdevs, ndevs + oldndevs) == 0); 2794 for (i = 0; i < oldndevs + ndevs; i++) 2795 nvlist_free(newdevs[i]); 2796 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2797 } else { 2798 /* 2799 * Generate a new dev list. 2800 */ 2801 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2802 KM_SLEEP) == 0); 2803 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2804 devs, ndevs) == 0); 2805 } 2806 } 2807 2808 /* 2809 * Stop and drop level 2 ARC devices 2810 */ 2811 void 2812 spa_l2cache_drop(spa_t *spa) 2813 { 2814 vdev_t *vd; 2815 int i; 2816 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2817 2818 for (i = 0; i < sav->sav_count; i++) { 2819 uint64_t pool; 2820 2821 vd = sav->sav_vdevs[i]; 2822 ASSERT(vd != NULL); 2823 2824 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2825 pool != 0ULL && l2arc_vdev_present(vd)) 2826 l2arc_remove_vdev(vd); 2827 if (vd->vdev_isl2cache) 2828 spa_l2cache_remove(vd); 2829 vdev_clear_stats(vd); 2830 (void) vdev_close(vd); 2831 } 2832 } 2833 2834 /* 2835 * Pool Creation 2836 */ 2837 int 2838 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2839 const char *history_str, nvlist_t *zplprops) 2840 { 2841 spa_t *spa; 2842 char *altroot = NULL; 2843 vdev_t *rvd; 2844 dsl_pool_t *dp; 2845 dmu_tx_t *tx; 2846 int error = 0; 2847 uint64_t txg = TXG_INITIAL; 2848 nvlist_t **spares, **l2cache; 2849 uint_t nspares, nl2cache; 2850 uint64_t version, obj; 2851 2852 /* 2853 * If this pool already exists, return failure. 2854 */ 2855 mutex_enter(&spa_namespace_lock); 2856 if (spa_lookup(pool) != NULL) { 2857 mutex_exit(&spa_namespace_lock); 2858 return (EEXIST); 2859 } 2860 2861 /* 2862 * Allocate a new spa_t structure. 2863 */ 2864 (void) nvlist_lookup_string(props, 2865 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2866 spa = spa_add(pool, NULL, altroot); 2867 spa_activate(spa, spa_mode_global); 2868 2869 if (props && (error = spa_prop_validate(spa, props))) { 2870 spa_deactivate(spa); 2871 spa_remove(spa); 2872 mutex_exit(&spa_namespace_lock); 2873 return (error); 2874 } 2875 2876 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2877 &version) != 0) 2878 version = SPA_VERSION; 2879 ASSERT(version <= SPA_VERSION); 2880 2881 spa->spa_first_txg = txg; 2882 spa->spa_uberblock.ub_txg = txg - 1; 2883 spa->spa_uberblock.ub_version = version; 2884 spa->spa_ubsync = spa->spa_uberblock; 2885 2886 /* 2887 * Create "The Godfather" zio to hold all async IOs 2888 */ 2889 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2890 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2891 2892 /* 2893 * Create the root vdev. 
2894 */ 2895 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2896 2897 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2898 2899 ASSERT(error != 0 || rvd != NULL); 2900 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2901 2902 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2903 error = EINVAL; 2904 2905 if (error == 0 && 2906 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2907 (error = spa_validate_aux(spa, nvroot, txg, 2908 VDEV_ALLOC_ADD)) == 0) { 2909 for (int c = 0; c < rvd->vdev_children; c++) { 2910 vdev_metaslab_set_size(rvd->vdev_child[c]); 2911 vdev_expand(rvd->vdev_child[c], txg); 2912 } 2913 } 2914 2915 spa_config_exit(spa, SCL_ALL, FTAG); 2916 2917 if (error != 0) { 2918 spa_unload(spa); 2919 spa_deactivate(spa); 2920 spa_remove(spa); 2921 mutex_exit(&spa_namespace_lock); 2922 return (error); 2923 } 2924 2925 /* 2926 * Get the list of spares, if specified. 2927 */ 2928 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 2929 &spares, &nspares) == 0) { 2930 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 2931 KM_SLEEP) == 0); 2932 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 2933 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2934 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2935 spa_load_spares(spa); 2936 spa_config_exit(spa, SCL_ALL, FTAG); 2937 spa->spa_spares.sav_sync = B_TRUE; 2938 } 2939 2940 /* 2941 * Get the list of level 2 cache devices, if specified. 2942 */ 2943 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 2944 &l2cache, &nl2cache) == 0) { 2945 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 2946 NV_UNIQUE_NAME, KM_SLEEP) == 0); 2947 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 2948 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2949 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2950 spa_load_l2cache(spa); 2951 spa_config_exit(spa, SCL_ALL, FTAG); 2952 spa->spa_l2cache.sav_sync = B_TRUE; 2953 } 2954 2955 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 2956 spa->spa_meta_objset = dp->dp_meta_objset; 2957 2958 /* 2959 * Create DDTs (dedup tables). 2960 */ 2961 ddt_create(spa); 2962 2963 spa_update_dspace(spa); 2964 2965 tx = dmu_tx_create_assigned(dp, txg); 2966 2967 /* 2968 * Create the pool config object. 2969 */ 2970 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 2971 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 2972 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 2973 2974 if (zap_add(spa->spa_meta_objset, 2975 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 2976 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 2977 cmn_err(CE_PANIC, "failed to add pool config"); 2978 } 2979 2980 if (zap_add(spa->spa_meta_objset, 2981 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 2982 sizeof (uint64_t), 1, &version, tx) != 0) { 2983 cmn_err(CE_PANIC, "failed to add pool version"); 2984 } 2985 2986 /* Newly created pools with the right version are always deflated. */ 2987 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 2988 spa->spa_deflate = TRUE; 2989 if (zap_add(spa->spa_meta_objset, 2990 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 2991 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 2992 cmn_err(CE_PANIC, "failed to add deflate"); 2993 } 2994 } 2995 2996 /* 2997 * Create the deferred-free bpobj. Turn off compression 2998 * because sync-to-convergence takes longer if the blocksize 2999 * keeps changing. 
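 * (The bpobj is allocated below with a fixed 16K block size, i.e. bpobj_alloc(..., 1 << 14, tx).)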
3000 */ 3001 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3002 dmu_object_set_compress(spa->spa_meta_objset, obj, 3003 ZIO_COMPRESS_OFF, tx); 3004 if (zap_add(spa->spa_meta_objset, 3005 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3006 sizeof (uint64_t), 1, &obj, tx) != 0) { 3007 cmn_err(CE_PANIC, "failed to add bpobj"); 3008 } 3009 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3010 spa->spa_meta_objset, obj)); 3011 3012 /* 3013 * Create the pool's history object. 3014 */ 3015 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3016 spa_history_create_obj(spa, tx); 3017 3018 /* 3019 * Set pool properties. 3020 */ 3021 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3022 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3023 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3024 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3025 3026 if (props != NULL) { 3027 spa_configfile_set(spa, props, B_FALSE); 3028 spa_sync_props(spa, props, tx); 3029 } 3030 3031 dmu_tx_commit(tx); 3032 3033 spa->spa_sync_on = B_TRUE; 3034 txg_sync_start(spa->spa_dsl_pool); 3035 3036 /* 3037 * We explicitly wait for the first transaction to complete so that our 3038 * bean counters are appropriately updated. 3039 */ 3040 txg_wait_synced(spa->spa_dsl_pool, txg); 3041 3042 spa_config_sync(spa, B_FALSE, B_TRUE); 3043 3044 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 3045 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 3046 spa_history_log_version(spa, LOG_POOL_CREATE); 3047 3048 spa->spa_minref = refcount_count(&spa->spa_refcount); 3049 3050 mutex_exit(&spa_namespace_lock); 3051 3052 return (0); 3053 } 3054 3055 #ifdef _KERNEL 3056 /* 3057 * Get the root pool information from the root disk, then import the root pool 3058 * during the system boot up time. 3059 */ 3060 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3061 3062 static nvlist_t * 3063 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3064 { 3065 nvlist_t *config; 3066 nvlist_t *nvtop, *nvroot; 3067 uint64_t pgid; 3068 3069 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3070 return (NULL); 3071 3072 /* 3073 * Add this top-level vdev to the child array. 3074 */ 3075 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3076 &nvtop) == 0); 3077 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3078 &pgid) == 0); 3079 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3080 3081 /* 3082 * Put this pool's top-level vdevs into a root vdev. 3083 */ 3084 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3085 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3086 VDEV_TYPE_ROOT) == 0); 3087 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3088 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3089 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3090 &nvtop, 1) == 0); 3091 3092 /* 3093 * Replace the existing vdev_tree with the new root vdev in 3094 * this pool's configuration (remove the old, add the new). 3095 */ 3096 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3097 nvlist_free(nvroot); 3098 return (config); 3099 } 3100 3101 /* 3102 * Walk the vdev tree and see if we can find a device with "better" 3103 * configuration. A configuration is "better" if the label on that 3104 * device has a more recent txg. 
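 * (The walk below recurses depth-first through vdev_child[] and only considers leaf vdevs, since only leaves have on-disk labels to read.)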
3105 */ 3106 static void 3107 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3108 { 3109 for (int c = 0; c < vd->vdev_children; c++) 3110 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3111 3112 if (vd->vdev_ops->vdev_op_leaf) { 3113 nvlist_t *label; 3114 uint64_t label_txg; 3115 3116 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3117 &label) != 0) 3118 return; 3119 3120 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3121 &label_txg) == 0); 3122 3123 /* 3124 * Do we have a better boot device? 3125 */ 3126 if (label_txg > *txg) { 3127 *txg = label_txg; 3128 *avd = vd; 3129 } 3130 nvlist_free(label); 3131 } 3132 } 3133 3134 /* 3135 * Import a root pool. 3136 * 3137 * For x86, devpath_list will consist of the devid and/or physpath name of 3138 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3139 * The GRUB "findroot" command will return the vdev we should boot. 3140 * 3141 * For Sparc, devpath_list consists of the physpath name of the booting device, 3142 * whether the rootpool is a single device pool or a mirrored pool. 3143 * e.g. 3144 * "/pci@1f,0/ide@d/disk@0,0:a" 3145 */ 3146 int 3147 spa_import_rootpool(char *devpath, char *devid) 3148 { 3149 spa_t *spa; 3150 vdev_t *rvd, *bvd, *avd = NULL; 3151 nvlist_t *config, *nvtop; 3152 uint64_t guid, txg; 3153 char *pname; 3154 int error; 3155 3156 /* 3157 * Read the label from the boot device and generate a configuration. 3158 */ 3159 config = spa_generate_rootconf(devpath, devid, &guid); 3160 #if defined(_OBP) && defined(_KERNEL) 3161 if (config == NULL) { 3162 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3163 /* iscsi boot */ 3164 get_iscsi_bootpath_phy(devpath); 3165 config = spa_generate_rootconf(devpath, devid, &guid); 3166 } 3167 } 3168 #endif 3169 if (config == NULL) { 3170 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 3171 devpath); 3172 return (EIO); 3173 } 3174 3175 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3176 &pname) == 0); 3177 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3178 3179 mutex_enter(&spa_namespace_lock); 3180 if ((spa = spa_lookup(pname)) != NULL) { 3181 /* 3182 * Remove the existing root pool from the namespace so that we 3183 * can replace it with the correct config we just read in. 3184 */ 3185 spa_remove(spa); 3186 } 3187 3188 spa = spa_add(pname, config, NULL); 3189 spa->spa_is_root = B_TRUE; 3190 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3191 3192 /* 3193 * Build up a vdev tree based on the boot device's label config. 3194 */ 3195 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3196 &nvtop) == 0); 3197 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3198 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3199 VDEV_ALLOC_ROOTPOOL); 3200 spa_config_exit(spa, SCL_ALL, FTAG); 3201 if (error) { 3202 mutex_exit(&spa_namespace_lock); 3203 nvlist_free(config); 3204 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3205 pname); 3206 return (error); 3207 } 3208 3209 /* 3210 * Get the boot vdev. 3211 */ 3212 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3213 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3214 (u_longlong_t)guid); 3215 error = ENOENT; 3216 goto out; 3217 } 3218 3219 /* 3220 * Determine if there is a better boot device. 3221 */ 3222 avd = bvd; 3223 spa_alt_rootvdev(rvd, &avd, &txg); 3224 if (avd != bvd) { 3225 cmn_err(CE_NOTE, "The boot device is 'degraded'.
Please " 3226 "try booting from '%s'", avd->vdev_path); 3227 error = EINVAL; 3228 goto out; 3229 } 3230 3231 /* 3232 * If the boot device is part of a spare vdev then ensure that 3233 * we're booting off the active spare. 3234 */ 3235 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3236 !bvd->vdev_isspare) { 3237 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3238 "try booting from '%s'", 3239 bvd->vdev_parent-> 3240 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3241 error = EINVAL; 3242 goto out; 3243 } 3244 3245 error = 0; 3246 spa_history_log_version(spa, LOG_POOL_IMPORT); 3247 out: 3248 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3249 vdev_free(rvd); 3250 spa_config_exit(spa, SCL_ALL, FTAG); 3251 mutex_exit(&spa_namespace_lock); 3252 3253 nvlist_free(config); 3254 return (error); 3255 } 3256 3257 #endif 3258 3259 /* 3260 * Import a non-root pool into the system. 3261 */ 3262 int 3263 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3264 { 3265 spa_t *spa; 3266 char *altroot = NULL; 3267 spa_load_state_t state = SPA_LOAD_IMPORT; 3268 zpool_rewind_policy_t policy; 3269 uint64_t mode = spa_mode_global; 3270 uint64_t readonly = B_FALSE; 3271 int error; 3272 nvlist_t *nvroot; 3273 nvlist_t **spares, **l2cache; 3274 uint_t nspares, nl2cache; 3275 3276 /* 3277 * If a pool with this name exists, return failure. 3278 */ 3279 mutex_enter(&spa_namespace_lock); 3280 if (spa_lookup(pool) != NULL) { 3281 mutex_exit(&spa_namespace_lock); 3282 return (EEXIST); 3283 } 3284 3285 /* 3286 * Create and initialize the spa structure. 3287 */ 3288 (void) nvlist_lookup_string(props, 3289 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3290 (void) nvlist_lookup_uint64(props, 3291 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3292 if (readonly) 3293 mode = FREAD; 3294 spa = spa_add(pool, config, altroot); 3295 spa->spa_import_flags = flags; 3296 3297 /* 3298 * Verbatim import - Take a pool and insert it into the namespace 3299 * as if it had been loaded at boot. 3300 */ 3301 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3302 if (props != NULL) 3303 spa_configfile_set(spa, props, B_FALSE); 3304 3305 spa_config_sync(spa, B_FALSE, B_TRUE); 3306 3307 mutex_exit(&spa_namespace_lock); 3308 spa_history_log_version(spa, LOG_POOL_IMPORT); 3309 3310 return (0); 3311 } 3312 3313 spa_activate(spa, mode); 3314 3315 /* 3316 * Don't start async tasks until we know everything is healthy. 3317 */ 3318 spa_async_suspend(spa); 3319 3320 zpool_get_rewind_policy(config, &policy); 3321 if (policy.zrp_request & ZPOOL_DO_REWIND) 3322 state = SPA_LOAD_RECOVER; 3323 3324 /* 3325 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3326 * because the user-supplied config is actually the one to trust when 3327 * doing an import. 3328 */ 3329 if (state != SPA_LOAD_RECOVER) 3330 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3331 3332 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3333 policy.zrp_request); 3334 3335 /* 3336 * Propagate anything learned while loading the pool and pass it 3337 * back to caller (i.e. rewind info, missing devices, etc). 3338 */ 3339 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3340 spa->spa_load_info) == 0); 3341 3342 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3343 /* 3344 * Toss any existing sparelist, as it doesn't have any validity 3345 * anymore, and conflicts with spa_has_spare(). 
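 * (Both lists are rebuilt below from the user-supplied nvroot, which is the authoritative source for spares and l2cache devices on import.)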
3346 */ 3347 if (spa->spa_spares.sav_config) { 3348 nvlist_free(spa->spa_spares.sav_config); 3349 spa->spa_spares.sav_config = NULL; 3350 spa_load_spares(spa); 3351 } 3352 if (spa->spa_l2cache.sav_config) { 3353 nvlist_free(spa->spa_l2cache.sav_config); 3354 spa->spa_l2cache.sav_config = NULL; 3355 spa_load_l2cache(spa); 3356 } 3357 3358 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3359 &nvroot) == 0); 3360 if (error == 0) 3361 error = spa_validate_aux(spa, nvroot, -1ULL, 3362 VDEV_ALLOC_SPARE); 3363 if (error == 0) 3364 error = spa_validate_aux(spa, nvroot, -1ULL, 3365 VDEV_ALLOC_L2CACHE); 3366 spa_config_exit(spa, SCL_ALL, FTAG); 3367 3368 if (props != NULL) 3369 spa_configfile_set(spa, props, B_FALSE); 3370 3371 if (error != 0 || (props && spa_writeable(spa) && 3372 (error = spa_prop_set(spa, props)))) { 3373 spa_unload(spa); 3374 spa_deactivate(spa); 3375 spa_remove(spa); 3376 mutex_exit(&spa_namespace_lock); 3377 return (error); 3378 } 3379 3380 spa_async_resume(spa); 3381 3382 /* 3383 * Override any spares and level 2 cache devices as specified by 3384 * the user, as these may have correct device names/devids, etc. 3385 */ 3386 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3387 &spares, &nspares) == 0) { 3388 if (spa->spa_spares.sav_config) 3389 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3390 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3391 else 3392 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3393 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3394 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3395 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3396 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3397 spa_load_spares(spa); 3398 spa_config_exit(spa, SCL_ALL, FTAG); 3399 spa->spa_spares.sav_sync = B_TRUE; 3400 } 3401 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3402 &l2cache, &nl2cache) == 0) { 3403 if (spa->spa_l2cache.sav_config) 3404 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3405 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3406 else 3407 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3408 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3409 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3410 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3411 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3412 spa_load_l2cache(spa); 3413 spa_config_exit(spa, SCL_ALL, FTAG); 3414 spa->spa_l2cache.sav_sync = B_TRUE; 3415 } 3416 3417 /* 3418 * Check for any removed devices. 3419 */ 3420 if (spa->spa_autoreplace) { 3421 spa_aux_check_removed(&spa->spa_spares); 3422 spa_aux_check_removed(&spa->spa_l2cache); 3423 } 3424 3425 if (spa_writeable(spa)) { 3426 /* 3427 * Update the config cache to include the newly-imported pool. 3428 */ 3429 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3430 } 3431 3432 /* 3433 * It's possible that the pool was expanded while it was exported. 3434 * We kick off an async task to handle this for us. 
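 * (The task requested below is SPA_ASYNC_AUTOEXPAND, handled later by the spa async thread.)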
3435 */ 3436 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3437 3438 mutex_exit(&spa_namespace_lock); 3439 spa_history_log_version(spa, LOG_POOL_IMPORT); 3440 3441 return (0); 3442 } 3443 3444 nvlist_t * 3445 spa_tryimport(nvlist_t *tryconfig) 3446 { 3447 nvlist_t *config = NULL; 3448 char *poolname; 3449 spa_t *spa; 3450 uint64_t state; 3451 int error; 3452 3453 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3454 return (NULL); 3455 3456 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3457 return (NULL); 3458 3459 /* 3460 * Create and initialize the spa structure. 3461 */ 3462 mutex_enter(&spa_namespace_lock); 3463 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3464 spa_activate(spa, FREAD); 3465 3466 /* 3467 * Pass off the heavy lifting to spa_load(). 3468 * Pass TRUE for mosconfig because the user-supplied config 3469 * is actually the one to trust when doing an import. 3470 */ 3471 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3472 3473 /* 3474 * If 'tryconfig' was at least parsable, return the current config. 3475 */ 3476 if (spa->spa_root_vdev != NULL) { 3477 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3478 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3479 poolname) == 0); 3480 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3481 state) == 0); 3482 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3483 spa->spa_uberblock.ub_timestamp) == 0); 3484 3485 /* 3486 * If the bootfs property exists on this pool then we 3487 * copy it out so that external consumers can tell which 3488 * pools are bootable. 3489 */ 3490 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3491 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3492 3493 /* 3494 * We have to play games with the name since the 3495 * pool was opened as TRYIMPORT_NAME. 3496 */ 3497 if (dsl_dsobj_to_dsname(spa_name(spa), 3498 spa->spa_bootfs, tmpname) == 0) { 3499 char *cp; 3500 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3501 3502 cp = strchr(tmpname, '/'); 3503 if (cp == NULL) { 3504 (void) strlcpy(dsname, tmpname, 3505 MAXPATHLEN); 3506 } else { 3507 (void) snprintf(dsname, MAXPATHLEN, 3508 "%s/%s", poolname, ++cp); 3509 } 3510 VERIFY(nvlist_add_string(config, 3511 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3512 kmem_free(dsname, MAXPATHLEN); 3513 } 3514 kmem_free(tmpname, MAXPATHLEN); 3515 } 3516 3517 /* 3518 * Add the list of hot spares and level 2 cache devices. 3519 */ 3520 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3521 spa_add_spares(spa, config); 3522 spa_add_l2cache(spa, config); 3523 spa_config_exit(spa, SCL_CONFIG, FTAG); 3524 } 3525 3526 spa_unload(spa); 3527 spa_deactivate(spa); 3528 spa_remove(spa); 3529 mutex_exit(&spa_namespace_lock); 3530 3531 return (config); 3532 } 3533 3534 /* 3535 * Pool export/destroy 3536 * 3537 * The act of destroying or exporting a pool is very simple. We make sure there 3538 * is no more pending I/O and any references to the pool are gone. Then, we 3539 * update the pool state and sync all the labels to disk, removing the 3540 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3541 * we don't sync the labels or remove the configuration cache. 
3542 */ 3543 static int 3544 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3545 boolean_t force, boolean_t hardforce) 3546 { 3547 spa_t *spa; 3548 3549 if (oldconfig) 3550 *oldconfig = NULL; 3551 3552 if (!(spa_mode_global & FWRITE)) 3553 return (EROFS); 3554 3555 mutex_enter(&spa_namespace_lock); 3556 if ((spa = spa_lookup(pool)) == NULL) { 3557 mutex_exit(&spa_namespace_lock); 3558 return (ENOENT); 3559 } 3560 3561 /* 3562 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3563 * reacquire the namespace lock, and see if we can export. 3564 */ 3565 spa_open_ref(spa, FTAG); 3566 mutex_exit(&spa_namespace_lock); 3567 spa_async_suspend(spa); 3568 mutex_enter(&spa_namespace_lock); 3569 spa_close(spa, FTAG); 3570 3571 /* 3572 * The pool will be in core if it's openable, 3573 * in which case we can modify its state. 3574 */ 3575 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3576 /* 3577 * Objsets may be open only because they're dirty, so we 3578 * have to force the pool to sync before checking spa_refcnt. 3579 */ 3580 txg_wait_synced(spa->spa_dsl_pool, 0); 3581 3582 /* 3583 * A pool cannot be exported or destroyed if there are active 3584 * references. If we are resetting a pool, allow references by 3585 * fault injection handlers. 3586 */ 3587 if (!spa_refcount_zero(spa) || 3588 (spa->spa_inject_ref != 0 && 3589 new_state != POOL_STATE_UNINITIALIZED)) { 3590 spa_async_resume(spa); 3591 mutex_exit(&spa_namespace_lock); 3592 return (EBUSY); 3593 } 3594 3595 /* 3596 * A pool cannot be exported if it has an active shared spare. 3597 * This is to prevent other pools from stealing the active spare 3598 * from an exported pool. At the user's discretion, such a pool 3599 * can still be forcibly exported. 3600 */ 3601 if (!force && new_state == POOL_STATE_EXPORTED && 3602 spa_has_active_shared_spare(spa)) { 3603 spa_async_resume(spa); 3604 mutex_exit(&spa_namespace_lock); 3605 return (EXDEV); 3606 } 3607 3608 /* 3609 * We want this to be reflected on every label, 3610 * so mark them all dirty. spa_unload() will do the 3611 * final sync that pushes these changes out. 3612 */ 3613 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3614 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3615 spa->spa_state = new_state; 3616 spa->spa_final_txg = spa_last_synced_txg(spa) + 3617 TXG_DEFER_SIZE + 1; 3618 vdev_config_dirty(spa->spa_root_vdev); 3619 spa_config_exit(spa, SCL_ALL, FTAG); 3620 } 3621 } 3622 3623 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3624 3625 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3626 spa_unload(spa); 3627 spa_deactivate(spa); 3628 } 3629 3630 if (oldconfig && spa->spa_config) 3631 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3632 3633 if (new_state != POOL_STATE_UNINITIALIZED) { 3634 if (!hardforce) 3635 spa_config_sync(spa, B_TRUE, B_TRUE); 3636 spa_remove(spa); 3637 } 3638 mutex_exit(&spa_namespace_lock); 3639 3640 return (0); 3641 } 3642 3643 /* 3644 * Destroy a storage pool. 3645 */ 3646 int 3647 spa_destroy(char *pool) 3648 { 3649 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3650 B_FALSE, B_FALSE)); 3651 } 3652 3653 /* 3654 * Export a storage pool. 3655 */ 3656 int 3657 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3658 boolean_t hardforce) 3659 { 3660 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3661 force, hardforce)); 3662 } 3663 3664 /* 3665 * Similar to spa_export(), this unloads the spa_t without actually removing it 3666 * from the namespace in any way.
3667 */ 3668 int 3669 spa_reset(char *pool) 3670 { 3671 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3672 B_FALSE, B_FALSE)); 3673 } 3674 3675 /* 3676 * ========================================================================== 3677 * Device manipulation 3678 * ========================================================================== 3679 */ 3680 3681 /* 3682 * Add a device to a storage pool. 3683 */ 3684 int 3685 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3686 { 3687 uint64_t txg, id; 3688 int error; 3689 vdev_t *rvd = spa->spa_root_vdev; 3690 vdev_t *vd, *tvd; 3691 nvlist_t **spares, **l2cache; 3692 uint_t nspares, nl2cache; 3693 3694 ASSERT(spa_writeable(spa)); 3695 3696 txg = spa_vdev_enter(spa); 3697 3698 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3699 VDEV_ALLOC_ADD)) != 0) 3700 return (spa_vdev_exit(spa, NULL, txg, error)); 3701 3702 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3703 3704 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3705 &nspares) != 0) 3706 nspares = 0; 3707 3708 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3709 &nl2cache) != 0) 3710 nl2cache = 0; 3711 3712 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3713 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3714 3715 if (vd->vdev_children != 0 && 3716 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3717 return (spa_vdev_exit(spa, vd, txg, error)); 3718 3719 /* 3720 * We must validate the spares and l2cache devices after checking the 3721 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3722 */ 3723 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3724 return (spa_vdev_exit(spa, vd, txg, error)); 3725 3726 /* 3727 * Transfer each new top-level vdev from vd to rvd. 3728 */ 3729 for (int c = 0; c < vd->vdev_children; c++) { 3730 3731 /* 3732 * Set the vdev id to the first hole, if one exists. 3733 */ 3734 for (id = 0; id < rvd->vdev_children; id++) { 3735 if (rvd->vdev_child[id]->vdev_ishole) { 3736 vdev_free(rvd->vdev_child[id]); 3737 break; 3738 } 3739 } 3740 tvd = vd->vdev_child[c]; 3741 vdev_remove_child(vd, tvd); 3742 tvd->vdev_id = id; 3743 vdev_add_child(rvd, tvd); 3744 vdev_config_dirty(tvd); 3745 } 3746 3747 if (nspares != 0) { 3748 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3749 ZPOOL_CONFIG_SPARES); 3750 spa_load_spares(spa); 3751 spa->spa_spares.sav_sync = B_TRUE; 3752 } 3753 3754 if (nl2cache != 0) { 3755 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3756 ZPOOL_CONFIG_L2CACHE); 3757 spa_load_l2cache(spa); 3758 spa->spa_l2cache.sav_sync = B_TRUE; 3759 } 3760 3761 /* 3762 * We have to be careful when adding new vdevs to an existing pool. 3763 * If other threads start allocating from these vdevs before we 3764 * sync the config cache, and we lose power, then upon reboot we may 3765 * fail to open the pool because there are DVAs that the config cache 3766 * can't translate. Therefore, we first add the vdevs without 3767 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3768 * and then let spa_config_update() initialize the new metaslabs. 3769 * 3770 * spa_load() checks for added-but-not-initialized vdevs, so that 3771 * if we lose power at any point in this sequence, the remaining 3772 * steps will be completed the next time we load the pool. 
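 * (That check is the vdev_ms_array == 0 test on each top-level vdev in spa_load(), which forces an async config update for any vdev whose metaslab array was never created.)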
3773 */ 3774 (void) spa_vdev_exit(spa, vd, txg, 0); 3775 3776 mutex_enter(&spa_namespace_lock); 3777 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3778 mutex_exit(&spa_namespace_lock); 3779 3780 return (0); 3781 } 3782 3783 /* 3784 * Attach a device to a mirror. The arguments are the path to any device 3785 * in the mirror, and the nvroot for the new device. If the path specifies 3786 * a device that is not mirrored, we automatically insert the mirror vdev. 3787 * 3788 * If 'replacing' is specified, the new device is intended to replace the 3789 * existing device; in this case the two devices are made into their own 3790 * mirror using the 'replacing' vdev, which is functionally identical to 3791 * the mirror vdev (it actually reuses all the same ops) but has a few 3792 * extra rules: you can't attach to it after it's been created, and upon 3793 * completion of resilvering, the first disk (the one being replaced) 3794 * is automatically detached. 3795 */ 3796 int 3797 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3798 { 3799 uint64_t txg, dtl_max_txg; 3800 vdev_t *rvd = spa->spa_root_vdev; 3801 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3802 vdev_ops_t *pvops; 3803 char *oldvdpath, *newvdpath; 3804 int newvd_isspare; 3805 int error; 3806 3807 ASSERT(spa_writeable(spa)); 3808 3809 txg = spa_vdev_enter(spa); 3810 3811 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3812 3813 if (oldvd == NULL) 3814 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3815 3816 if (!oldvd->vdev_ops->vdev_op_leaf) 3817 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3818 3819 pvd = oldvd->vdev_parent; 3820 3821 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3822 VDEV_ALLOC_ADD)) != 0) 3823 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3824 3825 if (newrootvd->vdev_children != 1) 3826 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3827 3828 newvd = newrootvd->vdev_child[0]; 3829 3830 if (!newvd->vdev_ops->vdev_op_leaf) 3831 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3832 3833 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3834 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3835 3836 /* 3837 * Spares can't replace logs 3838 */ 3839 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3840 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3841 3842 if (!replacing) { 3843 /* 3844 * For attach, the only allowable parent is a mirror or the root 3845 * vdev. 3846 */ 3847 if (pvd->vdev_ops != &vdev_mirror_ops && 3848 pvd->vdev_ops != &vdev_root_ops) 3849 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3850 3851 pvops = &vdev_mirror_ops; 3852 } else { 3853 /* 3854 * Active hot spares can only be replaced by inactive hot 3855 * spares. 3856 */ 3857 if (pvd->vdev_ops == &vdev_spare_ops && 3858 oldvd->vdev_isspare && 3859 !spa_has_spare(spa, newvd->vdev_guid)) 3860 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3861 3862 /* 3863 * If the source is a hot spare, and the parent isn't already a 3864 * spare, then we want to create a new hot spare. Otherwise, we 3865 * want to create a replacing vdev. The user is not allowed to 3866 * attach to a spared vdev child unless the 'isspare' state is 3867 * the same (spare replaces spare, non-spare replaces 3868 * non-spare). 
3869 */ 3870 if (pvd->vdev_ops == &vdev_replacing_ops && 3871 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 3872 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3873 } else if (pvd->vdev_ops == &vdev_spare_ops && 3874 newvd->vdev_isspare != oldvd->vdev_isspare) { 3875 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3876 } 3877 3878 if (newvd->vdev_isspare) 3879 pvops = &vdev_spare_ops; 3880 else 3881 pvops = &vdev_replacing_ops; 3882 } 3883 3884 /* 3885 * Make sure the new device is big enough. 3886 */ 3887 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3888 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3889 3890 /* 3891 * The new device cannot have a higher alignment requirement 3892 * than the top-level vdev. 3893 */ 3894 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3895 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3896 3897 /* 3898 * If this is an in-place replacement, update oldvd's path and devid 3899 * to make it distinguishable from newvd, and unopenable from now on. 3900 */ 3901 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3902 spa_strfree(oldvd->vdev_path); 3903 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3904 KM_SLEEP); 3905 (void) sprintf(oldvd->vdev_path, "%s/%s", 3906 newvd->vdev_path, "old"); 3907 if (oldvd->vdev_devid != NULL) { 3908 spa_strfree(oldvd->vdev_devid); 3909 oldvd->vdev_devid = NULL; 3910 } 3911 } 3912 3913 /* mark the device being resilvered */ 3914 newvd->vdev_resilvering = B_TRUE; 3915 3916 /* 3917 * If the parent is not a mirror, or if we're replacing, insert the new 3918 * mirror/replacing/spare vdev above oldvd. 3919 */ 3920 if (pvd->vdev_ops != pvops) 3921 pvd = vdev_add_parent(oldvd, pvops); 3922 3923 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3924 ASSERT(pvd->vdev_ops == pvops); 3925 ASSERT(oldvd->vdev_parent == pvd); 3926 3927 /* 3928 * Extract the new device from its root and add it to pvd. 3929 */ 3930 vdev_remove_child(newrootvd, newvd); 3931 newvd->vdev_id = pvd->vdev_children; 3932 newvd->vdev_crtxg = oldvd->vdev_crtxg; 3933 vdev_add_child(pvd, newvd); 3934 3935 tvd = newvd->vdev_top; 3936 ASSERT(pvd->vdev_top == tvd); 3937 ASSERT(tvd->vdev_parent == rvd); 3938 3939 vdev_config_dirty(tvd); 3940 3941 /* 3942 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 3943 * for any dmu_sync-ed blocks. It will propagate upward when 3944 * spa_vdev_exit() calls vdev_dtl_reassess(). 3945 */ 3946 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 3947 3948 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 3949 dtl_max_txg - TXG_INITIAL); 3950 3951 if (newvd->vdev_isspare) { 3952 spa_spare_activate(newvd); 3953 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 3954 } 3955 3956 oldvdpath = spa_strdup(oldvd->vdev_path); 3957 newvdpath = spa_strdup(newvd->vdev_path); 3958 newvd_isspare = newvd->vdev_isspare; 3959 3960 /* 3961 * Mark newvd's DTL dirty in this txg. 3962 */ 3963 vdev_dirty(tvd, VDD_DTL, newvd, txg); 3964 3965 /* 3966 * Restart the resilver 3967 */ 3968 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 3969 3970 /* 3971 * Commit the config 3972 */ 3973 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 3974 3975 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 3976 "%s vdev=%s %s vdev=%s", 3977 replacing && newvd_isspare ? "spare in" : 3978 replacing ? "replace" : "attach", newvdpath, 3979 replacing ? 
"for" : "to", oldvdpath); 3980 3981 spa_strfree(oldvdpath); 3982 spa_strfree(newvdpath); 3983 3984 if (spa->spa_bootfs) 3985 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 3986 3987 return (0); 3988 } 3989 3990 /* 3991 * Detach a device from a mirror or replacing vdev. 3992 * If 'replace_done' is specified, only detach if the parent 3993 * is a replacing vdev. 3994 */ 3995 int 3996 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 3997 { 3998 uint64_t txg; 3999 int error; 4000 vdev_t *rvd = spa->spa_root_vdev; 4001 vdev_t *vd, *pvd, *cvd, *tvd; 4002 boolean_t unspare = B_FALSE; 4003 uint64_t unspare_guid; 4004 char *vdpath; 4005 4006 ASSERT(spa_writeable(spa)); 4007 4008 txg = spa_vdev_enter(spa); 4009 4010 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4011 4012 if (vd == NULL) 4013 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4014 4015 if (!vd->vdev_ops->vdev_op_leaf) 4016 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4017 4018 pvd = vd->vdev_parent; 4019 4020 /* 4021 * If the parent/child relationship is not as expected, don't do it. 4022 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4023 * vdev that's replacing B with C. The user's intent in replacing 4024 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4025 * the replace by detaching C, the expected behavior is to end up 4026 * M(A,B). But suppose that right after deciding to detach C, 4027 * the replacement of B completes. We would have M(A,C), and then 4028 * ask to detach C, which would leave us with just A -- not what 4029 * the user wanted. To prevent this, we make sure that the 4030 * parent/child relationship hasn't changed -- in this example, 4031 * that C's parent is still the replacing vdev R. 4032 */ 4033 if (pvd->vdev_guid != pguid && pguid != 0) 4034 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4035 4036 /* 4037 * Only 'replacing' or 'spare' vdevs can be replaced. 4038 */ 4039 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4040 pvd->vdev_ops != &vdev_spare_ops) 4041 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4042 4043 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4044 spa_version(spa) >= SPA_VERSION_SPARES); 4045 4046 /* 4047 * Only mirror, replacing, and spare vdevs support detach. 4048 */ 4049 if (pvd->vdev_ops != &vdev_replacing_ops && 4050 pvd->vdev_ops != &vdev_mirror_ops && 4051 pvd->vdev_ops != &vdev_spare_ops) 4052 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4053 4054 /* 4055 * If this device has the only valid copy of some data, 4056 * we cannot safely detach it. 4057 */ 4058 if (vdev_dtl_required(vd)) 4059 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4060 4061 ASSERT(pvd->vdev_children >= 2); 4062 4063 /* 4064 * If we are detaching the second disk from a replacing vdev, then 4065 * check to see if we changed the original vdev's path to have "/old" 4066 * at the end in spa_vdev_attach(). If so, undo that change now. 
4067 */ 4068 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4069 vd->vdev_path != NULL) { 4070 size_t len = strlen(vd->vdev_path); 4071 4072 for (int c = 0; c < pvd->vdev_children; c++) { 4073 cvd = pvd->vdev_child[c]; 4074 4075 if (cvd == vd || cvd->vdev_path == NULL) 4076 continue; 4077 4078 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4079 strcmp(cvd->vdev_path + len, "/old") == 0) { 4080 spa_strfree(cvd->vdev_path); 4081 cvd->vdev_path = spa_strdup(vd->vdev_path); 4082 break; 4083 } 4084 } 4085 } 4086 4087 /* 4088 * If we are detaching the original disk from a spare, then it implies 4089 * that the spare should become a real disk, and be removed from the 4090 * active spare list for the pool. 4091 */ 4092 if (pvd->vdev_ops == &vdev_spare_ops && 4093 vd->vdev_id == 0 && 4094 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4095 unspare = B_TRUE; 4096 4097 /* 4098 * Erase the disk labels so the disk can be used for other things. 4099 * This must be done after all other error cases are handled, 4100 * but before we disembowel vd (so we can still do I/O to it). 4101 * But if we can't do it, don't treat the error as fatal -- 4102 * it may be that the unwritability of the disk is the reason 4103 * it's being detached! 4104 */ 4105 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4106 4107 /* 4108 * Remove vd from its parent and compact the parent's children. 4109 */ 4110 vdev_remove_child(pvd, vd); 4111 vdev_compact_children(pvd); 4112 4113 /* 4114 * Remember one of the remaining children so we can get tvd below. 4115 */ 4116 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4117 4118 /* 4119 * If we need to remove the remaining child from the list of hot spares, 4120 * do it now, marking the vdev as no longer a spare in the process. 4121 * We must do this before vdev_remove_parent(), because that can 4122 * change the GUID if it creates a new toplevel GUID. For a similar 4123 * reason, we must remove the spare now, in the same txg as the detach; 4124 * otherwise someone could attach a new sibling, change the GUID, and 4125 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4126 */ 4127 if (unspare) { 4128 ASSERT(cvd->vdev_isspare); 4129 spa_spare_remove(cvd); 4130 unspare_guid = cvd->vdev_guid; 4131 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4132 cvd->vdev_unspare = B_TRUE; 4133 } 4134 4135 /* 4136 * If the parent mirror/replacing vdev only has one child, 4137 * the parent is no longer needed. Remove it from the tree. 4138 */ 4139 if (pvd->vdev_children == 1) { 4140 if (pvd->vdev_ops == &vdev_spare_ops) 4141 cvd->vdev_unspare = B_FALSE; 4142 vdev_remove_parent(cvd); 4143 cvd->vdev_resilvering = B_FALSE; 4144 } 4145 4146 4147 /* 4148 * We don't set tvd until now because the parent we just removed 4149 * may have been the previous top-level vdev. 4150 */ 4151 tvd = cvd->vdev_top; 4152 ASSERT(tvd->vdev_parent == rvd); 4153 4154 /* 4155 * Reevaluate the parent vdev state. 4156 */ 4157 vdev_propagate_state(cvd); 4158 4159 /* 4160 * If the 'autoexpand' property is set on the pool then automatically 4161 * try to expand the size of the pool. For example if the device we 4162 * just detached was smaller than the others, it may be possible to 4163 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4164 * first so that we can obtain the updated sizes of the leaf vdevs. 
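 *
 * For illustration (hypothetical sizes): if the detached leaf was a 1TB disk
 * mirrored with a 2TB disk, reopening the top-level vdev lets it pick up the
 * larger usable size, and vdev_expand() can then add the extra metaslabs in
 * this txg.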
4165 */ 4166 if (spa->spa_autoexpand) { 4167 vdev_reopen(tvd); 4168 vdev_expand(tvd, txg); 4169 } 4170 4171 vdev_config_dirty(tvd); 4172 4173 /* 4174 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4175 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4176 * But first make sure we're not on any *other* txg's DTL list, to 4177 * prevent vd from being accessed after it's freed. 4178 */ 4179 vdpath = spa_strdup(vd->vdev_path); 4180 for (int t = 0; t < TXG_SIZE; t++) 4181 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4182 vd->vdev_detached = B_TRUE; 4183 vdev_dirty(tvd, VDD_DTL, vd, txg); 4184 4185 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4186 4187 /* hang on to the spa before we release the lock */ 4188 spa_open_ref(spa, FTAG); 4189 4190 error = spa_vdev_exit(spa, vd, txg, 0); 4191 4192 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4193 "vdev=%s", vdpath); 4194 spa_strfree(vdpath); 4195 4196 /* 4197 * If this was the removal of the original device in a hot spare vdev, 4198 * then we want to go through and remove the device from the hot spare 4199 * list of every other pool. 4200 */ 4201 if (unspare) { 4202 spa_t *altspa = NULL; 4203 4204 mutex_enter(&spa_namespace_lock); 4205 while ((altspa = spa_next(altspa)) != NULL) { 4206 if (altspa->spa_state != POOL_STATE_ACTIVE || 4207 altspa == spa) 4208 continue; 4209 4210 spa_open_ref(altspa, FTAG); 4211 mutex_exit(&spa_namespace_lock); 4212 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4213 mutex_enter(&spa_namespace_lock); 4214 spa_close(altspa, FTAG); 4215 } 4216 mutex_exit(&spa_namespace_lock); 4217 4218 /* search the rest of the vdevs for spares to remove */ 4219 spa_vdev_resilver_done(spa); 4220 } 4221 4222 /* all done with the spa; OK to release */ 4223 mutex_enter(&spa_namespace_lock); 4224 spa_close(spa, FTAG); 4225 mutex_exit(&spa_namespace_lock); 4226 4227 return (error); 4228 } 4229 4230 /* 4231 * Split a set of devices from their mirrors, and create a new pool from them. 
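 *
 * For illustration (pool and device layout are hypothetical): on a pool built
 * from two 2-way mirrors, 'zpool split tank tank2' arrives here with a config
 * naming one leaf of each mirror; those leaves are offlined, stamped with a
 * freshly generated pool GUID, and assembled into the new single-copy pool
 * 'tank2', while the original pool keeps the remaining halves.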
4232 */ 4233 int 4234 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4235 nvlist_t *props, boolean_t exp) 4236 { 4237 int error = 0; 4238 uint64_t txg, *glist; 4239 spa_t *newspa; 4240 uint_t c, children, lastlog; 4241 nvlist_t **child, *nvl, *tmp; 4242 dmu_tx_t *tx; 4243 char *altroot = NULL; 4244 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4245 boolean_t activate_slog; 4246 4247 ASSERT(spa_writeable(spa)); 4248 4249 txg = spa_vdev_enter(spa); 4250 4251 /* clear the log and flush everything up to now */ 4252 activate_slog = spa_passivate_log(spa); 4253 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4254 error = spa_offline_log(spa); 4255 txg = spa_vdev_config_enter(spa); 4256 4257 if (activate_slog) 4258 spa_activate_log(spa); 4259 4260 if (error != 0) 4261 return (spa_vdev_exit(spa, NULL, txg, error)); 4262 4263 /* check new spa name before going any further */ 4264 if (spa_lookup(newname) != NULL) 4265 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4266 4267 /* 4268 * scan through all the children to ensure they're all mirrors 4269 */ 4270 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4271 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4272 &children) != 0) 4273 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4274 4275 /* first, check to ensure we've got the right child count */ 4276 rvd = spa->spa_root_vdev; 4277 lastlog = 0; 4278 for (c = 0; c < rvd->vdev_children; c++) { 4279 vdev_t *vd = rvd->vdev_child[c]; 4280 4281 /* don't count the holes & logs as children */ 4282 if (vd->vdev_islog || vd->vdev_ishole) { 4283 if (lastlog == 0) 4284 lastlog = c; 4285 continue; 4286 } 4287 4288 lastlog = 0; 4289 } 4290 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4291 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4292 4293 /* next, ensure no spare or cache devices are part of the split */ 4294 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4295 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4296 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4297 4298 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4299 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4300 4301 /* then, loop over each vdev and validate it */ 4302 for (c = 0; c < children; c++) { 4303 uint64_t is_hole = 0; 4304 4305 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4306 &is_hole); 4307 4308 if (is_hole != 0) { 4309 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4310 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4311 continue; 4312 } else { 4313 error = EINVAL; 4314 break; 4315 } 4316 } 4317 4318 /* which disk is going to be split? 
*/ 4319 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4320 &glist[c]) != 0) { 4321 error = EINVAL; 4322 break; 4323 } 4324 4325 /* look it up in the spa */ 4326 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4327 if (vml[c] == NULL) { 4328 error = ENODEV; 4329 break; 4330 } 4331 4332 /* make sure there's nothing stopping the split */ 4333 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4334 vml[c]->vdev_islog || 4335 vml[c]->vdev_ishole || 4336 vml[c]->vdev_isspare || 4337 vml[c]->vdev_isl2cache || 4338 !vdev_writeable(vml[c]) || 4339 vml[c]->vdev_children != 0 || 4340 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4341 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4342 error = EINVAL; 4343 break; 4344 } 4345 4346 if (vdev_dtl_required(vml[c])) { 4347 error = EBUSY; 4348 break; 4349 } 4350 4351 /* we need certain info from the top level */ 4352 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4353 vml[c]->vdev_top->vdev_ms_array) == 0); 4354 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4355 vml[c]->vdev_top->vdev_ms_shift) == 0); 4356 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4357 vml[c]->vdev_top->vdev_asize) == 0); 4358 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4359 vml[c]->vdev_top->vdev_ashift) == 0); 4360 } 4361 4362 if (error != 0) { 4363 kmem_free(vml, children * sizeof (vdev_t *)); 4364 kmem_free(glist, children * sizeof (uint64_t)); 4365 return (spa_vdev_exit(spa, NULL, txg, error)); 4366 } 4367 4368 /* stop writers from using the disks */ 4369 for (c = 0; c < children; c++) { 4370 if (vml[c] != NULL) 4371 vml[c]->vdev_offline = B_TRUE; 4372 } 4373 vdev_reopen(spa->spa_root_vdev); 4374 4375 /* 4376 * Temporarily record the splitting vdevs in the spa config. This 4377 * will disappear once the config is regenerated. 4378 */ 4379 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4380 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4381 glist, children) == 0); 4382 kmem_free(glist, children * sizeof (uint64_t)); 4383 4384 mutex_enter(&spa->spa_props_lock); 4385 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4386 nvl) == 0); 4387 mutex_exit(&spa->spa_props_lock); 4388 spa->spa_config_splitting = nvl; 4389 vdev_config_dirty(spa->spa_root_vdev); 4390 4391 /* configure and create the new pool */ 4392 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4393 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4394 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4395 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4396 spa_version(spa)) == 0); 4397 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4398 spa->spa_config_txg) == 0); 4399 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4400 spa_generate_guid(NULL)) == 0); 4401 (void) nvlist_lookup_string(props, 4402 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4403 4404 /* add the new pool to the namespace */ 4405 newspa = spa_add(newname, config, altroot); 4406 newspa->spa_config_txg = spa->spa_config_txg; 4407 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4408 4409 /* release the spa config lock, retaining the namespace lock */ 4410 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4411 4412 if (zio_injection_enabled) 4413 zio_handle_panic_injection(spa, FTAG, 1); 4414 4415 spa_activate(newspa, spa_mode_global); 4416 spa_async_suspend(newspa); 4417 4418 /* create the new pool from the disks of the original pool */ 4419 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4420 if (error) 4421 goto out; 4422 4423 /* if that worked, generate a real config for the new pool */ 4424 if (newspa->spa_root_vdev != NULL) { 4425 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4426 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4427 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4428 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4429 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4430 B_TRUE)); 4431 } 4432 4433 /* set the props */ 4434 if (props != NULL) { 4435 spa_configfile_set(newspa, props, B_FALSE); 4436 error = spa_prop_set(newspa, props); 4437 if (error) 4438 goto out; 4439 } 4440 4441 /* flush everything */ 4442 txg = spa_vdev_config_enter(newspa); 4443 vdev_config_dirty(newspa->spa_root_vdev); 4444 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4445 4446 if (zio_injection_enabled) 4447 zio_handle_panic_injection(spa, FTAG, 2); 4448 4449 spa_async_resume(newspa); 4450 4451 /* finally, update the original pool's config */ 4452 txg = spa_vdev_config_enter(spa); 4453 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4454 error = dmu_tx_assign(tx, TXG_WAIT); 4455 if (error != 0) 4456 dmu_tx_abort(tx); 4457 for (c = 0; c < children; c++) { 4458 if (vml[c] != NULL) { 4459 vdev_split(vml[c]); 4460 if (error == 0) 4461 spa_history_log_internal(LOG_POOL_VDEV_DETACH, 4462 spa, tx, "vdev=%s", 4463 vml[c]->vdev_path); 4464 vdev_free(vml[c]); 4465 } 4466 } 4467 vdev_config_dirty(spa->spa_root_vdev); 4468 spa->spa_config_splitting = NULL; 4469 nvlist_free(nvl); 4470 if (error == 0) 4471 dmu_tx_commit(tx); 4472 (void) spa_vdev_exit(spa, NULL, txg, 0); 4473 4474 if (zio_injection_enabled) 4475 zio_handle_panic_injection(spa, FTAG, 3); 4476 4477 /* split is complete; log a history record */ 4478 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 4479 "split new pool %s from pool %s", newname, spa_name(spa)); 4480 4481 kmem_free(vml, children * sizeof (vdev_t *)); 4482 4483 /* if we're not going to mount the filesystems in userland, export */ 4484 if (exp) 4485 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4486 B_FALSE, B_FALSE); 4487 4488 return (error); 4489 4490 out: 4491 spa_unload(newspa); 4492 spa_deactivate(newspa); 4493 spa_remove(newspa); 4494 4495 txg = spa_vdev_config_enter(spa); 4496 4497 /* re-online all offlined disks */ 4498 for (c = 0; c < children; c++) { 4499 if (vml[c] != NULL) 4500 vml[c]->vdev_offline = B_FALSE; 4501 } 4502 vdev_reopen(spa->spa_root_vdev); 4503 4504 
nvlist_free(spa->spa_config_splitting); 4505 spa->spa_config_splitting = NULL; 4506 (void) spa_vdev_exit(spa, NULL, txg, error); 4507 4508 kmem_free(vml, children * sizeof (vdev_t *)); 4509 return (error); 4510 } 4511 4512 static nvlist_t * 4513 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4514 { 4515 for (int i = 0; i < count; i++) { 4516 uint64_t guid; 4517 4518 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4519 &guid) == 0); 4520 4521 if (guid == target_guid) 4522 return (nvpp[i]); 4523 } 4524 4525 return (NULL); 4526 } 4527 4528 static void 4529 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4530 nvlist_t *dev_to_remove) 4531 { 4532 nvlist_t **newdev = NULL; 4533 4534 if (count > 1) 4535 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4536 4537 for (int i = 0, j = 0; i < count; i++) { 4538 if (dev[i] == dev_to_remove) 4539 continue; 4540 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4541 } 4542 4543 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4544 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4545 4546 for (int i = 0; i < count - 1; i++) 4547 nvlist_free(newdev[i]); 4548 4549 if (count > 1) 4550 kmem_free(newdev, (count - 1) * sizeof (void *)); 4551 } 4552 4553 /* 4554 * Evacuate the device. 4555 */ 4556 static int 4557 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4558 { 4559 uint64_t txg; 4560 int error = 0; 4561 4562 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4563 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4564 ASSERT(vd == vd->vdev_top); 4565 4566 /* 4567 * Evacuate the device. We don't hold the config lock as writer 4568 * since we need to do I/O but we do keep the 4569 * spa_namespace_lock held. Once this completes the device 4570 * should no longer have any blocks allocated on it. 4571 */ 4572 if (vd->vdev_islog) { 4573 if (vd->vdev_stat.vs_alloc != 0) 4574 error = spa_offline_log(spa); 4575 } else { 4576 error = ENOTSUP; 4577 } 4578 4579 if (error) 4580 return (error); 4581 4582 /* 4583 * The evacuation succeeded. Remove any remaining MOS metadata 4584 * associated with this vdev, and wait for these changes to sync. 4585 */ 4586 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 4587 txg = spa_vdev_config_enter(spa); 4588 vd->vdev_removing = B_TRUE; 4589 vdev_dirty(vd, 0, NULL, txg); 4590 vdev_config_dirty(vd); 4591 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4592 4593 return (0); 4594 } 4595 4596 /* 4597 * Complete the removal by cleaning up the namespace. 4598 */ 4599 static void 4600 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4601 { 4602 vdev_t *rvd = spa->spa_root_vdev; 4603 uint64_t id = vd->vdev_id; 4604 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4605 4606 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4607 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4608 ASSERT(vd == vd->vdev_top); 4609 4610 /* 4611 * Only remove any devices which are empty. 
4612 */ 4613 if (vd->vdev_stat.vs_alloc != 0) 4614 return; 4615 4616 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4617 4618 if (list_link_active(&vd->vdev_state_dirty_node)) 4619 vdev_state_clean(vd); 4620 if (list_link_active(&vd->vdev_config_dirty_node)) 4621 vdev_config_clean(vd); 4622 4623 vdev_free(vd); 4624 4625 if (last_vdev) { 4626 vdev_compact_children(rvd); 4627 } else { 4628 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4629 vdev_add_child(rvd, vd); 4630 } 4631 vdev_config_dirty(rvd); 4632 4633 /* 4634 * Reassess the health of our root vdev. 4635 */ 4636 vdev_reopen(rvd); 4637 } 4638 4639 /* 4640 * Remove a device from the pool - 4641 * 4642 * Removing a device from the vdev namespace requires several steps 4643 * and can take a significant amount of time. As a result we use 4644 * the spa_vdev_config_[enter/exit] functions which allow us to 4645 * grab and release the spa_config_lock while still holding the namespace 4646 * lock. During each step the configuration is synced out. 4647 */ 4648 4649 /* 4650 * Remove a device from the pool. Currently, this supports removing only hot 4651 * spares, slogs, and level 2 ARC devices. 4652 */ 4653 int 4654 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4655 { 4656 vdev_t *vd; 4657 metaslab_group_t *mg; 4658 nvlist_t **spares, **l2cache, *nv; 4659 uint64_t txg = 0; 4660 uint_t nspares, nl2cache; 4661 int error = 0; 4662 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4663 4664 ASSERT(spa_writeable(spa)); 4665 4666 if (!locked) 4667 txg = spa_vdev_enter(spa); 4668 4669 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4670 4671 if (spa->spa_spares.sav_vdevs != NULL && 4672 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4673 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4674 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4675 /* 4676 * Only remove the hot spare if it's not currently in use 4677 * in this pool. 4678 */ 4679 if (vd == NULL || unspare) { 4680 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4681 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4682 spa_load_spares(spa); 4683 spa->spa_spares.sav_sync = B_TRUE; 4684 } else { 4685 error = EBUSY; 4686 } 4687 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4688 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4689 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4690 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4691 /* 4692 * Cache devices can always be removed. 4693 */ 4694 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4695 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4696 spa_load_l2cache(spa); 4697 spa->spa_l2cache.sav_sync = B_TRUE; 4698 } else if (vd != NULL && vd->vdev_islog) { 4699 ASSERT(!locked); 4700 ASSERT(vd == vd->vdev_top); 4701 4702 /* 4703 * XXX - Once we have bp-rewrite this should 4704 * become the common case. 4705 */ 4706 4707 mg = vd->vdev_mg; 4708 4709 /* 4710 * Stop allocating from this vdev. 4711 */ 4712 metaslab_group_passivate(mg); 4713 4714 /* 4715 * Wait for the youngest allocations and frees to sync, 4716 * and then wait for the deferral of those frees to finish. 4717 */ 4718 spa_vdev_config_exit(spa, NULL, 4719 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4720 4721 /* 4722 * Attempt to evacuate the vdev. 4723 */ 4724 error = spa_vdev_remove_evacuate(spa, vd); 4725 4726 txg = spa_vdev_config_enter(spa); 4727 4728 /* 4729 * If we couldn't evacuate the vdev, unwind. 
4730 */ 4731 if (error) { 4732 metaslab_group_activate(mg); 4733 return (spa_vdev_exit(spa, NULL, txg, error)); 4734 } 4735 4736 /* 4737 * Clean up the vdev namespace. 4738 */ 4739 spa_vdev_remove_from_namespace(spa, vd); 4740 4741 } else if (vd != NULL) { 4742 /* 4743 * Normal vdevs cannot be removed (yet). 4744 */ 4745 error = ENOTSUP; 4746 } else { 4747 /* 4748 * There is no vdev of any kind with the specified guid. 4749 */ 4750 error = ENOENT; 4751 } 4752 4753 if (!locked) 4754 return (spa_vdev_exit(spa, NULL, txg, error)); 4755 4756 return (error); 4757 } 4758 4759 /* 4760 * Find any device that's done replacing, or a vdev marked 'unspare' that's 4761 * currently spared, so we can detach it. 4762 */ 4763 static vdev_t * 4764 spa_vdev_resilver_done_hunt(vdev_t *vd) 4765 { 4766 vdev_t *newvd, *oldvd; 4767 4768 for (int c = 0; c < vd->vdev_children; c++) { 4769 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 4770 if (oldvd != NULL) 4771 return (oldvd); 4772 } 4773 4774 /* 4775 * Check for a completed replacement. We always consider the first 4776 * vdev in the list to be the oldest vdev, and the last one to be 4777 * the newest (see spa_vdev_attach() for how that works). In 4778 * the case where the newest vdev is faulted, we will not automatically 4779 * remove it after a resilver completes. This is OK as it will require 4780 * user intervention to determine which disk the admin wishes to keep. 4781 */ 4782 if (vd->vdev_ops == &vdev_replacing_ops) { 4783 ASSERT(vd->vdev_children > 1); 4784 4785 newvd = vd->vdev_child[vd->vdev_children - 1]; 4786 oldvd = vd->vdev_child[0]; 4787 4788 if (vdev_dtl_empty(newvd, DTL_MISSING) && 4789 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4790 !vdev_dtl_required(oldvd)) 4791 return (oldvd); 4792 } 4793 4794 /* 4795 * Check for a completed resilver with the 'unspare' flag set. 4796 */ 4797 if (vd->vdev_ops == &vdev_spare_ops) { 4798 vdev_t *first = vd->vdev_child[0]; 4799 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 4800 4801 if (last->vdev_unspare) { 4802 oldvd = first; 4803 newvd = last; 4804 } else if (first->vdev_unspare) { 4805 oldvd = last; 4806 newvd = first; 4807 } else { 4808 oldvd = NULL; 4809 } 4810 4811 if (oldvd != NULL && 4812 vdev_dtl_empty(newvd, DTL_MISSING) && 4813 vdev_dtl_empty(newvd, DTL_OUTAGE) && 4814 !vdev_dtl_required(oldvd)) 4815 return (oldvd); 4816 4817 /* 4818 * If there are more than two spares attached to a disk, 4819 * and those spares are not required, then we want to 4820 * attempt to free them up now so that they can be used 4821 * by other pools. Once we're back down to a single 4822 * disk+spare, we stop removing them.
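 *
 * For illustration (device names are hypothetical): given a spare vdev with
 * children (disk, spareA, spareB) where both spares are healthy and no longer
 * required, the check below returns the middle child (spareA) so the caller
 * can detach it; the hunt repeats until only disk+spare remain.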
4823 */ 4824 if (vd->vdev_children > 2) { 4825 newvd = vd->vdev_child[1]; 4826 4827 if (newvd->vdev_isspare && last->vdev_isspare && 4828 vdev_dtl_empty(last, DTL_MISSING) && 4829 vdev_dtl_empty(last, DTL_OUTAGE) && 4830 !vdev_dtl_required(newvd)) 4831 return (newvd); 4832 } 4833 } 4834 4835 return (NULL); 4836 } 4837 4838 static void 4839 spa_vdev_resilver_done(spa_t *spa) 4840 { 4841 vdev_t *vd, *pvd, *ppvd; 4842 uint64_t guid, sguid, pguid, ppguid; 4843 4844 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4845 4846 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4847 pvd = vd->vdev_parent; 4848 ppvd = pvd->vdev_parent; 4849 guid = vd->vdev_guid; 4850 pguid = pvd->vdev_guid; 4851 ppguid = ppvd->vdev_guid; 4852 sguid = 0; 4853 /* 4854 * If we have just finished replacing a hot spared device, then 4855 * we need to detach the parent's first child (the original hot 4856 * spare) as well. 4857 */ 4858 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 4859 ppvd->vdev_children == 2) { 4860 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4861 sguid = ppvd->vdev_child[1]->vdev_guid; 4862 } 4863 spa_config_exit(spa, SCL_ALL, FTAG); 4864 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4865 return; 4866 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4867 return; 4868 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4869 } 4870 4871 spa_config_exit(spa, SCL_ALL, FTAG); 4872 } 4873 4874 /* 4875 * Update the stored path or FRU for this vdev. 4876 */ 4877 int 4878 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4879 boolean_t ispath) 4880 { 4881 vdev_t *vd; 4882 boolean_t sync = B_FALSE; 4883 4884 ASSERT(spa_writeable(spa)); 4885 4886 spa_vdev_state_enter(spa, SCL_ALL); 4887 4888 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4889 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 4890 4891 if (!vd->vdev_ops->vdev_op_leaf) 4892 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 4893 4894 if (ispath) { 4895 if (strcmp(value, vd->vdev_path) != 0) { 4896 spa_strfree(vd->vdev_path); 4897 vd->vdev_path = spa_strdup(value); 4898 sync = B_TRUE; 4899 } 4900 } else { 4901 if (vd->vdev_fru == NULL) { 4902 vd->vdev_fru = spa_strdup(value); 4903 sync = B_TRUE; 4904 } else if (strcmp(value, vd->vdev_fru) != 0) { 4905 spa_strfree(vd->vdev_fru); 4906 vd->vdev_fru = spa_strdup(value); 4907 sync = B_TRUE; 4908 } 4909 } 4910 4911 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 4912 } 4913 4914 int 4915 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 4916 { 4917 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 4918 } 4919 4920 int 4921 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 4922 { 4923 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 4924 } 4925 4926 /* 4927 * ========================================================================== 4928 * SPA Scanning 4929 * ========================================================================== 4930 */ 4931 4932 int 4933 spa_scan_stop(spa_t *spa) 4934 { 4935 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4936 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 4937 return (EBUSY); 4938 return (dsl_scan_cancel(spa->spa_dsl_pool)); 4939 } 4940 4941 int 4942 spa_scan(spa_t *spa, pool_scan_func_t func) 4943 { 4944 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4945 4946 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 4947 return (ENOTSUP); 4948 4949 /* 4950 * If a resilver was requested, but there is no DTL on a 4951 * writeable leaf device, we have nothing to do. 4952 */ 4953 if (func == POOL_SCAN_RESILVER && 4954 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4955 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 4956 return (0); 4957 } 4958 4959 return (dsl_scan(spa->spa_dsl_pool, func)); 4960 } 4961 4962 /* 4963 * ========================================================================== 4964 * SPA async task processing 4965 * ========================================================================== 4966 */ 4967 4968 static void 4969 spa_async_remove(spa_t *spa, vdev_t *vd) 4970 { 4971 if (vd->vdev_remove_wanted) { 4972 vd->vdev_remove_wanted = B_FALSE; 4973 vd->vdev_delayed_close = B_FALSE; 4974 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 4975 4976 /* 4977 * We want to clear the stats, but we don't want to do a full 4978 * vdev_clear() as that will cause us to throw away 4979 * degraded/faulted state as well as attempt to reopen the 4980 * device, all of which is a waste. 
4981 */ 4982 vd->vdev_stat.vs_read_errors = 0; 4983 vd->vdev_stat.vs_write_errors = 0; 4984 vd->vdev_stat.vs_checksum_errors = 0; 4985 4986 vdev_state_dirty(vd->vdev_top); 4987 } 4988 4989 for (int c = 0; c < vd->vdev_children; c++) 4990 spa_async_remove(spa, vd->vdev_child[c]); 4991 } 4992 4993 static void 4994 spa_async_probe(spa_t *spa, vdev_t *vd) 4995 { 4996 if (vd->vdev_probe_wanted) { 4997 vd->vdev_probe_wanted = B_FALSE; 4998 vdev_reopen(vd); /* vdev_open() does the actual probe */ 4999 } 5000 5001 for (int c = 0; c < vd->vdev_children; c++) 5002 spa_async_probe(spa, vd->vdev_child[c]); 5003 } 5004 5005 static void 5006 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5007 { 5008 sysevent_id_t eid; 5009 nvlist_t *attr; 5010 char *physpath; 5011 5012 if (!spa->spa_autoexpand) 5013 return; 5014 5015 for (int c = 0; c < vd->vdev_children; c++) { 5016 vdev_t *cvd = vd->vdev_child[c]; 5017 spa_async_autoexpand(spa, cvd); 5018 } 5019 5020 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5021 return; 5022 5023 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5024 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5025 5026 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5027 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5028 5029 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5030 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 5031 5032 nvlist_free(attr); 5033 kmem_free(physpath, MAXPATHLEN); 5034 } 5035 5036 static void 5037 spa_async_thread(spa_t *spa) 5038 { 5039 int tasks; 5040 5041 ASSERT(spa->spa_sync_on); 5042 5043 mutex_enter(&spa->spa_async_lock); 5044 tasks = spa->spa_async_tasks; 5045 spa->spa_async_tasks = 0; 5046 mutex_exit(&spa->spa_async_lock); 5047 5048 /* 5049 * See if the config needs to be updated. 5050 */ 5051 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5052 uint64_t old_space, new_space; 5053 5054 mutex_enter(&spa_namespace_lock); 5055 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5056 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5057 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5058 mutex_exit(&spa_namespace_lock); 5059 5060 /* 5061 * If the pool grew as a result of the config update, 5062 * then log an internal history event. 5063 */ 5064 if (new_space != old_space) { 5065 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5066 spa, NULL, 5067 "pool '%s' size: %llu(+%llu)", 5068 spa_name(spa), new_space, new_space - old_space); 5069 } 5070 } 5071 5072 /* 5073 * See if any devices need to be marked REMOVED. 5074 */ 5075 if (tasks & SPA_ASYNC_REMOVE) { 5076 spa_vdev_state_enter(spa, SCL_NONE); 5077 spa_async_remove(spa, spa->spa_root_vdev); 5078 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5079 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5080 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5081 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5082 (void) spa_vdev_state_exit(spa, NULL, 0); 5083 } 5084 5085 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5086 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5087 spa_async_autoexpand(spa, spa->spa_root_vdev); 5088 spa_config_exit(spa, SCL_CONFIG, FTAG); 5089 } 5090 5091 /* 5092 * See if any devices need to be probed. 5093 */ 5094 if (tasks & SPA_ASYNC_PROBE) { 5095 spa_vdev_state_enter(spa, SCL_NONE); 5096 spa_async_probe(spa, spa->spa_root_vdev); 5097 (void) spa_vdev_state_exit(spa, NULL, 0); 5098 } 5099 5100 /* 5101 * If any devices are done replacing, detach them. 
5102 */ 5103 if (tasks & SPA_ASYNC_RESILVER_DONE) 5104 spa_vdev_resilver_done(spa); 5105 5106 /* 5107 * Kick off a resilver. 5108 */ 5109 if (tasks & SPA_ASYNC_RESILVER) 5110 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5111 5112 /* 5113 * Let the world know that we're done. 5114 */ 5115 mutex_enter(&spa->spa_async_lock); 5116 spa->spa_async_thread = NULL; 5117 cv_broadcast(&spa->spa_async_cv); 5118 mutex_exit(&spa->spa_async_lock); 5119 thread_exit(); 5120 } 5121 5122 void 5123 spa_async_suspend(spa_t *spa) 5124 { 5125 mutex_enter(&spa->spa_async_lock); 5126 spa->spa_async_suspended++; 5127 while (spa->spa_async_thread != NULL) 5128 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5129 mutex_exit(&spa->spa_async_lock); 5130 } 5131 5132 void 5133 spa_async_resume(spa_t *spa) 5134 { 5135 mutex_enter(&spa->spa_async_lock); 5136 ASSERT(spa->spa_async_suspended != 0); 5137 spa->spa_async_suspended--; 5138 mutex_exit(&spa->spa_async_lock); 5139 } 5140 5141 static void 5142 spa_async_dispatch(spa_t *spa) 5143 { 5144 mutex_enter(&spa->spa_async_lock); 5145 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5146 spa->spa_async_thread == NULL && 5147 rootdir != NULL && !vn_is_readonly(rootdir)) 5148 spa->spa_async_thread = thread_create(NULL, 0, 5149 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5150 mutex_exit(&spa->spa_async_lock); 5151 } 5152 5153 void 5154 spa_async_request(spa_t *spa, int task) 5155 { 5156 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5157 mutex_enter(&spa->spa_async_lock); 5158 spa->spa_async_tasks |= task; 5159 mutex_exit(&spa->spa_async_lock); 5160 } 5161 5162 /* 5163 * ========================================================================== 5164 * SPA syncing routines 5165 * ========================================================================== 5166 */ 5167 5168 static int 5169 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5170 { 5171 bpobj_t *bpo = arg; 5172 bpobj_enqueue(bpo, bp, tx); 5173 return (0); 5174 } 5175 5176 static int 5177 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5178 { 5179 zio_t *zio = arg; 5180 5181 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5182 zio->io_flags)); 5183 return (0); 5184 } 5185 5186 static void 5187 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5188 { 5189 char *packed = NULL; 5190 size_t bufsize; 5191 size_t nvsize = 0; 5192 dmu_buf_t *db; 5193 5194 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5195 5196 /* 5197 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5198 * information. This avoids the dbuf_will_dirty() path and 5199 * saves us a pre-read to get data we don't actually care about. 
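 *
 * For illustration (assuming SPA_CONFIG_BLOCKSIZE is 16K, matching the
 * 1 << 14 block size used for packed-nvlist objects elsewhere in this file):
 * a packed nvlist of nvsize 10000 bytes gets bufsize = P2ROUNDUP(10000,
 * 16384) = 16384, and the 6384-byte tail is zeroed before the dmu_write().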
5200 */ 5201 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 5202 packed = kmem_alloc(bufsize, KM_SLEEP); 5203 5204 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5205 KM_SLEEP) == 0); 5206 bzero(packed + nvsize, bufsize - nvsize); 5207 5208 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5209 5210 kmem_free(packed, bufsize); 5211 5212 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5213 dmu_buf_will_dirty(db, tx); 5214 *(uint64_t *)db->db_data = nvsize; 5215 dmu_buf_rele(db, FTAG); 5216 } 5217 5218 static void 5219 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5220 const char *config, const char *entry) 5221 { 5222 nvlist_t *nvroot; 5223 nvlist_t **list; 5224 int i; 5225 5226 if (!sav->sav_sync) 5227 return; 5228 5229 /* 5230 * Update the MOS nvlist describing the list of available devices. 5231 * spa_validate_aux() will have already made sure this nvlist is 5232 * valid and the vdevs are labeled appropriately. 5233 */ 5234 if (sav->sav_object == 0) { 5235 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5236 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5237 sizeof (uint64_t), tx); 5238 VERIFY(zap_update(spa->spa_meta_objset, 5239 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5240 &sav->sav_object, tx) == 0); 5241 } 5242 5243 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5244 if (sav->sav_count == 0) { 5245 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5246 } else { 5247 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5248 for (i = 0; i < sav->sav_count; i++) 5249 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5250 B_FALSE, VDEV_CONFIG_L2CACHE); 5251 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5252 sav->sav_count) == 0); 5253 for (i = 0; i < sav->sav_count; i++) 5254 nvlist_free(list[i]); 5255 kmem_free(list, sav->sav_count * sizeof (void *)); 5256 } 5257 5258 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5259 nvlist_free(nvroot); 5260 5261 sav->sav_sync = B_FALSE; 5262 } 5263 5264 static void 5265 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5266 { 5267 nvlist_t *config; 5268 5269 if (list_is_empty(&spa->spa_config_dirty_list)) 5270 return; 5271 5272 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5273 5274 config = spa_config_generate(spa, spa->spa_root_vdev, 5275 dmu_tx_get_txg(tx), B_FALSE); 5276 5277 spa_config_exit(spa, SCL_STATE, FTAG); 5278 5279 if (spa->spa_config_syncing) 5280 nvlist_free(spa->spa_config_syncing); 5281 spa->spa_config_syncing = config; 5282 5283 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5284 } 5285 5286 /* 5287 * Set zpool properties. 5288 */ 5289 static void 5290 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5291 { 5292 spa_t *spa = arg1; 5293 objset_t *mos = spa->spa_meta_objset; 5294 nvlist_t *nvp = arg2; 5295 nvpair_t *elem; 5296 uint64_t intval; 5297 char *strval; 5298 zpool_prop_t prop; 5299 const char *propname; 5300 zprop_type_t proptype; 5301 5302 mutex_enter(&spa->spa_props_lock); 5303 5304 elem = NULL; 5305 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5306 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5307 case ZPOOL_PROP_VERSION: 5308 /* 5309 * Only set version for non-zpool-creation cases 5310 * (set/import). spa_create() needs special care 5311 * for version setting. 
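 *
 * For illustration: 'zpool set version=N tank' on an existing pool reaches
 * this case with tx->tx_txg != TXG_INITIAL, so the uberblock version is
 * raised and the vdev config dirtied below; at pool creation time the
 * version is handled by spa_create() and this branch is skipped.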
5312 */ 5313 if (tx->tx_txg != TXG_INITIAL) { 5314 VERIFY(nvpair_value_uint64(elem, 5315 &intval) == 0); 5316 ASSERT(intval <= SPA_VERSION); 5317 ASSERT(intval >= spa_version(spa)); 5318 spa->spa_uberblock.ub_version = intval; 5319 vdev_config_dirty(spa->spa_root_vdev); 5320 } 5321 break; 5322 5323 case ZPOOL_PROP_ALTROOT: 5324 /* 5325 * 'altroot' is a non-persistent property. It should 5326 * have been set temporarily at creation or import time. 5327 */ 5328 ASSERT(spa->spa_root != NULL); 5329 break; 5330 5331 case ZPOOL_PROP_READONLY: 5332 case ZPOOL_PROP_CACHEFILE: 5333 /* 5334 * 'readonly' and 'cachefile' are also non-persistent 5335 * properties. 5336 */ 5337 break; 5338 default: 5339 /* 5340 * Set pool property values in the poolprops mos object. 5341 */ 5342 if (spa->spa_pool_props_object == 0) { 5343 VERIFY((spa->spa_pool_props_object = 5344 zap_create(mos, DMU_OT_POOL_PROPS, 5345 DMU_OT_NONE, 0, tx)) > 0); 5346 5347 VERIFY(zap_update(mos, 5348 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 5349 8, 1, &spa->spa_pool_props_object, tx) 5350 == 0); 5351 } 5352 5353 /* normalize the property name */ 5354 propname = zpool_prop_to_name(prop); 5355 proptype = zpool_prop_get_type(prop); 5356 5357 if (nvpair_type(elem) == DATA_TYPE_STRING) { 5358 ASSERT(proptype == PROP_TYPE_STRING); 5359 VERIFY(nvpair_value_string(elem, &strval) == 0); 5360 VERIFY(zap_update(mos, 5361 spa->spa_pool_props_object, propname, 5362 1, strlen(strval) + 1, strval, tx) == 0); 5363 5364 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 5365 VERIFY(nvpair_value_uint64(elem, &intval) == 0); 5366 5367 if (proptype == PROP_TYPE_INDEX) { 5368 const char *unused; 5369 VERIFY(zpool_prop_index_to_string( 5370 prop, intval, &unused) == 0); 5371 } 5372 VERIFY(zap_update(mos, 5373 spa->spa_pool_props_object, propname, 5374 8, 1, &intval, tx) == 0); 5375 } else { 5376 ASSERT(0); /* not allowed */ 5377 } 5378 5379 switch (prop) { 5380 case ZPOOL_PROP_DELEGATION: 5381 spa->spa_delegation = intval; 5382 break; 5383 case ZPOOL_PROP_BOOTFS: 5384 spa->spa_bootfs = intval; 5385 break; 5386 case ZPOOL_PROP_FAILUREMODE: 5387 spa->spa_failmode = intval; 5388 break; 5389 case ZPOOL_PROP_AUTOEXPAND: 5390 spa->spa_autoexpand = intval; 5391 if (tx->tx_txg != TXG_INITIAL) 5392 spa_async_request(spa, 5393 SPA_ASYNC_AUTOEXPAND); 5394 break; 5395 case ZPOOL_PROP_DEDUPDITTO: 5396 spa->spa_dedup_ditto = intval; 5397 break; 5398 default: 5399 break; 5400 } 5401 } 5402 5403 /* log internal history if this is not a zpool create */ 5404 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY && 5405 tx->tx_txg != TXG_INITIAL) { 5406 spa_history_log_internal(LOG_POOL_PROPSET, 5407 spa, tx, "%s %lld %s", 5408 nvpair_name(elem), intval, spa_name(spa)); 5409 } 5410 } 5411 5412 mutex_exit(&spa->spa_props_lock); 5413 } 5414 5415 /* 5416 * Perform one-time upgrade on-disk changes. spa_version() does not 5417 * reflect the new version this txg, so there must be no changes this 5418 * txg to anything that the upgrade code depends on after it executes. 5419 * Therefore this must be called after dsl_pool_sync() does the sync 5420 * tasks.
5421 */ 5422 static void 5423 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 5424 { 5425 dsl_pool_t *dp = spa->spa_dsl_pool; 5426 5427 ASSERT(spa->spa_sync_pass == 1); 5428 5429 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5430 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5431 dsl_pool_create_origin(dp, tx); 5432 5433 /* Keeping the origin open increases spa_minref */ 5434 spa->spa_minref += 3; 5435 } 5436 5437 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5438 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5439 dsl_pool_upgrade_clones(dp, tx); 5440 } 5441 5442 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 5443 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 5444 dsl_pool_upgrade_dir_clones(dp, tx); 5445 5446 /* Keeping the freedir open increases spa_minref */ 5447 spa->spa_minref += 3; 5448 } 5449 } 5450 5451 /* 5452 * Sync the specified transaction group. New blocks may be dirtied as 5453 * part of the process, so we iterate until it converges. 5454 */ 5455 void 5456 spa_sync(spa_t *spa, uint64_t txg) 5457 { 5458 dsl_pool_t *dp = spa->spa_dsl_pool; 5459 objset_t *mos = spa->spa_meta_objset; 5460 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 5461 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 5462 vdev_t *rvd = spa->spa_root_vdev; 5463 vdev_t *vd; 5464 dmu_tx_t *tx; 5465 int error; 5466 5467 VERIFY(spa_writeable(spa)); 5468 5469 /* 5470 * Lock out configuration changes. 5471 */ 5472 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5473 5474 spa->spa_syncing_txg = txg; 5475 spa->spa_sync_pass = 0; 5476 5477 /* 5478 * If there are any pending vdev state changes, convert them 5479 * into config changes that go out with this transaction group. 5480 */ 5481 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5482 while (list_head(&spa->spa_state_dirty_list) != NULL) { 5483 /* 5484 * We need the write lock here because, for aux vdevs, 5485 * calling vdev_config_dirty() modifies sav_config. 5486 * This is ugly and will become unnecessary when we 5487 * eliminate the aux vdev wart by integrating all vdevs 5488 * into the root vdev tree. 5489 */ 5490 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5491 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5492 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5493 vdev_state_clean(vd); 5494 vdev_config_dirty(vd); 5495 } 5496 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5497 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5498 } 5499 spa_config_exit(spa, SCL_STATE, FTAG); 5500 5501 tx = dmu_tx_create_assigned(dp, txg); 5502 5503 /* 5504 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5505 * set spa_deflate if we have no raid-z vdevs. 5506 */ 5507 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5508 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5509 int i; 5510 5511 for (i = 0; i < rvd->vdev_children; i++) { 5512 vd = rvd->vdev_child[i]; 5513 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 5514 break; 5515 } 5516 if (i == rvd->vdev_children) { 5517 spa->spa_deflate = TRUE; 5518 VERIFY(0 == zap_add(spa->spa_meta_objset, 5519 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5520 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 5521 } 5522 } 5523 5524 /* 5525 * If anything has changed in this txg, or if someone is waiting 5526 * for this txg to sync (eg, spa_vdev_remove()), push the 5527 * deferred frees from the previous txg. 
If not, leave them 5528 * alone so that we don't generate work on an otherwise idle 5529 * system. 5530 */ 5531 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 5532 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 5533 !txg_list_empty(&dp->dp_sync_tasks, txg) || 5534 ((dsl_scan_active(dp->dp_scan) || 5535 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 5536 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5537 VERIFY3U(bpobj_iterate(defer_bpo, 5538 spa_free_sync_cb, zio, tx), ==, 0); 5539 VERIFY3U(zio_wait(zio), ==, 0); 5540 } 5541 5542 /* 5543 * Iterate to convergence. 5544 */ 5545 do { 5546 int pass = ++spa->spa_sync_pass; 5547 5548 spa_sync_config_object(spa, tx); 5549 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 5550 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 5551 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 5552 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 5553 spa_errlog_sync(spa, txg); 5554 dsl_pool_sync(dp, txg); 5555 5556 if (pass <= SYNC_PASS_DEFERRED_FREE) { 5557 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5558 bplist_iterate(free_bpl, spa_free_sync_cb, 5559 zio, tx); 5560 VERIFY(zio_wait(zio) == 0); 5561 } else { 5562 bplist_iterate(free_bpl, bpobj_enqueue_cb, 5563 defer_bpo, tx); 5564 } 5565 5566 ddt_sync(spa, txg); 5567 dsl_scan_sync(dp, tx); 5568 5569 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 5570 vdev_sync(vd, txg); 5571 5572 if (pass == 1) 5573 spa_sync_upgrades(spa, tx); 5574 5575 } while (dmu_objset_is_dirty(mos, txg)); 5576 5577 /* 5578 * Rewrite the vdev configuration (which includes the uberblock) 5579 * to commit the transaction group. 5580 * 5581 * If there are no dirty vdevs, we sync the uberblock to a few 5582 * random top-level vdevs that are known to be visible in the 5583 * config cache (see spa_vdev_add() for a complete description). 5584 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5585 */ 5586 for (;;) { 5587 /* 5588 * We hold SCL_STATE to prevent vdev open/close/etc. 5589 * while we're attempting to write the vdev labels. 5590 */ 5591 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5592 5593 if (list_is_empty(&spa->spa_config_dirty_list)) { 5594 vdev_t *svd[SPA_DVAS_PER_BP]; 5595 int svdcount = 0; 5596 int children = rvd->vdev_children; 5597 int c0 = spa_get_random(children); 5598 5599 for (int c = 0; c < children; c++) { 5600 vd = rvd->vdev_child[(c0 + c) % children]; 5601 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 5602 continue; 5603 svd[svdcount++] = vd; 5604 if (svdcount == SPA_DVAS_PER_BP) 5605 break; 5606 } 5607 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 5608 if (error != 0) 5609 error = vdev_config_sync(svd, svdcount, txg, 5610 B_TRUE); 5611 } else { 5612 error = vdev_config_sync(rvd->vdev_child, 5613 rvd->vdev_children, txg, B_FALSE); 5614 if (error != 0) 5615 error = vdev_config_sync(rvd->vdev_child, 5616 rvd->vdev_children, txg, B_TRUE); 5617 } 5618 5619 spa_config_exit(spa, SCL_STATE, FTAG); 5620 5621 if (error == 0) 5622 break; 5623 zio_suspend(spa, NULL); 5624 zio_resume_wait(spa); 5625 } 5626 dmu_tx_commit(tx); 5627 5628 /* 5629 * Clear the dirty config list. 5630 */ 5631 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 5632 vdev_config_clean(vd); 5633 5634 /* 5635 * Now that the new config has synced transactionally, 5636 * let it become visible to the config cache. 
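 *
 * (Illustrative note on ordering: spa_config_syncing was generated by
 * spa_sync_config_object() earlier in this sync; only installing it via
 * spa_config_set() after vdev_config_sync() has committed the uberblock
 * means a crash at any earlier point leaves the previous config as the
 * authoritative copy.)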
5637 */ 5638 if (spa->spa_config_syncing != NULL) { 5639 spa_config_set(spa, spa->spa_config_syncing); 5640 spa->spa_config_txg = txg; 5641 spa->spa_config_syncing = NULL; 5642 } 5643 5644 spa->spa_ubsync = spa->spa_uberblock; 5645 5646 dsl_pool_sync_done(dp, txg); 5647 5648 /* 5649 * Update usable space statistics. 5650 */ 5651 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5652 vdev_sync_done(vd, txg); 5653 5654 spa_update_dspace(spa); 5655 5656 /* 5657 * It had better be the case that we didn't dirty anything 5658 * since vdev_config_sync(). 5659 */ 5660 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5661 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5662 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 5663 5664 spa->spa_sync_pass = 0; 5665 5666 spa_config_exit(spa, SCL_CONFIG, FTAG); 5667 5668 spa_handle_ignored_writes(spa); 5669 5670 /* 5671 * If any async tasks have been requested, kick them off. 5672 */ 5673 spa_async_dispatch(spa); 5674 } 5675 5676 /* 5677 * Sync all pools. We don't want to hold the namespace lock across these 5678 * operations, so we take a reference on the spa_t and drop the lock during the 5679 * sync. 5680 */ 5681 void 5682 spa_sync_allpools(void) 5683 { 5684 spa_t *spa = NULL; 5685 mutex_enter(&spa_namespace_lock); 5686 while ((spa = spa_next(spa)) != NULL) { 5687 if (spa_state(spa) != POOL_STATE_ACTIVE || 5688 !spa_writeable(spa) || spa_suspended(spa)) 5689 continue; 5690 spa_open_ref(spa, FTAG); 5691 mutex_exit(&spa_namespace_lock); 5692 txg_wait_synced(spa_get_dsl(spa), 0); 5693 mutex_enter(&spa_namespace_lock); 5694 spa_close(spa, FTAG); 5695 } 5696 mutex_exit(&spa_namespace_lock); 5697 } 5698 5699 /* 5700 * ========================================================================== 5701 * Miscellaneous routines 5702 * ========================================================================== 5703 */ 5704 5705 /* 5706 * Remove all pools in the system. 5707 */ 5708 void 5709 spa_evict_all(void) 5710 { 5711 spa_t *spa; 5712 5713 /* 5714 * Remove all cached state. All pools should be closed now, 5715 * so every spa in the AVL tree should be unreferenced. 5716 */ 5717 mutex_enter(&spa_namespace_lock); 5718 while ((spa = spa_next(NULL)) != NULL) { 5719 /* 5720 * Stop async tasks. The async thread may need to detach 5721 * a device that's been replaced, which requires grabbing 5722 * spa_namespace_lock, so we must drop it here. 
5723 */ 5724 spa_open_ref(spa, FTAG); 5725 mutex_exit(&spa_namespace_lock); 5726 spa_async_suspend(spa); 5727 mutex_enter(&spa_namespace_lock); 5728 spa_close(spa, FTAG); 5729 5730 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5731 spa_unload(spa); 5732 spa_deactivate(spa); 5733 } 5734 spa_remove(spa); 5735 } 5736 mutex_exit(&spa_namespace_lock); 5737 } 5738 5739 vdev_t * 5740 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 5741 { 5742 vdev_t *vd; 5743 int i; 5744 5745 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 5746 return (vd); 5747 5748 if (aux) { 5749 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 5750 vd = spa->spa_l2cache.sav_vdevs[i]; 5751 if (vd->vdev_guid == guid) 5752 return (vd); 5753 } 5754 5755 for (i = 0; i < spa->spa_spares.sav_count; i++) { 5756 vd = spa->spa_spares.sav_vdevs[i]; 5757 if (vd->vdev_guid == guid) 5758 return (vd); 5759 } 5760 } 5761 5762 return (NULL); 5763 } 5764 5765 void 5766 spa_upgrade(spa_t *spa, uint64_t version) 5767 { 5768 ASSERT(spa_writeable(spa)); 5769 5770 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5771 5772 /* 5773 * This should only be called for a non-faulted pool, and since a 5774 * future version would result in an unopenable pool, this shouldn't be 5775 * possible. 5776 */ 5777 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 5778 ASSERT(version >= spa->spa_uberblock.ub_version); 5779 5780 spa->spa_uberblock.ub_version = version; 5781 vdev_config_dirty(spa->spa_root_vdev); 5782 5783 spa_config_exit(spa, SCL_ALL, FTAG); 5784 5785 txg_wait_synced(spa_get_dsl(spa), 0); 5786 } 5787 5788 boolean_t 5789 spa_has_spare(spa_t *spa, uint64_t guid) 5790 { 5791 int i; 5792 uint64_t spareguid; 5793 spa_aux_vdev_t *sav = &spa->spa_spares; 5794 5795 for (i = 0; i < sav->sav_count; i++) 5796 if (sav->sav_vdevs[i]->vdev_guid == guid) 5797 return (B_TRUE); 5798 5799 for (i = 0; i < sav->sav_npending; i++) { 5800 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 5801 &spareguid) == 0 && spareguid == guid) 5802 return (B_TRUE); 5803 } 5804 5805 return (B_FALSE); 5806 } 5807 5808 /* 5809 * Check if a pool has an active shared spare device. 5810 * Note: reference count of an active spare is 2, as a spare and as a replace 5811 */ 5812 static boolean_t 5813 spa_has_active_shared_spare(spa_t *spa) 5814 { 5815 int i, refcnt; 5816 uint64_t pool; 5817 spa_aux_vdev_t *sav = &spa->spa_spares; 5818 5819 for (i = 0; i < sav->sav_count; i++) { 5820 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 5821 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 5822 refcnt > 2) 5823 return (B_TRUE); 5824 } 5825 5826 return (B_FALSE); 5827 } 5828 5829 /* 5830 * Post a sysevent corresponding to the given event. The 'name' must be one of 5831 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 5832 * filled in from the spa and (optionally) the vdev. This doesn't do anything 5833 * in the userland libzpool, as we don't want consumers to misinterpret ztest 5834 * or zdb as real changes. 
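 *
 * Examples of events posted from this file: ESC_ZFS_VDEV_SPARE when a hot
 * spare is activated in spa_vdev_attach(), ESC_ZFS_VDEV_REMOVE when a leaf
 * is detached in spa_vdev_detach(), and ESC_ZFS_BOOTFS_VDEV_ATTACH when a
 * device is attached to the boot pool.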
5835 */ 5836 void 5837 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 5838 { 5839 #ifdef _KERNEL 5840 sysevent_t *ev; 5841 sysevent_attr_list_t *attr = NULL; 5842 sysevent_value_t value; 5843 sysevent_id_t eid; 5844 5845 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 5846 SE_SLEEP); 5847 5848 value.value_type = SE_DATA_TYPE_STRING; 5849 value.value.sv_string = spa_name(spa); 5850 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 5851 goto done; 5852 5853 value.value_type = SE_DATA_TYPE_UINT64; 5854 value.value.sv_uint64 = spa_guid(spa); 5855 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 5856 goto done; 5857 5858 if (vd) { 5859 value.value_type = SE_DATA_TYPE_UINT64; 5860 value.value.sv_uint64 = vd->vdev_guid; 5861 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 5862 SE_SLEEP) != 0) 5863 goto done; 5864 5865 if (vd->vdev_path) { 5866 value.value_type = SE_DATA_TYPE_STRING; 5867 value.value.sv_string = vd->vdev_path; 5868 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 5869 &value, SE_SLEEP) != 0) 5870 goto done; 5871 } 5872 } 5873 5874 if (sysevent_attach_attributes(ev, attr) != 0) 5875 goto done; 5876 attr = NULL; 5877 5878 (void) log_sysevent(ev, SE_SLEEP, &eid); 5879 5880 done: 5881 if (attr) 5882 sysevent_free_attr(attr); 5883 sysevent_free(ev); 5884 #endif 5885 } 5886
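/*
 * The block below is an illustrative sketch added for exposition and is never
 * compiled (#if 0).  It shows, under stated assumptions, roughly how a caller
 * such as the ioctl layer might drive spa_vdev_attach() above to replace one
 * leaf disk with another: build a minimal root nvlist with a single leaf
 * child and pass replacing == B_TRUE.  The helper name and the bare-bones
 * nvlist are hypothetical; the real ioctl path receives a richer config
 * assembled in userland.
 */
#if 0
static int
example_replace_leaf(const char *pool, uint64_t old_guid, const char *newpath)
{
	spa_t *spa;
	nvlist_t *nvroot, *child;
	int error;

	if ((error = spa_open(pool, &spa, FTAG)) != 0)
		return (error);

	/* one leaf disk vdev described by its device path */
	VERIFY(nvlist_alloc(&child, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(child, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_DISK) == 0);
	VERIFY(nvlist_add_string(child, ZPOOL_CONFIG_PATH, newpath) == 0);

	/* wrap it in a root vdev, as spa_config_parse() expects */
	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
	    VDEV_TYPE_ROOT) == 0);
	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &child, 1) == 0);

	/* replacing == B_TRUE inserts a 'replacing' vdev above the old leaf */
	error = spa_vdev_attach(spa, old_guid, nvroot, B_TRUE);

	nvlist_free(child);
	nvlist_free(nvroot);
	spa_close(spa, FTAG);
	return (error);
}
#endif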