1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/spa_impl.h> 29 #include <sys/spa_boot.h> 30 #include <sys/zio.h> 31 #include <sys/zio_checksum.h> 32 #include <sys/zio_compress.h> 33 #include <sys/dmu.h> 34 #include <sys/dmu_tx.h> 35 #include <sys/zap.h> 36 #include <sys/zil.h> 37 #include <sys/vdev_impl.h> 38 #include <sys/metaslab.h> 39 #include <sys/uberblock_impl.h> 40 #include <sys/txg.h> 41 #include <sys/avl.h> 42 #include <sys/unique.h> 43 #include <sys/dsl_pool.h> 44 #include <sys/dsl_dir.h> 45 #include <sys/dsl_prop.h> 46 #include <sys/dsl_scan.h> 47 #include <sys/fs/zfs.h> 48 #include <sys/metaslab_impl.h> 49 #include <sys/arc.h> 50 #include <sys/ddt.h> 51 #include "zfs_prop.h" 52 #include "zfeature_common.h" 53 54 /* 55 * SPA locking 56 * 57 * There are four basic locks for managing spa_t structures: 58 * 59 * spa_namespace_lock (global mutex) 60 * 61 * This lock must be acquired to do any of the following: 62 * 63 * - Lookup a spa_t by name 64 * - Add or remove a spa_t from the namespace 65 * - Increase spa_refcount from non-zero 66 * - Check if spa_refcount is zero 67 * - Rename a spa_t 68 * - add/remove/attach/detach devices 69 * - Held for the duration of create/destroy/import/export 70 * 71 * It does not need to handle recursion. A create or destroy may 72 * reference objects (files or zvols) in other pools, but by 73 * definition they must have an existing reference, and will never need 74 * to lookup a spa_t by name. 75 * 76 * spa_refcount (per-spa refcount_t protected by mutex) 77 * 78 * This reference count keeps track of any active users of the spa_t. The 79 * spa_t cannot be destroyed or freed while this is non-zero. Internally, 80 * the refcount is never really 'zero' - opening a pool implicitly keeps 81 * some references in the DMU. Internally we check against spa_minref, but 82 * present the image of a zero/non-zero value to consumers. 83 * 84 * spa_config_lock[] (per-spa array of rwlocks) 85 * 86 * This protects the spa_t from config changes, and must be held in 87 * the following circumstances: 88 * 89 * - RW_READER to perform I/O to the spa 90 * - RW_WRITER to change the vdev config 91 * 92 * The locking order is fairly straightforward: 93 * 94 * spa_namespace_lock -> spa_refcount 95 * 96 * The namespace lock must be acquired to increase the refcount from 0 97 * or to check if it is zero.
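 *
 * For example (an illustrative sketch, not a verbatim caller -- the
 * real openers, e.g. spa_open(), live in spa.c), raising the refcount
 * from its "zero" value therefore looks like:
 *
 *	mutex_enter(&spa_namespace_lock);
 *	if ((spa = spa_lookup(name)) != NULL)
 *		spa_open_ref(spa, FTAG);
 *	mutex_exit(&spa_namespace_lock);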
98 * 99 * spa_refcount -> spa_config_lock[] 100 * 101 * There must be at least one valid reference on the spa_t to acquire 102 * the config lock. 103 * 104 * spa_namespace_lock -> spa_config_lock[] 105 * 106 * The namespace lock must always be taken before the config lock. 107 * 108 * 109 * The spa_namespace_lock can be acquired directly and is globally visible. 110 * 111 * The namespace is manipulated using the following functions, all of which 112 * require the spa_namespace_lock to be held. 113 * 114 * spa_lookup() Lookup a spa_t by name. 115 * 116 * spa_add() Create a new spa_t in the namespace. 117 * 118 * spa_remove() Remove a spa_t from the namespace. This also 119 * frees up any memory associated with the spa_t. 120 * 121 * spa_next() Returns the next spa_t in the system, or the 122 * first if NULL is passed. 123 * 124 * spa_evict_all() Shutdown and remove all spa_t structures in 125 * the system. 126 * 127 * spa_guid_exists() Determine whether a pool/device guid exists. 128 * 129 * The spa_refcount is manipulated using the following functions: 130 * 131 * spa_open_ref() Adds a reference to the given spa_t. Must be 132 * called with spa_namespace_lock held if the 133 * refcount is currently zero. 134 * 135 * spa_close() Remove a reference from the spa_t. This will 136 * not free the spa_t or remove it from the 137 * namespace. No locking is required. 138 * 139 * spa_refcount_zero() Returns true if the refcount is currently 140 * zero. Must be called with spa_namespace_lock 141 * held. 142 * 143 * The spa_config_lock[] is an array of rwlocks, ordered as follows: 144 * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. 145 * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). 146 * 147 * To read the configuration, it suffices to hold one of these locks as reader. 148 * To modify the configuration, you must hold all locks as writer. To modify 149 * vdev state without altering the vdev tree's topology (e.g. online/offline), 150 * you must hold SCL_STATE and SCL_ZIO as writer. 151 * 152 * We use these distinct config locks to avoid recursive lock entry. 153 * For example, spa_sync() (which holds SCL_CONFIG as reader) induces 154 * block allocations (SCL_ALLOC), which may require reading space maps 155 * from disk (dmu_read() -> zio_read() -> SCL_ZIO). 156 * 157 * The spa config locks cannot be normal rwlocks because we need the 158 * ability to hand off ownership. For example, SCL_ZIO is acquired 159 * by the issuing thread and later released by an interrupt thread. 160 * They do, however, obey the usual write-wanted semantics to prevent 161 * writer (i.e. system administrator) starvation. 162 * 163 * The lock acquisition rules are as follows: 164 * 165 * SCL_CONFIG 166 * Protects changes to the vdev tree topology, such as vdev 167 * add/remove/attach/detach. Protects the dirty config list 168 * (spa_config_dirty_list) and the set of spares and l2arc devices. 169 * 170 * SCL_STATE 171 * Protects changes to pool state and vdev state, such as vdev 172 * online/offline/fault/degrade/clear. Protects the dirty state list 173 * (spa_state_dirty_list) and global pool state (spa_state). 174 * 175 * SCL_ALLOC 176 * Protects changes to metaslab groups and classes. 177 * Held as reader by metaslab_alloc() and metaslab_claim(). 178 * 179 * SCL_ZIO 180 * Held by bp-level zios (those which have no io_vd upon entry) 181 * to prevent changes to the vdev tree. The bp-level zio implicitly 182 * protects all of its vdev child zios, which do not hold SCL_ZIO. 
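 *
 * As an illustration of the reader/writer rules above (a sketch only,
 * not a verbatim caller): code that merely needs a stable view of the
 * configuration takes one lock as reader and drops it when done,
 *
 *	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 *	... walk spa->spa_root_vdev ...
 *	spa_config_exit(spa, SCL_VDEV, FTAG);
 *
 * whereas a topology change must hold all of them, which is what
 * spa_vdev_enter() does by taking SCL_ALL as writer.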
183 * 184 * SCL_FREE 185 * Protects changes to metaslab groups and classes. 186 * Held as reader by metaslab_free(). SCL_FREE is distinct from 187 * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free 188 * blocks in zio_done() while another i/o that holds either 189 * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. 190 * 191 * SCL_VDEV 192 * Held as reader to prevent changes to the vdev tree during trivial 193 * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the 194 * other locks, and lower than all of them, to ensure that it's safe 195 * to acquire regardless of caller context. 196 * 197 * In addition, the following rules apply: 198 * 199 * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. 200 * The lock ordering is SCL_CONFIG > spa_props_lock. 201 * 202 * (b) I/O operations on leaf vdevs. For any zio operation that takes 203 * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), 204 * or zio_write_phys() -- the caller must ensure that the config cannot 205 * change in the interim, and that the vdev cannot be reopened. 206 * SCL_STATE as reader suffices for both. 207 * 208 * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). 209 * 210 * spa_vdev_enter() Acquire the namespace lock and the config lock 211 * for writing. 212 * 213 * spa_vdev_exit() Release the config lock, wait for all I/O 214 * to complete, sync the updated configs to the 215 * cache, and release the namespace lock. 216 * 217 * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). 218 * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual 219 * locking is, always, based on spa_namespace_lock and spa_config_lock[]. 220 * 221 * spa_rename() is also implemented within this file since it requires 222 * manipulation of the namespace. 223 */ 224 225 static avl_tree_t spa_namespace_avl; 226 kmutex_t spa_namespace_lock; 227 static kcondvar_t spa_namespace_cv; 228 static int spa_active_count; 229 int spa_max_replication_override = SPA_DVAS_PER_BP; 230 231 static kmutex_t spa_spare_lock; 232 static avl_tree_t spa_spare_avl; 233 static kmutex_t spa_l2cache_lock; 234 static avl_tree_t spa_l2cache_avl; 235 236 kmem_cache_t *spa_buffer_pool; 237 int spa_mode_global; 238 239 #ifdef ZFS_DEBUG 240 /* Everything except dprintf is on by default in debug builds */ 241 int zfs_flags = ~ZFS_DEBUG_DPRINTF; 242 #else 243 int zfs_flags = 0; 244 #endif 245 246 /* 247 * zfs_recover can be set to nonzero to attempt to recover from 248 * otherwise-fatal errors, typically caused by on-disk corruption. When 249 * set, calls to zfs_panic_recover() will turn into warning messages. 250 */ 251 int zfs_recover = 0; 252 253 extern int zfs_txg_synctime_ms; 254 255 /* 256 * Expiration time in units of zfs_txg_synctime_ms. This value has two 257 * meanings. First it is used to determine when the spa_deadman logic 258 * should fire. By default the spa_deadman will fire if spa_sync has 259 * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds). 260 * Secondly, the value determines if an I/O is considered "hung". 261 * Any I/O that has not completed in zfs_deadman_synctime is considered 262 * "hung" resulting in a system panic. 263 */ 264 uint64_t zfs_deadman_synctime = 1000ULL; 265 266 /* 267 * Override the zfs deadman behavior via /etc/system. By default the 268 * deadman is enabled except on VMware and sparc deployments.
269 */ 270 int zfs_deadman_enabled = -1; 271 272 273 /* 274 * ========================================================================== 275 * SPA config locking 276 * ========================================================================== 277 */ 278 static void 279 spa_config_lock_init(spa_t *spa) 280 { 281 for (int i = 0; i < SCL_LOCKS; i++) { 282 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 283 mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); 284 cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); 285 refcount_create(&scl->scl_count); 286 scl->scl_writer = NULL; 287 scl->scl_write_wanted = 0; 288 } 289 } 290 291 static void 292 spa_config_lock_destroy(spa_t *spa) 293 { 294 for (int i = 0; i < SCL_LOCKS; i++) { 295 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 296 mutex_destroy(&scl->scl_lock); 297 cv_destroy(&scl->scl_cv); 298 refcount_destroy(&scl->scl_count); 299 ASSERT(scl->scl_writer == NULL); 300 ASSERT(scl->scl_write_wanted == 0); 301 } 302 } 303 304 int 305 spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) 306 { 307 for (int i = 0; i < SCL_LOCKS; i++) { 308 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 309 if (!(locks & (1 << i))) 310 continue; 311 mutex_enter(&scl->scl_lock); 312 if (rw == RW_READER) { 313 if (scl->scl_writer || scl->scl_write_wanted) { 314 mutex_exit(&scl->scl_lock); 315 spa_config_exit(spa, locks ^ (1 << i), tag); 316 return (0); 317 } 318 } else { 319 ASSERT(scl->scl_writer != curthread); 320 if (!refcount_is_zero(&scl->scl_count)) { 321 mutex_exit(&scl->scl_lock); 322 spa_config_exit(spa, locks ^ (1 << i), tag); 323 return (0); 324 } 325 scl->scl_writer = curthread; 326 } 327 (void) refcount_add(&scl->scl_count, tag); 328 mutex_exit(&scl->scl_lock); 329 } 330 return (1); 331 } 332 333 void 334 spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) 335 { 336 int wlocks_held = 0; 337 338 for (int i = 0; i < SCL_LOCKS; i++) { 339 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 340 if (scl->scl_writer == curthread) 341 wlocks_held |= (1 << i); 342 if (!(locks & (1 << i))) 343 continue; 344 mutex_enter(&scl->scl_lock); 345 if (rw == RW_READER) { 346 while (scl->scl_writer || scl->scl_write_wanted) { 347 cv_wait(&scl->scl_cv, &scl->scl_lock); 348 } 349 } else { 350 ASSERT(scl->scl_writer != curthread); 351 while (!refcount_is_zero(&scl->scl_count)) { 352 scl->scl_write_wanted++; 353 cv_wait(&scl->scl_cv, &scl->scl_lock); 354 scl->scl_write_wanted--; 355 } 356 scl->scl_writer = curthread; 357 } 358 (void) refcount_add(&scl->scl_count, tag); 359 mutex_exit(&scl->scl_lock); 360 } 361 ASSERT(wlocks_held <= locks); 362 } 363 364 void 365 spa_config_exit(spa_t *spa, int locks, void *tag) 366 { 367 for (int i = SCL_LOCKS - 1; i >= 0; i--) { 368 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 369 if (!(locks & (1 << i))) 370 continue; 371 mutex_enter(&scl->scl_lock); 372 ASSERT(!refcount_is_zero(&scl->scl_count)); 373 if (refcount_remove(&scl->scl_count, tag) == 0) { 374 ASSERT(scl->scl_writer == NULL || 375 scl->scl_writer == curthread); 376 scl->scl_writer = NULL; /* OK in either case */ 377 cv_broadcast(&scl->scl_cv); 378 } 379 mutex_exit(&scl->scl_lock); 380 } 381 } 382 383 int 384 spa_config_held(spa_t *spa, int locks, krw_t rw) 385 { 386 int locks_held = 0; 387 388 for (int i = 0; i < SCL_LOCKS; i++) { 389 spa_config_lock_t *scl = &spa->spa_config_lock[i]; 390 if (!(locks & (1 << i))) 391 continue; 392 if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || 393 (rw == RW_WRITER && scl->scl_writer == curthread)) 
394 locks_held |= 1 << i; 395 } 396 397 return (locks_held); 398 } 399 400 /* 401 * ========================================================================== 402 * SPA namespace functions 403 * ========================================================================== 404 */ 405 406 /* 407 * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. 408 * Returns NULL if no matching spa_t is found. 409 */ 410 spa_t * 411 spa_lookup(const char *name) 412 { 413 static spa_t search; /* spa_t is large; don't allocate on stack */ 414 spa_t *spa; 415 avl_index_t where; 416 char c; 417 char *cp; 418 419 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 420 421 /* 422 * If it's a full dataset name, figure out the pool name and 423 * just use that. 424 */ 425 cp = strpbrk(name, "/@"); 426 if (cp) { 427 c = *cp; 428 *cp = '\0'; 429 } 430 431 (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); 432 spa = avl_find(&spa_namespace_avl, &search, &where); 433 434 if (cp) 435 *cp = c; 436 437 return (spa); 438 } 439 440 /* 441 * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. 442 * If the zfs_deadman_enabled flag is set then it inspects all vdev queues 443 * looking for potentially hung I/Os. 444 */ 445 void 446 spa_deadman(void *arg) 447 { 448 spa_t *spa = arg; 449 450 zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", 451 (gethrtime() - spa->spa_sync_starttime) / NANOSEC, 452 ++spa->spa_deadman_calls); 453 if (zfs_deadman_enabled) 454 vdev_deadman(spa->spa_root_vdev); 455 } 456 457 /* 458 * Create an uninitialized spa_t with the given name. Requires 459 * spa_namespace_lock. The caller must ensure that the spa_t doesn't already 460 * exist by calling spa_lookup() first. 461 */ 462 spa_t * 463 spa_add(const char *name, nvlist_t *config, const char *altroot) 464 { 465 spa_t *spa; 466 spa_config_dirent_t *dp; 467 cyc_handler_t hdlr; 468 cyc_time_t when; 469 470 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 471 472 spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); 473 474 mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); 475 mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); 476 mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); 477 mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); 478 mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); 479 mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); 480 mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); 481 mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); 482 mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); 483 484 cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); 485 cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); 486 cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); 487 cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); 488 489 for (int t = 0; t < TXG_SIZE; t++) 490 bplist_create(&spa->spa_free_bplist[t]); 491 492 (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); 493 spa->spa_state = POOL_STATE_UNINITIALIZED; 494 spa->spa_freeze_txg = UINT64_MAX; 495 spa->spa_final_txg = UINT64_MAX; 496 spa->spa_load_max_txg = UINT64_MAX; 497 spa->spa_proc = &p0; 498 spa->spa_proc_state = SPA_PROC_NONE; 499 500 hdlr.cyh_func = spa_deadman; 501 hdlr.cyh_arg = spa; 502 hdlr.cyh_level = CY_LOW_LEVEL; 503 504 spa->spa_deadman_synctime = zfs_deadman_synctime * 505 zfs_txg_synctime_ms * MICROSEC; 506 507 /* 508 * This determines how often we need to check for hung I/Os after 509 * the cyclic has already fired. 
Since checking for hung I/Os is 510 * an expensive operation we don't want to check too frequently. 511 * Instead wait for 5 synctimes before checking again. 512 */ 513 when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC; 514 when.cyt_when = CY_INFINITY; 515 mutex_enter(&cpu_lock); 516 spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); 517 mutex_exit(&cpu_lock); 518 519 refcount_create(&spa->spa_refcount); 520 spa_config_lock_init(spa); 521 522 avl_add(&spa_namespace_avl, spa); 523 524 /* 525 * Set the alternate root, if there is one. 526 */ 527 if (altroot) { 528 spa->spa_root = spa_strdup(altroot); 529 spa_active_count++; 530 } 531 532 /* 533 * Every pool starts with the default cachefile 534 */ 535 list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), 536 offsetof(spa_config_dirent_t, scd_link)); 537 538 dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); 539 dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); 540 list_insert_head(&spa->spa_config_list, dp); 541 542 VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, 543 KM_SLEEP) == 0); 544 545 if (config != NULL) { 546 nvlist_t *features; 547 548 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, 549 &features) == 0) { 550 VERIFY(nvlist_dup(features, &spa->spa_label_features, 551 0) == 0); 552 } 553 554 VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); 555 } 556 557 if (spa->spa_label_features == NULL) { 558 VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, 559 KM_SLEEP) == 0); 560 } 561 562 return (spa); 563 } 564 565 /* 566 * Removes a spa_t from the namespace, freeing up any memory used. Requires 567 * spa_namespace_lock. This is called only after the spa_t has been closed and 568 * deactivated. 569 */ 570 void 571 spa_remove(spa_t *spa) 572 { 573 spa_config_dirent_t *dp; 574 575 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 576 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 577 578 nvlist_free(spa->spa_config_splitting); 579 580 avl_remove(&spa_namespace_avl, spa); 581 cv_broadcast(&spa_namespace_cv); 582 583 if (spa->spa_root) { 584 spa_strfree(spa->spa_root); 585 spa_active_count--; 586 } 587 588 while ((dp = list_head(&spa->spa_config_list)) != NULL) { 589 list_remove(&spa->spa_config_list, dp); 590 if (dp->scd_path != NULL) 591 spa_strfree(dp->scd_path); 592 kmem_free(dp, sizeof (spa_config_dirent_t)); 593 } 594 595 list_destroy(&spa->spa_config_list); 596 597 nvlist_free(spa->spa_label_features); 598 nvlist_free(spa->spa_load_info); 599 spa_config_set(spa, NULL); 600 601 mutex_enter(&cpu_lock); 602 if (spa->spa_deadman_cycid != CYCLIC_NONE) 603 cyclic_remove(spa->spa_deadman_cycid); 604 mutex_exit(&cpu_lock); 605 spa->spa_deadman_cycid = CYCLIC_NONE; 606 607 refcount_destroy(&spa->spa_refcount); 608 609 spa_config_lock_destroy(spa); 610 611 for (int t = 0; t < TXG_SIZE; t++) 612 bplist_destroy(&spa->spa_free_bplist[t]); 613 614 cv_destroy(&spa->spa_async_cv); 615 cv_destroy(&spa->spa_proc_cv); 616 cv_destroy(&spa->spa_scrub_io_cv); 617 cv_destroy(&spa->spa_suspend_cv); 618 619 mutex_destroy(&spa->spa_async_lock); 620 mutex_destroy(&spa->spa_errlist_lock); 621 mutex_destroy(&spa->spa_errlog_lock); 622 mutex_destroy(&spa->spa_history_lock); 623 mutex_destroy(&spa->spa_proc_lock); 624 mutex_destroy(&spa->spa_props_lock); 625 mutex_destroy(&spa->spa_scrub_lock); 626 mutex_destroy(&spa->spa_suspend_lock); 627 mutex_destroy(&spa->spa_vdev_top_lock); 628 629 kmem_free(spa, sizeof (spa_t)); 630 } 631 632 /* 633 * Given a pool, return the next pool in the namespace, or 
NULL if there is 634 * none. If 'prev' is NULL, return the first pool. 635 */ 636 spa_t * 637 spa_next(spa_t *prev) 638 { 639 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 640 641 if (prev) 642 return (AVL_NEXT(&spa_namespace_avl, prev)); 643 else 644 return (avl_first(&spa_namespace_avl)); 645 } 646 647 /* 648 * ========================================================================== 649 * SPA refcount functions 650 * ========================================================================== 651 */ 652 653 /* 654 * Add a reference to the given spa_t. Must have at least one reference, or 655 * have the namespace lock held. 656 */ 657 void 658 spa_open_ref(spa_t *spa, void *tag) 659 { 660 ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || 661 MUTEX_HELD(&spa_namespace_lock)); 662 (void) refcount_add(&spa->spa_refcount, tag); 663 } 664 665 /* 666 * Remove a reference to the given spa_t. Must have at least one reference, or 667 * have the namespace lock held. 668 */ 669 void 670 spa_close(spa_t *spa, void *tag) 671 { 672 ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || 673 MUTEX_HELD(&spa_namespace_lock)); 674 (void) refcount_remove(&spa->spa_refcount, tag); 675 } 676 677 /* 678 * Check to see if the spa refcount is zero. Must be called with 679 * spa_namespace_lock held. We really compare against spa_minref, which is the 680 * number of references acquired when opening a pool 681 */ 682 boolean_t 683 spa_refcount_zero(spa_t *spa) 684 { 685 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 686 687 return (refcount_count(&spa->spa_refcount) == spa->spa_minref); 688 } 689 690 /* 691 * ========================================================================== 692 * SPA spare and l2cache tracking 693 * ========================================================================== 694 */ 695 696 /* 697 * Hot spares and cache devices are tracked using the same code below, 698 * for 'auxiliary' devices. 
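 * As a sketch of the reference counting implemented below (illustrative
 * only; "vd2" is a hypothetical second vdev carrying the same guid,
 * i.e. the same physical device appearing in another pool):
 *
 *	spa_aux_add(vd, &spa_spare_avl);	-- aux_count == 1
 *	spa_aux_add(vd2, &spa_spare_avl);	-- same guid: aux_count == 2
 *	spa_aux_remove(vd2, &spa_spare_avl);	-- back to aux_count == 1
 *
 * and the tree node is only freed once the count returns to zero.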
699 */ 700 701 typedef struct spa_aux { 702 uint64_t aux_guid; 703 uint64_t aux_pool; 704 avl_node_t aux_avl; 705 int aux_count; 706 } spa_aux_t; 707 708 static int 709 spa_aux_compare(const void *a, const void *b) 710 { 711 const spa_aux_t *sa = a; 712 const spa_aux_t *sb = b; 713 714 if (sa->aux_guid < sb->aux_guid) 715 return (-1); 716 else if (sa->aux_guid > sb->aux_guid) 717 return (1); 718 else 719 return (0); 720 } 721 722 void 723 spa_aux_add(vdev_t *vd, avl_tree_t *avl) 724 { 725 avl_index_t where; 726 spa_aux_t search; 727 spa_aux_t *aux; 728 729 search.aux_guid = vd->vdev_guid; 730 if ((aux = avl_find(avl, &search, &where)) != NULL) { 731 aux->aux_count++; 732 } else { 733 aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); 734 aux->aux_guid = vd->vdev_guid; 735 aux->aux_count = 1; 736 avl_insert(avl, aux, where); 737 } 738 } 739 740 void 741 spa_aux_remove(vdev_t *vd, avl_tree_t *avl) 742 { 743 spa_aux_t search; 744 spa_aux_t *aux; 745 avl_index_t where; 746 747 search.aux_guid = vd->vdev_guid; 748 aux = avl_find(avl, &search, &where); 749 750 ASSERT(aux != NULL); 751 752 if (--aux->aux_count == 0) { 753 avl_remove(avl, aux); 754 kmem_free(aux, sizeof (spa_aux_t)); 755 } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { 756 aux->aux_pool = 0ULL; 757 } 758 } 759 760 boolean_t 761 spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) 762 { 763 spa_aux_t search, *found; 764 765 search.aux_guid = guid; 766 found = avl_find(avl, &search, NULL); 767 768 if (pool) { 769 if (found) 770 *pool = found->aux_pool; 771 else 772 *pool = 0ULL; 773 } 774 775 if (refcnt) { 776 if (found) 777 *refcnt = found->aux_count; 778 else 779 *refcnt = 0; 780 } 781 782 return (found != NULL); 783 } 784 785 void 786 spa_aux_activate(vdev_t *vd, avl_tree_t *avl) 787 { 788 spa_aux_t search, *found; 789 avl_index_t where; 790 791 search.aux_guid = vd->vdev_guid; 792 found = avl_find(avl, &search, &where); 793 ASSERT(found != NULL); 794 ASSERT(found->aux_pool == 0ULL); 795 796 found->aux_pool = spa_guid(vd->vdev_spa); 797 } 798 799 /* 800 * Spares are tracked globally due to the following constraints: 801 * 802 * - A spare may be part of multiple pools. 803 * - A spare may be added to a pool even if it's actively in use within 804 * another pool. 805 * - A spare in use in any pool can only be the source of a replacement if 806 * the target is a spare in the same pool. 807 * 808 * We keep track of all spares on the system through the use of a reference 809 * counted AVL tree. When a vdev is added as a spare, or used as a replacement 810 * spare, then we bump the reference count in the AVL tree. In addition, we set 811 * the 'vdev_isspare' member to indicate that the device is a spare (active or 812 * inactive). When a spare is made active (used to replace a device in the 813 * pool), we also keep track of which pool its been made a part of. 814 * 815 * The 'spa_spare_lock' protects the AVL tree. These functions are normally 816 * called under the spa_namespace lock as part of vdev reconfiguration. The 817 * separate spare lock exists for the status query path, which does not need to 818 * be completely consistent with respect to other vdev configuration changes. 
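 *
 * Illustrative sketch of that status-query path (not a verbatim
 * caller): given only a guid, a status consumer can ask
 *
 *	uint64_t pool;
 *	int refs;
 *	if (spa_spare_exists(guid, &pool, &refs))
 *		... the device is a known spare; "pool" is non-zero
 *		    iff some pool has it activated ...
 *
 * without holding spa_namespace_lock, since spa_spare_exists() takes
 * spa_spare_lock internally.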
819 */ 820 821 static int 822 spa_spare_compare(const void *a, const void *b) 823 { 824 return (spa_aux_compare(a, b)); 825 } 826 827 void 828 spa_spare_add(vdev_t *vd) 829 { 830 mutex_enter(&spa_spare_lock); 831 ASSERT(!vd->vdev_isspare); 832 spa_aux_add(vd, &spa_spare_avl); 833 vd->vdev_isspare = B_TRUE; 834 mutex_exit(&spa_spare_lock); 835 } 836 837 void 838 spa_spare_remove(vdev_t *vd) 839 { 840 mutex_enter(&spa_spare_lock); 841 ASSERT(vd->vdev_isspare); 842 spa_aux_remove(vd, &spa_spare_avl); 843 vd->vdev_isspare = B_FALSE; 844 mutex_exit(&spa_spare_lock); 845 } 846 847 boolean_t 848 spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) 849 { 850 boolean_t found; 851 852 mutex_enter(&spa_spare_lock); 853 found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); 854 mutex_exit(&spa_spare_lock); 855 856 return (found); 857 } 858 859 void 860 spa_spare_activate(vdev_t *vd) 861 { 862 mutex_enter(&spa_spare_lock); 863 ASSERT(vd->vdev_isspare); 864 spa_aux_activate(vd, &spa_spare_avl); 865 mutex_exit(&spa_spare_lock); 866 } 867 868 /* 869 * Level 2 ARC devices are tracked globally for the same reasons as spares. 870 * Cache devices currently only support one pool per cache device, and so 871 * for these devices the aux reference count is currently unused beyond 1. 872 */ 873 874 static int 875 spa_l2cache_compare(const void *a, const void *b) 876 { 877 return (spa_aux_compare(a, b)); 878 } 879 880 void 881 spa_l2cache_add(vdev_t *vd) 882 { 883 mutex_enter(&spa_l2cache_lock); 884 ASSERT(!vd->vdev_isl2cache); 885 spa_aux_add(vd, &spa_l2cache_avl); 886 vd->vdev_isl2cache = B_TRUE; 887 mutex_exit(&spa_l2cache_lock); 888 } 889 890 void 891 spa_l2cache_remove(vdev_t *vd) 892 { 893 mutex_enter(&spa_l2cache_lock); 894 ASSERT(vd->vdev_isl2cache); 895 spa_aux_remove(vd, &spa_l2cache_avl); 896 vd->vdev_isl2cache = B_FALSE; 897 mutex_exit(&spa_l2cache_lock); 898 } 899 900 boolean_t 901 spa_l2cache_exists(uint64_t guid, uint64_t *pool) 902 { 903 boolean_t found; 904 905 mutex_enter(&spa_l2cache_lock); 906 found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); 907 mutex_exit(&spa_l2cache_lock); 908 909 return (found); 910 } 911 912 void 913 spa_l2cache_activate(vdev_t *vd) 914 { 915 mutex_enter(&spa_l2cache_lock); 916 ASSERT(vd->vdev_isl2cache); 917 spa_aux_activate(vd, &spa_l2cache_avl); 918 mutex_exit(&spa_l2cache_lock); 919 } 920 921 /* 922 * ========================================================================== 923 * SPA vdev locking 924 * ========================================================================== 925 */ 926 927 /* 928 * Lock the given spa_t for the purpose of adding or removing a vdev. 929 * Grabs the global spa_namespace_lock plus the spa config lock for writing. 930 * It returns the next transaction group for the spa_t. 931 */ 932 uint64_t 933 spa_vdev_enter(spa_t *spa) 934 { 935 mutex_enter(&spa->spa_vdev_top_lock); 936 mutex_enter(&spa_namespace_lock); 937 return (spa_vdev_config_enter(spa)); 938 } 939 940 /* 941 * Internal implementation for spa_vdev_enter(). Used when a vdev 942 * operation requires multiple syncs (i.e. removing a device) while 943 * keeping the spa_namespace_lock held. 944 */ 945 uint64_t 946 spa_vdev_config_enter(spa_t *spa) 947 { 948 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 949 950 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 951 952 return (spa_last_synced_txg(spa) + 1); 953 } 954 955 /* 956 * Used in combination with spa_vdev_config_enter() to allow the syncing 957 * of multiple transactions without releasing the spa_namespace_lock. 
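 *
 * For contrast, the common single-transaction pattern (a sketch of a
 * typical caller such as a vdev add, not verbatim code) is simply:
 *
 *	uint64_t txg = spa_vdev_enter(spa);
 *
 *	... modify the vdev tree ...
 *
 *	return (spa_vdev_exit(spa, vd_to_free, txg, error));
 *
 * where vd_to_free may be NULL, and spa_vdev_exit() waits for the txg
 * to sync and refreshes the config cache before returning "error".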
958 */ 959 void 960 spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) 961 { 962 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 963 964 int config_changed = B_FALSE; 965 966 ASSERT(txg > spa_last_synced_txg(spa)); 967 968 spa->spa_pending_vdev = NULL; 969 970 /* 971 * Reassess the DTLs. 972 */ 973 vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); 974 975 if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { 976 config_changed = B_TRUE; 977 spa->spa_config_generation++; 978 } 979 980 /* 981 * Verify the metaslab classes. 982 */ 983 ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); 984 ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); 985 986 spa_config_exit(spa, SCL_ALL, spa); 987 988 /* 989 * Panic the system if the specified tag requires it. This 990 * is useful for ensuring that configurations are updated 991 * transactionally. 992 */ 993 if (zio_injection_enabled) 994 zio_handle_panic_injection(spa, tag, 0); 995 996 /* 997 * Note: this txg_wait_synced() is important because it ensures 998 * that there won't be more than one config change per txg. 999 * This allows us to use the txg as the generation number. 1000 */ 1001 if (error == 0) 1002 txg_wait_synced(spa->spa_dsl_pool, txg); 1003 1004 if (vd != NULL) { 1005 ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0); 1006 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1007 vdev_free(vd); 1008 spa_config_exit(spa, SCL_ALL, spa); 1009 } 1010 1011 /* 1012 * If the config changed, update the config cache. 1013 */ 1014 if (config_changed) 1015 spa_config_sync(spa, B_FALSE, B_TRUE); 1016 } 1017 1018 /* 1019 * Unlock the spa_t after adding or removing a vdev. Besides undoing the 1020 * locking of spa_vdev_enter(), we also want make sure the transactions have 1021 * synced to disk, and then update the global configuration cache with the new 1022 * information. 1023 */ 1024 int 1025 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) 1026 { 1027 spa_vdev_config_exit(spa, vd, txg, error, FTAG); 1028 mutex_exit(&spa_namespace_lock); 1029 mutex_exit(&spa->spa_vdev_top_lock); 1030 1031 return (error); 1032 } 1033 1034 /* 1035 * Lock the given spa_t for the purpose of changing vdev state. 1036 */ 1037 void 1038 spa_vdev_state_enter(spa_t *spa, int oplocks) 1039 { 1040 int locks = SCL_STATE_ALL | oplocks; 1041 1042 /* 1043 * Root pools may need to read of the underlying devfs filesystem 1044 * when opening up a vdev. Unfortunately if we're holding the 1045 * SCL_ZIO lock it will result in a deadlock when we try to issue 1046 * the read from the root filesystem. Instead we "prefetch" 1047 * the associated vnodes that we need prior to opening the 1048 * underlying devices and cache them so that we can prevent 1049 * any I/O when we are doing the actual open. 1050 */ 1051 if (spa_is_root(spa)) { 1052 int low = locks & ~(SCL_ZIO - 1); 1053 int high = locks & ~low; 1054 1055 spa_config_enter(spa, high, spa, RW_WRITER); 1056 vdev_hold(spa->spa_root_vdev); 1057 spa_config_enter(spa, low, spa, RW_WRITER); 1058 } else { 1059 spa_config_enter(spa, locks, spa, RW_WRITER); 1060 } 1061 spa->spa_vdev_locks = locks; 1062 } 1063 1064 int 1065 spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) 1066 { 1067 boolean_t config_changed = B_FALSE; 1068 1069 if (vd != NULL || error == 0) 1070 vdev_dtl_reassess(vd ? 
vd->vdev_top : spa->spa_root_vdev, 1071 0, 0, B_FALSE); 1072 1073 if (vd != NULL) { 1074 vdev_state_dirty(vd->vdev_top); 1075 config_changed = B_TRUE; 1076 spa->spa_config_generation++; 1077 } 1078 1079 if (spa_is_root(spa)) 1080 vdev_rele(spa->spa_root_vdev); 1081 1082 ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); 1083 spa_config_exit(spa, spa->spa_vdev_locks, spa); 1084 1085 /* 1086 * If anything changed, wait for it to sync. This ensures that, 1087 * from the system administrator's perspective, zpool(1M) commands 1088 * are synchronous. This is important for things like zpool offline: 1089 * when the command completes, you expect no further I/O from ZFS. 1090 */ 1091 if (vd != NULL) 1092 txg_wait_synced(spa->spa_dsl_pool, 0); 1093 1094 /* 1095 * If the config changed, update the config cache. 1096 */ 1097 if (config_changed) { 1098 mutex_enter(&spa_namespace_lock); 1099 spa_config_sync(spa, B_FALSE, B_TRUE); 1100 mutex_exit(&spa_namespace_lock); 1101 } 1102 1103 return (error); 1104 } 1105 1106 /* 1107 * ========================================================================== 1108 * Miscellaneous functions 1109 * ========================================================================== 1110 */ 1111 1112 void 1113 spa_activate_mos_feature(spa_t *spa, const char *feature) 1114 { 1115 (void) nvlist_add_boolean(spa->spa_label_features, feature); 1116 vdev_config_dirty(spa->spa_root_vdev); 1117 } 1118 1119 void 1120 spa_deactivate_mos_feature(spa_t *spa, const char *feature) 1121 { 1122 (void) nvlist_remove_all(spa->spa_label_features, feature); 1123 vdev_config_dirty(spa->spa_root_vdev); 1124 } 1125 1126 /* 1127 * Rename a spa_t. 1128 */ 1129 int 1130 spa_rename(const char *name, const char *newname) 1131 { 1132 spa_t *spa; 1133 int err; 1134 1135 /* 1136 * Lookup the spa_t and grab the config lock for writing. We need to 1137 * actually open the pool so that we can sync out the necessary labels. 1138 * It's OK to call spa_open() with the namespace lock held because we 1139 * allow recursive calls for other reasons. 1140 */ 1141 mutex_enter(&spa_namespace_lock); 1142 if ((err = spa_open(name, &spa, FTAG)) != 0) { 1143 mutex_exit(&spa_namespace_lock); 1144 return (err); 1145 } 1146 1147 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1148 1149 avl_remove(&spa_namespace_avl, spa); 1150 (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); 1151 avl_add(&spa_namespace_avl, spa); 1152 1153 /* 1154 * Sync all labels to disk with the new names by marking the root vdev 1155 * dirty and waiting for it to sync. It will pick up the new pool name 1156 * during the sync. 1157 */ 1158 vdev_config_dirty(spa->spa_root_vdev); 1159 1160 spa_config_exit(spa, SCL_ALL, FTAG); 1161 1162 txg_wait_synced(spa->spa_dsl_pool, 0); 1163 1164 /* 1165 * Sync the updated config cache. 1166 */ 1167 spa_config_sync(spa, B_FALSE, B_TRUE); 1168 1169 spa_close(spa, FTAG); 1170 1171 mutex_exit(&spa_namespace_lock); 1172 1173 return (0); 1174 } 1175 1176 /* 1177 * Return the spa_t associated with given pool_guid, if it exists. If 1178 * device_guid is non-zero, determine whether the pool exists *and* contains 1179 * a device with the specified device_guid. 
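 *
 * For example (illustrative): spa_by_guid(pool_guid, 0) only asks
 * whether the pool itself is present, while
 *
 *	if (spa_guid_exists(pool_guid, device_guid))
 *		... the pool exists and contains that device ...
 *
 * checks both. Either form requires the caller to already hold
 * spa_namespace_lock.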
1180 */ 1181 spa_t * 1182 spa_by_guid(uint64_t pool_guid, uint64_t device_guid) 1183 { 1184 spa_t *spa; 1185 avl_tree_t *t = &spa_namespace_avl; 1186 1187 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1188 1189 for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { 1190 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 1191 continue; 1192 if (spa->spa_root_vdev == NULL) 1193 continue; 1194 if (spa_guid(spa) == pool_guid) { 1195 if (device_guid == 0) 1196 break; 1197 1198 if (vdev_lookup_by_guid(spa->spa_root_vdev, 1199 device_guid) != NULL) 1200 break; 1201 1202 /* 1203 * Check any devices we may be in the process of adding. 1204 */ 1205 if (spa->spa_pending_vdev) { 1206 if (vdev_lookup_by_guid(spa->spa_pending_vdev, 1207 device_guid) != NULL) 1208 break; 1209 } 1210 } 1211 } 1212 1213 return (spa); 1214 } 1215 1216 /* 1217 * Determine whether a pool with the given pool_guid exists. 1218 */ 1219 boolean_t 1220 spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) 1221 { 1222 return (spa_by_guid(pool_guid, device_guid) != NULL); 1223 } 1224 1225 char * 1226 spa_strdup(const char *s) 1227 { 1228 size_t len; 1229 char *new; 1230 1231 len = strlen(s); 1232 new = kmem_alloc(len + 1, KM_SLEEP); 1233 bcopy(s, new, len); 1234 new[len] = '\0'; 1235 1236 return (new); 1237 } 1238 1239 void 1240 spa_strfree(char *s) 1241 { 1242 kmem_free(s, strlen(s) + 1); 1243 } 1244 1245 uint64_t 1246 spa_get_random(uint64_t range) 1247 { 1248 uint64_t r; 1249 1250 ASSERT(range != 0); 1251 1252 (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); 1253 1254 return (r % range); 1255 } 1256 1257 uint64_t 1258 spa_generate_guid(spa_t *spa) 1259 { 1260 uint64_t guid = spa_get_random(-1ULL); 1261 1262 if (spa != NULL) { 1263 while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) 1264 guid = spa_get_random(-1ULL); 1265 } else { 1266 while (guid == 0 || spa_guid_exists(guid, 0)) 1267 guid = spa_get_random(-1ULL); 1268 } 1269 1270 return (guid); 1271 } 1272 1273 void 1274 sprintf_blkptr(char *buf, const blkptr_t *bp) 1275 { 1276 char type[256]; 1277 char *checksum = NULL; 1278 char *compress = NULL; 1279 1280 if (bp != NULL) { 1281 if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { 1282 dmu_object_byteswap_t bswap = 1283 DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); 1284 (void) snprintf(type, sizeof (type), "bswap %s %s", 1285 DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? 1286 "metadata" : "data", 1287 dmu_ot_byteswap[bswap].ob_name); 1288 } else { 1289 (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, 1290 sizeof (type)); 1291 } 1292 checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; 1293 compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; 1294 } 1295 1296 SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); 1297 } 1298 1299 void 1300 spa_freeze(spa_t *spa) 1301 { 1302 uint64_t freeze_txg = 0; 1303 1304 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1305 if (spa->spa_freeze_txg == UINT64_MAX) { 1306 freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; 1307 spa->spa_freeze_txg = freeze_txg; 1308 } 1309 spa_config_exit(spa, SCL_ALL, FTAG); 1310 if (freeze_txg != 0) 1311 txg_wait_synced(spa_get_dsl(spa), freeze_txg); 1312 } 1313 1314 void 1315 zfs_panic_recover(const char *fmt, ...) 1316 { 1317 va_list adx; 1318 1319 va_start(adx, fmt); 1320 vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); 1321 va_end(adx); 1322 } 1323 1324 /* 1325 * This is a stripped-down version of strtoull, suitable only for converting 1326 * lowercase hexidecimal numbers that don't overflow. 
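 *
 * For example (illustrative):
 *
 *	char *end;
 *	uint64_t v = strtonum("1a2f:rest", &end);
 *
 * yields v == 0x1a2f and leaves "end" pointing at the ':' that stopped
 * the conversion.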
1327 */ 1328 uint64_t 1329 strtonum(const char *str, char **nptr) 1330 { 1331 uint64_t val = 0; 1332 char c; 1333 int digit; 1334 1335 while ((c = *str) != '\0') { 1336 if (c >= '0' && c <= '9') 1337 digit = c - '0'; 1338 else if (c >= 'a' && c <= 'f') 1339 digit = 10 + c - 'a'; 1340 else 1341 break; 1342 1343 val *= 16; 1344 val += digit; 1345 1346 str++; 1347 } 1348 1349 if (nptr) 1350 *nptr = (char *)str; 1351 1352 return (val); 1353 } 1354 1355 /* 1356 * ========================================================================== 1357 * Accessor functions 1358 * ========================================================================== 1359 */ 1360 1361 boolean_t 1362 spa_shutting_down(spa_t *spa) 1363 { 1364 return (spa->spa_async_suspended); 1365 } 1366 1367 dsl_pool_t * 1368 spa_get_dsl(spa_t *spa) 1369 { 1370 return (spa->spa_dsl_pool); 1371 } 1372 1373 boolean_t 1374 spa_is_initializing(spa_t *spa) 1375 { 1376 return (spa->spa_is_initializing); 1377 } 1378 1379 blkptr_t * 1380 spa_get_rootblkptr(spa_t *spa) 1381 { 1382 return (&spa->spa_ubsync.ub_rootbp); 1383 } 1384 1385 void 1386 spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) 1387 { 1388 spa->spa_uberblock.ub_rootbp = *bp; 1389 } 1390 1391 void 1392 spa_altroot(spa_t *spa, char *buf, size_t buflen) 1393 { 1394 if (spa->spa_root == NULL) 1395 buf[0] = '\0'; 1396 else 1397 (void) strncpy(buf, spa->spa_root, buflen); 1398 } 1399 1400 int 1401 spa_sync_pass(spa_t *spa) 1402 { 1403 return (spa->spa_sync_pass); 1404 } 1405 1406 char * 1407 spa_name(spa_t *spa) 1408 { 1409 return (spa->spa_name); 1410 } 1411 1412 uint64_t 1413 spa_guid(spa_t *spa) 1414 { 1415 dsl_pool_t *dp = spa_get_dsl(spa); 1416 uint64_t guid; 1417 1418 /* 1419 * If we fail to parse the config during spa_load(), we can go through 1420 * the error path (which posts an ereport) and end up here with no root 1421 * vdev. We stash the original pool guid in 'spa_config_guid' to handle 1422 * this case. 1423 */ 1424 if (spa->spa_root_vdev == NULL) 1425 return (spa->spa_config_guid); 1426 1427 guid = spa->spa_last_synced_guid != 0 ? 1428 spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; 1429 1430 /* 1431 * Return the most recently synced out guid unless we're 1432 * in syncing context. 1433 */ 1434 if (dp && dsl_pool_sync_context(dp)) 1435 return (spa->spa_root_vdev->vdev_guid); 1436 else 1437 return (guid); 1438 } 1439 1440 uint64_t 1441 spa_load_guid(spa_t *spa) 1442 { 1443 /* 1444 * This is a GUID that exists solely as a reference for the 1445 * purposes of the arc. It is generated at load time, and 1446 * is never written to persistent storage. 
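 *
 * In other words (an illustrative note, not a rule stated elsewhere in
 * this file): an in-core consumer that wants a key that stays stable
 * for the lifetime of this load, e.g.
 *
 *	uint64_t key = spa_load_guid(spa);
 *
 * should use this rather than spa_guid(), whose value tracks the
 * on-disk configuration and can therefore change.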
1447 */ 1448 return (spa->spa_load_guid); 1449 } 1450 1451 uint64_t 1452 spa_last_synced_txg(spa_t *spa) 1453 { 1454 return (spa->spa_ubsync.ub_txg); 1455 } 1456 1457 uint64_t 1458 spa_first_txg(spa_t *spa) 1459 { 1460 return (spa->spa_first_txg); 1461 } 1462 1463 uint64_t 1464 spa_syncing_txg(spa_t *spa) 1465 { 1466 return (spa->spa_syncing_txg); 1467 } 1468 1469 pool_state_t 1470 spa_state(spa_t *spa) 1471 { 1472 return (spa->spa_state); 1473 } 1474 1475 spa_load_state_t 1476 spa_load_state(spa_t *spa) 1477 { 1478 return (spa->spa_load_state); 1479 } 1480 1481 uint64_t 1482 spa_freeze_txg(spa_t *spa) 1483 { 1484 return (spa->spa_freeze_txg); 1485 } 1486 1487 /* ARGSUSED */ 1488 uint64_t 1489 spa_get_asize(spa_t *spa, uint64_t lsize) 1490 { 1491 /* 1492 * The worst case is single-sector max-parity RAID-Z blocks, in which 1493 * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) 1494 * times the size; so just assume that. Add to this the fact that 1495 * we can have up to 3 DVAs per bp, and one more factor of 2 because 1496 * the block may be dittoed with up to 3 DVAs by ddt_sync(). 1497 */ 1498 return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); 1499 } 1500 1501 uint64_t 1502 spa_get_dspace(spa_t *spa) 1503 { 1504 return (spa->spa_dspace); 1505 } 1506 1507 void 1508 spa_update_dspace(spa_t *spa) 1509 { 1510 spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + 1511 ddt_get_dedup_dspace(spa); 1512 } 1513 1514 /* 1515 * Return the failure mode that has been set to this pool. The default 1516 * behavior will be to block all I/Os when a complete failure occurs. 1517 */ 1518 uint8_t 1519 spa_get_failmode(spa_t *spa) 1520 { 1521 return (spa->spa_failmode); 1522 } 1523 1524 boolean_t 1525 spa_suspended(spa_t *spa) 1526 { 1527 return (spa->spa_suspended); 1528 } 1529 1530 uint64_t 1531 spa_version(spa_t *spa) 1532 { 1533 return (spa->spa_ubsync.ub_version); 1534 } 1535 1536 boolean_t 1537 spa_deflate(spa_t *spa) 1538 { 1539 return (spa->spa_deflate); 1540 } 1541 1542 metaslab_class_t * 1543 spa_normal_class(spa_t *spa) 1544 { 1545 return (spa->spa_normal_class); 1546 } 1547 1548 metaslab_class_t * 1549 spa_log_class(spa_t *spa) 1550 { 1551 return (spa->spa_log_class); 1552 } 1553 1554 int 1555 spa_max_replication(spa_t *spa) 1556 { 1557 /* 1558 * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to 1559 * handle BPs with more than one DVA allocated. Set our max 1560 * replication level accordingly. 
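 *
 * For example (with SPA_DVAS_PER_BP == 3 and the default
 * spa_max_replication_override -- an illustrative reading of the code
 * below), a pool at or above SPA_VERSION_DITTO_BLOCKS reports
 *
 *	spa_max_replication(spa)
 *	    == MIN(SPA_DVAS_PER_BP, spa_max_replication_override)
 *	    == MIN(3, 3) == 3
 *
 * while an older pool is limited to a single DVA per block pointer.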
1561 */ 1562 if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) 1563 return (1); 1564 return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); 1565 } 1566 1567 int 1568 spa_prev_software_version(spa_t *spa) 1569 { 1570 return (spa->spa_prev_software_version); 1571 } 1572 1573 uint64_t 1574 spa_deadman_synctime(spa_t *spa) 1575 { 1576 return (spa->spa_deadman_synctime); 1577 } 1578 1579 uint64_t 1580 dva_get_dsize_sync(spa_t *spa, const dva_t *dva) 1581 { 1582 uint64_t asize = DVA_GET_ASIZE(dva); 1583 uint64_t dsize = asize; 1584 1585 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 1586 1587 if (asize != 0 && spa->spa_deflate) { 1588 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); 1589 dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; 1590 } 1591 1592 return (dsize); 1593 } 1594 1595 uint64_t 1596 bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) 1597 { 1598 uint64_t dsize = 0; 1599 1600 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 1601 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 1602 1603 return (dsize); 1604 } 1605 1606 uint64_t 1607 bp_get_dsize(spa_t *spa, const blkptr_t *bp) 1608 { 1609 uint64_t dsize = 0; 1610 1611 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); 1612 1613 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 1614 dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); 1615 1616 spa_config_exit(spa, SCL_VDEV, FTAG); 1617 1618 return (dsize); 1619 } 1620 1621 /* 1622 * ========================================================================== 1623 * Initialization and Termination 1624 * ========================================================================== 1625 */ 1626 1627 static int 1628 spa_name_compare(const void *a1, const void *a2) 1629 { 1630 const spa_t *s1 = a1; 1631 const spa_t *s2 = a2; 1632 int s; 1633 1634 s = strcmp(s1->spa_name, s2->spa_name); 1635 if (s > 0) 1636 return (1); 1637 if (s < 0) 1638 return (-1); 1639 return (0); 1640 } 1641 1642 int 1643 spa_busy(void) 1644 { 1645 return (spa_active_count); 1646 } 1647 1648 void 1649 spa_boot_init() 1650 { 1651 spa_config_load(); 1652 } 1653 1654 void 1655 spa_init(int mode) 1656 { 1657 mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); 1658 mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); 1659 mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); 1660 cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); 1661 1662 avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), 1663 offsetof(spa_t, spa_avl)); 1664 1665 avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), 1666 offsetof(spa_aux_t, aux_avl)); 1667 1668 avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), 1669 offsetof(spa_aux_t, aux_avl)); 1670 1671 spa_mode_global = mode; 1672 1673 #ifdef _KERNEL 1674 spa_arch_init(); 1675 #else 1676 if (spa_mode_global != FREAD && dprintf_find_string("watch")) { 1677 arc_procfd = open("/proc/self/ctl", O_WRONLY); 1678 if (arc_procfd == -1) { 1679 perror("could not enable watchpoints: " 1680 "opening /proc/self/ctl failed: "); 1681 } else { 1682 arc_watch = B_TRUE; 1683 } 1684 } 1685 #endif 1686 1687 refcount_init(); 1688 unique_init(); 1689 space_map_init(); 1690 zio_init(); 1691 dmu_init(); 1692 zil_init(); 1693 vdev_cache_stat_init(); 1694 zfs_prop_init(); 1695 zpool_prop_init(); 1696 zpool_feature_init(); 1697 spa_config_load(); 1698 l2arc_start(); 1699 } 1700 1701 void 1702 spa_fini(void) 1703 { 1704 l2arc_stop(); 1705 1706 spa_evict_all(); 1707 1708 vdev_cache_stat_fini(); 1709 zil_fini(); 1710 dmu_fini(); 1711 zio_fini(); 1712 
space_map_fini(); 1713 unique_fini(); 1714 refcount_fini(); 1715 1716 avl_destroy(&spa_namespace_avl); 1717 avl_destroy(&spa_spare_avl); 1718 avl_destroy(&spa_l2cache_avl); 1719 1720 cv_destroy(&spa_namespace_cv); 1721 mutex_destroy(&spa_namespace_lock); 1722 mutex_destroy(&spa_spare_lock); 1723 mutex_destroy(&spa_l2cache_lock); 1724 } 1725 1726 /* 1727 * Return whether this pool has slogs. No locking needed. 1728 * It's not a problem if the wrong answer is returned as it's only for 1729 * performance and not correctness 1730 */ 1731 boolean_t 1732 spa_has_slogs(spa_t *spa) 1733 { 1734 return (spa->spa_log_class->mc_rotor != NULL); 1735 } 1736 1737 spa_log_state_t 1738 spa_get_log_state(spa_t *spa) 1739 { 1740 return (spa->spa_log_state); 1741 } 1742 1743 void 1744 spa_set_log_state(spa_t *spa, spa_log_state_t state) 1745 { 1746 spa->spa_log_state = state; 1747 } 1748 1749 boolean_t 1750 spa_is_root(spa_t *spa) 1751 { 1752 return (spa->spa_is_root); 1753 } 1754 1755 boolean_t 1756 spa_writeable(spa_t *spa) 1757 { 1758 return (!!(spa->spa_mode & FWRITE)); 1759 } 1760 1761 int 1762 spa_mode(spa_t *spa) 1763 { 1764 return (spa->spa_mode); 1765 } 1766 1767 uint64_t 1768 spa_bootfs(spa_t *spa) 1769 { 1770 return (spa->spa_bootfs); 1771 } 1772 1773 uint64_t 1774 spa_delegation(spa_t *spa) 1775 { 1776 return (spa->spa_delegation); 1777 } 1778 1779 objset_t * 1780 spa_meta_objset(spa_t *spa) 1781 { 1782 return (spa->spa_meta_objset); 1783 } 1784 1785 enum zio_checksum 1786 spa_dedup_checksum(spa_t *spa) 1787 { 1788 return (spa->spa_dedup_checksum); 1789 } 1790 1791 /* 1792 * Reset pool scan stat per scan pass (or reboot). 1793 */ 1794 void 1795 spa_scan_stat_init(spa_t *spa) 1796 { 1797 /* data not stored on disk */ 1798 spa->spa_scan_pass_start = gethrestime_sec(); 1799 spa->spa_scan_pass_exam = 0; 1800 vdev_scan_stat_init(spa->spa_root_vdev); 1801 } 1802 1803 /* 1804 * Get scan stats for zpool status reports 1805 */ 1806 int 1807 spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) 1808 { 1809 dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; 1810 1811 if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) 1812 return (ENOENT); 1813 bzero(ps, sizeof (pool_scan_stat_t)); 1814 1815 /* data stored on disk */ 1816 ps->pss_func = scn->scn_phys.scn_func; 1817 ps->pss_start_time = scn->scn_phys.scn_start_time; 1818 ps->pss_end_time = scn->scn_phys.scn_end_time; 1819 ps->pss_to_examine = scn->scn_phys.scn_to_examine; 1820 ps->pss_examined = scn->scn_phys.scn_examined; 1821 ps->pss_to_process = scn->scn_phys.scn_to_process; 1822 ps->pss_processed = scn->scn_phys.scn_processed; 1823 ps->pss_errors = scn->scn_phys.scn_errors; 1824 ps->pss_state = scn->scn_phys.scn_state; 1825 1826 /* data not stored on disk */ 1827 ps->pss_pass_start = spa->spa_scan_pass_start; 1828 ps->pss_pass_exam = spa->spa_scan_pass_exam; 1829 1830 return (0); 1831 } 1832 1833 boolean_t 1834 spa_debug_enabled(spa_t *spa) 1835 { 1836 return (spa->spa_debug); 1837 } 1838
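/*
 * Illustrative usage sketch (an example of how a kernel consumer might
 * sequence the module-level interfaces above; not itself part of the
 * SPA implementation): spa_init()/spa_fini() bracket everything else,
 * and the namespace may be walked under spa_namespace_lock:
 *
 *	spa_init(FREAD | FWRITE);		-- once, at module load
 *
 *	mutex_enter(&spa_namespace_lock);
 *	for (spa_t *spa = spa_next(NULL); spa != NULL;
 *	    spa = spa_next(spa))
 *		... inspect each imported pool ...
 *	mutex_exit(&spa_namespace_lock);
 *
 *	if (spa_busy() == 0)
 *		spa_fini();			-- once, at module unload
 */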