Lines Matching +full:inactive +full:delay +full:ms

1 // SPDX-License-Identifier: CDDL-1.0
10 * or https://opensource.org/licenses/CDDL-1.0.
52 * data to each disk before moving on to the next top-level vdev.
84 * The in-core space map representation is more compact than its on-disk form.
85 * The zfs_condense_pct determines how much more compact the in-core
86 * space map representation must be before we compact it on-disk.
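A hedged sketch of what the zfs_condense_pct comparison above amounts to. This is illustrative only, not the real metaslab_should_condense() logic; the 200 percent default and the helper name are assumptions. The idea: rewrite (condense) the on-disk space map from the in-core range tree once the on-disk form has grown to zfs_condense_pct percent or more of what an optimal representation would need.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t zfs_condense_pct = 200;	/* assumed default, for illustration */

static bool
should_condense_sketch(uint64_t ondisk_bytes, uint64_t optimal_bytes)
{
	/* On-disk map is zfs_condense_pct percent (or more) of optimal: condense. */
	return (ondisk_bytes >= optimal_bytes * zfs_condense_pct / 100);
}

int
main(void)
{
	/* 12 MB on disk vs. 5 MB optimal: 240% >= 200%, so condense. */
	printf("%d\n", should_condense_sketch(12ULL << 20, 5ULL << 20));
	return (0);
}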
173 * in a space map to continue allocations in a first-fit fashion.
175 * switch to using best-fit allocations.
183 * high-performance storage.
215 * unloaded sooner. These settings are intended to be generous -- to keep
242 * Enable/disable space-based metaslab group biasing.
247 * Control performance-based metaslab group biasing.
257 * Enable/disable segment-based metaslab selection.
262 * When using segment-based metaslab selection, we will continue
276 * in a given list when running in non-debug mode. We limit the number
277 * of entries in non-debug mode to prevent us from using up too much memory.
292 * To avoid 64-bit overflow, don't set above UINT32_MAX.
304 * Force the per-metaslab range trees to use 64-bit integers to store
310 * By default we only store segments over a certain size in the size-sorted
321 * gang allocation. If that fails then we will have a multi-layer gang
327 * that fails then we will have a multi-layer gang block.
340 * metaslabs all have free segments in the 32-63K bucket, but the best
392 metaslab_ksp->ks_data = &metaslab_stats; in metaslab_stat_init()
421 mc_allocator[spa->spa_alloc_count]), KM_SLEEP); in metaslab_class_create()
423 mc->mc_spa = spa; in metaslab_class_create()
424 mc->mc_name = name; in metaslab_class_create()
425 mc->mc_ops = ops; in metaslab_class_create()
426 mc->mc_is_log = is_log; in metaslab_class_create()
427 mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE; in metaslab_class_create()
428 mc->mc_alloc_max = UINT64_MAX; in metaslab_class_create()
429 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_class_create()
430 multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), in metaslab_class_create()
432 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_create()
433 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_create()
434 mutex_init(&mca->mca_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_class_create()
435 avl_create(&mca->mca_tree, zio_bookmark_compare, in metaslab_class_create()
437 mca->mca_rotor = NULL; in metaslab_class_create()
438 mca->mca_reserved = 0; in metaslab_class_create()
447 spa_t *spa = mc->mc_spa; in metaslab_class_destroy()
449 ASSERT(mc->mc_alloc == 0); in metaslab_class_destroy()
450 ASSERT(mc->mc_deferred == 0); in metaslab_class_destroy()
451 ASSERT(mc->mc_space == 0); in metaslab_class_destroy()
452 ASSERT(mc->mc_dspace == 0); in metaslab_class_destroy()
454 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_destroy()
455 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_destroy()
456 avl_destroy(&mca->mca_tree); in metaslab_class_destroy()
457 mutex_destroy(&mca->mca_lock); in metaslab_class_destroy()
458 ASSERT(mca->mca_rotor == NULL); in metaslab_class_destroy()
459 ASSERT0(mca->mca_reserved); in metaslab_class_destroy()
461 mutex_destroy(&mc->mc_lock); in metaslab_class_destroy()
462 multilist_destroy(&mc->mc_metaslab_txg_list); in metaslab_class_destroy()
464 mc_allocator[spa->spa_alloc_count])); in metaslab_class_destroy()
471 spa_t *spa = mc->mc_spa; in metaslab_class_validate()
479 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_validate()
480 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_validate()
483 ASSERT0(avl_numnodes(&mca->mca_tree)); in metaslab_class_validate()
484 ASSERT0(mca->mca_reserved); in metaslab_class_validate()
486 if ((mg = rotor = mca->mca_rotor) == NULL) in metaslab_class_validate()
489 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_class_validate()
490 vdev_t *vd = mg->mg_vd; in metaslab_class_validate()
492 ASSERT3P(vd->vdev_top, ==, vd); in metaslab_class_validate()
493 ASSERT(vd->vdev_mg == mg || vd->vdev_log_mg == mg); in metaslab_class_validate()
494 ASSERT3P(mg->mg_class, ==, mc); in metaslab_class_validate()
495 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); in metaslab_class_validate()
496 ASSERT0(zfs_refcount_count(&mga->mga_queue_depth)); in metaslab_class_validate()
497 } while ((mg = mg->mg_next) != rotor); in metaslab_class_validate()
503 * For each metaslab group in a class pre-calculate allocation quota and
505 * Based on those pre-calculate class allocation throttle threshold for
517 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || in metaslab_class_balance()
518 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); in metaslab_class_balance()
523 if (mc->mc_groups == 0) { in metaslab_class_balance()
525 mc->mc_alloc_throttle_enabled = B_FALSE; in metaslab_class_balance()
526 mc->mc_alloc_max = UINT64_MAX; in metaslab_class_balance()
535 mc->mc_alloc_io_size = (3 * mc->mc_alloc_io_size + in metaslab_class_balance()
537 mc->mc_alloc_throttle_enabled = mc->mc_is_log ? 0 : in metaslab_class_balance()
541 mg = first = mc->mc_allocator[0].mca_rotor; in metaslab_class_balance()
544 children += vdev_get_ndisks(mg->mg_vd) - in metaslab_class_balance()
545 vdev_get_nparity(mg->mg_vd); in metaslab_class_balance()
546 } while ((mg = mg->mg_next) != first); in metaslab_class_balance()
550 vdev_stat_t *vs = &mg->mg_vd->vdev_stat; in metaslab_class_balance()
556 * to keep decent per-child I/O size. in metaslab_class_balance()
559 mc->mc_groups, mc->mc_alloc_io_size * 4); in metaslab_class_balance()
566 if (mc->mc_space > 0 && vs->vs_space > 0) { in metaslab_class_balance()
567 ratio = vs->vs_space / (mc->mc_space / (mc->mc_groups * in metaslab_class_balance()
588 mc->mc_space > 0 && vs->vs_space > 0) { in metaslab_class_balance()
589 uint64_t vs_free = vs->vs_space > vs->vs_alloc ? in metaslab_class_balance()
590 vs->vs_space - vs->vs_alloc : 0; in metaslab_class_balance()
591 uint64_t mc_free = mc->mc_space > mc->mc_alloc ? in metaslab_class_balance()
592 mc->mc_space - mc->mc_alloc : 0; in metaslab_class_balance()
594 * vs_fr is 16 bit fixed-point free space fraction. in metaslab_class_balance()
595 * mc_fr is 8 bit fixed-point free space fraction. in metaslab_class_balance()
596 * ratio as their quotient is 8 bit fixed-point. in metaslab_class_balance()
598 uint_t vs_fr = vs_free / (vs->vs_space / 65536 + 1); in metaslab_class_balance()
599 uint_t mc_fr = mc_free / (mc->mc_space / 256 + 1); in metaslab_class_balance()
601 mg->mg_aliquot = mg_aliquot * ratio / 256; in metaslab_class_balance()
604 mg->mg_queue_target = MAX(mg->mg_aliquot, in metaslab_class_balance()
605 mg->mg_aliquot * ratio / 65536); in metaslab_class_balance()
607 mg->mg_aliquot = mg_aliquot; in metaslab_class_balance()
608 mg->mg_queue_target = mg->mg_aliquot * 2; in metaslab_class_balance()
610 sum_aliquot += mg->mg_aliquot; in metaslab_class_balance()
611 } while ((mg = mg->mg_next) != first); in metaslab_class_balance()
614 * Set per-class allocation throttle threshold to 4 iterations through in metaslab_class_balance()
619 mc->mc_alloc_max = sum_aliquot * 4; in metaslab_class_balance()
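A standalone worked example of the fixed-point biasing in metaslab_class_balance() above. The quotient on source line 600 (ratio = vs_fr / mc_fr) is not among the matched lines and is assumed here, and the 512K base aliquot is purely illustrative: a vdev that is 80% free inside a class that is only 40% free ends up with roughly twice its base aliquot.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* A vdev that is 80% free inside a class that is only 40% free. */
	uint64_t vs_space = 65536ULL * 1000000, vs_free = vs_space / 10 * 8;
	uint64_t mc_space = 256ULL * 1000000000, mc_free = mc_space / 10 * 4;

	/* 16-bit and 8-bit fixed-point free fractions (source lines 598-599). */
	uint64_t vs_fr = vs_free / (vs_space / 65536 + 1);	/* ~0.8 * 65536 = 52428 */
	uint64_t mc_fr = mc_free / (mc_space / 256 + 1);	/* ~0.4 * 256   = 102   */

	/*
	 * Their quotient is the 8-bit fixed-point ratio; the divide itself
	 * (source line 600) is not among the matches above and is assumed.
	 */
	uint64_t ratio = vs_fr / mc_fr;				/* 514, i.e. ~2.0 */

	uint64_t aliquot = 512 << 10;				/* assumed 512K base */
	printf("ratio = %llu/256 (~%.2fx), aliquot %llu -> %llu\n",
	    (unsigned long long)ratio, (double)ratio / 256,
	    (unsigned long long)aliquot,
	    (unsigned long long)(aliquot * ratio / 256));
	return (0);
}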
626 metaslab_class_t *mc = mg->mg_class; in metaslab_class_rotate()
627 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_class_rotate()
628 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_class_rotate()
634 if (mc->mc_groups < 2 || mca->mca_rotor != mg) in metaslab_class_rotate()
640 if (!success || mc->mc_is_log) in metaslab_class_rotate()
651 uint64_t naq = atomic_add_64_nv(&mca->mca_aliquot, psize) + psize / 2; in metaslab_class_rotate()
652 if (naq < mg->mg_aliquot) in metaslab_class_rotate()
654 if (naq >= mg->mg_queue_target) in metaslab_class_rotate()
656 if (zfs_refcount_count(&mga->mga_queue_depth) + psize + psize / 2 >= in metaslab_class_rotate()
657 mg->mg_queue_target) in metaslab_class_rotate()
669 spa_t *spa = mc->mc_spa; in metaslab_class_rotate()
676 if (dp->dp_dirty_total > busy_thresh || spa_has_pending_synctask(spa)) in metaslab_class_rotate()
680 mca->mca_rotor = mg->mg_next; in metaslab_class_rotate()
681 mca->mca_aliquot = 0; in metaslab_class_rotate()
688 atomic_add_64(&mc->mc_alloc, alloc_delta); in metaslab_class_space_update()
689 atomic_add_64(&mc->mc_deferred, defer_delta); in metaslab_class_space_update()
690 atomic_add_64(&mc->mc_space, space_delta); in metaslab_class_space_update()
691 atomic_add_64(&mc->mc_dspace, dspace_delta); in metaslab_class_space_update()
697 return (mc->mc_name); in metaslab_class_get_name()
703 return (mc->mc_alloc); in metaslab_class_get_alloc()
709 return (mc->mc_deferred); in metaslab_class_get_deferred()
715 return (mc->mc_space); in metaslab_class_get_space()
721 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); in metaslab_class_get_dspace()
727 spa_t *spa = mc->mc_spa; in metaslab_class_histogram_verify()
728 vdev_t *rvd = spa->spa_root_vdev; in metaslab_class_histogram_verify()
738 mutex_enter(&mc->mc_lock); in metaslab_class_histogram_verify()
739 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_histogram_verify()
740 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_histogram_verify()
744 * Skip any holes, uninitialized top-levels, or in metaslab_class_histogram_verify()
747 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_histogram_verify()
748 mg->mg_class != mc) { in metaslab_class_histogram_verify()
752 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_class_histogram_verify()
753 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_class_histogram_verify()
756 mc_hist[i] += mg->mg_histogram[i]; in metaslab_class_histogram_verify()
760 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); in metaslab_class_histogram_verify()
763 mutex_exit(&mc->mc_lock); in metaslab_class_histogram_verify()
777 vdev_t *rvd = mc->mc_spa->spa_root_vdev; in metaslab_class_fragmentation()
780 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); in metaslab_class_fragmentation()
782 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_fragmentation()
783 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_fragmentation()
784 metaslab_group_t *mg = tvd->vdev_mg; in metaslab_class_fragmentation()
787 * Skip any holes, uninitialized top-levels, in metaslab_class_fragmentation()
790 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_fragmentation()
791 mg->mg_class != mc) { in metaslab_class_fragmentation()
799 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { in metaslab_class_fragmentation()
800 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_fragmentation()
808 fragmentation += mg->mg_fragmentation * in metaslab_class_fragmentation()
814 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_fragmentation()
827 vdev_t *rvd = mc->mc_spa->spa_root_vdev; in metaslab_class_expandable_space()
830 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); in metaslab_class_expandable_space()
831 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_expandable_space()
832 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_expandable_space()
833 metaslab_group_t *mg = tvd->vdev_mg; in metaslab_class_expandable_space()
835 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_expandable_space()
836 mg->mg_class != mc) { in metaslab_class_expandable_space()
845 space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, in metaslab_class_expandable_space()
846 1ULL << tvd->vdev_ms_shift, uint64_t); in metaslab_class_expandable_space()
848 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_expandable_space()
855 multilist_t *ml = &mc->mc_metaslab_txg_list; in metaslab_class_evict_old()
857 /* Round delay up to next second. */ in metaslab_class_evict_old()
858 uint_t delay = (metaslab_unload_delay_ms + 999) / 1000; in metaslab_class_evict_old() local
864 mutex_enter(&msp->ms_lock); in metaslab_class_evict_old()
872 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_class_evict_old()
873 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
874 i--; in metaslab_class_evict_old()
881 msp->ms_selected_txg + metaslab_unload_delay && in metaslab_class_evict_old()
882 now > msp->ms_selected_time + delay && in metaslab_class_evict_old()
883 (msp->ms_allocator == -1 || in metaslab_class_evict_old()
892 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
895 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
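Putting the metaslab_class_evict_old() test above together with concrete numbers. The defaults of 32 txgs for metaslab_unload_delay and 600000 ms for metaslab_unload_delay_ms are assumptions for this example, and the surrounding if() on source line 880 is not among the matches:

	delay = (600000 + 999) / 1000 = 600 s     (ceiling division from line 858)
	unload only when all of:
	    txg > ms_selected_txg + metaslab_unload_delay    (e.g. > 1000 + 32)
	    now > ms_selected_time + delay                   (e.g. > 5000 s + 600 s)
	    ms_allocator == -1 (not held active by an allocator)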
909 if (m1->ms_allocator != -1 && m1->ms_primary) in metaslab_compare()
911 else if (m1->ms_allocator != -1 && !m1->ms_primary) in metaslab_compare()
913 if (m2->ms_allocator != -1 && m2->ms_primary) in metaslab_compare()
915 else if (m2->ms_allocator != -1 && !m2->ms_primary) in metaslab_compare()
919 * Sort inactive metaslabs first, then primaries, then secondaries. When in metaslab_compare()
922 * metaslabs, or can't allocate from them, it searches for an inactive in metaslab_compare()
927 return (-1); in metaslab_compare()
931 int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); in metaslab_compare()
935 IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); in metaslab_compare()
937 return (TREE_CMP(m1->ms_start, m2->ms_start)); in metaslab_compare()
950 * transitions from allocatable to non-allocatable or vice versa then the
956 vdev_t *vd = mg->mg_vd; in metaslab_group_alloc_update()
957 metaslab_class_t *mc = mg->mg_class; in metaslab_group_alloc_update()
958 vdev_stat_t *vs = &vd->vdev_stat; in metaslab_group_alloc_update()
962 ASSERT(vd == vd->vdev_top); in metaslab_group_alloc_update()
963 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, in metaslab_group_alloc_update()
966 mutex_enter(&mg->mg_lock); in metaslab_group_alloc_update()
967 was_allocatable = mg->mg_allocatable; in metaslab_group_alloc_update()
968 was_initialized = mg->mg_initialized; in metaslab_group_alloc_update()
970 uint64_t free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / in metaslab_group_alloc_update()
971 (vs->vs_space + 1); in metaslab_group_alloc_update()
973 mutex_enter(&mc->mc_lock); in metaslab_group_alloc_update()
979 * for allocations. We also don't consider non-activated in metaslab_group_alloc_update()
983 mg->mg_initialized = metaslab_group_initialized(mg); in metaslab_group_alloc_update()
984 if (!was_initialized && mg->mg_initialized) { in metaslab_group_alloc_update()
985 mc->mc_groups++; in metaslab_group_alloc_update()
986 } else if (was_initialized && !mg->mg_initialized) { in metaslab_group_alloc_update()
987 ASSERT3U(mc->mc_groups, >, 0); in metaslab_group_alloc_update()
988 mc->mc_groups--; in metaslab_group_alloc_update()
990 if (mg->mg_initialized) in metaslab_group_alloc_update()
991 mg->mg_no_free_space = B_FALSE; in metaslab_group_alloc_update()
999 mg->mg_allocatable = (mg->mg_activation_count > 0 && in metaslab_group_alloc_update()
1001 (mg->mg_fragmentation == ZFS_FRAG_INVALID || in metaslab_group_alloc_update()
1002 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); in metaslab_group_alloc_update()
1012 * When a group transitions from allocatable to non-allocatable or in metaslab_group_alloc_update()
1019 if (was_allocatable && !mg->mg_allocatable) in metaslab_group_alloc_update()
1020 mc->mc_alloc_groups--; in metaslab_group_alloc_update()
1021 else if (!was_allocatable && mg->mg_allocatable) in metaslab_group_alloc_update()
1022 mc->mc_alloc_groups++; in metaslab_group_alloc_update()
1023 mutex_exit(&mc->mc_lock); in metaslab_group_alloc_update()
1025 mutex_exit(&mg->mg_lock); in metaslab_group_alloc_update()
1034 int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); in metaslab_sort_by_flushed()
1038 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; in metaslab_sort_by_flushed()
1039 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; in metaslab_sort_by_flushed()
1044 return (TREE_CMP(a->ms_id, b->ms_id)); in metaslab_sort_by_flushed()
1050 spa_t *spa = mc->mc_spa; in metaslab_group_create()
1054 mg_allocator[spa->spa_alloc_count]), KM_SLEEP); in metaslab_group_create()
1055 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_group_create()
1056 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_group_create()
1057 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); in metaslab_group_create()
1058 avl_create(&mg->mg_metaslab_tree, metaslab_compare, in metaslab_group_create()
1060 mg->mg_vd = vd; in metaslab_group_create()
1061 mg->mg_class = mc; in metaslab_group_create()
1062 mg->mg_activation_count = 0; in metaslab_group_create()
1063 mg->mg_initialized = B_FALSE; in metaslab_group_create()
1064 mg->mg_no_free_space = B_TRUE; in metaslab_group_create()
1066 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_create()
1067 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_create()
1068 zfs_refcount_create_tracked(&mga->mga_queue_depth); in metaslab_group_create()
1077 spa_t *spa = mg->mg_class->mc_spa; in metaslab_group_destroy()
1079 ASSERT(mg->mg_prev == NULL); in metaslab_group_destroy()
1080 ASSERT(mg->mg_next == NULL); in metaslab_group_destroy()
1086 ASSERT(mg->mg_activation_count <= 0); in metaslab_group_destroy()
1088 avl_destroy(&mg->mg_metaslab_tree); in metaslab_group_destroy()
1089 mutex_destroy(&mg->mg_lock); in metaslab_group_destroy()
1090 mutex_destroy(&mg->mg_ms_disabled_lock); in metaslab_group_destroy()
1091 cv_destroy(&mg->mg_ms_disabled_cv); in metaslab_group_destroy()
1093 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_destroy()
1094 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_destroy()
1095 zfs_refcount_destroy(&mga->mga_queue_depth); in metaslab_group_destroy()
1098 mg_allocator[spa->spa_alloc_count])); in metaslab_group_destroy()
1104 metaslab_class_t *mc = mg->mg_class; in metaslab_group_activate()
1105 spa_t *spa = mc->mc_spa; in metaslab_group_activate()
1110 ASSERT(mg->mg_prev == NULL); in metaslab_group_activate()
1111 ASSERT(mg->mg_next == NULL); in metaslab_group_activate()
1112 ASSERT(mg->mg_activation_count <= 0); in metaslab_group_activate()
1114 if (++mg->mg_activation_count <= 0) in metaslab_group_activate()
1119 if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { in metaslab_group_activate()
1120 mg->mg_prev = mg; in metaslab_group_activate()
1121 mg->mg_next = mg; in metaslab_group_activate()
1123 mgnext = mgprev->mg_next; in metaslab_group_activate()
1124 mg->mg_prev = mgprev; in metaslab_group_activate()
1125 mg->mg_next = mgnext; in metaslab_group_activate()
1126 mgprev->mg_next = mg; in metaslab_group_activate()
1127 mgnext->mg_prev = mg; in metaslab_group_activate()
1129 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_activate()
1130 mc->mc_allocator[i].mca_rotor = mg; in metaslab_group_activate()
1131 mg = mg->mg_next; in metaslab_group_activate()
1145 metaslab_class_t *mc = mg->mg_class; in metaslab_group_passivate()
1146 spa_t *spa = mc->mc_spa; in metaslab_group_passivate()
1153 if (--mg->mg_activation_count != 0) { in metaslab_group_passivate()
1154 for (int i = 0; i < spa->spa_alloc_count; i++) in metaslab_group_passivate()
1155 ASSERT(mc->mc_allocator[i].mca_rotor != mg); in metaslab_group_passivate()
1156 ASSERT(mg->mg_prev == NULL); in metaslab_group_passivate()
1157 ASSERT(mg->mg_next == NULL); in metaslab_group_passivate()
1158 ASSERT(mg->mg_activation_count < 0); in metaslab_group_passivate()
1176 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); in metaslab_group_passivate()
1177 taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); in metaslab_group_passivate()
1178 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); in metaslab_group_passivate()
1180 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_passivate()
1181 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_passivate()
1182 metaslab_t *msp = mga->mga_primary; in metaslab_group_passivate()
1184 mutex_enter(&msp->ms_lock); in metaslab_group_passivate()
1187 mutex_exit(&msp->ms_lock); in metaslab_group_passivate()
1189 msp = mga->mga_secondary; in metaslab_group_passivate()
1191 mutex_enter(&msp->ms_lock); in metaslab_group_passivate()
1194 mutex_exit(&msp->ms_lock); in metaslab_group_passivate()
1198 mgprev = mg->mg_prev; in metaslab_group_passivate()
1199 mgnext = mg->mg_next; in metaslab_group_passivate()
1204 mgprev->mg_next = mgnext; in metaslab_group_passivate()
1205 mgnext->mg_prev = mgprev; in metaslab_group_passivate()
1207 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_passivate()
1208 if (mc->mc_allocator[i].mca_rotor == mg) in metaslab_group_passivate()
1209 mc->mc_allocator[i].mca_rotor = mgnext; in metaslab_group_passivate()
1212 mg->mg_prev = NULL; in metaslab_group_passivate()
1213 mg->mg_next = NULL; in metaslab_group_passivate()
1220 vdev_t *vd = mg->mg_vd; in metaslab_group_initialized()
1221 vdev_stat_t *vs = &vd->vdev_stat; in metaslab_group_initialized()
1223 return (vs->vs_space != 0 && mg->mg_activation_count > 0); in metaslab_group_initialized()
1233 mutex_enter(&mg->mg_lock); in metaslab_group_get_space()
1234 uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); in metaslab_group_get_space()
1235 mutex_exit(&mg->mg_lock); in metaslab_group_get_space()
1236 return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); in metaslab_group_get_space()
1243 avl_tree_t *t = &mg->mg_metaslab_tree; in metaslab_group_histogram_verify()
1244 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_verify()
1255 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_verify()
1258 VERIFY3P(msp->ms_group, ==, mg); in metaslab_group_histogram_verify()
1260 if (msp->ms_sm == NULL) in metaslab_group_histogram_verify()
1265 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_verify()
1270 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); in metaslab_group_histogram_verify()
1272 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_verify()
1280 metaslab_class_t *mc = mg->mg_class; in metaslab_group_histogram_add()
1281 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_add()
1283 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_histogram_add()
1284 if (msp->ms_sm == NULL) in metaslab_group_histogram_add()
1287 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_add()
1288 mutex_enter(&mc->mc_lock); in metaslab_group_histogram_add()
1290 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_group_histogram_add()
1291 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_group_histogram_add()
1292 mg->mg_histogram[i + ashift] += in metaslab_group_histogram_add()
1293 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_add()
1294 mc->mc_histogram[i + ashift] += in metaslab_group_histogram_add()
1295 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_add()
1297 mutex_exit(&mc->mc_lock); in metaslab_group_histogram_add()
1298 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_add()
1304 metaslab_class_t *mc = mg->mg_class; in metaslab_group_histogram_remove()
1305 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_remove()
1307 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_histogram_remove()
1308 if (msp->ms_sm == NULL) in metaslab_group_histogram_remove()
1311 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_remove()
1312 mutex_enter(&mc->mc_lock); in metaslab_group_histogram_remove()
1314 ASSERT3U(mg->mg_histogram[i + ashift], >=, in metaslab_group_histogram_remove()
1315 msp->ms_sm->sm_phys->smp_histogram[i]); in metaslab_group_histogram_remove()
1316 ASSERT3U(mc->mc_histogram[i + ashift], >=, in metaslab_group_histogram_remove()
1317 msp->ms_sm->sm_phys->smp_histogram[i]); in metaslab_group_histogram_remove()
1318 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_group_histogram_remove()
1319 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_group_histogram_remove()
1321 mg->mg_histogram[i + ashift] -= in metaslab_group_histogram_remove()
1322 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_remove()
1323 mc->mc_histogram[i + ashift] -= in metaslab_group_histogram_remove()
1324 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_remove()
1326 mutex_exit(&mc->mc_lock); in metaslab_group_histogram_remove()
1327 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_remove()
1333 ASSERT(msp->ms_group == NULL); in metaslab_group_add()
1334 mutex_enter(&mg->mg_lock); in metaslab_group_add()
1335 msp->ms_group = mg; in metaslab_group_add()
1336 msp->ms_weight = 0; in metaslab_group_add()
1337 avl_add(&mg->mg_metaslab_tree, msp); in metaslab_group_add()
1338 mutex_exit(&mg->mg_lock); in metaslab_group_add()
1340 mutex_enter(&msp->ms_lock); in metaslab_group_add()
1342 mutex_exit(&msp->ms_lock); in metaslab_group_add()
1348 mutex_enter(&msp->ms_lock); in metaslab_group_remove()
1350 mutex_exit(&msp->ms_lock); in metaslab_group_remove()
1352 mutex_enter(&mg->mg_lock); in metaslab_group_remove()
1353 ASSERT(msp->ms_group == mg); in metaslab_group_remove()
1354 avl_remove(&mg->mg_metaslab_tree, msp); in metaslab_group_remove()
1356 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_group_remove()
1358 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_group_remove()
1359 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_group_remove()
1363 msp->ms_group = NULL; in metaslab_group_remove()
1364 mutex_exit(&mg->mg_lock); in metaslab_group_remove()
1370 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_sort_impl()
1371 ASSERT(MUTEX_HELD(&mg->mg_lock)); in metaslab_group_sort_impl()
1372 ASSERT(msp->ms_group == mg); in metaslab_group_sort_impl()
1374 avl_remove(&mg->mg_metaslab_tree, msp); in metaslab_group_sort_impl()
1375 msp->ms_weight = weight; in metaslab_group_sort_impl()
1376 avl_add(&mg->mg_metaslab_tree, msp); in metaslab_group_sort_impl()
1388 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_sort()
1390 mutex_enter(&mg->mg_lock); in metaslab_group_sort()
1392 mutex_exit(&mg->mg_lock); in metaslab_group_sort()
1404 vdev_t *vd = mg->mg_vd; in metaslab_group_fragmentation()
1409 for (int m = 0; m < vd->vdev_ms_count; m++) { in metaslab_group_fragmentation()
1410 metaslab_t *msp = vd->vdev_ms[m]; in metaslab_group_fragmentation()
1412 if (msp->ms_group != mg) in metaslab_group_fragmentation()
1415 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) in metaslab_group_fragmentation()
1419 free = (msp->ms_size - metaslab_allocated_space(msp)) / in metaslab_group_fragmentation()
1422 fragmentation += msp->ms_fragmentation * free; in metaslab_group_fragmentation()
1440 * Comparison function for the private size-ordered tree using 32-bit
1450 uint64_t rs_size1 = r1->rs_end - r1->rs_start; in metaslab_rangesize32_compare()
1451 uint64_t rs_size2 = r2->rs_end - r2->rs_start; in metaslab_rangesize32_compare()
1455 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); in metaslab_rangesize32_compare()
1459 * Comparison function for the private size-ordered tree using 64-bit
1469 uint64_t rs_size1 = r1->rs_end - r1->rs_start; in metaslab_rangesize64_compare()
1470 uint64_t rs_size2 = r2->rs_end - r2->rs_start; in metaslab_rangesize64_compare()
1474 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); in metaslab_rangesize64_compare()
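The "cmp + !cmp * TREE_CMP(...)" idiom in the two comparators above avoids a branch: when the sizes differ, !cmp is 0 and the size comparison decides; when they are equal, cmp is 0 and the start offsets break the tie. A minimal standalone illustration, with a conventional three-way comparison macro re-declared locally (assumed to match the real TREE_CMP):

#include <stdint.h>
#include <stdio.h>

#define	TREE_CMP(a, b)	(((a) > (b)) - ((a) < (b)))	/* -1, 0, or 1 */

static int
size_then_start(uint64_t size1, uint64_t start1, uint64_t size2, uint64_t start2)
{
	int cmp = TREE_CMP(size1, size2);

	/*
	 * If sizes differ (cmp != 0), !cmp is 0 and the second term vanishes;
	 * if they are equal (cmp == 0), the start offsets decide.
	 */
	return (cmp + !cmp * TREE_CMP(start1, start2));
}

int
main(void)
{
	printf("%d %d\n",
	    size_then_start(64, 100, 32, 200),	/* sizes differ: 1 */
	    size_then_start(64, 100, 64, 200));	/* equal sizes, earlier start: -1 */
	return (0);
}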
1491 zfs_range_tree_t *rt = mssap->rt; in metaslab_size_sorted_add()
1492 metaslab_rt_arg_t *mrap = mssap->mra; in metaslab_size_sorted_add()
1502 metaslab_rt_arg_t *mrap = rt->rt_arg; in metaslab_size_tree_full_load()
1504 ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); in metaslab_size_tree_full_load()
1505 mrap->mra_floor_shift = 0; in metaslab_size_tree_full_load()
1521 * rely on using both a size-ordered zfs_range_tree_t and an array of in ZFS_BTREE_FIND_IN_BUF_FUNC()
1528 zfs_btree_t *size_tree = mrap->mra_bt; in ZFS_BTREE_FIND_IN_BUF_FUNC()
1533 switch (rt->rt_type) { in ZFS_BTREE_FIND_IN_BUF_FUNC()
1545 panic("Invalid range seg type %d", rt->rt_type); in ZFS_BTREE_FIND_IN_BUF_FUNC()
1548 mrap->mra_floor_shift = metaslab_by_size_min_shift; in ZFS_BTREE_FIND_IN_BUF_FUNC()
1556 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_destroy()
1566 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_add()
1568 if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < in metaslab_rt_add()
1569 (1ULL << mrap->mra_floor_shift)) in metaslab_rt_add()
1579 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_remove()
1581 if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << in metaslab_rt_remove()
1582 mrap->mra_floor_shift)) in metaslab_rt_remove()
1592 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_vacate()
1619 zfs_btree_t *t = &msp->ms_allocatable_by_size; in metaslab_largest_allocatable()
1625 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_largest_allocatable()
1631 return (zfs_rs_get_end(rs, msp->ms_allocatable) - zfs_rs_get_start(rs, in metaslab_largest_allocatable()
1632 msp->ms_allocatable)); in metaslab_largest_allocatable()
1642 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_largest_unflushed_free()
1644 if (msp->ms_unflushed_frees == NULL) in metaslab_largest_unflushed_free()
1647 if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) in metaslab_largest_unflushed_free()
1648 metaslab_size_tree_full_load(msp->ms_unflushed_frees); in metaslab_largest_unflushed_free()
1649 zfs_range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, in metaslab_largest_unflushed_free()
1660 * bound for the largest currently-usable free segment in the in metaslab_largest_unflushed_free()
1665 * briefly and should eventually self-correct as frees are no longer in metaslab_largest_unflushed_free()
1677 uint64_t rstart = zfs_rs_get_start(rs, msp->ms_unflushed_frees); in metaslab_largest_unflushed_free()
1678 uint64_t rsize = zfs_rs_get_end(rs, msp->ms_unflushed_frees) - rstart; in metaslab_largest_unflushed_free()
1682 boolean_t found = zfs_range_tree_find_in(msp->ms_defer[t], in metaslab_largest_unflushed_free()
1687 rsize = start - rstart; in metaslab_largest_unflushed_free()
1693 boolean_t found = zfs_range_tree_find_in(msp->ms_freed, rstart, in metaslab_largest_unflushed_free()
1696 rsize = start - rstart; in metaslab_largest_unflushed_free()
1722 if (rs == NULL || zfs_rs_get_end(rs, rt) - in metaslab_block_find()
1734 * suitable block to allocate. This will search the specified B-tree looking
1742 *cursor = rt->rt_start; in metaslab_block_picker()
1743 zfs_btree_t *bt = &rt->rt_root; in metaslab_block_picker()
1753 while (rs != NULL && (zfs_rs_get_start(rs, rt) - first_found <= in metaslab_block_picker()
1757 *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, in metaslab_block_picker()
1768 return (-1ULL); in metaslab_block_picker()
1782 { "new-dynamic", metaslab_ndf_alloc },
1788 int a = ARRAY_SIZE(metaslab_allocators) - 1; in spa_find_allocator_byname()
1789 if (strcmp("new-dynamic", val) == 0) in spa_find_allocator_byname()
1790 return (-1); /* remove when ndf is working */ in spa_find_allocator_byname()
1791 for (; a >= 0; a--) { in spa_find_allocator_byname()
1795 return (-1); in spa_find_allocator_byname()
1803 spa->spa_active_allocator = a; in spa_set_allocator()
1810 return (spa->spa_active_allocator); in spa_get_allocator()
1872 uint64_t align = max_size & -max_size; in metaslab_df_alloc()
1873 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; in metaslab_df_alloc()
1874 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_df_alloc()
1875 uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; in metaslab_df_alloc()
1878 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_df_alloc()
1886 align = size & -size; in metaslab_df_alloc()
1887 cursor = &msp->ms_lbas[highbit64(align) - 1]; in metaslab_df_alloc()
1888 offset = -1; in metaslab_df_alloc()
1892 if (max_size != size && offset == -1) { in metaslab_df_alloc()
1893 align = size & -size; in metaslab_df_alloc()
1894 cursor = &msp->ms_lbas[highbit64(align) - 1]; in metaslab_df_alloc()
1900 if (offset == -1) { in metaslab_df_alloc()
1902 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) in metaslab_df_alloc()
1903 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_df_alloc()
1907 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); in metaslab_df_alloc()
1911 rs = metaslab_block_find(&msp->ms_allocatable_by_size, in metaslab_df_alloc()
1912 rt, msp->ms_start, size, max_size, &where); in metaslab_df_alloc()
1917 *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, in metaslab_df_alloc()
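Tying the metaslab_df_alloc() lines above back to the header comment (source lines 173-183): allocation stays first-fit from a per-alignment cursor while the metaslab looks healthy, and falls back to best-fit from the size-sorted tree otherwise. A hedged, self-contained sketch of that decision; the threshold names mirror the real tunables, but the default values and the exact combination of conditions are simplified assumptions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed defaults, for illustration; the real tunables carry these names. */
static uint64_t metaslab_df_alloc_threshold = 128 * 1024;
static uint64_t metaslab_df_free_pct = 4;

/*
 * Simplified policy sketch, not the real metaslab_df_alloc(): keep cheap
 * first-fit (cursor) allocation while the metaslab has enough free space and
 * a large enough contiguous segment; otherwise fall back to best-fit lookups
 * in the size-sorted tree.
 */
static bool
use_first_fit(uint64_t free_space, uint64_t ms_size, uint64_t largest_seg)
{
	uint64_t free_pct = free_space * 100 / ms_size;	/* as on source line 1875 */

	return (free_pct >= metaslab_df_free_pct &&
	    largest_seg >= metaslab_df_alloc_threshold);
}

int
main(void)
{
	printf("%d %d\n",
	    use_first_fit(50ULL << 30, 100ULL << 30, 1ULL << 20), /* healthy: 1 */
	    use_first_fit(2ULL << 30, 100ULL << 30, 1ULL << 20)); /* 2% free: 0 */
	return (0);
}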
1928 * Cursor fit block allocator -
1939 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_cf_alloc()
1940 zfs_btree_t *t = &msp->ms_allocatable_by_size; in metaslab_cf_alloc()
1941 uint64_t *cursor = &msp->ms_lbas[0]; in metaslab_cf_alloc()
1942 uint64_t *cursor_end = &msp->ms_lbas[1]; in metaslab_cf_alloc()
1945 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_cf_alloc()
1953 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_cf_alloc()
1955 if (rs == NULL || (zfs_rs_get_end(rs, rt) - in metaslab_cf_alloc()
1957 return (-1ULL); in metaslab_cf_alloc()
1964 *found_size = MIN(*cursor_end - offset, max_size); in metaslab_cf_alloc()
1972 * New dynamic fit allocator -
1989 zfs_btree_t *t = &msp->ms_allocatable->rt_root; in metaslab_ndf_alloc()
1990 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_ndf_alloc()
1995 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; in metaslab_ndf_alloc()
1998 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_ndf_alloc()
2001 return (-1ULL); in metaslab_ndf_alloc()
2007 if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < in metaslab_ndf_alloc()
2010 cursor = &msp->ms_lbas[hbit - 1]; in metaslab_ndf_alloc()
2016 if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < in metaslab_ndf_alloc()
2018 t = &msp->ms_allocatable_by_size; in metaslab_ndf_alloc()
2030 if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) { in metaslab_ndf_alloc()
2031 *found_size = MIN(zfs_rs_get_end(rs, rt) - in metaslab_ndf_alloc()
2036 return (-1ULL); in metaslab_ndf_alloc()
2046 * Wait for any in-progress metaslab loads to complete.
2051 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load_wait()
2053 while (msp->ms_loading) { in metaslab_load_wait()
2054 ASSERT(!msp->ms_loaded); in metaslab_load_wait()
2055 cv_wait(&msp->ms_load_cv, &msp->ms_lock); in metaslab_load_wait()
2060 * Wait for any in-progress flushing to complete.
2065 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush_wait()
2067 while (msp->ms_flushing) in metaslab_flush_wait()
2068 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); in metaslab_flush_wait()
2080 return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); in metaslab_idx_func()
2086 return (msp->ms_allocated_space); in metaslab_allocated_space()
2090 * Verify that the space accounting on disk matches the in-core range_trees.
2095 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_verify_space()
2099 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_verify_space()
2100 ASSERT(!msp->ms_condensing); in metaslab_verify_space()
2108 * allocated space map. Calling this in non-syncing context in metaslab_verify_space()
2112 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || in metaslab_verify_space()
2113 !msp->ms_loaded) in metaslab_verify_space()
2121 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); in metaslab_verify_space()
2123 ASSERT3U(space_map_allocated(msp->ms_sm), >=, in metaslab_verify_space()
2124 zfs_range_tree_space(msp->ms_unflushed_frees)); in metaslab_verify_space()
2127 space_map_allocated(msp->ms_sm) + in metaslab_verify_space()
2128 zfs_range_tree_space(msp->ms_unflushed_allocs) - in metaslab_verify_space()
2129 zfs_range_tree_space(msp->ms_unflushed_frees)); in metaslab_verify_space()
2131 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); in metaslab_verify_space()
2139 zfs_range_tree_space(msp->ms_allocating[(txg + t) & in metaslab_verify_space()
2142 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, in metaslab_verify_space()
2143 msp->ms_allocating_total); in metaslab_verify_space()
2145 ASSERT3U(msp->ms_deferspace, ==, in metaslab_verify_space()
2146 zfs_range_tree_space(msp->ms_defer[0]) + in metaslab_verify_space()
2147 zfs_range_tree_space(msp->ms_defer[1])); in metaslab_verify_space()
2149 msp_free_space = zfs_range_tree_space(msp->ms_allocatable) + in metaslab_verify_space()
2150 allocating + msp->ms_deferspace + in metaslab_verify_space()
2151 zfs_range_tree_space(msp->ms_freed); in metaslab_verify_space()
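The metaslab_verify_space() checks above reduce to one identity between the on-disk view and the in-core range trees; written out (the final comparison of the two totals sits on a line that did not match the query):

	metaslab_allocated_space(msp) = space_map_allocated(ms_sm)
	                                + space(ms_unflushed_allocs)
	                                - space(ms_unflushed_frees)
	sm_free_space  = ms_size - metaslab_allocated_space(msp)
	msp_free_space = space(ms_allocatable)
	                 + allocating (summed over TXG_CONCURRENT_STATES txgs)
	                 + ms_deferspace + space(ms_freed)
	expected:        sm_free_space == msp_free_space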
2163 ASSERT(msp->ms_loaded); in metaslab_aux_histograms_clear()
2165 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); in metaslab_aux_histograms_clear()
2167 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); in metaslab_aux_histograms_clear()
2184 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); in metaslab_aux_histogram_add()
2186 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { in metaslab_aux_histogram_add()
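One worked case for the metaslab_aux_histogram_add() scaling above; SPACE_MAP_HISTOGRAM_SIZE = 32, shift = 9, and the clamping of idx to the last bucket are assumptions for the example, since those lines did not match:

	i = 42, shift = 9, idx clamped to 31 (the last space-map bucket)
	histogram[31] += rt_histogram[42] << (42 - 31 - 9)
	    => one oversized segment is counted as 1 << 2 = 4 entries
	4 entries * 2^(31+9) bytes each = 2^42 bytes, matching the segment's
	size class, so the top bucket keeps roughly the right amount of space
	even though it can no longer distinguish sizes.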
2206 space_map_t *sm = msp->ms_sm; in metaslab_aux_histograms_update()
2215 if (msp->ms_loaded) { in metaslab_aux_histograms_update()
2218 metaslab_aux_histogram_add(msp->ms_synchist, in metaslab_aux_histograms_update()
2219 sm->sm_shift, msp->ms_freed); in metaslab_aux_histograms_update()
2222 metaslab_aux_histogram_add(msp->ms_deferhist[t], in metaslab_aux_histograms_update()
2223 sm->sm_shift, msp->ms_defer[t]); in metaslab_aux_histograms_update()
2227 metaslab_aux_histogram_add(msp->ms_synchist, in metaslab_aux_histograms_update()
2228 sm->sm_shift, msp->ms_freeing); in metaslab_aux_histograms_update()
2239 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_aux_histograms_update_done()
2240 space_map_t *sm = msp->ms_sm; in metaslab_aux_histograms_update_done()
2257 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, in metaslab_aux_histograms_update_done()
2258 sizeof (msp->ms_synchist)); in metaslab_aux_histograms_update_done()
2260 memset(msp->ms_deferhist[hist_index], 0, in metaslab_aux_histograms_update_done()
2261 sizeof (msp->ms_deferhist[hist_index])); in metaslab_aux_histograms_update_done()
2263 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); in metaslab_aux_histograms_update_done()
2274 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_verify_weight_and_frag()
2288 if (msp->ms_group == NULL) in metaslab_verify_weight_and_frag()
2293 * fragmentation and ms_max_size as is - there is nothing for in metaslab_verify_weight_and_frag()
2296 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_verify_weight_and_frag()
2297 if (vd->vdev_removing) in metaslab_verify_weight_and_frag()
2306 if (txg_list_member(&vd->vdev_ms_list, msp, t)) in metaslab_verify_weight_and_frag()
2311 * This verification checks that our in-memory state is consistent in metaslab_verify_weight_and_frag()
2312 * with what's on disk. If the pool is read-only then there aren't in metaslab_verify_weight_and_frag()
2313 * any changes and we just have the initially-loaded state. in metaslab_verify_weight_and_frag()
2315 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) in metaslab_verify_weight_and_frag()
2318 /* some extra verification for in-core tree if you can */ in metaslab_verify_weight_and_frag()
2319 if (msp->ms_loaded) { in metaslab_verify_weight_and_frag()
2320 zfs_range_tree_stat_verify(msp->ms_allocatable); in metaslab_verify_weight_and_frag()
2321 VERIFY(space_map_histogram_verify(msp->ms_sm, in metaslab_verify_weight_and_frag()
2322 msp->ms_allocatable)); in metaslab_verify_weight_and_frag()
2325 uint64_t weight = msp->ms_weight; in metaslab_verify_weight_and_frag()
2326 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_verify_weight_and_frag()
2327 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); in metaslab_verify_weight_and_frag()
2328 uint64_t frag = msp->ms_fragmentation; in metaslab_verify_weight_and_frag()
2329 uint64_t max_segsize = msp->ms_max_size; in metaslab_verify_weight_and_frag()
2331 msp->ms_weight = 0; in metaslab_verify_weight_and_frag()
2332 msp->ms_fragmentation = 0; in metaslab_verify_weight_and_frag()
2336 * not introduce any side-effects/mutations on the system's state. in metaslab_verify_weight_and_frag()
2347 msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; in metaslab_verify_weight_and_frag()
2349 VERIFY3U(max_segsize, ==, msp->ms_max_size); in metaslab_verify_weight_and_frag()
2355 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || in metaslab_verify_weight_and_frag()
2356 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { in metaslab_verify_weight_and_frag()
2357 msp->ms_fragmentation = frag; in metaslab_verify_weight_and_frag()
2358 msp->ms_weight = weight; in metaslab_verify_weight_and_frag()
2362 VERIFY3U(msp->ms_fragmentation, ==, frag); in metaslab_verify_weight_and_frag()
2363 VERIFY3U(msp->ms_weight, ==, weight); in metaslab_verify_weight_and_frag()
2383 tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; in metaslab_potentially_evict()
2386 &mc->mc_metaslab_txg_list); in metaslab_potentially_evict()
2388 multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); in metaslab_potentially_evict()
2394 &mc->mc_metaslab_txg_list, idx)); in metaslab_potentially_evict()
2396 metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); in metaslab_potentially_evict()
2398 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_potentially_evict()
2415 if (msp->ms_loading) { in metaslab_potentially_evict()
2429 * currently active because they are high-weight in metaslab_potentially_evict()
2433 mutex_enter(&msp->ms_lock); in metaslab_potentially_evict()
2434 if (msp->ms_allocator == -1 && msp->ms_sm != NULL && in metaslab_potentially_evict()
2435 msp->ms_allocating_total == 0) { in metaslab_potentially_evict()
2438 mutex_exit(&msp->ms_lock); in metaslab_potentially_evict()
2453 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load_impl()
2454 ASSERT(msp->ms_loading); in metaslab_load_impl()
2455 ASSERT(!msp->ms_condensing); in metaslab_load_impl()
2474 * metaslab_sync_done() would try to re-add later. in metaslab_load_impl()
2481 uint64_t length = msp->ms_synced_length; in metaslab_load_impl()
2482 mutex_exit(&msp->ms_lock); in metaslab_load_impl()
2486 if (msp->ms_allocatable->rt_arg == NULL) { in metaslab_load_impl()
2489 mrap = msp->ms_allocatable->rt_arg; in metaslab_load_impl()
2490 msp->ms_allocatable->rt_ops = NULL; in metaslab_load_impl()
2491 msp->ms_allocatable->rt_arg = NULL; in metaslab_load_impl()
2493 mrap->mra_bt = &msp->ms_allocatable_by_size; in metaslab_load_impl()
2494 mrap->mra_floor_shift = metaslab_by_size_min_shift; in metaslab_load_impl()
2496 if (msp->ms_sm != NULL) { in metaslab_load_impl()
2497 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, in metaslab_load_impl()
2500 /* Now, populate the size-sorted tree. */ in metaslab_load_impl()
2501 metaslab_rt_create(msp->ms_allocatable, mrap); in metaslab_load_impl()
2502 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; in metaslab_load_impl()
2503 msp->ms_allocatable->rt_arg = mrap; in metaslab_load_impl()
2506 arg.rt = msp->ms_allocatable; in metaslab_load_impl()
2508 zfs_range_tree_walk(msp->ms_allocatable, in metaslab_load_impl()
2512 * Add the size-sorted tree first, since we don't need to load in metaslab_load_impl()
2515 metaslab_rt_create(msp->ms_allocatable, mrap); in metaslab_load_impl()
2516 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; in metaslab_load_impl()
2517 msp->ms_allocatable->rt_arg = mrap; in metaslab_load_impl()
2523 zfs_range_tree_add(msp->ms_allocatable, in metaslab_load_impl()
2524 msp->ms_start, msp->ms_size); in metaslab_load_impl()
2526 if (msp->ms_new) { in metaslab_load_impl()
2535 msp->ms_unflushed_allocs)); in metaslab_load_impl()
2537 msp->ms_unflushed_frees)); in metaslab_load_impl()
2548 mutex_enter(&msp->ms_sync_lock); in metaslab_load_impl()
2549 mutex_enter(&msp->ms_lock); in metaslab_load_impl()
2551 ASSERT(!msp->ms_condensing); in metaslab_load_impl()
2552 ASSERT(!msp->ms_flushing); in metaslab_load_impl()
2555 mutex_exit(&msp->ms_sync_lock); in metaslab_load_impl()
2559 ASSERT3P(msp->ms_group, !=, NULL); in metaslab_load_impl()
2560 msp->ms_loaded = B_TRUE; in metaslab_load_impl()
2567 zfs_range_tree_walk(msp->ms_unflushed_allocs, in metaslab_load_impl()
2568 zfs_range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2569 zfs_range_tree_walk(msp->ms_unflushed_frees, in metaslab_load_impl()
2570 zfs_range_tree_add, msp->ms_allocatable); in metaslab_load_impl()
2572 ASSERT3P(msp->ms_group, !=, NULL); in metaslab_load_impl()
2573 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_load_impl()
2598 zfs_range_tree_walk(msp->ms_freed, in metaslab_load_impl()
2599 zfs_range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2617 zfs_range_tree_walk(msp->ms_defer[t], in metaslab_load_impl()
2618 zfs_range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2626 * has not yet been converted to use segment-based weight, we in metaslab_load_impl()
2633 uint64_t weight = msp->ms_weight; in metaslab_load_impl()
2634 uint64_t max_size = msp->ms_max_size; in metaslab_load_impl()
2637 ASSERT3U(weight, <=, msp->ms_weight); in metaslab_load_impl()
2638 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_load_impl()
2639 ASSERT3U(max_size, <=, msp->ms_max_size); in metaslab_load_impl()
2641 msp->ms_load_time = load_end; in metaslab_load_impl()
2645 "freed %llu, defer %llu + %llu, unloaded time %llu ms, " in metaslab_load_impl()
2646 "loading_time %lld ms, ms_max_size %llu, " in metaslab_load_impl()
2650 msp->ms_group->mg_class->mc_name, in metaslab_load_impl()
2651 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_load_impl()
2652 (u_longlong_t)msp->ms_id, in metaslab_load_impl()
2653 (u_longlong_t)space_map_length(msp->ms_sm), in metaslab_load_impl()
2654 (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs), in metaslab_load_impl()
2655 (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees), in metaslab_load_impl()
2656 (u_longlong_t)zfs_range_tree_space(msp->ms_freed), in metaslab_load_impl()
2657 (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]), in metaslab_load_impl()
2658 (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]), in metaslab_load_impl()
2659 (longlong_t)((load_start - msp->ms_unload_time) / 1000000), in metaslab_load_impl()
2660 (longlong_t)((load_end - load_start) / 1000000), in metaslab_load_impl()
2661 (u_longlong_t)msp->ms_max_size, in metaslab_load_impl()
2662 (u_longlong_t)msp->ms_max_size - max_size, in metaslab_load_impl()
2663 (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); in metaslab_load_impl()
2666 mutex_exit(&msp->ms_sync_lock); in metaslab_load_impl()
2673 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load()
2680 if (msp->ms_loaded) in metaslab_load()
2682 VERIFY(!msp->ms_loading); in metaslab_load()
2683 ASSERT(!msp->ms_condensing); in metaslab_load()
2691 msp->ms_loading = B_TRUE; in metaslab_load()
2694 * Wait for any in-progress flushing to finish as we drop the ms_lock in metaslab_load()
2698 if (msp->ms_flushing) in metaslab_load()
2706 ASSERT(!msp->ms_loaded); in metaslab_load()
2713 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == in metaslab_load()
2714 msp->ms_group->mg_class) { in metaslab_load()
2715 metaslab_potentially_evict(msp->ms_group->mg_class); in metaslab_load()
2720 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load()
2721 msp->ms_loading = B_FALSE; in metaslab_load()
2722 cv_broadcast(&msp->ms_load_cv); in metaslab_load()
2730 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_unload()
2737 if (!msp->ms_loaded) in metaslab_unload()
2740 zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); in metaslab_unload()
2741 msp->ms_loaded = B_FALSE; in metaslab_unload()
2742 msp->ms_unload_time = gethrtime(); in metaslab_unload()
2744 msp->ms_activation_weight = 0; in metaslab_unload()
2745 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; in metaslab_unload()
2747 if (msp->ms_group != NULL) { in metaslab_unload()
2748 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_unload()
2750 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_unload()
2751 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_unload()
2755 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unload()
2759 "loaded %llu ms ago, max_size %llu", in metaslab_unload()
2761 msp->ms_group->mg_class->mc_name, in metaslab_unload()
2762 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_unload()
2763 (u_longlong_t)msp->ms_id, in metaslab_unload()
2764 (u_longlong_t)msp->ms_weight, in metaslab_unload()
2765 (u_longlong_t)msp->ms_selected_txg, in metaslab_unload()
2766 (u_longlong_t)(NSEC2SEC(msp->ms_unload_time) - in metaslab_unload()
2767 msp->ms_selected_time), in metaslab_unload()
2768 (u_longlong_t)msp->ms_alloc_txg, in metaslab_unload()
2769 (u_longlong_t)(msp->ms_unload_time - in metaslab_unload()
2770 msp->ms_load_time) / 1000 / 1000, in metaslab_unload()
2771 (u_longlong_t)msp->ms_max_size); in metaslab_unload()
2778 * loaded ones have it calculated from their in-core range tree in metaslab_unload()
2780 * available in-core, whether it is loaded or not. in metaslab_unload()
2786 if (msp->ms_group != NULL) in metaslab_unload()
2791 * We want to optimize the memory use of the per-metaslab range
2793 * units of sectors, zero-indexing from the start of the metaslab. If
2794 * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
2801 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && in metaslab_calculate_range_tree_type()
2803 *shift = vdev->vdev_ashift; in metaslab_calculate_range_tree_type()
2804 *start = msp->ms_start; in metaslab_calculate_range_tree_type()
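A quick worked case for the 32-bit versus 64-bit decision in metaslab_calculate_range_tree_type() above (the vdev values are made up):

	vdev_ms_shift = 34 (16 GiB metaslabs), vdev_ashift = 12 (4 KiB sectors)
	34 - 12 = 22 < 32  =>  32-bit range segs, *start = ms_start, *shift = 12
	largest in-metaslab offset = 2^34 / 2^12 = 2^22 sectors, well within 32 bits

Only when vdev_ms_shift - vdev_ashift reaches 32, or the 64-bit override
mentioned at source line 304 is set, do the per-metaslab trees fall back to
64-bit segments.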
2816 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_set_selected_txg()
2817 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_set_selected_txg()
2819 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_set_selected_txg()
2820 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_set_selected_txg()
2822 msp->ms_selected_txg = txg; in metaslab_set_selected_txg()
2823 msp->ms_selected_time = gethrestime_sec(); in metaslab_set_selected_txg()
2834 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); in metaslab_space_update()
2835 ASSERT(vd->vdev_ms_count != 0); in metaslab_space_update()
2845 vdev_t *vd = mg->mg_vd; in metaslab_init()
2846 spa_t *spa = vd->vdev_spa; in metaslab_init()
2847 objset_t *mos = spa->spa_meta_objset; in metaslab_init()
2848 metaslab_t *ms; in metaslab_init() local
2851 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); in metaslab_init()
2852 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_init()
2853 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_init()
2854 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); in metaslab_init()
2855 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); in metaslab_init()
2856 multilist_link_init(&ms->ms_class_txg_node); in metaslab_init()
2858 ms->ms_id = id; in metaslab_init()
2859 ms->ms_start = id << vd->vdev_ms_shift; in metaslab_init()
2860 ms->ms_size = 1ULL << vd->vdev_ms_shift; in metaslab_init()
2861 ms->ms_allocator = -1; in metaslab_init()
2862 ms->ms_new = B_TRUE; in metaslab_init()
2864 vdev_ops_t *ops = vd->vdev_ops; in metaslab_init()
2865 if (ops->vdev_op_metaslab_init != NULL) in metaslab_init()
2866 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); in metaslab_init()
2880 if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && in metaslab_init()
2881 !spa->spa_read_spacemaps)) { in metaslab_init()
2882 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, in metaslab_init()
2883 ms->ms_size, vd->vdev_ashift); in metaslab_init()
2886 kmem_free(ms, sizeof (metaslab_t)); in metaslab_init()
2890 ASSERT(ms->ms_sm != NULL); in metaslab_init()
2891 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); in metaslab_init()
2896 metaslab_calculate_range_tree_type(vd, ms, &start, &shift); in metaslab_init()
2898 ms->ms_allocatable = zfs_range_tree_create(NULL, type, NULL, start, in metaslab_init()
2901 ms->ms_allocating[t] = zfs_range_tree_create(NULL, type, in metaslab_init()
2904 ms->ms_freeing = zfs_range_tree_create(NULL, type, NULL, start, shift); in metaslab_init()
2905 ms->ms_freed = zfs_range_tree_create(NULL, type, NULL, start, shift); in metaslab_init()
2907 ms->ms_defer[t] = zfs_range_tree_create(NULL, type, NULL, in metaslab_init()
2910 ms->ms_checkpointing = in metaslab_init()
2912 ms->ms_unflushed_allocs = in metaslab_init()
2916 mrap->mra_bt = &ms->ms_unflushed_frees_by_size; in metaslab_init()
2917 mrap->mra_floor_shift = metaslab_by_size_min_shift; in metaslab_init()
2918 ms->ms_unflushed_frees = zfs_range_tree_create(&metaslab_rt_ops, in metaslab_init()
2921 ms->ms_trim = zfs_range_tree_create(NULL, type, NULL, start, shift); in metaslab_init()
2923 metaslab_group_add(mg, ms); in metaslab_init()
2924 metaslab_set_fragmentation(ms, B_FALSE); in metaslab_init()
2936 metaslab_sync_done(ms, 0); in metaslab_init()
2937 metaslab_space_update(vd, mg->mg_class, in metaslab_init()
2938 metaslab_allocated_space(ms), 0, 0); in metaslab_init()
2943 vdev_dirty(vd, VDD_METASLAB, ms, txg); in metaslab_init()
2946 *msp = ms; in metaslab_init()
2954 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_fini_flush_data()
2957 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), in metaslab_fini_flush_data()
2963 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_fini_flush_data()
2964 avl_remove(&spa->spa_metaslabs_by_flushed, msp); in metaslab_fini_flush_data()
2965 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_fini_flush_data()
2973 metaslab_unflushed_changes_memused(metaslab_t *ms) in metaslab_unflushed_changes_memused() argument
2975 return ((zfs_range_tree_numsegs(ms->ms_unflushed_allocs) + in metaslab_unflushed_changes_memused()
2976 zfs_range_tree_numsegs(ms->ms_unflushed_frees)) * in metaslab_unflushed_changes_memused()
2977 ms->ms_unflushed_allocs->rt_root.bt_elem_size); in metaslab_unflushed_changes_memused()
2983 metaslab_group_t *mg = msp->ms_group; in metaslab_fini()
2984 vdev_t *vd = mg->mg_vd; in metaslab_fini()
2985 spa_t *spa = vd->vdev_spa; in metaslab_fini()
2991 mutex_enter(&msp->ms_lock); in metaslab_fini()
2992 VERIFY(msp->ms_group == NULL); in metaslab_fini()
2999 if (!msp->ms_new) { in metaslab_fini()
3000 metaslab_space_update(vd, mg->mg_class, in metaslab_fini()
3001 -metaslab_allocated_space(msp), 0, -msp->ms_size); in metaslab_fini()
3004 space_map_close(msp->ms_sm); in metaslab_fini()
3005 msp->ms_sm = NULL; in metaslab_fini()
3009 zfs_range_tree_destroy(msp->ms_allocatable); in metaslab_fini()
3010 zfs_range_tree_destroy(msp->ms_freeing); in metaslab_fini()
3011 zfs_range_tree_destroy(msp->ms_freed); in metaslab_fini()
3013 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_fini()
3015 spa->spa_unflushed_stats.sus_memused -= in metaslab_fini()
3017 zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_fini()
3018 zfs_range_tree_destroy(msp->ms_unflushed_allocs); in metaslab_fini()
3019 zfs_range_tree_destroy(msp->ms_checkpointing); in metaslab_fini()
3020 zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_fini()
3021 zfs_range_tree_destroy(msp->ms_unflushed_frees); in metaslab_fini()
3024 zfs_range_tree_destroy(msp->ms_allocating[t]); in metaslab_fini()
3027 zfs_range_tree_destroy(msp->ms_defer[t]); in metaslab_fini()
3029 ASSERT0(msp->ms_deferspace); in metaslab_fini()
3032 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); in metaslab_fini()
3034 zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); in metaslab_fini()
3035 zfs_range_tree_destroy(msp->ms_trim); in metaslab_fini()
3037 mutex_exit(&msp->ms_lock); in metaslab_fini()
3038 cv_destroy(&msp->ms_load_cv); in metaslab_fini()
3039 cv_destroy(&msp->ms_flush_cv); in metaslab_fini()
3040 mutex_destroy(&msp->ms_lock); in metaslab_fini()
3041 mutex_destroy(&msp->ms_sync_lock); in metaslab_fini()
3042 ASSERT3U(msp->ms_allocator, ==, -1); in metaslab_fini()
3101 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_set_fragmentation()
3108 msp->ms_fragmentation = ZFS_FRAG_INVALID; in metaslab_set_fragmentation()
3116 if (msp->ms_sm == NULL) { in metaslab_set_fragmentation()
3117 msp->ms_fragmentation = 0; in metaslab_set_fragmentation()
3125 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { in metaslab_set_fragmentation()
3127 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_set_fragmentation()
3139 msp->ms_condense_wanted = B_TRUE; in metaslab_set_fragmentation()
3143 (u_longlong_t)msp->ms_id, in metaslab_set_fragmentation()
3144 (u_longlong_t)vd->vdev_id); in metaslab_set_fragmentation()
3146 msp->ms_fragmentation = ZFS_FRAG_INVALID; in metaslab_set_fragmentation()
3152 uint8_t shift = msp->ms_sm->sm_shift; in metaslab_set_fragmentation()
3154 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, in metaslab_set_fragmentation()
3155 FRAGMENTATION_TABLE_SIZE - 1); in metaslab_set_fragmentation()
3157 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) in metaslab_set_fragmentation()
3160 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); in metaslab_set_fragmentation()
3171 msp->ms_fragmentation = fragmentation; in metaslab_set_fragmentation()
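The assignment above finishes the fragmentation calculation: each space map histogram bucket is mapped to a fragmentation percentage (larger free segments count as less fragmented) and the buckets are averaged, weighted by the amount of free space each one holds. A rough userspace sketch of that weighted average follows, using a made-up lookup table and bucket mapping rather than the real FRAGMENTATION_TABLE values; it is an illustration of the idea, not the in-kernel code.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-bucket fragmentation percentages, smallest bucket first. */
static const int frag_table[] = { 100, 90, 75, 50, 25, 10, 0 };
#define FRAG_TABLE_SIZE (sizeof (frag_table) / sizeof (frag_table[0]))

/*
 * Weighted-average fragmentation: bucket i counts free segments of size
 * roughly (1 << (i + shift)); bigger free segments mean less fragmentation.
 */
static int
fragmentation_estimate(const uint64_t *hist, int nbuckets, int shift)
{
        uint64_t total_space = 0, weighted = 0;

        for (int i = 0; i < nbuckets; i++) {
                if (hist[i] == 0)
                        continue;
                int idx = (i < (int)FRAG_TABLE_SIZE) ? i :
                    (int)FRAG_TABLE_SIZE - 1;
                uint64_t space = hist[i] << (i + shift);
                total_space += space;
                weighted += space * frag_table[idx];
        }
        return (total_space == 0 ? 0 : (int)(weighted / total_space));
}

int
main(void)
{
        uint64_t hist[7] = { 0, 12, 40, 8, 2, 0, 0 };
        printf("fragmentation ~ %d%%\n", fragmentation_estimate(hist, 7, 9));
        return (0);
}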
3175 * Compute a weight -- a selection preference value -- for the given metaslab.
3182 metaslab_group_t *mg = msp->ms_group; in metaslab_space_weight()
3183 vdev_t *vd = mg->mg_vd; in metaslab_space_weight()
3186 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_space_weight()
3191 space = msp->ms_size - metaslab_allocated_space(msp); in metaslab_space_weight()
3194 msp->ms_fragmentation != ZFS_FRAG_INVALID) { in metaslab_space_weight()
3202 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; in metaslab_space_weight()
3225 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { in metaslab_space_weight()
3226 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; in metaslab_space_weight()
3232 * weight to make it preferable to any inactive metaslab so in metaslab_space_weight()
3236 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && in metaslab_space_weight()
3237 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { in metaslab_space_weight()
3238 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_space_weight()
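The space-based weight computed above is essentially the metaslab's free space, discounted when the metaslab is badly fragmented, then (on rotational vdevs with LBA weighting enabled) biased toward low-numbered metaslabs on the faster outer tracks, with the active bits carried over at the end. A hedged model of the arithmetic, using hypothetical field names rather than the in-kernel metaslab_t:

#include <stdint.h>

typedef struct {
        uint64_t ms_size;               /* total metaslab size */
        uint64_t ms_allocated;          /* space already allocated */
        uint64_t ms_id;                 /* index within the vdev */
        uint64_t ms_count;              /* metaslabs per vdev */
        int      ms_fragmentation;      /* 0..100, or -1 if unknown */
        int      ms_nonrot;             /* nonzero for non-rotational vdevs */
} ms_info_t;

static uint64_t
space_weight(const ms_info_t *m)
{
        uint64_t weight = m->ms_size - m->ms_allocated; /* free space */

        /* Discount roughly in proportion to the fragmentation percentage. */
        if (m->ms_fragmentation >= 0)
                weight = (weight * (100 - m->ms_fragmentation)) / 100;

        /*
         * LBA bias for rotational media: lower-numbered metaslabs sit on
         * faster outer tracks, so scale the weight from about 2x down to 1x.
         */
        if (!m->ms_nonrot)
                weight = 2 * weight - (m->ms_id * weight) / m->ms_count;

        return (weight);
}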
3246 * Return the weight of the specified metaslab, according to the segment-based
3257 ASSERT(msp->ms_loaded); in metaslab_weight_from_range_tree()
3259 for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; in metaslab_weight_from_range_tree()
3260 i--) { in metaslab_weight_from_range_tree()
3261 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; in metaslab_weight_from_range_tree()
3262 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; in metaslab_weight_from_range_tree()
3265 segments += msp->ms_allocatable->rt_histogram[i]; in metaslab_weight_from_range_tree()
3288 * Calculate the weight based on the on-disk histogram. Should be applied
3289 * only to unloaded metaslabs (i.e. no incoming allocations) in order to
3290 * give results consistent with the on-disk state.
3295 space_map_t *sm = msp->ms_sm; in metaslab_weight_from_spacemap()
3296 ASSERT(!msp->ms_loaded); in metaslab_weight_from_spacemap()
3299 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); in metaslab_weight_from_spacemap()
3311 deferspace_histogram[i] += msp->ms_synchist[i]; in metaslab_weight_from_spacemap()
3314 deferspace_histogram[i] += msp->ms_deferhist[t][i]; in metaslab_weight_from_spacemap()
3319 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { in metaslab_weight_from_spacemap()
3320 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, in metaslab_weight_from_spacemap()
3323 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; in metaslab_weight_from_spacemap()
3326 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); in metaslab_weight_from_spacemap()
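The function above walks the on-disk histogram from the largest bucket down and encodes the first non-empty bucket's size class and segment count into the weight (after subtracting deferred and syncing space, as the earlier lines show). Below is a simplified stand-alone sketch of that encoding; the packing is hypothetical, whereas the real code uses the WEIGHT_SET_INDEX/WEIGHT_SET_COUNT macros plus flag bits.

#include <stdint.h>

/*
 * Encode a segment-based weight as (size class of the largest populated
 * bucket, number of segments in it). Illustrative packing only.
 */
static uint64_t
segment_weight(const uint64_t *hist, int nbuckets, int shift)
{
        for (int i = nbuckets - 1; i >= 0; i--) {
                if (hist[i] == 0)
                        continue;
                /* high byte: log2 of segment size; low bits: segment count */
                return (((uint64_t)(i + shift) << 56) |
                    (hist[i] & ((1ULL << 56) - 1)));
        }
        return (0);     /* no free segments */
}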
3335 * Compute a segment-based weight for the specified metaslab. The weight
3342 metaslab_group_t *mg = msp->ms_group; in metaslab_segment_weight()
3344 uint8_t shift = mg->mg_vd->vdev_ashift; in metaslab_segment_weight()
3346 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_segment_weight()
3352 int idx = highbit64(msp->ms_size) - 1; in metaslab_segment_weight()
3353 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; in metaslab_segment_weight()
3359 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); in metaslab_segment_weight()
3367 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); in metaslab_segment_weight()
3372 if (metaslab_allocated_space(msp) == msp->ms_size) in metaslab_segment_weight()
3379 if (msp->ms_loaded) { in metaslab_segment_weight()
3390 if (msp->ms_activation_weight != 0 && weight != 0) in metaslab_segment_weight()
3391 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); in metaslab_segment_weight()
3400 * weight. For segment-based weighting we can determine the maximum
3401 * allocation based on the index encoded in its value. For space-based
3402 * weights we rely on the entire weight (excluding the weight-type bit).
3413 if (unlikely(msp->ms_new)) in metaslab_should_allocate()
3423 if (msp->ms_loaded || in metaslab_should_allocate()
3424 (msp->ms_max_size != 0 && !try_hard && gethrtime() < in metaslab_should_allocate()
3425 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) in metaslab_should_allocate()
3426 return (msp->ms_max_size >= asize); in metaslab_should_allocate()
3429 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { in metaslab_should_allocate()
3437 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); in metaslab_should_allocate()
3440 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); in metaslab_should_allocate()
3449 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_weight()
3450 spa_t *spa = vd->vdev_spa; in metaslab_weight()
3453 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_weight()
3467 if (msp->ms_loaded) { in metaslab_weight()
3468 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_weight()
3470 msp->ms_max_size = MAX(msp->ms_max_size, in metaslab_weight()
3475 * Segment-based weighting requires space map histogram support. in metaslab_weight()
3479 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == in metaslab_weight()
3491 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_recalculate_weight_and_sort()
3494 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_recalculate_weight_and_sort()
3495 metaslab_group_sort(msp->ms_group, msp, in metaslab_recalculate_weight_and_sort()
3503 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_activate_allocator()
3504 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_activate_allocator()
3511 ASSERT0(msp->ms_activation_weight); in metaslab_activate_allocator()
3512 msp->ms_activation_weight = msp->ms_weight; in metaslab_activate_allocator()
3513 metaslab_group_sort(mg, msp, msp->ms_weight | in metaslab_activate_allocator()
3519 &mga->mga_primary : &mga->mga_secondary); in metaslab_activate_allocator()
3521 mutex_enter(&mg->mg_lock); in metaslab_activate_allocator()
3523 mutex_exit(&mg->mg_lock); in metaslab_activate_allocator()
3528 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_activate_allocator()
3529 msp->ms_allocator = allocator; in metaslab_activate_allocator()
3530 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); in metaslab_activate_allocator()
3532 ASSERT0(msp->ms_activation_weight); in metaslab_activate_allocator()
3533 msp->ms_activation_weight = msp->ms_weight; in metaslab_activate_allocator()
3535 msp->ms_weight | activation_weight); in metaslab_activate_allocator()
3536 mutex_exit(&mg->mg_lock); in metaslab_activate_allocator()
3544 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_activate()
3557 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { in metaslab_activate()
3558 ASSERT(msp->ms_loaded); in metaslab_activate()
3564 metaslab_group_sort(msp->ms_group, msp, 0); in metaslab_activate()
3583 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { in metaslab_activate()
3584 if (msp->ms_allocator != allocator) in metaslab_activate()
3587 if ((msp->ms_weight & activation_weight) == 0) in metaslab_activate()
3591 msp->ms_primary); in metaslab_activate()
3602 if (msp->ms_weight == 0) { in metaslab_activate()
3603 ASSERT0(zfs_range_tree_space(msp->ms_allocatable)); in metaslab_activate()
3607 if ((error = metaslab_activate_allocator(msp->ms_group, msp, in metaslab_activate()
3612 ASSERT(msp->ms_loaded); in metaslab_activate()
3613 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_activate()
3622 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_passivate_allocator()
3623 ASSERT(msp->ms_loaded); in metaslab_passivate_allocator()
3625 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { in metaslab_passivate_allocator()
3630 mutex_enter(&mg->mg_lock); in metaslab_passivate_allocator()
3631 ASSERT3P(msp->ms_group, ==, mg); in metaslab_passivate_allocator()
3632 ASSERT3S(0, <=, msp->ms_allocator); in metaslab_passivate_allocator()
3633 ASSERT3U(msp->ms_allocator, <, mg->mg_class->mc_spa->spa_alloc_count); in metaslab_passivate_allocator()
3635 metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; in metaslab_passivate_allocator()
3636 if (msp->ms_primary) { in metaslab_passivate_allocator()
3637 ASSERT3P(mga->mga_primary, ==, msp); in metaslab_passivate_allocator()
3638 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_passivate_allocator()
3639 mga->mga_primary = NULL; in metaslab_passivate_allocator()
3641 ASSERT3P(mga->mga_secondary, ==, msp); in metaslab_passivate_allocator()
3642 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_passivate_allocator()
3643 mga->mga_secondary = NULL; in metaslab_passivate_allocator()
3645 msp->ms_allocator = -1; in metaslab_passivate_allocator()
3647 mutex_exit(&mg->mg_lock); in metaslab_passivate_allocator()
3660 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || in metaslab_passivate()
3662 zfs_range_tree_space(msp->ms_allocatable) == 0); in metaslab_passivate()
3665 ASSERT(msp->ms_activation_weight != 0); in metaslab_passivate()
3666 msp->ms_activation_weight = 0; in metaslab_passivate()
3667 metaslab_passivate_allocator(msp->ms_group, msp, weight); in metaslab_passivate()
3668 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_passivate()
3672 * Segment-based metaslabs are activated once and remain active until
3673 * we either fail an allocation attempt (similar to space-based metaslabs)
3685 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_segment_may_passivate()
3687 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) in metaslab_segment_may_passivate()
3696 zfs_range_tree_space(msp->ms_allocatable) * 15 / 16) in metaslab_segment_may_passivate()
3701 * information that is accessible to us is the in-core range tree in metaslab_segment_may_passivate()
3705 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); in metaslab_segment_may_passivate()
3708 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) in metaslab_segment_may_passivate()
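The comparison above passivates a segment-based metaslab once its largest free-segment class has fallen at least zfs_metaslab_switch_threshold buckets below the class it had at activation. A minimal sketch of just that comparison, assuming the same hypothetical weight layout used earlier (high byte holds the size-class index):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical helper: size-class index of the largest free segment. */
static inline int
weight_index(uint64_t weight)
{
        return ((int)(weight >> 56));
}

static bool
should_passivate(uint64_t activation_weight, uint64_t current_weight,
    int switch_threshold)
{
        return (weight_index(current_weight) <=
            weight_index(activation_weight) - switch_threshold);
}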
3716 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_preload()
3717 spa_t *spa = mc->mc_spa; in metaslab_preload()
3720 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); in metaslab_preload()
3722 mutex_enter(&msp->ms_lock); in metaslab_preload()
3725 mutex_exit(&msp->ms_lock); in metaslab_preload()
3732 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_group_preload()
3734 avl_tree_t *t = &mg->mg_metaslab_tree; in metaslab_group_preload()
3740 mutex_enter(&mg->mg_lock); in metaslab_group_preload()
3746 ASSERT3P(msp->ms_group, ==, mg); in metaslab_group_preload()
3754 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { in metaslab_group_preload()
3758 VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, in metaslab_group_preload()
3759 msp, TQ_SLEEP | (m <= spa->spa_alloc_count ? TQ_FRONT : 0)) in metaslab_group_preload()
3762 mutex_exit(&mg->mg_lock); in metaslab_group_preload()
3766 * Determine if the space map's on-disk footprint is past our tolerance for
3773 * 2. Condense if the on-disk space map representation is at least
3775 * (i.e. zfs_condense_pct = 110 and in-core = 1MB, optimal = 1.1MB).
3777 * 3. Do not condense if the on-disk size of the space map does not actually
3780 * Unfortunately, we cannot compute the on-disk size of the space map in this
3783 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3789 space_map_t *sm = msp->ms_sm; in metaslab_should_condense()
3790 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_should_condense()
3791 uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; in metaslab_should_condense()
3793 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_should_condense()
3794 ASSERT(msp->ms_loaded); in metaslab_should_condense()
3796 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); in metaslab_should_condense()
3802 if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0 || in metaslab_should_condense()
3803 msp->ms_condense_wanted) in metaslab_should_condense()
3806 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); in metaslab_should_condense()
3809 msp->ms_allocatable, SM_NO_VDEVID); in metaslab_should_condense()
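Putting rules 2 and 3 from the comment above together, the decision reduces to two length comparisons: the current on-disk map must be inflated past zfs_condense_pct of the optimal size, and large enough (in units of the record size) that rewriting it is worthwhile. A hedged sketch with hypothetical scalar inputs instead of the real space_map_t:

#include <stdbool.h>
#include <stdint.h>

/*
 * sm_len: current on-disk space map length; optimal_len: length a freshly
 * written map would take; record_size: MAX(space map block size, vdev block
 * size). Percentages and thresholds are passed in rather than read from the
 * module tunables.
 */
static bool
should_condense(uint64_t sm_len, uint64_t optimal_len, uint64_t record_size,
    unsigned condense_pct, unsigned block_threshold)
{
        /* Rule 2: on-disk map is at least condense_pct/100 of optimal. */
        bool inflated = sm_len >= optimal_len * condense_pct / 100;

        /* Rule 3: the map is big enough that rewriting it pays off. */
        bool big_enough = sm_len > (uint64_t)block_threshold * record_size;

        return (inflated && big_enough);
}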
3816 * Condense the on-disk space map representation to its minimized form.
3820 * the pool-wide log spacemaps; thus this is effectively a superset of
3827 space_map_t *sm = msp->ms_sm; in metaslab_condense()
3829 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_condense()
3831 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_condense()
3832 ASSERT(msp->ms_loaded); in metaslab_condense()
3833 ASSERT(msp->ms_sm != NULL); in metaslab_condense()
3878 ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ in metaslab_condense()
3882 (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, in metaslab_condense()
3883 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_condense()
3884 spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), in metaslab_condense()
3885 (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable), in metaslab_condense()
3886 msp->ms_condense_wanted ? "TRUE" : "FALSE"); in metaslab_condense()
3888 msp->ms_condense_wanted = B_FALSE; in metaslab_condense()
3892 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, in metaslab_condense()
3898 zfs_range_tree_walk(msp->ms_defer[t], in metaslab_condense()
3903 zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], in metaslab_condense()
3907 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_condense()
3909 spa->spa_unflushed_stats.sus_memused -= in metaslab_condense()
3911 zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_condense()
3912 zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_condense()
3922 msp->ms_condensing = B_TRUE; in metaslab_condense()
3924 mutex_exit(&msp->ms_lock); in metaslab_condense()
3925 uint64_t object = space_map_object(msp->ms_sm); in metaslab_condense()
3934 if (space_map_object(msp->ms_sm) != object) { in metaslab_condense()
3935 object = space_map_object(msp->ms_sm); in metaslab_condense()
3936 dmu_write(spa->spa_meta_objset, in metaslab_condense()
3937 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * in metaslab_condense()
3938 msp->ms_id, sizeof (uint64_t), &object, tx); in metaslab_condense()
3954 zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); in metaslab_condense()
3956 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); in metaslab_condense()
3963 mutex_enter(&msp->ms_lock); in metaslab_condense()
3965 msp->ms_condensing = B_FALSE; in metaslab_condense()
3972 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unflushed_add()
3974 ASSERT(msp->ms_sm != NULL); in metaslab_unflushed_add()
3975 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_unflushed_add()
3976 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_unflushed_add()
3978 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_unflushed_add()
3981 avl_add(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_add()
3982 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_unflushed_add()
3991 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unflushed_bump()
3993 ASSERT(msp->ms_sm != NULL); in metaslab_unflushed_bump()
3995 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); in metaslab_unflushed_bump()
3996 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_unflushed_bump()
3997 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_unflushed_bump()
3999 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); in metaslab_unflushed_bump()
4004 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_unflushed_bump()
4005 avl_remove(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_bump()
4008 avl_add(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_bump()
4009 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_unflushed_bump()
4026 * all the contents of the pool-wide spacemap log). Updates the metaslab's
4027 * metadata and any pool-wide related log space map data (e.g. summary,
4033 metaslab_group_t *mg = msp->ms_group; in metaslab_flush_update()
4034 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_flush_update()
4036 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush_update()
4045 msp->ms_synced_length = space_map_length(msp->ms_sm); in metaslab_flush_update()
4049 * feature being active. In that case this is a no-op. in metaslab_flush_update()
4061 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_flush()
4063 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush()
4067 ASSERT(msp->ms_sm != NULL); in metaslab_flush()
4069 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); in metaslab_flush()
4084 if (msp->ms_loading) in metaslab_flush()
4101 if (msp->ms_loaded && metaslab_should_condense(msp)) { in metaslab_flush()
4102 metaslab_group_t *mg = msp->ms_group; in metaslab_flush()
4110 metaslab_class_histogram_verify(mg->mg_class); in metaslab_flush()
4115 space_map_histogram_clear(msp->ms_sm); in metaslab_flush()
4116 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); in metaslab_flush()
4117 ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); in metaslab_flush()
4119 space_map_histogram_add(msp->ms_sm, in metaslab_flush()
4120 msp->ms_defer[t], tx); in metaslab_flush()
4126 metaslab_class_histogram_verify(mg->mg_class); in metaslab_flush()
4141 msp->ms_flushing = B_TRUE; in metaslab_flush()
4142 uint64_t sm_len_before = space_map_length(msp->ms_sm); in metaslab_flush()
4144 mutex_exit(&msp->ms_lock); in metaslab_flush()
4145 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, in metaslab_flush()
4147 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, in metaslab_flush()
4149 mutex_enter(&msp->ms_lock); in metaslab_flush()
4151 uint64_t sm_len_after = space_map_length(msp->ms_sm); in metaslab_flush()
4157 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_flush()
4158 (u_longlong_t)msp->ms_id, in metaslab_flush()
4160 msp->ms_unflushed_allocs), in metaslab_flush()
4162 msp->ms_unflushed_frees), in metaslab_flush()
4163 (u_longlong_t)(sm_len_after - sm_len_before)); in metaslab_flush()
4166 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_flush()
4168 spa->spa_unflushed_stats.sus_memused -= in metaslab_flush()
4170 zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_flush()
4171 zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_flush()
4181 msp->ms_flushing = B_FALSE; in metaslab_flush()
4182 cv_broadcast(&msp->ms_flush_cv); in metaslab_flush()
4192 metaslab_group_t *mg = msp->ms_group; in metaslab_sync()
4193 vdev_t *vd = mg->mg_vd; in metaslab_sync()
4194 spa_t *spa = vd->vdev_spa; in metaslab_sync()
4196 zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; in metaslab_sync()
4199 ASSERT(!vd->vdev_ishole); in metaslab_sync()
4204 if (msp->ms_new) { in metaslab_sync()
4206 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync()
4207 ASSERT0(zfs_range_tree_space(msp->ms_freed)); in metaslab_sync()
4208 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync()
4209 ASSERT0(zfs_range_tree_space(msp->ms_trim)); in metaslab_sync()
4225 zfs_range_tree_is_empty(msp->ms_freeing) && in metaslab_sync()
4226 zfs_range_tree_is_empty(msp->ms_checkpointing) && in metaslab_sync()
4227 !(msp->ms_loaded && msp->ms_condense_wanted && in metaslab_sync()
4254 if (msp->ms_sm == NULL) { in metaslab_sync()
4261 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * in metaslab_sync()
4262 msp->ms_id, sizeof (uint64_t), &new_object, tx); in metaslab_sync()
4264 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, in metaslab_sync()
4265 msp->ms_start, msp->ms_size, vd->vdev_ashift)); in metaslab_sync()
4266 ASSERT(msp->ms_sm != NULL); in metaslab_sync()
4268 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_sync()
4269 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_sync()
4273 if (!zfs_range_tree_is_empty(msp->ms_checkpointing) && in metaslab_sync()
4274 vd->vdev_checkpoint_sm == NULL) { in metaslab_sync()
4281 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, in metaslab_sync()
4282 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); in metaslab_sync()
4283 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); in metaslab_sync()
4290 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, in metaslab_sync()
4291 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, in metaslab_sync()
4295 mutex_enter(&msp->ms_sync_lock); in metaslab_sync()
4296 mutex_enter(&msp->ms_lock); in metaslab_sync()
4304 metaslab_class_histogram_verify(mg->mg_class); in metaslab_sync()
4307 if (spa->spa_sync_pass == 1 && msp->ms_loaded && in metaslab_sync()
4314 * open-context (ZIL) for future TXGs do not block. in metaslab_sync()
4316 mutex_exit(&msp->ms_lock); in metaslab_sync()
4326 vd->vdev_id, tx); in metaslab_sync()
4327 space_map_write(log_sm, msp->ms_freeing, SM_FREE, in metaslab_sync()
4328 vd->vdev_id, tx); in metaslab_sync()
4329 mutex_enter(&msp->ms_lock); in metaslab_sync()
4331 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_sync()
4333 spa->spa_unflushed_stats.sus_memused -= in metaslab_sync()
4336 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); in metaslab_sync()
4337 zfs_range_tree_remove_xor_add(msp->ms_freeing, in metaslab_sync()
4338 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); in metaslab_sync()
4339 spa->spa_unflushed_stats.sus_memused += in metaslab_sync()
4344 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, in metaslab_sync()
4346 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, in metaslab_sync()
4348 mutex_enter(&msp->ms_lock); in metaslab_sync()
4351 msp->ms_allocated_space += zfs_range_tree_space(alloctree); in metaslab_sync()
4352 ASSERT3U(msp->ms_allocated_space, >=, in metaslab_sync()
4353 zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync()
4354 msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing); in metaslab_sync()
4356 if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) { in metaslab_sync()
4358 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); in metaslab_sync()
4366 mutex_exit(&msp->ms_lock); in metaslab_sync()
4367 space_map_write(vd->vdev_checkpoint_sm, in metaslab_sync()
4368 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); in metaslab_sync()
4369 mutex_enter(&msp->ms_lock); in metaslab_sync()
4371 spa->spa_checkpoint_info.sci_dspace += in metaslab_sync()
4372 zfs_range_tree_space(msp->ms_checkpointing); in metaslab_sync()
4373 vd->vdev_stat.vs_checkpoint_space += in metaslab_sync()
4374 zfs_range_tree_space(msp->ms_checkpointing); in metaslab_sync()
4375 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, in metaslab_sync()
4376 -space_map_allocated(vd->vdev_checkpoint_sm)); in metaslab_sync()
4378 zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL); in metaslab_sync()
4381 if (msp->ms_loaded) { in metaslab_sync()
4385 * to bring the space map's histogram up-to-date so we clear in metaslab_sync()
4388 space_map_histogram_clear(msp->ms_sm); in metaslab_sync()
4389 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); in metaslab_sync()
4394 * any deferred space. This allows the on-disk histogram in metaslab_sync()
4398 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); in metaslab_sync()
4402 * added back into the in-core free tree yet. This will in metaslab_sync()
4408 space_map_histogram_add(msp->ms_sm, in metaslab_sync()
4409 msp->ms_defer[t], tx); in metaslab_sync()
4415 * map histogram. We want to make sure that the on-disk histogram in metaslab_sync()
4420 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); in metaslab_sync()
4425 metaslab_class_histogram_verify(mg->mg_class); in metaslab_sync()
4438 zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed); in metaslab_sync()
4439 ASSERT0(msp->ms_allocated_this_txg); in metaslab_sync()
4441 zfs_range_tree_vacate(msp->ms_freeing, in metaslab_sync()
4442 zfs_range_tree_add, msp->ms_freed); in metaslab_sync()
4444 msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree); in metaslab_sync()
4447 ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); in metaslab_sync()
4448 ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) in metaslab_sync()
4450 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync()
4451 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync()
4453 mutex_exit(&msp->ms_lock); in metaslab_sync()
4460 VERIFY0(dmu_read(mos, vd->vdev_ms_array, in metaslab_sync()
4461 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); in metaslab_sync()
4462 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); in metaslab_sync()
4464 mutex_exit(&msp->ms_sync_lock); in metaslab_sync()
4471 if (!msp->ms_loaded || msp->ms_disabled != 0) in metaslab_evict()
4476 msp->ms_allocating[(txg + t) & TXG_MASK])); in metaslab_evict()
4478 if (msp->ms_allocator != -1) in metaslab_evict()
4479 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); in metaslab_evict()
4492 metaslab_group_t *mg = msp->ms_group; in metaslab_sync_done()
4493 vdev_t *vd = mg->mg_vd; in metaslab_sync_done()
4494 spa_t *spa = vd->vdev_spa; in metaslab_sync_done()
4499 ASSERT(!vd->vdev_ishole); in metaslab_sync_done()
4501 mutex_enter(&msp->ms_lock); in metaslab_sync_done()
4503 if (msp->ms_new) { in metaslab_sync_done()
4505 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); in metaslab_sync_done()
4508 VERIFY0(msp->ms_allocated_this_txg); in metaslab_sync_done()
4509 VERIFY0(zfs_range_tree_space(msp->ms_freed)); in metaslab_sync_done()
4512 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync_done()
4513 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync_done()
4515 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; in metaslab_sync_done()
4517 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - in metaslab_sync_done()
4519 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || in metaslab_sync_done()
4520 vd->vdev_rz_expanding) { in metaslab_sync_done()
4525 alloc_delta = msp->ms_allocated_this_txg - in metaslab_sync_done()
4526 zfs_range_tree_space(msp->ms_freed); in metaslab_sync_done()
4529 defer_delta = zfs_range_tree_space(msp->ms_freed) - in metaslab_sync_done()
4532 defer_delta -= zfs_range_tree_space(*defer_tree); in metaslab_sync_done()
4534 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, in metaslab_sync_done()
4543 * have a consistent view at the in-core side of the metaslab. in metaslab_sync_done()
4551 * When auto-trimming is enabled, free ranges which are added to in metaslab_sync_done()
4560 msp->ms_trim); in metaslab_sync_done()
4562 zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add, in metaslab_sync_done()
4563 msp->ms_trim); in metaslab_sync_done()
4566 zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); in metaslab_sync_done()
4572 * the defer_tree -- this is safe to do because we've in metaslab_sync_done()
4576 msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable); in metaslab_sync_done()
4578 zfs_range_tree_swap(&msp->ms_freed, defer_tree); in metaslab_sync_done()
4580 zfs_range_tree_vacate(msp->ms_freed, in metaslab_sync_done()
4581 msp->ms_loaded ? zfs_range_tree_add : NULL, in metaslab_sync_done()
4582 msp->ms_allocatable); in metaslab_sync_done()
4585 msp->ms_synced_length = space_map_length(msp->ms_sm); in metaslab_sync_done()
4587 msp->ms_deferspace += defer_delta; in metaslab_sync_done()
4588 ASSERT3S(msp->ms_deferspace, >=, 0); in metaslab_sync_done()
4589 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); in metaslab_sync_done()
4590 if (msp->ms_deferspace != 0) { in metaslab_sync_done()
4599 if (msp->ms_new) { in metaslab_sync_done()
4600 msp->ms_new = B_FALSE; in metaslab_sync_done()
4601 mutex_enter(&mg->mg_lock); in metaslab_sync_done()
4602 mg->mg_ms_ready++; in metaslab_sync_done()
4603 mutex_exit(&mg->mg_lock); in metaslab_sync_done()
4607 * Re-sort metaslab within its group now that we've adjusted in metaslab_sync_done()
4612 ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); in metaslab_sync_done()
4613 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync_done()
4614 ASSERT0(zfs_range_tree_space(msp->ms_freed)); in metaslab_sync_done()
4615 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync_done()
4616 msp->ms_allocating_total -= msp->ms_allocated_this_txg; in metaslab_sync_done()
4617 msp->ms_allocated_this_txg = 0; in metaslab_sync_done()
4618 mutex_exit(&msp->ms_lock); in metaslab_sync_done()
4624 spa_t *spa = mg->mg_class->mc_spa; in metaslab_sync_reassess()
4627 mg->mg_fragmentation = metaslab_group_fragmentation(mg); in metaslab_sync_reassess()
4637 if (mg->mg_activation_count > 0) { in metaslab_sync_reassess()
4656 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) in metaslab_is_unique()
4659 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; in metaslab_is_unique()
4661 return (msp->ms_id != dva_ms_id); in metaslab_is_unique()
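The uniqueness test above reduces to mapping the DVA back to a metaslab index: the vdev must match, and the DVA offset shifted by the vdev's metaslab shift gives the metaslab id. A tiny illustrative model of that mapping (hypothetical parameter names):

#include <stdbool.h>
#include <stdint.h>

static bool
dva_in_other_metaslab(uint64_t ms_id, uint64_t ms_vdev,
    uint64_t dva_vdev, uint64_t dva_offset, unsigned ms_shift)
{
        if (ms_vdev != dva_vdev)
                return (true);          /* different vdev: trivially unique */
        return (ms_id != (dva_offset >> ms_shift));
}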
4690 if (zal->zal_size == metaslab_trace_max_entries) { in metaslab_trace_add()
4696 zal->zal_size--; in metaslab_trace_add()
4697 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); in metaslab_trace_add()
4698 list_remove(&zal->zal_list, mat_next); in metaslab_trace_add()
4703 list_link_init(&mat->mat_list_node); in metaslab_trace_add()
4704 mat->mat_mg = mg; in metaslab_trace_add()
4705 mat->mat_msp = msp; in metaslab_trace_add()
4706 mat->mat_size = psize; in metaslab_trace_add()
4707 mat->mat_dva_id = dva_id; in metaslab_trace_add()
4708 mat->mat_offset = offset; in metaslab_trace_add()
4709 mat->mat_weight = 0; in metaslab_trace_add()
4710 mat->mat_allocator = allocator; in metaslab_trace_add()
4713 mat->mat_weight = msp->ms_weight; in metaslab_trace_add()
4719 list_insert_tail(&zal->zal_list, mat); in metaslab_trace_add()
4720 zal->zal_size++; in metaslab_trace_add()
4722 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); in metaslab_trace_add()
4728 ASSERT0(new->zal_size); in metaslab_trace_move()
4729 list_move_tail(&new->zal_list, &old->zal_list); in metaslab_trace_move()
4730 new->zal_size = old->zal_size; in metaslab_trace_move()
4731 list_destroy(&old->zal_list); in metaslab_trace_move()
4737 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), in metaslab_trace_init()
4739 zal->zal_size = 0; in metaslab_trace_init()
4747 while ((mat = list_remove_head(&zal->zal_list)) != NULL) in metaslab_trace_fini()
4749 list_destroy(&zal->zal_list); in metaslab_trace_fini()
4750 zal->zal_size = 0; in metaslab_trace_fini()
4766 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; in metaslab_group_alloc_increment()
4767 if (!mg->mg_class->mc_alloc_throttle_enabled) in metaslab_group_alloc_increment()
4770 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_increment()
4771 (void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag); in metaslab_group_alloc_increment()
4779 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]); in metaslab_group_alloc_increment_all()
4792 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; in metaslab_group_alloc_decrement()
4793 if (!mg->mg_class->mc_alloc_throttle_enabled) in metaslab_group_alloc_decrement()
4796 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_decrement()
4797 (void) zfs_refcount_remove_many(&mga->mga_queue_depth, psize, tag); in metaslab_group_alloc_decrement()
4805 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_block_alloc()
4806 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_block_alloc()
4808 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_block_alloc()
4809 VERIFY(!msp->ms_condensing); in metaslab_block_alloc()
4810 VERIFY0(msp->ms_disabled); in metaslab_block_alloc()
4811 VERIFY0(msp->ms_new); in metaslab_block_alloc()
4813 start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size); in metaslab_block_alloc()
4814 if (start != -1ULL) { in metaslab_block_alloc()
4816 metaslab_group_t *mg = msp->ms_group; in metaslab_block_alloc()
4817 vdev_t *vd = mg->mg_vd; in metaslab_block_alloc()
4819 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); in metaslab_block_alloc()
4820 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_block_alloc()
4821 VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size); in metaslab_block_alloc()
4823 zfs_range_tree_clear(msp->ms_trim, start, size); in metaslab_block_alloc()
4825 if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) in metaslab_block_alloc()
4826 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); in metaslab_block_alloc()
4828 zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, in metaslab_block_alloc()
4830 msp->ms_allocating_total += size; in metaslab_block_alloc()
4833 msp->ms_alloc_txg = txg; in metaslab_block_alloc()
4841 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_block_alloc()
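The txg & TXG_MASK indexing used in the allocation path above (and throughout metaslab_sync()) selects one slot in a small ring of per-txg range trees: ZFS keeps a fixed, small number of transaction groups in flight (TXG_SIZE), so the low bits of the txg pick the tree belonging to the current group. A small stand-alone illustration; the constants here are local stand-ins, not the kernel definitions:

#include <stdint.h>
#include <stdio.h>

#define MY_TXG_SIZE     4                       /* illustrative ring size */
#define MY_TXG_MASK     (MY_TXG_SIZE - 1)

int
main(void)
{
        /* One bucket per in-flight transaction group. */
        uint64_t allocated[MY_TXG_SIZE] = { 0 };

        for (uint64_t txg = 100; txg < 108; txg++) {
                allocated[txg & MY_TXG_MASK] += 4096;
                printf("txg %llu -> slot %llu\n",
                    (unsigned long long)txg,
                    (unsigned long long)(txg & MY_TXG_MASK));
        }
        return (0);
}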
4851 * have selected, we may not try the newly-activated metaslab, and instead
4854 * except for the newly-activated metaslab which we fail to examine).
4863 avl_tree_t *t = &mg->mg_metaslab_tree; in find_valid_metaslab()
4888 if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) in find_valid_metaslab()
4891 *was_active = msp->ms_allocator != -1; in find_valid_metaslab()
4913 search->ms_weight = msp->ms_weight; in find_valid_metaslab()
4914 search->ms_start = msp->ms_start + 1; in find_valid_metaslab()
4915 search->ms_allocator = msp->ms_allocator; in find_valid_metaslab()
4916 search->ms_primary = msp->ms_primary; in find_valid_metaslab()
4924 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_active_mask_verify()
4929 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) in metaslab_active_mask_verify()
4932 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { in metaslab_active_mask_verify()
4933 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_active_mask_verify()
4934 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); in metaslab_active_mask_verify()
4935 VERIFY3S(msp->ms_allocator, !=, -1); in metaslab_active_mask_verify()
4936 VERIFY(msp->ms_primary); in metaslab_active_mask_verify()
4940 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { in metaslab_active_mask_verify()
4941 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_active_mask_verify()
4942 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); in metaslab_active_mask_verify()
4943 VERIFY3S(msp->ms_allocator, !=, -1); in metaslab_active_mask_verify()
4944 VERIFY(!msp->ms_primary); in metaslab_active_mask_verify()
4948 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { in metaslab_active_mask_verify()
4949 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_active_mask_verify()
4950 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_active_mask_verify()
4951 VERIFY3S(msp->ms_allocator, ==, -1); in metaslab_active_mask_verify()
4963 uint64_t offset = -1ULL; in metaslab_group_alloc()
4968 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { in metaslab_group_alloc()
4971 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { in metaslab_group_alloc()
4980 if (allocator >= mg->mg_ms_ready / 3) in metaslab_group_alloc()
4982 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc()
4984 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); in metaslab_group_alloc()
4987 search->ms_weight = UINT64_MAX; in metaslab_group_alloc()
4988 search->ms_start = 0; in metaslab_group_alloc()
4990 * At the end of the metaslab tree are the already-active metaslabs, in metaslab_group_alloc()
4996 search->ms_allocator = -1; in metaslab_group_alloc()
4997 search->ms_primary = B_TRUE; in metaslab_group_alloc()
5001 mutex_enter(&mg->mg_lock); in metaslab_group_alloc()
5004 mga->mga_primary != NULL) { in metaslab_group_alloc()
5005 msp = mga->mga_primary; in metaslab_group_alloc()
5013 ASSERT(msp->ms_primary); in metaslab_group_alloc()
5014 ASSERT3S(msp->ms_allocator, ==, allocator); in metaslab_group_alloc()
5015 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5018 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_group_alloc()
5020 mga->mga_secondary != NULL) { in metaslab_group_alloc()
5021 msp = mga->mga_secondary; in metaslab_group_alloc()
5027 ASSERT(!msp->ms_primary); in metaslab_group_alloc()
5028 ASSERT3S(msp->ms_allocator, ==, allocator); in metaslab_group_alloc()
5029 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5032 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_group_alloc()
5039 mutex_exit(&mg->mg_lock); in metaslab_group_alloc()
5042 mutex_enter(&msp->ms_lock); in metaslab_group_alloc()
5048 * tracepoints in non-gpl kernel modules. in metaslab_group_alloc()
5064 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { in metaslab_group_alloc()
5065 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_group_alloc()
5066 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5076 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && in metaslab_group_alloc()
5077 (msp->ms_allocator != -1) && in metaslab_group_alloc()
5078 (msp->ms_allocator != allocator || ((activation_weight == in metaslab_group_alloc()
5079 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { in metaslab_group_alloc()
5080 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5081 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || in metaslab_group_alloc()
5082 msp->ms_allocator != -1); in metaslab_group_alloc()
5083 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5094 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && in metaslab_group_alloc()
5096 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5097 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_group_alloc()
5098 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc()
5100 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5129 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5132 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5155 if (msp->ms_condensing) { in metaslab_group_alloc()
5159 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc()
5162 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5164 } else if (msp->ms_disabled > 0) { in metaslab_group_alloc()
5168 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc()
5171 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5178 if (offset != -1ULL) { in metaslab_group_alloc()
5184 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5189 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5193 * tracepoints in non-gpl kernel modules. in metaslab_group_alloc()
5206 * For space-based metaslabs, we use the maximum block size. in metaslab_group_alloc()
5214 * For segment-based metaslabs, determine the new weight in metaslab_group_alloc()
5222 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { in metaslab_group_alloc()
5244 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_group_alloc()
5257 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5261 if (offset == -1ULL) { in metaslab_group_alloc()
5264 if (asize <= vdev_get_min_alloc(mg->mg_vd)) { in metaslab_group_alloc()
5272 mg->mg_no_free_space = B_TRUE; in metaslab_group_alloc()
5282 metaslab_class_t *mc = mg->mg_class; in metaslab_group_allocatable()
5283 vdev_t *vd = mg->mg_vd; in metaslab_group_allocatable()
5304 if (!GANG_ALLOCATION(flags) && (mg->mg_no_free_space || in metaslab_group_allocatable()
5305 (!mg->mg_allocatable && mc->mc_alloc_groups > 0))) { in metaslab_group_allocatable()
5312 * Avoid writing single-copy data to an unhealthy, in metaslab_group_allocatable()
5313 * non-redundant vdev. in metaslab_group_allocatable()
5315 if (d == 0 && vd->vdev_state < VDEV_STATE_HEALTHY && in metaslab_group_allocatable()
5316 vd->vdev_children == 0) { in metaslab_group_allocatable()
5331 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_alloc_dva_range()
5341 * and a large number of split blocks coupled with ztest-induced in metaslab_alloc_dva_range()
5363 * nothing actually breaks if we miss a few updates -- we just won't in metaslab_alloc_dva_range()
5373 * able to reason about. Otherwise, any two top-level vdev failures in metaslab_alloc_dva_range()
5375 * only two adjacent top-level vdev failures will result in data loss. in metaslab_alloc_dva_range()
5377 * If we are doing gang blocks (hintdva is non-NULL), try to keep in metaslab_alloc_dva_range()
5386 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); in metaslab_alloc_dva_range()
5387 mg = vdev_get_mg(vd, mc)->mg_next; in metaslab_alloc_dva_range()
5389 if (mg == NULL || mg->mg_class != mc || mg->mg_activation_count <= 0) { in metaslab_alloc_dva_range()
5390 ASSERT(mca->mca_rotor != NULL); in metaslab_alloc_dva_range()
5391 mg = mca->mca_rotor; in metaslab_alloc_dva_range()
5397 ASSERT(mg->mg_activation_count == 1); in metaslab_alloc_dva_range()
5398 ASSERT(mg->mg_class == mc); in metaslab_alloc_dva_range()
5404 vd = mg->mg_vd; in metaslab_alloc_dva_range()
5406 ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); in metaslab_alloc_dva_range()
5409 ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); in metaslab_alloc_dva_range()
5414 if (offset != -1ULL) { in metaslab_alloc_dva_range()
5420 DVA_SET_VDEV(&dva[d], vd->vdev_id); in metaslab_alloc_dva_range()
5429 } while ((mg = mg->mg_next) != rotor); in metaslab_alloc_dva_range()
5436 psize <= spa->spa_min_alloc)) { in metaslab_alloc_dva_range()
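The do/while loop that closes above walks the allocator's rotor: a circular list of metaslab groups, one per top-level vdev, starting at mca_rotor (or a hint-derived group) and advancing through mg_next until it wraps back around. A hedged sketch of that traversal with made-up types; try_alloc stands in for the per-group allocation attempt and is assumed to return UINT64_MAX on failure:

#include <stdint.h>

typedef struct group {
        struct group    *next;          /* circular list of groups */
        int              id;
} group_t;

static uint64_t
rotor_alloc(group_t *rotor, uint64_t (*try_alloc)(group_t *, uint64_t),
    uint64_t asize)
{
        group_t *g = rotor;

        do {
                uint64_t off = try_alloc(g, asize);
                if (off != UINT64_MAX)
                        return (off);
        } while ((g = g->next) != rotor);

        return (UINT64_MAX);            /* every top-level vdev refused */
}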
5465 spa_t *spa = vd->vdev_spa; in metaslab_free_concrete()
5466 int m = offset >> vd->vdev_ms_shift; in metaslab_free_concrete()
5470 VERIFY3U(m, <, vd->vdev_ms_count); in metaslab_free_concrete()
5472 msp = vd->vdev_ms[m]; in metaslab_free_concrete()
5474 VERIFY(!msp->ms_condensing); in metaslab_free_concrete()
5475 VERIFY3U(offset, >=, msp->ms_start); in metaslab_free_concrete()
5476 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); in metaslab_free_concrete()
5477 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_free_concrete()
5478 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); in metaslab_free_concrete()
5482 mutex_enter(&msp->ms_lock); in metaslab_free_concrete()
5483 if (zfs_range_tree_is_empty(msp->ms_freeing) && in metaslab_free_concrete()
5484 zfs_range_tree_is_empty(msp->ms_checkpointing)) { in metaslab_free_concrete()
5490 zfs_range_tree_add(msp->ms_checkpointing, offset, asize); in metaslab_free_concrete()
5492 zfs_range_tree_add(msp->ms_freeing, offset, asize); in metaslab_free_concrete()
5494 mutex_exit(&msp->ms_lock); in metaslab_free_concrete()
5506 if (vd->vdev_ops->vdev_op_remap != NULL) in metaslab_free_impl_cb()
5516 spa_t *spa = vd->vdev_spa; in metaslab_free_impl()
5523 if (spa->spa_vdev_removal != NULL && in metaslab_free_impl()
5524 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && in metaslab_free_impl()
5533 } else if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_free_impl()
5535 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_free_impl()
5555 blkptr_t *bp = rbca->rbca_bp; in remap_blkptr_cb()
5558 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) in remap_blkptr_cb()
5562 if (rbca->rbca_cb != NULL) { in remap_blkptr_cb()
5568 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); in remap_blkptr_cb()
5570 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, in remap_blkptr_cb()
5571 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); in remap_blkptr_cb()
5574 rbca->rbca_remap_vd = vd; in remap_blkptr_cb()
5575 rbca->rbca_remap_offset = offset; in remap_blkptr_cb()
5588 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, in remap_blkptr_cb()
5589 DVA_GET_VDEV(&bp->blk_dva[0])); in remap_blkptr_cb()
5590 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; in remap_blkptr_cb()
5592 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); in remap_blkptr_cb()
5595 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); in remap_blkptr_cb()
5596 DVA_SET_OFFSET(&bp->blk_dva[0], offset); in remap_blkptr_cb()
5655 dva_t *dva = &bp->blk_dva[0]; in spa_remap_blkptr()
5661 if (vd->vdev_ops->vdev_op_remap == NULL) in spa_remap_blkptr()
5677 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); in spa_remap_blkptr()
5680 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) in spa_remap_blkptr()
5705 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { in metaslab_unalloc_dva()
5712 ASSERT(!vd->vdev_removing); in metaslab_unalloc_dva()
5714 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); in metaslab_unalloc_dva()
5715 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); in metaslab_unalloc_dva()
5720 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_unalloc_dva()
5722 mutex_enter(&msp->ms_lock); in metaslab_unalloc_dva()
5723 zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK], in metaslab_unalloc_dva()
5725 msp->ms_allocating_total -= size; in metaslab_unalloc_dva()
5727 VERIFY(!msp->ms_condensing); in metaslab_unalloc_dva()
5728 VERIFY3U(offset, >=, msp->ms_start); in metaslab_unalloc_dva()
5729 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); in metaslab_unalloc_dva()
5730 VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=, in metaslab_unalloc_dva()
5731 msp->ms_size); in metaslab_unalloc_dva()
5732 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_unalloc_dva()
5733 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_unalloc_dva()
5734 zfs_range_tree_add(msp->ms_allocatable, offset, size); in metaslab_unalloc_dva()
5735 mutex_exit(&msp->ms_lock); in metaslab_unalloc_dva()
5770 metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; in metaslab_class_throttle_reserve()
5772 ASSERT(mc->mc_alloc_throttle_enabled); in metaslab_class_throttle_reserve()
5773 if (mc->mc_alloc_io_size < zio->io_size) { in metaslab_class_throttle_reserve()
5774 mc->mc_alloc_io_size = zio->io_size; in metaslab_class_throttle_reserve()
5777 if (must || mca->mca_reserved <= mc->mc_alloc_max) { in metaslab_class_throttle_reserve()
5781 * But even if we assume some other non-existing scenario, the in metaslab_class_throttle_reserve()
5785 int64_t delta = slots * zio->io_size; in metaslab_class_throttle_reserve()
5786 *more = (atomic_add_64_nv(&mca->mca_reserved, delta) <= in metaslab_class_throttle_reserve()
5787 mc->mc_alloc_max); in metaslab_class_throttle_reserve()
5788 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; in metaslab_class_throttle_reserve()
5799 metaslab_class_allocator_t *mca = &mc->mc_allocator[zio->io_allocator]; in metaslab_class_throttle_unreserve()
5801 ASSERT(mc->mc_alloc_throttle_enabled); in metaslab_class_throttle_unreserve()
5802 int64_t delta = slots * zio->io_size; in metaslab_class_throttle_unreserve()
5803 return (atomic_add_64_nv(&mca->mca_reserved, -delta) <= in metaslab_class_throttle_unreserve()
5804 mc->mc_alloc_max); in metaslab_class_throttle_unreserve()
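The reserve/unreserve pair above is plain byte accounting on the per-allocator counter: add or subtract slots * io_size and report whether the result still fits under mc_alloc_max. A userspace model using C11 atomics; the kernel code uses atomic_add_64_nv, which likewise returns the new value:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct {
        _Atomic uint64_t reserved;      /* bytes currently reserved */
        uint64_t         alloc_max;     /* throttle ceiling in bytes */
} alloc_throttle_t;

static bool
throttle_reserve(alloc_throttle_t *t, uint64_t slots, uint64_t io_size)
{
        uint64_t delta = slots * io_size;
        uint64_t now = atomic_fetch_add(&t->reserved, delta) + delta;

        return (now <= t->alloc_max);   /* more reservations may proceed */
}

static bool
throttle_unreserve(alloc_throttle_t *t, uint64_t slots, uint64_t io_size)
{
        uint64_t delta = slots * io_size;
        uint64_t now = atomic_fetch_sub(&t->reserved, delta) - delta;

        return (now <= t->alloc_max);
}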
5812 spa_t *spa = vd->vdev_spa; in metaslab_claim_concrete()
5815 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) in metaslab_claim_concrete()
5818 ASSERT3P(vd->vdev_ms, !=, NULL); in metaslab_claim_concrete()
5819 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_claim_concrete()
5821 mutex_enter(&msp->ms_lock); in metaslab_claim_concrete()
5823 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { in metaslab_claim_concrete()
5826 ASSERT(msp->ms_loaded); in metaslab_claim_concrete()
5827 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_claim_concrete()
5833 !zfs_range_tree_contains(msp->ms_allocatable, offset, size)) in metaslab_claim_concrete()
5837 mutex_exit(&msp->ms_lock); in metaslab_claim_concrete()
5841 VERIFY(!msp->ms_condensing); in metaslab_claim_concrete()
5842 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_claim_concrete()
5843 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_claim_concrete()
5844 VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=, in metaslab_claim_concrete()
5845 msp->ms_size); in metaslab_claim_concrete()
5846 zfs_range_tree_remove(msp->ms_allocatable, offset, size); in metaslab_claim_concrete()
5847 zfs_range_tree_clear(msp->ms_trim, offset, size); in metaslab_claim_concrete()
5850 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_claim_concrete()
5852 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_claim_concrete()
5853 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_claim_concrete()
5854 msp->ms_selected_txg = txg; in metaslab_claim_concrete()
5859 if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) in metaslab_claim_concrete()
5861 zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], in metaslab_claim_concrete()
5863 msp->ms_allocating_total += size; in metaslab_claim_concrete()
5866 mutex_exit(&msp->ms_lock); in metaslab_claim_concrete()
5883 if (mcca_arg->mcca_error == 0) { in metaslab_claim_impl_cb()
5884 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, in metaslab_claim_impl_cb()
5885 size, mcca_arg->mcca_txg); in metaslab_claim_impl_cb()
5892 if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_claim_impl()
5900 ASSERT(!spa_writeable(vd->vdev_spa)); in metaslab_claim_impl()
5904 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_claim_impl()
5958 dva_t *dva = bp->blk_dva; in metaslab_alloc_range()
5959 const dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; in metaslab_alloc_range()
5967 if (mc->mc_allocator[allocator].mca_rotor == NULL) { in metaslab_alloc_range()
5985 for (d--; d >= 0; d--) { in metaslab_alloc_range()
6021 const dva_t *dva = bp->blk_dva; in metaslab_free()
6043 if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && in metaslab_free()
6044 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { in metaslab_free()
6071 const dva_t *dva = bp->blk_dva; in metaslab_claim()
6107 if (vd->vdev_ops == &vdev_indirect_ops) in metaslab_check_free_impl_cb()
6117 spa_t *spa __maybe_unused = vd->vdev_spa; in metaslab_check_free_impl()
6122 if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_check_free_impl()
6123 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_check_free_impl()
6129 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); in metaslab_check_free_impl()
6132 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_check_free_impl()
6134 mutex_enter(&msp->ms_lock); in metaslab_check_free_impl()
6135 if (msp->ms_loaded) { in metaslab_check_free_impl()
6136 zfs_range_tree_verify_not_present(msp->ms_allocatable, in metaslab_check_free_impl()
6151 zfs_range_tree_verify_not_present(msp->ms_freeing, offset, size); in metaslab_check_free_impl()
6152 zfs_range_tree_verify_not_present(msp->ms_checkpointing, offset, size); in metaslab_check_free_impl()
6153 zfs_range_tree_verify_not_present(msp->ms_freed, offset, size); in metaslab_check_free_impl()
6155 zfs_range_tree_verify_not_present(msp->ms_defer[j], offset, in metaslab_check_free_impl()
6157 zfs_range_tree_verify_not_present(msp->ms_trim, offset, size); in metaslab_check_free_impl()
6158 mutex_exit(&msp->ms_lock); in metaslab_check_free_impl()
6169 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); in metaslab_check_free()
6171 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); in metaslab_check_free()
6172 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); in metaslab_check_free()
6174 if (DVA_GET_GANG(&bp->blk_dva[i])) in metaslab_check_free()
6187 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); in metaslab_group_disable_wait()
6188 while (mg->mg_disabled_updating) { in metaslab_group_disable_wait()
6189 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); in metaslab_group_disable_wait()
6196 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); in metaslab_group_disabled_increment()
6197 ASSERT(mg->mg_disabled_updating); in metaslab_group_disabled_increment()
6199 while (mg->mg_ms_disabled >= max_disabled_ms) { in metaslab_group_disabled_increment()
6200 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); in metaslab_group_disabled_increment()
6202 mg->mg_ms_disabled++; in metaslab_group_disabled_increment()
6203 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); in metaslab_group_disabled_increment()
6215 ASSERT(!MUTEX_HELD(&msp->ms_lock)); in metaslab_disable()
6216 metaslab_group_t *mg = msp->ms_group; in metaslab_disable()
6218 mutex_enter(&mg->mg_ms_disabled_lock); in metaslab_disable()
6230 mg->mg_disabled_updating = B_TRUE; in metaslab_disable()
6231 if (msp->ms_disabled == 0) { in metaslab_disable()
6234 mutex_enter(&msp->ms_lock); in metaslab_disable()
6235 msp->ms_disabled++; in metaslab_disable()
6236 mutex_exit(&msp->ms_lock); in metaslab_disable()
6238 mg->mg_disabled_updating = B_FALSE; in metaslab_disable()
6239 cv_broadcast(&mg->mg_ms_disabled_cv); in metaslab_disable()
6240 mutex_exit(&mg->mg_ms_disabled_lock); in metaslab_disable()
6246 metaslab_group_t *mg = msp->ms_group; in metaslab_enable()
6247 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_enable()
6257 mutex_enter(&mg->mg_ms_disabled_lock); in metaslab_enable()
6258 mutex_enter(&msp->ms_lock); in metaslab_enable()
6259 if (--msp->ms_disabled == 0) { in metaslab_enable()
6260 mg->mg_ms_disabled--; in metaslab_enable()
6261 cv_broadcast(&mg->mg_ms_disabled_cv); in metaslab_enable()
6265 mutex_exit(&msp->ms_lock); in metaslab_enable()
6266 mutex_exit(&mg->mg_ms_disabled_lock); in metaslab_enable()
6270 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) in metaslab_set_unflushed_dirty() argument
6272 ms->ms_unflushed_dirty = dirty; in metaslab_set_unflushed_dirty()
6276 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) in metaslab_update_ondisk_flush_data() argument
6278 vdev_t *vd = ms->ms_group->mg_vd; in metaslab_update_ondisk_flush_data()
6279 spa_t *spa = vd->vdev_spa; in metaslab_update_ondisk_flush_data()
6285 .msp_unflushed_txg = metaslab_unflushed_txg(ms), in metaslab_update_ondisk_flush_data()
6288 uint64_t entry_offset = ms->ms_id * entry_size; in metaslab_update_ondisk_flush_data()
6291 int err = zap_lookup(mos, vd->vdev_top_zap, in metaslab_update_ondisk_flush_data()
6297 VERIFY0(zap_add(mos, vd->vdev_top_zap, in metaslab_update_ondisk_flush_data()
6309 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) in metaslab_set_unflushed_txg() argument
6311 ms->ms_unflushed_txg = txg; in metaslab_set_unflushed_txg()
6312 metaslab_update_ondisk_flush_data(ms, tx); in metaslab_set_unflushed_txg()
6316 metaslab_unflushed_dirty(metaslab_t *ms) in metaslab_unflushed_dirty() argument
6318 return (ms->ms_unflushed_dirty); in metaslab_unflushed_dirty()
6322 metaslab_unflushed_txg(metaslab_t *ms) in metaslab_unflushed_txg() argument
6324 return (ms->ms_unflushed_txg); in metaslab_unflushed_txg()
6343 "Delay in txgs after metaslab was last used before unloading");
6346 "Delay in milliseconds after metaslab was last used before unloading");
6368 "Enable space-based metaslab group biasing");
6371 "Enable performance-based metaslab group biasing");
6374 ZMOD_RW, "Enable segment-based metaslab selection");
6377 "Segment-based metaslab selection maximum buckets before switching");