Lines matching "saturation" and "ratio" (OpenZFS metaslab.c)

1 // SPDX-License-Identifier: CDDL-1.0
10 * or https://opensource.org/licenses/CDDL-1.0.
52 * data to each disk before moving on to the next top-level vdev.
84 * The in-core space map representation is more compact than its on-disk form.
85 * The zfs_condense_pct determines how much more compact the in-core
86 * space map representation must be before we compact it on-disk.
173 * in a space map to continue allocations in a first-fit fashion.
175 * switch to using best-fit allocations.
183 * high-performance storage.
215 * unloaded sooner. These settings are intended to be generous -- to keep
242 * Enable/disable space-based metaslab group biasing.
247 * Control performance-based metaslab group biasing.
257 * Enable/disable segment-based metaslab selection.
262 * When using segment-based metaslab selection, we will continue
276 * in a given list when running in non-debug mode. We limit the number
277 * of entries in non-debug mode to prevent us from using up too much memory.
292 * To avoid 64-bit overflow, don't set above UINT32_MAX.
304 * Force the per-metaslab range trees to use 64-bit integers to store
310 * By default we only store segments over a certain size in the size-sorted
321 * gang allocation. If that fails then we will have a multi-layer gang
327 * that fails then we will have a multi-layer gang block.
340 * metaslabs all have free segments in the 32-63K bucket, but the best
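The zfs_condense_pct comment above (file lines 84-86) describes a relative threshold: a metaslab's space map is only rewritten ("condensed") on disk once the on-disk object has grown well past the estimated optimal size of its in-core representation. A minimal illustrative sketch of that kind of check, using hypothetical parameter names (object_size, optimal_size) rather than whatever the elided code actually computes:

/*
 * Illustrative only. With condense_pct = 200 (the usual default for
 * zfs_condense_pct), the on-disk space map must be at least twice the
 * estimated optimal size before condensing is considered worthwhile.
 */
static boolean_t
should_condense_sketch(uint64_t object_size, uint64_t optimal_size,
    uint64_t condense_pct)
{
	return (object_size >= condense_pct * optimal_size / 100);
}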
382 spa_name(mg->mg_vd->vdev_spa), in metaslab_rt_name()
383 (u_longlong_t)mg->mg_vd->vdev_guid, in metaslab_rt_name()
384 (u_longlong_t)ms->ms_id, in metaslab_rt_name()
402 metaslab_ksp->ks_data = &metaslab_stats; in metaslab_stat_init()
431 mc_allocator[spa->spa_alloc_count]), KM_SLEEP); in metaslab_class_create()
433 mc->mc_spa = spa; in metaslab_class_create()
434 mc->mc_name = name; in metaslab_class_create()
435 mc->mc_ops = ops; in metaslab_class_create()
436 mc->mc_is_log = is_log; in metaslab_class_create()
437 mc->mc_alloc_io_size = SPA_OLD_MAXBLOCKSIZE; in metaslab_class_create()
438 mc->mc_alloc_max = UINT64_MAX; in metaslab_class_create()
439 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_class_create()
440 multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), in metaslab_class_create()
442 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_create()
443 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_create()
444 mutex_init(&mca->mca_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_class_create()
445 avl_create(&mca->mca_tree, zio_bookmark_compare, in metaslab_class_create()
447 mca->mca_rotor = NULL; in metaslab_class_create()
448 mca->mca_reserved = 0; in metaslab_class_create()
457 spa_t *spa = mc->mc_spa; in metaslab_class_destroy()
459 ASSERT0(mc->mc_alloc); in metaslab_class_destroy()
460 ASSERT0(mc->mc_deferred); in metaslab_class_destroy()
461 ASSERT0(mc->mc_space); in metaslab_class_destroy()
462 ASSERT0(mc->mc_dspace); in metaslab_class_destroy()
464 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_destroy()
465 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_destroy()
466 avl_destroy(&mca->mca_tree); in metaslab_class_destroy()
467 mutex_destroy(&mca->mca_lock); in metaslab_class_destroy()
468 ASSERT0P(mca->mca_rotor); in metaslab_class_destroy()
469 ASSERT0(mca->mca_reserved); in metaslab_class_destroy()
471 mutex_destroy(&mc->mc_lock); in metaslab_class_destroy()
472 multilist_destroy(&mc->mc_metaslab_txg_list); in metaslab_class_destroy()
474 mc_allocator[spa->spa_alloc_count])); in metaslab_class_destroy()
481 spa_t *spa = mc->mc_spa; in metaslab_class_validate()
489 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_validate()
490 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_validate()
493 ASSERT0(avl_numnodes(&mca->mca_tree)); in metaslab_class_validate()
494 ASSERT0(mca->mca_reserved); in metaslab_class_validate()
496 if ((mg = rotor = mca->mca_rotor) == NULL) in metaslab_class_validate()
499 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_class_validate()
500 vdev_t *vd = mg->mg_vd; in metaslab_class_validate()
502 ASSERT3P(vd->vdev_top, ==, vd); in metaslab_class_validate()
503 ASSERT(vd->vdev_mg == mg || vd->vdev_log_mg == mg); in metaslab_class_validate()
504 ASSERT3P(mg->mg_class, ==, mc); in metaslab_class_validate()
505 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); in metaslab_class_validate()
506 ASSERT0(zfs_refcount_count(&mga->mga_queue_depth)); in metaslab_class_validate()
507 } while ((mg = mg->mg_next) != rotor); in metaslab_class_validate()
513 * For each metaslab group in a class, pre-calculate an allocation quota and
515 * Based on those, pre-calculate the class allocation throttle threshold for
516 * optimal saturation. onsync is true once per TXG to enable/disable
527 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || in metaslab_class_balance()
528 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); in metaslab_class_balance()
533 if (mc->mc_groups == 0) { in metaslab_class_balance()
535 mc->mc_alloc_throttle_enabled = B_FALSE; in metaslab_class_balance()
536 mc->mc_alloc_max = UINT64_MAX; in metaslab_class_balance()
545 mc->mc_alloc_io_size = (3 * mc->mc_alloc_io_size + in metaslab_class_balance()
547 mc->mc_alloc_throttle_enabled = mc->mc_is_log ? 0 : in metaslab_class_balance()
551 mg = first = mc->mc_allocator[0].mca_rotor; in metaslab_class_balance()
554 children += vdev_get_ndisks(mg->mg_vd) - in metaslab_class_balance()
555 vdev_get_nparity(mg->mg_vd); in metaslab_class_balance()
556 } while ((mg = mg->mg_next) != first); in metaslab_class_balance()
560 vdev_stat_t *vs = &mg->mg_vd->vdev_stat; in metaslab_class_balance()
561 uint_t ratio; in metaslab_class_balance() local
566 * to keep decent per-child I/O size. in metaslab_class_balance()
569 mc->mc_groups, mc->mc_alloc_io_size * 4); in metaslab_class_balance()
576 if (mc->mc_space > 0 && vs->vs_space > 0) { in metaslab_class_balance()
577 ratio = vs->vs_space / (mc->mc_space / (mc->mc_groups * in metaslab_class_balance()
579 mg_aliquot = mg_aliquot * ratio / 256; in metaslab_class_balance()
598 mc->mc_space > 0 && vs->vs_space > 0) { in metaslab_class_balance()
599 uint64_t vs_free = vs->vs_space > vs->vs_alloc ? in metaslab_class_balance()
600 vs->vs_space - vs->vs_alloc : 0; in metaslab_class_balance()
601 uint64_t mc_free = mc->mc_space > mc->mc_alloc ? in metaslab_class_balance()
602 mc->mc_space - mc->mc_alloc : 0; in metaslab_class_balance()
604 * vs_fr is a 16-bit fixed-point free space fraction. in metaslab_class_balance()
605 * mc_fr is an 8-bit fixed-point free space fraction. in metaslab_class_balance()
606 * ratio, as their quotient, is 8-bit fixed-point. in metaslab_class_balance()
608 uint_t vs_fr = vs_free / (vs->vs_space / 65536 + 1); in metaslab_class_balance()
609 uint_t mc_fr = mc_free / (mc->mc_space / 256 + 1); in metaslab_class_balance()
610 ratio = vs_fr / (mc_fr + 1); in metaslab_class_balance()
611 mg->mg_aliquot = mg_aliquot * ratio / 256; in metaslab_class_balance()
613 ratio = MIN(163840, vs_fr * 3 + 16384); in metaslab_class_balance()
614 mg->mg_queue_target = MAX(mg->mg_aliquot, in metaslab_class_balance()
615 mg->mg_aliquot * ratio / 65536); in metaslab_class_balance()
617 mg->mg_aliquot = mg_aliquot; in metaslab_class_balance()
618 mg->mg_queue_target = mg->mg_aliquot * 2; in metaslab_class_balance()
620 sum_aliquot += mg->mg_aliquot; in metaslab_class_balance()
621 } while ((mg = mg->mg_next) != first); in metaslab_class_balance()
624 * Set per-class allocation throttle threshold to 4 iterations through in metaslab_class_balance()
629 mc->mc_alloc_max = sum_aliquot * 4; in metaslab_class_balance()
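The fixed-point arithmetic above (file lines 598-615) is easier to follow with concrete numbers. Below is a worked example, with made-up sizes, of how a relatively empty vdev ends up with a larger aliquot than the class average; only the shifts and divisors are taken from the listing, the rest is illustration.

/* Worked example (made-up sizes): one vdev with 10 TiB of space and
 * 2 TiB free (20% free), in a class with 40 TiB of space and 4 TiB
 * free (10% free). */
static uint_t
balance_ratio_example(void)
{
	uint64_t vs_space = 10ULL << 40, vs_free = 2ULL << 40;
	uint64_t mc_space = 40ULL << 40, mc_free = 4ULL << 40;

	uint_t vs_fr = vs_free / (vs_space / 65536 + 1);  /* ~13107, 20% of 65536 */
	uint_t mc_fr = mc_free / (mc_space / 256 + 1);    /* ~25, 10% of 256 */
	uint_t ratio = vs_fr / (mc_fr + 1);               /* ~504, about 2.0 in 8-bit fixed point */

	/* mg_aliquot * ratio / 256 then roughly doubles this group's share,
	 * steering new writes toward the emptier vdev. */
	return (ratio);
}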
636 metaslab_class_t *mc = mg->mg_class; in metaslab_class_rotate()
637 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_class_rotate()
638 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_class_rotate()
644 if (mc->mc_groups < 2 || mca->mca_rotor != mg) in metaslab_class_rotate()
650 if (!success || mc->mc_is_log) in metaslab_class_rotate()
661 uint64_t naq = atomic_add_64_nv(&mca->mca_aliquot, psize) + psize / 2; in metaslab_class_rotate()
662 if (naq < mg->mg_aliquot) in metaslab_class_rotate()
664 if (naq >= mg->mg_queue_target) in metaslab_class_rotate()
666 if (zfs_refcount_count(&mga->mga_queue_depth) + psize + psize / 2 >= in metaslab_class_rotate()
667 mg->mg_queue_target) in metaslab_class_rotate()
679 spa_t *spa = mc->mc_spa; in metaslab_class_rotate()
686 if (dp->dp_dirty_total > busy_thresh || spa_has_pending_synctask(spa)) in metaslab_class_rotate()
690 mca->mca_rotor = mg->mg_next; in metaslab_class_rotate()
691 mca->mca_aliquot = 0; in metaslab_class_rotate()
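Read together, the metaslab_class_rotate() fragments implement a per-allocator round-robin: the rotor only moves off a group once that group has absorbed its aliquot or its queue is already deep enough, while failed allocations and log classes rotate unconditionally. A condensed, hedged reading of the control flow (several checks are elided from this listing):

/*
 *   if (mc_groups < 2 || mca_rotor != mg)          return;   (nothing to rotate)
 *   if (!success || mc_is_log)                     goto rotate;
 *   naq = (mca_aliquot += psize) + psize / 2;
 *   if (naq < mg_aliquot)                          return;   (quota not reached)
 *   if (naq >= mg_queue_target ||
 *       queue_depth + 1.5 * psize >= mg_queue_target)
 *                                                  goto rotate;
 *   ... dirty-data / pending-synctask checks ...
 *   rotate: mca_rotor = mg->mg_next; mca_aliquot = 0;
 */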
698 atomic_add_64(&mc->mc_alloc, alloc_delta); in metaslab_class_space_update()
699 atomic_add_64(&mc->mc_deferred, defer_delta); in metaslab_class_space_update()
700 atomic_add_64(&mc->mc_space, space_delta); in metaslab_class_space_update()
701 atomic_add_64(&mc->mc_dspace, dspace_delta); in metaslab_class_space_update()
707 return (mc->mc_name); in metaslab_class_get_name()
713 return (mc->mc_alloc); in metaslab_class_get_alloc()
719 return (mc->mc_deferred); in metaslab_class_get_deferred()
725 return (mc->mc_space); in metaslab_class_get_space()
731 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); in metaslab_class_get_dspace()
737 spa_t *spa = mc->mc_spa; in metaslab_class_histogram_verify()
738 vdev_t *rvd = spa->spa_root_vdev; in metaslab_class_histogram_verify()
748 mutex_enter(&mc->mc_lock); in metaslab_class_histogram_verify()
749 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_histogram_verify()
750 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_histogram_verify()
754 * Skip any holes, uninitialized top-levels, or in metaslab_class_histogram_verify()
757 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_histogram_verify()
758 mg->mg_class != mc) { in metaslab_class_histogram_verify()
762 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_class_histogram_verify()
763 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || in metaslab_class_histogram_verify()
764 mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_class_histogram_verify()
767 mc_hist[i] += mg->mg_histogram[i]; in metaslab_class_histogram_verify()
771 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); in metaslab_class_histogram_verify()
774 mutex_exit(&mc->mc_lock); in metaslab_class_histogram_verify()
788 vdev_t *rvd = mc->mc_spa->spa_root_vdev; in metaslab_class_fragmentation()
791 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); in metaslab_class_fragmentation()
793 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_fragmentation()
794 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_fragmentation()
795 metaslab_group_t *mg = tvd->vdev_mg; in metaslab_class_fragmentation()
798 * Skip any holes, uninitialized top-levels, in metaslab_class_fragmentation()
801 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_fragmentation()
802 mg->mg_class != mc) { in metaslab_class_fragmentation()
810 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { in metaslab_class_fragmentation()
811 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_fragmentation()
819 fragmentation += mg->mg_fragmentation * in metaslab_class_fragmentation()
825 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_fragmentation()
838 vdev_t *rvd = mc->mc_spa->spa_root_vdev; in metaslab_class_expandable_space()
841 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); in metaslab_class_expandable_space()
842 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_expandable_space()
843 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_expandable_space()
844 metaslab_group_t *mg = tvd->vdev_mg; in metaslab_class_expandable_space()
846 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_expandable_space()
847 mg->mg_class != mc) { in metaslab_class_expandable_space()
856 space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, in metaslab_class_expandable_space()
857 1ULL << tvd->vdev_ms_shift, uint64_t); in metaslab_class_expandable_space()
859 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_expandable_space()
866 multilist_t *ml = &mc->mc_metaslab_txg_list; in metaslab_class_evict_old()
875 mutex_enter(&msp->ms_lock); in metaslab_class_evict_old()
883 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_class_evict_old()
884 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
885 i--; in metaslab_class_evict_old()
892 msp->ms_selected_txg + metaslab_unload_delay && in metaslab_class_evict_old()
893 now > msp->ms_selected_time + delay && in metaslab_class_evict_old()
894 (msp->ms_allocator == -1 || in metaslab_class_evict_old()
903 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
906 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
920 if (m1->ms_allocator != -1 && m1->ms_primary) in metaslab_compare()
922 else if (m1->ms_allocator != -1 && !m1->ms_primary) in metaslab_compare()
924 if (m2->ms_allocator != -1 && m2->ms_primary) in metaslab_compare()
926 else if (m2->ms_allocator != -1 && !m2->ms_primary) in metaslab_compare()
938 return (-1); in metaslab_compare()
942 int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); in metaslab_compare()
946 IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); in metaslab_compare()
948 return (TREE_CMP(m1->ms_start, m2->ms_start)); in metaslab_compare()
961 * transitions from allocatable to non-allocatable or vice versa then the
967 vdev_t *vd = mg->mg_vd; in metaslab_group_alloc_update()
968 metaslab_class_t *mc = mg->mg_class; in metaslab_group_alloc_update()
969 vdev_stat_t *vs = &vd->vdev_stat; in metaslab_group_alloc_update()
973 ASSERT(vd == vd->vdev_top); in metaslab_group_alloc_update()
974 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, in metaslab_group_alloc_update()
977 mutex_enter(&mg->mg_lock); in metaslab_group_alloc_update()
978 was_allocatable = mg->mg_allocatable; in metaslab_group_alloc_update()
979 was_initialized = mg->mg_initialized; in metaslab_group_alloc_update()
981 uint64_t free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / in metaslab_group_alloc_update()
982 (vs->vs_space + 1); in metaslab_group_alloc_update()
984 mutex_enter(&mc->mc_lock); in metaslab_group_alloc_update()
990 * for allocations. We also don't consider non-activated in metaslab_group_alloc_update()
994 mg->mg_initialized = metaslab_group_initialized(mg); in metaslab_group_alloc_update()
995 if (!was_initialized && mg->mg_initialized) { in metaslab_group_alloc_update()
996 mc->mc_groups++; in metaslab_group_alloc_update()
997 } else if (was_initialized && !mg->mg_initialized) { in metaslab_group_alloc_update()
998 ASSERT3U(mc->mc_groups, >, 0); in metaslab_group_alloc_update()
999 mc->mc_groups--; in metaslab_group_alloc_update()
1001 if (mg->mg_initialized) in metaslab_group_alloc_update()
1002 mg->mg_no_free_space = B_FALSE; in metaslab_group_alloc_update()
1010 mg->mg_allocatable = (mg->mg_activation_count > 0 && in metaslab_group_alloc_update()
1012 (mg->mg_fragmentation == ZFS_FRAG_INVALID || in metaslab_group_alloc_update()
1013 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); in metaslab_group_alloc_update()
1023 * When a group transitions from allocatable to non-allocatable or in metaslab_group_alloc_update()
1030 if (was_allocatable && !mg->mg_allocatable) in metaslab_group_alloc_update()
1031 mc->mc_alloc_groups--; in metaslab_group_alloc_update()
1032 else if (!was_allocatable && mg->mg_allocatable) in metaslab_group_alloc_update()
1033 mc->mc_alloc_groups++; in metaslab_group_alloc_update()
1034 mutex_exit(&mc->mc_lock); in metaslab_group_alloc_update()
1036 mutex_exit(&mg->mg_lock); in metaslab_group_alloc_update()
1045 int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); in metaslab_sort_by_flushed()
1049 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; in metaslab_sort_by_flushed()
1050 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; in metaslab_sort_by_flushed()
1055 return (TREE_CMP(a->ms_id, b->ms_id)); in metaslab_sort_by_flushed()
1061 spa_t *spa = mc->mc_spa; in metaslab_group_create()
1065 mg_allocator[spa->spa_alloc_count]), KM_SLEEP); in metaslab_group_create()
1066 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_group_create()
1067 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_group_create()
1068 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); in metaslab_group_create()
1069 avl_create(&mg->mg_metaslab_tree, metaslab_compare, in metaslab_group_create()
1071 mg->mg_vd = vd; in metaslab_group_create()
1072 mg->mg_class = mc; in metaslab_group_create()
1073 mg->mg_activation_count = 0; in metaslab_group_create()
1074 mg->mg_initialized = B_FALSE; in metaslab_group_create()
1075 mg->mg_no_free_space = B_TRUE; in metaslab_group_create()
1077 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_create()
1078 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_create()
1079 zfs_refcount_create_tracked(&mga->mga_queue_depth); in metaslab_group_create()
1088 spa_t *spa = mg->mg_class->mc_spa; in metaslab_group_destroy()
1090 ASSERT0P(mg->mg_prev); in metaslab_group_destroy()
1091 ASSERT0P(mg->mg_next); in metaslab_group_destroy()
1097 ASSERT(mg->mg_activation_count <= 0); in metaslab_group_destroy()
1099 avl_destroy(&mg->mg_metaslab_tree); in metaslab_group_destroy()
1100 mutex_destroy(&mg->mg_lock); in metaslab_group_destroy()
1101 mutex_destroy(&mg->mg_ms_disabled_lock); in metaslab_group_destroy()
1102 cv_destroy(&mg->mg_ms_disabled_cv); in metaslab_group_destroy()
1104 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_destroy()
1105 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_destroy()
1106 zfs_refcount_destroy(&mga->mga_queue_depth); in metaslab_group_destroy()
1109 mg_allocator[spa->spa_alloc_count])); in metaslab_group_destroy()
1115 metaslab_class_t *mc = mg->mg_class; in metaslab_group_activate()
1116 spa_t *spa = mc->mc_spa; in metaslab_group_activate()
1121 ASSERT0P(mg->mg_prev); in metaslab_group_activate()
1122 ASSERT0P(mg->mg_next); in metaslab_group_activate()
1123 ASSERT(mg->mg_activation_count <= 0); in metaslab_group_activate()
1125 if (++mg->mg_activation_count <= 0) in metaslab_group_activate()
1130 if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { in metaslab_group_activate()
1131 mg->mg_prev = mg; in metaslab_group_activate()
1132 mg->mg_next = mg; in metaslab_group_activate()
1134 mgnext = mgprev->mg_next; in metaslab_group_activate()
1135 mg->mg_prev = mgprev; in metaslab_group_activate()
1136 mg->mg_next = mgnext; in metaslab_group_activate()
1137 mgprev->mg_next = mg; in metaslab_group_activate()
1138 mgnext->mg_prev = mg; in metaslab_group_activate()
1140 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_activate()
1141 mc->mc_allocator[i].mca_rotor = mg; in metaslab_group_activate()
1142 mg = mg->mg_next; in metaslab_group_activate()
1156 metaslab_class_t *mc = mg->mg_class; in metaslab_group_passivate()
1157 spa_t *spa = mc->mc_spa; in metaslab_group_passivate()
1164 if (--mg->mg_activation_count != 0) { in metaslab_group_passivate()
1165 for (int i = 0; i < spa->spa_alloc_count; i++) in metaslab_group_passivate()
1166 ASSERT(mc->mc_allocator[i].mca_rotor != mg); in metaslab_group_passivate()
1167 ASSERT0P(mg->mg_prev); in metaslab_group_passivate()
1168 ASSERT0P(mg->mg_next); in metaslab_group_passivate()
1169 ASSERT(mg->mg_activation_count < 0); in metaslab_group_passivate()
1187 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); in metaslab_group_passivate()
1188 taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); in metaslab_group_passivate()
1189 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); in metaslab_group_passivate()
1191 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_passivate()
1192 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_passivate()
1193 metaslab_t *msp = mga->mga_primary; in metaslab_group_passivate()
1195 mutex_enter(&msp->ms_lock); in metaslab_group_passivate()
1199 mutex_exit(&msp->ms_lock); in metaslab_group_passivate()
1201 msp = mga->mga_secondary; in metaslab_group_passivate()
1203 mutex_enter(&msp->ms_lock); in metaslab_group_passivate()
1207 mutex_exit(&msp->ms_lock); in metaslab_group_passivate()
1211 mgprev = mg->mg_prev; in metaslab_group_passivate()
1212 mgnext = mg->mg_next; in metaslab_group_passivate()
1217 mgprev->mg_next = mgnext; in metaslab_group_passivate()
1218 mgnext->mg_prev = mgprev; in metaslab_group_passivate()
1220 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_passivate()
1221 if (mc->mc_allocator[i].mca_rotor == mg) in metaslab_group_passivate()
1222 mc->mc_allocator[i].mca_rotor = mgnext; in metaslab_group_passivate()
1225 mg->mg_prev = NULL; in metaslab_group_passivate()
1226 mg->mg_next = NULL; in metaslab_group_passivate()
1233 vdev_t *vd = mg->mg_vd; in metaslab_group_initialized()
1234 vdev_stat_t *vs = &vd->vdev_stat; in metaslab_group_initialized()
1236 return (vs->vs_space != 0 && mg->mg_activation_count > 0); in metaslab_group_initialized()
1246 mutex_enter(&mg->mg_lock); in metaslab_group_get_space()
1247 uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); in metaslab_group_get_space()
1248 mutex_exit(&mg->mg_lock); in metaslab_group_get_space()
1249 return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); in metaslab_group_get_space()
1256 avl_tree_t *t = &mg->mg_metaslab_tree; in metaslab_group_histogram_verify()
1257 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_verify()
1268 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_verify()
1271 VERIFY3P(msp->ms_group, ==, mg); in metaslab_group_histogram_verify()
1273 if (msp->ms_sm == NULL) in metaslab_group_histogram_verify()
1278 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_verify()
1283 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); in metaslab_group_histogram_verify()
1285 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_verify()
1293 metaslab_class_t *mc = mg->mg_class; in metaslab_group_histogram_add()
1294 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_add()
1296 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_histogram_add()
1297 if (msp->ms_sm == NULL) in metaslab_group_histogram_add()
1300 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_add()
1301 mutex_enter(&mc->mc_lock); in metaslab_group_histogram_add()
1303 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_group_histogram_add()
1304 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || in metaslab_group_histogram_add()
1305 mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_group_histogram_add()
1306 mg->mg_histogram[i + ashift] += in metaslab_group_histogram_add()
1307 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_add()
1308 mc->mc_histogram[i + ashift] += in metaslab_group_histogram_add()
1309 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_add()
1311 mutex_exit(&mc->mc_lock); in metaslab_group_histogram_add()
1312 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_add()
1318 metaslab_class_t *mc = mg->mg_class; in metaslab_group_histogram_remove()
1319 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_remove()
1321 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_histogram_remove()
1322 if (msp->ms_sm == NULL) in metaslab_group_histogram_remove()
1325 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_remove()
1326 mutex_enter(&mc->mc_lock); in metaslab_group_histogram_remove()
1328 ASSERT3U(mg->mg_histogram[i + ashift], >=, in metaslab_group_histogram_remove()
1329 msp->ms_sm->sm_phys->smp_histogram[i]); in metaslab_group_histogram_remove()
1330 ASSERT3U(mc->mc_histogram[i + ashift], >=, in metaslab_group_histogram_remove()
1331 msp->ms_sm->sm_phys->smp_histogram[i]); in metaslab_group_histogram_remove()
1332 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_group_histogram_remove()
1333 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa) || in metaslab_group_histogram_remove()
1334 mc == spa_special_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_group_histogram_remove()
1336 mg->mg_histogram[i + ashift] -= in metaslab_group_histogram_remove()
1337 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_remove()
1338 mc->mc_histogram[i + ashift] -= in metaslab_group_histogram_remove()
1339 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_remove()
1341 mutex_exit(&mc->mc_lock); in metaslab_group_histogram_remove()
1342 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_remove()
1348 ASSERT0P(msp->ms_group); in metaslab_group_add()
1349 mutex_enter(&mg->mg_lock); in metaslab_group_add()
1350 msp->ms_group = mg; in metaslab_group_add()
1351 msp->ms_weight = 0; in metaslab_group_add()
1352 avl_add(&mg->mg_metaslab_tree, msp); in metaslab_group_add()
1353 mutex_exit(&mg->mg_lock); in metaslab_group_add()
1355 mutex_enter(&msp->ms_lock); in metaslab_group_add()
1357 mutex_exit(&msp->ms_lock); in metaslab_group_add()
1363 mutex_enter(&msp->ms_lock); in metaslab_group_remove()
1365 mutex_exit(&msp->ms_lock); in metaslab_group_remove()
1367 mutex_enter(&mg->mg_lock); in metaslab_group_remove()
1368 ASSERT(msp->ms_group == mg); in metaslab_group_remove()
1369 avl_remove(&mg->mg_metaslab_tree, msp); in metaslab_group_remove()
1371 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_group_remove()
1373 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_group_remove()
1374 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_group_remove()
1378 msp->ms_group = NULL; in metaslab_group_remove()
1379 mutex_exit(&mg->mg_lock); in metaslab_group_remove()
1385 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_sort_impl()
1386 ASSERT(MUTEX_HELD(&mg->mg_lock)); in metaslab_group_sort_impl()
1387 ASSERT(msp->ms_group == mg); in metaslab_group_sort_impl()
1389 avl_remove(&mg->mg_metaslab_tree, msp); in metaslab_group_sort_impl()
1390 msp->ms_weight = weight; in metaslab_group_sort_impl()
1391 avl_add(&mg->mg_metaslab_tree, msp); in metaslab_group_sort_impl()
1403 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_sort()
1405 mutex_enter(&mg->mg_lock); in metaslab_group_sort()
1407 mutex_exit(&mg->mg_lock); in metaslab_group_sort()
1419 vdev_t *vd = mg->mg_vd; in metaslab_group_fragmentation()
1424 for (int m = 0; m < vd->vdev_ms_count; m++) { in metaslab_group_fragmentation()
1425 metaslab_t *msp = vd->vdev_ms[m]; in metaslab_group_fragmentation()
1427 if (msp->ms_group != mg) in metaslab_group_fragmentation()
1430 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) in metaslab_group_fragmentation()
1434 free = (msp->ms_size - metaslab_allocated_space(msp)) / in metaslab_group_fragmentation()
1437 fragmentation += msp->ms_fragmentation * free; in metaslab_group_fragmentation()
1455 * Comparison function for the private size-ordered tree using 32-bit
1465 uint64_t rs_size1 = r1->rs_end - r1->rs_start; in metaslab_rangesize32_compare()
1466 uint64_t rs_size2 = r2->rs_end - r2->rs_start; in metaslab_rangesize32_compare()
1470 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); in metaslab_rangesize32_compare()
1474 * Comparison function for the private size-ordered tree using 64-bit
1484 uint64_t rs_size1 = r1->rs_end - r1->rs_start; in metaslab_rangesize64_compare()
1485 uint64_t rs_size2 = r2->rs_end - r2->rs_start; in metaslab_rangesize64_compare()
1489 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); in metaslab_rangesize64_compare()
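Both size comparators above finish with the same branch-free tie-break: TREE_CMP() returns -1, 0, or +1, so when the sizes differ the second term is multiplied by zero, and only equal-sized segments fall through to a comparison of their start offsets. That keeps equal-sized segments distinct keys in the size-ordered B-tree. Two worked cases:

/*
 *   8 KiB segment at 0x1000 vs 8 KiB segment at 0x9000:
 *       cmp = TREE_CMP(8K, 8K) = 0, !cmp = 1  ->  TREE_CMP(0x1000, 0x9000) = -1
 *   4 KiB segment vs 8 KiB segment (any offsets):
 *       cmp = TREE_CMP(4K, 8K) = -1, !cmp = 0 ->  result is -1 regardless of offset
 */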
1506 zfs_range_tree_t *rt = mssap->rt; in metaslab_size_sorted_add()
1507 metaslab_rt_arg_t *mrap = mssap->mra; in metaslab_size_sorted_add()
1517 metaslab_rt_arg_t *mrap = rt->rt_arg; in metaslab_size_tree_full_load()
1519 ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); in metaslab_size_tree_full_load()
1520 mrap->mra_floor_shift = 0; in metaslab_size_tree_full_load()
1536 * rely on using both a size-ordered zfs_range_tree_t and an array of in ZFS_BTREE_FIND_IN_BUF_FUNC()
1543 zfs_btree_t *size_tree = mrap->mra_bt; in ZFS_BTREE_FIND_IN_BUF_FUNC()
1548 switch (rt->rt_type) { in ZFS_BTREE_FIND_IN_BUF_FUNC()
1560 panic("Invalid range seg type %d", rt->rt_type); in ZFS_BTREE_FIND_IN_BUF_FUNC()
1563 mrap->mra_floor_shift = metaslab_by_size_min_shift; in ZFS_BTREE_FIND_IN_BUF_FUNC()
1571 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_destroy()
1581 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_add()
1583 if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < in metaslab_rt_add()
1584 (1ULL << mrap->mra_floor_shift)) in metaslab_rt_add()
1594 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_remove()
1596 if (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt) < (1ULL << in metaslab_rt_remove()
1597 mrap->mra_floor_shift)) in metaslab_rt_remove()
1607 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_vacate()
1634 zfs_btree_t *t = &msp->ms_allocatable_by_size; in metaslab_largest_allocatable()
1640 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_largest_allocatable()
1646 return (zfs_rs_get_end(rs, msp->ms_allocatable) - zfs_rs_get_start(rs, in metaslab_largest_allocatable()
1647 msp->ms_allocatable)); in metaslab_largest_allocatable()
1657 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_largest_unflushed_free()
1659 if (msp->ms_unflushed_frees == NULL) in metaslab_largest_unflushed_free()
1662 if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) in metaslab_largest_unflushed_free()
1663 metaslab_size_tree_full_load(msp->ms_unflushed_frees); in metaslab_largest_unflushed_free()
1664 zfs_range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, in metaslab_largest_unflushed_free()
1675 * bound for the largest currently-usable free segment in the in metaslab_largest_unflushed_free()
1680 * briefly and should eventually self-correct as frees are no longer in metaslab_largest_unflushed_free()
1692 uint64_t rstart = zfs_rs_get_start(rs, msp->ms_unflushed_frees); in metaslab_largest_unflushed_free()
1693 uint64_t rsize = zfs_rs_get_end(rs, msp->ms_unflushed_frees) - rstart; in metaslab_largest_unflushed_free()
1697 boolean_t found = zfs_range_tree_find_in(msp->ms_defer[t], in metaslab_largest_unflushed_free()
1702 rsize = start - rstart; in metaslab_largest_unflushed_free()
1708 boolean_t found = zfs_range_tree_find_in(msp->ms_freed, rstart, in metaslab_largest_unflushed_free()
1711 rsize = start - rstart; in metaslab_largest_unflushed_free()
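The clamping loop above shrinks the candidate segment whenever part of it also appears in a defer or freed tree, since those ranges are not usable yet; the result is only an upper bound, as the comment at file line 1675 says. A worked example with hypothetical offsets:

/*
 *   Largest unflushed-free segment: rstart = 0x10000, rsize = 0x20000.
 *   zfs_range_tree_find_in(ms_defer[t], ...) reports an overlap starting
 *   at start = 0x18000, so rsize is cut to 0x18000 - 0x10000 = 0x8000:
 *   only the leading 32 KiB can be counted toward the metaslab's
 *   largest-free-segment estimate for now.
 */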
1737 if (rs == NULL || zfs_rs_get_end(rs, rt) - in metaslab_block_find()
1749 * suitable block to allocate. This will search the specified B-tree looking
1757 *cursor = rt->rt_start; in metaslab_block_picker()
1758 zfs_btree_t *bt = &rt->rt_root; in metaslab_block_picker()
1768 while (rs != NULL && (zfs_rs_get_start(rs, rt) - first_found <= in metaslab_block_picker()
1772 *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, in metaslab_block_picker()
1783 return (-1ULL); in metaslab_block_picker()
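The metaslab_block_picker() fragments are the first-fit walk shared by the allocators: starting from a caller-supplied cursor it scans forward through the offset-ordered B-tree, but only within max_search bytes of the first candidate before giving up, so a badly fragmented region cannot stall allocation indefinitely. A condensed reading (sketch only; lines elided from this listing are summarized):

/*
 *   if (*cursor == 0) *cursor = rt->rt_start;
 *   rs = first segment at or after *cursor;
 *   while (rs != NULL && rs->start - first_found <= max_search) {
 *       if (segment can hold 'size') {
 *           *found_size = MIN(segment_end - offset, max_size);
 *           *cursor = offset + size;
 *           return (offset);              (first fit wins)
 *       }
 *       rs = next segment in offset order;
 *   }
 *   return (-1ULL);                       (caller falls back to another strategy)
 */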
1797 { "new-dynamic", metaslab_ndf_alloc },
1803 int a = ARRAY_SIZE(metaslab_allocators) - 1; in spa_find_allocator_byname()
1804 if (strcmp("new-dynamic", val) == 0) in spa_find_allocator_byname()
1805 return (-1); /* remove when ndf is working */ in spa_find_allocator_byname()
1806 for (; a >= 0; a--) { in spa_find_allocator_byname()
1810 return (-1); in spa_find_allocator_byname()
1818 spa->spa_active_allocator = a; in spa_set_allocator()
1825 return (spa->spa_active_allocator); in spa_get_allocator()
1887 uint64_t align = max_size & -max_size; in metaslab_df_alloc()
1888 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; in metaslab_df_alloc()
1889 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_df_alloc()
1890 uint_t free_pct = zfs_range_tree_space(rt) * 100 / msp->ms_size; in metaslab_df_alloc()
1893 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_df_alloc()
1901 align = size & -size; in metaslab_df_alloc()
1902 cursor = &msp->ms_lbas[highbit64(align) - 1]; in metaslab_df_alloc()
1903 offset = -1; in metaslab_df_alloc()
1907 if (max_size != size && offset == -1) { in metaslab_df_alloc()
1908 align = size & -size; in metaslab_df_alloc()
1909 cursor = &msp->ms_lbas[highbit64(align) - 1]; in metaslab_df_alloc()
1915 if (offset == -1) { in metaslab_df_alloc()
1917 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) in metaslab_df_alloc()
1918 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_df_alloc()
1922 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); in metaslab_df_alloc()
1926 rs = metaslab_block_find(&msp->ms_allocatable_by_size, in metaslab_df_alloc()
1927 rt, msp->ms_start, size, max_size, &where); in metaslab_df_alloc()
1932 *found_size = MIN(zfs_rs_get_end(rs, rt) - offset, in metaslab_df_alloc()
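Pieced together, metaslab_df_alloc() is the dynamic-fit policy the header comments at file lines 173-183 describe: cheap first-fit from a per-alignment cursor while the metaslab is healthy, switching to best-fit from the size-sorted tree once free space or the largest free segment drops below its threshold. A hedged restatement of that decision (metaslab_df_free_pct and metaslab_df_alloc_threshold are the upstream tunable names; they are not shown verbatim in this listing and the exact conditions sit on elided lines):

/*
 *   if (free_pct >= metaslab_df_free_pct &&
 *       metaslab_largest_allocatable(msp) >= metaslab_df_alloc_threshold)
 *           offset = first-fit via metaslab_block_picker(cursor, ...);
 *   else
 *           offset = -1;                      (skip straight to best-fit)
 *
 *   if (offset == -1)
 *       rs = metaslab_block_find(&msp->ms_allocatable_by_size, ...);   (best-fit)
 */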
1943 * Cursor fit block allocator -
1954 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_cf_alloc()
1955 zfs_btree_t *t = &msp->ms_allocatable_by_size; in metaslab_cf_alloc()
1956 uint64_t *cursor = &msp->ms_lbas[0]; in metaslab_cf_alloc()
1957 uint64_t *cursor_end = &msp->ms_lbas[1]; in metaslab_cf_alloc()
1960 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_cf_alloc()
1968 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_cf_alloc()
1970 if (rs == NULL || (zfs_rs_get_end(rs, rt) - in metaslab_cf_alloc()
1972 return (-1ULL); in metaslab_cf_alloc()
1979 *found_size = MIN(*cursor_end - offset, max_size); in metaslab_cf_alloc()
1987 * New dynamic fit allocator -
2004 zfs_btree_t *t = &msp->ms_allocatable->rt_root; in metaslab_ndf_alloc()
2005 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_ndf_alloc()
2010 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; in metaslab_ndf_alloc()
2013 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_ndf_alloc()
2016 return (-1ULL); in metaslab_ndf_alloc()
2022 if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < in metaslab_ndf_alloc()
2025 cursor = &msp->ms_lbas[hbit - 1]; in metaslab_ndf_alloc()
2031 if (rs == NULL || (zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) < in metaslab_ndf_alloc()
2033 t = &msp->ms_allocatable_by_size; in metaslab_ndf_alloc()
2045 if ((zfs_rs_get_end(rs, rt) - zfs_rs_get_start(rs, rt)) >= size) { in metaslab_ndf_alloc()
2046 *found_size = MIN(zfs_rs_get_end(rs, rt) - in metaslab_ndf_alloc()
2051 return (-1ULL); in metaslab_ndf_alloc()
2061 * Wait for any in-progress metaslab loads to complete.
2066 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load_wait()
2068 while (msp->ms_loading) { in metaslab_load_wait()
2069 ASSERT(!msp->ms_loaded); in metaslab_load_wait()
2070 cv_wait(&msp->ms_load_cv, &msp->ms_lock); in metaslab_load_wait()
2075 * Wait for any in-progress flushing to complete.
2080 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush_wait()
2082 while (msp->ms_flushing) in metaslab_flush_wait()
2083 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); in metaslab_flush_wait()
2095 return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); in metaslab_idx_func()
2101 return (msp->ms_allocated_space); in metaslab_allocated_space()
2105 * Verify that the space accounting on disk matches the in-core range_trees.
2110 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_verify_space()
2114 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_verify_space()
2115 ASSERT(!msp->ms_condensing); in metaslab_verify_space()
2123 * allocated space map. Calling this in non-syncing context in metaslab_verify_space()
2127 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || in metaslab_verify_space()
2128 !msp->ms_loaded) in metaslab_verify_space()
2136 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); in metaslab_verify_space()
2138 ASSERT3U(space_map_allocated(msp->ms_sm), >=, in metaslab_verify_space()
2139 zfs_range_tree_space(msp->ms_unflushed_frees)); in metaslab_verify_space()
2142 space_map_allocated(msp->ms_sm) + in metaslab_verify_space()
2143 zfs_range_tree_space(msp->ms_unflushed_allocs) - in metaslab_verify_space()
2144 zfs_range_tree_space(msp->ms_unflushed_frees)); in metaslab_verify_space()
2146 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); in metaslab_verify_space()
2154 zfs_range_tree_space(msp->ms_allocating[(txg + t) & in metaslab_verify_space()
2157 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, in metaslab_verify_space()
2158 msp->ms_allocating_total); in metaslab_verify_space()
2160 ASSERT3U(msp->ms_deferspace, ==, in metaslab_verify_space()
2161 zfs_range_tree_space(msp->ms_defer[0]) + in metaslab_verify_space()
2162 zfs_range_tree_space(msp->ms_defer[1])); in metaslab_verify_space()
2164 msp_free_space = zfs_range_tree_space(msp->ms_allocatable) + in metaslab_verify_space()
2165 allocating + msp->ms_deferspace + in metaslab_verify_space()
2166 zfs_range_tree_space(msp->ms_freed); in metaslab_verify_space()
2178 ASSERT(msp->ms_loaded); in metaslab_aux_histograms_clear()
2180 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); in metaslab_aux_histograms_clear()
2182 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); in metaslab_aux_histograms_clear()
2199 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); in metaslab_aux_histogram_add()
2201 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { in metaslab_aux_histogram_add()
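The folding at file line 2199 converts a range-tree histogram (one bucket per power-of-two segment size) into a space-map-shaped histogram that starts at the space map's shift and has fewer buckets, so the last bucket must absorb everything larger. A worked example, assuming shift = 9 and the usual SPACE_MAP_HISTOGRAM_SIZE of 32 (both values are assumptions, not shown in this listing):

/*
 *   i = 9   -> idx = 0,  (i - idx - shift) = 0:  counts copy one-for-one
 *   i = 30  -> idx = 21, (i - idx - shift) = 0:  still one-for-one
 *   i = 40  -> idx = 31 (last bucket), shift 0:  one-for-one
 *   i = 42  -> idx stays 31, (i - idx - shift) = 2: each segment in the
 *              2^42-byte range-tree bucket is counted as 4 entries of the
 *              largest space-map bucket, roughly preserving the amount of
 *              space represented.
 */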
2221 space_map_t *sm = msp->ms_sm; in metaslab_aux_histograms_update()
2230 if (msp->ms_loaded) { in metaslab_aux_histograms_update()
2233 metaslab_aux_histogram_add(msp->ms_synchist, in metaslab_aux_histograms_update()
2234 sm->sm_shift, msp->ms_freed); in metaslab_aux_histograms_update()
2237 metaslab_aux_histogram_add(msp->ms_deferhist[t], in metaslab_aux_histograms_update()
2238 sm->sm_shift, msp->ms_defer[t]); in metaslab_aux_histograms_update()
2242 metaslab_aux_histogram_add(msp->ms_synchist, in metaslab_aux_histograms_update()
2243 sm->sm_shift, msp->ms_freeing); in metaslab_aux_histograms_update()
2254 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_aux_histograms_update_done()
2255 space_map_t *sm = msp->ms_sm; in metaslab_aux_histograms_update_done()
2272 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, in metaslab_aux_histograms_update_done()
2273 sizeof (msp->ms_synchist)); in metaslab_aux_histograms_update_done()
2275 memset(msp->ms_deferhist[hist_index], 0, in metaslab_aux_histograms_update_done()
2276 sizeof (msp->ms_deferhist[hist_index])); in metaslab_aux_histograms_update_done()
2278 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); in metaslab_aux_histograms_update_done()
2289 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_verify_weight_and_frag()
2303 if (msp->ms_group == NULL) in metaslab_verify_weight_and_frag()
2308 * fragmentation and ms_max_size as is - there is nothing for in metaslab_verify_weight_and_frag()
2311 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_verify_weight_and_frag()
2312 if (vd->vdev_removing) in metaslab_verify_weight_and_frag()
2321 if (txg_list_member(&vd->vdev_ms_list, msp, t)) in metaslab_verify_weight_and_frag()
2326 * This verification checks that our in-memory state is consistent in metaslab_verify_weight_and_frag()
2327 * with what's on disk. If the pool is read-only then there aren't in metaslab_verify_weight_and_frag()
2328 * any changes and we just have the initially-loaded state. in metaslab_verify_weight_and_frag()
2330 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) in metaslab_verify_weight_and_frag()
2333 /* Some extra verification of the in-core tree, when possible. */ in metaslab_verify_weight_and_frag()
2334 if (msp->ms_loaded) { in metaslab_verify_weight_and_frag()
2335 zfs_range_tree_stat_verify(msp->ms_allocatable); in metaslab_verify_weight_and_frag()
2336 VERIFY(space_map_histogram_verify(msp->ms_sm, in metaslab_verify_weight_and_frag()
2337 msp->ms_allocatable)); in metaslab_verify_weight_and_frag()
2340 uint64_t weight = msp->ms_weight; in metaslab_verify_weight_and_frag()
2341 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_verify_weight_and_frag()
2342 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); in metaslab_verify_weight_and_frag()
2343 uint64_t frag = msp->ms_fragmentation; in metaslab_verify_weight_and_frag()
2344 uint64_t max_segsize = msp->ms_max_size; in metaslab_verify_weight_and_frag()
2346 msp->ms_weight = 0; in metaslab_verify_weight_and_frag()
2347 msp->ms_fragmentation = 0; in metaslab_verify_weight_and_frag()
2351 * not introduce any side-effects/mutations on the system's state. in metaslab_verify_weight_and_frag()
2362 msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; in metaslab_verify_weight_and_frag()
2364 VERIFY3U(max_segsize, ==, msp->ms_max_size); in metaslab_verify_weight_and_frag()
2370 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || in metaslab_verify_weight_and_frag()
2371 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { in metaslab_verify_weight_and_frag()
2372 msp->ms_fragmentation = frag; in metaslab_verify_weight_and_frag()
2373 msp->ms_weight = weight; in metaslab_verify_weight_and_frag()
2377 VERIFY3U(msp->ms_fragmentation, ==, frag); in metaslab_verify_weight_and_frag()
2378 VERIFY3U(msp->ms_weight, ==, weight); in metaslab_verify_weight_and_frag()
2398 tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; in metaslab_potentially_evict()
2401 &mc->mc_metaslab_txg_list); in metaslab_potentially_evict()
2403 multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); in metaslab_potentially_evict()
2409 &mc->mc_metaslab_txg_list, idx)); in metaslab_potentially_evict()
2411 metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); in metaslab_potentially_evict()
2413 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_potentially_evict()
2430 if (msp->ms_loading) { in metaslab_potentially_evict()
2444 * currently active because they are high-weight in metaslab_potentially_evict()
2448 mutex_enter(&msp->ms_lock); in metaslab_potentially_evict()
2449 if (msp->ms_allocator == -1 && msp->ms_sm != NULL && in metaslab_potentially_evict()
2450 msp->ms_allocating_total == 0) { in metaslab_potentially_evict()
2453 mutex_exit(&msp->ms_lock); in metaslab_potentially_evict()
2468 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load_impl()
2469 ASSERT(msp->ms_loading); in metaslab_load_impl()
2470 ASSERT(!msp->ms_condensing); in metaslab_load_impl()
2489 * metaslab_sync_done() would try to re-add later. in metaslab_load_impl()
2496 uint64_t length = msp->ms_synced_length; in metaslab_load_impl()
2497 mutex_exit(&msp->ms_lock); in metaslab_load_impl()
2501 if (msp->ms_allocatable->rt_arg == NULL) { in metaslab_load_impl()
2504 mrap = msp->ms_allocatable->rt_arg; in metaslab_load_impl()
2505 msp->ms_allocatable->rt_ops = NULL; in metaslab_load_impl()
2506 msp->ms_allocatable->rt_arg = NULL; in metaslab_load_impl()
2508 mrap->mra_bt = &msp->ms_allocatable_by_size; in metaslab_load_impl()
2509 mrap->mra_floor_shift = metaslab_by_size_min_shift; in metaslab_load_impl()
2511 if (msp->ms_sm != NULL) { in metaslab_load_impl()
2512 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, in metaslab_load_impl()
2515 /* Now, populate the size-sorted tree. */ in metaslab_load_impl()
2516 metaslab_rt_create(msp->ms_allocatable, mrap); in metaslab_load_impl()
2517 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; in metaslab_load_impl()
2518 msp->ms_allocatable->rt_arg = mrap; in metaslab_load_impl()
2521 arg.rt = msp->ms_allocatable; in metaslab_load_impl()
2523 zfs_range_tree_walk(msp->ms_allocatable, in metaslab_load_impl()
2527 * Add the size-sorted tree first, since we don't need to load in metaslab_load_impl()
2530 metaslab_rt_create(msp->ms_allocatable, mrap); in metaslab_load_impl()
2531 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; in metaslab_load_impl()
2532 msp->ms_allocatable->rt_arg = mrap; in metaslab_load_impl()
2538 zfs_range_tree_add(msp->ms_allocatable, in metaslab_load_impl()
2539 msp->ms_start, msp->ms_size); in metaslab_load_impl()
2541 if (msp->ms_new) { in metaslab_load_impl()
2550 msp->ms_unflushed_allocs)); in metaslab_load_impl()
2552 msp->ms_unflushed_frees)); in metaslab_load_impl()
2563 mutex_enter(&msp->ms_sync_lock); in metaslab_load_impl()
2564 mutex_enter(&msp->ms_lock); in metaslab_load_impl()
2566 ASSERT(!msp->ms_condensing); in metaslab_load_impl()
2567 ASSERT(!msp->ms_flushing); in metaslab_load_impl()
2570 mutex_exit(&msp->ms_sync_lock); in metaslab_load_impl()
2574 ASSERT3P(msp->ms_group, !=, NULL); in metaslab_load_impl()
2575 msp->ms_loaded = B_TRUE; in metaslab_load_impl()
2582 zfs_range_tree_walk(msp->ms_unflushed_allocs, in metaslab_load_impl()
2583 zfs_range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2584 zfs_range_tree_walk(msp->ms_unflushed_frees, in metaslab_load_impl()
2585 zfs_range_tree_add, msp->ms_allocatable); in metaslab_load_impl()
2587 ASSERT3P(msp->ms_group, !=, NULL); in metaslab_load_impl()
2588 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_load_impl()
2613 zfs_range_tree_walk(msp->ms_freed, in metaslab_load_impl()
2614 zfs_range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2632 zfs_range_tree_walk(msp->ms_defer[t], in metaslab_load_impl()
2633 zfs_range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2641 * has not yet been converted to use segment-based weight, we in metaslab_load_impl()
2648 uint64_t weight = msp->ms_weight; in metaslab_load_impl()
2649 uint64_t max_size = msp->ms_max_size; in metaslab_load_impl()
2652 ASSERT3U(weight, <=, msp->ms_weight); in metaslab_load_impl()
2653 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_load_impl()
2654 ASSERT3U(max_size, <=, msp->ms_max_size); in metaslab_load_impl()
2656 msp->ms_load_time = load_end; in metaslab_load_impl()
2665 msp->ms_group->mg_class->mc_name, in metaslab_load_impl()
2666 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_load_impl()
2667 (u_longlong_t)msp->ms_id, in metaslab_load_impl()
2668 (u_longlong_t)space_map_length(msp->ms_sm), in metaslab_load_impl()
2669 (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_allocs), in metaslab_load_impl()
2670 (u_longlong_t)zfs_range_tree_space(msp->ms_unflushed_frees), in metaslab_load_impl()
2671 (u_longlong_t)zfs_range_tree_space(msp->ms_freed), in metaslab_load_impl()
2672 (u_longlong_t)zfs_range_tree_space(msp->ms_defer[0]), in metaslab_load_impl()
2673 (u_longlong_t)zfs_range_tree_space(msp->ms_defer[1]), in metaslab_load_impl()
2674 (longlong_t)((load_start - msp->ms_unload_time) / 1000000), in metaslab_load_impl()
2675 (longlong_t)((load_end - load_start) / 1000000), in metaslab_load_impl()
2676 (u_longlong_t)msp->ms_max_size, in metaslab_load_impl()
2677 (u_longlong_t)msp->ms_max_size - max_size, in metaslab_load_impl()
2678 (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); in metaslab_load_impl()
2681 mutex_exit(&msp->ms_sync_lock); in metaslab_load_impl()
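The metaslab_load_impl() fragments span a lot of elided code; the sequence they sketch is: populate ms_allocatable from the on-disk space map (or mark the whole metaslab free when there is none), attach the size-sorted B-tree, replay the unflushed log-spacemap changes, strip out ranges that are freed but not yet available, and finally recompute the weight and ms_max_size. A condensed outline of that order:

/*
 *   1. space_map_load_length(ms_sm, ms_allocatable, length), or
 *      zfs_range_tree_add(ms_allocatable, ms_start, ms_size) if no space map
 *   2. metaslab_rt_create(ms_allocatable, mrap)  -> size-sorted B-tree attached
 *   3. with ms_sync_lock and ms_lock held, ms_loaded = B_TRUE, then:
 *        ms_allocatable -= ms_unflushed_allocs
 *        ms_allocatable += ms_unflushed_frees
 *        ms_allocatable -= ms_freed          (not yet available for allocation)
 *        ms_allocatable -= ms_defer[0..1]    (still deferred)
 *   4. recompute ms_weight and ms_max_size, log the load statistics
 */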
2688 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load()
2695 if (msp->ms_loaded) in metaslab_load()
2697 VERIFY(!msp->ms_loading); in metaslab_load()
2698 ASSERT(!msp->ms_condensing); in metaslab_load()
2706 msp->ms_loading = B_TRUE; in metaslab_load()
2709 * Wait for any in-progress flushing to finish as we drop the ms_lock in metaslab_load()
2713 if (msp->ms_flushing) in metaslab_load()
2721 ASSERT(!msp->ms_loaded); in metaslab_load()
2728 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == in metaslab_load()
2729 msp->ms_group->mg_class) { in metaslab_load()
2730 metaslab_potentially_evict(msp->ms_group->mg_class); in metaslab_load()
2735 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load()
2736 msp->ms_loading = B_FALSE; in metaslab_load()
2737 cv_broadcast(&msp->ms_load_cv); in metaslab_load()
2745 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_unload()
2752 if (!msp->ms_loaded) in metaslab_unload()
2755 zfs_range_tree_vacate(msp->ms_allocatable, NULL, NULL); in metaslab_unload()
2756 msp->ms_loaded = B_FALSE; in metaslab_unload()
2757 msp->ms_unload_time = gethrtime(); in metaslab_unload()
2759 msp->ms_activation_weight = 0; in metaslab_unload()
2760 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; in metaslab_unload()
2762 if (msp->ms_group != NULL) { in metaslab_unload()
2763 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_unload()
2765 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_unload()
2766 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_unload()
2770 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unload()
2776 msp->ms_group->mg_class->mc_name, in metaslab_unload()
2777 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_unload()
2778 (u_longlong_t)msp->ms_id, in metaslab_unload()
2779 (u_longlong_t)msp->ms_weight, in metaslab_unload()
2780 (u_longlong_t)msp->ms_selected_txg, in metaslab_unload()
2781 (u_longlong_t)(NSEC2SEC(msp->ms_unload_time) - in metaslab_unload()
2782 msp->ms_selected_time), in metaslab_unload()
2783 (u_longlong_t)msp->ms_alloc_txg, in metaslab_unload()
2784 (u_longlong_t)(msp->ms_unload_time - in metaslab_unload()
2785 msp->ms_load_time) / 1000 / 1000, in metaslab_unload()
2786 (u_longlong_t)msp->ms_max_size); in metaslab_unload()
2793 * loaded ones have it calculated from their in-core range tree in metaslab_unload()
2795 * available in-core, whether it is loaded or not. in metaslab_unload()
2801 if (msp->ms_group != NULL) in metaslab_unload()
2806 * We want to optimize the memory use of the per-metaslab range
2808 * units of sectors, zero-indexing from the start of the metaslab. If
2809 * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
2816 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && in metaslab_calculate_range_tree_type()
2818 *shift = vdev->vdev_ashift; in metaslab_calculate_range_tree_type()
2819 *start = msp->ms_start; in metaslab_calculate_range_tree_type()
2831 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_set_selected_txg()
2832 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_set_selected_txg()
2834 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_set_selected_txg()
2835 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_set_selected_txg()
2837 msp->ms_selected_txg = txg; in metaslab_set_selected_txg()
2838 msp->ms_selected_time = gethrestime_sec(); in metaslab_set_selected_txg()
2849 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); in metaslab_space_update()
2850 ASSERT(vd->vdev_ms_count != 0); in metaslab_space_update()
2860 vdev_t *vd = mg->mg_vd; in metaslab_init()
2861 spa_t *spa = vd->vdev_spa; in metaslab_init()
2862 objset_t *mos = spa->spa_meta_objset; in metaslab_init()
2867 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_init()
2868 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_init()
2869 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); in metaslab_init()
2870 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); in metaslab_init()
2871 multilist_link_init(&ms->ms_class_txg_node); in metaslab_init()
2873 ms->ms_id = id; in metaslab_init()
2874 ms->ms_start = id << vd->vdev_ms_shift; in metaslab_init()
2875 ms->ms_size = 1ULL << vd->vdev_ms_shift; in metaslab_init()
2876 ms->ms_allocator = -1; in metaslab_init()
2877 ms->ms_new = B_TRUE; in metaslab_init()
2879 vdev_ops_t *ops = vd->vdev_ops; in metaslab_init()
2880 if (ops->vdev_op_metaslab_init != NULL) in metaslab_init()
2881 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); in metaslab_init()
2895 if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && in metaslab_init()
2896 !spa->spa_read_spacemaps)) { in metaslab_init()
2897 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, in metaslab_init()
2898 ms->ms_size, vd->vdev_ashift); in metaslab_init()
2905 ASSERT(ms->ms_sm != NULL); in metaslab_init()
2906 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); in metaslab_init()
2913 ms->ms_allocatable = zfs_range_tree_create_flags( in metaslab_init()
2917 ms->ms_allocating[t] = zfs_range_tree_create_flags( in metaslab_init()
2922 ms->ms_freeing = zfs_range_tree_create_flags( in metaslab_init()
2925 ms->ms_freed = zfs_range_tree_create_flags( in metaslab_init()
2929 ms->ms_defer[t] = zfs_range_tree_create_flags( in metaslab_init()
2933 ms->ms_checkpointing = zfs_range_tree_create_flags( in metaslab_init()
2936 ms->ms_unflushed_allocs = zfs_range_tree_create_flags( in metaslab_init()
2941 mrap->mra_bt = &ms->ms_unflushed_frees_by_size; in metaslab_init()
2942 mrap->mra_floor_shift = metaslab_by_size_min_shift; in metaslab_init()
2943 ms->ms_unflushed_frees = zfs_range_tree_create_flags( in metaslab_init()
2947 ms->ms_trim = zfs_range_tree_create_flags( in metaslab_init()
2965 metaslab_space_update(vd, mg->mg_class, in metaslab_init()
2982 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_fini_flush_data()
2985 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), in metaslab_fini_flush_data()
2991 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_fini_flush_data()
2992 avl_remove(&spa->spa_metaslabs_by_flushed, msp); in metaslab_fini_flush_data()
2993 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_fini_flush_data()
3003 return ((zfs_range_tree_numsegs(ms->ms_unflushed_allocs) + in metaslab_unflushed_changes_memused()
3004 zfs_range_tree_numsegs(ms->ms_unflushed_frees)) * in metaslab_unflushed_changes_memused()
3005 ms->ms_unflushed_allocs->rt_root.bt_elem_size); in metaslab_unflushed_changes_memused()
3011 metaslab_group_t *mg = msp->ms_group; in metaslab_fini()
3012 vdev_t *vd = mg->mg_vd; in metaslab_fini()
3013 spa_t *spa = vd->vdev_spa; in metaslab_fini()
3019 mutex_enter(&msp->ms_lock); in metaslab_fini()
3020 VERIFY0P(msp->ms_group); in metaslab_fini()
3027 if (!msp->ms_new) { in metaslab_fini()
3028 metaslab_space_update(vd, mg->mg_class, in metaslab_fini()
3029 -metaslab_allocated_space(msp), 0, -msp->ms_size); in metaslab_fini()
3032 space_map_close(msp->ms_sm); in metaslab_fini()
3033 msp->ms_sm = NULL; in metaslab_fini()
3037 zfs_range_tree_destroy(msp->ms_allocatable); in metaslab_fini()
3038 zfs_range_tree_destroy(msp->ms_freeing); in metaslab_fini()
3039 zfs_range_tree_destroy(msp->ms_freed); in metaslab_fini()
3041 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_fini()
3043 spa->spa_unflushed_stats.sus_memused -= in metaslab_fini()
3045 zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_fini()
3046 zfs_range_tree_destroy(msp->ms_unflushed_allocs); in metaslab_fini()
3047 zfs_range_tree_destroy(msp->ms_checkpointing); in metaslab_fini()
3048 zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_fini()
3049 zfs_range_tree_destroy(msp->ms_unflushed_frees); in metaslab_fini()
3052 zfs_range_tree_destroy(msp->ms_allocating[t]); in metaslab_fini()
3055 zfs_range_tree_destroy(msp->ms_defer[t]); in metaslab_fini()
3057 ASSERT0(msp->ms_deferspace); in metaslab_fini()
3060 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); in metaslab_fini()
3062 zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); in metaslab_fini()
3063 zfs_range_tree_destroy(msp->ms_trim); in metaslab_fini()
3065 mutex_exit(&msp->ms_lock); in metaslab_fini()
3066 cv_destroy(&msp->ms_load_cv); in metaslab_fini()
3067 cv_destroy(&msp->ms_flush_cv); in metaslab_fini()
3068 mutex_destroy(&msp->ms_lock); in metaslab_fini()
3069 mutex_destroy(&msp->ms_sync_lock); in metaslab_fini()
3070 ASSERT3U(msp->ms_allocator, ==, -1); in metaslab_fini()
3129 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_set_fragmentation()
3136 msp->ms_fragmentation = ZFS_FRAG_INVALID; in metaslab_set_fragmentation()
3144 if (msp->ms_sm == NULL) { in metaslab_set_fragmentation()
3145 msp->ms_fragmentation = 0; in metaslab_set_fragmentation()
3153 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { in metaslab_set_fragmentation()
3155 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_set_fragmentation()
3167 msp->ms_condense_wanted = B_TRUE; in metaslab_set_fragmentation()
3171 (u_longlong_t)msp->ms_id, in metaslab_set_fragmentation()
3172 (u_longlong_t)vd->vdev_id); in metaslab_set_fragmentation()
3174 msp->ms_fragmentation = ZFS_FRAG_INVALID; in metaslab_set_fragmentation()
3180 uint8_t shift = msp->ms_sm->sm_shift; in metaslab_set_fragmentation()
3182 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, in metaslab_set_fragmentation()
3183 FRAGMENTATION_TABLE_SIZE - 1); in metaslab_set_fragmentation()
3185 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) in metaslab_set_fragmentation()
3188 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); in metaslab_set_fragmentation()
3199 msp->ms_fragmentation = fragmentation; in metaslab_set_fragmentation()
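/*
 * Illustrative sketch, not part of metaslab.c: the fragmentation value set
 * above is a free-space-weighted average over a lookup table indexed by
 * free-segment size class. The helper below mirrors that idea with a
 * hypothetical frag_table[]; the names, the min_shift parameter (standing in
 * for SPA_MINBLOCKSHIFT) and the bucket handling are assumptions for
 * illustration, and the fixed-width integer types are taken as already
 * available in this file.
 */
static uint64_t
example_weighted_fragmentation(const uint64_t *histogram, int nbuckets,
    const uint8_t *frag_table, int table_size, uint8_t shift, uint8_t min_shift)
{
	uint64_t fragmentation = 0, total = 0;

	for (int i = 0; i < nbuckets; i++) {
		if (histogram[i] == 0)
			continue;	/* empty bucket contributes nothing */

		/* Map this histogram bucket to a size class in the table. */
		int idx = shift - min_shift + i;
		if (idx > table_size - 1)
			idx = table_size - 1;

		/* Free space represented by the bucket: count << (i + shift). */
		uint64_t space = histogram[i] << (i + shift);
		fragmentation += space * frag_table[idx];
		total += space;
	}
	return (total == 0 ? 0 : fragmentation / total);
}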
3203 * Compute a weight -- a selection preference value -- for the given metaslab.
3210 metaslab_group_t *mg = msp->ms_group; in metaslab_space_weight()
3211 vdev_t *vd = mg->mg_vd; in metaslab_space_weight()
3214 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_space_weight()
3219 space = msp->ms_size - metaslab_allocated_space(msp); in metaslab_space_weight()
3222 msp->ms_fragmentation != ZFS_FRAG_INVALID) { in metaslab_space_weight()
3230 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; in metaslab_space_weight()
3247 * than the inner zones by the ratio of outer to inner track diameter, in metaslab_space_weight()
3253 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { in metaslab_space_weight()
3254 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; in metaslab_space_weight()
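/*
 * Worked endpoints of the expression above, for illustration: the outermost
 * metaslab (ms_id == 0) keeps 2 * weight, while the innermost
 * (ms_id approaching vdev_ms_count) decays toward 1 * weight, reflecting the
 * roughly 2:1 outer-to-inner bandwidth ratio assumed for rotating disks.
 */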
3264 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && in metaslab_space_weight()
3265 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { in metaslab_space_weight()
3266 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_space_weight()
3274 * Return the weight of the specified metaslab, according to the segment-based
3285 ASSERT(msp->ms_loaded); in metaslab_weight_from_range_tree()
3287 for (int i = ZFS_RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; in metaslab_weight_from_range_tree()
3288 i--) { in metaslab_weight_from_range_tree()
3289 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; in metaslab_weight_from_range_tree()
3290 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; in metaslab_weight_from_range_tree()
3293 segments += msp->ms_allocatable->rt_histogram[i]; in metaslab_weight_from_range_tree()
3316 * Calculate the weight based on the on-disk histogram. Should be applied
3317  * only to unloaded metaslabs (i.e. no incoming allocations) in order to in metaslab_weight_from_spacemap()
3318 * give results consistent with the on-disk state
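/*
 * Illustration with assumed numbers: if on-disk bucket 7 reports 12 segments
 * but all 12 are also accounted for in ms_synchist/ms_deferhist, the bucket
 * is treated as empty and the weight index falls to the next lower bucket
 * that still has segments left after the subtraction performed below.
 */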
3323 space_map_t *sm = msp->ms_sm; in metaslab_weight_from_spacemap()
3324 ASSERT(!msp->ms_loaded); in metaslab_weight_from_spacemap()
3327 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); in metaslab_weight_from_spacemap()
3339 deferspace_histogram[i] += msp->ms_synchist[i]; in metaslab_weight_from_spacemap()
3342 deferspace_histogram[i] += msp->ms_deferhist[t][i]; in metaslab_weight_from_spacemap()
3347 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { in metaslab_weight_from_spacemap()
3348 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, in metaslab_weight_from_spacemap()
3351 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; in metaslab_weight_from_spacemap()
3354 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); in metaslab_weight_from_spacemap()
3363 * Compute a segment-based weight for the specified metaslab. The weight
3370 metaslab_group_t *mg = msp->ms_group; in metaslab_segment_weight()
3372 uint8_t shift = mg->mg_vd->vdev_ashift; in metaslab_segment_weight()
3374 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_segment_weight()
3380 int idx = highbit64(msp->ms_size) - 1; in metaslab_segment_weight()
3381 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; in metaslab_segment_weight()
3387 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); in metaslab_segment_weight()
3395 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); in metaslab_segment_weight()
3400 if (metaslab_allocated_space(msp) == msp->ms_size) in metaslab_segment_weight()
3407 if (msp->ms_loaded) { in metaslab_segment_weight()
3418 if (msp->ms_activation_weight != 0 && weight != 0) in metaslab_segment_weight()
3419 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); in metaslab_segment_weight()
3428 * weight. For segment-based weighting we can determine the maximum
3429 * allocation based on the index encoded in its value. For space-based
3430 * weights we rely on the entire weight (excluding the weight-type bit).
3441 if (unlikely(msp->ms_new)) in metaslab_should_allocate()
3451 if (msp->ms_loaded || in metaslab_should_allocate()
3452 (msp->ms_max_size != 0 && !try_hard && gethrtime() < in metaslab_should_allocate()
3453 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) in metaslab_should_allocate()
3454 return (msp->ms_max_size >= asize); in metaslab_should_allocate()
3457 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { in metaslab_should_allocate()
3465 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); in metaslab_should_allocate()
3468 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); in metaslab_should_allocate()
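/*
 * Example with assumed numbers for the comparison above: a segment-based
 * weight whose index is 16 advertises free segments in the [64K, 128K)
 * range, so any asize below 128K is worth attempting; a space-based weight
 * is instead compared directly against the free space it encodes (with the
 * weight-type bit masked off).
 */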
3477 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_weight()
3478 spa_t *spa = vd->vdev_spa; in metaslab_weight()
3481 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_weight()
3495 if (msp->ms_loaded) { in metaslab_weight()
3496 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_weight()
3498 msp->ms_max_size = MAX(msp->ms_max_size, in metaslab_weight()
3503 * Segment-based weighting requires space map histogram support. in metaslab_weight()
3507 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == in metaslab_weight()
3519 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_recalculate_weight_and_sort()
3522 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_recalculate_weight_and_sort()
3523 metaslab_group_sort(msp->ms_group, msp, in metaslab_recalculate_weight_and_sort()
3531 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_activate_allocator()
3532 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_activate_allocator()
3539 ASSERT0(msp->ms_activation_weight); in metaslab_activate_allocator()
3540 msp->ms_activation_weight = msp->ms_weight; in metaslab_activate_allocator()
3541 metaslab_group_sort(mg, msp, msp->ms_weight | in metaslab_activate_allocator()
3547 &mga->mga_primary : &mga->mga_secondary); in metaslab_activate_allocator()
3549 mutex_enter(&mg->mg_lock); in metaslab_activate_allocator()
3551 mutex_exit(&mg->mg_lock); in metaslab_activate_allocator()
3556 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_activate_allocator()
3557 msp->ms_allocator = allocator; in metaslab_activate_allocator()
3558 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); in metaslab_activate_allocator()
3560 ASSERT0(msp->ms_activation_weight); in metaslab_activate_allocator()
3561 msp->ms_activation_weight = msp->ms_weight; in metaslab_activate_allocator()
3563 msp->ms_weight | activation_weight); in metaslab_activate_allocator()
3564 mutex_exit(&mg->mg_lock); in metaslab_activate_allocator()
3572 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_activate()
3585 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { in metaslab_activate()
3586 ASSERT(msp->ms_loaded); in metaslab_activate()
3592 metaslab_group_sort(msp->ms_group, msp, 0); in metaslab_activate()
3611 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { in metaslab_activate()
3612 if (msp->ms_allocator != allocator) in metaslab_activate()
3615 if ((msp->ms_weight & activation_weight) == 0) in metaslab_activate()
3619 msp->ms_primary); in metaslab_activate()
3630 if (msp->ms_weight == 0) { in metaslab_activate()
3631 ASSERT0(zfs_range_tree_space(msp->ms_allocatable)); in metaslab_activate()
3635 if ((error = metaslab_activate_allocator(msp->ms_group, msp, in metaslab_activate()
3640 ASSERT(msp->ms_loaded); in metaslab_activate()
3641 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_activate()
3650 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_passivate_allocator()
3651 ASSERT(msp->ms_loaded); in metaslab_passivate_allocator()
3653 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { in metaslab_passivate_allocator()
3658 mutex_enter(&mg->mg_lock); in metaslab_passivate_allocator()
3659 ASSERT3P(msp->ms_group, ==, mg); in metaslab_passivate_allocator()
3660 ASSERT3S(0, <=, msp->ms_allocator); in metaslab_passivate_allocator()
3661 ASSERT3U(msp->ms_allocator, <, mg->mg_class->mc_spa->spa_alloc_count); in metaslab_passivate_allocator()
3663 metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; in metaslab_passivate_allocator()
3664 if (msp->ms_primary) { in metaslab_passivate_allocator()
3665 ASSERT3P(mga->mga_primary, ==, msp); in metaslab_passivate_allocator()
3666 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_passivate_allocator()
3667 mga->mga_primary = NULL; in metaslab_passivate_allocator()
3669 ASSERT3P(mga->mga_secondary, ==, msp); in metaslab_passivate_allocator()
3670 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_passivate_allocator()
3671 mga->mga_secondary = NULL; in metaslab_passivate_allocator()
3673 msp->ms_allocator = -1; in metaslab_passivate_allocator()
3675 mutex_exit(&mg->mg_lock); in metaslab_passivate_allocator()
3688 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || in metaslab_passivate()
3690 zfs_range_tree_space(msp->ms_allocatable) == 0); in metaslab_passivate()
3693 ASSERT(msp->ms_activation_weight != 0); in metaslab_passivate()
3694 msp->ms_activation_weight = 0; in metaslab_passivate()
3695 metaslab_passivate_allocator(msp->ms_group, msp, weight); in metaslab_passivate()
3696 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_passivate()
3700 * Segment-based metaslabs are activated once and remain active until
3701 * we either fail an allocation attempt (similar to space-based metaslabs)
3713 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_segment_may_passivate()
3715 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) in metaslab_segment_may_passivate()
3724 zfs_range_tree_space(msp->ms_allocatable) * 15 / 16) in metaslab_segment_may_passivate()
3729 * information that is accessible to us is the in-core range tree in metaslab_segment_may_passivate()
3733 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); in metaslab_segment_may_passivate()
3736 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) in metaslab_segment_may_passivate()
3744 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_preload()
3745 spa_t *spa = mc->mc_spa; in metaslab_preload()
3748 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); in metaslab_preload()
3750 mutex_enter(&msp->ms_lock); in metaslab_preload()
3753 mutex_exit(&msp->ms_lock); in metaslab_preload()
3760 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_group_preload()
3762 avl_tree_t *t = &mg->mg_metaslab_tree; in metaslab_group_preload()
3768 mutex_enter(&mg->mg_lock); in metaslab_group_preload()
3774 ASSERT3P(msp->ms_group, ==, mg); in metaslab_group_preload()
3782 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { in metaslab_group_preload()
3786 VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, in metaslab_group_preload()
3787 msp, TQ_SLEEP | (m <= spa->spa_alloc_count ? TQ_FRONT : 0)) in metaslab_group_preload()
3790 mutex_exit(&mg->mg_lock); in metaslab_group_preload()
3794 * Determine if the space map's on-disk footprint is past our tolerance for
3801  * 2. Condense if the on-disk space map representation is at least in metaslab_should_condense()
3803  *    (e.g. with zfs_condense_pct = 110 and an optimal in-core size of 1MB, condense once the on-disk size reaches 1.1MB). in metaslab_should_condense()
3805 * 3. Do not condense if the on-disk size of the space map does not actually
3808 * Unfortunately, we cannot compute the on-disk size of the space map in this
3811 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3817 space_map_t *sm = msp->ms_sm; in metaslab_should_condense()
3818 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_should_condense()
3819 uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; in metaslab_should_condense()
3821 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_should_condense()
3822 ASSERT(msp->ms_loaded); in metaslab_should_condense()
3824 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); in metaslab_should_condense()
3830 if (zfs_range_tree_numsegs(msp->ms_allocatable) == 0 || in metaslab_should_condense()
3831 msp->ms_condense_wanted) in metaslab_should_condense()
3834 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); in metaslab_should_condense()
3837 msp->ms_allocatable, SM_NO_VDEVID); in metaslab_should_condense()
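/*
 * Illustrative numbers, assumed for the example: with zfs_condense_pct = 110
 * and an estimated optimal size of 1 MB, condensing becomes worthwhile once
 * the on-disk space map has grown to about 1.1 MB and it also spans more
 * than zfs_metaslab_condense_block_threshold records of the record_size
 * computed above.
 */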
3844 * Condense the on-disk space map representation to its minimized form.
3848 * the pool-wide log spacemaps; thus this is effectively a superset of
3855 space_map_t *sm = msp->ms_sm; in metaslab_condense()
3857 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_condense()
3859 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_condense()
3860 ASSERT(msp->ms_loaded); in metaslab_condense()
3861 ASSERT(msp->ms_sm != NULL); in metaslab_condense()
3906 ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ in metaslab_condense()
3910 (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, in metaslab_condense()
3911 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_condense()
3912 spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), in metaslab_condense()
3913 (u_longlong_t)zfs_range_tree_numsegs(msp->ms_allocatable), in metaslab_condense()
3914 msp->ms_condense_wanted ? "TRUE" : "FALSE"); in metaslab_condense()
3916 msp->ms_condense_wanted = B_FALSE; in metaslab_condense()
3920 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, in metaslab_condense()
3926 metaslab_rt_name(msp->ms_group, msp, "condense_tree")); in metaslab_condense()
3929 zfs_range_tree_walk(msp->ms_defer[t], in metaslab_condense()
3934 zfs_range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], in metaslab_condense()
3938 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_condense()
3940 spa->spa_unflushed_stats.sus_memused -= in metaslab_condense()
3942 zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_condense()
3943 zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_condense()
3953 msp->ms_condensing = B_TRUE; in metaslab_condense()
3955 mutex_exit(&msp->ms_lock); in metaslab_condense()
3956 uint64_t object = space_map_object(msp->ms_sm); in metaslab_condense()
3965 if (space_map_object(msp->ms_sm) != object) { in metaslab_condense()
3966 object = space_map_object(msp->ms_sm); in metaslab_condense()
3967 dmu_write(spa->spa_meta_objset, in metaslab_condense()
3968 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * in metaslab_condense()
3969 msp->ms_id, sizeof (uint64_t), &object, tx, in metaslab_condense()
3987 metaslab_rt_name(msp->ms_group, msp, "tmp_tree")); in metaslab_condense()
3988 zfs_range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); in metaslab_condense()
3990 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); in metaslab_condense()
3997 mutex_enter(&msp->ms_lock); in metaslab_condense()
3999 msp->ms_condensing = B_FALSE; in metaslab_condense()
4006 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unflushed_add()
4008 ASSERT(msp->ms_sm != NULL); in metaslab_unflushed_add()
4009 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_unflushed_add()
4010 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_unflushed_add()
4012 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_unflushed_add()
4015 avl_add(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_add()
4016 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_unflushed_add()
4025 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unflushed_bump()
4027 ASSERT(msp->ms_sm != NULL); in metaslab_unflushed_bump()
4029 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); in metaslab_unflushed_bump()
4030 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_unflushed_bump()
4031 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_unflushed_bump()
4033 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); in metaslab_unflushed_bump()
4038 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_unflushed_bump()
4039 avl_remove(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_bump()
4042 avl_add(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_bump()
4043 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_unflushed_bump()
4060 * all the contents of the pool-wide spacemap log). Updates the metaslab's
4061 * metadata and any pool-wide related log space map data (e.g. summary,
4067 metaslab_group_t *mg = msp->ms_group; in metaslab_flush_update()
4068 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_flush_update()
4070 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush_update()
4079 msp->ms_synced_length = space_map_length(msp->ms_sm); in metaslab_flush_update()
4083 * feature being active. In that case this is a no-op. in metaslab_flush_update()
4095 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_flush()
4097 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush()
4101 ASSERT(msp->ms_sm != NULL); in metaslab_flush()
4103 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); in metaslab_flush()
4118 if (msp->ms_loading) in metaslab_flush()
4135 if (msp->ms_loaded && metaslab_should_condense(msp)) { in metaslab_flush()
4136 metaslab_group_t *mg = msp->ms_group; in metaslab_flush()
4144 metaslab_class_histogram_verify(mg->mg_class); in metaslab_flush()
4149 space_map_histogram_clear(msp->ms_sm); in metaslab_flush()
4150 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); in metaslab_flush()
4151 ASSERT(zfs_range_tree_is_empty(msp->ms_freed)); in metaslab_flush()
4153 space_map_histogram_add(msp->ms_sm, in metaslab_flush()
4154 msp->ms_defer[t], tx); in metaslab_flush()
4160 metaslab_class_histogram_verify(mg->mg_class); in metaslab_flush()
4175 msp->ms_flushing = B_TRUE; in metaslab_flush()
4176 uint64_t sm_len_before = space_map_length(msp->ms_sm); in metaslab_flush()
4178 mutex_exit(&msp->ms_lock); in metaslab_flush()
4179 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, in metaslab_flush()
4181 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, in metaslab_flush()
4183 mutex_enter(&msp->ms_lock); in metaslab_flush()
4185 uint64_t sm_len_after = space_map_length(msp->ms_sm); in metaslab_flush()
4191 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_flush()
4192 (u_longlong_t)msp->ms_id, in metaslab_flush()
4194 msp->ms_unflushed_allocs), in metaslab_flush()
4196 msp->ms_unflushed_frees), in metaslab_flush()
4197 (u_longlong_t)(sm_len_after - sm_len_before)); in metaslab_flush()
4200 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_flush()
4202 spa->spa_unflushed_stats.sus_memused -= in metaslab_flush()
4204 zfs_range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_flush()
4205 zfs_range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_flush()
4215 msp->ms_flushing = B_FALSE; in metaslab_flush()
4216 cv_broadcast(&msp->ms_flush_cv); in metaslab_flush()
4226 metaslab_group_t *mg = msp->ms_group; in metaslab_sync()
4227 vdev_t *vd = mg->mg_vd; in metaslab_sync()
4228 spa_t *spa = vd->vdev_spa; in metaslab_sync()
4230 zfs_range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; in metaslab_sync()
4233 ASSERT(!vd->vdev_ishole); in metaslab_sync()
4238 if (msp->ms_new) { in metaslab_sync()
4240 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync()
4241 ASSERT0(zfs_range_tree_space(msp->ms_freed)); in metaslab_sync()
4242 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync()
4243 ASSERT0(zfs_range_tree_space(msp->ms_trim)); in metaslab_sync()
4259 zfs_range_tree_is_empty(msp->ms_freeing) && in metaslab_sync()
4260 zfs_range_tree_is_empty(msp->ms_checkpointing) && in metaslab_sync()
4261 !(msp->ms_loaded && msp->ms_condense_wanted && in metaslab_sync()
4288 if (msp->ms_sm == NULL) { in metaslab_sync()
4295 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * in metaslab_sync()
4296 msp->ms_id, sizeof (uint64_t), &new_object, tx, in metaslab_sync()
4299 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, in metaslab_sync()
4300 msp->ms_start, msp->ms_size, vd->vdev_ashift)); in metaslab_sync()
4301 ASSERT(msp->ms_sm != NULL); in metaslab_sync()
4303 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_sync()
4304 ASSERT(zfs_range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_sync()
4308 if (!zfs_range_tree_is_empty(msp->ms_checkpointing) && in metaslab_sync()
4309 vd->vdev_checkpoint_sm == NULL) { in metaslab_sync()
4316 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, in metaslab_sync()
4317 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); in metaslab_sync()
4318 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); in metaslab_sync()
4325 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, in metaslab_sync()
4326 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, in metaslab_sync()
4330 mutex_enter(&msp->ms_sync_lock); in metaslab_sync()
4331 mutex_enter(&msp->ms_lock); in metaslab_sync()
4339 metaslab_class_histogram_verify(mg->mg_class); in metaslab_sync()
4342 if (spa->spa_sync_pass == 1 && msp->ms_loaded && in metaslab_sync()
4349 * open-context (ZIL) for future TXGs do not block. in metaslab_sync()
4351 mutex_exit(&msp->ms_lock); in metaslab_sync()
4361 vd->vdev_id, tx); in metaslab_sync()
4362 space_map_write(log_sm, msp->ms_freeing, SM_FREE, in metaslab_sync()
4363 vd->vdev_id, tx); in metaslab_sync()
4364 mutex_enter(&msp->ms_lock); in metaslab_sync()
4366 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_sync()
4368 spa->spa_unflushed_stats.sus_memused -= in metaslab_sync()
4371 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); in metaslab_sync()
4372 zfs_range_tree_remove_xor_add(msp->ms_freeing, in metaslab_sync()
4373 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); in metaslab_sync()
4374 spa->spa_unflushed_stats.sus_memused += in metaslab_sync()
4379 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, in metaslab_sync()
4381 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, in metaslab_sync()
4383 mutex_enter(&msp->ms_lock); in metaslab_sync()
4386 msp->ms_allocated_space += zfs_range_tree_space(alloctree); in metaslab_sync()
4387 ASSERT3U(msp->ms_allocated_space, >=, in metaslab_sync()
4388 zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync()
4389 msp->ms_allocated_space -= zfs_range_tree_space(msp->ms_freeing); in metaslab_sync()
4391 if (!zfs_range_tree_is_empty(msp->ms_checkpointing)) { in metaslab_sync()
4393 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); in metaslab_sync()
4401 mutex_exit(&msp->ms_lock); in metaslab_sync()
4402 space_map_write(vd->vdev_checkpoint_sm, in metaslab_sync()
4403 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); in metaslab_sync()
4404 mutex_enter(&msp->ms_lock); in metaslab_sync()
4406 spa->spa_checkpoint_info.sci_dspace += in metaslab_sync()
4407 zfs_range_tree_space(msp->ms_checkpointing); in metaslab_sync()
4408 vd->vdev_stat.vs_checkpoint_space += in metaslab_sync()
4409 zfs_range_tree_space(msp->ms_checkpointing); in metaslab_sync()
4410 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, in metaslab_sync()
4411 -space_map_allocated(vd->vdev_checkpoint_sm)); in metaslab_sync()
4413 zfs_range_tree_vacate(msp->ms_checkpointing, NULL, NULL); in metaslab_sync()
4416 if (msp->ms_loaded) { in metaslab_sync()
4420 * to bring the space map's histogram up-to-date so we clear in metaslab_sync()
4423 space_map_histogram_clear(msp->ms_sm); in metaslab_sync()
4424 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); in metaslab_sync()
4429 * any deferred space. This allows the on-disk histogram in metaslab_sync()
4433 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); in metaslab_sync()
4437 * added back into the in-core free tree yet. This will in metaslab_sync()
4443 space_map_histogram_add(msp->ms_sm, in metaslab_sync()
4444 msp->ms_defer[t], tx); in metaslab_sync()
4450 * map histogram. We want to make sure that the on-disk histogram in metaslab_sync()
4455 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); in metaslab_sync()
4460 metaslab_class_histogram_verify(mg->mg_class); in metaslab_sync()
4473 zfs_range_tree_swap(&msp->ms_freeing, &msp->ms_freed); in metaslab_sync()
4474 ASSERT0(msp->ms_allocated_this_txg); in metaslab_sync()
4476 zfs_range_tree_vacate(msp->ms_freeing, in metaslab_sync()
4477 zfs_range_tree_add, msp->ms_freed); in metaslab_sync()
4479 msp->ms_allocated_this_txg += zfs_range_tree_space(alloctree); in metaslab_sync()
4482 ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); in metaslab_sync()
4483 ASSERT0(zfs_range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) in metaslab_sync()
4485 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync()
4486 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync()
4488 mutex_exit(&msp->ms_lock); in metaslab_sync()
4495 VERIFY0(dmu_read(mos, vd->vdev_ms_array, in metaslab_sync()
4496 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); in metaslab_sync()
4497 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); in metaslab_sync()
4499 mutex_exit(&msp->ms_sync_lock); in metaslab_sync()
4506 if (!msp->ms_loaded || msp->ms_disabled != 0) in metaslab_evict()
4511 msp->ms_allocating[(txg + t) & TXG_MASK])); in metaslab_evict()
4513 if (msp->ms_allocator != -1) in metaslab_evict()
4514 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); in metaslab_evict()
4527 metaslab_group_t *mg = msp->ms_group; in metaslab_sync_done()
4528 vdev_t *vd = mg->mg_vd; in metaslab_sync_done()
4529 spa_t *spa = vd->vdev_spa; in metaslab_sync_done()
4534 ASSERT(!vd->vdev_ishole); in metaslab_sync_done()
4536 mutex_enter(&msp->ms_lock); in metaslab_sync_done()
4538 if (msp->ms_new) { in metaslab_sync_done()
4540 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); in metaslab_sync_done()
4543 VERIFY0(msp->ms_allocated_this_txg); in metaslab_sync_done()
4544 VERIFY0(zfs_range_tree_space(msp->ms_freed)); in metaslab_sync_done()
4547 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync_done()
4548 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync_done()
4550 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; in metaslab_sync_done()
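/*
 * Background, assuming standard TXG_DEFER_SIZE semantics: frees recorded in
 * a txg rotate through ms_defer[] and only rejoin ms_allocatable after
 * TXG_DEFER_SIZE further syncs, so recently freed space cannot be handed out
 * again while an earlier uberblock may still reference it.
 */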
4552 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - in metaslab_sync_done()
4554 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || in metaslab_sync_done()
4555 vd->vdev_rz_expanding) { in metaslab_sync_done()
4560 alloc_delta = msp->ms_allocated_this_txg - in metaslab_sync_done()
4561 zfs_range_tree_space(msp->ms_freed); in metaslab_sync_done()
4564 defer_delta = zfs_range_tree_space(msp->ms_freed) - in metaslab_sync_done()
4567 defer_delta -= zfs_range_tree_space(*defer_tree); in metaslab_sync_done()
4569 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, in metaslab_sync_done()
4578 * have a consistent view at the in-core side of the metaslab. in metaslab_sync_done()
4586 * When auto-trimming is enabled, free ranges which are added to in metaslab_sync_done()
4595 msp->ms_trim); in metaslab_sync_done()
4597 zfs_range_tree_walk(msp->ms_freed, zfs_range_tree_add, in metaslab_sync_done()
4598 msp->ms_trim); in metaslab_sync_done()
4601 zfs_range_tree_vacate(msp->ms_trim, NULL, NULL); in metaslab_sync_done()
4607 * the defer_tree -- this is safe to do because we've in metaslab_sync_done()
4611 msp->ms_loaded ? zfs_range_tree_add : NULL, msp->ms_allocatable); in metaslab_sync_done()
4613 zfs_range_tree_swap(&msp->ms_freed, defer_tree); in metaslab_sync_done()
4615 zfs_range_tree_vacate(msp->ms_freed, in metaslab_sync_done()
4616 msp->ms_loaded ? zfs_range_tree_add : NULL, in metaslab_sync_done()
4617 msp->ms_allocatable); in metaslab_sync_done()
4620 msp->ms_synced_length = space_map_length(msp->ms_sm); in metaslab_sync_done()
4622 msp->ms_deferspace += defer_delta; in metaslab_sync_done()
4623 ASSERT3S(msp->ms_deferspace, >=, 0); in metaslab_sync_done()
4624 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); in metaslab_sync_done()
4625 if (msp->ms_deferspace != 0) { in metaslab_sync_done()
4634 if (msp->ms_new) { in metaslab_sync_done()
4635 msp->ms_new = B_FALSE; in metaslab_sync_done()
4636 mutex_enter(&mg->mg_lock); in metaslab_sync_done()
4637 mg->mg_ms_ready++; in metaslab_sync_done()
4638 mutex_exit(&mg->mg_lock); in metaslab_sync_done()
4642 * Re-sort metaslab within its group now that we've adjusted in metaslab_sync_done()
4647 ASSERT0(zfs_range_tree_space(msp->ms_allocating[txg & TXG_MASK])); in metaslab_sync_done()
4648 ASSERT0(zfs_range_tree_space(msp->ms_freeing)); in metaslab_sync_done()
4649 ASSERT0(zfs_range_tree_space(msp->ms_freed)); in metaslab_sync_done()
4650 ASSERT0(zfs_range_tree_space(msp->ms_checkpointing)); in metaslab_sync_done()
4651 msp->ms_allocating_total -= msp->ms_allocated_this_txg; in metaslab_sync_done()
4652 msp->ms_allocated_this_txg = 0; in metaslab_sync_done()
4653 mutex_exit(&msp->ms_lock); in metaslab_sync_done()
4659 spa_t *spa = mg->mg_class->mc_spa; in metaslab_sync_reassess()
4662 mg->mg_fragmentation = metaslab_group_fragmentation(mg); in metaslab_sync_reassess()
4672 if (mg->mg_activation_count > 0) { in metaslab_sync_reassess()
4691 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) in metaslab_is_unique()
4694 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; in metaslab_is_unique()
4696 return (msp->ms_id != dva_ms_id); in metaslab_is_unique()
4725 if (zal->zal_size == metaslab_trace_max_entries) { in metaslab_trace_add()
4731 zal->zal_size--; in metaslab_trace_add()
4732 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); in metaslab_trace_add()
4733 list_remove(&zal->zal_list, mat_next); in metaslab_trace_add()
4738 list_link_init(&mat->mat_list_node); in metaslab_trace_add()
4739 mat->mat_mg = mg; in metaslab_trace_add()
4740 mat->mat_msp = msp; in metaslab_trace_add()
4741 mat->mat_size = psize; in metaslab_trace_add()
4742 mat->mat_dva_id = dva_id; in metaslab_trace_add()
4743 mat->mat_offset = offset; in metaslab_trace_add()
4744 mat->mat_weight = 0; in metaslab_trace_add()
4745 mat->mat_allocator = allocator; in metaslab_trace_add()
4748 mat->mat_weight = msp->ms_weight; in metaslab_trace_add()
4754 list_insert_tail(&zal->zal_list, mat); in metaslab_trace_add()
4755 zal->zal_size++; in metaslab_trace_add()
4757 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); in metaslab_trace_add()
4763 ASSERT0(new->zal_size); in metaslab_trace_move()
4764 list_move_tail(&new->zal_list, &old->zal_list); in metaslab_trace_move()
4765 new->zal_size = old->zal_size; in metaslab_trace_move()
4766 list_destroy(&old->zal_list); in metaslab_trace_move()
4772 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), in metaslab_trace_init()
4774 zal->zal_size = 0; in metaslab_trace_init()
4782 while ((mat = list_remove_head(&zal->zal_list)) != NULL) in metaslab_trace_fini()
4784 list_destroy(&zal->zal_list); in metaslab_trace_fini()
4785 zal->zal_size = 0; in metaslab_trace_fini()
4801 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; in metaslab_group_alloc_increment()
4802 if (!mg->mg_class->mc_alloc_throttle_enabled) in metaslab_group_alloc_increment()
4805 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_increment()
4806 (void) zfs_refcount_add_many(&mga->mga_queue_depth, psize, tag); in metaslab_group_alloc_increment()
4814 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[d]); in metaslab_group_alloc_increment_all()
4827 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; in metaslab_group_alloc_decrement()
4828 if (!mg->mg_class->mc_alloc_throttle_enabled) in metaslab_group_alloc_decrement()
4831 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_decrement()
4832 (void) zfs_refcount_remove_many(&mga->mga_queue_depth, psize, tag); in metaslab_group_alloc_decrement()
4840 zfs_range_tree_t *rt = msp->ms_allocatable; in metaslab_block_alloc()
4841 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_block_alloc()
4843 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_block_alloc()
4844 VERIFY(!msp->ms_condensing); in metaslab_block_alloc()
4845 VERIFY0(msp->ms_disabled); in metaslab_block_alloc()
4846 VERIFY0(msp->ms_new); in metaslab_block_alloc()
4848 start = mc->mc_ops->msop_alloc(msp, size, max_size, actual_size); in metaslab_block_alloc()
4849 if (start != -1ULL) { in metaslab_block_alloc()
4851 metaslab_group_t *mg = msp->ms_group; in metaslab_block_alloc()
4852 vdev_t *vd = mg->mg_vd; in metaslab_block_alloc()
4854 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); in metaslab_block_alloc()
4855 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_block_alloc()
4856 VERIFY3U(zfs_range_tree_space(rt) - size, <=, msp->ms_size); in metaslab_block_alloc()
4858 zfs_range_tree_clear(msp->ms_trim, start, size); in metaslab_block_alloc()
4860 if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) in metaslab_block_alloc()
4861 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); in metaslab_block_alloc()
4863 zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, in metaslab_block_alloc()
4865 msp->ms_allocating_total += size; in metaslab_block_alloc()
4868 msp->ms_alloc_txg = txg; in metaslab_block_alloc()
4876 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_block_alloc()
4886 * have selected, we may not try the newly-activated metaslab, and instead
4889 * except for the newly-activated metaslab which we fail to examine).
4898 avl_tree_t *t = &mg->mg_metaslab_tree; in find_valid_metaslab()
4923 if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) in find_valid_metaslab()
4926 *was_active = msp->ms_allocator != -1; in find_valid_metaslab()
4948 search->ms_weight = msp->ms_weight; in find_valid_metaslab()
4949 search->ms_start = msp->ms_start + 1; in find_valid_metaslab()
4950 search->ms_allocator = msp->ms_allocator; in find_valid_metaslab()
4951 search->ms_primary = msp->ms_primary; in find_valid_metaslab()
4959 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_active_mask_verify()
4964 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) in metaslab_active_mask_verify()
4967 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { in metaslab_active_mask_verify()
4968 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_active_mask_verify()
4969 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); in metaslab_active_mask_verify()
4970 VERIFY3S(msp->ms_allocator, !=, -1); in metaslab_active_mask_verify()
4971 VERIFY(msp->ms_primary); in metaslab_active_mask_verify()
4975 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { in metaslab_active_mask_verify()
4976 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_active_mask_verify()
4977 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); in metaslab_active_mask_verify()
4978 VERIFY3S(msp->ms_allocator, !=, -1); in metaslab_active_mask_verify()
4979 VERIFY(!msp->ms_primary); in metaslab_active_mask_verify()
4983 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { in metaslab_active_mask_verify()
4984 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_active_mask_verify()
4985 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_active_mask_verify()
4986 VERIFY3S(msp->ms_allocator, ==, -1); in metaslab_active_mask_verify()
4998 uint64_t offset = -1ULL; in metaslab_group_alloc()
5003 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { in metaslab_group_alloc()
5006 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { in metaslab_group_alloc()
5015 if (allocator >= mg->mg_ms_ready / 3) in metaslab_group_alloc()
5017 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc()
5019 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); in metaslab_group_alloc()
5022 search->ms_weight = UINT64_MAX; in metaslab_group_alloc()
5023 search->ms_start = 0; in metaslab_group_alloc()
5025 * At the end of the metaslab tree are the already-active metaslabs, in metaslab_group_alloc()
5031 search->ms_allocator = -1; in metaslab_group_alloc()
5032 search->ms_primary = B_TRUE; in metaslab_group_alloc()
5036 mutex_enter(&mg->mg_lock); in metaslab_group_alloc()
5039 mga->mga_primary != NULL) { in metaslab_group_alloc()
5040 msp = mga->mga_primary; in metaslab_group_alloc()
5048 ASSERT(msp->ms_primary); in metaslab_group_alloc()
5049 ASSERT3S(msp->ms_allocator, ==, allocator); in metaslab_group_alloc()
5050 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5053 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_group_alloc()
5055 mga->mga_secondary != NULL) { in metaslab_group_alloc()
5056 msp = mga->mga_secondary; in metaslab_group_alloc()
5062 ASSERT(!msp->ms_primary); in metaslab_group_alloc()
5063 ASSERT3S(msp->ms_allocator, ==, allocator); in metaslab_group_alloc()
5064 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5067 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_group_alloc()
5074 mutex_exit(&mg->mg_lock); in metaslab_group_alloc()
5077 mutex_enter(&msp->ms_lock); in metaslab_group_alloc()
5083 * tracepoints in non-gpl kernel modules. in metaslab_group_alloc()
5099 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { in metaslab_group_alloc()
5100 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_group_alloc()
5101 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5111 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && in metaslab_group_alloc()
5112 (msp->ms_allocator != -1) && in metaslab_group_alloc()
5113 (msp->ms_allocator != allocator || ((activation_weight == in metaslab_group_alloc()
5114 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { in metaslab_group_alloc()
5115 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5116 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || in metaslab_group_alloc()
5117 msp->ms_allocator != -1); in metaslab_group_alloc()
5118 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5129 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && in metaslab_group_alloc()
5131 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5132 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_group_alloc()
5133 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc()
5135 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5164 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5167 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5190 if (msp->ms_condensing) { in metaslab_group_alloc()
5194 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc()
5197 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5199 } else if (msp->ms_disabled > 0) { in metaslab_group_alloc()
5203 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc()
5206 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5213 if (offset != -1ULL) { in metaslab_group_alloc()
5219 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5224 ASSERT(msp->ms_loaded); in metaslab_group_alloc()
5228 * tracepoints in non-gpl kernel modules. in metaslab_group_alloc()
5243 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { in metaslab_group_alloc()
5259 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_group_alloc()
5272 mutex_exit(&msp->ms_lock); in metaslab_group_alloc()
5276 if (offset == -1ULL) { in metaslab_group_alloc()
5279 if (asize <= vdev_get_min_alloc(mg->mg_vd)) { in metaslab_group_alloc()
5287 mg->mg_no_free_space = B_TRUE; in metaslab_group_alloc()
5297 metaslab_class_t *mc = mg->mg_class; in metaslab_group_allocatable()
5298 vdev_t *vd = mg->mg_vd; in metaslab_group_allocatable()
5319 if (!GANG_ALLOCATION(flags) && (mg->mg_no_free_space || in metaslab_group_allocatable()
5320 (!mg->mg_allocatable && mc->mc_alloc_groups > 0))) { in metaslab_group_allocatable()
5327 * Avoid writing single-copy data to an unhealthy, in metaslab_group_allocatable()
5328 * non-redundant vdev. in metaslab_group_allocatable()
5330 if (d == 0 && vd->vdev_state < VDEV_STATE_HEALTHY && in metaslab_group_allocatable()
5331 vd->vdev_children == 0) { in metaslab_group_allocatable()
5346 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_alloc_dva_range()
5356 * and a large number of split blocks coupled with ztest-induced in metaslab_alloc_dva_range()
5378 * nothing actually breaks if we miss a few updates -- we just won't in metaslab_alloc_dva_range()
5388 * able to reason about. Otherwise, any two top-level vdev failures in metaslab_alloc_dva_range()
5390 * only two adjacent top-level vdev failures will result in data loss. in metaslab_alloc_dva_range()
5392 * If we are doing gang blocks (hintdva is non-NULL), try to keep in metaslab_alloc_dva_range()
5401 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); in metaslab_alloc_dva_range()
5402 mg = vdev_get_mg(vd, mc)->mg_next; in metaslab_alloc_dva_range()
5404 if (mg == NULL || mg->mg_class != mc || mg->mg_activation_count <= 0) { in metaslab_alloc_dva_range()
5405 ASSERT(mca->mca_rotor != NULL); in metaslab_alloc_dva_range()
5406 mg = mca->mca_rotor; in metaslab_alloc_dva_range()
5412 ASSERT(mg->mg_activation_count == 1); in metaslab_alloc_dva_range()
5413 ASSERT(mg->mg_class == mc); in metaslab_alloc_dva_range()
5419 vd = mg->mg_vd; in metaslab_alloc_dva_range()
5421 ASSERT0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); in metaslab_alloc_dva_range()
5424 ASSERT0(P2PHASE(max_asize, 1ULL << vd->vdev_ashift)); in metaslab_alloc_dva_range()
5429 if (offset != -1ULL) { in metaslab_alloc_dva_range()
5435 DVA_SET_VDEV(&dva[d], vd->vdev_id); in metaslab_alloc_dva_range()
5444 } while ((mg = mg->mg_next) != rotor); in metaslab_alloc_dva_range()
5451 psize <= spa->spa_min_alloc)) { in metaslab_alloc_dva_range()
5480 spa_t *spa = vd->vdev_spa; in metaslab_free_concrete()
5481 int m = offset >> vd->vdev_ms_shift; in metaslab_free_concrete()
5485 VERIFY3U(m, <, vd->vdev_ms_count); in metaslab_free_concrete()
5487 msp = vd->vdev_ms[m]; in metaslab_free_concrete()
5489 VERIFY(!msp->ms_condensing); in metaslab_free_concrete()
5490 VERIFY3U(offset, >=, msp->ms_start); in metaslab_free_concrete()
5491 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); in metaslab_free_concrete()
5492 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_free_concrete()
5493 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); in metaslab_free_concrete()
5497 mutex_enter(&msp->ms_lock); in metaslab_free_concrete()
5498 if (zfs_range_tree_is_empty(msp->ms_freeing) && in metaslab_free_concrete()
5499 zfs_range_tree_is_empty(msp->ms_checkpointing)) { in metaslab_free_concrete()
5505 zfs_range_tree_add(msp->ms_checkpointing, offset, asize); in metaslab_free_concrete()
5507 zfs_range_tree_add(msp->ms_freeing, offset, asize); in metaslab_free_concrete()
5509 mutex_exit(&msp->ms_lock); in metaslab_free_concrete()
5521 if (vd->vdev_ops->vdev_op_remap != NULL) in metaslab_free_impl_cb()
5531 spa_t *spa = vd->vdev_spa; in metaslab_free_impl()
5538 if (spa->spa_vdev_removal != NULL && in metaslab_free_impl()
5539 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && in metaslab_free_impl()
5548 } else if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_free_impl()
5550 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_free_impl()
5570 blkptr_t *bp = rbca->rbca_bp; in remap_blkptr_cb()
5573 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) in remap_blkptr_cb()
5577 if (rbca->rbca_cb != NULL) { in remap_blkptr_cb()
5583 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); in remap_blkptr_cb()
5585 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, in remap_blkptr_cb()
5586 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); in remap_blkptr_cb()
5589 rbca->rbca_remap_vd = vd; in remap_blkptr_cb()
5590 rbca->rbca_remap_offset = offset; in remap_blkptr_cb()
5603 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, in remap_blkptr_cb()
5604 DVA_GET_VDEV(&bp->blk_dva[0])); in remap_blkptr_cb()
5605 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; in remap_blkptr_cb()
5607 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); in remap_blkptr_cb()
5624 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); in remap_blkptr_cb()
5625 DVA_SET_OFFSET(&bp->blk_dva[0], offset); in remap_blkptr_cb()
5684 dva_t *dva = &bp->blk_dva[0]; in spa_remap_blkptr()
5690 if (vd->vdev_ops->vdev_op_remap == NULL) in spa_remap_blkptr()
5706 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); in spa_remap_blkptr()
5709 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) in spa_remap_blkptr()
5734 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { in metaslab_unalloc_dva()
5741 ASSERT(!vd->vdev_removing); in metaslab_unalloc_dva()
5743 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); in metaslab_unalloc_dva()
5744 ASSERT0P(vd->vdev_indirect_mapping); in metaslab_unalloc_dva()
5749 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_unalloc_dva()
5751 mutex_enter(&msp->ms_lock); in metaslab_unalloc_dva()
5752 zfs_range_tree_remove(msp->ms_allocating[txg & TXG_MASK], in metaslab_unalloc_dva()
5754 msp->ms_allocating_total -= size; in metaslab_unalloc_dva()
5756 VERIFY(!msp->ms_condensing); in metaslab_unalloc_dva()
5757 VERIFY3U(offset, >=, msp->ms_start); in metaslab_unalloc_dva()
5758 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); in metaslab_unalloc_dva()
5759 VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) + size, <=, in metaslab_unalloc_dva()
5760 msp->ms_size); in metaslab_unalloc_dva()
5761 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_unalloc_dva()
5762 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_unalloc_dva()
5763 zfs_range_tree_add(msp->ms_allocatable, offset, size); in metaslab_unalloc_dva()
5764 mutex_exit(&msp->ms_lock); in metaslab_unalloc_dva()
5799 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_class_throttle_reserve()
5801 ASSERT(mc->mc_alloc_throttle_enabled); in metaslab_class_throttle_reserve()
5802 if (mc->mc_alloc_io_size < io_size) { in metaslab_class_throttle_reserve()
5803 mc->mc_alloc_io_size = io_size; in metaslab_class_throttle_reserve()
5806 if (must || mca->mca_reserved <= mc->mc_alloc_max) { in metaslab_class_throttle_reserve()
5810  * But even if we assume some other non-existent scenario, the in metaslab_class_throttle_reserve()
5815 *more = (atomic_add_64_nv(&mca->mca_reserved, delta) <= in metaslab_class_throttle_reserve()
5816 mc->mc_alloc_max); in metaslab_class_throttle_reserve()
5827 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_class_throttle_unreserve()
5829 ASSERT(mc->mc_alloc_throttle_enabled); in metaslab_class_throttle_unreserve()
5831 return (atomic_add_64_nv(&mca->mca_reserved, -delta) <= in metaslab_class_throttle_unreserve()
5832 mc->mc_alloc_max); in metaslab_class_throttle_unreserve()
5840 spa_t *spa = vd->vdev_spa; in metaslab_claim_concrete()
5843 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) in metaslab_claim_concrete()
5846 ASSERT3P(vd->vdev_ms, !=, NULL); in metaslab_claim_concrete()
5847 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_claim_concrete()
5849 mutex_enter(&msp->ms_lock); in metaslab_claim_concrete()
5851 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { in metaslab_claim_concrete()
5854 ASSERT(msp->ms_loaded); in metaslab_claim_concrete()
5855 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_claim_concrete()
5861 !zfs_range_tree_contains(msp->ms_allocatable, offset, size)) in metaslab_claim_concrete()
5865 mutex_exit(&msp->ms_lock); in metaslab_claim_concrete()
5869 VERIFY(!msp->ms_condensing); in metaslab_claim_concrete()
5870 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_claim_concrete()
5871 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_claim_concrete()
5872 VERIFY3U(zfs_range_tree_space(msp->ms_allocatable) - size, <=, in metaslab_claim_concrete()
5873 msp->ms_size); in metaslab_claim_concrete()
5874 zfs_range_tree_remove(msp->ms_allocatable, offset, size); in metaslab_claim_concrete()
5875 zfs_range_tree_clear(msp->ms_trim, offset, size); in metaslab_claim_concrete()
5878 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_claim_concrete()
5880 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_claim_concrete()
5881 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_claim_concrete()
5882 msp->ms_selected_txg = txg; in metaslab_claim_concrete()
5887 if (zfs_range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) in metaslab_claim_concrete()
5889 zfs_range_tree_add(msp->ms_allocating[txg & TXG_MASK], in metaslab_claim_concrete()
5891 msp->ms_allocating_total += size; in metaslab_claim_concrete()
5894 mutex_exit(&msp->ms_lock); in metaslab_claim_concrete()
5911 if (mcca_arg->mcca_error == 0) { in metaslab_claim_impl_cb()
5912 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, in metaslab_claim_impl_cb()
5913 size, mcca_arg->mcca_txg); in metaslab_claim_impl_cb()
5920 if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_claim_impl()
5928 ASSERT(!spa_writeable(vd->vdev_spa)); in metaslab_claim_impl()
5932 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_claim_impl()
5986 dva_t *dva = bp->blk_dva; in metaslab_alloc_range()
5987 const dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; in metaslab_alloc_range()
5995 if (mc->mc_allocator[allocator].mca_rotor == NULL) { in metaslab_alloc_range()
6013 for (d--; d >= 0; d--) { in metaslab_alloc_range()
6049 const dva_t *dva = bp->blk_dva; in metaslab_free()
6071 if (BP_GET_BIRTH(bp) <= spa->spa_checkpoint_txg && in metaslab_free()
6072 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { in metaslab_free()
6099 const dva_t *dva = bp->blk_dva; in metaslab_claim()
6135 if (vd->vdev_ops == &vdev_indirect_ops) in metaslab_check_free_impl_cb()
6145 spa_t *spa __maybe_unused = vd->vdev_spa; in metaslab_check_free_impl()
6150 if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_check_free_impl()
6151 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_check_free_impl()
6157 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); in metaslab_check_free_impl()
6160 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_check_free_impl()
6162 mutex_enter(&msp->ms_lock); in metaslab_check_free_impl()
6163 if (msp->ms_loaded) { in metaslab_check_free_impl()
6164 zfs_range_tree_verify_not_present(msp->ms_allocatable, in metaslab_check_free_impl()
6179 zfs_range_tree_verify_not_present(msp->ms_freeing, offset, size); in metaslab_check_free_impl()
6180 zfs_range_tree_verify_not_present(msp->ms_checkpointing, offset, size); in metaslab_check_free_impl()
6181 zfs_range_tree_verify_not_present(msp->ms_freed, offset, size); in metaslab_check_free_impl()
6183 zfs_range_tree_verify_not_present(msp->ms_defer[j], offset, in metaslab_check_free_impl()
6185 zfs_range_tree_verify_not_present(msp->ms_trim, offset, size); in metaslab_check_free_impl()
6186 mutex_exit(&msp->ms_lock); in metaslab_check_free_impl()
6197 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); in metaslab_check_free()
6199 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); in metaslab_check_free()
6200 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); in metaslab_check_free()
6202 if (DVA_GET_GANG(&bp->blk_dva[i])) in metaslab_check_free()
6215 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); in metaslab_group_disable_wait()
6216 while (mg->mg_disabled_updating) { in metaslab_group_disable_wait()
6217 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); in metaslab_group_disable_wait()
6224 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); in metaslab_group_disabled_increment()
6225 ASSERT(mg->mg_disabled_updating); in metaslab_group_disabled_increment()
6227 while (mg->mg_ms_disabled >= max_disabled_ms) { in metaslab_group_disabled_increment()
6228 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); in metaslab_group_disabled_increment()
6230 mg->mg_ms_disabled++; in metaslab_group_disabled_increment()
6231 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); in metaslab_group_disabled_increment()
6243 ASSERT(!MUTEX_HELD(&msp->ms_lock)); in metaslab_disable()
6244 metaslab_group_t *mg = msp->ms_group; in metaslab_disable()
6246 mutex_enter(&mg->mg_ms_disabled_lock); in metaslab_disable()
6258 mg->mg_disabled_updating = B_TRUE; in metaslab_disable()
6259 if (msp->ms_disabled == 0) { in metaslab_disable()
6262 mutex_enter(&msp->ms_lock); in metaslab_disable()
6263 msp->ms_disabled++; in metaslab_disable()
6264 mutex_exit(&msp->ms_lock); in metaslab_disable()
6266 mg->mg_disabled_updating = B_FALSE; in metaslab_disable()
6267 cv_broadcast(&mg->mg_ms_disabled_cv); in metaslab_disable()
6268 mutex_exit(&mg->mg_ms_disabled_lock); in metaslab_disable()
6274 metaslab_group_t *mg = msp->ms_group; in metaslab_enable()
6275 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_enable()
6285 mutex_enter(&mg->mg_ms_disabled_lock); in metaslab_enable()
6286 mutex_enter(&msp->ms_lock); in metaslab_enable()
6287 if (--msp->ms_disabled == 0) { in metaslab_enable()
6288 mg->mg_ms_disabled--; in metaslab_enable()
6289 cv_broadcast(&mg->mg_ms_disabled_cv); in metaslab_enable()
6293 mutex_exit(&msp->ms_lock); in metaslab_enable()
6294 mutex_exit(&mg->mg_ms_disabled_lock); in metaslab_enable()
6300 ms->ms_unflushed_dirty = dirty; in metaslab_set_unflushed_dirty()
6306 vdev_t *vd = ms->ms_group->mg_vd; in metaslab_update_ondisk_flush_data()
6307 spa_t *spa = vd->vdev_spa; in metaslab_update_ondisk_flush_data()
6316 uint64_t entry_offset = ms->ms_id * entry_size; in metaslab_update_ondisk_flush_data()
6319 int err = zap_lookup(mos, vd->vdev_top_zap, in metaslab_update_ondisk_flush_data()
6325 VERIFY0(zap_add(mos, vd->vdev_top_zap, in metaslab_update_ondisk_flush_data()
6339 ms->ms_unflushed_txg = txg; in metaslab_set_unflushed_txg()
6346 return (ms->ms_unflushed_dirty); in metaslab_unflushed_dirty()
6352 return (ms->ms_unflushed_txg); in metaslab_unflushed_txg()
6396 "Enable space-based metaslab group biasing");
6399 "Enable performance-based metaslab group biasing");
6402 ZMOD_RW, "Enable segment-based metaslab selection");
6405 "Segment-based metaslab selection maximum buckets before switching");