Lines Matching +full:touch +full:- +full:hold +full:- +full:ms

9  * or https://opensource.org/licenses/CDDL-1.0.
50 * moving on to the next top-level vdev.
65 * In pools where the log space map feature is not enabled we touch
82 * The in-core space map representation is more compact than its on-disk form.
83 * The zfs_condense_pct determines how much more compact the in-core
84 * space map representation must be before we compact it on-disk.
171 * in a space map to continue allocations in a first-fit fashion.
173 * switch to using best-fit allocations.
181 * high-performance storage.
213 * unloaded sooner. These settings are intended to be generous -- to keep
250 * Enable/disable segment-based metaslab selection.
255 * When using segment-based metaslab selection, we will continue
269 * in a given list when running in non-debug mode. We limit the number
270 * of entries in non-debug mode to prevent us from using up too much memory.
285 * To avoid 64-bit overflow, don't set above UINT32_MAX.
297 * Force the per-metaslab range trees to use 64-bit integers to store
303 * By default we only store segments over a certain size in the size-sorted
314 * gang allocation. If that fails then we will have a multi-layer gang
320 * that fails then we will have a multi-layer gang block.
333 * metaslabs all have free segments in the 32-63K bucket, but the best
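The comment fragments above (file lines 171-181) describe the dynamic-fit allocation policy: keep allocating first-fit from a cursor while the metaslab has plenty of free space, and fall back to best-fit once free space (or the request size) drops below a threshold. Below is a minimal, self-contained sketch of that decision; choose_fit(), the threshold parameters, and the example numbers are illustrative assumptions, not the OpenZFS tunables or implementation.

#include <stdint.h>
#include <stdio.h>

typedef enum { FIRST_FIT, BEST_FIT } fit_policy_t;

/*
 * Decide between first-fit (cheap, cursor-based) and best-fit
 * (size-sorted tree lookup). Thresholds are placeholders.
 */
static fit_policy_t
choose_fit(uint64_t free_space, uint64_t ms_size, uint64_t alloc_size,
    uint64_t alloc_threshold, uint64_t free_pct_threshold)
{
    uint64_t free_pct = free_space * 100 / ms_size;

    /* Large request and plenty of free space: keep allocating first-fit. */
    if (alloc_size >= alloc_threshold && free_pct >= free_pct_threshold)
        return (FIRST_FIT);

    /* Small request or a mostly-full metaslab: switch to best-fit. */
    return (BEST_FIT);
}

int
main(void)
{
    fit_policy_t p = choose_fit(700ULL << 20, 1ULL << 30, 128 << 10,
        128 << 10, 4);
    printf("%s\n", p == FIRST_FIT ? "first-fit" : "best-fit");
    return (0);
}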
384 metaslab_ksp->ks_data = &metaslab_stats; in metaslab_stat_init()
412 mc_allocator[spa->spa_alloc_count]), KM_SLEEP); in metaslab_class_create()
414 mc->mc_spa = spa; in metaslab_class_create()
415 mc->mc_ops = ops; in metaslab_class_create()
416 mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_class_create()
417 multilist_create(&mc->mc_metaslab_txg_list, sizeof (metaslab_t), in metaslab_class_create()
419 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_create()
420 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_create()
421 mca->mca_rotor = NULL; in metaslab_class_create()
422 zfs_refcount_create_tracked(&mca->mca_alloc_slots); in metaslab_class_create()
431 spa_t *spa = mc->mc_spa; in metaslab_class_destroy()
433 ASSERT(mc->mc_alloc == 0); in metaslab_class_destroy()
434 ASSERT(mc->mc_deferred == 0); in metaslab_class_destroy()
435 ASSERT(mc->mc_space == 0); in metaslab_class_destroy()
436 ASSERT(mc->mc_dspace == 0); in metaslab_class_destroy()
438 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_class_destroy()
439 metaslab_class_allocator_t *mca = &mc->mc_allocator[i]; in metaslab_class_destroy()
440 ASSERT(mca->mca_rotor == NULL); in metaslab_class_destroy()
441 zfs_refcount_destroy(&mca->mca_alloc_slots); in metaslab_class_destroy()
443 mutex_destroy(&mc->mc_lock); in metaslab_class_destroy()
444 multilist_destroy(&mc->mc_metaslab_txg_list); in metaslab_class_destroy()
446 mc_allocator[spa->spa_alloc_count])); in metaslab_class_destroy()
456 * Must hold one of the spa_config locks. in metaslab_class_validate()
458 ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || in metaslab_class_validate()
459 spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); in metaslab_class_validate()
461 if ((mg = mc->mc_allocator[0].mca_rotor) == NULL) in metaslab_class_validate()
465 vd = mg->mg_vd; in metaslab_class_validate()
466 ASSERT(vd->vdev_mg != NULL); in metaslab_class_validate()
467 ASSERT3P(vd->vdev_top, ==, vd); in metaslab_class_validate()
468 ASSERT3P(mg->mg_class, ==, mc); in metaslab_class_validate()
469 ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); in metaslab_class_validate()
470 } while ((mg = mg->mg_next) != mc->mc_allocator[0].mca_rotor); in metaslab_class_validate()
479 atomic_add_64(&mc->mc_alloc, alloc_delta); in metaslab_class_space_update()
480 atomic_add_64(&mc->mc_deferred, defer_delta); in metaslab_class_space_update()
481 atomic_add_64(&mc->mc_space, space_delta); in metaslab_class_space_update()
482 atomic_add_64(&mc->mc_dspace, dspace_delta); in metaslab_class_space_update()
488 return (mc->mc_alloc); in metaslab_class_get_alloc()
494 return (mc->mc_deferred); in metaslab_class_get_deferred()
500 return (mc->mc_space); in metaslab_class_get_space()
506 return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); in metaslab_class_get_dspace()
512 spa_t *spa = mc->mc_spa; in metaslab_class_histogram_verify()
513 vdev_t *rvd = spa->spa_root_vdev; in metaslab_class_histogram_verify()
523 mutex_enter(&mc->mc_lock); in metaslab_class_histogram_verify()
524 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_histogram_verify()
525 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_histogram_verify()
529 * Skip any holes, uninitialized top-levels, or in metaslab_class_histogram_verify()
532 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_histogram_verify()
533 mg->mg_class != mc) { in metaslab_class_histogram_verify()
537 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_class_histogram_verify()
538 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_class_histogram_verify()
541 mc_hist[i] += mg->mg_histogram[i]; in metaslab_class_histogram_verify()
545 VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); in metaslab_class_histogram_verify()
548 mutex_exit(&mc->mc_lock); in metaslab_class_histogram_verify()
562 vdev_t *rvd = mc->mc_spa->spa_root_vdev; in metaslab_class_fragmentation()
565 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); in metaslab_class_fragmentation()
567 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_fragmentation()
568 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_fragmentation()
569 metaslab_group_t *mg = tvd->vdev_mg; in metaslab_class_fragmentation()
572 * Skip any holes, uninitialized top-levels, in metaslab_class_fragmentation()
575 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_fragmentation()
576 mg->mg_class != mc) { in metaslab_class_fragmentation()
584 if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { in metaslab_class_fragmentation()
585 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_fragmentation()
593 fragmentation += mg->mg_fragmentation * in metaslab_class_fragmentation()
599 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_fragmentation()
612 vdev_t *rvd = mc->mc_spa->spa_root_vdev; in metaslab_class_expandable_space()
615 spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); in metaslab_class_expandable_space()
616 for (int c = 0; c < rvd->vdev_children; c++) { in metaslab_class_expandable_space()
617 vdev_t *tvd = rvd->vdev_child[c]; in metaslab_class_expandable_space()
618 metaslab_group_t *mg = tvd->vdev_mg; in metaslab_class_expandable_space()
620 if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 || in metaslab_class_expandable_space()
621 mg->mg_class != mc) { in metaslab_class_expandable_space()
630 space += P2ALIGN_TYPED(tvd->vdev_max_asize - tvd->vdev_asize, in metaslab_class_expandable_space()
631 1ULL << tvd->vdev_ms_shift, uint64_t); in metaslab_class_expandable_space()
633 spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); in metaslab_class_expandable_space()
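The metaslab_class_expandable_space() hits above round each top-level vdev's unused capacity (vdev_max_asize - vdev_asize) down to a whole number of metaslabs with P2ALIGN_TYPED. A minimal sketch of that align-down arithmetic, assuming a local align2down() helper as a stand-in for the macro:

#include <stdint.h>
#include <stdio.h>

/* Round x down to a multiple of align, where align is a power of two. */
static uint64_t
align2down(uint64_t x, uint64_t align)
{
    return (x & ~(align - 1));
}

int
main(void)
{
    uint64_t max_asize = 10ULL << 30;   /* 10 GiB physically present */
    uint64_t asize = 8ULL << 30;        /* 8 GiB currently usable */
    uint64_t ms_shift = 29;             /* 512 MiB metaslabs */

    /* Only whole metaslabs count as expandable space. */
    uint64_t expandable = align2down(max_asize - asize, 1ULL << ms_shift);
    printf("expandable bytes: %llu\n", (unsigned long long)expandable);
    return (0);
}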
640 multilist_t *ml = &mc->mc_metaslab_txg_list; in metaslab_class_evict_old()
647 mutex_enter(&msp->ms_lock); in metaslab_class_evict_old()
655 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_class_evict_old()
656 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
657 i--; in metaslab_class_evict_old()
664 msp->ms_selected_txg + metaslab_unload_delay && in metaslab_class_evict_old()
665 now > msp->ms_selected_time + in metaslab_class_evict_old()
667 (msp->ms_allocator == -1 || in metaslab_class_evict_old()
676 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
679 mutex_exit(&msp->ms_lock); in metaslab_class_evict_old()
693 if (m1->ms_allocator != -1 && m1->ms_primary) in metaslab_compare()
695 else if (m1->ms_allocator != -1 && !m1->ms_primary) in metaslab_compare()
697 if (m2->ms_allocator != -1 && m2->ms_primary) in metaslab_compare()
699 else if (m2->ms_allocator != -1 && !m2->ms_primary) in metaslab_compare()
711 return (-1); in metaslab_compare()
715 int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight); in metaslab_compare()
719 IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2); in metaslab_compare()
721 return (TREE_CMP(m1->ms_start, m2->ms_start)); in metaslab_compare()
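The metaslab_compare() hits above show the ordering of a group's metaslab tree: metaslabs activated as a primary sort first, then activated secondaries, then everything else by descending weight, with ms_start as the tie-breaker. A standalone sketch of that three-level key, using a simplified ms_t struct (an assumption for illustration) and TREE_CMP defined locally:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define TREE_CMP(a, b)  (((a) > (b)) - ((a) < (b)))

typedef struct {
    int      ms_allocator;   /* -1 when not activated by an allocator */
    bool     ms_primary;
    uint64_t ms_weight;
    uint64_t ms_start;
} ms_t;

/* Rank: active primary (0), active secondary (1), inactive (2). */
static int
rank(const ms_t *m)
{
    return (m->ms_allocator == -1 ? 2 : (m->ms_primary ? 0 : 1));
}

/* Active metaslabs first, then heavier weight, then lower start offset. */
static int
ms_compare(const ms_t *m1, const ms_t *m2)
{
    int cmp = TREE_CMP(rank(m1), rank(m2));
    if (cmp != 0)
        return (cmp);
    cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);  /* heavier sorts first */
    if (cmp != 0)
        return (cmp);
    return (TREE_CMP(m1->ms_start, m2->ms_start));
}

int
main(void)
{
    ms_t a = { -1, false, 100, 0 };
    ms_t b = { 0, true, 10, 4096 };
    printf("%d\n", ms_compare(&a, &b));  /* 1: the active b sorts first */
    return (0);
}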
734 * transitions from allocatable to non-allocatable or vice versa then the
740 vdev_t *vd = mg->mg_vd; in metaslab_group_alloc_update()
741 metaslab_class_t *mc = mg->mg_class; in metaslab_group_alloc_update()
742 vdev_stat_t *vs = &vd->vdev_stat; in metaslab_group_alloc_update()
746 ASSERT(vd == vd->vdev_top); in metaslab_group_alloc_update()
747 ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, in metaslab_group_alloc_update()
750 mutex_enter(&mg->mg_lock); in metaslab_group_alloc_update()
751 was_allocatable = mg->mg_allocatable; in metaslab_group_alloc_update()
752 was_initialized = mg->mg_initialized; in metaslab_group_alloc_update()
754 mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / in metaslab_group_alloc_update()
755 (vs->vs_space + 1); in metaslab_group_alloc_update()
757 mutex_enter(&mc->mc_lock); in metaslab_group_alloc_update()
763 * for allocations. We also don't consider non-activated in metaslab_group_alloc_update()
767 mg->mg_initialized = metaslab_group_initialized(mg); in metaslab_group_alloc_update()
768 if (!was_initialized && mg->mg_initialized) { in metaslab_group_alloc_update()
769 mc->mc_groups++; in metaslab_group_alloc_update()
770 } else if (was_initialized && !mg->mg_initialized) { in metaslab_group_alloc_update()
771 ASSERT3U(mc->mc_groups, >, 0); in metaslab_group_alloc_update()
772 mc->mc_groups--; in metaslab_group_alloc_update()
774 if (mg->mg_initialized) in metaslab_group_alloc_update()
775 mg->mg_no_free_space = B_FALSE; in metaslab_group_alloc_update()
783 mg->mg_allocatable = (mg->mg_activation_count > 0 && in metaslab_group_alloc_update()
784 mg->mg_free_capacity > zfs_mg_noalloc_threshold && in metaslab_group_alloc_update()
785 (mg->mg_fragmentation == ZFS_FRAG_INVALID || in metaslab_group_alloc_update()
786 mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); in metaslab_group_alloc_update()
796 * When a group transitions from allocatable to non-allocatable or in metaslab_group_alloc_update()
803 if (was_allocatable && !mg->mg_allocatable) in metaslab_group_alloc_update()
804 mc->mc_alloc_groups--; in metaslab_group_alloc_update()
805 else if (!was_allocatable && mg->mg_allocatable) in metaslab_group_alloc_update()
806 mc->mc_alloc_groups++; in metaslab_group_alloc_update()
807 mutex_exit(&mc->mc_lock); in metaslab_group_alloc_update()
809 mutex_exit(&mg->mg_lock); in metaslab_group_alloc_update()
818 int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); in metaslab_sort_by_flushed()
822 uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; in metaslab_sort_by_flushed()
823 uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; in metaslab_sort_by_flushed()
828 return (TREE_CMP(a->ms_id, b->ms_id)); in metaslab_sort_by_flushed()
838 mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_group_create()
839 mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_group_create()
840 cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); in metaslab_group_create()
841 avl_create(&mg->mg_metaslab_tree, metaslab_compare, in metaslab_group_create()
843 mg->mg_vd = vd; in metaslab_group_create()
844 mg->mg_class = mc; in metaslab_group_create()
845 mg->mg_activation_count = 0; in metaslab_group_create()
846 mg->mg_initialized = B_FALSE; in metaslab_group_create()
847 mg->mg_no_free_space = B_TRUE; in metaslab_group_create()
848 mg->mg_allocators = allocators; in metaslab_group_create()
851 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_create()
852 zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); in metaslab_group_create()
861 ASSERT(mg->mg_prev == NULL); in metaslab_group_destroy()
862 ASSERT(mg->mg_next == NULL); in metaslab_group_destroy()
868 ASSERT(mg->mg_activation_count <= 0); in metaslab_group_destroy()
870 avl_destroy(&mg->mg_metaslab_tree); in metaslab_group_destroy()
871 mutex_destroy(&mg->mg_lock); in metaslab_group_destroy()
872 mutex_destroy(&mg->mg_ms_disabled_lock); in metaslab_group_destroy()
873 cv_destroy(&mg->mg_ms_disabled_cv); in metaslab_group_destroy()
875 for (int i = 0; i < mg->mg_allocators; i++) { in metaslab_group_destroy()
876 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_destroy()
877 zfs_refcount_destroy(&mga->mga_alloc_queue_depth); in metaslab_group_destroy()
880 mg_allocator[mg->mg_allocators])); in metaslab_group_destroy()
886 metaslab_class_t *mc = mg->mg_class; in metaslab_group_activate()
887 spa_t *spa = mc->mc_spa; in metaslab_group_activate()
892 ASSERT(mg->mg_prev == NULL); in metaslab_group_activate()
893 ASSERT(mg->mg_next == NULL); in metaslab_group_activate()
894 ASSERT(mg->mg_activation_count <= 0); in metaslab_group_activate()
896 if (++mg->mg_activation_count <= 0) in metaslab_group_activate()
899 mg->mg_aliquot = metaslab_aliquot * MAX(1, in metaslab_group_activate()
900 vdev_get_ndisks(mg->mg_vd) - vdev_get_nparity(mg->mg_vd)); in metaslab_group_activate()
903 if ((mgprev = mc->mc_allocator[0].mca_rotor) == NULL) { in metaslab_group_activate()
904 mg->mg_prev = mg; in metaslab_group_activate()
905 mg->mg_next = mg; in metaslab_group_activate()
907 mgnext = mgprev->mg_next; in metaslab_group_activate()
908 mg->mg_prev = mgprev; in metaslab_group_activate()
909 mg->mg_next = mgnext; in metaslab_group_activate()
910 mgprev->mg_next = mg; in metaslab_group_activate()
911 mgnext->mg_prev = mg; in metaslab_group_activate()
913 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_activate()
914 mc->mc_allocator[i].mca_rotor = mg; in metaslab_group_activate()
915 mg = mg->mg_next; in metaslab_group_activate()
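The metaslab_group_activate() hits above splice the group into the class's circular allocation rotor: an empty rotor makes the group point at itself, otherwise the group is inserted after the current rotor entry. A minimal circular doubly-linked insertion in the same shape; group_t and rotor_insert() are simplified stand-ins, not the OpenZFS types:

#include <stddef.h>
#include <stdio.h>

typedef struct group {
    struct group *prev;
    struct group *next;
    int id;
} group_t;

/* Insert g into the circular list whose current entry is *rotor. */
static void
rotor_insert(group_t **rotor, group_t *g)
{
    group_t *prev = *rotor;

    if (prev == NULL) {
        /* First entry: the ring is just g pointing at itself. */
        g->prev = g->next = g;
        *rotor = g;
        return;
    }
    group_t *next = prev->next;
    g->prev = prev;
    g->next = next;
    prev->next = g;
    next->prev = g;
}

int
main(void)
{
    group_t a = { .id = 1 }, b = { .id = 2 };
    group_t *rotor = NULL;

    rotor_insert(&rotor, &a);
    rotor_insert(&rotor, &b);

    /* Walk once around the ring, the way the rotor is traversed. */
    group_t *g = rotor;
    do {
        printf("group %d\n", g->id);
        g = g->next;
    } while (g != rotor);
    return (0);
}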
921 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
928 metaslab_class_t *mc = mg->mg_class; in metaslab_group_passivate()
929 spa_t *spa = mc->mc_spa; in metaslab_group_passivate()
936 if (--mg->mg_activation_count != 0) { in metaslab_group_passivate()
937 for (int i = 0; i < spa->spa_alloc_count; i++) in metaslab_group_passivate()
938 ASSERT(mc->mc_allocator[i].mca_rotor != mg); in metaslab_group_passivate()
939 ASSERT(mg->mg_prev == NULL); in metaslab_group_passivate()
940 ASSERT(mg->mg_next == NULL); in metaslab_group_passivate()
941 ASSERT(mg->mg_activation_count < 0); in metaslab_group_passivate()
952 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO in metaslab_group_passivate()
956 * we continue to hold the SCL_ALLOC lock, which prevents any future in metaslab_group_passivate()
959 spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); in metaslab_group_passivate()
960 taskq_wait_outstanding(spa->spa_metaslab_taskq, 0); in metaslab_group_passivate()
961 spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); in metaslab_group_passivate()
963 for (int i = 0; i < mg->mg_allocators; i++) { in metaslab_group_passivate()
964 metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; in metaslab_group_passivate()
965 metaslab_t *msp = mga->mga_primary; in metaslab_group_passivate()
967 mutex_enter(&msp->ms_lock); in metaslab_group_passivate()
970 mutex_exit(&msp->ms_lock); in metaslab_group_passivate()
972 msp = mga->mga_secondary; in metaslab_group_passivate()
974 mutex_enter(&msp->ms_lock); in metaslab_group_passivate()
977 mutex_exit(&msp->ms_lock); in metaslab_group_passivate()
981 mgprev = mg->mg_prev; in metaslab_group_passivate()
982 mgnext = mg->mg_next; in metaslab_group_passivate()
987 mgprev->mg_next = mgnext; in metaslab_group_passivate()
988 mgnext->mg_prev = mgprev; in metaslab_group_passivate()
990 for (int i = 0; i < spa->spa_alloc_count; i++) { in metaslab_group_passivate()
991 if (mc->mc_allocator[i].mca_rotor == mg) in metaslab_group_passivate()
992 mc->mc_allocator[i].mca_rotor = mgnext; in metaslab_group_passivate()
995 mg->mg_prev = NULL; in metaslab_group_passivate()
996 mg->mg_next = NULL; in metaslab_group_passivate()
1002 vdev_t *vd = mg->mg_vd; in metaslab_group_initialized()
1003 vdev_stat_t *vs = &vd->vdev_stat; in metaslab_group_initialized()
1005 return (vs->vs_space != 0 && mg->mg_activation_count > 0); in metaslab_group_initialized()
1015 mutex_enter(&mg->mg_lock); in metaslab_group_get_space()
1016 uint64_t ms_count = avl_numnodes(&mg->mg_metaslab_tree); in metaslab_group_get_space()
1017 mutex_exit(&mg->mg_lock); in metaslab_group_get_space()
1018 return ((1ULL << mg->mg_vd->vdev_ms_shift) * ms_count); in metaslab_group_get_space()
1025 avl_tree_t *t = &mg->mg_metaslab_tree; in metaslab_group_histogram_verify()
1026 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_verify()
1037 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_verify()
1040 VERIFY3P(msp->ms_group, ==, mg); in metaslab_group_histogram_verify()
1042 if (msp->ms_sm == NULL) in metaslab_group_histogram_verify()
1047 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_verify()
1052 VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); in metaslab_group_histogram_verify()
1054 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_verify()
1062 metaslab_class_t *mc = mg->mg_class; in metaslab_group_histogram_add()
1063 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_add()
1065 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_histogram_add()
1066 if (msp->ms_sm == NULL) in metaslab_group_histogram_add()
1069 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_add()
1070 mutex_enter(&mc->mc_lock); in metaslab_group_histogram_add()
1072 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_group_histogram_add()
1073 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_group_histogram_add()
1074 mg->mg_histogram[i + ashift] += in metaslab_group_histogram_add()
1075 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_add()
1076 mc->mc_histogram[i + ashift] += in metaslab_group_histogram_add()
1077 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_add()
1079 mutex_exit(&mc->mc_lock); in metaslab_group_histogram_add()
1080 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_add()
1086 metaslab_class_t *mc = mg->mg_class; in metaslab_group_histogram_remove()
1087 uint64_t ashift = mg->mg_vd->vdev_ashift; in metaslab_group_histogram_remove()
1089 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_histogram_remove()
1090 if (msp->ms_sm == NULL) in metaslab_group_histogram_remove()
1093 mutex_enter(&mg->mg_lock); in metaslab_group_histogram_remove()
1094 mutex_enter(&mc->mc_lock); in metaslab_group_histogram_remove()
1096 ASSERT3U(mg->mg_histogram[i + ashift], >=, in metaslab_group_histogram_remove()
1097 msp->ms_sm->sm_phys->smp_histogram[i]); in metaslab_group_histogram_remove()
1098 ASSERT3U(mc->mc_histogram[i + ashift], >=, in metaslab_group_histogram_remove()
1099 msp->ms_sm->sm_phys->smp_histogram[i]); in metaslab_group_histogram_remove()
1100 IMPLY(mg == mg->mg_vd->vdev_log_mg, in metaslab_group_histogram_remove()
1101 mc == spa_embedded_log_class(mg->mg_vd->vdev_spa)); in metaslab_group_histogram_remove()
1103 mg->mg_histogram[i + ashift] -= in metaslab_group_histogram_remove()
1104 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_remove()
1105 mc->mc_histogram[i + ashift] -= in metaslab_group_histogram_remove()
1106 msp->ms_sm->sm_phys->smp_histogram[i]; in metaslab_group_histogram_remove()
1108 mutex_exit(&mc->mc_lock); in metaslab_group_histogram_remove()
1109 mutex_exit(&mg->mg_lock); in metaslab_group_histogram_remove()
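The metaslab_group_histogram_add()/remove() hits above fold each metaslab's space-map histogram into the group and class histograms, offsetting bucket i by the vdev's ashift so the group histogram is indexed by absolute power-of-two segment size. A small sketch of that bookkeeping; SM_HIST, RT_HIST, and the example counts are illustrative stand-ins for the real sizes:

#include <stdint.h>
#include <stdio.h>

#define SM_HIST 32   /* per-metaslab space-map histogram buckets */
#define RT_HIST 64   /* group/class histogram buckets (absolute log2 size) */

/*
 * Space-map bucket i counts segments of ~2^(i + shift) bytes, so it
 * lands at absolute index i + shift in the group-wide histogram.
 */
static void
histogram_add(uint64_t *group, const uint64_t *sm, int shift)
{
    for (int i = 0; i < SM_HIST; i++)
        group[i + shift] += sm[i];
}

static void
histogram_remove(uint64_t *group, const uint64_t *sm, int shift)
{
    for (int i = 0; i < SM_HIST; i++)
        group[i + shift] -= sm[i];
}

int
main(void)
{
    uint64_t group[RT_HIST] = { 0 };
    uint64_t sm[SM_HIST] = { 0 };
    int ashift = 12;   /* 4 KiB sectors */

    sm[3] = 7;         /* seven segments of ~32 KiB (2^(3+12)) */
    histogram_add(group, sm, ashift);
    printf("group[15] = %llu\n", (unsigned long long)group[15]);
    histogram_remove(group, sm, ashift);
    printf("group[15] = %llu\n", (unsigned long long)group[15]);
    return (0);
}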
1115 ASSERT(msp->ms_group == NULL); in metaslab_group_add()
1116 mutex_enter(&mg->mg_lock); in metaslab_group_add()
1117 msp->ms_group = mg; in metaslab_group_add()
1118 msp->ms_weight = 0; in metaslab_group_add()
1119 avl_add(&mg->mg_metaslab_tree, msp); in metaslab_group_add()
1120 mutex_exit(&mg->mg_lock); in metaslab_group_add()
1122 mutex_enter(&msp->ms_lock); in metaslab_group_add()
1124 mutex_exit(&msp->ms_lock); in metaslab_group_add()
1130 mutex_enter(&msp->ms_lock); in metaslab_group_remove()
1132 mutex_exit(&msp->ms_lock); in metaslab_group_remove()
1134 mutex_enter(&mg->mg_lock); in metaslab_group_remove()
1135 ASSERT(msp->ms_group == mg); in metaslab_group_remove()
1136 avl_remove(&mg->mg_metaslab_tree, msp); in metaslab_group_remove()
1138 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_group_remove()
1140 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_group_remove()
1141 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_group_remove()
1145 msp->ms_group = NULL; in metaslab_group_remove()
1146 mutex_exit(&mg->mg_lock); in metaslab_group_remove()
1152 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_sort_impl()
1153 ASSERT(MUTEX_HELD(&mg->mg_lock)); in metaslab_group_sort_impl()
1154 ASSERT(msp->ms_group == mg); in metaslab_group_sort_impl()
1156 avl_remove(&mg->mg_metaslab_tree, msp); in metaslab_group_sort_impl()
1157 msp->ms_weight = weight; in metaslab_group_sort_impl()
1158 avl_add(&mg->mg_metaslab_tree, msp); in metaslab_group_sort_impl()
1170 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_group_sort()
1172 mutex_enter(&mg->mg_lock); in metaslab_group_sort()
1174 mutex_exit(&mg->mg_lock); in metaslab_group_sort()
1187 vdev_t *vd = mg->mg_vd; in metaslab_group_fragmentation()
1191 for (int m = 0; m < vd->vdev_ms_count; m++) { in metaslab_group_fragmentation()
1192 metaslab_t *msp = vd->vdev_ms[m]; in metaslab_group_fragmentation()
1194 if (msp->ms_fragmentation == ZFS_FRAG_INVALID) in metaslab_group_fragmentation()
1196 if (msp->ms_group != mg) in metaslab_group_fragmentation()
1200 fragmentation += msp->ms_fragmentation; in metaslab_group_fragmentation()
1203 if (valid_ms <= mg->mg_vd->vdev_ms_count / 2) in metaslab_group_fragmentation()
1225 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_group_allocatable()
1226 metaslab_class_t *mc = mg->mg_class; in metaslab_group_allocatable()
1237 mc->mc_groups <= 1) in metaslab_group_allocatable()
1252 if (mg->mg_allocatable) { in metaslab_group_allocatable()
1253 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_allocatable()
1255 uint64_t qmax = mga->mga_cur_max_alloc_queue_depth; in metaslab_group_allocatable()
1257 if (!mc->mc_alloc_throttle_enabled) in metaslab_group_allocatable()
1264 if (mg->mg_no_free_space) in metaslab_group_allocatable()
1283 qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth); in metaslab_group_allocatable()
1290 if (qdepth < qmax || mc->mc_alloc_groups == 1) in metaslab_group_allocatable()
1292 ASSERT3U(mc->mc_alloc_groups, >, 1); in metaslab_group_allocatable()
1298 * racy since we can't hold the locks for all metaslab in metaslab_group_allocatable()
1301 for (metaslab_group_t *mgp = mg->mg_next; in metaslab_group_allocatable()
1302 mgp != rotor; mgp = mgp->mg_next) { in metaslab_group_allocatable()
1304 &mgp->mg_allocator[allocator]; in metaslab_group_allocatable()
1305 qmax = mgap->mga_cur_max_alloc_queue_depth; in metaslab_group_allocatable()
1308 zfs_refcount_count(&mgap->mga_alloc_queue_depth); in metaslab_group_allocatable()
1315 if (qdepth < qmax && !mgp->mg_no_free_space) in metaslab_group_allocatable()
1326 } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { in metaslab_group_allocatable()
1339 * Comparison function for the private size-ordered tree using 32-bit
1349 uint64_t rs_size1 = r1->rs_end - r1->rs_start; in metaslab_rangesize32_compare()
1350 uint64_t rs_size2 = r2->rs_end - r2->rs_start; in metaslab_rangesize32_compare()
1354 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); in metaslab_rangesize32_compare()
1358 * Comparison function for the private size-ordered tree using 64-bit
1368 uint64_t rs_size1 = r1->rs_end - r1->rs_start; in metaslab_rangesize64_compare()
1369 uint64_t rs_size2 = r2->rs_end - r2->rs_start; in metaslab_rangesize64_compare()
1373 return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); in metaslab_rangesize64_compare()
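Both range-size comparators above end with "cmp + !cmp * TREE_CMP(rs_start, ...)": when the sizes differ, cmp (±1) is returned as-is; when they are equal, cmp is 0 and !cmp is 1, so the start-offset comparison decides. A tiny standalone demonstration of the idiom, with a simplified seg_t assumed for illustration:

#include <stdint.h>
#include <stdio.h>

#define TREE_CMP(a, b)  (((a) > (b)) - ((a) < (b)))

typedef struct {
    uint64_t start;
    uint64_t end;
} seg_t;

/* Order by segment size, then by start offset, without a branch. */
static int
seg_size_compare(const seg_t *a, const seg_t *b)
{
    uint64_t sa = a->end - a->start;
    uint64_t sb = b->end - b->start;
    int cmp = TREE_CMP(sa, sb);

    return (cmp + !cmp * TREE_CMP(a->start, b->start));
}

int
main(void)
{
    seg_t x = { 0, 4096 }, y = { 8192, 12288 };  /* equal sizes */
    printf("%d\n", seg_size_compare(&x, &y));    /* -1: x starts first */
    return (0);
}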
1390 range_tree_t *rt = mssap->rt; in metaslab_size_sorted_add()
1391 metaslab_rt_arg_t *mrap = mssap->mra; in metaslab_size_sorted_add()
1401 metaslab_rt_arg_t *mrap = rt->rt_arg; in metaslab_size_tree_full_load()
1403 ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); in metaslab_size_tree_full_load()
1404 mrap->mra_floor_shift = 0; in metaslab_size_tree_full_load()
1420 * rely on using both a size-ordered range_tree_t and an array of uint64_t's. in ZFS_BTREE_FIND_IN_BUF_FUNC()
1426 zfs_btree_t *size_tree = mrap->mra_bt; in ZFS_BTREE_FIND_IN_BUF_FUNC()
1431 switch (rt->rt_type) { in ZFS_BTREE_FIND_IN_BUF_FUNC()
1443 panic("Invalid range seg type %d", rt->rt_type); in ZFS_BTREE_FIND_IN_BUF_FUNC()
1446 mrap->mra_floor_shift = metaslab_by_size_min_shift; in ZFS_BTREE_FIND_IN_BUF_FUNC()
1454 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_destroy()
1464 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_add()
1466 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < in metaslab_rt_add()
1467 (1ULL << mrap->mra_floor_shift)) in metaslab_rt_add()
1477 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_remove()
1479 if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1ULL << in metaslab_rt_remove()
1480 mrap->mra_floor_shift)) in metaslab_rt_remove()
1490 zfs_btree_t *size_tree = mrap->mra_bt; in metaslab_rt_vacate()
1517 zfs_btree_t *t = &msp->ms_allocatable_by_size; in metaslab_largest_allocatable()
1523 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_largest_allocatable()
1529 return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, in metaslab_largest_allocatable()
1530 msp->ms_allocatable)); in metaslab_largest_allocatable()
1540 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_largest_unflushed_free()
1542 if (msp->ms_unflushed_frees == NULL) in metaslab_largest_unflushed_free()
1545 if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) in metaslab_largest_unflushed_free()
1546 metaslab_size_tree_full_load(msp->ms_unflushed_frees); in metaslab_largest_unflushed_free()
1547 range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, in metaslab_largest_unflushed_free()
1558 * bound for the largest currently-usable free segment in the in metaslab_largest_unflushed_free()
1563 * briefly and should eventually self-correct as frees are no longer in metaslab_largest_unflushed_free()
1575 uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); in metaslab_largest_unflushed_free()
1576 uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; in metaslab_largest_unflushed_free()
1580 boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, in metaslab_largest_unflushed_free()
1585 rsize = start - rstart; in metaslab_largest_unflushed_free()
1591 boolean_t found = range_tree_find_in(msp->ms_freed, rstart, in metaslab_largest_unflushed_free()
1594 rsize = start - rstart; in metaslab_largest_unflushed_free()
1619 * suitable block to allocate. This will search the specified B-tree looking
1627 *cursor = rt->rt_start; in metaslab_block_picker()
1628 zfs_btree_t *bt = &rt->rt_root; in metaslab_block_picker()
1637 while (rs != NULL && (rs_get_start(rs, rt) - first_found <= in metaslab_block_picker()
1649 return (-1ULL); in metaslab_block_picker()
1660 { "new-dynamic", metaslab_ndf_alloc },
1666 int a = ARRAY_SIZE(metaslab_allocators) - 1; in spa_find_allocator_byname()
1667 if (strcmp("new-dynamic", val) == 0) in spa_find_allocator_byname()
1668 return (-1); /* remove when ndf is working */ in spa_find_allocator_byname()
1669 for (; a >= 0; a--) { in spa_find_allocator_byname()
1673 return (-1); in spa_find_allocator_byname()
1681 spa->spa_active_allocator = a; in spa_set_allocator()
1688 return (spa->spa_active_allocator); in spa_get_allocator()
1749 uint64_t align = size & -size; in metaslab_df_alloc()
1750 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1]; in metaslab_df_alloc()
1751 range_tree_t *rt = msp->ms_allocatable; in metaslab_df_alloc()
1752 uint_t free_pct = range_tree_space(rt) * 100 / msp->ms_size; in metaslab_df_alloc()
1755 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_df_alloc()
1763 offset = -1; in metaslab_df_alloc()
1769 if (offset == -1) { in metaslab_df_alloc()
1771 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0) in metaslab_df_alloc()
1772 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_df_alloc()
1776 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL); in metaslab_df_alloc()
1780 rs = metaslab_block_find(&msp->ms_allocatable_by_size, in metaslab_df_alloc()
1781 rt, msp->ms_start, size, &where); in metaslab_df_alloc()
1795 * Cursor fit block allocator -
1805 range_tree_t *rt = msp->ms_allocatable; in metaslab_cf_alloc()
1806 zfs_btree_t *t = &msp->ms_allocatable_by_size; in metaslab_cf_alloc()
1807 uint64_t *cursor = &msp->ms_lbas[0]; in metaslab_cf_alloc()
1808 uint64_t *cursor_end = &msp->ms_lbas[1]; in metaslab_cf_alloc()
1811 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_cf_alloc()
1819 metaslab_size_tree_full_load(msp->ms_allocatable); in metaslab_cf_alloc()
1821 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < in metaslab_cf_alloc()
1823 return (-1ULL); in metaslab_cf_alloc()
1837 * New dynamic fit allocator -
1853 zfs_btree_t *t = &msp->ms_allocatable->rt_root; in metaslab_ndf_alloc()
1854 range_tree_t *rt = msp->ms_allocatable; in metaslab_ndf_alloc()
1859 uint64_t *cursor = &msp->ms_lbas[hbit - 1]; in metaslab_ndf_alloc()
1862 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_ndf_alloc()
1865 return (-1ULL); in metaslab_ndf_alloc()
1871 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { in metaslab_ndf_alloc()
1872 t = &msp->ms_allocatable_by_size; in metaslab_ndf_alloc()
1884 if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { in metaslab_ndf_alloc()
1888 return (-1ULL); in metaslab_ndf_alloc()
1898 * Wait for any in-progress metaslab loads to complete.
1903 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load_wait()
1905 while (msp->ms_loading) { in metaslab_load_wait()
1906 ASSERT(!msp->ms_loaded); in metaslab_load_wait()
1907 cv_wait(&msp->ms_load_cv, &msp->ms_lock); in metaslab_load_wait()
1912 * Wait for any in-progress flushing to complete.
1917 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush_wait()
1919 while (msp->ms_flushing) in metaslab_flush_wait()
1920 cv_wait(&msp->ms_flush_cv, &msp->ms_lock); in metaslab_flush_wait()
1932 return ((unsigned int)msp->ms_id % multilist_get_num_sublists(ml)); in metaslab_idx_func()
1938 return (msp->ms_allocated_space); in metaslab_allocated_space()
1942 * Verify that the space accounting on disk matches the in-core range_trees.
1947 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_verify_space()
1951 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_verify_space()
1952 ASSERT(!msp->ms_condensing); in metaslab_verify_space()
1960 * allocated space map. Calling this in non-syncing context in metaslab_verify_space()
1964 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || in metaslab_verify_space()
1965 !msp->ms_loaded) in metaslab_verify_space()
1973 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); in metaslab_verify_space()
1975 ASSERT3U(space_map_allocated(msp->ms_sm), >=, in metaslab_verify_space()
1976 range_tree_space(msp->ms_unflushed_frees)); in metaslab_verify_space()
1979 space_map_allocated(msp->ms_sm) + in metaslab_verify_space()
1980 range_tree_space(msp->ms_unflushed_allocs) - in metaslab_verify_space()
1981 range_tree_space(msp->ms_unflushed_frees)); in metaslab_verify_space()
1983 sm_free_space = msp->ms_size - metaslab_allocated_space(msp); in metaslab_verify_space()
1991 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); in metaslab_verify_space()
1993 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, in metaslab_verify_space()
1994 msp->ms_allocating_total); in metaslab_verify_space()
1996 ASSERT3U(msp->ms_deferspace, ==, in metaslab_verify_space()
1997 range_tree_space(msp->ms_defer[0]) + in metaslab_verify_space()
1998 range_tree_space(msp->ms_defer[1])); in metaslab_verify_space()
2000 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + in metaslab_verify_space()
2001 msp->ms_deferspace + range_tree_space(msp->ms_freed); in metaslab_verify_space()
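The metaslab_verify_space() hits above cross-check two views of free space: the space-map view (ms_size minus allocated, where allocated is the on-disk allocation plus unflushed allocs minus unflushed frees) against the in-core range-tree view (allocatable + still-allocating in open txgs + deferred + freed this txg). A minimal arithmetic sketch of that identity; all numbers are made up for illustration:

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

int
main(void)
{
    /* Illustrative numbers only; units are bytes. */
    uint64_t ms_size = 1ULL << 30;        /* 1 GiB metaslab */
    uint64_t sm_allocated = 600 << 20;    /* per the space map */
    uint64_t unflushed_allocs = 50 << 20;
    uint64_t unflushed_frees = 20 << 20;

    uint64_t allocated = sm_allocated + unflushed_allocs - unflushed_frees;
    uint64_t sm_free = ms_size - allocated;

    /* The same free space as tracked by the in-core range trees. */
    uint64_t allocatable = 300 << 20;
    uint64_t allocating = 30 << 20;   /* open txgs */
    uint64_t deferspace = 24 << 20;   /* ms_defer[0..1] */
    uint64_t freed = 40 << 20;        /* freed this txg */
    uint64_t rt_free = allocatable + allocating + deferspace + freed;

    assert(sm_free == rt_free);
    printf("free space agrees: %llu bytes\n", (unsigned long long)sm_free);
    return (0);
}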
2013 ASSERT(msp->ms_loaded); in metaslab_aux_histograms_clear()
2015 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); in metaslab_aux_histograms_clear()
2017 memset(msp->ms_deferhist[t], 0, sizeof (msp->ms_deferhist[t])); in metaslab_aux_histograms_clear()
2034 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); in metaslab_aux_histogram_add()
2036 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { in metaslab_aux_histogram_add()
2056 space_map_t *sm = msp->ms_sm; in metaslab_aux_histograms_update()
2065 if (msp->ms_loaded) { in metaslab_aux_histograms_update()
2068 metaslab_aux_histogram_add(msp->ms_synchist, in metaslab_aux_histograms_update()
2069 sm->sm_shift, msp->ms_freed); in metaslab_aux_histograms_update()
2072 metaslab_aux_histogram_add(msp->ms_deferhist[t], in metaslab_aux_histograms_update()
2073 sm->sm_shift, msp->ms_defer[t]); in metaslab_aux_histograms_update()
2077 metaslab_aux_histogram_add(msp->ms_synchist, in metaslab_aux_histograms_update()
2078 sm->sm_shift, msp->ms_freeing); in metaslab_aux_histograms_update()
2089 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_aux_histograms_update_done()
2090 space_map_t *sm = msp->ms_sm; in metaslab_aux_histograms_update_done()
2107 memcpy(msp->ms_deferhist[hist_index], msp->ms_synchist, in metaslab_aux_histograms_update_done()
2108 sizeof (msp->ms_synchist)); in metaslab_aux_histograms_update_done()
2110 memset(msp->ms_deferhist[hist_index], 0, in metaslab_aux_histograms_update_done()
2111 sizeof (msp->ms_deferhist[hist_index])); in metaslab_aux_histograms_update_done()
2113 memset(msp->ms_synchist, 0, sizeof (msp->ms_synchist)); in metaslab_aux_histograms_update_done()
2124 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_verify_weight_and_frag()
2131 * cannot do these assertions because we hold spa config locks and in metaslab_verify_weight_and_frag()
2138 if (msp->ms_group == NULL) in metaslab_verify_weight_and_frag()
2143 * fragmentation and ms_max_size as is - there is nothing for in metaslab_verify_weight_and_frag()
2146 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_verify_weight_and_frag()
2147 if (vd->vdev_removing) in metaslab_verify_weight_and_frag()
2156 if (txg_list_member(&vd->vdev_ms_list, msp, t)) in metaslab_verify_weight_and_frag()
2161 * This verification checks that our in-memory state is consistent in metaslab_verify_weight_and_frag()
2162 * with what's on disk. If the pool is read-only then there aren't in metaslab_verify_weight_and_frag()
2163 * any changes and we just have the initially-loaded state. in metaslab_verify_weight_and_frag()
2165 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) in metaslab_verify_weight_and_frag()
2168 /* some extra verification for in-core tree if you can */ in metaslab_verify_weight_and_frag()
2169 if (msp->ms_loaded) { in metaslab_verify_weight_and_frag()
2170 range_tree_stat_verify(msp->ms_allocatable); in metaslab_verify_weight_and_frag()
2171 VERIFY(space_map_histogram_verify(msp->ms_sm, in metaslab_verify_weight_and_frag()
2172 msp->ms_allocatable)); in metaslab_verify_weight_and_frag()
2175 uint64_t weight = msp->ms_weight; in metaslab_verify_weight_and_frag()
2176 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_verify_weight_and_frag()
2177 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); in metaslab_verify_weight_and_frag()
2178 uint64_t frag = msp->ms_fragmentation; in metaslab_verify_weight_and_frag()
2179 uint64_t max_segsize = msp->ms_max_size; in metaslab_verify_weight_and_frag()
2181 msp->ms_weight = 0; in metaslab_verify_weight_and_frag()
2182 msp->ms_fragmentation = 0; in metaslab_verify_weight_and_frag()
2186 * not introduce any side-effects/mutations on the system's state. in metaslab_verify_weight_and_frag()
2197 msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; in metaslab_verify_weight_and_frag()
2199 VERIFY3U(max_segsize, ==, msp->ms_max_size); in metaslab_verify_weight_and_frag()
2205 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || in metaslab_verify_weight_and_frag()
2206 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { in metaslab_verify_weight_and_frag()
2207 msp->ms_fragmentation = frag; in metaslab_verify_weight_and_frag()
2208 msp->ms_weight = weight; in metaslab_verify_weight_and_frag()
2212 VERIFY3U(msp->ms_fragmentation, ==, frag); in metaslab_verify_weight_and_frag()
2213 VERIFY3U(msp->ms_weight, ==, weight); in metaslab_verify_weight_and_frag()
2233 tries < multilist_get_num_sublists(&mc->mc_metaslab_txg_list) * 2; in metaslab_potentially_evict()
2236 &mc->mc_metaslab_txg_list); in metaslab_potentially_evict()
2238 multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx); in metaslab_potentially_evict()
2244 &mc->mc_metaslab_txg_list, idx)); in metaslab_potentially_evict()
2246 metaslab_idx_func(&mc->mc_metaslab_txg_list, msp)); in metaslab_potentially_evict()
2248 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_potentially_evict()
2265 if (msp->ms_loading) { in metaslab_potentially_evict()
2279 * currently active because they are high-weight in metaslab_potentially_evict()
2283 mutex_enter(&msp->ms_lock); in metaslab_potentially_evict()
2284 if (msp->ms_allocator == -1 && msp->ms_sm != NULL && in metaslab_potentially_evict()
2285 msp->ms_allocating_total == 0) { in metaslab_potentially_evict()
2288 mutex_exit(&msp->ms_lock); in metaslab_potentially_evict()
2303 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load_impl()
2304 ASSERT(msp->ms_loading); in metaslab_load_impl()
2305 ASSERT(!msp->ms_condensing); in metaslab_load_impl()
2324 * metaslab_sync_done() would try to re-add later. in metaslab_load_impl()
2331 uint64_t length = msp->ms_synced_length; in metaslab_load_impl()
2332 mutex_exit(&msp->ms_lock); in metaslab_load_impl()
2336 if (msp->ms_allocatable->rt_arg == NULL) { in metaslab_load_impl()
2339 mrap = msp->ms_allocatable->rt_arg; in metaslab_load_impl()
2340 msp->ms_allocatable->rt_ops = NULL; in metaslab_load_impl()
2341 msp->ms_allocatable->rt_arg = NULL; in metaslab_load_impl()
2343 mrap->mra_bt = &msp->ms_allocatable_by_size; in metaslab_load_impl()
2344 mrap->mra_floor_shift = metaslab_by_size_min_shift; in metaslab_load_impl()
2346 if (msp->ms_sm != NULL) { in metaslab_load_impl()
2347 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, in metaslab_load_impl()
2350 /* Now, populate the size-sorted tree. */ in metaslab_load_impl()
2351 metaslab_rt_create(msp->ms_allocatable, mrap); in metaslab_load_impl()
2352 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; in metaslab_load_impl()
2353 msp->ms_allocatable->rt_arg = mrap; in metaslab_load_impl()
2356 arg.rt = msp->ms_allocatable; in metaslab_load_impl()
2358 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add, in metaslab_load_impl()
2362 * Add the size-sorted tree first, since we don't need to load in metaslab_load_impl()
2365 metaslab_rt_create(msp->ms_allocatable, mrap); in metaslab_load_impl()
2366 msp->ms_allocatable->rt_ops = &metaslab_rt_ops; in metaslab_load_impl()
2367 msp->ms_allocatable->rt_arg = mrap; in metaslab_load_impl()
2373 range_tree_add(msp->ms_allocatable, in metaslab_load_impl()
2374 msp->ms_start, msp->ms_size); in metaslab_load_impl()
2376 if (msp->ms_new) { in metaslab_load_impl()
2384 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_load_impl()
2385 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_load_impl()
2394 * hold the ms_lock while writing the ms_checkpointing tree to disk. in metaslab_load_impl()
2396 mutex_enter(&msp->ms_sync_lock); in metaslab_load_impl()
2397 mutex_enter(&msp->ms_lock); in metaslab_load_impl()
2399 ASSERT(!msp->ms_condensing); in metaslab_load_impl()
2400 ASSERT(!msp->ms_flushing); in metaslab_load_impl()
2403 mutex_exit(&msp->ms_sync_lock); in metaslab_load_impl()
2407 ASSERT3P(msp->ms_group, !=, NULL); in metaslab_load_impl()
2408 msp->ms_loaded = B_TRUE; in metaslab_load_impl()
2415 range_tree_walk(msp->ms_unflushed_allocs, in metaslab_load_impl()
2416 range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2417 range_tree_walk(msp->ms_unflushed_frees, in metaslab_load_impl()
2418 range_tree_add, msp->ms_allocatable); in metaslab_load_impl()
2420 ASSERT3P(msp->ms_group, !=, NULL); in metaslab_load_impl()
2421 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_load_impl()
2446 range_tree_walk(msp->ms_freed, in metaslab_load_impl()
2447 range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2465 range_tree_walk(msp->ms_defer[t], in metaslab_load_impl()
2466 range_tree_remove, msp->ms_allocatable); in metaslab_load_impl()
2474 * has not yet been converted to use segment-based weight, we in metaslab_load_impl()
2481 uint64_t weight = msp->ms_weight; in metaslab_load_impl()
2482 uint64_t max_size = msp->ms_max_size; in metaslab_load_impl()
2485 ASSERT3U(weight, <=, msp->ms_weight); in metaslab_load_impl()
2486 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_load_impl()
2487 ASSERT3U(max_size, <=, msp->ms_max_size); in metaslab_load_impl()
2489 msp->ms_load_time = load_end; in metaslab_load_impl()
2493 "freed %llu, defer %llu + %llu, unloaded time %llu ms, " in metaslab_load_impl()
2494 "loading_time %lld ms, ms_max_size %llu, " in metaslab_load_impl()
2498 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_load_impl()
2499 (u_longlong_t)msp->ms_id, in metaslab_load_impl()
2500 (u_longlong_t)space_map_length(msp->ms_sm), in metaslab_load_impl()
2501 (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), in metaslab_load_impl()
2502 (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), in metaslab_load_impl()
2503 (u_longlong_t)range_tree_space(msp->ms_freed), in metaslab_load_impl()
2504 (u_longlong_t)range_tree_space(msp->ms_defer[0]), in metaslab_load_impl()
2505 (u_longlong_t)range_tree_space(msp->ms_defer[1]), in metaslab_load_impl()
2506 (longlong_t)((load_start - msp->ms_unload_time) / 1000000), in metaslab_load_impl()
2507 (longlong_t)((load_end - load_start) / 1000000), in metaslab_load_impl()
2508 (u_longlong_t)msp->ms_max_size, in metaslab_load_impl()
2509 (u_longlong_t)msp->ms_max_size - max_size, in metaslab_load_impl()
2510 (u_longlong_t)weight, (u_longlong_t)msp->ms_weight); in metaslab_load_impl()
2513 mutex_exit(&msp->ms_sync_lock); in metaslab_load_impl()
2520 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load()
2527 if (msp->ms_loaded) in metaslab_load()
2529 VERIFY(!msp->ms_loading); in metaslab_load()
2530 ASSERT(!msp->ms_condensing); in metaslab_load()
2538 msp->ms_loading = B_TRUE; in metaslab_load()
2541 * Wait for any in-progress flushing to finish as we drop the ms_lock in metaslab_load()
2545 if (msp->ms_flushing) in metaslab_load()
2553 ASSERT(!msp->ms_loaded); in metaslab_load()
2560 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) == in metaslab_load()
2561 msp->ms_group->mg_class) { in metaslab_load()
2562 metaslab_potentially_evict(msp->ms_group->mg_class); in metaslab_load()
2567 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_load()
2568 msp->ms_loading = B_FALSE; in metaslab_load()
2569 cv_broadcast(&msp->ms_load_cv); in metaslab_load()
2577 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_unload()
2584 if (!msp->ms_loaded) in metaslab_unload()
2587 range_tree_vacate(msp->ms_allocatable, NULL, NULL); in metaslab_unload()
2588 msp->ms_loaded = B_FALSE; in metaslab_unload()
2589 msp->ms_unload_time = gethrtime(); in metaslab_unload()
2591 msp->ms_activation_weight = 0; in metaslab_unload()
2592 msp->ms_weight &= ~METASLAB_ACTIVE_MASK; in metaslab_unload()
2594 if (msp->ms_group != NULL) { in metaslab_unload()
2595 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_unload()
2597 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_unload()
2598 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_unload()
2602 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unload()
2605 "selected txg %llu (%llu ms ago), alloc_txg %llu, " in metaslab_unload()
2606 "loaded %llu ms ago, max_size %llu", in metaslab_unload()
2608 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_unload()
2609 (u_longlong_t)msp->ms_id, in metaslab_unload()
2610 (u_longlong_t)msp->ms_weight, in metaslab_unload()
2611 (u_longlong_t)msp->ms_selected_txg, in metaslab_unload()
2612 (u_longlong_t)(msp->ms_unload_time - in metaslab_unload()
2613 msp->ms_selected_time) / 1000 / 1000, in metaslab_unload()
2614 (u_longlong_t)msp->ms_alloc_txg, in metaslab_unload()
2615 (u_longlong_t)(msp->ms_unload_time - in metaslab_unload()
2616 msp->ms_load_time) / 1000 / 1000, in metaslab_unload()
2617 (u_longlong_t)msp->ms_max_size); in metaslab_unload()
2624 * loaded ones have it calculated from their in-core range tree in metaslab_unload()
2626 * available in-core, whether it is loaded or not. in metaslab_unload()
2632 if (msp->ms_group != NULL) in metaslab_unload()
2637 * We want to optimize the memory use of the per-metaslab range
2639 * units of sectors, zero-indexing from the start of the metaslab. If
2640 * the vdev_ms_shift - the vdev_ashift is less than 32, we can store
2647 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 && in metaslab_calculate_range_tree_type()
2649 *shift = vdev->vdev_ashift; in metaslab_calculate_range_tree_type()
2650 *start = msp->ms_start; in metaslab_calculate_range_tree_type()
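The metaslab_calculate_range_tree_type() hits above pick the per-metaslab range-tree representation: if the metaslab spans fewer than 2^32 sectors (vdev_ms_shift - vdev_ashift < 32), offsets can be stored as 32-bit sector counts relative to ms_start; otherwise 64-bit byte offsets are used (the comment fragments at file lines 285 and 297 refer to the related tunables). A sketch of that decision; the enum values and force_64bit flag are placeholders, not the zfs range_seg_type_t values:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

typedef enum { SEG32, SEG64 } seg_type_t;

/*
 * Encode segments as 32-bit sector offsets relative to the metaslab
 * start when they fit, 64-bit byte offsets otherwise (or when the
 * 64-bit mode is forced for debugging).
 */
static seg_type_t
range_tree_type(uint64_t ms_shift, uint64_t ashift, bool force_64bit,
    uint64_t ms_start, uint64_t *start, uint64_t *shift)
{
    if (ms_shift - ashift < 32 && !force_64bit) {
        *start = ms_start;   /* offsets are relative to this */
        *shift = ashift;     /* and counted in sectors */
        return (SEG32);
    }
    *start = 0;
    *shift = 0;
    return (SEG64);
}

int
main(void)
{
    uint64_t start, shift;
    seg_type_t t = range_tree_type(34, 12, false, 5ULL << 34, &start, &shift);
    printf("%s start=%llu shift=%llu\n", t == SEG32 ? "SEG32" : "SEG64",
        (unsigned long long)start, (unsigned long long)shift);
    return (0);
}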
2662 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_set_selected_txg()
2663 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_set_selected_txg()
2665 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_set_selected_txg()
2666 if (multilist_link_active(&msp->ms_class_txg_node)) in metaslab_set_selected_txg()
2668 msp->ms_selected_txg = txg; in metaslab_set_selected_txg()
2669 msp->ms_selected_time = gethrtime(); in metaslab_set_selected_txg()
2680 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent); in metaslab_space_update()
2681 ASSERT(vd->vdev_ms_count != 0); in metaslab_space_update()
2691 vdev_t *vd = mg->mg_vd; in metaslab_init()
2692 spa_t *spa = vd->vdev_spa; in metaslab_init()
2693 objset_t *mos = spa->spa_meta_objset; in metaslab_init()
2694 metaslab_t *ms; in metaslab_init() local
2697 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP); in metaslab_init()
2698 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_init()
2699 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL); in metaslab_init()
2700 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); in metaslab_init()
2701 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL); in metaslab_init()
2702 multilist_link_init(&ms->ms_class_txg_node); in metaslab_init()
2704 ms->ms_id = id; in metaslab_init()
2705 ms->ms_start = id << vd->vdev_ms_shift; in metaslab_init()
2706 ms->ms_size = 1ULL << vd->vdev_ms_shift; in metaslab_init()
2707 ms->ms_allocator = -1; in metaslab_init()
2708 ms->ms_new = B_TRUE; in metaslab_init()
2710 vdev_ops_t *ops = vd->vdev_ops; in metaslab_init()
2711 if (ops->vdev_op_metaslab_init != NULL) in metaslab_init()
2712 ops->vdev_op_metaslab_init(vd, &ms->ms_start, &ms->ms_size); in metaslab_init()
2726 if (object != 0 && !(spa->spa_mode == SPA_MODE_READ && in metaslab_init()
2727 !spa->spa_read_spacemaps)) { in metaslab_init()
2728 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, in metaslab_init()
2729 ms->ms_size, vd->vdev_ashift); in metaslab_init()
2732 kmem_free(ms, sizeof (metaslab_t)); in metaslab_init()
2736 ASSERT(ms->ms_sm != NULL); in metaslab_init()
2737 ms->ms_allocated_space = space_map_allocated(ms->ms_sm); in metaslab_init()
2742 metaslab_calculate_range_tree_type(vd, ms, &start, &shift); in metaslab_init()
2744 ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift); in metaslab_init()
2746 ms->ms_allocating[t] = range_tree_create(NULL, type, in metaslab_init()
2749 ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift); in metaslab_init()
2750 ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift); in metaslab_init()
2752 ms->ms_defer[t] = range_tree_create(NULL, type, NULL, in metaslab_init()
2755 ms->ms_checkpointing = in metaslab_init()
2757 ms->ms_unflushed_allocs = in metaslab_init()
2761 mrap->mra_bt = &ms->ms_unflushed_frees_by_size; in metaslab_init()
2762 mrap->mra_floor_shift = metaslab_by_size_min_shift; in metaslab_init()
2763 ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, in metaslab_init()
2766 ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift); in metaslab_init()
2768 metaslab_group_add(mg, ms); in metaslab_init()
2769 metaslab_set_fragmentation(ms, B_FALSE); in metaslab_init()
2781 metaslab_sync_done(ms, 0); in metaslab_init()
2782 metaslab_space_update(vd, mg->mg_class, in metaslab_init()
2783 metaslab_allocated_space(ms), 0, 0); in metaslab_init()
2788 vdev_dirty(vd, VDD_METASLAB, ms, txg); in metaslab_init()
2791 *msp = ms; in metaslab_init()
2799 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_fini_flush_data()
2802 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), in metaslab_fini_flush_data()
2808 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_fini_flush_data()
2809 avl_remove(&spa->spa_metaslabs_by_flushed, msp); in metaslab_fini_flush_data()
2810 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_fini_flush_data()
2818 metaslab_unflushed_changes_memused(metaslab_t *ms) in metaslab_unflushed_changes_memused() argument
2820 return ((range_tree_numsegs(ms->ms_unflushed_allocs) + in metaslab_unflushed_changes_memused()
2821 range_tree_numsegs(ms->ms_unflushed_frees)) * in metaslab_unflushed_changes_memused()
2822 ms->ms_unflushed_allocs->rt_root.bt_elem_size); in metaslab_unflushed_changes_memused()
2828 metaslab_group_t *mg = msp->ms_group; in metaslab_fini()
2829 vdev_t *vd = mg->mg_vd; in metaslab_fini()
2830 spa_t *spa = vd->vdev_spa; in metaslab_fini()
2836 mutex_enter(&msp->ms_lock); in metaslab_fini()
2837 VERIFY(msp->ms_group == NULL); in metaslab_fini()
2844 if (!msp->ms_new) { in metaslab_fini()
2845 metaslab_space_update(vd, mg->mg_class, in metaslab_fini()
2846 -metaslab_allocated_space(msp), 0, -msp->ms_size); in metaslab_fini()
2849 space_map_close(msp->ms_sm); in metaslab_fini()
2850 msp->ms_sm = NULL; in metaslab_fini()
2854 range_tree_destroy(msp->ms_allocatable); in metaslab_fini()
2855 range_tree_destroy(msp->ms_freeing); in metaslab_fini()
2856 range_tree_destroy(msp->ms_freed); in metaslab_fini()
2858 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_fini()
2860 spa->spa_unflushed_stats.sus_memused -= in metaslab_fini()
2862 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_fini()
2863 range_tree_destroy(msp->ms_unflushed_allocs); in metaslab_fini()
2864 range_tree_destroy(msp->ms_checkpointing); in metaslab_fini()
2865 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_fini()
2866 range_tree_destroy(msp->ms_unflushed_frees); in metaslab_fini()
2869 range_tree_destroy(msp->ms_allocating[t]); in metaslab_fini()
2872 range_tree_destroy(msp->ms_defer[t]); in metaslab_fini()
2874 ASSERT0(msp->ms_deferspace); in metaslab_fini()
2877 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); in metaslab_fini()
2879 range_tree_vacate(msp->ms_trim, NULL, NULL); in metaslab_fini()
2880 range_tree_destroy(msp->ms_trim); in metaslab_fini()
2882 mutex_exit(&msp->ms_lock); in metaslab_fini()
2883 cv_destroy(&msp->ms_load_cv); in metaslab_fini()
2884 cv_destroy(&msp->ms_flush_cv); in metaslab_fini()
2885 mutex_destroy(&msp->ms_lock); in metaslab_fini()
2886 mutex_destroy(&msp->ms_sync_lock); in metaslab_fini()
2887 ASSERT3U(msp->ms_allocator, ==, -1); in metaslab_fini()
2941 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_set_fragmentation()
2948 msp->ms_fragmentation = ZFS_FRAG_INVALID; in metaslab_set_fragmentation()
2956 if (msp->ms_sm == NULL) { in metaslab_set_fragmentation()
2957 msp->ms_fragmentation = 0; in metaslab_set_fragmentation()
2965 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { in metaslab_set_fragmentation()
2967 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_set_fragmentation()
2979 msp->ms_condense_wanted = B_TRUE; in metaslab_set_fragmentation()
2983 (u_longlong_t)msp->ms_id, in metaslab_set_fragmentation()
2984 (u_longlong_t)vd->vdev_id); in metaslab_set_fragmentation()
2986 msp->ms_fragmentation = ZFS_FRAG_INVALID; in metaslab_set_fragmentation()
2992 uint8_t shift = msp->ms_sm->sm_shift; in metaslab_set_fragmentation()
2994 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, in metaslab_set_fragmentation()
2995 FRAGMENTATION_TABLE_SIZE - 1); in metaslab_set_fragmentation()
2997 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) in metaslab_set_fragmentation()
3000 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); in metaslab_set_fragmentation()
3011 msp->ms_fragmentation = fragmentation; in metaslab_set_fragmentation()
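The metaslab_set_fragmentation() hits above derive a 0-100 fragmentation value by weighting each space-map histogram bucket with a per-size fragmentation table: small free segments count as highly fragmented space, large ones barely at all, and the result is the space-weighted average. A compact sketch of that computation; the table values, bucket count, and example histogram are illustrative assumptions, not the contents of zfs_frag_table:

#include <stdint.h>
#include <stdio.h>

#define HIST_SIZE       8
#define MINBLOCKSHIFT   9

/* Fragmentation percentage assigned to free segments of 2^(9+idx) bytes. */
static const int frag_table[] = { 100, 90, 80, 70, 50, 30, 10, 0 };
#define FRAG_TABLE_SIZE (sizeof (frag_table) / sizeof (frag_table[0]))

static uint64_t
fragmentation(const uint64_t *hist, int shift)
{
    uint64_t frag = 0, total = 0;

    for (int i = 0; i < HIST_SIZE; i++) {
        if (hist[i] == 0)
            continue;
        /* Index the table by absolute segment size. */
        int idx = shift - MINBLOCKSHIFT + i;
        if (idx > (int)FRAG_TABLE_SIZE - 1)
            idx = FRAG_TABLE_SIZE - 1;
        uint64_t space = hist[i] << (i + shift);
        frag += space * frag_table[idx];
        total += space;
    }
    return (total != 0 ? frag / total : 0);
}

int
main(void)
{
    uint64_t hist[HIST_SIZE] = { 0 };

    hist[0] = 1000;   /* many tiny ~4 KiB holes */
    hist[5] = 2;      /* a couple of ~128 KiB runs */
    printf("fragmentation: %llu%%\n",
        (unsigned long long)fragmentation(hist, 12));
    return (0);
}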
3015 * Compute a weight -- a selection preference value -- for the given metaslab.
3022 metaslab_group_t *mg = msp->ms_group; in metaslab_space_weight()
3023 vdev_t *vd = mg->mg_vd; in metaslab_space_weight()
3026 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_space_weight()
3031 space = msp->ms_size - metaslab_allocated_space(msp); in metaslab_space_weight()
3034 msp->ms_fragmentation != ZFS_FRAG_INVALID) { in metaslab_space_weight()
3042 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; in metaslab_space_weight()
3065 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) { in metaslab_space_weight()
3066 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; in metaslab_space_weight()
3076 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && in metaslab_space_weight()
3077 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { in metaslab_space_weight()
3078 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_space_weight()
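The metaslab_space_weight() hits above start from the metaslab's free space, discount it by the fragmentation percentage, and, on rotational vdevs, apply LBA weighting so lower-numbered (outer, faster) metaslabs look heavier: weight = 2*weight - (id*weight)/count. A small sketch of that arithmetic; the function and parameter names are simplified, and the active-mask handling is omitted:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

static uint64_t
space_weight(uint64_t free_space, uint64_t frag_pct, bool rotational,
    uint64_t ms_id, uint64_t ms_count)
{
    uint64_t weight = free_space;

    /* Discount free space by how fragmented it is (frag_pct is 0-100). */
    if (frag_pct <= 100)
        weight = weight * (100 - frag_pct) / 100;

    /*
     * LBA weighting: the first metaslab gets roughly 2x its space,
     * the last one roughly 1x, scaling linearly in between.
     */
    if (rotational)
        weight = 2 * weight - (ms_id * weight) / ms_count;

    return (weight);
}

int
main(void)
{
    uint64_t first = space_weight(1ULL << 30, 20, true, 0, 200);
    uint64_t last = space_weight(1ULL << 30, 20, true, 199, 200);
    printf("first %llu last %llu\n",
        (unsigned long long)first, (unsigned long long)last);
    return (0);
}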
3086 * Return the weight of the specified metaslab, according to the segment-based
3097 ASSERT(msp->ms_loaded); in metaslab_weight_from_range_tree()
3099 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT; in metaslab_weight_from_range_tree()
3100 i--) { in metaslab_weight_from_range_tree()
3101 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift; in metaslab_weight_from_range_tree()
3102 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; in metaslab_weight_from_range_tree()
3105 segments += msp->ms_allocatable->rt_histogram[i]; in metaslab_weight_from_range_tree()
3128 * Calculate the weight based on the on-disk histogram. Should be applied
3129 * only to unloaded metaslabs (i.e no incoming allocations) in-order to
3130 * give results consistent with the on-disk state
3135 space_map_t *sm = msp->ms_sm; in metaslab_weight_from_spacemap()
3136 ASSERT(!msp->ms_loaded); in metaslab_weight_from_spacemap()
3139 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); in metaslab_weight_from_spacemap()
3151 deferspace_histogram[i] += msp->ms_synchist[i]; in metaslab_weight_from_spacemap()
3154 deferspace_histogram[i] += msp->ms_deferhist[t][i]; in metaslab_weight_from_spacemap()
3159 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) { in metaslab_weight_from_spacemap()
3160 ASSERT3U(sm->sm_phys->smp_histogram[i], >=, in metaslab_weight_from_spacemap()
3163 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i]; in metaslab_weight_from_spacemap()
3166 WEIGHT_SET_INDEX(weight, i + sm->sm_shift); in metaslab_weight_from_spacemap()
3175 * Compute a segment-based weight for the specified metaslab. The weight
3182 metaslab_group_t *mg = msp->ms_group; in metaslab_segment_weight()
3184 uint8_t shift = mg->mg_vd->vdev_ashift; in metaslab_segment_weight()
3186 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_segment_weight()
3192 int idx = highbit64(msp->ms_size) - 1; in metaslab_segment_weight()
3193 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1; in metaslab_segment_weight()
3199 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx)); in metaslab_segment_weight()
3207 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t)); in metaslab_segment_weight()
3212 if (metaslab_allocated_space(msp) == msp->ms_size) in metaslab_segment_weight()
3219 if (msp->ms_loaded) { in metaslab_segment_weight()
3230 if (msp->ms_activation_weight != 0 && weight != 0) in metaslab_segment_weight()
3231 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); in metaslab_segment_weight()
3240 * weight. For segment-based weighting we can determine the maximum
3241 * allocation based on the index encoded in its value. For space-based
3242 * weights we rely on the entire weight (excluding the weight-type bit).
3253 if (unlikely(msp->ms_new)) in metaslab_should_allocate()
3263 if (msp->ms_loaded || in metaslab_should_allocate()
3264 (msp->ms_max_size != 0 && !try_hard && gethrtime() < in metaslab_should_allocate()
3265 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) in metaslab_should_allocate()
3266 return (msp->ms_max_size >= asize); in metaslab_should_allocate()
3269 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { in metaslab_should_allocate()
3277 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); in metaslab_should_allocate()
3280 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); in metaslab_should_allocate()
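/*
 * Worked example: a segment-based weight with index i advertises at least
 * one free segment in [2^i, 2^(i+1)), so the allocation is attempted
 * whenever asize < 2^(i+1); with i = 17 anything under 256 KiB is worth
 * trying. For a space-based weight the value itself (minus the type bit)
 * is the free-space estimate and is compared against asize directly.
 */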
3289 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_weight()
3290 spa_t *spa = vd->vdev_spa; in metaslab_weight()
3293 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_weight()
3307 if (msp->ms_loaded) { in metaslab_weight()
3308 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_weight()
3310 msp->ms_max_size = MAX(msp->ms_max_size, in metaslab_weight()
3315 * Segment-based weighting requires space map histogram support. in metaslab_weight()
3319 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == in metaslab_weight()
3331 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_recalculate_weight_and_sort()
3334 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_recalculate_weight_and_sort()
3335 metaslab_group_sort(msp->ms_group, msp, in metaslab_recalculate_weight_and_sort()
3343 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_activate_allocator()
3344 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_activate_allocator()
3351 ASSERT0(msp->ms_activation_weight); in metaslab_activate_allocator()
3352 msp->ms_activation_weight = msp->ms_weight; in metaslab_activate_allocator()
3353 metaslab_group_sort(mg, msp, msp->ms_weight | in metaslab_activate_allocator()
3359 &mga->mga_primary : &mga->mga_secondary); in metaslab_activate_allocator()
3361 mutex_enter(&mg->mg_lock); in metaslab_activate_allocator()
3363 mutex_exit(&mg->mg_lock); in metaslab_activate_allocator()
3368 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_activate_allocator()
3369 msp->ms_allocator = allocator; in metaslab_activate_allocator()
3370 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); in metaslab_activate_allocator()
3372 ASSERT0(msp->ms_activation_weight); in metaslab_activate_allocator()
3373 msp->ms_activation_weight = msp->ms_weight; in metaslab_activate_allocator()
3375 msp->ms_weight | activation_weight); in metaslab_activate_allocator()
3376 mutex_exit(&mg->mg_lock); in metaslab_activate_allocator()
3384 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_activate()
3397 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { in metaslab_activate()
3398 ASSERT(msp->ms_loaded); in metaslab_activate()
3404 metaslab_group_sort(msp->ms_group, msp, 0); in metaslab_activate()
3423 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { in metaslab_activate()
3424 if (msp->ms_allocator != allocator) in metaslab_activate()
3427 if ((msp->ms_weight & activation_weight) == 0) in metaslab_activate()
3431 msp->ms_primary); in metaslab_activate()
3442 if (msp->ms_weight == 0) { in metaslab_activate()
3443 ASSERT0(range_tree_space(msp->ms_allocatable)); in metaslab_activate()
3447 if ((error = metaslab_activate_allocator(msp->ms_group, msp, in metaslab_activate()
3452 ASSERT(msp->ms_loaded); in metaslab_activate()
3453 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_activate()
3462 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_passivate_allocator()
3463 ASSERT(msp->ms_loaded); in metaslab_passivate_allocator()
3465 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { in metaslab_passivate_allocator()
3470 mutex_enter(&mg->mg_lock); in metaslab_passivate_allocator()
3471 ASSERT3P(msp->ms_group, ==, mg); in metaslab_passivate_allocator()
3472 ASSERT3S(0, <=, msp->ms_allocator); in metaslab_passivate_allocator()
3473 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators); in metaslab_passivate_allocator()
3475 metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator]; in metaslab_passivate_allocator()
3476 if (msp->ms_primary) { in metaslab_passivate_allocator()
3477 ASSERT3P(mga->mga_primary, ==, msp); in metaslab_passivate_allocator()
3478 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_passivate_allocator()
3479 mga->mga_primary = NULL; in metaslab_passivate_allocator()
3481 ASSERT3P(mga->mga_secondary, ==, msp); in metaslab_passivate_allocator()
3482 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_passivate_allocator()
3483 mga->mga_secondary = NULL; in metaslab_passivate_allocator()
3485 msp->ms_allocator = -1; in metaslab_passivate_allocator()
3487 mutex_exit(&mg->mg_lock); in metaslab_passivate_allocator()
3500 ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) || in metaslab_passivate()
3502 range_tree_space(msp->ms_allocatable) == 0); in metaslab_passivate()
3505 ASSERT(msp->ms_activation_weight != 0); in metaslab_passivate()
3506 msp->ms_activation_weight = 0; in metaslab_passivate()
3507 metaslab_passivate_allocator(msp->ms_group, msp, weight); in metaslab_passivate()
3508 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_passivate()
3512 * Segment-based metaslabs are activated once and remain active until
3513 * we either fail an allocation attempt (similar to space-based metaslabs)
3525 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_segment_may_passivate()
3527 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1) in metaslab_segment_may_passivate()
3532 * information that is accessible to us is the in-core range tree in metaslab_segment_may_passivate()
3536 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight); in metaslab_segment_may_passivate()
3539 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold) in metaslab_segment_may_passivate()
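/*
 * Worked example (hypothetical values): with zfs_metaslab_switch_threshold
 * = 2, a metaslab activated at weight index 20 (1-2 MiB segments) stays
 * active until the index recomputed from the in-core tree drops to 18 or
 * below, i.e. until its largest free segments have shrunk by at least two
 * power-of-two classes.
 */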
3547 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_preload()
3548 spa_t *spa = mc->mc_spa; in metaslab_preload()
3551 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock)); in metaslab_preload()
3553 mutex_enter(&msp->ms_lock); in metaslab_preload()
3556 mutex_exit(&msp->ms_lock); in metaslab_preload()
3563 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_group_preload()
3565 avl_tree_t *t = &mg->mg_metaslab_tree; in metaslab_group_preload()
3571 mutex_enter(&mg->mg_lock); in metaslab_group_preload()
3577 ASSERT3P(msp->ms_group, ==, mg); in metaslab_group_preload()
3585 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { in metaslab_group_preload()
3589 VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload, in metaslab_group_preload()
3590 msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0)) in metaslab_group_preload()
3593 mutex_exit(&mg->mg_lock); in metaslab_group_preload()
3597 * Determine if the space map's on-disk footprint is past our tolerance for
3604 * 2. Condense if the on-disk space map representation is at least
3606 * (e.g. zfs_condense_pct = 110 and optimal = 1MB: condense once on-disk >= 1.1MB).
3608 * 3. Do not condense if the on-disk size of the space map does not actually
3611 * Unfortunately, we cannot compute the on-disk size of the space map in this
3614 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3620 space_map_t *sm = msp->ms_sm; in metaslab_should_condense()
3621 vdev_t *vd = msp->ms_group->mg_vd; in metaslab_should_condense()
3622 uint64_t vdev_blocksize = 1ULL << vd->vdev_ashift; in metaslab_should_condense()
3624 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_should_condense()
3625 ASSERT(msp->ms_loaded); in metaslab_should_condense()
3627 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1); in metaslab_should_condense()
3633 if (range_tree_numsegs(msp->ms_allocatable) == 0 || in metaslab_should_condense()
3634 msp->ms_condense_wanted) in metaslab_should_condense()
3637 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize); in metaslab_should_condense()
3640 msp->ms_allocatable, SM_NO_VDEVID); in metaslab_should_condense()
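/*
 * Worked example for criteria 2 and 3 above (hypothetical sizes): with
 * zfs_condense_pct = 110 and an estimated optimal representation of 1 MB,
 * condensing starts once the space map's on-disk length reaches 1.1 MB, and
 * only if that length also exceeds zfs_metaslab_condense_block_threshold
 * records of record_size = MAX(sm_blksz, 1 << vdev_ashift), so small space
 * maps are never rewritten just to save a block or two.
 */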
3647 * Condense the on-disk space map representation to its minimized form.
3651 * the pool-wide log spacemaps; thus this is effectively a superset of
3658 space_map_t *sm = msp->ms_sm; in metaslab_condense()
3660 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_condense()
3662 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_condense()
3663 ASSERT(msp->ms_loaded); in metaslab_condense()
3664 ASSERT(msp->ms_sm != NULL); in metaslab_condense()
3709 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */ in metaslab_condense()
3713 (u_longlong_t)txg, (u_longlong_t)msp->ms_id, msp, in metaslab_condense()
3714 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_condense()
3715 spa->spa_name, (u_longlong_t)space_map_length(msp->ms_sm), in metaslab_condense()
3716 (u_longlong_t)range_tree_numsegs(msp->ms_allocatable), in metaslab_condense()
3717 msp->ms_condense_wanted ? "TRUE" : "FALSE"); in metaslab_condense()
3719 msp->ms_condense_wanted = B_FALSE; in metaslab_condense()
3723 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp, in metaslab_condense()
3729 range_tree_walk(msp->ms_defer[t], in metaslab_condense()
3734 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK], in metaslab_condense()
3738 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_condense()
3740 spa->spa_unflushed_stats.sus_memused -= in metaslab_condense()
3742 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_condense()
3743 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_condense()
3753 msp->ms_condensing = B_TRUE; in metaslab_condense()
3755 mutex_exit(&msp->ms_lock); in metaslab_condense()
3756 uint64_t object = space_map_object(msp->ms_sm); in metaslab_condense()
3765 if (space_map_object(msp->ms_sm) != object) { in metaslab_condense()
3766 object = space_map_object(msp->ms_sm); in metaslab_condense()
3767 dmu_write(spa->spa_meta_objset, in metaslab_condense()
3768 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) * in metaslab_condense()
3769 msp->ms_id, sizeof (uint64_t), &object, tx); in metaslab_condense()
3785 range_tree_add(tmp_tree, msp->ms_start, msp->ms_size); in metaslab_condense()
3787 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); in metaslab_condense()
3794 mutex_enter(&msp->ms_lock); in metaslab_condense()
3796 msp->ms_condensing = B_FALSE; in metaslab_condense()
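/*
 * Worked example of the rewrite above (hypothetical counts): a space map
 * that has accumulated tens of thousands of appended ALLOC/FREE entries
 * over many txgs, while the in-core trees hold only a few dozen segments,
 * is replaced by a single ALLOC record spanning [ms_start, ms_start +
 * ms_size) followed by FREE records for ms_allocatable (and the other
 * in-core trees that must still read back as free), typically shrinking
 * the on-disk object by orders of magnitude.
 */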
3803 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unflushed_add()
3805 ASSERT(msp->ms_sm != NULL); in metaslab_unflushed_add()
3806 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_unflushed_add()
3807 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_unflushed_add()
3809 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_unflushed_add()
3812 avl_add(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_add()
3813 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_unflushed_add()
3822 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_unflushed_bump()
3824 ASSERT(msp->ms_sm != NULL); in metaslab_unflushed_bump()
3826 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp); in metaslab_unflushed_bump()
3827 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_unflushed_bump()
3828 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_unflushed_bump()
3830 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa)); in metaslab_unflushed_bump()
3835 mutex_enter(&spa->spa_flushed_ms_lock); in metaslab_unflushed_bump()
3836 avl_remove(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_bump()
3839 avl_add(&spa->spa_metaslabs_by_flushed, msp); in metaslab_unflushed_bump()
3840 mutex_exit(&spa->spa_flushed_ms_lock); in metaslab_unflushed_bump()
3857 * all the contents of the pool-wide spacemap log). Updates the metaslab's
3858 * metadata and any pool-wide related log space map data (e.g. summary,
3864 metaslab_group_t *mg = msp->ms_group; in metaslab_flush_update()
3865 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_flush_update()
3867 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush_update()
3876 msp->ms_synced_length = space_map_length(msp->ms_sm); in metaslab_flush_update()
3880 * feature being active. In that case this is a no-op. in metaslab_flush_update()
3892 spa_t *spa = msp->ms_group->mg_vd->vdev_spa; in metaslab_flush()
3894 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_flush()
3898 ASSERT(msp->ms_sm != NULL); in metaslab_flush()
3900 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL); in metaslab_flush()
3915 if (msp->ms_loading) in metaslab_flush()
3932 if (msp->ms_loaded && metaslab_should_condense(msp)) { in metaslab_flush()
3933 metaslab_group_t *mg = msp->ms_group; in metaslab_flush()
3941 metaslab_class_histogram_verify(mg->mg_class); in metaslab_flush()
3946 space_map_histogram_clear(msp->ms_sm); in metaslab_flush()
3947 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); in metaslab_flush()
3948 ASSERT(range_tree_is_empty(msp->ms_freed)); in metaslab_flush()
3950 space_map_histogram_add(msp->ms_sm, in metaslab_flush()
3951 msp->ms_defer[t], tx); in metaslab_flush()
3957 metaslab_class_histogram_verify(mg->mg_class); in metaslab_flush()
3972 msp->ms_flushing = B_TRUE; in metaslab_flush()
3973 uint64_t sm_len_before = space_map_length(msp->ms_sm); in metaslab_flush()
3975 mutex_exit(&msp->ms_lock); in metaslab_flush()
3976 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, in metaslab_flush()
3978 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, in metaslab_flush()
3980 mutex_enter(&msp->ms_lock); in metaslab_flush()
3982 uint64_t sm_len_after = space_map_length(msp->ms_sm); in metaslab_flush()
3988 (u_longlong_t)msp->ms_group->mg_vd->vdev_id, in metaslab_flush()
3989 (u_longlong_t)msp->ms_id, in metaslab_flush()
3990 (u_longlong_t)range_tree_space(msp->ms_unflushed_allocs), in metaslab_flush()
3991 (u_longlong_t)range_tree_space(msp->ms_unflushed_frees), in metaslab_flush()
3992 (u_longlong_t)(sm_len_after - sm_len_before)); in metaslab_flush()
3995 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_flush()
3997 spa->spa_unflushed_stats.sus_memused -= in metaslab_flush()
3999 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); in metaslab_flush()
4000 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); in metaslab_flush()
4010 msp->ms_flushing = B_FALSE; in metaslab_flush()
4011 cv_broadcast(&msp->ms_flush_cv); in metaslab_flush()
4021 metaslab_group_t *mg = msp->ms_group; in metaslab_sync()
4022 vdev_t *vd = mg->mg_vd; in metaslab_sync()
4023 spa_t *spa = vd->vdev_spa; in metaslab_sync()
4025 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; in metaslab_sync()
4028 ASSERT(!vd->vdev_ishole); in metaslab_sync()
4033 if (msp->ms_new) { in metaslab_sync()
4035 ASSERT0(range_tree_space(msp->ms_freeing)); in metaslab_sync()
4036 ASSERT0(range_tree_space(msp->ms_freed)); in metaslab_sync()
4037 ASSERT0(range_tree_space(msp->ms_checkpointing)); in metaslab_sync()
4038 ASSERT0(range_tree_space(msp->ms_trim)); in metaslab_sync()
4054 range_tree_is_empty(msp->ms_freeing) && in metaslab_sync()
4055 range_tree_is_empty(msp->ms_checkpointing) && in metaslab_sync()
4056 !(msp->ms_loaded && msp->ms_condense_wanted && in metaslab_sync()
4083 if (msp->ms_sm == NULL) { in metaslab_sync()
4090 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * in metaslab_sync()
4091 msp->ms_id, sizeof (uint64_t), &new_object, tx); in metaslab_sync()
4093 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, in metaslab_sync()
4094 msp->ms_start, msp->ms_size, vd->vdev_ashift)); in metaslab_sync()
4095 ASSERT(msp->ms_sm != NULL); in metaslab_sync()
4097 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); in metaslab_sync()
4098 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); in metaslab_sync()
4102 if (!range_tree_is_empty(msp->ms_checkpointing) && in metaslab_sync()
4103 vd->vdev_checkpoint_sm == NULL) { in metaslab_sync()
4110 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, in metaslab_sync()
4111 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); in metaslab_sync()
4112 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); in metaslab_sync()
4119 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, in metaslab_sync()
4120 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, in metaslab_sync()
4124 mutex_enter(&msp->ms_sync_lock); in metaslab_sync()
4125 mutex_enter(&msp->ms_lock); in metaslab_sync()
4133 metaslab_class_histogram_verify(mg->mg_class); in metaslab_sync()
4136 if (spa->spa_sync_pass == 1 && msp->ms_loaded && in metaslab_sync()
4143 * open-context (ZIL) for future TXGs do not block. in metaslab_sync()
4145 mutex_exit(&msp->ms_lock); in metaslab_sync()
4155 vd->vdev_id, tx); in metaslab_sync()
4156 space_map_write(log_sm, msp->ms_freeing, SM_FREE, in metaslab_sync()
4157 vd->vdev_id, tx); in metaslab_sync()
4158 mutex_enter(&msp->ms_lock); in metaslab_sync()
4160 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, in metaslab_sync()
4162 spa->spa_unflushed_stats.sus_memused -= in metaslab_sync()
4165 msp->ms_unflushed_frees, msp->ms_unflushed_allocs); in metaslab_sync()
4166 range_tree_remove_xor_add(msp->ms_freeing, in metaslab_sync()
4167 msp->ms_unflushed_allocs, msp->ms_unflushed_frees); in metaslab_sync()
4168 spa->spa_unflushed_stats.sus_memused += in metaslab_sync()
4173 space_map_write(msp->ms_sm, alloctree, SM_ALLOC, in metaslab_sync()
4175 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, in metaslab_sync()
4177 mutex_enter(&msp->ms_lock); in metaslab_sync()
4180 msp->ms_allocated_space += range_tree_space(alloctree); in metaslab_sync()
4181 ASSERT3U(msp->ms_allocated_space, >=, in metaslab_sync()
4182 range_tree_space(msp->ms_freeing)); in metaslab_sync()
4183 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); in metaslab_sync()
4185 if (!range_tree_is_empty(msp->ms_checkpointing)) { in metaslab_sync()
4187 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); in metaslab_sync()
4195 mutex_exit(&msp->ms_lock); in metaslab_sync()
4196 space_map_write(vd->vdev_checkpoint_sm, in metaslab_sync()
4197 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); in metaslab_sync()
4198 mutex_enter(&msp->ms_lock); in metaslab_sync()
4200 spa->spa_checkpoint_info.sci_dspace += in metaslab_sync()
4201 range_tree_space(msp->ms_checkpointing); in metaslab_sync()
4202 vd->vdev_stat.vs_checkpoint_space += in metaslab_sync()
4203 range_tree_space(msp->ms_checkpointing); in metaslab_sync()
4204 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, in metaslab_sync()
4205 -space_map_allocated(vd->vdev_checkpoint_sm)); in metaslab_sync()
4207 range_tree_vacate(msp->ms_checkpointing, NULL, NULL); in metaslab_sync()
4210 if (msp->ms_loaded) { in metaslab_sync()
4214 * to bring the space map's histogram up-to-date so we clear in metaslab_sync()
4217 space_map_histogram_clear(msp->ms_sm); in metaslab_sync()
4218 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); in metaslab_sync()
4223 * any deferred space. This allows the on-disk histogram in metaslab_sync()
4227 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); in metaslab_sync()
4231 * added back into the in-core free tree yet. This will in metaslab_sync()
4237 space_map_histogram_add(msp->ms_sm, in metaslab_sync()
4238 msp->ms_defer[t], tx); in metaslab_sync()
4244 * map histogram. We want to make sure that the on-disk histogram in metaslab_sync()
4249 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); in metaslab_sync()
4254 metaslab_class_histogram_verify(mg->mg_class); in metaslab_sync()
4267 range_tree_swap(&msp->ms_freeing, &msp->ms_freed); in metaslab_sync()
4268 ASSERT0(msp->ms_allocated_this_txg); in metaslab_sync()
4270 range_tree_vacate(msp->ms_freeing, in metaslab_sync()
4271 range_tree_add, msp->ms_freed); in metaslab_sync()
4273 msp->ms_allocated_this_txg += range_tree_space(alloctree); in metaslab_sync()
4276 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); in metaslab_sync()
4277 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) in metaslab_sync()
4279 ASSERT0(range_tree_space(msp->ms_freeing)); in metaslab_sync()
4280 ASSERT0(range_tree_space(msp->ms_checkpointing)); in metaslab_sync()
4282 mutex_exit(&msp->ms_lock); in metaslab_sync()
4289 VERIFY0(dmu_read(mos, vd->vdev_ms_array, in metaslab_sync()
4290 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); in metaslab_sync()
4291 VERIFY3U(object, ==, space_map_object(msp->ms_sm)); in metaslab_sync()
4293 mutex_exit(&msp->ms_sync_lock); in metaslab_sync()
4300 if (!msp->ms_loaded || msp->ms_disabled != 0) in metaslab_evict()
4305 msp->ms_allocating[(txg + t) & TXG_MASK])); in metaslab_evict()
4307 if (msp->ms_allocator != -1) in metaslab_evict()
4308 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); in metaslab_evict()
4321 metaslab_group_t *mg = msp->ms_group; in metaslab_sync_done()
4322 vdev_t *vd = mg->mg_vd; in metaslab_sync_done()
4323 spa_t *spa = vd->vdev_spa; in metaslab_sync_done()
4328 ASSERT(!vd->vdev_ishole); in metaslab_sync_done()
4330 mutex_enter(&msp->ms_lock); in metaslab_sync_done()
4332 if (msp->ms_new) { in metaslab_sync_done()
4334 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); in metaslab_sync_done()
4337 VERIFY0(msp->ms_allocated_this_txg); in metaslab_sync_done()
4338 VERIFY0(range_tree_space(msp->ms_freed)); in metaslab_sync_done()
4341 ASSERT0(range_tree_space(msp->ms_freeing)); in metaslab_sync_done()
4342 ASSERT0(range_tree_space(msp->ms_checkpointing)); in metaslab_sync_done()
4344 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; in metaslab_sync_done()
4346 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - in metaslab_sync_done()
4348 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing || in metaslab_sync_done()
4349 vd->vdev_rz_expanding) { in metaslab_sync_done()
4354 alloc_delta = msp->ms_allocated_this_txg - in metaslab_sync_done()
4355 range_tree_space(msp->ms_freed); in metaslab_sync_done()
4358 defer_delta = range_tree_space(msp->ms_freed) - in metaslab_sync_done()
4361 defer_delta -= range_tree_space(*defer_tree); in metaslab_sync_done()
4363 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, in metaslab_sync_done()
4372 * have a consistent view at the in-core side of the metaslab. in metaslab_sync_done()
4380 * When auto-trimming is enabled, free ranges which are added to in metaslab_sync_done()
4388 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); in metaslab_sync_done()
4390 range_tree_walk(msp->ms_freed, range_tree_add, in metaslab_sync_done()
4391 msp->ms_trim); in metaslab_sync_done()
4394 range_tree_vacate(msp->ms_trim, NULL, NULL); in metaslab_sync_done()
4400 * the defer_tree -- this is safe to do because we've in metaslab_sync_done()
4404 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); in metaslab_sync_done()
4406 range_tree_swap(&msp->ms_freed, defer_tree); in metaslab_sync_done()
4408 range_tree_vacate(msp->ms_freed, in metaslab_sync_done()
4409 msp->ms_loaded ? range_tree_add : NULL, in metaslab_sync_done()
4410 msp->ms_allocatable); in metaslab_sync_done()
4413 msp->ms_synced_length = space_map_length(msp->ms_sm); in metaslab_sync_done()
4415 msp->ms_deferspace += defer_delta; in metaslab_sync_done()
4416 ASSERT3S(msp->ms_deferspace, >=, 0); in metaslab_sync_done()
4417 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); in metaslab_sync_done()
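/*
 * Worked example for the delta accounting above (hypothetical numbers,
 * deferral allowed): a txg that allocated 3 MB and freed 5 MB while the
 * defer bucket being recycled held 2 MB gives alloc_delta = 3 - 5 = -2 MB
 * and defer_delta = 5 - 2 = +3 MB. The vdev's allocated space therefore
 * moves by alloc_delta + defer_delta = +1 MB (3 MB newly allocated minus
 * the 2 MB of old deferred frees that just became allocatable), while
 * ms_deferspace grows by 3 MB.
 */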
4418 if (msp->ms_deferspace != 0) { in metaslab_sync_done()
4427 if (msp->ms_new) { in metaslab_sync_done()
4428 msp->ms_new = B_FALSE; in metaslab_sync_done()
4429 mutex_enter(&mg->mg_lock); in metaslab_sync_done()
4430 mg->mg_ms_ready++; in metaslab_sync_done()
4431 mutex_exit(&mg->mg_lock); in metaslab_sync_done()
4435 * Re-sort metaslab within its group now that we've adjusted in metaslab_sync_done()
4440 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); in metaslab_sync_done()
4441 ASSERT0(range_tree_space(msp->ms_freeing)); in metaslab_sync_done()
4442 ASSERT0(range_tree_space(msp->ms_freed)); in metaslab_sync_done()
4443 ASSERT0(range_tree_space(msp->ms_checkpointing)); in metaslab_sync_done()
4444 msp->ms_allocating_total -= msp->ms_allocated_this_txg; in metaslab_sync_done()
4445 msp->ms_allocated_this_txg = 0; in metaslab_sync_done()
4446 mutex_exit(&msp->ms_lock); in metaslab_sync_done()
4452 spa_t *spa = mg->mg_class->mc_spa; in metaslab_sync_reassess()
4456 mg->mg_fragmentation = metaslab_group_fragmentation(mg); in metaslab_sync_reassess()
4465 if (mg->mg_activation_count > 0) { in metaslab_sync_reassess()
4484 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) in metaslab_is_unique()
4487 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; in metaslab_is_unique()
4489 return (msp->ms_id != dva_ms_id); in metaslab_is_unique()
4518 if (zal->zal_size == metaslab_trace_max_entries) { in metaslab_trace_add()
4524 zal->zal_size--; in metaslab_trace_add()
4525 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); in metaslab_trace_add()
4526 list_remove(&zal->zal_list, mat_next); in metaslab_trace_add()
4531 list_link_init(&mat->mat_list_node); in metaslab_trace_add()
4532 mat->mat_mg = mg; in metaslab_trace_add()
4533 mat->mat_msp = msp; in metaslab_trace_add()
4534 mat->mat_size = psize; in metaslab_trace_add()
4535 mat->mat_dva_id = dva_id; in metaslab_trace_add()
4536 mat->mat_offset = offset; in metaslab_trace_add()
4537 mat->mat_weight = 0; in metaslab_trace_add()
4538 mat->mat_allocator = allocator; in metaslab_trace_add()
4541 mat->mat_weight = msp->ms_weight; in metaslab_trace_add()
4547 list_insert_tail(&zal->zal_list, mat); in metaslab_trace_add()
4548 zal->zal_size++; in metaslab_trace_add()
4550 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); in metaslab_trace_add()
4556 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), in metaslab_trace_init()
4558 zal->zal_size = 0; in metaslab_trace_init()
4566 while ((mat = list_remove_head(&zal->zal_list)) != NULL) in metaslab_trace_fini()
4568 list_destroy(&zal->zal_list); in metaslab_trace_fini()
4569 zal->zal_size = 0; in metaslab_trace_fini()
4586 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; in metaslab_group_alloc_increment()
4587 if (!mg->mg_class->mc_alloc_throttle_enabled) in metaslab_group_alloc_increment()
4590 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_increment()
4591 (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); in metaslab_group_alloc_increment()
4597 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_increment_qdepth()
4599 &mg->mg_class->mc_allocator[allocator]; in metaslab_group_increment_qdepth()
4600 uint64_t max = mg->mg_max_alloc_queue_depth; in metaslab_group_increment_qdepth()
4601 uint64_t cur = mga->mga_cur_max_alloc_queue_depth; in metaslab_group_increment_qdepth()
4603 if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, in metaslab_group_increment_qdepth()
4605 atomic_inc_64(&mca->mca_alloc_max_slots); in metaslab_group_increment_qdepth()
4608 cur = mga->mga_cur_max_alloc_queue_depth; in metaslab_group_increment_qdepth()
4620 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; in metaslab_group_alloc_decrement()
4621 if (!mg->mg_class->mc_alloc_throttle_enabled) in metaslab_group_alloc_decrement()
4624 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_decrement()
4625 (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); in metaslab_group_alloc_decrement()
4635 const dva_t *dva = bp->blk_dva; in metaslab_group_alloc_verify()
4640 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; in metaslab_group_alloc_verify()
4641 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_verify()
4642 VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); in metaslab_group_alloc_verify()
4651 range_tree_t *rt = msp->ms_allocatable; in metaslab_block_alloc()
4652 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_block_alloc()
4654 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_block_alloc()
4655 VERIFY(!msp->ms_condensing); in metaslab_block_alloc()
4656 VERIFY0(msp->ms_disabled); in metaslab_block_alloc()
4657 VERIFY0(msp->ms_new); in metaslab_block_alloc()
4659 start = mc->mc_ops->msop_alloc(msp, size); in metaslab_block_alloc()
4660 if (start != -1ULL) { in metaslab_block_alloc()
4661 metaslab_group_t *mg = msp->ms_group; in metaslab_block_alloc()
4662 vdev_t *vd = mg->mg_vd; in metaslab_block_alloc()
4664 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); in metaslab_block_alloc()
4665 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_block_alloc()
4666 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); in metaslab_block_alloc()
4668 range_tree_clear(msp->ms_trim, start, size); in metaslab_block_alloc()
4670 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) in metaslab_block_alloc()
4671 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); in metaslab_block_alloc()
4673 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); in metaslab_block_alloc()
4674 msp->ms_allocating_total += size; in metaslab_block_alloc()
4677 msp->ms_alloc_txg = txg; in metaslab_block_alloc()
4685 msp->ms_max_size = metaslab_largest_allocatable(msp); in metaslab_block_alloc()
4695 * have selected, we may not try the newly-activated metaslab, and instead
4698 * except for the newly-activated metaslab which we fail to examine).
4707 avl_tree_t *t = &mg->mg_metaslab_tree; in find_valid_metaslab()
4732 if (msp->ms_condensing || msp->ms_disabled > 0 || msp->ms_new) in find_valid_metaslab()
4735 *was_active = msp->ms_allocator != -1; in find_valid_metaslab()
4756 search->ms_weight = msp->ms_weight; in find_valid_metaslab()
4757 search->ms_start = msp->ms_start + 1; in find_valid_metaslab()
4758 search->ms_allocator = msp->ms_allocator; in find_valid_metaslab()
4759 search->ms_primary = msp->ms_primary; in find_valid_metaslab()
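/*
 * The search key above copies the weight, allocator, and primary flag of
 * the metaslab just examined but bumps ms_start by one, so it sorts
 * immediately after that metaslab in the group's tree; the next lookup
 * then resumes the walk right behind it instead of restarting from the
 * heaviest metaslab.
 */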
4767 ASSERT(MUTEX_HELD(&msp->ms_lock)); in metaslab_active_mask_verify()
4772 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) in metaslab_active_mask_verify()
4775 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { in metaslab_active_mask_verify()
4776 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_active_mask_verify()
4777 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); in metaslab_active_mask_verify()
4778 VERIFY3S(msp->ms_allocator, !=, -1); in metaslab_active_mask_verify()
4779 VERIFY(msp->ms_primary); in metaslab_active_mask_verify()
4783 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { in metaslab_active_mask_verify()
4784 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_active_mask_verify()
4785 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); in metaslab_active_mask_verify()
4786 VERIFY3S(msp->ms_allocator, !=, -1); in metaslab_active_mask_verify()
4787 VERIFY(!msp->ms_primary); in metaslab_active_mask_verify()
4791 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { in metaslab_active_mask_verify()
4792 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); in metaslab_active_mask_verify()
4793 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); in metaslab_active_mask_verify()
4794 VERIFY3S(msp->ms_allocator, ==, -1); in metaslab_active_mask_verify()
4805 uint64_t offset = -1ULL; in metaslab_group_alloc_normal()
4810 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { in metaslab_group_alloc_normal()
4813 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { in metaslab_group_alloc_normal()
4823 if (mg->mg_ms_ready < mg->mg_allocators * 3) in metaslab_group_alloc_normal()
4825 metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; in metaslab_group_alloc_normal()
4827 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); in metaslab_group_alloc_normal()
4830 search->ms_weight = UINT64_MAX; in metaslab_group_alloc_normal()
4831 search->ms_start = 0; in metaslab_group_alloc_normal()
4833 * At the end of the metaslab tree are the already-active metaslabs, in metaslab_group_alloc_normal()
4839 search->ms_allocator = -1; in metaslab_group_alloc_normal()
4840 search->ms_primary = B_TRUE; in metaslab_group_alloc_normal()
4844 mutex_enter(&mg->mg_lock); in metaslab_group_alloc_normal()
4847 mga->mga_primary != NULL) { in metaslab_group_alloc_normal()
4848 msp = mga->mga_primary; in metaslab_group_alloc_normal()
4851 * Even though we don't hold the ms_lock for the in metaslab_group_alloc_normal()
4853 * change while we hold the mg_lock. Thus it is in metaslab_group_alloc_normal()
4856 ASSERT(msp->ms_primary); in metaslab_group_alloc_normal()
4857 ASSERT3S(msp->ms_allocator, ==, allocator); in metaslab_group_alloc_normal()
4858 ASSERT(msp->ms_loaded); in metaslab_group_alloc_normal()
4861 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_group_alloc_normal()
4863 mga->mga_secondary != NULL) { in metaslab_group_alloc_normal()
4864 msp = mga->mga_secondary; in metaslab_group_alloc_normal()
4870 ASSERT(!msp->ms_primary); in metaslab_group_alloc_normal()
4871 ASSERT3S(msp->ms_allocator, ==, allocator); in metaslab_group_alloc_normal()
4872 ASSERT(msp->ms_loaded); in metaslab_group_alloc_normal()
4875 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_group_alloc_normal()
4882 mutex_exit(&mg->mg_lock); in metaslab_group_alloc_normal()
4885 return (-1ULL); in metaslab_group_alloc_normal()
4887 mutex_enter(&msp->ms_lock); in metaslab_group_alloc_normal()
4893 * tracepoints in non-gpl kernel modules. in metaslab_group_alloc_normal()
4909 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { in metaslab_group_alloc_normal()
4910 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_group_alloc_normal()
4911 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
4921 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && in metaslab_group_alloc_normal()
4922 (msp->ms_allocator != -1) && in metaslab_group_alloc_normal()
4923 (msp->ms_allocator != allocator || ((activation_weight == in metaslab_group_alloc_normal()
4924 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { in metaslab_group_alloc_normal()
4925 ASSERT(msp->ms_loaded); in metaslab_group_alloc_normal()
4926 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || in metaslab_group_alloc_normal()
4927 msp->ms_allocator != -1); in metaslab_group_alloc_normal()
4928 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
4939 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && in metaslab_group_alloc_normal()
4941 ASSERT(msp->ms_loaded); in metaslab_group_alloc_normal()
4942 ASSERT3S(msp->ms_allocator, ==, -1); in metaslab_group_alloc_normal()
4943 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc_normal()
4945 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
4974 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
4977 ASSERT(msp->ms_loaded); in metaslab_group_alloc_normal()
5000 if (msp->ms_condensing) { in metaslab_group_alloc_normal()
5004 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc_normal()
5007 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
5009 } else if (msp->ms_disabled > 0) { in metaslab_group_alloc_normal()
5013 metaslab_passivate(msp, msp->ms_weight & in metaslab_group_alloc_normal()
5016 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
5023 if (offset != -1ULL) { in metaslab_group_alloc_normal()
5030 ASSERT(msp->ms_loaded); in metaslab_group_alloc_normal()
5034 * tracepoints in non-gpl kernel modules. in metaslab_group_alloc_normal()
5047 * For space-based metaslabs, we use the maximum block size. in metaslab_group_alloc_normal()
5055 * For segment-based metaslabs, determine the new weight in metaslab_group_alloc_normal()
5063 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { in metaslab_group_alloc_normal()
5085 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; in metaslab_group_alloc_normal()
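/*
 * Worked example of the re-weighting above (hypothetical sizes): after a
 * failed 1.5 MiB allocation, a space-based weight is replaced by the
 * largest run still allocatable (say 512 KiB), so the metaslab sinks below
 * requests it cannot serve, while a segment-based weight is rebuilt from
 * the in-core histogram for the same effect; the OR with
 * METASLAB_ACTIVE_MASK keeps whatever activation bits the metaslab already
 * had so the re-sort does not clobber its active state.
 */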
5098 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
5100 mutex_exit(&msp->ms_lock); in metaslab_group_alloc_normal()
5115 mutex_enter(&mg->mg_lock); in metaslab_group_alloc()
5116 if (offset == -1ULL) { in metaslab_group_alloc()
5117 mg->mg_failed_allocations++; in metaslab_group_alloc()
5132 mg->mg_no_free_space = B_TRUE; in metaslab_group_alloc()
5135 mg->mg_allocations++; in metaslab_group_alloc()
5136 mutex_exit(&mg->mg_lock); in metaslab_group_alloc()
5148 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_alloc_dva()
5158 * and a large number of split blocks coupled with ztest-induced in metaslab_alloc_dva()
5173 * nothing actually breaks if we miss a few updates -- we just won't in metaslab_alloc_dva()
5183 * able to reason about. Otherwise, any two top-level vdev failures in metaslab_alloc_dva()
5185 * only two adjacent top-level vdev failures will result in data loss. in metaslab_alloc_dva()
5187 * If we are doing gang blocks (hintdva is non-NULL), try to keep in metaslab_alloc_dva()
5201 if (vd != NULL && vd->vdev_mg != NULL) { in metaslab_alloc_dva()
5205 mg = mg->mg_next; in metaslab_alloc_dva()
5207 mg = mca->mca_rotor; in metaslab_alloc_dva()
5210 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); in metaslab_alloc_dva()
5211 mg = vd->vdev_mg->mg_next; in metaslab_alloc_dva()
5213 ASSERT(mca->mca_rotor != NULL); in metaslab_alloc_dva()
5214 mg = mca->mca_rotor; in metaslab_alloc_dva()
5221 if (mg->mg_class != mc || mg->mg_activation_count <= 0) in metaslab_alloc_dva()
5222 mg = mca->mca_rotor; in metaslab_alloc_dva()
5229 ASSERT(mg->mg_activation_count == 1); in metaslab_alloc_dva()
5230 vd = mg->mg_vd; in metaslab_alloc_dva()
5262 * Avoid writing single-copy data to an unhealthy, in metaslab_alloc_dva()
5263 * non-redundant vdev, unless we've already tried all in metaslab_alloc_dva()
5266 if (vd->vdev_state < VDEV_STATE_HEALTHY && in metaslab_alloc_dva()
5267 d == 0 && !try_hard && vd->vdev_children == 0) { in metaslab_alloc_dva()
5273 ASSERT(mg->mg_class == mc); in metaslab_alloc_dva()
5276 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); in metaslab_alloc_dva()
5287 if (offset != -1ULL) { in metaslab_alloc_dva()
5291 * over- or under-used relative to the pool, in metaslab_alloc_dva()
5297 if (mca->mca_aliquot == 0 && metaslab_bias_enabled) { in metaslab_alloc_dva()
5298 vdev_stat_t *vs = &vd->vdev_stat; in metaslab_alloc_dva()
5299 int64_t vs_free = vs->vs_space - vs->vs_alloc; in metaslab_alloc_dva()
5300 int64_t mc_free = mc->mc_space - mc->mc_alloc; in metaslab_alloc_dva()
5308 * This basically introduces a zero-centered in metaslab_alloc_dva()
5326 ratio = (vs_free * mc->mc_alloc_groups * 100) / in metaslab_alloc_dva()
5328 mg->mg_bias = ((ratio - 100) * in metaslab_alloc_dva()
5329 (int64_t)mg->mg_aliquot) / 100; in metaslab_alloc_dva()
5331 mg->mg_bias = 0; in metaslab_alloc_dva()
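/*
 * Worked example of the bias above (hypothetical pool): four top-level
 * vdevs, 4 TB of pool-wide free space, and 1.2 TB free on this vdev give
 * ratio = (1.2 TB * 4 * 100) / 4 TB = 120, so mg_bias = (120 - 100) *
 * mg_aliquot / 100 = +20% of the aliquot; the emptier vdev receives 20%
 * more data before the rotor advances, nudging all vdevs toward the same
 * fill level.
 */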
5335 atomic_add_64_nv(&mca->mca_aliquot, asize) >= in metaslab_alloc_dva()
5336 mg->mg_aliquot + mg->mg_bias) { in metaslab_alloc_dva()
5337 mca->mca_rotor = mg->mg_next; in metaslab_alloc_dva()
5338 mca->mca_aliquot = 0; in metaslab_alloc_dva()
5341 DVA_SET_VDEV(&dva[d], vd->vdev_id); in metaslab_alloc_dva()
5350 mca->mca_rotor = mg->mg_next; in metaslab_alloc_dva()
5351 mca->mca_aliquot = 0; in metaslab_alloc_dva()
5352 } while ((mg = mg->mg_next) != rotor); in metaslab_alloc_dva()
5359 psize <= 1 << spa->spa_min_ashift)) { in metaslab_alloc_dva()
5376 spa_t *spa = vd->vdev_spa; in metaslab_free_concrete()
5380 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); in metaslab_free_concrete()
5382 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_free_concrete()
5384 VERIFY(!msp->ms_condensing); in metaslab_free_concrete()
5385 VERIFY3U(offset, >=, msp->ms_start); in metaslab_free_concrete()
5386 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); in metaslab_free_concrete()
5387 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_free_concrete()
5388 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); in metaslab_free_concrete()
5392 mutex_enter(&msp->ms_lock); in metaslab_free_concrete()
5393 if (range_tree_is_empty(msp->ms_freeing) && in metaslab_free_concrete()
5394 range_tree_is_empty(msp->ms_checkpointing)) { in metaslab_free_concrete()
5400 range_tree_add(msp->ms_checkpointing, offset, asize); in metaslab_free_concrete()
5402 range_tree_add(msp->ms_freeing, offset, asize); in metaslab_free_concrete()
5404 mutex_exit(&msp->ms_lock); in metaslab_free_concrete()
5416 if (vd->vdev_ops->vdev_op_remap != NULL) in metaslab_free_impl_cb()
5426 spa_t *spa = vd->vdev_spa; in metaslab_free_impl()
5433 if (spa->spa_vdev_removal != NULL && in metaslab_free_impl()
5434 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id && in metaslab_free_impl()
5443 } else if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_free_impl()
5445 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_free_impl()
5465 blkptr_t *bp = rbca->rbca_bp; in remap_blkptr_cb()
5468 if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) in remap_blkptr_cb()
5472 if (rbca->rbca_cb != NULL) { in remap_blkptr_cb()
5478 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); in remap_blkptr_cb()
5480 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, in remap_blkptr_cb()
5481 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); in remap_blkptr_cb()
5484 rbca->rbca_remap_vd = vd; in remap_blkptr_cb()
5485 rbca->rbca_remap_offset = offset; in remap_blkptr_cb()
5498 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, in remap_blkptr_cb()
5499 DVA_GET_VDEV(&bp->blk_dva[0])); in remap_blkptr_cb()
5500 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; in remap_blkptr_cb()
5502 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); in remap_blkptr_cb()
5505 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); in remap_blkptr_cb()
5506 DVA_SET_OFFSET(&bp->blk_dva[0], offset); in remap_blkptr_cb()
5558 dva_t *dva = &bp->blk_dva[0]; in spa_remap_blkptr()
5564 if (vd->vdev_ops->vdev_op_remap == NULL) in spa_remap_blkptr()
5580 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); in spa_remap_blkptr()
5583 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) in spa_remap_blkptr()
5608 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { in metaslab_unalloc_dva()
5615 ASSERT(!vd->vdev_removing); in metaslab_unalloc_dva()
5617 ASSERT0(vd->vdev_indirect_config.vic_mapping_object); in metaslab_unalloc_dva()
5618 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); in metaslab_unalloc_dva()
5623 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_unalloc_dva()
5625 mutex_enter(&msp->ms_lock); in metaslab_unalloc_dva()
5626 range_tree_remove(msp->ms_allocating[txg & TXG_MASK], in metaslab_unalloc_dva()
5628 msp->ms_allocating_total -= size; in metaslab_unalloc_dva()
5630 VERIFY(!msp->ms_condensing); in metaslab_unalloc_dva()
5631 VERIFY3U(offset, >=, msp->ms_start); in metaslab_unalloc_dva()
5632 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); in metaslab_unalloc_dva()
5633 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, in metaslab_unalloc_dva()
5634 msp->ms_size); in metaslab_unalloc_dva()
5635 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_unalloc_dva()
5636 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_unalloc_dva()
5637 range_tree_add(msp->ms_allocatable, offset, size); in metaslab_unalloc_dva()
5638 mutex_exit(&msp->ms_lock); in metaslab_unalloc_dva()
5673 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_class_throttle_reserve()
5674 uint64_t max = mca->mca_alloc_max_slots; in metaslab_class_throttle_reserve()
5676 ASSERT(mc->mc_alloc_throttle_enabled); in metaslab_class_throttle_reserve()
5678 zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) { in metaslab_class_throttle_reserve()
5683 * But even if we assume some other non-existing scenario, the in metaslab_class_throttle_reserve()
5690 zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); in metaslab_class_throttle_reserve()
5691 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; in metaslab_class_throttle_reserve()
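/*
 * Worked example (hypothetical numbers): with mca_alloc_max_slots = 64 and
 * 60 slots already held by in-flight ZIOs, reserving 3 more succeeds
 * (60 + 3 <= 64) and tags the zio with ZIO_FLAG_IO_ALLOCATING, while a
 * request for 8 is refused (unless the caller insists on the reservation),
 * throttling new allocations for this allocator until earlier ones finish.
 */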
5701 metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; in metaslab_class_throttle_unreserve()
5703 ASSERT(mc->mc_alloc_throttle_enabled); in metaslab_class_throttle_unreserve()
5704 zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); in metaslab_class_throttle_unreserve()
5712 spa_t *spa = vd->vdev_spa; in metaslab_claim_concrete()
5715 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) in metaslab_claim_concrete()
5718 ASSERT3P(vd->vdev_ms, !=, NULL); in metaslab_claim_concrete()
5719 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_claim_concrete()
5721 mutex_enter(&msp->ms_lock); in metaslab_claim_concrete()
5723 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { in metaslab_claim_concrete()
5726 ASSERT(msp->ms_loaded); in metaslab_claim_concrete()
5727 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); in metaslab_claim_concrete()
5733 !range_tree_contains(msp->ms_allocatable, offset, size)) in metaslab_claim_concrete()
5737 mutex_exit(&msp->ms_lock); in metaslab_claim_concrete()
5741 VERIFY(!msp->ms_condensing); in metaslab_claim_concrete()
5742 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); in metaslab_claim_concrete()
5743 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); in metaslab_claim_concrete()
5744 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, in metaslab_claim_concrete()
5745 msp->ms_size); in metaslab_claim_concrete()
5746 range_tree_remove(msp->ms_allocatable, offset, size); in metaslab_claim_concrete()
5747 range_tree_clear(msp->ms_trim, offset, size); in metaslab_claim_concrete()
5750 metaslab_class_t *mc = msp->ms_group->mg_class; in metaslab_claim_concrete()
5752 multilist_sublist_lock_obj(&mc->mc_metaslab_txg_list, msp); in metaslab_claim_concrete()
5753 if (!multilist_link_active(&msp->ms_class_txg_node)) { in metaslab_claim_concrete()
5754 msp->ms_selected_txg = txg; in metaslab_claim_concrete()
5759 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) in metaslab_claim_concrete()
5761 range_tree_add(msp->ms_allocating[txg & TXG_MASK], in metaslab_claim_concrete()
5763 msp->ms_allocating_total += size; in metaslab_claim_concrete()
5766 mutex_exit(&msp->ms_lock); in metaslab_claim_concrete()
5783 if (mcca_arg->mcca_error == 0) { in metaslab_claim_impl_cb()
5784 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, in metaslab_claim_impl_cb()
5785 size, mcca_arg->mcca_txg); in metaslab_claim_impl_cb()
5792 if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_claim_impl()
5800 ASSERT(!spa_writeable(vd->vdev_spa)); in metaslab_claim_impl()
5804 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_claim_impl()
5848 dva_t *dva = bp->blk_dva; in metaslab_alloc()
5849 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; in metaslab_alloc()
5857 if (mc->mc_allocator[allocator].mca_rotor == NULL) { in metaslab_alloc()
5872 for (d--; d >= 0; d--) { in metaslab_alloc()
5903 const dva_t *dva = bp->blk_dva; in metaslab_free()
5925 if (BP_GET_LOGICAL_BIRTH(bp) <= spa->spa_checkpoint_txg && in metaslab_free()
5926 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { in metaslab_free()
5953 const dva_t *dva = bp->blk_dva; in metaslab_claim()
5989 if (vd->vdev_ops == &vdev_indirect_ops) in metaslab_check_free_impl_cb()
5999 spa_t *spa __maybe_unused = vd->vdev_spa; in metaslab_check_free_impl()
6004 if (vd->vdev_ops->vdev_op_remap != NULL) { in metaslab_check_free_impl()
6005 vd->vdev_ops->vdev_op_remap(vd, offset, size, in metaslab_check_free_impl()
6011 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); in metaslab_check_free_impl()
6014 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; in metaslab_check_free_impl()
6016 mutex_enter(&msp->ms_lock); in metaslab_check_free_impl()
6017 if (msp->ms_loaded) { in metaslab_check_free_impl()
6018 range_tree_verify_not_present(msp->ms_allocatable, in metaslab_check_free_impl()
6033 range_tree_verify_not_present(msp->ms_freeing, offset, size); in metaslab_check_free_impl()
6034 range_tree_verify_not_present(msp->ms_checkpointing, offset, size); in metaslab_check_free_impl()
6035 range_tree_verify_not_present(msp->ms_freed, offset, size); in metaslab_check_free_impl()
6037 range_tree_verify_not_present(msp->ms_defer[j], offset, size); in metaslab_check_free_impl()
6038 range_tree_verify_not_present(msp->ms_trim, offset, size); in metaslab_check_free_impl()
6039 mutex_exit(&msp->ms_lock); in metaslab_check_free_impl()
6050 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); in metaslab_check_free()
6052 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); in metaslab_check_free()
6053 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); in metaslab_check_free()
6055 if (DVA_GET_GANG(&bp->blk_dva[i])) in metaslab_check_free()
6068 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); in metaslab_group_disable_wait()
6069 while (mg->mg_disabled_updating) { in metaslab_group_disable_wait()
6070 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); in metaslab_group_disable_wait()
6077 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); in metaslab_group_disabled_increment()
6078 ASSERT(mg->mg_disabled_updating); in metaslab_group_disabled_increment()
6080 while (mg->mg_ms_disabled >= max_disabled_ms) { in metaslab_group_disabled_increment()
6081 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); in metaslab_group_disabled_increment()
6083 mg->mg_ms_disabled++; in metaslab_group_disabled_increment()
6084 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); in metaslab_group_disabled_increment()
6096 ASSERT(!MUTEX_HELD(&msp->ms_lock)); in metaslab_disable()
6097 metaslab_group_t *mg = msp->ms_group; in metaslab_disable()
6099 mutex_enter(&mg->mg_ms_disabled_lock); in metaslab_disable()
6111 mg->mg_disabled_updating = B_TRUE; in metaslab_disable()
6112 if (msp->ms_disabled == 0) { in metaslab_disable()
6115 mutex_enter(&msp->ms_lock); in metaslab_disable()
6116 msp->ms_disabled++; in metaslab_disable()
6117 mutex_exit(&msp->ms_lock); in metaslab_disable()
6119 mg->mg_disabled_updating = B_FALSE; in metaslab_disable()
6120 cv_broadcast(&mg->mg_ms_disabled_cv); in metaslab_disable()
6121 mutex_exit(&mg->mg_ms_disabled_lock); in metaslab_disable()
6127 metaslab_group_t *mg = msp->ms_group; in metaslab_enable()
6128 spa_t *spa = mg->mg_vd->vdev_spa; in metaslab_enable()
6138 mutex_enter(&mg->mg_ms_disabled_lock); in metaslab_enable()
6139 mutex_enter(&msp->ms_lock); in metaslab_enable()
6140 if (--msp->ms_disabled == 0) { in metaslab_enable()
6141 mg->mg_ms_disabled--; in metaslab_enable()
6142 cv_broadcast(&mg->mg_ms_disabled_cv); in metaslab_enable()
6146 mutex_exit(&msp->ms_lock); in metaslab_enable()
6147 mutex_exit(&mg->mg_ms_disabled_lock); in metaslab_enable()
6151 metaslab_set_unflushed_dirty(metaslab_t *ms, boolean_t dirty) in metaslab_set_unflushed_dirty() argument
6153 ms->ms_unflushed_dirty = dirty; in metaslab_set_unflushed_dirty()
6157 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) in metaslab_update_ondisk_flush_data() argument
6159 vdev_t *vd = ms->ms_group->mg_vd; in metaslab_update_ondisk_flush_data()
6160 spa_t *spa = vd->vdev_spa; in metaslab_update_ondisk_flush_data()
6166 .msp_unflushed_txg = metaslab_unflushed_txg(ms), in metaslab_update_ondisk_flush_data()
6169 uint64_t entry_offset = ms->ms_id * entry_size; in metaslab_update_ondisk_flush_data()
6172 int err = zap_lookup(mos, vd->vdev_top_zap, in metaslab_update_ondisk_flush_data()
6178 VERIFY0(zap_add(mos, vd->vdev_top_zap, in metaslab_update_ondisk_flush_data()
6190 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) in metaslab_set_unflushed_txg() argument
6192 ms->ms_unflushed_txg = txg; in metaslab_set_unflushed_txg()
6193 metaslab_update_ondisk_flush_data(ms, tx); in metaslab_set_unflushed_txg()
6197 metaslab_unflushed_dirty(metaslab_t *ms) in metaslab_unflushed_dirty() argument
6199 return (ms->ms_unflushed_dirty); in metaslab_unflushed_dirty()
6203 metaslab_unflushed_txg(metaslab_t *ms) in metaslab_unflushed_txg() argument
6205 return (ms->ms_unflushed_txg); in metaslab_unflushed_txg()
6252 ZMOD_RW, "Enable segment-based metaslab selection");
6255 "Segment-based metaslab selection maximum buckets before switching");