xref: /illumos-gate/usr/src/uts/common/fs/zfs/metaslab.c (revision 6400a6be1817a40f3dbefdd3df7b7d87bcebef30)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35 
36 /*
37  * Allow allocations to switch to gang blocks quickly. We do this to
38  * avoid having to load lots of space_maps in a given txg. There are,
39  * however, some cases where we want to avoid "fast" ganging and instead
40  * we want to do an exhaustive search of all metaslabs on this device.
41  * Currently we don't allow any gang, slog, or dump device related allocations
42  * to "fast" gang.
43  */
44 #define	CAN_FASTGANG(flags) \
45 	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
46 	METASLAB_GANG_AVOID)))
47 
48 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
49 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
50 #define	METASLAB_ACTIVE_MASK		\
51 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
52 
53 uint64_t metaslab_aliquot = 512ULL << 10;
54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
55 
56 /*
57  * The in-core space map representation is more compact than its on-disk form.
58  * The zfs_condense_pct determines how much more compact the in-core
59  * representation must be than the on-disk space map before we condense it.
60  * Values should be greater than or equal to 100.
61  */
62 int zfs_condense_pct = 200;
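/*
 * Illustrative note: with the default zfs_condense_pct of 200, the on-disk
 * space map must be at least twice the size of its minimal in-core-derived
 * form before metaslab_should_condense() will consider it a candidate.
 */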
63 
64 /*
65  * Condensing a metaslab is not guaranteed to actually reduce the amount of
66  * space used on disk. In particular, a space map uses data in increments of
67  * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
68  * same number of blocks after condensing. Since the goal of condensing is to
69  * reduce the number of IOPs required to read the space map, we only want to
70  * condense when we can be sure we will reduce the number of blocks used by the
71  * space map. Unfortunately, we cannot precisely compute whether or not this is
72  * the case in metaslab_should_condense since we are holding ms_lock. Instead,
73  * we apply the following heuristic: do not condense a space map unless the
74  * uncondensed size consumes more than zfs_metaslab_condense_block_threshold
75  * blocks.
76  */
77 int zfs_metaslab_condense_block_threshold = 4;
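/*
 * Illustrative note: the threshold is measured in space map blocks (or
 * vdev ashift-sized blocks, whichever is larger), so with e.g. a 4K block
 * size the default of 4 only permits condensing once the space map object
 * exceeds 16K on disk.
 */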
78 
79 /*
80  * The zfs_mg_noalloc_threshold defines which metaslab groups should
81  * be eligible for allocation. The value is defined as a percentage of
82  * free space. Metaslab groups that have more free space than
83  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
84  * a metaslab group's free space is less than or equal to the
85  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
86  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
87  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
88  * groups are allowed to accept allocations. Gang blocks are always
89  * eligible to allocate on any metaslab group. The default value of 0 means
90  * no metaslab group will be excluded based on this criterion.
91  */
92 int zfs_mg_noalloc_threshold = 0;
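/*
 * Illustrative note: setting this to, say, 30 steers new allocations away
 * from any metaslab group with 30% or less free space for as long as at
 * least one other group in the class remains above that mark; gang block
 * allocations ignore the threshold entirely.
 */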
93 
94 /*
95  * When set, we will load all metaslabs when the pool is first opened.
96  */
97 int metaslab_debug_load = 0;
98 
99 /*
100  * When set, we will prevent metaslabs from being unloaded.
101  */
102 int metaslab_debug_unload = 0;
103 
104 /*
105  * Minimum size which forces the dynamic allocator to change
106  * its allocation strategy.  Once the space map cannot satisfy
107  * an allocation of this size then it switches to using a more
108  * aggressive strategy (i.e. search by size rather than offset).
109  */
110 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
111 
112 /*
113  * The minimum free space, in percent, which must be available
114  * in a space map to continue allocations in a first-fit fashion.
115  * Once the space_map's free space drops below this level we dynamically
116  * switch to using best-fit allocations.
117  */
118 int metaslab_df_free_pct = 4;
119 
120 /*
121  * A metaslab is considered "free" if it contains a contiguous
122  * segment which is greater than metaslab_min_alloc_size.
123  */
124 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
125 
126 /*
127  * Percentage of all cpus that can be used by the metaslab taskq.
128  */
129 int metaslab_load_pct = 50;
130 
131 /*
132  * Determines how many txgs a metaslab may remain loaded without having any
133  * allocations from it. As long as a metaslab continues to be used we will
134  * keep it loaded.
135  */
136 int metaslab_unload_delay = TXG_SIZE * 2;
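/*
 * Illustrative note: TXG_SIZE is 4, so by default a metaslab remains loaded
 * for 8 txgs after its last allocation before metaslab_sync_done() may
 * unload it.
 */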
137 
138 /*
139  * Should we be willing to write data to degraded vdevs?
140  */
141 boolean_t zfs_write_to_degraded = B_FALSE;
142 
143 /*
144  * Max number of metaslabs per group to preload.
145  */
146 int metaslab_preload_limit = SPA_DVAS_PER_BP;
147 
148 /*
149  * Enable/disable preloading of metaslab.
150  */
151 boolean_t metaslab_preload_enabled = B_TRUE;
152 
153 /*
154  * Enable/disable additional weight factor for each metaslab.
155  */
156 boolean_t metaslab_weight_factor_enable = B_FALSE;
157 
158 
159 /*
160  * ==========================================================================
161  * Metaslab classes
162  * ==========================================================================
163  */
164 metaslab_class_t *
165 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
166 {
167 	metaslab_class_t *mc;
168 
169 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
170 
171 	mc->mc_spa = spa;
172 	mc->mc_rotor = NULL;
173 	mc->mc_ops = ops;
174 
175 	return (mc);
176 }
177 
178 void
179 metaslab_class_destroy(metaslab_class_t *mc)
180 {
181 	ASSERT(mc->mc_rotor == NULL);
182 	ASSERT(mc->mc_alloc == 0);
183 	ASSERT(mc->mc_deferred == 0);
184 	ASSERT(mc->mc_space == 0);
185 	ASSERT(mc->mc_dspace == 0);
186 
187 	kmem_free(mc, sizeof (metaslab_class_t));
188 }
189 
190 int
191 metaslab_class_validate(metaslab_class_t *mc)
192 {
193 	metaslab_group_t *mg;
194 	vdev_t *vd;
195 
196 	/*
197 	 * Must hold one of the spa_config locks.
198 	 */
199 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
200 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
201 
202 	if ((mg = mc->mc_rotor) == NULL)
203 		return (0);
204 
205 	do {
206 		vd = mg->mg_vd;
207 		ASSERT(vd->vdev_mg != NULL);
208 		ASSERT3P(vd->vdev_top, ==, vd);
209 		ASSERT3P(mg->mg_class, ==, mc);
210 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
211 	} while ((mg = mg->mg_next) != mc->mc_rotor);
212 
213 	return (0);
214 }
215 
216 void
217 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
218     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
219 {
220 	atomic_add_64(&mc->mc_alloc, alloc_delta);
221 	atomic_add_64(&mc->mc_deferred, defer_delta);
222 	atomic_add_64(&mc->mc_space, space_delta);
223 	atomic_add_64(&mc->mc_dspace, dspace_delta);
224 }
225 
226 uint64_t
227 metaslab_class_get_alloc(metaslab_class_t *mc)
228 {
229 	return (mc->mc_alloc);
230 }
231 
232 uint64_t
233 metaslab_class_get_deferred(metaslab_class_t *mc)
234 {
235 	return (mc->mc_deferred);
236 }
237 
238 uint64_t
239 metaslab_class_get_space(metaslab_class_t *mc)
240 {
241 	return (mc->mc_space);
242 }
243 
244 uint64_t
245 metaslab_class_get_dspace(metaslab_class_t *mc)
246 {
247 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
248 }
249 
250 /*
251  * ==========================================================================
252  * Metaslab groups
253  * ==========================================================================
254  */
255 static int
256 metaslab_compare(const void *x1, const void *x2)
257 {
258 	const metaslab_t *m1 = x1;
259 	const metaslab_t *m2 = x2;
260 
261 	if (m1->ms_weight < m2->ms_weight)
262 		return (1);
263 	if (m1->ms_weight > m2->ms_weight)
264 		return (-1);
265 
266 	/*
267 	 * If the weights are identical, use the offset to force uniqueness.
268 	 */
269 	if (m1->ms_start < m2->ms_start)
270 		return (-1);
271 	if (m1->ms_start > m2->ms_start)
272 		return (1);
273 
274 	ASSERT3P(m1, ==, m2);
275 
276 	return (0);
277 }
278 
279 /*
280  * Update the allocatable flag and the metaslab group's capacity.
281  * The allocatable flag is set to true if the group's free capacity is
282  * greater than zfs_mg_noalloc_threshold. If a metaslab group transitions
283  * from allocatable to non-allocatable or vice versa then the metaslab
284  * group's class is updated to reflect the transition.
285  */
286 static void
287 metaslab_group_alloc_update(metaslab_group_t *mg)
288 {
289 	vdev_t *vd = mg->mg_vd;
290 	metaslab_class_t *mc = mg->mg_class;
291 	vdev_stat_t *vs = &vd->vdev_stat;
292 	boolean_t was_allocatable;
293 
294 	ASSERT(vd == vd->vdev_top);
295 
296 	mutex_enter(&mg->mg_lock);
297 	was_allocatable = mg->mg_allocatable;
298 
299 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
300 	    (vs->vs_space + 1);
301 
302 	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
303 
304 	/*
305 	 * The mc_alloc_groups maintains a count of the number of
306 	 * groups in this metaslab class that are still above the
307 	 * zfs_mg_noalloc_threshold. This is used by the allocating
308 	 * threads to determine if they should avoid allocations to
309 	 * a given group. The allocator will avoid allocations to a group
310 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
311 	 * and there are still other groups that are above the threshold.
312 	 * When a group transitions from allocatable to non-allocatable or
313 	 * vice versa we update the metaslab class to reflect that change.
314 	 * When the mc_alloc_groups value drops to 0 that means that all
315 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
316 	 * eligible for allocations. This effectively means that all devices
317 	 * are balanced again.
318 	 */
319 	if (was_allocatable && !mg->mg_allocatable)
320 		mc->mc_alloc_groups--;
321 	else if (!was_allocatable && mg->mg_allocatable)
322 		mc->mc_alloc_groups++;
323 	mutex_exit(&mg->mg_lock);
324 }
325 
326 metaslab_group_t *
327 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
328 {
329 	metaslab_group_t *mg;
330 
331 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
332 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
333 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
334 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
335 	mg->mg_vd = vd;
336 	mg->mg_class = mc;
337 	mg->mg_activation_count = 0;
338 
339 	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
340 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
341 
342 	return (mg);
343 }
344 
345 void
346 metaslab_group_destroy(metaslab_group_t *mg)
347 {
348 	ASSERT(mg->mg_prev == NULL);
349 	ASSERT(mg->mg_next == NULL);
350 	/*
351 	 * We may have gone below zero with the activation count
352 	 * either because we never activated in the first place or
353 	 * because we're done, and possibly removing the vdev.
354 	 */
355 	ASSERT(mg->mg_activation_count <= 0);
356 
357 	taskq_destroy(mg->mg_taskq);
358 	avl_destroy(&mg->mg_metaslab_tree);
359 	mutex_destroy(&mg->mg_lock);
360 	kmem_free(mg, sizeof (metaslab_group_t));
361 }
362 
363 void
364 metaslab_group_activate(metaslab_group_t *mg)
365 {
366 	metaslab_class_t *mc = mg->mg_class;
367 	metaslab_group_t *mgprev, *mgnext;
368 
369 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
370 
371 	ASSERT(mc->mc_rotor != mg);
372 	ASSERT(mg->mg_prev == NULL);
373 	ASSERT(mg->mg_next == NULL);
374 	ASSERT(mg->mg_activation_count <= 0);
375 
376 	if (++mg->mg_activation_count <= 0)
377 		return;
378 
379 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
380 	metaslab_group_alloc_update(mg);
381 
382 	if ((mgprev = mc->mc_rotor) == NULL) {
383 		mg->mg_prev = mg;
384 		mg->mg_next = mg;
385 	} else {
386 		mgnext = mgprev->mg_next;
387 		mg->mg_prev = mgprev;
388 		mg->mg_next = mgnext;
389 		mgprev->mg_next = mg;
390 		mgnext->mg_prev = mg;
391 	}
392 	mc->mc_rotor = mg;
393 }
394 
395 void
396 metaslab_group_passivate(metaslab_group_t *mg)
397 {
398 	metaslab_class_t *mc = mg->mg_class;
399 	metaslab_group_t *mgprev, *mgnext;
400 
401 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
402 
403 	if (--mg->mg_activation_count != 0) {
404 		ASSERT(mc->mc_rotor != mg);
405 		ASSERT(mg->mg_prev == NULL);
406 		ASSERT(mg->mg_next == NULL);
407 		ASSERT(mg->mg_activation_count < 0);
408 		return;
409 	}
410 
411 	taskq_wait(mg->mg_taskq);
412 
413 	mgprev = mg->mg_prev;
414 	mgnext = mg->mg_next;
415 
416 	if (mg == mgnext) {
417 		mc->mc_rotor = NULL;
418 	} else {
419 		mc->mc_rotor = mgnext;
420 		mgprev->mg_next = mgnext;
421 		mgnext->mg_prev = mgprev;
422 	}
423 
424 	mg->mg_prev = NULL;
425 	mg->mg_next = NULL;
426 }
427 
428 static void
429 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
430 {
431 	mutex_enter(&mg->mg_lock);
432 	ASSERT(msp->ms_group == NULL);
433 	msp->ms_group = mg;
434 	msp->ms_weight = 0;
435 	avl_add(&mg->mg_metaslab_tree, msp);
436 	mutex_exit(&mg->mg_lock);
437 }
438 
439 static void
440 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
441 {
442 	mutex_enter(&mg->mg_lock);
443 	ASSERT(msp->ms_group == mg);
444 	avl_remove(&mg->mg_metaslab_tree, msp);
445 	msp->ms_group = NULL;
446 	mutex_exit(&mg->mg_lock);
447 }
448 
449 static void
450 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
451 {
452 	/*
453 	 * Although in principle the weight can be any value, in
454 	 * practice we do not use values in the range [1, 510].
455 	 */
456 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
457 	ASSERT(MUTEX_HELD(&msp->ms_lock));
458 
459 	mutex_enter(&mg->mg_lock);
460 	ASSERT(msp->ms_group == mg);
461 	avl_remove(&mg->mg_metaslab_tree, msp);
462 	msp->ms_weight = weight;
463 	avl_add(&mg->mg_metaslab_tree, msp);
464 	mutex_exit(&mg->mg_lock);
465 }
466 
467 /*
468  * Determine if a given metaslab group should skip allocations. A metaslab
469  * group should avoid allocations if its free capacity has dropped to or
470  * below the zfs_mg_noalloc_threshold and there is at least one other
471  * metaslab group that can still handle allocations.
472  */
473 static boolean_t
474 metaslab_group_allocatable(metaslab_group_t *mg)
475 {
476 	vdev_t *vd = mg->mg_vd;
477 	spa_t *spa = vd->vdev_spa;
478 	metaslab_class_t *mc = mg->mg_class;
479 
480 	/*
481 	 * A metaslab group is considered allocatable if its free capacity
482 	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
483 	 * associated with a slog, or there are no other metaslab groups
484 	 * with free capacity greater than zfs_mg_noalloc_threshold.
485 	 */
486 	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
487 	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
488 }
489 
490 /*
491  * ==========================================================================
492  * Range tree callbacks
493  * ==========================================================================
494  */
495 
496 /*
497  * Comparison function for the private size-ordered tree. Tree is sorted
498  * by size, larger sizes at the end of the tree.
499  */
500 static int
501 metaslab_rangesize_compare(const void *x1, const void *x2)
502 {
503 	const range_seg_t *r1 = x1;
504 	const range_seg_t *r2 = x2;
505 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
506 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
507 
508 	if (rs_size1 < rs_size2)
509 		return (-1);
510 	if (rs_size1 > rs_size2)
511 		return (1);
512 
513 	if (r1->rs_start < r2->rs_start)
514 		return (-1);
515 
516 	if (r1->rs_start > r2->rs_start)
517 		return (1);
518 
519 	return (0);
520 }
521 
522 /*
523  * Create any block allocator specific components. The current allocators
524  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
525  */
526 static void
527 metaslab_rt_create(range_tree_t *rt, void *arg)
528 {
529 	metaslab_t *msp = arg;
530 
531 	ASSERT3P(rt->rt_arg, ==, msp);
532 	ASSERT(msp->ms_tree == NULL);
533 
534 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
535 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
536 }
537 
538 /*
539  * Destroy the block allocator specific components.
540  */
541 static void
542 metaslab_rt_destroy(range_tree_t *rt, void *arg)
543 {
544 	metaslab_t *msp = arg;
545 
546 	ASSERT3P(rt->rt_arg, ==, msp);
547 	ASSERT3P(msp->ms_tree, ==, rt);
548 	ASSERT0(avl_numnodes(&msp->ms_size_tree));
549 
550 	avl_destroy(&msp->ms_size_tree);
551 }
552 
553 static void
554 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
555 {
556 	metaslab_t *msp = arg;
557 
558 	ASSERT3P(rt->rt_arg, ==, msp);
559 	ASSERT3P(msp->ms_tree, ==, rt);
560 	VERIFY(!msp->ms_condensing);
561 	avl_add(&msp->ms_size_tree, rs);
562 }
563 
564 static void
565 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
566 {
567 	metaslab_t *msp = arg;
568 
569 	ASSERT3P(rt->rt_arg, ==, msp);
570 	ASSERT3P(msp->ms_tree, ==, rt);
571 	VERIFY(!msp->ms_condensing);
572 	avl_remove(&msp->ms_size_tree, rs);
573 }
574 
575 static void
576 metaslab_rt_vacate(range_tree_t *rt, void *arg)
577 {
578 	metaslab_t *msp = arg;
579 
580 	ASSERT3P(rt->rt_arg, ==, msp);
581 	ASSERT3P(msp->ms_tree, ==, rt);
582 
583 	/*
584 	 * Normally one would walk the tree freeing nodes along the way.
585 	 * Since the nodes are shared with the range trees we can avoid
586 	 * walking all nodes and just reinitialize the avl tree. The nodes
587 	 * will be freed by the range tree, so we don't want to free them here.
588 	 */
589 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
590 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
591 }
592 
593 static range_tree_ops_t metaslab_rt_ops = {
594 	metaslab_rt_create,
595 	metaslab_rt_destroy,
596 	metaslab_rt_add,
597 	metaslab_rt_remove,
598 	metaslab_rt_vacate
599 };
600 
601 /*
602  * ==========================================================================
603  * Metaslab block operations
604  * ==========================================================================
605  */
606 
607 /*
608  * Return the maximum contiguous segment within the metaslab.
609  */
610 uint64_t
611 metaslab_block_maxsize(metaslab_t *msp)
612 {
613 	avl_tree_t *t = &msp->ms_size_tree;
614 	range_seg_t *rs;
615 
616 	if (t == NULL || (rs = avl_last(t)) == NULL)
617 		return (0ULL);
618 
619 	return (rs->rs_end - rs->rs_start);
620 }
621 
622 uint64_t
623 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
624 {
625 	uint64_t start;
626 	range_tree_t *rt = msp->ms_tree;
627 
628 	VERIFY(!msp->ms_condensing);
629 
630 	start = msp->ms_ops->msop_alloc(msp, size);
631 	if (start != -1ULL) {
632 		vdev_t *vd = msp->ms_group->mg_vd;
633 
634 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
635 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
636 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
637 		range_tree_remove(rt, start, size);
638 	}
639 	return (start);
640 }
641 
642 /*
643  * ==========================================================================
644  * Common allocator routines
645  * ==========================================================================
646  */
647 
648 /*
649  * This is a helper function that can be used by the allocator to find
650  * a suitable block to allocate. This will search the specified AVL
651  * tree looking for a block that matches the specified criteria.
652  */
653 static uint64_t
654 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
655     uint64_t align)
656 {
657 	range_seg_t *rs, rsearch;
658 	avl_index_t where;
659 
660 	rsearch.rs_start = *cursor;
661 	rsearch.rs_end = *cursor + size;
662 
663 	rs = avl_find(t, &rsearch, &where);
664 	if (rs == NULL)
665 		rs = avl_nearest(t, where, AVL_AFTER);
666 
667 	while (rs != NULL) {
668 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
669 
670 		if (offset + size <= rs->rs_end) {
671 			*cursor = offset + size;
672 			return (offset);
673 		}
674 		rs = AVL_NEXT(t, rs);
675 	}
676 
677 	/*
678 	 * If we know we've searched the whole map (*cursor == 0), give up.
679 	 * Otherwise, reset the cursor to the beginning and try again.
680 	 */
681 	if (*cursor == 0)
682 		return (-1ULL);
683 
684 	*cursor = 0;
685 	return (metaslab_block_picker(t, cursor, size, align));
686 }
687 
688 /*
689  * ==========================================================================
690  * The first-fit block allocator
691  * ==========================================================================
692  */
693 static uint64_t
694 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
695 {
696 	/*
697 	 * Find the largest power of 2 block size that evenly divides the
698 	 * requested size. This is used to try to allocate blocks with similar
699 	 * alignment from the same area of the metaslab (i.e. same cursor
700  * bucket) but it does not guarantee that allocations of other sizes
701  * will not be present in the same region.
702 	 */
703 	uint64_t align = size & -size;
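	/*
	 * Illustrative note: size & -size isolates the lowest set bit, so a
	 * 24K (0x6000) request yields an 8K (0x2000) alignment and shares a
	 * cursor bucket with other 8K-aligned allocations.
	 */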
704 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
705 	avl_tree_t *t = &msp->ms_tree->rt_root;
706 
707 	return (metaslab_block_picker(t, cursor, size, align));
708 }
709 
710 /* ARGSUSED */
711 static boolean_t
712 metaslab_ff_fragmented(metaslab_t *msp)
713 {
714 	return (B_TRUE);
715 }
716 
717 static metaslab_ops_t metaslab_ff_ops = {
718 	metaslab_ff_alloc,
719 	metaslab_ff_fragmented
720 };
721 
722 /*
723  * ==========================================================================
724  * Dynamic block allocator -
725  * Uses the first-fit allocation scheme until space gets low and then
726  * adjusts to a best-fit allocation method. Uses metaslab_df_alloc_threshold
727  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
728  * ==========================================================================
729  */
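/*
 * Illustrative note: with the defaults above (metaslab_df_alloc_threshold =
 * SPA_MAXBLOCKSIZE, metaslab_df_free_pct = 4), metaslab_df_alloc() falls back
 * from first-fit to best-fit once the largest free segment drops below
 * SPA_MAXBLOCKSIZE or less than 4% of the metaslab remains free.
 */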
730 static uint64_t
731 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
732 {
733 	/*
734 	 * Find the largest power of 2 block size that evenly divides the
735 	 * requested size. This is used to try to allocate blocks with similar
736 	 * alignment from the same area of the metaslab (i.e. same cursor
737  * bucket) but it does not guarantee that allocations of other sizes
738  * will not be present in the same region.
739 	 */
740 	uint64_t align = size & -size;
741 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
742 	range_tree_t *rt = msp->ms_tree;
743 	avl_tree_t *t = &rt->rt_root;
744 	uint64_t max_size = metaslab_block_maxsize(msp);
745 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
746 
747 	ASSERT(MUTEX_HELD(&msp->ms_lock));
748 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
749 
750 	if (max_size < size)
751 		return (-1ULL);
752 
753 	/*
754 	 * If we're running low on space switch to using the size
755 	 * sorted AVL tree (best-fit).
756 	 */
757 	if (max_size < metaslab_df_alloc_threshold ||
758 	    free_pct < metaslab_df_free_pct) {
759 		t = &msp->ms_size_tree;
760 		*cursor = 0;
761 	}
762 
763 	return (metaslab_block_picker(t, cursor, size, 1ULL));
764 }
765 
766 static boolean_t
767 metaslab_df_fragmented(metaslab_t *msp)
768 {
769 	range_tree_t *rt = msp->ms_tree;
770 	uint64_t max_size = metaslab_block_maxsize(msp);
771 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
772 
773 	if (max_size >= metaslab_df_alloc_threshold &&
774 	    free_pct >= metaslab_df_free_pct)
775 		return (B_FALSE);
776 
777 	return (B_TRUE);
778 }
779 
780 static metaslab_ops_t metaslab_df_ops = {
781 	metaslab_df_alloc,
782 	metaslab_df_fragmented
783 };
784 
785 /*
786  * ==========================================================================
787  * Cursor fit block allocator -
788  * Select the largest region in the metaslab, set the cursor to the beginning
789  * of the range and the cursor_end to the end of the range. As allocations
790  * are made advance the cursor. Continue allocating from the cursor until
791  * the range is exhausted and then find a new range.
792  * ==========================================================================
793  */
794 static uint64_t
795 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
796 {
797 	range_tree_t *rt = msp->ms_tree;
798 	avl_tree_t *t = &msp->ms_size_tree;
799 	uint64_t *cursor = &msp->ms_lbas[0];
800 	uint64_t *cursor_end = &msp->ms_lbas[1];
801 	uint64_t offset = 0;
802 
803 	ASSERT(MUTEX_HELD(&msp->ms_lock));
804 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
805 
806 	ASSERT3U(*cursor_end, >=, *cursor);
807 
808 	if ((*cursor + size) > *cursor_end) {
809 		range_seg_t *rs;
810 
811 		rs = avl_last(&msp->ms_size_tree);
812 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
813 			return (-1ULL);
814 
815 		*cursor = rs->rs_start;
816 		*cursor_end = rs->rs_end;
817 	}
818 
819 	offset = *cursor;
820 	*cursor += size;
821 
822 	return (offset);
823 }
824 
825 static boolean_t
826 metaslab_cf_fragmented(metaslab_t *msp)
827 {
828 	return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
829 }
830 
831 static metaslab_ops_t metaslab_cf_ops = {
832 	metaslab_cf_alloc,
833 	metaslab_cf_fragmented
834 };
835 
836 /*
837  * ==========================================================================
838  * New dynamic fit allocator -
839  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
840  * contiguous blocks. If no region is found then just use the largest segment
841  * that remains.
842  * ==========================================================================
843  */
844 
845 /*
846  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
847  * to request from the allocator.
848  */
849 uint64_t metaslab_ndf_clump_shift = 4;
850 
851 static uint64_t
852 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
853 {
854 	avl_tree_t *t = &msp->ms_tree->rt_root;
855 	avl_index_t where;
856 	range_seg_t *rs, rsearch;
857 	uint64_t hbit = highbit64(size);
858 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
859 	uint64_t max_size = metaslab_block_maxsize(msp);
860 
861 	ASSERT(MUTEX_HELD(&msp->ms_lock));
862 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
863 
864 	if (max_size < size)
865 		return (-1ULL);
866 
867 	rsearch.rs_start = *cursor;
868 	rsearch.rs_end = *cursor + size;
869 
870 	rs = avl_find(t, &rsearch, &where);
871 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
872 		t = &msp->ms_size_tree;
873 
874 		rsearch.rs_start = 0;
875 		rsearch.rs_end = MIN(max_size,
876 		    1ULL << (hbit + metaslab_ndf_clump_shift));
877 		rs = avl_find(t, &rsearch, &where);
878 		if (rs == NULL)
879 			rs = avl_nearest(t, where, AVL_AFTER);
880 		ASSERT(rs != NULL);
881 	}
882 
883 	if ((rs->rs_end - rs->rs_start) >= size) {
884 		*cursor = rs->rs_start + size;
885 		return (rs->rs_start);
886 	}
887 	return (-1ULL);
888 }
889 
890 static boolean_t
891 metaslab_ndf_fragmented(metaslab_t *msp)
892 {
893 	return (metaslab_block_maxsize(msp) <=
894 	    (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
895 }
896 
897 static metaslab_ops_t metaslab_ndf_ops = {
898 	metaslab_ndf_alloc,
899 	metaslab_ndf_fragmented
900 };
901 
902 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
903 
904 /*
905  * ==========================================================================
906  * Metaslabs
907  * ==========================================================================
908  */
909 
910 /*
911  * Wait for any in-progress metaslab loads to complete.
912  */
913 void
914 metaslab_load_wait(metaslab_t *msp)
915 {
916 	ASSERT(MUTEX_HELD(&msp->ms_lock));
917 
918 	while (msp->ms_loading) {
919 		ASSERT(!msp->ms_loaded);
920 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
921 	}
922 }
923 
924 int
925 metaslab_load(metaslab_t *msp)
926 {
927 	int error = 0;
928 
929 	ASSERT(MUTEX_HELD(&msp->ms_lock));
930 	ASSERT(!msp->ms_loaded);
931 	ASSERT(!msp->ms_loading);
932 
933 	msp->ms_loading = B_TRUE;
934 
935 	/*
936 	 * If the space map has not been allocated yet, then treat
937 	 * all the space in the metaslab as free and add it to the
938 	 * ms_tree.
939 	 */
940 	if (msp->ms_sm != NULL)
941 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
942 	else
943 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
944 
945 	msp->ms_loaded = (error == 0);
946 	msp->ms_loading = B_FALSE;
947 
948 	if (msp->ms_loaded) {
949 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
950 			range_tree_walk(msp->ms_defertree[t],
951 			    range_tree_remove, msp->ms_tree);
952 		}
953 	}
954 	cv_broadcast(&msp->ms_load_cv);
955 	return (error);
956 }
957 
958 void
959 metaslab_unload(metaslab_t *msp)
960 {
961 	ASSERT(MUTEX_HELD(&msp->ms_lock));
962 	range_tree_vacate(msp->ms_tree, NULL, NULL);
963 	msp->ms_loaded = B_FALSE;
964 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
965 }
966 
967 metaslab_t *
968 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
969 {
970 	vdev_t *vd = mg->mg_vd;
971 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
972 	metaslab_t *msp;
973 
974 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
975 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
976 	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
977 	msp->ms_id = id;
978 	msp->ms_start = id << vd->vdev_ms_shift;
979 	msp->ms_size = 1ULL << vd->vdev_ms_shift;
980 
981 	/*
982 	 * We only open space map objects that already exist. All others
983 	 * will be opened when we finally allocate an object for them.
984 	 */
985 	if (object != 0) {
986 		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
987 		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
988 		ASSERT(msp->ms_sm != NULL);
989 	}
990 
991 	/*
992 	 * We create the main range tree here, but we don't create the
993 	 * alloctree and freetree until metaslab_sync_done().  This serves
994 	 * two purposes: it allows metaslab_sync_done() to detect the
995 	 * addition of new space; and for debugging, it ensures that we'd
996 	 * data fault on any attempt to use this metaslab before it's ready.
997 	 */
998 	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
999 	metaslab_group_add(mg, msp);
1000 
1001 	msp->ms_ops = mg->mg_class->mc_ops;
1002 
1003 	/*
1004 	 * If we're opening an existing pool (txg == 0) or creating
1005 	 * a new one (txg == TXG_INITIAL), all space is available now.
1006 	 * If we're adding space to an existing pool, the new space
1007 	 * does not become available until after this txg has synced.
1008 	 */
1009 	if (txg <= TXG_INITIAL)
1010 		metaslab_sync_done(msp, 0);
1011 
1012 	/*
1013 	 * If metaslab_debug_load is set and we're initializing a metaslab
1014 	 * that has an allocated space_map object then load its space
1015 	 * map so that we can verify frees.
1016 	 */
1017 	if (metaslab_debug_load && msp->ms_sm != NULL) {
1018 		mutex_enter(&msp->ms_lock);
1019 		VERIFY0(metaslab_load(msp));
1020 		mutex_exit(&msp->ms_lock);
1021 	}
1022 
1023 	if (txg != 0) {
1024 		vdev_dirty(vd, 0, NULL, txg);
1025 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
1026 	}
1027 
1028 	return (msp);
1029 }
1030 
1031 void
1032 metaslab_fini(metaslab_t *msp)
1033 {
1034 	metaslab_group_t *mg = msp->ms_group;
1035 
1036 	metaslab_group_remove(mg, msp);
1037 
1038 	mutex_enter(&msp->ms_lock);
1039 
1040 	VERIFY(msp->ms_group == NULL);
1041 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1042 	    0, -msp->ms_size);
1043 	space_map_close(msp->ms_sm);
1044 
1045 	metaslab_unload(msp);
1046 	range_tree_destroy(msp->ms_tree);
1047 
1048 	for (int t = 0; t < TXG_SIZE; t++) {
1049 		range_tree_destroy(msp->ms_alloctree[t]);
1050 		range_tree_destroy(msp->ms_freetree[t]);
1051 	}
1052 
1053 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1054 		range_tree_destroy(msp->ms_defertree[t]);
1055 	}
1056 
1057 	ASSERT0(msp->ms_deferspace);
1058 
1059 	mutex_exit(&msp->ms_lock);
1060 	cv_destroy(&msp->ms_load_cv);
1061 	mutex_destroy(&msp->ms_lock);
1062 
1063 	kmem_free(msp, sizeof (metaslab_t));
1064 }
1065 
1066 /*
1067  * Apply a weighting factor based on the histogram information for this
1068  * metaslab. The current weighting factor is somewhat arbitrary and requires
1069  * additional investigation. The implementation provides a measure of
1070  * "weighted" free space and gives a higher weighting for larger contiguous
1071  * regions. The weighting factor is determined by counting the number of
1072  * sm_shift sectors that exist in each region represented by the histogram.
1073  * That value is then multiplied by the power of 2 exponent and the sm_shift
1074  * value.
1075  *
1076  * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1077  * metaslab has an sm_shift value of 9 (512B):
1078  *
1079  * 1) calculate the number of sm_shift sectors in the region:
1080  *	2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1081  * 2) multiply by the power of 2 exponent and the sm_shift value:
1082  *	16384 * 21 * 9 = 3096576
1083  * This value will be added to the weighting of the metaslab.
1084  */
1085 static uint64_t
1086 metaslab_weight_factor(metaslab_t *msp)
1087 {
1088 	uint64_t factor = 0;
1089 	uint64_t sectors;
1090 	int i;
1091 
1092 	/*
1093 	 * A null space map means that the entire metaslab is free, so
1094 	 * calculate a weight factor that spans the entire size of the
1095 	 * metaslab.
1096 	 */
1097 	if (msp->ms_sm == NULL) {
1098 		vdev_t *vd = msp->ms_group->mg_vd;
1099 
1100 		i = highbit64(msp->ms_size) - 1;
1101 		sectors = msp->ms_size >> vd->vdev_ashift;
1102 		return (sectors * i * vd->vdev_ashift);
1103 	}
1104 
1105 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1106 		return (0);
1107 
1108 	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1109 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1110 			continue;
1111 
1112 		/*
1113 		 * Determine the number of sm_shift sectors in the region
1114 		 * indicated by the histogram. For example, given an
1115 		 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1116 		 * that we're looking at an 8K region in the histogram
1117 		 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1118 		 * number of sm_shift sectors (512 bytes in this example),
1119 		 * we would take 8192 / 512 = 16. Since the histogram
1120 		 * is offset by sm_shift we can simply use the value of i
1121 		 * to calculate this (i.e. 2^i = 16 where i = 4).
1122 		 */
1123 		sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1124 		factor += (i + msp->ms_sm->sm_shift) * sectors;
1125 	}
1126 	return (factor * msp->ms_sm->sm_shift);
1127 }
1128 
1129 static uint64_t
1130 metaslab_weight(metaslab_t *msp)
1131 {
1132 	metaslab_group_t *mg = msp->ms_group;
1133 	vdev_t *vd = mg->mg_vd;
1134 	uint64_t weight, space;
1135 
1136 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1137 
1138 	/*
1139 	 * This vdev is in the process of being removed so there is nothing
1140 	 * for us to do here.
1141 	 */
1142 	if (vd->vdev_removing) {
1143 		ASSERT0(space_map_allocated(msp->ms_sm));
1144 		ASSERT0(vd->vdev_ms_shift);
1145 		return (0);
1146 	}
1147 
1148 	/*
1149 	 * The baseline weight is the metaslab's free space.
1150 	 */
1151 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1152 	weight = space;
1153 
1154 	/*
1155 	 * Modern disks have uniform bit density and constant angular velocity.
1156 	 * Therefore, the outer recording zones are faster (higher bandwidth)
1157 	 * than the inner zones by the ratio of outer to inner track diameter,
1158 	 * which is typically around 2:1.  We account for this by assigning
1159 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1160 	 * In effect, this means that we'll select the metaslab with the most
1161 	 * free bandwidth rather than simply the one with the most free space.
1162 	 */
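	/*
	 * Illustrative note: with, say, 200 metaslabs on the vdev, metaslab 0
	 * keeps weight = 2 * space, metaslab 100 gets 1.5 * space, and the
	 * last metaslab ends up just above 1 * space.
	 */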
1163 	weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1164 	ASSERT(weight >= space && weight <= 2 * space);
1165 
1166 	msp->ms_factor = metaslab_weight_factor(msp);
1167 	if (metaslab_weight_factor_enable)
1168 		weight += msp->ms_factor;
1169 
1170 	if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1171 		/*
1172 		 * If this metaslab is one we're actively using, adjust its
1173 		 * weight to make it preferable to any inactive metaslab so
1174 		 * we'll polish it off.
1175 		 */
1176 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1177 	}
1178 
1179 	return (weight);
1180 }
1181 
1182 static int
1183 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1184 {
1185 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1186 
1187 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1188 		metaslab_load_wait(msp);
1189 		if (!msp->ms_loaded) {
1190 			int error = metaslab_load(msp);
1191 			if (error) {
1192 				metaslab_group_sort(msp->ms_group, msp, 0);
1193 				return (error);
1194 			}
1195 		}
1196 
1197 		metaslab_group_sort(msp->ms_group, msp,
1198 		    msp->ms_weight | activation_weight);
1199 	}
1200 	ASSERT(msp->ms_loaded);
1201 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1202 
1203 	return (0);
1204 }
1205 
1206 static void
1207 metaslab_passivate(metaslab_t *msp, uint64_t size)
1208 {
1209 	/*
1210 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1211 	 * this metaslab again.  In that case, it had better be empty,
1212 	 * or we would be leaving space on the table.
1213 	 */
1214 	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1215 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1216 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1217 }
1218 
1219 static void
1220 metaslab_preload(void *arg)
1221 {
1222 	metaslab_t *msp = arg;
1223 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1224 
1225 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
1226 
1227 	mutex_enter(&msp->ms_lock);
1228 	metaslab_load_wait(msp);
1229 	if (!msp->ms_loaded)
1230 		(void) metaslab_load(msp);
1231 
1232 	/*
1233 	 * Set the ms_access_txg value so that we don't unload it right away.
1234 	 */
1235 	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1236 	mutex_exit(&msp->ms_lock);
1237 }
1238 
1239 static void
1240 metaslab_group_preload(metaslab_group_t *mg)
1241 {
1242 	spa_t *spa = mg->mg_vd->vdev_spa;
1243 	metaslab_t *msp;
1244 	avl_tree_t *t = &mg->mg_metaslab_tree;
1245 	int m = 0;
1246 
1247 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1248 		taskq_wait(mg->mg_taskq);
1249 		return;
1250 	}
1251 
1252 	mutex_enter(&mg->mg_lock);
1253 	/*
1254 	 * Load the next potential metaslabs
1255 	 */
1256 	msp = avl_first(t);
1257 	while (msp != NULL) {
1258 		metaslab_t *msp_next = AVL_NEXT(t, msp);
1259 
1260 		/* If we have reached our preload limit then we're done */
1261 		if (++m > metaslab_preload_limit)
1262 			break;
1263 
1264 		/*
1265 		 * We must drop the metaslab group lock here to preserve
1266 		 * lock ordering with the ms_lock (when grabbing both
1267 		 * the mg_lock and the ms_lock, the ms_lock must be taken
1268 		 * first).  As a result, it is possible that the ordering
1269 		 * of the metaslabs within the avl tree may change before
1270 		 * we reacquire the lock. The metaslab cannot be removed from
1271 		 * the tree while we're in syncing context so it is safe to
1272 		 * drop the mg_lock here. If the metaslabs are reordered
1273 		 * nothing will break -- we just may end up loading a
1274 		 * less than optimal one.
1275 		 */
1276 		mutex_exit(&mg->mg_lock);
1277 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1278 		    msp, TQ_SLEEP) != NULL);
1279 		mutex_enter(&mg->mg_lock);
1280 		msp = msp_next;
1281 	}
1282 	mutex_exit(&mg->mg_lock);
1283 }
1284 
1285 /*
1286  * Determine if the space map's on-disk footprint is past our tolerance
1287  * for inefficiency. We would like to use the following criteria to make
1288  * our decision:
1289  *
1290  * 1. The size of the space map object should not dramatically increase as a
1291  * result of writing out the free space range tree.
1292  *
1293  * 2. The on-disk size of the space map is at least zfs_condense_pct/100
1294  * times the size of its minimal in-core-derived representation (i.e. with
1295  * zfs_condense_pct = 110 and a minimal form of 1MB, on-disk >= 1.1MB).
1296  *
1297  * 3. The on-disk size of the space map should actually decrease.
1298  *
1299  * Checking the first condition is tricky since we don't want to walk
1300  * the entire AVL tree calculating the estimated on-disk size. Instead we
1301  * use the size-ordered range tree in the metaslab and calculate the
1302  * size required to write out the largest segment in our free tree. If the
1303  * size required to represent that segment on disk is larger than the space
1304  * map object then we avoid condensing this map.
1305  *
1306  * To determine the second criterion we use a best-case estimate and assume
1307  * each segment can be represented on-disk as a single 64-bit entry. We refer
1308  * to this best-case estimate as the space map's minimal form.
1309  *
1310  * Unfortunately, we cannot compute the on-disk size of the space map in this
1311  * context because we cannot accurately compute the effects of compression, etc.
1312  * Instead, we apply the heuristic described in the block comment for
1313  * zfs_metaslab_condense_block_threshold - we only condense if the space used
1314  * is greater than a threshold number of blocks.
1315  */
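/*
 * Illustrative note: if the loaded free range tree holds 1000 segments, the
 * minimal form is 8000 bytes (one 64-bit entry per segment); with
 * zfs_condense_pct = 200 we condense only once the space map object has
 * grown to at least 16000 bytes on disk (and is also past the block
 * threshold described above).
 */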
1316 static boolean_t
1317 metaslab_should_condense(metaslab_t *msp)
1318 {
1319 	space_map_t *sm = msp->ms_sm;
1320 	range_seg_t *rs;
1321 	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
1322 	dmu_object_info_t doi;
1323 	uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
1324 
1325 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1326 	ASSERT(msp->ms_loaded);
1327 
1328 	/*
1329 	 * Use the ms_size_tree range tree, which is ordered by size, to
1330 	 * obtain the largest segment in the free tree. If the tree is empty
1331 	 * then we should condense the map.
1332 	 */
1333 	rs = avl_last(&msp->ms_size_tree);
1334 	if (rs == NULL)
1335 		return (B_TRUE);
1336 
1337 	/*
1338 	 * Calculate the number of 64-bit entries this segment would
1339 	 * require when written to disk. If this single segment would be
1340 	 * larger on-disk than the entire current on-disk structure, then
1341 	 * clearly condensing will increase the on-disk structure size.
1342 	 */
1343 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1344 	entries = size / (MIN(size, SM_RUN_MAX));
1345 	segsz = entries * sizeof (uint64_t);
1346 
1347 	optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
1348 	object_size = space_map_length(msp->ms_sm);
1349 
1350 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
1351 	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
1352 
1353 	return (segsz <= object_size &&
1354 	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
1355 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
1356 }
1357 
1358 /*
1359  * Condense the on-disk space map representation to its minimized form.
1360  * The minimized form consists of a small number of allocations followed by
1361  * the entries of the free range tree.
1362  */
1363 static void
1364 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1365 {
1366 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1367 	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1368 	range_tree_t *condense_tree;
1369 	space_map_t *sm = msp->ms_sm;
1370 
1371 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1372 	ASSERT3U(spa_sync_pass(spa), ==, 1);
1373 	ASSERT(msp->ms_loaded);
1374 
1375 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1376 	    "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1377 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1378 
1379 	/*
1380 	 * Create a range tree that is 100% allocated. We remove segments
1381 	 * that have been freed in this txg, any deferred frees that exist,
1382 	 * and any allocation in the future. Removing segments should be
1383 	 * a relatively inexpensive operation since we expect these trees to
1384 	 * have a small number of nodes.
1385 	 */
1386 	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1387 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1388 
1389 	/*
1390 	 * Remove what's been freed in this txg from the condense_tree.
1391 	 * Since we're in sync_pass 1, we know that all the frees from
1392 	 * this txg are in the freetree.
1393 	 */
1394 	range_tree_walk(freetree, range_tree_remove, condense_tree);
1395 
1396 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1397 		range_tree_walk(msp->ms_defertree[t],
1398 		    range_tree_remove, condense_tree);
1399 	}
1400 
1401 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1402 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1403 		    range_tree_remove, condense_tree);
1404 	}
1405 
1406 	/*
1407 	 * We're about to drop the metaslab's lock thus allowing
1408 	 * other consumers to change its content. Set the
1409 	 * metaslab's ms_condensing flag to ensure that
1410 	 * allocations on this metaslab do not occur while we're
1411 	 * in the middle of committing it to disk. This is only critical
1412 	 * for the ms_tree as all other range trees use per txg
1413 	 * views of their content.
1414 	 */
1415 	msp->ms_condensing = B_TRUE;
1416 
1417 	mutex_exit(&msp->ms_lock);
1418 	space_map_truncate(sm, tx);
1419 	mutex_enter(&msp->ms_lock);
1420 
1421 	/*
1422 	 * While we would ideally like to create a space_map representation
1423 	 * that consists only of allocation records, doing so can be
1424 	 * prohibitively expensive because the in-core free tree can be
1425 	 * large, and therefore computationally expensive to subtract
1426 	 * from the condense_tree. Instead we sync out two trees, a cheap
1427 	 * allocation only tree followed by the in-core free tree. While not
1428 	 * optimal, this is typically close to optimal, and much cheaper to
1429 	 * compute.
1430 	 */
1431 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1432 	range_tree_vacate(condense_tree, NULL, NULL);
1433 	range_tree_destroy(condense_tree);
1434 
1435 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1436 	msp->ms_condensing = B_FALSE;
1437 }
1438 
1439 /*
1440  * Write a metaslab to disk in the context of the specified transaction group.
1441  */
1442 void
1443 metaslab_sync(metaslab_t *msp, uint64_t txg)
1444 {
1445 	metaslab_group_t *mg = msp->ms_group;
1446 	vdev_t *vd = mg->mg_vd;
1447 	spa_t *spa = vd->vdev_spa;
1448 	objset_t *mos = spa_meta_objset(spa);
1449 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1450 	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1451 	range_tree_t **freed_tree =
1452 	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1453 	dmu_tx_t *tx;
1454 	uint64_t object = space_map_object(msp->ms_sm);
1455 
1456 	ASSERT(!vd->vdev_ishole);
1457 
1458 	/*
1459 	 * This metaslab has just been added so there's no work to do now.
1460 	 */
1461 	if (*freetree == NULL) {
1462 		ASSERT3P(alloctree, ==, NULL);
1463 		return;
1464 	}
1465 
1466 	ASSERT3P(alloctree, !=, NULL);
1467 	ASSERT3P(*freetree, !=, NULL);
1468 	ASSERT3P(*freed_tree, !=, NULL);
1469 
1470 	if (range_tree_space(alloctree) == 0 &&
1471 	    range_tree_space(*freetree) == 0)
1472 		return;
1473 
1474 	/*
1475 	 * The only state that can actually be changing concurrently with
1476 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1477 	 * be modifying this txg's alloctree, freetree, freed_tree, or
1478 	 * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
1479 	 * space_map ASSERTs. We drop it whenever we call into the DMU,
1480 	 * because the DMU can call down to us (e.g. via zio_free()) at
1481 	 * any time.
1482 	 */
1483 
1484 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1485 
1486 	if (msp->ms_sm == NULL) {
1487 		uint64_t new_object;
1488 
1489 		new_object = space_map_alloc(mos, tx);
1490 		VERIFY3U(new_object, !=, 0);
1491 
1492 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1493 		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1494 		    &msp->ms_lock));
1495 		ASSERT(msp->ms_sm != NULL);
1496 	}
1497 
1498 	mutex_enter(&msp->ms_lock);
1499 
1500 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1501 	    metaslab_should_condense(msp)) {
1502 		metaslab_condense(msp, txg, tx);
1503 	} else {
1504 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1505 		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1506 	}
1507 
1508 	range_tree_vacate(alloctree, NULL, NULL);
1509 
1510 	if (msp->ms_loaded) {
1511 		/*
1512 		 * When the space map is loaded, we have an accurate
1513 		 * histogram in the range tree. This gives us an opportunity
1514 		 * to bring the space map's histogram up-to-date so we clear
1515 		 * it first before updating it.
1516 		 */
1517 		space_map_histogram_clear(msp->ms_sm);
1518 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1519 	} else {
1520 		/*
1521 		 * Since the space map is not loaded we simply update the
1522 		 * existing histogram with what was freed in this txg. This
1523 		 * means that the on-disk histogram may not have an accurate
1524 		 * view of the free space but it's close enough to allow
1525 		 * us to make allocation decisions.
1526 		 */
1527 		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1528 	}
1529 
1530 	/*
1531 	 * For sync pass 1, we avoid traversing this txg's free range tree
1532 	 * and instead will just swap the pointers for freetree and
1533 	 * freed_tree. We can safely do this since the freed_tree is
1534 	 * guaranteed to be empty on the initial pass.
1535 	 */
1536 	if (spa_sync_pass(spa) == 1) {
1537 		range_tree_swap(freetree, freed_tree);
1538 	} else {
1539 		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1540 	}
1541 
1542 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1543 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1544 
1545 	mutex_exit(&msp->ms_lock);
1546 
1547 	if (object != space_map_object(msp->ms_sm)) {
1548 		object = space_map_object(msp->ms_sm);
1549 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1550 		    msp->ms_id, sizeof (uint64_t), &object, tx);
1551 	}
1552 	dmu_tx_commit(tx);
1553 }
1554 
1555 /*
1556  * Called after a transaction group has completely synced to mark
1557  * all of the metaslab's free space as usable.
1558  */
1559 void
1560 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1561 {
1562 	metaslab_group_t *mg = msp->ms_group;
1563 	vdev_t *vd = mg->mg_vd;
1564 	range_tree_t **freed_tree;
1565 	range_tree_t **defer_tree;
1566 	int64_t alloc_delta, defer_delta;
1567 
1568 	ASSERT(!vd->vdev_ishole);
1569 
1570 	mutex_enter(&msp->ms_lock);
1571 
1572 	/*
1573 	 * If this metaslab is just becoming available, initialize its
1574 	 * alloctrees, freetrees, and defertree and add its capacity to
1575 	 * the vdev.
1576 	 */
1577 	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1578 		for (int t = 0; t < TXG_SIZE; t++) {
1579 			ASSERT(msp->ms_alloctree[t] == NULL);
1580 			ASSERT(msp->ms_freetree[t] == NULL);
1581 
1582 			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1583 			    &msp->ms_lock);
1584 			msp->ms_freetree[t] = range_tree_create(NULL, msp,
1585 			    &msp->ms_lock);
1586 		}
1587 
1588 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1589 			ASSERT(msp->ms_defertree[t] == NULL);
1590 
1591 			msp->ms_defertree[t] = range_tree_create(NULL, msp,
1592 			    &msp->ms_lock);
1593 		}
1594 
1595 		vdev_space_update(vd, 0, 0, msp->ms_size);
1596 	}
1597 
1598 	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1599 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1600 
1601 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
1602 	defer_delta = range_tree_space(*freed_tree) -
1603 	    range_tree_space(*defer_tree);
1604 
1605 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1606 
1607 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1608 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1609 
1610 	/*
1611 	 * If there's a metaslab_load() in progress, wait for it to complete
1612 	 * so that we have a consistent view of the in-core space map.
1613 	 */
1614 	metaslab_load_wait(msp);
1615 
1616 	/*
1617 	 * Move the frees from the defer_tree back to the free
1618 	 * range tree (if it's loaded). Swap the freed_tree and the
1619 	 * defer_tree -- this is safe to do because we've just emptied out
1620 	 * the defer_tree.
1621 	 */
1622 	range_tree_vacate(*defer_tree,
1623 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1624 	range_tree_swap(freed_tree, defer_tree);
1625 
1626 	space_map_update(msp->ms_sm);
1627 
1628 	msp->ms_deferspace += defer_delta;
1629 	ASSERT3S(msp->ms_deferspace, >=, 0);
1630 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1631 	if (msp->ms_deferspace != 0) {
1632 		/*
1633 		 * Keep syncing this metaslab until all deferred frees
1634 		 * are back in circulation.
1635 		 */
1636 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1637 	}
1638 
1639 	if (msp->ms_loaded && msp->ms_access_txg < txg) {
1640 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1641 			VERIFY0(range_tree_space(
1642 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
1643 		}
1644 
1645 		if (!metaslab_debug_unload)
1646 			metaslab_unload(msp);
1647 	}
1648 
1649 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1650 	mutex_exit(&msp->ms_lock);
1651 
1652 }
1653 
1654 void
1655 metaslab_sync_reassess(metaslab_group_t *mg)
1656 {
1657 	metaslab_group_alloc_update(mg);
1658 
1659 	/*
1660 	 * Preload the next potential metaslabs
1661 	 */
1662 	metaslab_group_preload(mg);
1663 }
1664 
1665 static uint64_t
1666 metaslab_distance(metaslab_t *msp, dva_t *dva)
1667 {
1668 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1669 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1670 	uint64_t start = msp->ms_id;
1671 
1672 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1673 		return (1ULL << 63);
1674 
1675 	if (offset < start)
1676 		return ((start - offset) << ms_shift);
1677 	if (offset > start)
1678 		return ((offset - start) << ms_shift);
1679 	return (0);
1680 }
1681 
1682 static uint64_t
1683 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1684     uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
1685 {
1686 	spa_t *spa = mg->mg_vd->vdev_spa;
1687 	metaslab_t *msp = NULL;
1688 	uint64_t offset = -1ULL;
1689 	avl_tree_t *t = &mg->mg_metaslab_tree;
1690 	uint64_t activation_weight;
1691 	uint64_t target_distance;
1692 	int i;
1693 
1694 	activation_weight = METASLAB_WEIGHT_PRIMARY;
1695 	for (i = 0; i < d; i++) {
1696 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1697 			activation_weight = METASLAB_WEIGHT_SECONDARY;
1698 			break;
1699 		}
1700 	}
1701 
1702 	for (;;) {
1703 		boolean_t was_active;
1704 
1705 		mutex_enter(&mg->mg_lock);
1706 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1707 			if (msp->ms_weight < asize) {
1708 				spa_dbgmsg(spa, "%s: failed to meet weight "
1709 				    "requirement: vdev %llu, txg %llu, mg %p, "
1710 				    "msp %p, psize %llu, asize %llu, "
1711 				    "weight %llu", spa_name(spa),
1712 				    mg->mg_vd->vdev_id, txg,
1713 				    mg, msp, psize, asize, msp->ms_weight);
1714 				mutex_exit(&mg->mg_lock);
1715 				return (-1ULL);
1716 			}
1717 
1718 			/*
1719 			 * If the selected metaslab is condensing, skip it.
1720 			 */
1721 			if (msp->ms_condensing)
1722 				continue;
1723 
1724 			was_active = !!(msp->ms_weight & METASLAB_ACTIVE_MASK);
1725 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1726 				break;
1727 
1728 			target_distance = min_distance +
1729 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1730 			    min_distance >> 1);
1731 
1732 			for (i = 0; i < d; i++)
1733 				if (metaslab_distance(msp, &dva[i]) <
1734 				    target_distance)
1735 					break;
1736 			if (i == d)
1737 				break;
1738 		}
1739 		mutex_exit(&mg->mg_lock);
1740 		if (msp == NULL)
1741 			return (-1ULL);
1742 
1743 		mutex_enter(&msp->ms_lock);
1744 
1745 		/*
1746 		 * Ensure that the metaslab we have selected is still
1747 		 * capable of handling our request. It's possible that
1748 		 * another thread may have changed the weight while we
1749 		 * were blocked on the metaslab lock.
1750 		 */
1751 		if (msp->ms_weight < asize || (was_active &&
1752 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1753 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1754 			mutex_exit(&msp->ms_lock);
1755 			continue;
1756 		}
1757 
1758 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1759 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1760 			metaslab_passivate(msp,
1761 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1762 			mutex_exit(&msp->ms_lock);
1763 			continue;
1764 		}
1765 
1766 		if (metaslab_activate(msp, activation_weight) != 0) {
1767 			mutex_exit(&msp->ms_lock);
1768 			continue;
1769 		}
1770 
1771 		/*
1772 		 * If this metaslab is currently condensing then pick again as
1773 		 * we can't manipulate this metaslab until it's committed
1774 		 * to disk.
1775 		 */
1776 		if (msp->ms_condensing) {
1777 			mutex_exit(&msp->ms_lock);
1778 			continue;
1779 		}
1780 
1781 		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1782 			break;
1783 
1784 		metaslab_passivate(msp, metaslab_block_maxsize(msp));
1785 		mutex_exit(&msp->ms_lock);
1786 	}
1787 
1788 	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1789 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1790 
1791 	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1792 	msp->ms_access_txg = txg + metaslab_unload_delay;
1793 
1794 	mutex_exit(&msp->ms_lock);
1795 
1796 	return (offset);
1797 }
1798 
1799 /*
1800  * Allocate a block for the specified i/o.
1801  */
1802 static int
1803 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1804     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1805 {
1806 	metaslab_group_t *mg, *rotor;
1807 	vdev_t *vd;
1808 	int dshift = 3;
1809 	int all_zero;
1810 	int zio_lock = B_FALSE;
1811 	boolean_t allocatable;
1812 	uint64_t offset = -1ULL;
1813 	uint64_t asize;
1814 	uint64_t distance;
1815 
1816 	ASSERT(!DVA_IS_VALID(&dva[d]));
1817 
1818 	/*
1819 	 * For testing, make some blocks above a certain size be gang blocks.
1820 	 */
1821 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1822 		return (SET_ERROR(ENOSPC));
1823 
1824 	/*
1825 	 * Start at the rotor and loop through all mgs until we find something.
1826 	 * Note that there's no locking on mc_rotor or mc_aliquot because
1827 	 * nothing actually breaks if we miss a few updates -- we just won't
1828 	 * allocate quite as evenly.  It all balances out over time.
1829 	 *
1830 	 * If we are doing ditto or log blocks, try to spread them across
1831 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1832 	 * allocated all of our ditto blocks, then try and spread them out on
1833 	 * that vdev as much as possible.  If it turns out to not be possible,
1834 	 * gradually lower our standards until anything becomes acceptable.
1835 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1836 	 * gives us hope of containing our fault domains to something we're
1837 	 * able to reason about.  Otherwise, any two top-level vdev failures
1838 	 * will guarantee the loss of data.  With consecutive allocation,
1839 	 * only two adjacent top-level vdev failures will result in data loss.
1840 	 *
1841 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1842 	 * ourselves on the same vdev as our gang block header.  That
1843 	 * way, we can hope for locality in vdev_cache, plus it makes our
1844 	 * fault domains something tractable.
1845 	 */
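	/*
	 * Pick a starting metaslab group: gang children start from the
	 * vdev of their gang header (or its successor when
	 * METASLAB_HINTBP_AVOID is set), additional DVAs (d != 0) start
	 * from the group following the previous copy's vdev, and
	 * everything else starts at the rotor.
	 */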
1846 	if (hintdva) {
1847 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1848 
1849 		/*
1850 		 * It's possible the vdev we're using as the hint no
1851 		 * longer exists (i.e. removed). Consult the rotor when
1852 		 * all else fails.
1853 		 */
1854 		if (vd != NULL) {
1855 			mg = vd->vdev_mg;
1856 
1857 			if (flags & METASLAB_HINTBP_AVOID &&
1858 			    mg->mg_next != NULL)
1859 				mg = mg->mg_next;
1860 		} else {
1861 			mg = mc->mc_rotor;
1862 		}
1863 	} else if (d != 0) {
1864 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1865 		mg = vd->vdev_mg->mg_next;
1866 	} else {
1867 		mg = mc->mc_rotor;
1868 	}
1869 
1870 	/*
1871 	 * If the hint put us into the wrong metaslab class, or into a
1872 	 * metaslab group that has been passivated, just follow the rotor.
1873 	 */
1874 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1875 		mg = mc->mc_rotor;
1876 
1877 	rotor = mg;
1878 top:
1879 	all_zero = B_TRUE;
1880 	do {
1881 		ASSERT(mg->mg_activation_count == 1);
1882 
1883 		vd = mg->mg_vd;
1884 
1885 		/*
1886 		 * Don't allocate from faulted devices.
1887 		 */
1888 		if (zio_lock) {
1889 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1890 			allocatable = vdev_allocatable(vd);
1891 			spa_config_exit(spa, SCL_ZIO, FTAG);
1892 		} else {
1893 			allocatable = vdev_allocatable(vd);
1894 		}
1895 
1896 		/*
1897 		 * Determine if the selected metaslab group is eligible
1898 		 * for allocations. If we're ganging or have requested
1899 		 * an allocation for the smallest gang block size
1900 		 * then we don't want to avoid allocating to this
1901 		 * metaslab group. If we're in this condition we should
1902 		 * try to allocate from any device possible so that we
1903 		 * don't inadvertently return ENOSPC and suspend the pool
1904 		 * even though space is still available.
1905 		 */
1906 		if (allocatable && CAN_FASTGANG(flags) &&
1907 		    psize > SPA_GANGBLOCKSIZE)
1908 			allocatable = metaslab_group_allocatable(mg);
1909 
1910 		if (!allocatable)
1911 			goto next;
1912 
1913 		/*
1914 		 * Avoid writing single-copy data to a failing vdev
1915 		 * unless the user instructs us that it is okay.
1916 		 */
1917 		if ((vd->vdev_stat.vs_write_errors > 0 ||
1918 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1919 		    d == 0 && dshift == 3 &&
1920 		    !(zfs_write_to_degraded && vd->vdev_state ==
1921 		    VDEV_STATE_DEGRADED)) {
1922 			all_zero = B_FALSE;
1923 			goto next;
1924 		}
1925 
1926 		ASSERT(mg->mg_class == mc);
1927 
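		/*
		 * Require this copy to land at least vdev_asize >> dshift
		 * bytes away from the block's previous copies on this vdev.
		 * dshift starts at 3 (1/8 of the vdev) and is bumped after
		 * each unsuccessful pass around the rotor, halving the
		 * required separation until it drops below a single
		 * metaslab, at which point no separation is required.
		 */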
1928 		distance = vd->vdev_asize >> dshift;
1929 		if (distance <= (1ULL << vd->vdev_ms_shift))
1930 			distance = 0;
1931 		else
1932 			all_zero = B_FALSE;
1933 
1934 		asize = vdev_psize_to_asize(vd, psize);
1935 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1936 
1937 		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1938 		    dva, d);
1939 		if (offset != -1ULL) {
1940 			/*
1941 			 * If we've just selected this metaslab group,
1942 			 * figure out whether the corresponding vdev is
1943 			 * over- or under-used relative to the pool,
1944 			 * and set an allocation bias to even it out.
1945 			 */
1946 			if (mc->mc_aliquot == 0) {
1947 				vdev_stat_t *vs = &vd->vdev_stat;
1948 				int64_t vu, cu;
1949 
1950 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1951 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1952 
1953 				/*
1954 				 * Calculate how much more or less we should
1955 				 * try to allocate from this device during
1956 				 * this iteration around the rotor.
1957 				 * For example, if a device is 80% full
1958 				 * and the pool is 20% full then we should
1959 				 * reduce allocations by 60% on this device.
1960 				 *
1961 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
1962 				 *
1963 				 * This reduces allocations by 307K for this
1964 				 * iteration.
1965 				 */
1966 				mg->mg_bias = ((cu - vu) *
1967 				    (int64_t)mg->mg_aliquot) / 100;
1968 			}
1969 
1970 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1971 			    mg->mg_aliquot + mg->mg_bias) {
1972 				mc->mc_rotor = mg->mg_next;
1973 				mc->mc_aliquot = 0;
1974 			}
1975 
1976 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
1977 			DVA_SET_OFFSET(&dva[d], offset);
1978 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1979 			DVA_SET_ASIZE(&dva[d], asize);
1980 
1981 			return (0);
1982 		}
1983 next:
1984 		mc->mc_rotor = mg->mg_next;
1985 		mc->mc_aliquot = 0;
1986 	} while ((mg = mg->mg_next) != rotor);
1987 
1988 	if (!all_zero) {
1989 		dshift++;
1990 		ASSERT(dshift < 64);
1991 		goto top;
1992 	}
1993 
1994 	if (!allocatable && !zio_lock) {
1995 		dshift = 3;
1996 		zio_lock = B_TRUE;
1997 		goto top;
1998 	}
1999 
2000 	bzero(&dva[d], sizeof (dva_t));
2001 
2002 	return (SET_ERROR(ENOSPC));
2003 }
2004 
2005 /*
2006  * Free the block represented by DVA in the context of the specified
2007  * transaction group.
2008  */
2009 static void
2010 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
2011 {
2012 	uint64_t vdev = DVA_GET_VDEV(dva);
2013 	uint64_t offset = DVA_GET_OFFSET(dva);
2014 	uint64_t size = DVA_GET_ASIZE(dva);
2015 	vdev_t *vd;
2016 	metaslab_t *msp;
2017 
2018 	ASSERT(DVA_IS_VALID(dva));
2019 
2020 	if (txg > spa_freeze_txg(spa))
2021 		return;
2022 
2023 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2024 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
2025 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
2026 		    (u_longlong_t)vdev, (u_longlong_t)offset);
2027 		ASSERT(0);
2028 		return;
2029 	}
2030 
2031 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2032 
2033 	if (DVA_GET_GANG(dva))
2034 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2035 
2036 	mutex_enter(&msp->ms_lock);
2037 
2038 	if (now) {
2039 		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
2040 		    offset, size);
2041 
2042 		VERIFY(!msp->ms_condensing);
2043 		VERIFY3U(offset, >=, msp->ms_start);
2044 		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
2045 		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
2046 		    msp->ms_size);
2047 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2048 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2049 		range_tree_add(msp->ms_tree, offset, size);
2050 	} else {
2051 		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2052 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2053 		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2054 		    offset, size);
2055 	}
2056 
2057 	mutex_exit(&msp->ms_lock);
2058 }
2059 
2060 /*
2061  * Intent log support: upon opening the pool after a crash, notify the SPA
2062  * of blocks that the intent log has allocated for immediate write, but
2063  * which are still considered free by the SPA because the last transaction
2064  * group didn't commit yet.
2065  */
2066 static int
2067 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2068 {
2069 	uint64_t vdev = DVA_GET_VDEV(dva);
2070 	uint64_t offset = DVA_GET_OFFSET(dva);
2071 	uint64_t size = DVA_GET_ASIZE(dva);
2072 	vdev_t *vd;
2073 	metaslab_t *msp;
2074 	int error = 0;
2075 
2076 	ASSERT(DVA_IS_VALID(dva));
2077 
2078 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2079 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2080 		return (SET_ERROR(ENXIO));
2081 
2082 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2083 
2084 	if (DVA_GET_GANG(dva))
2085 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2086 
2087 	mutex_enter(&msp->ms_lock);
2088 
2089 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2090 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2091 
2092 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2093 		error = SET_ERROR(ENOENT);
2094 
2095 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2096 		mutex_exit(&msp->ms_lock);
2097 		return (error);
2098 	}
2099 
2100 	VERIFY(!msp->ms_condensing);
2101 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2102 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2103 	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2104 	range_tree_remove(msp->ms_tree, offset, size);
2105 
2106 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2107 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2108 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2109 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2110 	}
2111 
2112 	mutex_exit(&msp->ms_lock);
2113 
2114 	return (0);
2115 }
2116 
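/*
 * Allocate ndvas DVAs for bp in the given transaction group, using the
 * DVAs of hintbp as placement hints.  If any DVA cannot be allocated,
 * the DVAs allocated so far are freed and the error is returned;
 * otherwise bp's birth txg is set and 0 is returned.
 */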
2117 int
2118 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2119     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2120 {
2121 	dva_t *dva = bp->blk_dva;
2122 	dva_t *hintdva = hintbp->blk_dva;
2123 	int error = 0;
2124 
2125 	ASSERT(bp->blk_birth == 0);
2126 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2127 
2128 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2129 
2130 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2131 		spa_config_exit(spa, SCL_ALLOC, FTAG);
2132 		return (SET_ERROR(ENOSPC));
2133 	}
2134 
2135 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2136 	ASSERT(BP_GET_NDVAS(bp) == 0);
2137 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2138 
2139 	for (int d = 0; d < ndvas; d++) {
2140 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2141 		    txg, flags);
2142 		if (error != 0) {
2143 			for (d--; d >= 0; d--) {
2144 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2145 				bzero(&dva[d], sizeof (dva_t));
2146 			}
2147 			spa_config_exit(spa, SCL_ALLOC, FTAG);
2148 			return (error);
2149 		}
2150 	}
2151 	ASSERT(error == 0);
2152 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2153 
2154 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2155 
2156 	BP_SET_BIRTH(bp, txg, txg);
2157 
2158 	return (0);
2159 }
2160 
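/*
 * Free every DVA of bp in the given transaction group.  With 'now' set,
 * the space is pulled back out of this txg's alloctree and returned
 * directly to the in-core free tree; otherwise the free is recorded in
 * the per-txg freetree and only returns to circulation after passing
 * through the defer trees.
 */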
2161 void
2162 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2163 {
2164 	const dva_t *dva = bp->blk_dva;
2165 	int ndvas = BP_GET_NDVAS(bp);
2166 
2167 	ASSERT(!BP_IS_HOLE(bp));
2168 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2169 
2170 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2171 
2172 	for (int d = 0; d < ndvas; d++)
2173 		metaslab_free_dva(spa, &dva[d], txg, now);
2174 
2175 	spa_config_exit(spa, SCL_FREE, FTAG);
2176 }
2177 
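/*
 * Claim every DVA of bp on behalf of the intent log.  A dry run with
 * txg == 0 is performed first so that a partial failure never needs to
 * be unwound; the real pass then removes the claimed segments from the
 * in-core free trees and dirties the affected metaslabs.
 */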
2178 int
2179 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2180 {
2181 	const dva_t *dva = bp->blk_dva;
2182 	int ndvas = BP_GET_NDVAS(bp);
2183 	int error = 0;
2184 
2185 	ASSERT(!BP_IS_HOLE(bp));
2186 
2187 	if (txg != 0) {
2188 		/*
2189 		 * First do a dry run to make sure all DVAs are claimable,
2190 		 * so we don't have to unwind from partial failures below.
2191 		 */
2192 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2193 			return (error);
2194 	}
2195 
2196 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2197 
2198 	for (int d = 0; d < ndvas; d++)
2199 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2200 			break;
2201 
2202 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2203 
2204 	ASSERT(error == 0 || txg == 0);
2205 
2206 	return (error);
2207 }
2208 
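/*
 * Debug-only consistency check, enabled by ZFS_DEBUG_ZIO_FREE: verify
 * that none of bp's DVAs are already present in the metaslab's in-core
 * free-side trees (ms_tree when loaded, the per-txg freetrees, and the
 * defer trees), which would indicate a double free.
 */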
2209 void
2210 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2211 {
2212 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2213 		return;
2214 
2215 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2216 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2217 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2218 		vdev_t *vd = vdev_lookup_top(spa, vdev);
2219 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2220 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2221 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2222 
2223 		if (msp->ms_loaded)
2224 			range_tree_verify(msp->ms_tree, offset, size);
2225 
2226 		for (int j = 0; j < TXG_SIZE; j++)
2227 			range_tree_verify(msp->ms_freetree[j], offset, size);
2228 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2229 			range_tree_verify(msp->ms_defertree[j], offset, size);
2230 	}
2231 	spa_config_exit(spa, SCL_VDEV, FTAG);
2232 }
2233