xref: /titanic_41/usr/src/uts/common/fs/zfs/metaslab.c (revision 721df37ded2cce623b0249cc35a872c3ce291b9b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 #include <sys/spa_impl.h>
35 
36 /*
37  * Allow allocations to switch to gang blocks quickly. We do this to
38  * avoid having to load lots of space_maps in a given txg. There are,
39  * however, some cases where we want to avoid "fast" ganging and instead
40  * we want to do an exhaustive search of all metaslabs on this device.
41  * Currently we don't allow any gang, slog, or dump device related allocations
42  * to "fast" gang.
43  */
44 #define	CAN_FASTGANG(flags) \
45 	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
46 	METASLAB_GANG_AVOID)))
47 
48 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
49 #define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
50 #define	METASLAB_ACTIVE_MASK		\
51 	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
52 
53 uint64_t metaslab_aliquot = 512ULL << 10;
54 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
55 
56 /*
57  * The in-core space map representation is more compact than its on-disk form.
58  * The zfs_condense_pct determines how much more compact the in-core
59  * space_map representation must be before we compact it on-disk.
60  * Values should be greater than or equal to 100.
61  */
62 int zfs_condense_pct = 200;
63 
64 /*
65  * The zfs_mg_noalloc_threshold defines which metaslab groups should
66  * be eligible for allocation. The value is defined as a percentage of
67  * a free space. Metaslab groups that have more free space than
68  * zfs_mg_noalloc_threshold are always eligible for allocations. Once
69  * a metaslab group's free space is less than or equal to the
70  * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
71  * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
72  * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
73  * groups are allowed to accept allocations. Gang blocks are always
74  * eligible to allocate on any metaslab group. The default value of 0 means
75  * no metaslab group will be excluded based on this criterion.
76  */
77 int zfs_mg_noalloc_threshold = 0;
78 
79 /*
80  * When set will load all metaslabs when pool is first opened.
81  */
82 int metaslab_debug_load = 0;
83 
84 /*
85  * When set will prevent metaslabs from being unloaded.
86  */
87 int metaslab_debug_unload = 0;
88 
89 /*
90  * Minimum size which forces the dynamic allocator to change
91  * it's allocation strategy.  Once the space map cannot satisfy
92  * an allocation of this size then it switches to using more
93  * aggressive strategy (i.e search by size rather than offset).
94  */
95 uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
96 
97 /*
98  * The minimum free space, in percent, which must be available
99  * in a space map to continue allocations in a first-fit fashion.
100  * Once the space_map's free space drops below this level we dynamically
101  * switch to using best-fit allocations.
102  */
103 int metaslab_df_free_pct = 4;
104 
105 /*
106  * A metaslab is considered "free" if it contains a contiguous
107  * segment which is greater than metaslab_min_alloc_size.
108  */
109 uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
110 
111 /*
112  * Percentage of all cpus that can be used by the metaslab taskq.
113  */
114 int metaslab_load_pct = 50;
115 
116 /*
117  * Determines how many txgs a metaslab may remain loaded without having any
118  * allocations from it. As long as a metaslab continues to be used we will
119  * keep it loaded.
120  */
121 int metaslab_unload_delay = TXG_SIZE * 2;
122 
123 /*
124  * Should we be willing to write data to degraded vdevs?
125  */
126 boolean_t zfs_write_to_degraded = B_FALSE;
127 
128 /*
129  * Max number of metaslabs per group to preload.
130  */
131 int metaslab_preload_limit = SPA_DVAS_PER_BP;
132 
133 /*
134  * Enable/disable preloading of metaslab.
135  */
136 boolean_t metaslab_preload_enabled = B_TRUE;
137 
138 /*
139  * Enable/disable additional weight factor for each metaslab.
140  */
141 boolean_t metaslab_weight_factor_enable = B_FALSE;
142 
143 
144 /*
145  * ==========================================================================
146  * Metaslab classes
147  * ==========================================================================
148  */
149 metaslab_class_t *
150 metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
151 {
152 	metaslab_class_t *mc;
153 
154 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
155 
156 	mc->mc_spa = spa;
157 	mc->mc_rotor = NULL;
158 	mc->mc_ops = ops;
159 
160 	return (mc);
161 }
162 
163 void
164 metaslab_class_destroy(metaslab_class_t *mc)
165 {
166 	ASSERT(mc->mc_rotor == NULL);
167 	ASSERT(mc->mc_alloc == 0);
168 	ASSERT(mc->mc_deferred == 0);
169 	ASSERT(mc->mc_space == 0);
170 	ASSERT(mc->mc_dspace == 0);
171 
172 	kmem_free(mc, sizeof (metaslab_class_t));
173 }
174 
175 int
176 metaslab_class_validate(metaslab_class_t *mc)
177 {
178 	metaslab_group_t *mg;
179 	vdev_t *vd;
180 
181 	/*
182 	 * Must hold one of the spa_config locks.
183 	 */
184 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
185 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
186 
187 	if ((mg = mc->mc_rotor) == NULL)
188 		return (0);
189 
190 	do {
191 		vd = mg->mg_vd;
192 		ASSERT(vd->vdev_mg != NULL);
193 		ASSERT3P(vd->vdev_top, ==, vd);
194 		ASSERT3P(mg->mg_class, ==, mc);
195 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
196 	} while ((mg = mg->mg_next) != mc->mc_rotor);
197 
198 	return (0);
199 }
200 
201 void
202 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
203     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
204 {
205 	atomic_add_64(&mc->mc_alloc, alloc_delta);
206 	atomic_add_64(&mc->mc_deferred, defer_delta);
207 	atomic_add_64(&mc->mc_space, space_delta);
208 	atomic_add_64(&mc->mc_dspace, dspace_delta);
209 }
210 
211 uint64_t
212 metaslab_class_get_alloc(metaslab_class_t *mc)
213 {
214 	return (mc->mc_alloc);
215 }
216 
217 uint64_t
218 metaslab_class_get_deferred(metaslab_class_t *mc)
219 {
220 	return (mc->mc_deferred);
221 }
222 
223 uint64_t
224 metaslab_class_get_space(metaslab_class_t *mc)
225 {
226 	return (mc->mc_space);
227 }
228 
229 uint64_t
230 metaslab_class_get_dspace(metaslab_class_t *mc)
231 {
232 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
233 }
234 
235 /*
236  * ==========================================================================
237  * Metaslab groups
238  * ==========================================================================
239  */
240 static int
241 metaslab_compare(const void *x1, const void *x2)
242 {
243 	const metaslab_t *m1 = x1;
244 	const metaslab_t *m2 = x2;
245 
246 	if (m1->ms_weight < m2->ms_weight)
247 		return (1);
248 	if (m1->ms_weight > m2->ms_weight)
249 		return (-1);
250 
251 	/*
252 	 * If the weights are identical, use the offset to force uniqueness.
253 	 */
254 	if (m1->ms_start < m2->ms_start)
255 		return (-1);
256 	if (m1->ms_start > m2->ms_start)
257 		return (1);
258 
259 	ASSERT3P(m1, ==, m2);
260 
261 	return (0);
262 }
263 
264 /*
265  * Update the allocatable flag and the metaslab group's capacity.
266  * The allocatable flag is set to true if the capacity is below
267  * the zfs_mg_noalloc_threshold. If a metaslab group transitions
268  * from allocatable to non-allocatable or vice versa then the metaslab
269  * group's class is updated to reflect the transition.
270  */
271 static void
272 metaslab_group_alloc_update(metaslab_group_t *mg)
273 {
274 	vdev_t *vd = mg->mg_vd;
275 	metaslab_class_t *mc = mg->mg_class;
276 	vdev_stat_t *vs = &vd->vdev_stat;
277 	boolean_t was_allocatable;
278 
279 	ASSERT(vd == vd->vdev_top);
280 
281 	mutex_enter(&mg->mg_lock);
282 	was_allocatable = mg->mg_allocatable;
283 
284 	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
285 	    (vs->vs_space + 1);
286 
287 	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
288 
289 	/*
290 	 * The mc_alloc_groups maintains a count of the number of
291 	 * groups in this metaslab class that are still above the
292 	 * zfs_mg_noalloc_threshold. This is used by the allocating
293 	 * threads to determine if they should avoid allocations to
294 	 * a given group. The allocator will avoid allocations to a group
295 	 * if that group has reached or is below the zfs_mg_noalloc_threshold
296 	 * and there are still other groups that are above the threshold.
297 	 * When a group transitions from allocatable to non-allocatable or
298 	 * vice versa we update the metaslab class to reflect that change.
299 	 * When the mc_alloc_groups value drops to 0 that means that all
300 	 * groups have reached the zfs_mg_noalloc_threshold making all groups
301 	 * eligible for allocations. This effectively means that all devices
302 	 * are balanced again.
303 	 */
304 	if (was_allocatable && !mg->mg_allocatable)
305 		mc->mc_alloc_groups--;
306 	else if (!was_allocatable && mg->mg_allocatable)
307 		mc->mc_alloc_groups++;
308 	mutex_exit(&mg->mg_lock);
309 }
310 
311 metaslab_group_t *
312 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
313 {
314 	metaslab_group_t *mg;
315 
316 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
317 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
318 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
319 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
320 	mg->mg_vd = vd;
321 	mg->mg_class = mc;
322 	mg->mg_activation_count = 0;
323 
324 	mg->mg_taskq = taskq_create("metaslab_group_tasksq", metaslab_load_pct,
325 	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
326 
327 	return (mg);
328 }
329 
330 void
331 metaslab_group_destroy(metaslab_group_t *mg)
332 {
333 	ASSERT(mg->mg_prev == NULL);
334 	ASSERT(mg->mg_next == NULL);
335 	/*
336 	 * We may have gone below zero with the activation count
337 	 * either because we never activated in the first place or
338 	 * because we're done, and possibly removing the vdev.
339 	 */
340 	ASSERT(mg->mg_activation_count <= 0);
341 
342 	avl_destroy(&mg->mg_metaslab_tree);
343 	mutex_destroy(&mg->mg_lock);
344 	kmem_free(mg, sizeof (metaslab_group_t));
345 }
346 
347 void
348 metaslab_group_activate(metaslab_group_t *mg)
349 {
350 	metaslab_class_t *mc = mg->mg_class;
351 	metaslab_group_t *mgprev, *mgnext;
352 
353 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
354 
355 	ASSERT(mc->mc_rotor != mg);
356 	ASSERT(mg->mg_prev == NULL);
357 	ASSERT(mg->mg_next == NULL);
358 	ASSERT(mg->mg_activation_count <= 0);
359 
360 	if (++mg->mg_activation_count <= 0)
361 		return;
362 
363 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
364 	metaslab_group_alloc_update(mg);
365 
366 	if ((mgprev = mc->mc_rotor) == NULL) {
367 		mg->mg_prev = mg;
368 		mg->mg_next = mg;
369 	} else {
370 		mgnext = mgprev->mg_next;
371 		mg->mg_prev = mgprev;
372 		mg->mg_next = mgnext;
373 		mgprev->mg_next = mg;
374 		mgnext->mg_prev = mg;
375 	}
376 	mc->mc_rotor = mg;
377 }
378 
379 void
380 metaslab_group_passivate(metaslab_group_t *mg)
381 {
382 	metaslab_class_t *mc = mg->mg_class;
383 	metaslab_group_t *mgprev, *mgnext;
384 
385 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
386 
387 	if (--mg->mg_activation_count != 0) {
388 		ASSERT(mc->mc_rotor != mg);
389 		ASSERT(mg->mg_prev == NULL);
390 		ASSERT(mg->mg_next == NULL);
391 		ASSERT(mg->mg_activation_count < 0);
392 		return;
393 	}
394 
395 	taskq_wait(mg->mg_taskq);
396 
397 	mgprev = mg->mg_prev;
398 	mgnext = mg->mg_next;
399 
400 	if (mg == mgnext) {
401 		mc->mc_rotor = NULL;
402 	} else {
403 		mc->mc_rotor = mgnext;
404 		mgprev->mg_next = mgnext;
405 		mgnext->mg_prev = mgprev;
406 	}
407 
408 	mg->mg_prev = NULL;
409 	mg->mg_next = NULL;
410 }
411 
412 static void
413 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
414 {
415 	mutex_enter(&mg->mg_lock);
416 	ASSERT(msp->ms_group == NULL);
417 	msp->ms_group = mg;
418 	msp->ms_weight = 0;
419 	avl_add(&mg->mg_metaslab_tree, msp);
420 	mutex_exit(&mg->mg_lock);
421 }
422 
423 static void
424 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
425 {
426 	mutex_enter(&mg->mg_lock);
427 	ASSERT(msp->ms_group == mg);
428 	avl_remove(&mg->mg_metaslab_tree, msp);
429 	msp->ms_group = NULL;
430 	mutex_exit(&mg->mg_lock);
431 }
432 
433 static void
434 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
435 {
436 	/*
437 	 * Although in principle the weight can be any value, in
438 	 * practice we do not use values in the range [1, 510].
439 	 */
440 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
441 	ASSERT(MUTEX_HELD(&msp->ms_lock));
442 
443 	mutex_enter(&mg->mg_lock);
444 	ASSERT(msp->ms_group == mg);
445 	avl_remove(&mg->mg_metaslab_tree, msp);
446 	msp->ms_weight = weight;
447 	avl_add(&mg->mg_metaslab_tree, msp);
448 	mutex_exit(&mg->mg_lock);
449 }
450 
451 /*
452  * Determine if a given metaslab group should skip allocations. A metaslab
453  * group should avoid allocations if its used capacity has crossed the
454  * zfs_mg_noalloc_threshold and there is at least one metaslab group
455  * that can still handle allocations.
456  */
457 static boolean_t
458 metaslab_group_allocatable(metaslab_group_t *mg)
459 {
460 	vdev_t *vd = mg->mg_vd;
461 	spa_t *spa = vd->vdev_spa;
462 	metaslab_class_t *mc = mg->mg_class;
463 
464 	/*
465 	 * A metaslab group is considered allocatable if its free capacity
466 	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
467 	 * associated with a slog, or there are no other metaslab groups
468 	 * with free capacity greater than zfs_mg_noalloc_threshold.
469 	 */
470 	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
471 	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
472 }
473 
474 /*
475  * ==========================================================================
476  * Range tree callbacks
477  * ==========================================================================
478  */
479 
480 /*
481  * Comparison function for the private size-ordered tree. Tree is sorted
482  * by size, larger sizes at the end of the tree.
483  */
484 static int
485 metaslab_rangesize_compare(const void *x1, const void *x2)
486 {
487 	const range_seg_t *r1 = x1;
488 	const range_seg_t *r2 = x2;
489 	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
490 	uint64_t rs_size2 = r2->rs_end - r2->rs_start;
491 
492 	if (rs_size1 < rs_size2)
493 		return (-1);
494 	if (rs_size1 > rs_size2)
495 		return (1);
496 
497 	if (r1->rs_start < r2->rs_start)
498 		return (-1);
499 
500 	if (r1->rs_start > r2->rs_start)
501 		return (1);
502 
503 	return (0);
504 }
505 
506 /*
507  * Create any block allocator specific components. The current allocators
508  * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
509  */
510 static void
511 metaslab_rt_create(range_tree_t *rt, void *arg)
512 {
513 	metaslab_t *msp = arg;
514 
515 	ASSERT3P(rt->rt_arg, ==, msp);
516 	ASSERT(msp->ms_tree == NULL);
517 
518 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
519 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
520 }
521 
522 /*
523  * Destroy the block allocator specific components.
524  */
525 static void
526 metaslab_rt_destroy(range_tree_t *rt, void *arg)
527 {
528 	metaslab_t *msp = arg;
529 
530 	ASSERT3P(rt->rt_arg, ==, msp);
531 	ASSERT3P(msp->ms_tree, ==, rt);
532 	ASSERT0(avl_numnodes(&msp->ms_size_tree));
533 
534 	avl_destroy(&msp->ms_size_tree);
535 }
536 
537 static void
538 metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
539 {
540 	metaslab_t *msp = arg;
541 
542 	ASSERT3P(rt->rt_arg, ==, msp);
543 	ASSERT3P(msp->ms_tree, ==, rt);
544 	VERIFY(!msp->ms_condensing);
545 	avl_add(&msp->ms_size_tree, rs);
546 }
547 
548 static void
549 metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
550 {
551 	metaslab_t *msp = arg;
552 
553 	ASSERT3P(rt->rt_arg, ==, msp);
554 	ASSERT3P(msp->ms_tree, ==, rt);
555 	VERIFY(!msp->ms_condensing);
556 	avl_remove(&msp->ms_size_tree, rs);
557 }
558 
559 static void
560 metaslab_rt_vacate(range_tree_t *rt, void *arg)
561 {
562 	metaslab_t *msp = arg;
563 
564 	ASSERT3P(rt->rt_arg, ==, msp);
565 	ASSERT3P(msp->ms_tree, ==, rt);
566 
567 	/*
568 	 * Normally one would walk the tree freeing nodes along the way.
569 	 * Since the nodes are shared with the range trees we can avoid
570 	 * walking all nodes and just reinitialize the avl tree. The nodes
571 	 * will be freed by the range tree, so we don't want to free them here.
572 	 */
573 	avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
574 	    sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
575 }
576 
577 static range_tree_ops_t metaslab_rt_ops = {
578 	metaslab_rt_create,
579 	metaslab_rt_destroy,
580 	metaslab_rt_add,
581 	metaslab_rt_remove,
582 	metaslab_rt_vacate
583 };
584 
585 /*
586  * ==========================================================================
587  * Metaslab block operations
588  * ==========================================================================
589  */
590 
591 /*
592  * Return the maximum contiguous segment within the metaslab.
593  */
594 uint64_t
595 metaslab_block_maxsize(metaslab_t *msp)
596 {
597 	avl_tree_t *t = &msp->ms_size_tree;
598 	range_seg_t *rs;
599 
600 	if (t == NULL || (rs = avl_last(t)) == NULL)
601 		return (0ULL);
602 
603 	return (rs->rs_end - rs->rs_start);
604 }
605 
606 uint64_t
607 metaslab_block_alloc(metaslab_t *msp, uint64_t size)
608 {
609 	uint64_t start;
610 	range_tree_t *rt = msp->ms_tree;
611 
612 	VERIFY(!msp->ms_condensing);
613 
614 	start = msp->ms_ops->msop_alloc(msp, size);
615 	if (start != -1ULL) {
616 		vdev_t *vd = msp->ms_group->mg_vd;
617 
618 		VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
619 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
620 		VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
621 		range_tree_remove(rt, start, size);
622 	}
623 	return (start);
624 }
625 
626 /*
627  * ==========================================================================
628  * Common allocator routines
629  * ==========================================================================
630  */
631 
632 /*
633  * This is a helper function that can be used by the allocator to find
634  * a suitable block to allocate. This will search the specified AVL
635  * tree looking for a block that matches the specified criteria.
636  */
637 static uint64_t
638 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
639     uint64_t align)
640 {
641 	range_seg_t *rs, rsearch;
642 	avl_index_t where;
643 
644 	rsearch.rs_start = *cursor;
645 	rsearch.rs_end = *cursor + size;
646 
647 	rs = avl_find(t, &rsearch, &where);
648 	if (rs == NULL)
649 		rs = avl_nearest(t, where, AVL_AFTER);
650 
651 	while (rs != NULL) {
652 		uint64_t offset = P2ROUNDUP(rs->rs_start, align);
653 
654 		if (offset + size <= rs->rs_end) {
655 			*cursor = offset + size;
656 			return (offset);
657 		}
658 		rs = AVL_NEXT(t, rs);
659 	}
660 
661 	/*
662 	 * If we know we've searched the whole map (*cursor == 0), give up.
663 	 * Otherwise, reset the cursor to the beginning and try again.
664 	 */
665 	if (*cursor == 0)
666 		return (-1ULL);
667 
668 	*cursor = 0;
669 	return (metaslab_block_picker(t, cursor, size, align));
670 }
671 
672 /*
673  * ==========================================================================
674  * The first-fit block allocator
675  * ==========================================================================
676  */
677 static uint64_t
678 metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
679 {
680 	/*
681 	 * Find the largest power of 2 block size that evenly divides the
682 	 * requested size. This is used to try to allocate blocks with similar
683 	 * alignment from the same area of the metaslab (i.e. same cursor
684 	 * bucket) but it does not guarantee that other allocations sizes
685 	 * may exist in the same region.
686 	 */
687 	uint64_t align = size & -size;
688 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
689 	avl_tree_t *t = &msp->ms_tree->rt_root;
690 
691 	return (metaslab_block_picker(t, cursor, size, align));
692 }
693 
694 /* ARGSUSED */
695 static boolean_t
696 metaslab_ff_fragmented(metaslab_t *msp)
697 {
698 	return (B_TRUE);
699 }
700 
701 static metaslab_ops_t metaslab_ff_ops = {
702 	metaslab_ff_alloc,
703 	metaslab_ff_fragmented
704 };
705 
706 /*
707  * ==========================================================================
708  * Dynamic block allocator -
709  * Uses the first fit allocation scheme until space get low and then
710  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
711  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
712  * ==========================================================================
713  */
714 static uint64_t
715 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
716 {
717 	/*
718 	 * Find the largest power of 2 block size that evenly divides the
719 	 * requested size. This is used to try to allocate blocks with similar
720 	 * alignment from the same area of the metaslab (i.e. same cursor
721 	 * bucket) but it does not guarantee that other allocations sizes
722 	 * may exist in the same region.
723 	 */
724 	uint64_t align = size & -size;
725 	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
726 	range_tree_t *rt = msp->ms_tree;
727 	avl_tree_t *t = &rt->rt_root;
728 	uint64_t max_size = metaslab_block_maxsize(msp);
729 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
730 
731 	ASSERT(MUTEX_HELD(&msp->ms_lock));
732 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
733 
734 	if (max_size < size)
735 		return (-1ULL);
736 
737 	/*
738 	 * If we're running low on space switch to using the size
739 	 * sorted AVL tree (best-fit).
740 	 */
741 	if (max_size < metaslab_df_alloc_threshold ||
742 	    free_pct < metaslab_df_free_pct) {
743 		t = &msp->ms_size_tree;
744 		*cursor = 0;
745 	}
746 
747 	return (metaslab_block_picker(t, cursor, size, 1ULL));
748 }
749 
750 static boolean_t
751 metaslab_df_fragmented(metaslab_t *msp)
752 {
753 	range_tree_t *rt = msp->ms_tree;
754 	uint64_t max_size = metaslab_block_maxsize(msp);
755 	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
756 
757 	if (max_size >= metaslab_df_alloc_threshold &&
758 	    free_pct >= metaslab_df_free_pct)
759 		return (B_FALSE);
760 
761 	return (B_TRUE);
762 }
763 
764 static metaslab_ops_t metaslab_df_ops = {
765 	metaslab_df_alloc,
766 	metaslab_df_fragmented
767 };
768 
769 /*
770  * ==========================================================================
771  * Cursor fit block allocator -
772  * Select the largest region in the metaslab, set the cursor to the beginning
773  * of the range and the cursor_end to the end of the range. As allocations
774  * are made advance the cursor. Continue allocating from the cursor until
775  * the range is exhausted and then find a new range.
776  * ==========================================================================
777  */
778 static uint64_t
779 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
780 {
781 	range_tree_t *rt = msp->ms_tree;
782 	avl_tree_t *t = &msp->ms_size_tree;
783 	uint64_t *cursor = &msp->ms_lbas[0];
784 	uint64_t *cursor_end = &msp->ms_lbas[1];
785 	uint64_t offset = 0;
786 
787 	ASSERT(MUTEX_HELD(&msp->ms_lock));
788 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
789 
790 	ASSERT3U(*cursor_end, >=, *cursor);
791 
792 	if ((*cursor + size) > *cursor_end) {
793 		range_seg_t *rs;
794 
795 		rs = avl_last(&msp->ms_size_tree);
796 		if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
797 			return (-1ULL);
798 
799 		*cursor = rs->rs_start;
800 		*cursor_end = rs->rs_end;
801 	}
802 
803 	offset = *cursor;
804 	*cursor += size;
805 
806 	return (offset);
807 }
808 
809 static boolean_t
810 metaslab_cf_fragmented(metaslab_t *msp)
811 {
812 	return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size);
813 }
814 
815 static metaslab_ops_t metaslab_cf_ops = {
816 	metaslab_cf_alloc,
817 	metaslab_cf_fragmented
818 };
819 
820 /*
821  * ==========================================================================
822  * New dynamic fit allocator -
823  * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
824  * contiguous blocks. If no region is found then just use the largest segment
825  * that remains.
826  * ==========================================================================
827  */
828 
829 /*
830  * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
831  * to request from the allocator.
832  */
833 uint64_t metaslab_ndf_clump_shift = 4;
834 
835 static uint64_t
836 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
837 {
838 	avl_tree_t *t = &msp->ms_tree->rt_root;
839 	avl_index_t where;
840 	range_seg_t *rs, rsearch;
841 	uint64_t hbit = highbit64(size);
842 	uint64_t *cursor = &msp->ms_lbas[hbit - 1];
843 	uint64_t max_size = metaslab_block_maxsize(msp);
844 
845 	ASSERT(MUTEX_HELD(&msp->ms_lock));
846 	ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
847 
848 	if (max_size < size)
849 		return (-1ULL);
850 
851 	rsearch.rs_start = *cursor;
852 	rsearch.rs_end = *cursor + size;
853 
854 	rs = avl_find(t, &rsearch, &where);
855 	if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
856 		t = &msp->ms_size_tree;
857 
858 		rsearch.rs_start = 0;
859 		rsearch.rs_end = MIN(max_size,
860 		    1ULL << (hbit + metaslab_ndf_clump_shift));
861 		rs = avl_find(t, &rsearch, &where);
862 		if (rs == NULL)
863 			rs = avl_nearest(t, where, AVL_AFTER);
864 		ASSERT(rs != NULL);
865 	}
866 
867 	if ((rs->rs_end - rs->rs_start) >= size) {
868 		*cursor = rs->rs_start + size;
869 		return (rs->rs_start);
870 	}
871 	return (-1ULL);
872 }
873 
874 static boolean_t
875 metaslab_ndf_fragmented(metaslab_t *msp)
876 {
877 	return (metaslab_block_maxsize(msp) <=
878 	    (metaslab_min_alloc_size << metaslab_ndf_clump_shift));
879 }
880 
881 static metaslab_ops_t metaslab_ndf_ops = {
882 	metaslab_ndf_alloc,
883 	metaslab_ndf_fragmented
884 };
885 
886 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
887 
888 /*
889  * ==========================================================================
890  * Metaslabs
891  * ==========================================================================
892  */
893 
894 /*
895  * Wait for any in-progress metaslab loads to complete.
896  */
897 void
898 metaslab_load_wait(metaslab_t *msp)
899 {
900 	ASSERT(MUTEX_HELD(&msp->ms_lock));
901 
902 	while (msp->ms_loading) {
903 		ASSERT(!msp->ms_loaded);
904 		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
905 	}
906 }
907 
908 int
909 metaslab_load(metaslab_t *msp)
910 {
911 	int error = 0;
912 
913 	ASSERT(MUTEX_HELD(&msp->ms_lock));
914 	ASSERT(!msp->ms_loaded);
915 	ASSERT(!msp->ms_loading);
916 
917 	msp->ms_loading = B_TRUE;
918 
919 	/*
920 	 * If the space map has not been allocated yet, then treat
921 	 * all the space in the metaslab as free and add it to the
922 	 * ms_tree.
923 	 */
924 	if (msp->ms_sm != NULL)
925 		error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
926 	else
927 		range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
928 
929 	msp->ms_loaded = (error == 0);
930 	msp->ms_loading = B_FALSE;
931 
932 	if (msp->ms_loaded) {
933 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
934 			range_tree_walk(msp->ms_defertree[t],
935 			    range_tree_remove, msp->ms_tree);
936 		}
937 	}
938 	cv_broadcast(&msp->ms_load_cv);
939 	return (error);
940 }
941 
942 void
943 metaslab_unload(metaslab_t *msp)
944 {
945 	ASSERT(MUTEX_HELD(&msp->ms_lock));
946 	range_tree_vacate(msp->ms_tree, NULL, NULL);
947 	msp->ms_loaded = B_FALSE;
948 	msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
949 }
950 
951 metaslab_t *
952 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg)
953 {
954 	vdev_t *vd = mg->mg_vd;
955 	objset_t *mos = vd->vdev_spa->spa_meta_objset;
956 	metaslab_t *msp;
957 
958 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
959 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
960 	cv_init(&msp->ms_load_cv, NULL, CV_DEFAULT, NULL);
961 	msp->ms_id = id;
962 	msp->ms_start = id << vd->vdev_ms_shift;
963 	msp->ms_size = 1ULL << vd->vdev_ms_shift;
964 
965 	/*
966 	 * We only open space map objects that already exist. All others
967 	 * will be opened when we finally allocate an object for it.
968 	 */
969 	if (object != 0) {
970 		VERIFY0(space_map_open(&msp->ms_sm, mos, object, msp->ms_start,
971 		    msp->ms_size, vd->vdev_ashift, &msp->ms_lock));
972 		ASSERT(msp->ms_sm != NULL);
973 	}
974 
975 	/*
976 	 * We create the main range tree here, but we don't create the
977 	 * alloctree and freetree until metaslab_sync_done().  This serves
978 	 * two purposes: it allows metaslab_sync_done() to detect the
979 	 * addition of new space; and for debugging, it ensures that we'd
980 	 * data fault on any attempt to use this metaslab before it's ready.
981 	 */
982 	msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock);
983 	metaslab_group_add(mg, msp);
984 
985 	msp->ms_ops = mg->mg_class->mc_ops;
986 
987 	/*
988 	 * If we're opening an existing pool (txg == 0) or creating
989 	 * a new one (txg == TXG_INITIAL), all space is available now.
990 	 * If we're adding space to an existing pool, the new space
991 	 * does not become available until after this txg has synced.
992 	 */
993 	if (txg <= TXG_INITIAL)
994 		metaslab_sync_done(msp, 0);
995 
996 	/*
997 	 * If metaslab_debug_load is set and we're initializing a metaslab
998 	 * that has an allocated space_map object then load the its space
999 	 * map so that can verify frees.
1000 	 */
1001 	if (metaslab_debug_load && msp->ms_sm != NULL) {
1002 		mutex_enter(&msp->ms_lock);
1003 		VERIFY0(metaslab_load(msp));
1004 		mutex_exit(&msp->ms_lock);
1005 	}
1006 
1007 	if (txg != 0) {
1008 		vdev_dirty(vd, 0, NULL, txg);
1009 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
1010 	}
1011 
1012 	return (msp);
1013 }
1014 
1015 void
1016 metaslab_fini(metaslab_t *msp)
1017 {
1018 	metaslab_group_t *mg = msp->ms_group;
1019 
1020 	metaslab_group_remove(mg, msp);
1021 
1022 	mutex_enter(&msp->ms_lock);
1023 
1024 	VERIFY(msp->ms_group == NULL);
1025 	vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
1026 	    0, -msp->ms_size);
1027 	space_map_close(msp->ms_sm);
1028 
1029 	metaslab_unload(msp);
1030 	range_tree_destroy(msp->ms_tree);
1031 
1032 	for (int t = 0; t < TXG_SIZE; t++) {
1033 		range_tree_destroy(msp->ms_alloctree[t]);
1034 		range_tree_destroy(msp->ms_freetree[t]);
1035 	}
1036 
1037 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1038 		range_tree_destroy(msp->ms_defertree[t]);
1039 	}
1040 
1041 	ASSERT0(msp->ms_deferspace);
1042 
1043 	mutex_exit(&msp->ms_lock);
1044 	cv_destroy(&msp->ms_load_cv);
1045 	mutex_destroy(&msp->ms_lock);
1046 
1047 	kmem_free(msp, sizeof (metaslab_t));
1048 }
1049 
1050 /*
1051  * Apply a weighting factor based on the histogram information for this
1052  * metaslab. The current weighting factor is somewhat arbitrary and requires
1053  * additional investigation. The implementation provides a measure of
1054  * "weighted" free space and gives a higher weighting for larger contiguous
1055  * regions. The weighting factor is determined by counting the number of
1056  * sm_shift sectors that exist in each region represented by the histogram.
1057  * That value is then multiplied by the power of 2 exponent and the sm_shift
1058  * value.
1059  *
1060  * For example, assume the 2^21 histogram bucket has 4 2MB regions and the
1061  * metaslab has an sm_shift value of 9 (512B):
1062  *
1063  * 1) calculate the number of sm_shift sectors in the region:
1064  *	2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384
1065  * 2) multiply by the power of 2 exponent and the sm_shift value:
1066  *	16384 * 21 * 9 = 3096576
1067  * This value will be added to the weighting of the metaslab.
1068  */
1069 static uint64_t
1070 metaslab_weight_factor(metaslab_t *msp)
1071 {
1072 	uint64_t factor = 0;
1073 	uint64_t sectors;
1074 	int i;
1075 
1076 	/*
1077 	 * A null space map means that the entire metaslab is free,
1078 	 * calculate a weight factor that spans the entire size of the
1079 	 * metaslab.
1080 	 */
1081 	if (msp->ms_sm == NULL) {
1082 		vdev_t *vd = msp->ms_group->mg_vd;
1083 
1084 		i = highbit64(msp->ms_size) - 1;
1085 		sectors = msp->ms_size >> vd->vdev_ashift;
1086 		return (sectors * i * vd->vdev_ashift);
1087 	}
1088 
1089 	if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
1090 		return (0);
1091 
1092 	for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) {
1093 		if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
1094 			continue;
1095 
1096 		/*
1097 		 * Determine the number of sm_shift sectors in the region
1098 		 * indicated by the histogram. For example, given an
1099 		 * sm_shift value of 9 (512 bytes) and i = 4 then we know
1100 		 * that we're looking at an 8K region in the histogram
1101 		 * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the
1102 		 * number of sm_shift sectors (512 bytes in this example),
1103 		 * we would take 8192 / 512 = 16. Since the histogram
1104 		 * is offset by sm_shift we can simply use the value of
1105 		 * of i to calculate this (i.e. 2^i = 16 where i = 4).
1106 		 */
1107 		sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i;
1108 		factor += (i + msp->ms_sm->sm_shift) * sectors;
1109 	}
1110 	return (factor * msp->ms_sm->sm_shift);
1111 }
1112 
1113 static uint64_t
1114 metaslab_weight(metaslab_t *msp)
1115 {
1116 	metaslab_group_t *mg = msp->ms_group;
1117 	vdev_t *vd = mg->mg_vd;
1118 	uint64_t weight, space;
1119 
1120 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1121 
1122 	/*
1123 	 * This vdev is in the process of being removed so there is nothing
1124 	 * for us to do here.
1125 	 */
1126 	if (vd->vdev_removing) {
1127 		ASSERT0(space_map_allocated(msp->ms_sm));
1128 		ASSERT0(vd->vdev_ms_shift);
1129 		return (0);
1130 	}
1131 
1132 	/*
1133 	 * The baseline weight is the metaslab's free space.
1134 	 */
1135 	space = msp->ms_size - space_map_allocated(msp->ms_sm);
1136 	weight = space;
1137 
1138 	/*
1139 	 * Modern disks have uniform bit density and constant angular velocity.
1140 	 * Therefore, the outer recording zones are faster (higher bandwidth)
1141 	 * than the inner zones by the ratio of outer to inner track diameter,
1142 	 * which is typically around 2:1.  We account for this by assigning
1143 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
1144 	 * In effect, this means that we'll select the metaslab with the most
1145 	 * free bandwidth rather than simply the one with the most free space.
1146 	 */
1147 	weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
1148 	ASSERT(weight >= space && weight <= 2 * space);
1149 
1150 	msp->ms_factor = metaslab_weight_factor(msp);
1151 	if (metaslab_weight_factor_enable)
1152 		weight += msp->ms_factor;
1153 
1154 	if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) {
1155 		/*
1156 		 * If this metaslab is one we're actively using, adjust its
1157 		 * weight to make it preferable to any inactive metaslab so
1158 		 * we'll polish it off.
1159 		 */
1160 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
1161 	}
1162 
1163 	return (weight);
1164 }
1165 
1166 static int
1167 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
1168 {
1169 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1170 
1171 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1172 		metaslab_load_wait(msp);
1173 		if (!msp->ms_loaded) {
1174 			int error = metaslab_load(msp);
1175 			if (error) {
1176 				metaslab_group_sort(msp->ms_group, msp, 0);
1177 				return (error);
1178 			}
1179 		}
1180 
1181 		metaslab_group_sort(msp->ms_group, msp,
1182 		    msp->ms_weight | activation_weight);
1183 	}
1184 	ASSERT(msp->ms_loaded);
1185 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
1186 
1187 	return (0);
1188 }
1189 
1190 static void
1191 metaslab_passivate(metaslab_t *msp, uint64_t size)
1192 {
1193 	/*
1194 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
1195 	 * this metaslab again.  In that case, it had better be empty,
1196 	 * or we would be leaving space on the table.
1197 	 */
1198 	ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
1199 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
1200 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
1201 }
1202 
1203 static void
1204 metaslab_preload(void *arg)
1205 {
1206 	metaslab_t *msp = arg;
1207 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1208 
1209 	mutex_enter(&msp->ms_lock);
1210 	metaslab_load_wait(msp);
1211 	if (!msp->ms_loaded)
1212 		(void) metaslab_load(msp);
1213 
1214 	/*
1215 	 * Set the ms_access_txg value so that we don't unload it right away.
1216 	 */
1217 	msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
1218 	mutex_exit(&msp->ms_lock);
1219 }
1220 
1221 static void
1222 metaslab_group_preload(metaslab_group_t *mg)
1223 {
1224 	spa_t *spa = mg->mg_vd->vdev_spa;
1225 	metaslab_t *msp;
1226 	avl_tree_t *t = &mg->mg_metaslab_tree;
1227 	int m = 0;
1228 
1229 	if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
1230 		taskq_wait(mg->mg_taskq);
1231 		return;
1232 	}
1233 	mutex_enter(&mg->mg_lock);
1234 
1235 	/*
1236 	 * Prefetch the next potential metaslabs
1237 	 */
1238 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
1239 
1240 		/* If we have reached our preload limit then we're done */
1241 		if (++m > metaslab_preload_limit)
1242 			break;
1243 
1244 		VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
1245 		    msp, TQ_SLEEP) != NULL);
1246 	}
1247 	mutex_exit(&mg->mg_lock);
1248 }
1249 
1250 /*
1251  * Determine if the space map's on-disk footprint is past our tolerance
1252  * for inefficiency. We would like to use the following criteria to make
1253  * our decision:
1254  *
1255  * 1. The size of the space map object should not dramatically increase as a
1256  * result of writing out the free space range tree.
1257  *
1258  * 2. The minimal on-disk space map representation is zfs_condense_pct/100
1259  * times the size than the free space range tree representation
1260  * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB).
1261  *
1262  * Checking the first condition is tricky since we don't want to walk
1263  * the entire AVL tree calculating the estimated on-disk size. Instead we
1264  * use the size-ordered range tree in the metaslab and calculate the
1265  * size required to write out the largest segment in our free tree. If the
1266  * size required to represent that segment on disk is larger than the space
1267  * map object then we avoid condensing this map.
1268  *
1269  * To determine the second criterion we use a best-case estimate and assume
1270  * each segment can be represented on-disk as a single 64-bit entry. We refer
1271  * to this best-case estimate as the space map's minimal form.
1272  */
1273 static boolean_t
1274 metaslab_should_condense(metaslab_t *msp)
1275 {
1276 	space_map_t *sm = msp->ms_sm;
1277 	range_seg_t *rs;
1278 	uint64_t size, entries, segsz;
1279 
1280 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1281 	ASSERT(msp->ms_loaded);
1282 
1283 	/*
1284 	 * Use the ms_size_tree range tree, which is ordered by size, to
1285 	 * obtain the largest segment in the free tree. If the tree is empty
1286 	 * then we should condense the map.
1287 	 */
1288 	rs = avl_last(&msp->ms_size_tree);
1289 	if (rs == NULL)
1290 		return (B_TRUE);
1291 
1292 	/*
1293 	 * Calculate the number of 64-bit entries this segment would
1294 	 * require when written to disk. If this single segment would be
1295 	 * larger on-disk than the entire current on-disk structure, then
1296 	 * clearly condensing will increase the on-disk structure size.
1297 	 */
1298 	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
1299 	entries = size / (MIN(size, SM_RUN_MAX));
1300 	segsz = entries * sizeof (uint64_t);
1301 
1302 	return (segsz <= space_map_length(msp->ms_sm) &&
1303 	    space_map_length(msp->ms_sm) >= (zfs_condense_pct *
1304 	    sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
1305 }
1306 
1307 /*
1308  * Condense the on-disk space map representation to its minimized form.
1309  * The minimized form consists of a small number of allocations followed by
1310  * the entries of the free range tree.
1311  */
1312 static void
1313 metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
1314 {
1315 	spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1316 	range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
1317 	range_tree_t *condense_tree;
1318 	space_map_t *sm = msp->ms_sm;
1319 
1320 	ASSERT(MUTEX_HELD(&msp->ms_lock));
1321 	ASSERT3U(spa_sync_pass(spa), ==, 1);
1322 	ASSERT(msp->ms_loaded);
1323 
1324 	spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
1325 	    "smp size %llu, segments %lu", txg, msp->ms_id, msp,
1326 	    space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root));
1327 
1328 	/*
1329 	 * Create an range tree that is 100% allocated. We remove segments
1330 	 * that have been freed in this txg, any deferred frees that exist,
1331 	 * and any allocation in the future. Removing segments should be
1332 	 * a relatively inexpensive operation since we expect these trees to
1333 	 * have a small number of nodes.
1334 	 */
1335 	condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
1336 	range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
1337 
1338 	/*
1339 	 * Remove what's been freed in this txg from the condense_tree.
1340 	 * Since we're in sync_pass 1, we know that all the frees from
1341 	 * this txg are in the freetree.
1342 	 */
1343 	range_tree_walk(freetree, range_tree_remove, condense_tree);
1344 
1345 	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1346 		range_tree_walk(msp->ms_defertree[t],
1347 		    range_tree_remove, condense_tree);
1348 	}
1349 
1350 	for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1351 		range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
1352 		    range_tree_remove, condense_tree);
1353 	}
1354 
1355 	/*
1356 	 * We're about to drop the metaslab's lock thus allowing
1357 	 * other consumers to change it's content. Set the
1358 	 * metaslab's ms_condensing flag to ensure that
1359 	 * allocations on this metaslab do not occur while we're
1360 	 * in the middle of committing it to disk. This is only critical
1361 	 * for the ms_tree as all other range trees use per txg
1362 	 * views of their content.
1363 	 */
1364 	msp->ms_condensing = B_TRUE;
1365 
1366 	mutex_exit(&msp->ms_lock);
1367 	space_map_truncate(sm, tx);
1368 	mutex_enter(&msp->ms_lock);
1369 
1370 	/*
1371 	 * While we would ideally like to create a space_map representation
1372 	 * that consists only of allocation records, doing so can be
1373 	 * prohibitively expensive because the in-core free tree can be
1374 	 * large, and therefore computationally expensive to subtract
1375 	 * from the condense_tree. Instead we sync out two trees, a cheap
1376 	 * allocation only tree followed by the in-core free tree. While not
1377 	 * optimal, this is typically close to optimal, and much cheaper to
1378 	 * compute.
1379 	 */
1380 	space_map_write(sm, condense_tree, SM_ALLOC, tx);
1381 	range_tree_vacate(condense_tree, NULL, NULL);
1382 	range_tree_destroy(condense_tree);
1383 
1384 	space_map_write(sm, msp->ms_tree, SM_FREE, tx);
1385 	msp->ms_condensing = B_FALSE;
1386 }
1387 
1388 /*
1389  * Write a metaslab to disk in the context of the specified transaction group.
1390  */
1391 void
1392 metaslab_sync(metaslab_t *msp, uint64_t txg)
1393 {
1394 	metaslab_group_t *mg = msp->ms_group;
1395 	vdev_t *vd = mg->mg_vd;
1396 	spa_t *spa = vd->vdev_spa;
1397 	objset_t *mos = spa_meta_objset(spa);
1398 	range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
1399 	range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
1400 	range_tree_t **freed_tree =
1401 	    &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1402 	dmu_tx_t *tx;
1403 	uint64_t object = space_map_object(msp->ms_sm);
1404 
1405 	ASSERT(!vd->vdev_ishole);
1406 
1407 	/*
1408 	 * This metaslab has just been added so there's no work to do now.
1409 	 */
1410 	if (*freetree == NULL) {
1411 		ASSERT3P(alloctree, ==, NULL);
1412 		return;
1413 	}
1414 
1415 	ASSERT3P(alloctree, !=, NULL);
1416 	ASSERT3P(*freetree, !=, NULL);
1417 	ASSERT3P(*freed_tree, !=, NULL);
1418 
1419 	if (range_tree_space(alloctree) == 0 &&
1420 	    range_tree_space(*freetree) == 0)
1421 		return;
1422 
1423 	/*
1424 	 * The only state that can actually be changing concurrently with
1425 	 * metaslab_sync() is the metaslab's ms_tree.  No other thread can
1426 	 * be modifying this txg's alloctree, freetree, freed_tree, or
1427 	 * space_map_phys_t. Therefore, we only hold ms_lock to satify
1428 	 * space_map ASSERTs. We drop it whenever we call into the DMU,
1429 	 * because the DMU can call down to us (e.g. via zio_free()) at
1430 	 * any time.
1431 	 */
1432 
1433 	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
1434 
1435 	if (msp->ms_sm == NULL) {
1436 		uint64_t new_object;
1437 
1438 		new_object = space_map_alloc(mos, tx);
1439 		VERIFY3U(new_object, !=, 0);
1440 
1441 		VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
1442 		    msp->ms_start, msp->ms_size, vd->vdev_ashift,
1443 		    &msp->ms_lock));
1444 		ASSERT(msp->ms_sm != NULL);
1445 	}
1446 
1447 	mutex_enter(&msp->ms_lock);
1448 
1449 	if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
1450 	    metaslab_should_condense(msp)) {
1451 		metaslab_condense(msp, txg, tx);
1452 	} else {
1453 		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
1454 		space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
1455 	}
1456 
1457 	range_tree_vacate(alloctree, NULL, NULL);
1458 
1459 	if (msp->ms_loaded) {
1460 		/*
1461 		 * When the space map is loaded, we have an accruate
1462 		 * histogram in the range tree. This gives us an opportunity
1463 		 * to bring the space map's histogram up-to-date so we clear
1464 		 * it first before updating it.
1465 		 */
1466 		space_map_histogram_clear(msp->ms_sm);
1467 		space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
1468 	} else {
1469 		/*
1470 		 * Since the space map is not loaded we simply update the
1471 		 * exisiting histogram with what was freed in this txg. This
1472 		 * means that the on-disk histogram may not have an accurate
1473 		 * view of the free space but it's close enough to allow
1474 		 * us to make allocation decisions.
1475 		 */
1476 		space_map_histogram_add(msp->ms_sm, *freetree, tx);
1477 	}
1478 
1479 	/*
1480 	 * For sync pass 1, we avoid traversing this txg's free range tree
1481 	 * and instead will just swap the pointers for freetree and
1482 	 * freed_tree. We can safely do this since the freed_tree is
1483 	 * guaranteed to be empty on the initial pass.
1484 	 */
1485 	if (spa_sync_pass(spa) == 1) {
1486 		range_tree_swap(freetree, freed_tree);
1487 	} else {
1488 		range_tree_vacate(*freetree, range_tree_add, *freed_tree);
1489 	}
1490 
1491 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1492 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1493 
1494 	mutex_exit(&msp->ms_lock);
1495 
1496 	if (object != space_map_object(msp->ms_sm)) {
1497 		object = space_map_object(msp->ms_sm);
1498 		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
1499 		    msp->ms_id, sizeof (uint64_t), &object, tx);
1500 	}
1501 	dmu_tx_commit(tx);
1502 }
1503 
1504 /*
1505  * Called after a transaction group has completely synced to mark
1506  * all of the metaslab's free space as usable.
1507  */
1508 void
1509 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1510 {
1511 	metaslab_group_t *mg = msp->ms_group;
1512 	vdev_t *vd = mg->mg_vd;
1513 	range_tree_t **freed_tree;
1514 	range_tree_t **defer_tree;
1515 	int64_t alloc_delta, defer_delta;
1516 
1517 	ASSERT(!vd->vdev_ishole);
1518 
1519 	mutex_enter(&msp->ms_lock);
1520 
1521 	/*
1522 	 * If this metaslab is just becoming available, initialize its
1523 	 * alloctrees, freetrees, and defertree and add its capacity to
1524 	 * the vdev.
1525 	 */
1526 	if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
1527 		for (int t = 0; t < TXG_SIZE; t++) {
1528 			ASSERT(msp->ms_alloctree[t] == NULL);
1529 			ASSERT(msp->ms_freetree[t] == NULL);
1530 
1531 			msp->ms_alloctree[t] = range_tree_create(NULL, msp,
1532 			    &msp->ms_lock);
1533 			msp->ms_freetree[t] = range_tree_create(NULL, msp,
1534 			    &msp->ms_lock);
1535 		}
1536 
1537 		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1538 			ASSERT(msp->ms_defertree[t] == NULL);
1539 
1540 			msp->ms_defertree[t] = range_tree_create(NULL, msp,
1541 			    &msp->ms_lock);
1542 		}
1543 
1544 		vdev_space_update(vd, 0, 0, msp->ms_size);
1545 	}
1546 
1547 	freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
1548 	defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
1549 
1550 	alloc_delta = space_map_alloc_delta(msp->ms_sm);
1551 	defer_delta = range_tree_space(*freed_tree) -
1552 	    range_tree_space(*defer_tree);
1553 
1554 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1555 
1556 	ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
1557 	ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
1558 
1559 	/*
1560 	 * If there's a metaslab_load() in progress, wait for it to complete
1561 	 * so that we have a consistent view of the in-core space map.
1562 	 */
1563 	metaslab_load_wait(msp);
1564 
1565 	/*
1566 	 * Move the frees from the defer_tree back to the free
1567 	 * range tree (if it's loaded). Swap the freed_tree and the
1568 	 * defer_tree -- this is safe to do because we've just emptied out
1569 	 * the defer_tree.
1570 	 */
1571 	range_tree_vacate(*defer_tree,
1572 	    msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
1573 	range_tree_swap(freed_tree, defer_tree);
1574 
1575 	space_map_update(msp->ms_sm);
1576 
1577 	msp->ms_deferspace += defer_delta;
1578 	ASSERT3S(msp->ms_deferspace, >=, 0);
1579 	ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
1580 	if (msp->ms_deferspace != 0) {
1581 		/*
1582 		 * Keep syncing this metaslab until all deferred frees
1583 		 * are back in circulation.
1584 		 */
1585 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1586 	}
1587 
1588 	if (msp->ms_loaded && msp->ms_access_txg < txg) {
1589 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
1590 			VERIFY0(range_tree_space(
1591 			    msp->ms_alloctree[(txg + t) & TXG_MASK]));
1592 		}
1593 
1594 		if (!metaslab_debug_unload)
1595 			metaslab_unload(msp);
1596 	}
1597 
1598 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1599 	mutex_exit(&msp->ms_lock);
1600 
1601 }
1602 
1603 void
1604 metaslab_sync_reassess(metaslab_group_t *mg)
1605 {
1606 	metaslab_group_alloc_update(mg);
1607 
1608 	/*
1609 	 * Preload the next potential metaslabs
1610 	 */
1611 	metaslab_group_preload(mg);
1612 }
1613 
1614 static uint64_t
1615 metaslab_distance(metaslab_t *msp, dva_t *dva)
1616 {
1617 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1618 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1619 	uint64_t start = msp->ms_id;
1620 
1621 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1622 		return (1ULL << 63);
1623 
1624 	if (offset < start)
1625 		return ((start - offset) << ms_shift);
1626 	if (offset > start)
1627 		return ((offset - start) << ms_shift);
1628 	return (0);
1629 }
1630 
1631 static uint64_t
1632 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1633     uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
1634 {
1635 	spa_t *spa = mg->mg_vd->vdev_spa;
1636 	metaslab_t *msp = NULL;
1637 	uint64_t offset = -1ULL;
1638 	avl_tree_t *t = &mg->mg_metaslab_tree;
1639 	uint64_t activation_weight;
1640 	uint64_t target_distance;
1641 	int i;
1642 
1643 	activation_weight = METASLAB_WEIGHT_PRIMARY;
1644 	for (i = 0; i < d; i++) {
1645 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1646 			activation_weight = METASLAB_WEIGHT_SECONDARY;
1647 			break;
1648 		}
1649 	}
1650 
1651 	for (;;) {
1652 		boolean_t was_active;
1653 
1654 		mutex_enter(&mg->mg_lock);
1655 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1656 			if (msp->ms_weight < asize) {
1657 				spa_dbgmsg(spa, "%s: failed to meet weight "
1658 				    "requirement: vdev %llu, txg %llu, mg %p, "
1659 				    "msp %p, psize %llu, asize %llu, "
1660 				    "weight %llu", spa_name(spa),
1661 				    mg->mg_vd->vdev_id, txg,
1662 				    mg, msp, psize, asize, msp->ms_weight);
1663 				mutex_exit(&mg->mg_lock);
1664 				return (-1ULL);
1665 			}
1666 
1667 			/*
1668 			 * If the selected metaslab is condensing, skip it.
1669 			 */
1670 			if (msp->ms_condensing)
1671 				continue;
1672 
1673 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1674 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1675 				break;
1676 
1677 			target_distance = min_distance +
1678 			    (space_map_allocated(msp->ms_sm) != 0 ? 0 :
1679 			    min_distance >> 1);
1680 
1681 			for (i = 0; i < d; i++)
1682 				if (metaslab_distance(msp, &dva[i]) <
1683 				    target_distance)
1684 					break;
1685 			if (i == d)
1686 				break;
1687 		}
1688 		mutex_exit(&mg->mg_lock);
1689 		if (msp == NULL)
1690 			return (-1ULL);
1691 
1692 		mutex_enter(&msp->ms_lock);
1693 
1694 		/*
1695 		 * Ensure that the metaslab we have selected is still
1696 		 * capable of handling our request. It's possible that
1697 		 * another thread may have changed the weight while we
1698 		 * were blocked on the metaslab lock.
1699 		 */
1700 		if (msp->ms_weight < asize || (was_active &&
1701 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1702 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1703 			mutex_exit(&msp->ms_lock);
1704 			continue;
1705 		}
1706 
1707 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1708 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1709 			metaslab_passivate(msp,
1710 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1711 			mutex_exit(&msp->ms_lock);
1712 			continue;
1713 		}
1714 
1715 		if (metaslab_activate(msp, activation_weight) != 0) {
1716 			mutex_exit(&msp->ms_lock);
1717 			continue;
1718 		}
1719 
1720 		/*
1721 		 * If this metaslab is currently condensing then pick again as
1722 		 * we can't manipulate this metaslab until it's committed
1723 		 * to disk.
1724 		 */
1725 		if (msp->ms_condensing) {
1726 			mutex_exit(&msp->ms_lock);
1727 			continue;
1728 		}
1729 
1730 		if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
1731 			break;
1732 
1733 		metaslab_passivate(msp, metaslab_block_maxsize(msp));
1734 		mutex_exit(&msp->ms_lock);
1735 	}
1736 
1737 	if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
1738 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1739 
1740 	range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
1741 	msp->ms_access_txg = txg + metaslab_unload_delay;
1742 
1743 	mutex_exit(&msp->ms_lock);
1744 
1745 	return (offset);
1746 }
1747 
1748 /*
1749  * Allocate a block for the specified i/o.
1750  */
1751 static int
1752 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1753     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1754 {
1755 	metaslab_group_t *mg, *rotor;
1756 	vdev_t *vd;
1757 	int dshift = 3;
1758 	int all_zero;
1759 	int zio_lock = B_FALSE;
1760 	boolean_t allocatable;
1761 	uint64_t offset = -1ULL;
1762 	uint64_t asize;
1763 	uint64_t distance;
1764 
1765 	ASSERT(!DVA_IS_VALID(&dva[d]));
1766 
1767 	/*
1768 	 * For testing, make some blocks above a certain size be gang blocks.
1769 	 */
1770 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1771 		return (SET_ERROR(ENOSPC));
1772 
1773 	/*
1774 	 * Start at the rotor and loop through all mgs until we find something.
1775 	 * Note that there's no locking on mc_rotor or mc_aliquot because
1776 	 * nothing actually breaks if we miss a few updates -- we just won't
1777 	 * allocate quite as evenly.  It all balances out over time.
1778 	 *
1779 	 * If we are doing ditto or log blocks, try to spread them across
1780 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1781 	 * allocated all of our ditto blocks, then try and spread them out on
1782 	 * that vdev as much as possible.  If it turns out to not be possible,
1783 	 * gradually lower our standards until anything becomes acceptable.
1784 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1785 	 * gives us hope of containing our fault domains to something we're
1786 	 * able to reason about.  Otherwise, any two top-level vdev failures
1787 	 * will guarantee the loss of data.  With consecutive allocation,
1788 	 * only two adjacent top-level vdev failures will result in data loss.
1789 	 *
1790 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1791 	 * ourselves on the same vdev as our gang block header.  That
1792 	 * way, we can hope for locality in vdev_cache, plus it makes our
1793 	 * fault domains something tractable.
1794 	 */
1795 	if (hintdva) {
1796 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1797 
1798 		/*
1799 		 * It's possible the vdev we're using as the hint no
1800 		 * longer exists (i.e. removed). Consult the rotor when
1801 		 * all else fails.
1802 		 */
1803 		if (vd != NULL) {
1804 			mg = vd->vdev_mg;
1805 
1806 			if (flags & METASLAB_HINTBP_AVOID &&
1807 			    mg->mg_next != NULL)
1808 				mg = mg->mg_next;
1809 		} else {
1810 			mg = mc->mc_rotor;
1811 		}
1812 	} else if (d != 0) {
1813 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1814 		mg = vd->vdev_mg->mg_next;
1815 	} else {
1816 		mg = mc->mc_rotor;
1817 	}
1818 
1819 	/*
1820 	 * If the hint put us into the wrong metaslab class, or into a
1821 	 * metaslab group that has been passivated, just follow the rotor.
1822 	 */
1823 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1824 		mg = mc->mc_rotor;
1825 
1826 	rotor = mg;
1827 top:
1828 	all_zero = B_TRUE;
1829 	do {
1830 		ASSERT(mg->mg_activation_count == 1);
1831 
1832 		vd = mg->mg_vd;
1833 
1834 		/*
1835 		 * Don't allocate from faulted devices.
1836 		 */
1837 		if (zio_lock) {
1838 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1839 			allocatable = vdev_allocatable(vd);
1840 			spa_config_exit(spa, SCL_ZIO, FTAG);
1841 		} else {
1842 			allocatable = vdev_allocatable(vd);
1843 		}
1844 
1845 		/*
1846 		 * Determine if the selected metaslab group is eligible
1847 		 * for allocations. If we're ganging or have requested
1848 		 * an allocation for the smallest gang block size
1849 		 * then we don't want to avoid allocating to the this
1850 		 * metaslab group. If we're in this condition we should
1851 		 * try to allocate from any device possible so that we
1852 		 * don't inadvertently return ENOSPC and suspend the pool
1853 		 * even though space is still available.
1854 		 */
1855 		if (allocatable && CAN_FASTGANG(flags) &&
1856 		    psize > SPA_GANGBLOCKSIZE)
1857 			allocatable = metaslab_group_allocatable(mg);
1858 
1859 		if (!allocatable)
1860 			goto next;
1861 
1862 		/*
1863 		 * Avoid writing single-copy data to a failing vdev
1864 		 * unless the user instructs us that it is okay.
1865 		 */
1866 		if ((vd->vdev_stat.vs_write_errors > 0 ||
1867 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1868 		    d == 0 && dshift == 3 &&
1869 		    !(zfs_write_to_degraded && vd->vdev_state ==
1870 		    VDEV_STATE_DEGRADED)) {
1871 			all_zero = B_FALSE;
1872 			goto next;
1873 		}
1874 
1875 		ASSERT(mg->mg_class == mc);
1876 
1877 		distance = vd->vdev_asize >> dshift;
1878 		if (distance <= (1ULL << vd->vdev_ms_shift))
1879 			distance = 0;
1880 		else
1881 			all_zero = B_FALSE;
1882 
1883 		asize = vdev_psize_to_asize(vd, psize);
1884 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1885 
1886 		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1887 		    dva, d);
1888 		if (offset != -1ULL) {
1889 			/*
1890 			 * If we've just selected this metaslab group,
1891 			 * figure out whether the corresponding vdev is
1892 			 * over- or under-used relative to the pool,
1893 			 * and set an allocation bias to even it out.
1894 			 */
1895 			if (mc->mc_aliquot == 0) {
1896 				vdev_stat_t *vs = &vd->vdev_stat;
1897 				int64_t vu, cu;
1898 
1899 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1900 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1901 
1902 				/*
1903 				 * Calculate how much more or less we should
1904 				 * try to allocate from this device during
1905 				 * this iteration around the rotor.
1906 				 * For example, if a device is 80% full
1907 				 * and the pool is 20% full then we should
1908 				 * reduce allocations by 60% on this device.
1909 				 *
1910 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
1911 				 *
1912 				 * This reduces allocations by 307K for this
1913 				 * iteration.
1914 				 */
1915 				mg->mg_bias = ((cu - vu) *
1916 				    (int64_t)mg->mg_aliquot) / 100;
1917 			}
1918 
1919 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1920 			    mg->mg_aliquot + mg->mg_bias) {
1921 				mc->mc_rotor = mg->mg_next;
1922 				mc->mc_aliquot = 0;
1923 			}
1924 
1925 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
1926 			DVA_SET_OFFSET(&dva[d], offset);
1927 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1928 			DVA_SET_ASIZE(&dva[d], asize);
1929 
1930 			return (0);
1931 		}
1932 next:
1933 		mc->mc_rotor = mg->mg_next;
1934 		mc->mc_aliquot = 0;
1935 	} while ((mg = mg->mg_next) != rotor);
1936 
1937 	if (!all_zero) {
1938 		dshift++;
1939 		ASSERT(dshift < 64);
1940 		goto top;
1941 	}
1942 
1943 	if (!allocatable && !zio_lock) {
1944 		dshift = 3;
1945 		zio_lock = B_TRUE;
1946 		goto top;
1947 	}
1948 
1949 	bzero(&dva[d], sizeof (dva_t));
1950 
1951 	return (SET_ERROR(ENOSPC));
1952 }
1953 
1954 /*
1955  * Free the block represented by DVA in the context of the specified
1956  * transaction group.
1957  */
1958 static void
1959 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1960 {
1961 	uint64_t vdev = DVA_GET_VDEV(dva);
1962 	uint64_t offset = DVA_GET_OFFSET(dva);
1963 	uint64_t size = DVA_GET_ASIZE(dva);
1964 	vdev_t *vd;
1965 	metaslab_t *msp;
1966 
1967 	ASSERT(DVA_IS_VALID(dva));
1968 
1969 	if (txg > spa_freeze_txg(spa))
1970 		return;
1971 
1972 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1973 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1974 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1975 		    (u_longlong_t)vdev, (u_longlong_t)offset);
1976 		ASSERT(0);
1977 		return;
1978 	}
1979 
1980 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1981 
1982 	if (DVA_GET_GANG(dva))
1983 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1984 
1985 	mutex_enter(&msp->ms_lock);
1986 
1987 	if (now) {
1988 		range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
1989 		    offset, size);
1990 
1991 		VERIFY(!msp->ms_condensing);
1992 		VERIFY3U(offset, >=, msp->ms_start);
1993 		VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
1994 		VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
1995 		    msp->ms_size);
1996 		VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
1997 		VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
1998 		range_tree_add(msp->ms_tree, offset, size);
1999 	} else {
2000 		if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
2001 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2002 		range_tree_add(msp->ms_freetree[txg & TXG_MASK],
2003 		    offset, size);
2004 	}
2005 
2006 	mutex_exit(&msp->ms_lock);
2007 }
2008 
2009 /*
2010  * Intent log support: upon opening the pool after a crash, notify the SPA
2011  * of blocks that the intent log has allocated for immediate write, but
2012  * which are still considered free by the SPA because the last transaction
2013  * group didn't commit yet.
2014  */
2015 static int
2016 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
2017 {
2018 	uint64_t vdev = DVA_GET_VDEV(dva);
2019 	uint64_t offset = DVA_GET_OFFSET(dva);
2020 	uint64_t size = DVA_GET_ASIZE(dva);
2021 	vdev_t *vd;
2022 	metaslab_t *msp;
2023 	int error = 0;
2024 
2025 	ASSERT(DVA_IS_VALID(dva));
2026 
2027 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
2028 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
2029 		return (SET_ERROR(ENXIO));
2030 
2031 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2032 
2033 	if (DVA_GET_GANG(dva))
2034 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
2035 
2036 	mutex_enter(&msp->ms_lock);
2037 
2038 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
2039 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
2040 
2041 	if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
2042 		error = SET_ERROR(ENOENT);
2043 
2044 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
2045 		mutex_exit(&msp->ms_lock);
2046 		return (error);
2047 	}
2048 
2049 	VERIFY(!msp->ms_condensing);
2050 	VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
2051 	VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
2052 	VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
2053 	range_tree_remove(msp->ms_tree, offset, size);
2054 
2055 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
2056 		if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
2057 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
2058 		range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
2059 	}
2060 
2061 	mutex_exit(&msp->ms_lock);
2062 
2063 	return (0);
2064 }
2065 
2066 int
2067 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
2068     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
2069 {
2070 	dva_t *dva = bp->blk_dva;
2071 	dva_t *hintdva = hintbp->blk_dva;
2072 	int error = 0;
2073 
2074 	ASSERT(bp->blk_birth == 0);
2075 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
2076 
2077 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2078 
2079 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
2080 		spa_config_exit(spa, SCL_ALLOC, FTAG);
2081 		return (SET_ERROR(ENOSPC));
2082 	}
2083 
2084 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
2085 	ASSERT(BP_GET_NDVAS(bp) == 0);
2086 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
2087 
2088 	for (int d = 0; d < ndvas; d++) {
2089 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
2090 		    txg, flags);
2091 		if (error != 0) {
2092 			for (d--; d >= 0; d--) {
2093 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
2094 				bzero(&dva[d], sizeof (dva_t));
2095 			}
2096 			spa_config_exit(spa, SCL_ALLOC, FTAG);
2097 			return (error);
2098 		}
2099 	}
2100 	ASSERT(error == 0);
2101 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
2102 
2103 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2104 
2105 	BP_SET_BIRTH(bp, txg, txg);
2106 
2107 	return (0);
2108 }
2109 
2110 void
2111 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
2112 {
2113 	const dva_t *dva = bp->blk_dva;
2114 	int ndvas = BP_GET_NDVAS(bp);
2115 
2116 	ASSERT(!BP_IS_HOLE(bp));
2117 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
2118 
2119 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
2120 
2121 	for (int d = 0; d < ndvas; d++)
2122 		metaslab_free_dva(spa, &dva[d], txg, now);
2123 
2124 	spa_config_exit(spa, SCL_FREE, FTAG);
2125 }
2126 
2127 int
2128 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
2129 {
2130 	const dva_t *dva = bp->blk_dva;
2131 	int ndvas = BP_GET_NDVAS(bp);
2132 	int error = 0;
2133 
2134 	ASSERT(!BP_IS_HOLE(bp));
2135 
2136 	if (txg != 0) {
2137 		/*
2138 		 * First do a dry run to make sure all DVAs are claimable,
2139 		 * so we don't have to unwind from partial failures below.
2140 		 */
2141 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
2142 			return (error);
2143 	}
2144 
2145 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
2146 
2147 	for (int d = 0; d < ndvas; d++)
2148 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
2149 			break;
2150 
2151 	spa_config_exit(spa, SCL_ALLOC, FTAG);
2152 
2153 	ASSERT(error == 0 || txg == 0);
2154 
2155 	return (error);
2156 }
2157 
2158 void
2159 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
2160 {
2161 	if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
2162 		return;
2163 
2164 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
2165 	for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
2166 		uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
2167 		vdev_t *vd = vdev_lookup_top(spa, vdev);
2168 		uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
2169 		uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
2170 		metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
2171 
2172 		if (msp->ms_loaded)
2173 			range_tree_verify(msp->ms_tree, offset, size);
2174 
2175 		for (int j = 0; j < TXG_SIZE; j++)
2176 			range_tree_verify(msp->ms_freetree[j], offset, size);
2177 		for (int j = 0; j < TXG_DEFER_SIZE; j++)
2178 			range_tree_verify(msp->ms_defertree[j], offset, size);
2179 	}
2180 	spa_config_exit(spa, SCL_VDEV, FTAG);
2181 }
2182