xref: /titanic_44/usr/src/uts/common/fs/zfs/metaslab.c (revision 43ae55058ad99c869a9ae39d039490e8a3680520)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2012 by Delphix. All rights reserved.
24  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/dmu.h>
29 #include <sys/dmu_tx.h>
30 #include <sys/space_map.h>
31 #include <sys/metaslab_impl.h>
32 #include <sys/vdev_impl.h>
33 #include <sys/zio.h>
34 
/*
 * Allow allocations to switch to gang blocks quickly. We do this to
 * avoid having to load lots of space_maps in a given txg. There are,
 * however, some cases where we want to avoid "fast" ganging and instead
 * we want to do an exhaustive search of all metaslabs on this device.
 * Currently we don't allow any gang, zil, or dump device related allocations
 * to "fast" gang.
 *
 * The macro evaluates to nonzero only when none of METASLAB_GANG_CHILD,
 * METASLAB_GANG_HEADER and METASLAB_GANG_AVOID is set in 'flags'.
 */
#define	CAN_FASTGANG(flags) \
	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
	METASLAB_GANG_AVOID)))
46 
/*
 * Base allocation aliquot (512K).  metaslab_group_activate() multiplies
 * this by the number of children of the group's vdev (at least 1) to
 * obtain the group's mg_aliquot.
 */
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * This value defines the number of allowed allocation failures per vdev.
 * If a device reaches this threshold in a given txg then we consider skipping
 * allocations on that device.
 */
int zfs_mg_alloc_failures;

/*
 * Metaslab debugging: when set, keeps all space maps in core to verify frees.
 */
static int metaslab_debug = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy.  Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space_map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Max number of space_maps to prefetch.
 */
int metaslab_prefetch_limit = SPA_DVAS_PER_BP;

/*
 * Percentage bonus multiplier for metaslabs that are in the bonus area.
 */
int metaslab_smo_bonus_pct = 150;

/*
 * Should we be willing to write data to degraded vdevs?
 */
boolean_t zfs_write_to_degraded = B_FALSE;
98 
99 /*
100  * ==========================================================================
101  * Metaslab classes
102  * ==========================================================================
103  */
104 metaslab_class_t *
105 metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
106 {
107 	metaslab_class_t *mc;
108 
109 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
110 
111 	mc->mc_spa = spa;
112 	mc->mc_rotor = NULL;
113 	mc->mc_ops = ops;
114 
115 	return (mc);
116 }
117 
118 void
119 metaslab_class_destroy(metaslab_class_t *mc)
120 {
121 	ASSERT(mc->mc_rotor == NULL);
122 	ASSERT(mc->mc_alloc == 0);
123 	ASSERT(mc->mc_deferred == 0);
124 	ASSERT(mc->mc_space == 0);
125 	ASSERT(mc->mc_dspace == 0);
126 
127 	kmem_free(mc, sizeof (metaslab_class_t));
128 }
129 
130 int
131 metaslab_class_validate(metaslab_class_t *mc)
132 {
133 	metaslab_group_t *mg;
134 	vdev_t *vd;
135 
136 	/*
137 	 * Must hold one of the spa_config locks.
138 	 */
139 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
140 	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
141 
142 	if ((mg = mc->mc_rotor) == NULL)
143 		return (0);
144 
145 	do {
146 		vd = mg->mg_vd;
147 		ASSERT(vd->vdev_mg != NULL);
148 		ASSERT3P(vd->vdev_top, ==, vd);
149 		ASSERT3P(mg->mg_class, ==, mc);
150 		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
151 	} while ((mg = mg->mg_next) != mc->mc_rotor);
152 
153 	return (0);
154 }
155 
156 void
157 metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
158     int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
159 {
160 	atomic_add_64(&mc->mc_alloc, alloc_delta);
161 	atomic_add_64(&mc->mc_deferred, defer_delta);
162 	atomic_add_64(&mc->mc_space, space_delta);
163 	atomic_add_64(&mc->mc_dspace, dspace_delta);
164 }
165 
166 uint64_t
167 metaslab_class_get_alloc(metaslab_class_t *mc)
168 {
169 	return (mc->mc_alloc);
170 }
171 
172 uint64_t
173 metaslab_class_get_deferred(metaslab_class_t *mc)
174 {
175 	return (mc->mc_deferred);
176 }
177 
178 uint64_t
179 metaslab_class_get_space(metaslab_class_t *mc)
180 {
181 	return (mc->mc_space);
182 }
183 
184 uint64_t
185 metaslab_class_get_dspace(metaslab_class_t *mc)
186 {
187 	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
188 }
189 
190 /*
191  * ==========================================================================
192  * Metaslab groups
193  * ==========================================================================
194  */
195 static int
196 metaslab_compare(const void *x1, const void *x2)
197 {
198 	const metaslab_t *m1 = x1;
199 	const metaslab_t *m2 = x2;
200 
201 	if (m1->ms_weight < m2->ms_weight)
202 		return (1);
203 	if (m1->ms_weight > m2->ms_weight)
204 		return (-1);
205 
206 	/*
207 	 * If the weights are identical, use the offset to force uniqueness.
208 	 */
209 	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
210 		return (-1);
211 	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
212 		return (1);
213 
214 	ASSERT3P(m1, ==, m2);
215 
216 	return (0);
217 }
218 
219 metaslab_group_t *
220 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
221 {
222 	metaslab_group_t *mg;
223 
224 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
225 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
226 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
227 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
228 	mg->mg_vd = vd;
229 	mg->mg_class = mc;
230 	mg->mg_activation_count = 0;
231 
232 	return (mg);
233 }
234 
235 void
236 metaslab_group_destroy(metaslab_group_t *mg)
237 {
238 	ASSERT(mg->mg_prev == NULL);
239 	ASSERT(mg->mg_next == NULL);
240 	/*
241 	 * We may have gone below zero with the activation count
242 	 * either because we never activated in the first place or
243 	 * because we're done, and possibly removing the vdev.
244 	 */
245 	ASSERT(mg->mg_activation_count <= 0);
246 
247 	avl_destroy(&mg->mg_metaslab_tree);
248 	mutex_destroy(&mg->mg_lock);
249 	kmem_free(mg, sizeof (metaslab_group_t));
250 }
251 
252 void
253 metaslab_group_activate(metaslab_group_t *mg)
254 {
255 	metaslab_class_t *mc = mg->mg_class;
256 	metaslab_group_t *mgprev, *mgnext;
257 
258 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
259 
260 	ASSERT(mc->mc_rotor != mg);
261 	ASSERT(mg->mg_prev == NULL);
262 	ASSERT(mg->mg_next == NULL);
263 	ASSERT(mg->mg_activation_count <= 0);
264 
265 	if (++mg->mg_activation_count <= 0)
266 		return;
267 
268 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
269 
270 	if ((mgprev = mc->mc_rotor) == NULL) {
271 		mg->mg_prev = mg;
272 		mg->mg_next = mg;
273 	} else {
274 		mgnext = mgprev->mg_next;
275 		mg->mg_prev = mgprev;
276 		mg->mg_next = mgnext;
277 		mgprev->mg_next = mg;
278 		mgnext->mg_prev = mg;
279 	}
280 	mc->mc_rotor = mg;
281 }
282 
283 void
284 metaslab_group_passivate(metaslab_group_t *mg)
285 {
286 	metaslab_class_t *mc = mg->mg_class;
287 	metaslab_group_t *mgprev, *mgnext;
288 
289 	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));
290 
291 	if (--mg->mg_activation_count != 0) {
292 		ASSERT(mc->mc_rotor != mg);
293 		ASSERT(mg->mg_prev == NULL);
294 		ASSERT(mg->mg_next == NULL);
295 		ASSERT(mg->mg_activation_count < 0);
296 		return;
297 	}
298 
299 	mgprev = mg->mg_prev;
300 	mgnext = mg->mg_next;
301 
302 	if (mg == mgnext) {
303 		mc->mc_rotor = NULL;
304 	} else {
305 		mc->mc_rotor = mgnext;
306 		mgprev->mg_next = mgnext;
307 		mgnext->mg_prev = mgprev;
308 	}
309 
310 	mg->mg_prev = NULL;
311 	mg->mg_next = NULL;
312 }
313 
314 static void
315 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
316 {
317 	mutex_enter(&mg->mg_lock);
318 	ASSERT(msp->ms_group == NULL);
319 	msp->ms_group = mg;
320 	msp->ms_weight = 0;
321 	avl_add(&mg->mg_metaslab_tree, msp);
322 	mutex_exit(&mg->mg_lock);
323 }
324 
325 static void
326 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
327 {
328 	mutex_enter(&mg->mg_lock);
329 	ASSERT(msp->ms_group == mg);
330 	avl_remove(&mg->mg_metaslab_tree, msp);
331 	msp->ms_group = NULL;
332 	mutex_exit(&mg->mg_lock);
333 }
334 
335 static void
336 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
337 {
338 	/*
339 	 * Although in principle the weight can be any value, in
340 	 * practice we do not use values in the range [1, 510].
341 	 */
342 	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
343 	ASSERT(MUTEX_HELD(&msp->ms_lock));
344 
345 	mutex_enter(&mg->mg_lock);
346 	ASSERT(msp->ms_group == mg);
347 	avl_remove(&mg->mg_metaslab_tree, msp);
348 	msp->ms_weight = weight;
349 	avl_add(&mg->mg_metaslab_tree, msp);
350 	mutex_exit(&mg->mg_lock);
351 }
352 
353 /*
354  * ==========================================================================
355  * Common allocator routines
356  * ==========================================================================
357  */
358 static int
359 metaslab_segsize_compare(const void *x1, const void *x2)
360 {
361 	const space_seg_t *s1 = x1;
362 	const space_seg_t *s2 = x2;
363 	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
364 	uint64_t ss_size2 = s2->ss_end - s2->ss_start;
365 
366 	if (ss_size1 < ss_size2)
367 		return (-1);
368 	if (ss_size1 > ss_size2)
369 		return (1);
370 
371 	if (s1->ss_start < s2->ss_start)
372 		return (-1);
373 	if (s1->ss_start > s2->ss_start)
374 		return (1);
375 
376 	return (0);
377 }
378 
379 /*
380  * This is a helper function that can be used by the allocator to find
381  * a suitable block to allocate. This will search the specified AVL
382  * tree looking for a block that matches the specified criteria.
383  */
384 static uint64_t
385 metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
386     uint64_t align)
387 {
388 	space_seg_t *ss, ssearch;
389 	avl_index_t where;
390 
391 	ssearch.ss_start = *cursor;
392 	ssearch.ss_end = *cursor + size;
393 
394 	ss = avl_find(t, &ssearch, &where);
395 	if (ss == NULL)
396 		ss = avl_nearest(t, where, AVL_AFTER);
397 
398 	while (ss != NULL) {
399 		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
400 
401 		if (offset + size <= ss->ss_end) {
402 			*cursor = offset + size;
403 			return (offset);
404 		}
405 		ss = AVL_NEXT(t, ss);
406 	}
407 
408 	/*
409 	 * If we know we've searched the whole map (*cursor == 0), give up.
410 	 * Otherwise, reset the cursor to the beginning and try again.
411 	 */
412 	if (*cursor == 0)
413 		return (-1ULL);
414 
415 	*cursor = 0;
416 	return (metaslab_block_picker(t, cursor, size, align));
417 }
418 
419 static void
420 metaslab_pp_load(space_map_t *sm)
421 {
422 	space_seg_t *ss;
423 
424 	ASSERT(sm->sm_ppd == NULL);
425 	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
426 
427 	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
428 	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
429 	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
430 
431 	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
432 		avl_add(sm->sm_pp_root, ss);
433 }
434 
435 static void
436 metaslab_pp_unload(space_map_t *sm)
437 {
438 	void *cookie = NULL;
439 
440 	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
441 	sm->sm_ppd = NULL;
442 
443 	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
444 		/* tear down the tree */
445 	}
446 
447 	avl_destroy(sm->sm_pp_root);
448 	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
449 	sm->sm_pp_root = NULL;
450 }
451 
/*
 * Claim hook shared by the allocators defined in this file.
 * Intentionally a no-op: all three arguments are ignored.
 */
/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}
458 
/*
 * Free hook shared by the allocators defined in this file.
 * Intentionally a no-op: all three arguments are ignored.
 */
/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}
465 
466 /*
467  * Return the maximum contiguous segment within the metaslab.
468  */
469 uint64_t
470 metaslab_pp_maxsize(space_map_t *sm)
471 {
472 	avl_tree_t *t = sm->sm_pp_root;
473 	space_seg_t *ss;
474 
475 	if (t == NULL || (ss = avl_last(t)) == NULL)
476 		return (0ULL);
477 
478 	return (ss->ss_end - ss->ss_start);
479 }
480 
481 /*
482  * ==========================================================================
483  * The first-fit block allocator
484  * ==========================================================================
485  */
486 static uint64_t
487 metaslab_ff_alloc(space_map_t *sm, uint64_t size)
488 {
489 	avl_tree_t *t = &sm->sm_root;
490 	uint64_t align = size & -size;
491 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
492 
493 	return (metaslab_block_picker(t, cursor, size, align));
494 }
495 
/*
 * Fragmentation check for the first-fit allocator.  It unconditionally
 * reports B_TRUE; the space map argument is ignored.
 */
/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
	return (B_TRUE);
}
502 
/*
 * Operations vector for the first-fit allocator.  Entries are positional;
 * the per-slot roles noted below are inferred from the function names and
 * presumably follow the member order of space_map_ops_t (declared
 * elsewhere) -- verify against that declaration.
 */
static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,		/* load */
	metaslab_pp_unload,		/* unload */
	metaslab_ff_alloc,		/* alloc */
	metaslab_pp_claim,		/* claim */
	metaslab_pp_free,		/* free */
	metaslab_pp_maxsize,		/* largest free segment */
	metaslab_ff_fragmented		/* fragmented? */
};
512 
513 /*
514  * ==========================================================================
515  * Dynamic block allocator -
516  * Uses the first fit allocation scheme until space get low and then
517  * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
518  * and metaslab_df_free_pct to determine when to switch the allocation scheme.
519  * ==========================================================================
520  */
521 static uint64_t
522 metaslab_df_alloc(space_map_t *sm, uint64_t size)
523 {
524 	avl_tree_t *t = &sm->sm_root;
525 	uint64_t align = size & -size;
526 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
527 	uint64_t max_size = metaslab_pp_maxsize(sm);
528 	int free_pct = sm->sm_space * 100 / sm->sm_size;
529 
530 	ASSERT(MUTEX_HELD(sm->sm_lock));
531 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
532 
533 	if (max_size < size)
534 		return (-1ULL);
535 
536 	/*
537 	 * If we're running low on space switch to using the size
538 	 * sorted AVL tree (best-fit).
539 	 */
540 	if (max_size < metaslab_df_alloc_threshold ||
541 	    free_pct < metaslab_df_free_pct) {
542 		t = sm->sm_pp_root;
543 		*cursor = 0;
544 	}
545 
546 	return (metaslab_block_picker(t, cursor, size, 1ULL));
547 }
548 
549 static boolean_t
550 metaslab_df_fragmented(space_map_t *sm)
551 {
552 	uint64_t max_size = metaslab_pp_maxsize(sm);
553 	int free_pct = sm->sm_space * 100 / sm->sm_size;
554 
555 	if (max_size >= metaslab_df_alloc_threshold &&
556 	    free_pct >= metaslab_df_free_pct)
557 		return (B_FALSE);
558 
559 	return (B_TRUE);
560 }
561 
/*
 * Operations vector for the dynamic-fit allocator; zfs_metaslab_ops
 * points here by default.  Entries are positional; the per-slot roles
 * noted below are inferred from the function names and presumably follow
 * the member order of space_map_ops_t (declared elsewhere) -- verify
 * against that declaration.
 */
static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,		/* load */
	metaslab_pp_unload,		/* unload */
	metaslab_df_alloc,		/* alloc */
	metaslab_pp_claim,		/* claim */
	metaslab_pp_free,		/* free */
	metaslab_pp_maxsize,		/* largest free segment */
	metaslab_df_fragmented		/* fragmented? */
};
571 
572 /*
573  * ==========================================================================
574  * Other experimental allocators
575  * ==========================================================================
576  */
577 static uint64_t
578 metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
579 {
580 	avl_tree_t *t = &sm->sm_root;
581 	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
582 	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
583 	uint64_t max_size = metaslab_pp_maxsize(sm);
584 	uint64_t rsize = size;
585 	uint64_t offset = 0;
586 
587 	ASSERT(MUTEX_HELD(sm->sm_lock));
588 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
589 
590 	if (max_size < size)
591 		return (-1ULL);
592 
593 	ASSERT3U(*extent_end, >=, *cursor);
594 
595 	/*
596 	 * If we're running low on space switch to using the size
597 	 * sorted AVL tree (best-fit).
598 	 */
599 	if ((*cursor + size) > *extent_end) {
600 
601 		t = sm->sm_pp_root;
602 		*cursor = *extent_end = 0;
603 
604 		if (max_size > 2 * SPA_MAXBLOCKSIZE)
605 			rsize = MIN(metaslab_min_alloc_size, max_size);
606 		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
607 		if (offset != -1)
608 			*cursor = offset + size;
609 	} else {
610 		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
611 	}
612 	ASSERT3U(*cursor, <=, *extent_end);
613 	return (offset);
614 }
615 
616 static boolean_t
617 metaslab_cdf_fragmented(space_map_t *sm)
618 {
619 	uint64_t max_size = metaslab_pp_maxsize(sm);
620 
621 	if (max_size > (metaslab_min_alloc_size * 10))
622 		return (B_FALSE);
623 	return (B_TRUE);
624 }
625 
/*
 * Operations vector for the experimental "cdf" allocator.  Entries are
 * positional; the per-slot roles noted below are inferred from the
 * function names and presumably follow the member order of
 * space_map_ops_t (declared elsewhere) -- verify against that declaration.
 */
static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,		/* load */
	metaslab_pp_unload,		/* unload */
	metaslab_cdf_alloc,		/* alloc */
	metaslab_pp_claim,		/* claim */
	metaslab_pp_free,		/* free */
	metaslab_pp_maxsize,		/* largest free segment */
	metaslab_cdf_fragmented		/* fragmented? */
};
635 
/*
 * Shift used by the "ndf" allocator: it widens the size searched for in
 * the size-sorted tree to 1 << (highbit(size) + shift) and scales the
 * fragmentation threshold (metaslab_min_alloc_size << shift).
 */
uint64_t metaslab_ndf_clump_shift = 4;
637 
638 static uint64_t
639 metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
640 {
641 	avl_tree_t *t = &sm->sm_root;
642 	avl_index_t where;
643 	space_seg_t *ss, ssearch;
644 	uint64_t hbit = highbit(size);
645 	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
646 	uint64_t max_size = metaslab_pp_maxsize(sm);
647 
648 	ASSERT(MUTEX_HELD(sm->sm_lock));
649 	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
650 
651 	if (max_size < size)
652 		return (-1ULL);
653 
654 	ssearch.ss_start = *cursor;
655 	ssearch.ss_end = *cursor + size;
656 
657 	ss = avl_find(t, &ssearch, &where);
658 	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
659 		t = sm->sm_pp_root;
660 
661 		ssearch.ss_start = 0;
662 		ssearch.ss_end = MIN(max_size,
663 		    1ULL << (hbit + metaslab_ndf_clump_shift));
664 		ss = avl_find(t, &ssearch, &where);
665 		if (ss == NULL)
666 			ss = avl_nearest(t, where, AVL_AFTER);
667 		ASSERT(ss != NULL);
668 	}
669 
670 	if (ss != NULL) {
671 		if (ss->ss_start + size <= ss->ss_end) {
672 			*cursor = ss->ss_start + size;
673 			return (ss->ss_start);
674 		}
675 	}
676 	return (-1ULL);
677 }
678 
679 static boolean_t
680 metaslab_ndf_fragmented(space_map_t *sm)
681 {
682 	uint64_t max_size = metaslab_pp_maxsize(sm);
683 
684 	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
685 		return (B_FALSE);
686 	return (B_TRUE);
687 }
688 
689 
/*
 * Operations vector for the experimental "ndf" allocator.  Entries are
 * positional; the per-slot roles noted below are inferred from the
 * function names and presumably follow the member order of
 * space_map_ops_t (declared elsewhere) -- verify against that declaration.
 */
static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,		/* load */
	metaslab_pp_unload,		/* unload */
	metaslab_ndf_alloc,		/* alloc */
	metaslab_pp_claim,		/* claim */
	metaslab_pp_free,		/* free */
	metaslab_pp_maxsize,		/* largest free segment */
	metaslab_ndf_fragmented		/* fragmented? */
};
699 
/*
 * Allocator operations exported by this file; defaults to the
 * dynamic-fit allocator.
 */
space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
701 
702 /*
703  * ==========================================================================
704  * Metaslabs
705  * ==========================================================================
706  */
707 metaslab_t *
708 metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
709 	uint64_t start, uint64_t size, uint64_t txg)
710 {
711 	vdev_t *vd = mg->mg_vd;
712 	metaslab_t *msp;
713 
714 	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
715 	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
716 
717 	msp->ms_smo_syncing = *smo;
718 
719 	/*
720 	 * We create the main space map here, but we don't create the
721 	 * allocmaps and freemaps until metaslab_sync_done().  This serves
722 	 * two purposes: it allows metaslab_sync_done() to detect the
723 	 * addition of new space; and for debugging, it ensures that we'd
724 	 * data fault on any attempt to use this metaslab before it's ready.
725 	 */
726 	space_map_create(&msp->ms_map, start, size,
727 	    vd->vdev_ashift, &msp->ms_lock);
728 
729 	metaslab_group_add(mg, msp);
730 
731 	if (metaslab_debug && smo->smo_object != 0) {
732 		mutex_enter(&msp->ms_lock);
733 		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
734 		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
735 		mutex_exit(&msp->ms_lock);
736 	}
737 
738 	/*
739 	 * If we're opening an existing pool (txg == 0) or creating
740 	 * a new one (txg == TXG_INITIAL), all space is available now.
741 	 * If we're adding space to an existing pool, the new space
742 	 * does not become available until after this txg has synced.
743 	 */
744 	if (txg <= TXG_INITIAL)
745 		metaslab_sync_done(msp, 0);
746 
747 	if (txg != 0) {
748 		vdev_dirty(vd, 0, NULL, txg);
749 		vdev_dirty(vd, VDD_METASLAB, msp, txg);
750 	}
751 
752 	return (msp);
753 }
754 
755 void
756 metaslab_fini(metaslab_t *msp)
757 {
758 	metaslab_group_t *mg = msp->ms_group;
759 
760 	vdev_space_update(mg->mg_vd,
761 	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);
762 
763 	metaslab_group_remove(mg, msp);
764 
765 	mutex_enter(&msp->ms_lock);
766 
767 	space_map_unload(&msp->ms_map);
768 	space_map_destroy(&msp->ms_map);
769 
770 	for (int t = 0; t < TXG_SIZE; t++) {
771 		space_map_destroy(&msp->ms_allocmap[t]);
772 		space_map_destroy(&msp->ms_freemap[t]);
773 	}
774 
775 	for (int t = 0; t < TXG_DEFER_SIZE; t++)
776 		space_map_destroy(&msp->ms_defermap[t]);
777 
778 	ASSERT0(msp->ms_deferspace);
779 
780 	mutex_exit(&msp->ms_lock);
781 	mutex_destroy(&msp->ms_lock);
782 
783 	kmem_free(msp, sizeof (metaslab_t));
784 }
785 
/*
 * The two most significant bits of a metaslab's weight flag it as
 * activated (primary or secondary).  METASLAB_ACTIVE_MASK covers both
 * bits and is used to test for and strip the activation state.
 */
#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
790 
791 static uint64_t
792 metaslab_weight(metaslab_t *msp)
793 {
794 	metaslab_group_t *mg = msp->ms_group;
795 	space_map_t *sm = &msp->ms_map;
796 	space_map_obj_t *smo = &msp->ms_smo;
797 	vdev_t *vd = mg->mg_vd;
798 	uint64_t weight, space;
799 
800 	ASSERT(MUTEX_HELD(&msp->ms_lock));
801 
802 	/*
803 	 * The baseline weight is the metaslab's free space.
804 	 */
805 	space = sm->sm_size - smo->smo_alloc;
806 	weight = space;
807 
808 	/*
809 	 * Modern disks have uniform bit density and constant angular velocity.
810 	 * Therefore, the outer recording zones are faster (higher bandwidth)
811 	 * than the inner zones by the ratio of outer to inner track diameter,
812 	 * which is typically around 2:1.  We account for this by assigning
813 	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
814 	 * In effect, this means that we'll select the metaslab with the most
815 	 * free bandwidth rather than simply the one with the most free space.
816 	 */
817 	weight = 2 * weight -
818 	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
819 	ASSERT(weight >= space && weight <= 2 * space);
820 
821 	/*
822 	 * For locality, assign higher weight to metaslabs which have
823 	 * a lower offset than what we've already activated.
824 	 */
825 	if (sm->sm_start <= mg->mg_bonus_area)
826 		weight *= (metaslab_smo_bonus_pct / 100);
827 	ASSERT(weight >= space &&
828 	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
829 
830 	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
831 		/*
832 		 * If this metaslab is one we're actively using, adjust its
833 		 * weight to make it preferable to any inactive metaslab so
834 		 * we'll polish it off.
835 		 */
836 		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
837 	}
838 	return (weight);
839 }
840 
841 static void
842 metaslab_prefetch(metaslab_group_t *mg)
843 {
844 	spa_t *spa = mg->mg_vd->vdev_spa;
845 	metaslab_t *msp;
846 	avl_tree_t *t = &mg->mg_metaslab_tree;
847 	int m;
848 
849 	mutex_enter(&mg->mg_lock);
850 
851 	/*
852 	 * Prefetch the next potential metaslabs
853 	 */
854 	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
855 		space_map_t *sm = &msp->ms_map;
856 		space_map_obj_t *smo = &msp->ms_smo;
857 
858 		/* If we have reached our prefetch limit then we're done */
859 		if (m >= metaslab_prefetch_limit)
860 			break;
861 
862 		if (!sm->sm_loaded && smo->smo_object != 0) {
863 			mutex_exit(&mg->mg_lock);
864 			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
865 			    0ULL, smo->smo_objsize);
866 			mutex_enter(&mg->mg_lock);
867 		}
868 	}
869 	mutex_exit(&mg->mg_lock);
870 }
871 
872 static int
873 metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
874 {
875 	metaslab_group_t *mg = msp->ms_group;
876 	space_map_t *sm = &msp->ms_map;
877 	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
878 
879 	ASSERT(MUTEX_HELD(&msp->ms_lock));
880 
881 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
882 		space_map_load_wait(sm);
883 		if (!sm->sm_loaded) {
884 			space_map_obj_t *smo = &msp->ms_smo;
885 
886 			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
887 			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
888 			if (error)  {
889 				metaslab_group_sort(msp->ms_group, msp, 0);
890 				return (error);
891 			}
892 			for (int t = 0; t < TXG_DEFER_SIZE; t++)
893 				space_map_walk(&msp->ms_defermap[t],
894 				    space_map_claim, sm);
895 
896 		}
897 
898 		/*
899 		 * Track the bonus area as we activate new metaslabs.
900 		 */
901 		if (sm->sm_start > mg->mg_bonus_area) {
902 			mutex_enter(&mg->mg_lock);
903 			mg->mg_bonus_area = sm->sm_start;
904 			mutex_exit(&mg->mg_lock);
905 		}
906 
907 		metaslab_group_sort(msp->ms_group, msp,
908 		    msp->ms_weight | activation_weight);
909 	}
910 	ASSERT(sm->sm_loaded);
911 	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
912 
913 	return (0);
914 }
915 
916 static void
917 metaslab_passivate(metaslab_t *msp, uint64_t size)
918 {
919 	/*
920 	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
921 	 * this metaslab again.  In that case, it had better be empty,
922 	 * or we would be leaving space on the table.
923 	 */
924 	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
925 	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
926 	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
927 }
928 
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 *
 * Returns immediately when this txg's allocmap and freemap are both empty.
 * Otherwise the space map object is allocated on first use, this txg's
 * frees are added to freed_map, the on-disk map is condensed when it has
 * grown to at least twice the size of the in-core representation, the
 * allocmap and freemap are appended to the object, and the updated
 * ms_smo_syncing header is copied into the object's bonus buffer.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	/* Nothing was allocated or freed in this txg. */
	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	/*
	 * First sync of this metaslab: allocate its space map object and
	 * record the object number in the vdev's metaslab array, at the
	 * slot given by this metaslab's index.
	 */
	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	/* Accumulate this txg's frees into freed_map. */
	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus deferred frees (ms_defermap[]),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_walk(&msp->ms_defermap[t],
			    space_map_remove, allocmap);

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		/* Drop ms_lock across the call into the DMU (see above). */
		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	/* Persist the updated header in the object's bonus buffer. */
	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}
1026 
1027 /*
1028  * Called after a transaction group has completely synced to mark
1029  * all of the metaslab's free space as usable.
1030  */
1031 void
1032 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
1033 {
1034 	space_map_obj_t *smo = &msp->ms_smo;
1035 	space_map_obj_t *smosync = &msp->ms_smo_syncing;
1036 	space_map_t *sm = &msp->ms_map;
1037 	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
1038 	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
1039 	metaslab_group_t *mg = msp->ms_group;
1040 	vdev_t *vd = mg->mg_vd;
1041 	int64_t alloc_delta, defer_delta;
1042 
1043 	ASSERT(!vd->vdev_ishole);
1044 
1045 	mutex_enter(&msp->ms_lock);
1046 
1047 	/*
1048 	 * If this metaslab is just becoming available, initialize its
1049 	 * allocmaps and freemaps and add its capacity to the vdev.
1050 	 */
1051 	if (freed_map->sm_size == 0) {
1052 		for (int t = 0; t < TXG_SIZE; t++) {
1053 			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
1054 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
1055 			space_map_create(&msp->ms_freemap[t], sm->sm_start,
1056 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
1057 		}
1058 
1059 		for (int t = 0; t < TXG_DEFER_SIZE; t++)
1060 			space_map_create(&msp->ms_defermap[t], sm->sm_start,
1061 			    sm->sm_size, sm->sm_shift, sm->sm_lock);
1062 
1063 		vdev_space_update(vd, 0, 0, sm->sm_size);
1064 	}
1065 
1066 	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
1067 	defer_delta = freed_map->sm_space - defer_map->sm_space;
1068 
1069 	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
1070 
1071 	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
1072 	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
1073 
1074 	/*
1075 	 * If there's a space_map_load() in progress, wait for it to complete
1076 	 * so that we have a consistent view of the in-core space map.
1077 	 * Then, add defer_map (oldest deferred frees) to this map and
1078 	 * transfer freed_map (this txg's frees) to defer_map.
1079 	 */
1080 	space_map_load_wait(sm);
1081 	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
1082 	space_map_vacate(freed_map, space_map_add, defer_map);
1083 
1084 	*smo = *smosync;
1085 
1086 	msp->ms_deferspace += defer_delta;
1087 	ASSERT3S(msp->ms_deferspace, >=, 0);
1088 	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
1089 	if (msp->ms_deferspace != 0) {
1090 		/*
1091 		 * Keep syncing this metaslab until all deferred frees
1092 		 * are back in circulation.
1093 		 */
1094 		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
1095 	}
1096 
1097 	/*
1098 	 * If the map is loaded but no longer active, evict it as soon as all
1099 	 * future allocations have synced.  (If we unloaded it now and then
1100 	 * loaded a moment later, the map wouldn't reflect those allocations.)
1101 	 */
1102 	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
1103 		int evictable = 1;
1104 
1105 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
1106 			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
1107 				evictable = 0;
1108 
1109 		if (evictable && !metaslab_debug)
1110 			space_map_unload(sm);
1111 	}
1112 
1113 	metaslab_group_sort(mg, msp, metaslab_weight(msp));
1114 
1115 	mutex_exit(&msp->ms_lock);
1116 }
1117 
1118 void
1119 metaslab_sync_reassess(metaslab_group_t *mg)
1120 {
1121 	vdev_t *vd = mg->mg_vd;
1122 	int64_t failures = mg->mg_alloc_failures;
1123 
1124 	/*
1125 	 * Re-evaluate all metaslabs which have lower offsets than the
1126 	 * bonus area.
1127 	 */
1128 	for (int m = 0; m < vd->vdev_ms_count; m++) {
1129 		metaslab_t *msp = vd->vdev_ms[m];
1130 
1131 		if (msp->ms_map.sm_start > mg->mg_bonus_area)
1132 			break;
1133 
1134 		mutex_enter(&msp->ms_lock);
1135 		metaslab_group_sort(mg, msp, metaslab_weight(msp));
1136 		mutex_exit(&msp->ms_lock);
1137 	}
1138 
1139 	atomic_add_64(&mg->mg_alloc_failures, -failures);
1140 
1141 	/*
1142 	 * Prefetch the next potential metaslabs
1143 	 */
1144 	metaslab_prefetch(mg);
1145 }
1146 
1147 static uint64_t
1148 metaslab_distance(metaslab_t *msp, dva_t *dva)
1149 {
1150 	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
1151 	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
1152 	uint64_t start = msp->ms_map.sm_start >> ms_shift;
1153 
1154 	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
1155 		return (1ULL << 63);
1156 
1157 	if (offset < start)
1158 		return ((start - offset) << ms_shift);
1159 	if (offset > start)
1160 		return ((offset - start) << ms_shift);
1161 	return (0);
1162 }
1163 
1164 static uint64_t
1165 metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
1166     uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
1167 {
1168 	spa_t *spa = mg->mg_vd->vdev_spa;
1169 	metaslab_t *msp = NULL;
1170 	uint64_t offset = -1ULL;
1171 	avl_tree_t *t = &mg->mg_metaslab_tree;
1172 	uint64_t activation_weight;
1173 	uint64_t target_distance;
1174 	int i;
1175 
1176 	activation_weight = METASLAB_WEIGHT_PRIMARY;
1177 	for (i = 0; i < d; i++) {
1178 		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
1179 			activation_weight = METASLAB_WEIGHT_SECONDARY;
1180 			break;
1181 		}
1182 	}
1183 
1184 	for (;;) {
1185 		boolean_t was_active;
1186 
1187 		mutex_enter(&mg->mg_lock);
1188 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
1189 			if (msp->ms_weight < asize) {
1190 				spa_dbgmsg(spa, "%s: failed to meet weight "
1191 				    "requirement: vdev %llu, txg %llu, mg %p, "
1192 				    "msp %p, psize %llu, asize %llu, "
1193 				    "failures %llu, weight %llu",
1194 				    spa_name(spa), mg->mg_vd->vdev_id, txg,
1195 				    mg, msp, psize, asize,
1196 				    mg->mg_alloc_failures, msp->ms_weight);
1197 				mutex_exit(&mg->mg_lock);
1198 				return (-1ULL);
1199 			}
1200 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
1201 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
1202 				break;
1203 
1204 			target_distance = min_distance +
1205 			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
1206 
1207 			for (i = 0; i < d; i++)
1208 				if (metaslab_distance(msp, &dva[i]) <
1209 				    target_distance)
1210 					break;
1211 			if (i == d)
1212 				break;
1213 		}
1214 		mutex_exit(&mg->mg_lock);
1215 		if (msp == NULL)
1216 			return (-1ULL);
1217 
1218 		/*
1219 		 * If we've already reached the allowable number of failed
1220 		 * allocation attempts on this metaslab group then we
1221 		 * consider skipping it. We skip it only if we're allowed
1222 		 * to "fast" gang, the physical size is larger than
1223 		 * a gang block, and we're attempting to allocate from
1224 		 * the primary metaslab.
1225 		 */
1226 		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
1227 		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
1228 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1229 			spa_dbgmsg(spa, "%s: skipping metaslab group: "
1230 			    "vdev %llu, txg %llu, mg %p, psize %llu, "
1231 			    "asize %llu, failures %llu", spa_name(spa),
1232 			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
1233 			    mg->mg_alloc_failures);
1234 			return (-1ULL);
1235 		}
1236 
1237 		mutex_enter(&msp->ms_lock);
1238 
1239 		/*
1240 		 * Ensure that the metaslab we have selected is still
1241 		 * capable of handling our request. It's possible that
1242 		 * another thread may have changed the weight while we
1243 		 * were blocked on the metaslab lock.
1244 		 */
1245 		if (msp->ms_weight < asize || (was_active &&
1246 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
1247 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
1248 			mutex_exit(&msp->ms_lock);
1249 			continue;
1250 		}
1251 
1252 		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
1253 		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
1254 			metaslab_passivate(msp,
1255 			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
1256 			mutex_exit(&msp->ms_lock);
1257 			continue;
1258 		}
1259 
1260 		if (metaslab_activate(msp, activation_weight) != 0) {
1261 			mutex_exit(&msp->ms_lock);
1262 			continue;
1263 		}
1264 
1265 		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
1266 			break;
1267 
1268 		atomic_inc_64(&mg->mg_alloc_failures);
1269 
1270 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
1271 
1272 		mutex_exit(&msp->ms_lock);
1273 	}
1274 
1275 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1276 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
1277 
1278 	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
1279 
1280 	mutex_exit(&msp->ms_lock);
1281 
1282 	return (offset);
1283 }
1284 
1285 /*
1286  * Allocate a block for the specified i/o.
1287  */
1288 static int
1289 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
1290     dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
1291 {
1292 	metaslab_group_t *mg, *rotor;
1293 	vdev_t *vd;
1294 	int dshift = 3;
1295 	int all_zero;
1296 	int zio_lock = B_FALSE;
1297 	boolean_t allocatable;
1298 	uint64_t offset = -1ULL;
1299 	uint64_t asize;
1300 	uint64_t distance;
1301 
1302 	ASSERT(!DVA_IS_VALID(&dva[d]));
1303 
1304 	/*
1305 	 * For testing, make some blocks above a certain size be gang blocks.
1306 	 */
1307 	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
1308 		return (ENOSPC);
1309 
1310 	/*
1311 	 * Start at the rotor and loop through all mgs until we find something.
1312 	 * Note that there's no locking on mc_rotor or mc_aliquot because
1313 	 * nothing actually breaks if we miss a few updates -- we just won't
1314 	 * allocate quite as evenly.  It all balances out over time.
1315 	 *
1316 	 * If we are doing ditto or log blocks, try to spread them across
1317 	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
1318 	 * allocated all of our ditto blocks, then try and spread them out on
1319 	 * that vdev as much as possible.  If it turns out to not be possible,
1320 	 * gradually lower our standards until anything becomes acceptable.
1321 	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
1322 	 * gives us hope of containing our fault domains to something we're
1323 	 * able to reason about.  Otherwise, any two top-level vdev failures
1324 	 * will guarantee the loss of data.  With consecutive allocation,
1325 	 * only two adjacent top-level vdev failures will result in data loss.
1326 	 *
1327 	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
1328 	 * ourselves on the same vdev as our gang block header.  That
1329 	 * way, we can hope for locality in vdev_cache, plus it makes our
1330 	 * fault domains something tractable.
1331 	 */
1332 	if (hintdva) {
1333 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
1334 
1335 		/*
1336 		 * It's possible the vdev we're using as the hint no
1337 		 * longer exists (i.e. removed). Consult the rotor when
1338 		 * all else fails.
1339 		 */
1340 		if (vd != NULL) {
1341 			mg = vd->vdev_mg;
1342 
1343 			if (flags & METASLAB_HINTBP_AVOID &&
1344 			    mg->mg_next != NULL)
1345 				mg = mg->mg_next;
1346 		} else {
1347 			mg = mc->mc_rotor;
1348 		}
1349 	} else if (d != 0) {
1350 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
1351 		mg = vd->vdev_mg->mg_next;
1352 	} else {
1353 		mg = mc->mc_rotor;
1354 	}
1355 
1356 	/*
1357 	 * If the hint put us into the wrong metaslab class, or into a
1358 	 * metaslab group that has been passivated, just follow the rotor.
1359 	 */
1360 	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
1361 		mg = mc->mc_rotor;
1362 
1363 	rotor = mg;
1364 top:
1365 	all_zero = B_TRUE;
1366 	do {
1367 		ASSERT(mg->mg_activation_count == 1);
1368 
1369 		vd = mg->mg_vd;
1370 
1371 		/*
1372 		 * Don't allocate from faulted devices.
1373 		 */
1374 		if (zio_lock) {
1375 			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
1376 			allocatable = vdev_allocatable(vd);
1377 			spa_config_exit(spa, SCL_ZIO, FTAG);
1378 		} else {
1379 			allocatable = vdev_allocatable(vd);
1380 		}
1381 		if (!allocatable)
1382 			goto next;
1383 
1384 		/*
1385 		 * Avoid writing single-copy data to a failing vdev
1386 		 * unless the user instructs us that it is okay.
1387 		 */
1388 		if ((vd->vdev_stat.vs_write_errors > 0 ||
1389 		    vd->vdev_state < VDEV_STATE_HEALTHY) &&
1390 		    d == 0 && dshift == 3 &&
1391 		    !(zfs_write_to_degraded && vd->vdev_state ==
1392 		    VDEV_STATE_DEGRADED)) {
1393 			all_zero = B_FALSE;
1394 			goto next;
1395 		}
1396 
1397 		ASSERT(mg->mg_class == mc);
1398 
1399 		distance = vd->vdev_asize >> dshift;
1400 		if (distance <= (1ULL << vd->vdev_ms_shift))
1401 			distance = 0;
1402 		else
1403 			all_zero = B_FALSE;
1404 
1405 		asize = vdev_psize_to_asize(vd, psize);
1406 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
1407 
1408 		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
1409 		    dva, d, flags);
1410 		if (offset != -1ULL) {
1411 			/*
1412 			 * If we've just selected this metaslab group,
1413 			 * figure out whether the corresponding vdev is
1414 			 * over- or under-used relative to the pool,
1415 			 * and set an allocation bias to even it out.
1416 			 */
1417 			if (mc->mc_aliquot == 0) {
1418 				vdev_stat_t *vs = &vd->vdev_stat;
1419 				int64_t vu, cu;
1420 
1421 				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
1422 				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
1423 
1424 				/*
1425 				 * Calculate how much more or less we should
1426 				 * try to allocate from this device during
1427 				 * this iteration around the rotor.
1428 				 * For example, if a device is 80% full
1429 				 * and the pool is 20% full then we should
1430 				 * reduce allocations by 60% on this device.
1431 				 *
1432 				 * mg_bias = (20 - 80) * 512K / 100 = -307K
1433 				 *
1434 				 * This reduces allocations by 307K for this
1435 				 * iteration.
1436 				 */
1437 				mg->mg_bias = ((cu - vu) *
1438 				    (int64_t)mg->mg_aliquot) / 100;
1439 			}
1440 
1441 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
1442 			    mg->mg_aliquot + mg->mg_bias) {
1443 				mc->mc_rotor = mg->mg_next;
1444 				mc->mc_aliquot = 0;
1445 			}
1446 
1447 			DVA_SET_VDEV(&dva[d], vd->vdev_id);
1448 			DVA_SET_OFFSET(&dva[d], offset);
1449 			DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
1450 			DVA_SET_ASIZE(&dva[d], asize);
1451 
1452 			return (0);
1453 		}
1454 next:
1455 		mc->mc_rotor = mg->mg_next;
1456 		mc->mc_aliquot = 0;
1457 	} while ((mg = mg->mg_next) != rotor);
1458 
1459 	if (!all_zero) {
1460 		dshift++;
1461 		ASSERT(dshift < 64);
1462 		goto top;
1463 	}
1464 
1465 	if (!allocatable && !zio_lock) {
1466 		dshift = 3;
1467 		zio_lock = B_TRUE;
1468 		goto top;
1469 	}
1470 
1471 	bzero(&dva[d], sizeof (dva_t));
1472 
1473 	return (ENOSPC);
1474 }
1475 
1476 /*
1477  * Free the block represented by DVA in the context of the specified
1478  * transaction group.
1479  */
1480 static void
1481 metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now)
1482 {
1483 	uint64_t vdev = DVA_GET_VDEV(dva);
1484 	uint64_t offset = DVA_GET_OFFSET(dva);
1485 	uint64_t size = DVA_GET_ASIZE(dva);
1486 	vdev_t *vd;
1487 	metaslab_t *msp;
1488 
1489 	ASSERT(DVA_IS_VALID(dva));
1490 
1491 	if (txg > spa_freeze_txg(spa))
1492 		return;
1493 
1494 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1495 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
1496 		cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
1497 		    (u_longlong_t)vdev, (u_longlong_t)offset);
1498 		ASSERT(0);
1499 		return;
1500 	}
1501 
1502 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1503 
1504 	if (DVA_GET_GANG(dva))
1505 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1506 
1507 	mutex_enter(&msp->ms_lock);
1508 
1509 	if (now) {
1510 		space_map_remove(&msp->ms_allocmap[txg & TXG_MASK],
1511 		    offset, size);
1512 		space_map_free(&msp->ms_map, offset, size);
1513 	} else {
1514 		if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0)
1515 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
1516 		space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
1517 	}
1518 
1519 	mutex_exit(&msp->ms_lock);
1520 }
1521 
1522 /*
1523  * Intent log support: upon opening the pool after a crash, notify the SPA
1524  * of blocks that the intent log has allocated for immediate write, but
1525  * which are still considered free by the SPA because the last transaction
1526  * group didn't commit yet.
1527  */
1528 static int
1529 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
1530 {
1531 	uint64_t vdev = DVA_GET_VDEV(dva);
1532 	uint64_t offset = DVA_GET_OFFSET(dva);
1533 	uint64_t size = DVA_GET_ASIZE(dva);
1534 	vdev_t *vd;
1535 	metaslab_t *msp;
1536 	int error = 0;
1537 
1538 	ASSERT(DVA_IS_VALID(dva));
1539 
1540 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
1541 	    (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
1542 		return (ENXIO);
1543 
1544 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
1545 
1546 	if (DVA_GET_GANG(dva))
1547 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
1548 
1549 	mutex_enter(&msp->ms_lock);
1550 
1551 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
1552 		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
1553 
1554 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
1555 		error = ENOENT;
1556 
1557 	if (error || txg == 0) {	/* txg == 0 indicates dry run */
1558 		mutex_exit(&msp->ms_lock);
1559 		return (error);
1560 	}
1561 
1562 	space_map_claim(&msp->ms_map, offset, size);
1563 
1564 	if (spa_writeable(spa)) {	/* don't dirty if we're zdb(1M) */
1565 		if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
1566 			vdev_dirty(vd, VDD_METASLAB, msp, txg);
1567 		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
1568 	}
1569 
1570 	mutex_exit(&msp->ms_lock);
1571 
1572 	return (0);
1573 }
1574 
1575 int
1576 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
1577     int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
1578 {
1579 	dva_t *dva = bp->blk_dva;
1580 	dva_t *hintdva = hintbp->blk_dva;
1581 	int error = 0;
1582 
1583 	ASSERT(bp->blk_birth == 0);
1584 	ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
1585 
1586 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1587 
1588 	if (mc->mc_rotor == NULL) {	/* no vdevs in this class */
1589 		spa_config_exit(spa, SCL_ALLOC, FTAG);
1590 		return (ENOSPC);
1591 	}
1592 
1593 	ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
1594 	ASSERT(BP_GET_NDVAS(bp) == 0);
1595 	ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
1596 
1597 	for (int d = 0; d < ndvas; d++) {
1598 		error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
1599 		    txg, flags);
1600 		if (error) {
1601 			for (d--; d >= 0; d--) {
1602 				metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
1603 				bzero(&dva[d], sizeof (dva_t));
1604 			}
1605 			spa_config_exit(spa, SCL_ALLOC, FTAG);
1606 			return (error);
1607 		}
1608 	}
1609 	ASSERT(error == 0);
1610 	ASSERT(BP_GET_NDVAS(bp) == ndvas);
1611 
1612 	spa_config_exit(spa, SCL_ALLOC, FTAG);
1613 
1614 	BP_SET_BIRTH(bp, txg, txg);
1615 
1616 	return (0);
1617 }
1618 
1619 void
1620 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
1621 {
1622 	const dva_t *dva = bp->blk_dva;
1623 	int ndvas = BP_GET_NDVAS(bp);
1624 
1625 	ASSERT(!BP_IS_HOLE(bp));
1626 	ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
1627 
1628 	spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
1629 
1630 	for (int d = 0; d < ndvas; d++)
1631 		metaslab_free_dva(spa, &dva[d], txg, now);
1632 
1633 	spa_config_exit(spa, SCL_FREE, FTAG);
1634 }
1635 
1636 int
1637 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
1638 {
1639 	const dva_t *dva = bp->blk_dva;
1640 	int ndvas = BP_GET_NDVAS(bp);
1641 	int error = 0;
1642 
1643 	ASSERT(!BP_IS_HOLE(bp));
1644 
1645 	if (txg != 0) {
1646 		/*
1647 		 * First do a dry run to make sure all DVAs are claimable,
1648 		 * so we don't have to unwind from partial failures below.
1649 		 */
1650 		if ((error = metaslab_claim(spa, bp, 0)) != 0)
1651 			return (error);
1652 	}
1653 
1654 	spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
1655 
1656 	for (int d = 0; d < ndvas; d++)
1657 		if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0)
1658 			break;
1659 
1660 	spa_config_exit(spa, SCL_ALLOC, FTAG);
1661 
1662 	ASSERT(error == 0 || txg == 0);
1663 
1664 	return (error);
1665 }
1666