xref: /titanic_52/usr/src/uts/common/fs/zfs/metaslab.c (revision 734b6a94890be549309b21156f8ed6d4561cac51)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa_impl.h>
30 #include <sys/dmu.h>
31 #include <sys/dmu_tx.h>
32 #include <sys/space_map.h>
33 #include <sys/metaslab_impl.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/zio.h>
36 
37 /*
38  * ==========================================================================
39  * Metaslab classes
40  * ==========================================================================
41  */
42 metaslab_class_t *
43 metaslab_class_create(void)
44 {
45 	metaslab_class_t *mc;
46 
47 	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
48 
49 	mc->mc_rotor = NULL;
50 
51 	return (mc);
52 }
53 
54 void
55 metaslab_class_destroy(metaslab_class_t *mc)
56 {
57 	metaslab_group_t *mg;
58 
59 	while ((mg = mc->mc_rotor) != NULL) {
60 		metaslab_class_remove(mc, mg);
61 		metaslab_group_destroy(mg);
62 	}
63 
64 	kmem_free(mc, sizeof (metaslab_class_t));
65 }
66 
67 void
68 metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg)
69 {
70 	metaslab_group_t *mgprev, *mgnext;
71 
72 	ASSERT(mg->mg_class == NULL);
73 
74 	if ((mgprev = mc->mc_rotor) == NULL) {
75 		mg->mg_prev = mg;
76 		mg->mg_next = mg;
77 	} else {
78 		mgnext = mgprev->mg_next;
79 		mg->mg_prev = mgprev;
80 		mg->mg_next = mgnext;
81 		mgprev->mg_next = mg;
82 		mgnext->mg_prev = mg;
83 	}
84 	mc->mc_rotor = mg;
85 	mg->mg_class = mc;
86 }
87 
88 void
89 metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg)
90 {
91 	metaslab_group_t *mgprev, *mgnext;
92 
93 	ASSERT(mg->mg_class == mc);
94 
95 	mgprev = mg->mg_prev;
96 	mgnext = mg->mg_next;
97 
98 	if (mg == mgnext) {
99 		mc->mc_rotor = NULL;
100 	} else {
101 		mc->mc_rotor = mgnext;
102 		mgprev->mg_next = mgnext;
103 		mgnext->mg_prev = mgprev;
104 	}
105 
106 	mg->mg_prev = NULL;
107 	mg->mg_next = NULL;
108 	mg->mg_class = NULL;
109 }
110 
111 /*
112  * ==========================================================================
113  * Metaslab groups
114  * ==========================================================================
115  */
116 static int
117 metaslab_compare(const void *x1, const void *x2)
118 {
119 	const metaslab_t *m1 = x1;
120 	const metaslab_t *m2 = x2;
121 
122 	if (m1->ms_weight < m2->ms_weight)
123 		return (1);
124 	if (m1->ms_weight > m2->ms_weight)
125 		return (-1);
126 
127 	/*
128 	 * If the weights are identical, use the offset to force uniqueness.
129 	 */
130 	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
131 		return (-1);
132 	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
133 		return (1);
134 
135 	ASSERT3P(m1, ==, m2);
136 
137 	return (0);
138 }
139 
140 metaslab_group_t *
141 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
142 {
143 	metaslab_group_t *mg;
144 
145 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
146 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
147 	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
148 	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
149 	mg->mg_aliquot = 2ULL << 20;		/* XXX -- tweak me */
150 	mg->mg_vd = vd;
151 	metaslab_class_add(mc, mg);
152 
153 	return (mg);
154 }
155 
156 void
157 metaslab_group_destroy(metaslab_group_t *mg)
158 {
159 	avl_destroy(&mg->mg_metaslab_tree);
160 	mutex_destroy(&mg->mg_lock);
161 	kmem_free(mg, sizeof (metaslab_group_t));
162 }
163 
164 void
165 metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
166 {
167 	mutex_enter(&mg->mg_lock);
168 	ASSERT(msp->ms_group == NULL);
169 	msp->ms_group = mg;
170 	msp->ms_weight = weight;
171 	avl_add(&mg->mg_metaslab_tree, msp);
172 	mutex_exit(&mg->mg_lock);
173 }
174 
175 void
176 metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
177 {
178 	mutex_enter(&mg->mg_lock);
179 	ASSERT(msp->ms_group == mg);
180 	avl_remove(&mg->mg_metaslab_tree, msp);
181 	msp->ms_group = NULL;
182 	mutex_exit(&mg->mg_lock);
183 }
184 
185 void
186 metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
187 {
188 	mutex_enter(&mg->mg_lock);
189 	ASSERT(msp->ms_group == mg);
190 	avl_remove(&mg->mg_metaslab_tree, msp);
191 	msp->ms_weight = weight;
192 	avl_add(&mg->mg_metaslab_tree, msp);
193 	mutex_exit(&mg->mg_lock);
194 }
195 
196 /*
197  * ==========================================================================
198  * Metaslabs
199  * ==========================================================================
200  */
/*
 * Create the in-core metaslab covering [start, start + size) on the vdev
 * that owns the group 'mg', and return it through '*mspp'.
 *
 *	mg	group (and thereby vdev) the metaslab belongs to
 *	smo	space map object header for this metaslab; the pointer is
 *		stored in the metaslab, not copied
 *	mspp	out: the newly allocated metaslab
 *	start	starting offset of the metaslab
 *	size	size of the metaslab
 *	txg	0 when opening an existing pool; otherwise the txg in which
 *		the metaslab is being added
 *
 * The metaslab's main space map and its per-txg alloc/free maps all share
 * the metaslab's ms_lock.
 */
void
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_t **mspp,
	uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;
	int fm;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);

	msp->ms_smo = smo;

	space_map_create(&msp->ms_map, start, size, vd->vdev_ashift,
	    &msp->ms_lock);

	/* One alloc map and one free map per in-flight txg slot. */
	for (fm = 0; fm < TXG_SIZE; fm++) {
		space_map_create(&msp->ms_allocmap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
		space_map_create(&msp->ms_freemap[fm], start, size,
		    vd->vdev_ashift, &msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 * We enforce this by assigning an initial weight of 0 to new space.
	 *
	 * (Transactional allocations for this txg would actually be OK;
	 * it's intent log allocations that cause trouble.  If we wrote
	 * a log block in this txg and lost power, the log replay would be
	 * based on the DVA translations that had been synced in txg - 1.
	 * Those translations would not include this metaslab's vdev.)
	 */
	metaslab_group_add(mg, msp, txg > TXG_INITIAL ? 0 : size);

	if (txg == 0) {
		/*
		 * We're opening the pool.  Make the metaslab's
		 * free space available immediately.
		 *
		 * The space already allocated according to the on-disk
		 * header (smo_alloc) is charged to the vdev, and
		 * metaslab_sync_done(msp, 0) computes the usable space
		 * and weight from that header.
		 */
		vdev_space_update(vd, size, smo->smo_alloc);
		metaslab_sync_done(msp, 0);
	} else {
		/*
		 * We're adding a new metaslab to an already-open pool
		 * (this path is also taken when creating a pool, i.e.
		 * txg == TXG_INITIAL).
		 * Declare all of the metaslab's space to be free.
		 *
		 * Note that older transaction groups cannot allocate
		 * from this metaslab until its existence is committed,
		 * because we set ms_last_alloc to the current txg.
		 */
		smo->smo_alloc = 0;
		msp->ms_usable_space = size;
		mutex_enter(&msp->ms_lock);
		space_map_add(&msp->ms_map, start, size);
		msp->ms_map_incore = 1;
		mutex_exit(&msp->ms_lock);

		/* XXX -- we'll need a call to picker_init here */
		/*
		 * Mark the metaslab as newly added in this txg and queue it
		 * on the vdev's per-txg metaslab list; metaslab_sync() acts
		 * on MSD_ADD by accounting the new space to the vdev.
		 */
		msp->ms_dirty[txg & TXG_MASK] |= MSD_ADD;
		msp->ms_last_alloc = txg;
		vdev_dirty(vd, VDD_ADD, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
	}

	*mspp = msp;
}
270 
271 void
272 metaslab_fini(metaslab_t *msp)
273 {
274 	int fm;
275 	metaslab_group_t *mg = msp->ms_group;
276 
277 	vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
278 	    -msp->ms_smo->smo_alloc);
279 
280 	metaslab_group_remove(mg, msp);
281 
282 	/* XXX -- we'll need a call to picker_fini here */
283 
284 	mutex_enter(&msp->ms_lock);
285 
286 	space_map_vacate(&msp->ms_map, NULL, NULL);
287 	msp->ms_map_incore = 0;
288 	space_map_destroy(&msp->ms_map);
289 
290 	for (fm = 0; fm < TXG_SIZE; fm++) {
291 		space_map_destroy(&msp->ms_allocmap[fm]);
292 		space_map_destroy(&msp->ms_freemap[fm]);
293 	}
294 
295 	mutex_exit(&msp->ms_lock);
296 
297 	kmem_free(msp, sizeof (metaslab_t));
298 }
299 
/*
 * Write a metaslab to disk in the context of the specified transaction group.
 *
 * With ms_lock held, this:
 *   - accounts the metaslab's size to the vdev if it was added in this txg
 *     (MSD_ADD);
 *   - if anything was allocated or freed in this txg (MSD_ALLOC/MSD_FREE),
 *     creates the space map object on first use, updates the allocated-space
 *     totals, writes this txg's changes (or a condensed image of the whole
 *     map, see below), and copies the updated header into the object's
 *     bonus buffer;
 *   - clears MSD_ALLOC, MSD_FREE and MSD_ADD for this txg (MSD_CONDENSE is
 *     left set).
 *
 * Finally the metaslab is queued on the vdev's metaslab list for
 * TXG_CLEAN(txg).
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *os = spa->spa_meta_objset;
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	uint64_t alloc_delta;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s offset %llx\n", vdev_description(vd), msp->ms_map.sm_start);

	mutex_enter(&msp->ms_lock);

	/* A metaslab added in this txg contributes its size, nothing allocated. */
	if (*dirty & MSD_ADD)
		vdev_space_update(vd, msp->ms_map.sm_size, 0);

	if (*dirty & (MSD_ALLOC | MSD_FREE)) {
		tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

		/*
		 * First change ever recorded for this metaslab: allocate its
		 * space map object and store the object number in the vdev's
		 * metaslab array, in the slot indexed by the metaslab number
		 * (sm_start >> vdev_ms_shift).
		 */
		if (smo->smo_object == 0) {
			ASSERT(smo->smo_objsize == 0);
			ASSERT(smo->smo_alloc == 0);
			smo->smo_object = dmu_object_alloc(os,
			    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
			    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
			ASSERT(smo->smo_object != 0);
			dmu_write(os, vd->vdev_ms_array, sizeof (uint64_t) *
			    (msp->ms_map.sm_start >> vd->vdev_ms_shift),
			    sizeof (uint64_t), &smo->smo_object, tx);
		}

		/*
		 * Net change in allocated space for this txg.  alloc_delta
		 * is unsigned, so a net free is represented by wraparound.
		 * NOTE(review): this relies on vdev_space_update() and
		 * smo_alloc treating the value with modular (or signed)
		 * arithmetic -- confirm against their declarations.
		 */
		alloc_delta = allocmap->sm_space - freemap->sm_space;
		vdev_space_update(vd, 0, alloc_delta);
		smo->smo_alloc += alloc_delta;

		/*
		 * If this txg allocated from the metaslab, nothing is left
		 * in the in-core free map, and we have not already condensed
		 * in this txg, rewrite the on-disk map as a single condensed
		 * image rather than appending this txg's changes.
		 */
		if (msp->ms_last_alloc == txg && msp->ms_map.sm_space == 0 &&
		    (*dirty & MSD_CONDENSE) == 0) {
			space_map_t *sm = &msp->ms_map;
			space_map_t *tsm;
			int i;

			ASSERT(msp->ms_map_incore);

			/*
			 * This txg's frees are folded into freed_map and its
			 * allocations are discarded; both are accounted for
			 * by the image built below.
			 */
			space_map_merge(freemap, freed_map);
			space_map_vacate(allocmap, NULL, NULL);

			/*
			 * Write out the current state of the allocation
			 * world.  The current metaslab is full, minus
			 * stuff that's been freed this txg (freed_map),
			 * minus allocations from txgs in the future.
			 */
			/*
			 * The (empty) in-core map is used as scratch space:
			 * start from the whole range, then remove the ranges
			 * described above.
			 */
			space_map_add(sm, sm->sm_start, sm->sm_size);
			for (i = 1; i < TXG_CONCURRENT_STATES; i++) {
				tsm = &msp->ms_allocmap[(txg + i) & TXG_MASK];
				space_map_iterate(tsm, space_map_remove, sm);
			}
			space_map_iterate(freed_map, space_map_remove, sm);

			space_map_write(sm, smo, os, tx);

			/*
			 * NOTE(review): the assertion on sm implies that
			 * space_map_write() leaves the map empty -- confirm.
			 */
			ASSERT(sm->sm_space == 0);
			ASSERT(freemap->sm_space == 0);
			ASSERT(allocmap->sm_space == 0);

			/* Remember that we condensed, so we do it once per txg. */
			*dirty |= MSD_CONDENSE;
		} else {
			/*
			 * Normal case: sync this txg's allocations, then
			 * its frees (passing freed_map as the destination).
			 */
			space_map_sync(allocmap, NULL, smo, SM_ALLOC, os, tx);
			space_map_sync(freemap, freed_map, smo, SM_FREE,
			    os, tx);
		}

		/* Persist the updated header in the object's bonus buffer. */
		VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);
		ASSERT3U(db->db_size, ==, sizeof (*smo));
		bcopy(smo, db->db_data, db->db_size);
		dmu_buf_rele(db, FTAG);

		dmu_tx_commit(tx);
	}

	*dirty &= ~(MSD_ALLOC | MSD_FREE | MSD_ADD);

	mutex_exit(&msp->ms_lock);

	/* Queue the metaslab for post-sync processing of this txg. */
	(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
}
396 
/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 *
 * Recomputes the usable space and usable end from the space map object
 * header, folds the blocks freed in 'txg' back into the in-core map (or
 * drops the in-core map entirely if the metaslab is full), and re-sorts
 * the metaslab in its group under the resulting weight.
 *
 * metaslab_init() also calls this with txg == 0 when opening a pool; in
 * that case only the usable-space fields and the weight are set up.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	uint64_t weight;
	uint8_t *dirty = &msp->ms_dirty[txg & TXG_MASK];
	space_map_obj_t *smo = msp->ms_smo;

	dprintf("%s offset %llx txg %llu\n",
	    vdev_description(msp->ms_group->mg_vd), msp->ms_map.sm_start, txg);

	mutex_enter(&msp->ms_lock);

	/* metaslab_sync() must already have cleared these for this txg. */
	ASSERT3U((*dirty & (MSD_ALLOC | MSD_FREE | MSD_ADD)), ==, 0);

	msp->ms_usable_space = msp->ms_map.sm_size - smo->smo_alloc;
	msp->ms_usable_end = smo->smo_objsize;

	/* Base weight is the free space; adjusted below when txg != 0. */
	weight = msp->ms_usable_space;

	if (txg != 0) {
		space_map_t *freed_map =
		    &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];

		/* XXX -- we'll need a call to picker_fini here */

		/* If we're empty, don't bother sticking around */
		if (msp->ms_usable_space == 0) {
			space_map_vacate(&msp->ms_map, NULL, NULL);
			msp->ms_map_incore = 0;
			ASSERT3U(freed_map->sm_space, ==, 0);
			weight = 0;
		} else {
			/* Add the freed blocks to the available space map */
			if (msp->ms_map_incore)
				space_map_merge(freed_map, &msp->ms_map);
			else
				space_map_vacate(freed_map, NULL, NULL);
			/*
			 * Any metaslab with free space gets its full size
			 * added to the weight, so it sorts ahead of every
			 * metaslab that has weight 0.
			 */
			weight += msp->ms_map.sm_size;
		}

		if (msp->ms_last_alloc == txg)
			/* Safe to use for allocation now */
			msp->ms_last_alloc = 0;

		*dirty = 0;
	}

	mutex_exit(&msp->ms_lock);

	/* Re-sorting takes the group lock, so do it after dropping ms_lock. */
	metaslab_group_sort(msp->ms_group, msp, weight);
}
452 
453 /*
454  * The first-fit block picker.  No picker_init or picker_fini,
455  * this is just an experiment to see how it feels to separate out
456  * the block selection policy from the map updates.
457  * Note: the 'cursor' argument is a form of PPD.
458  */
459 static uint64_t
460 metaslab_pick_block(space_map_t *sm, uint64_t size, uint64_t *cursor)
461 {
462 	avl_tree_t *t = &sm->sm_root;
463 	uint64_t align = size & -size;
464 	space_seg_t *ss, ssearch;
465 	avl_index_t where;
466 	int tried_once = 0;
467 
468 again:
469 	ssearch.ss_start = *cursor;
470 	ssearch.ss_end = *cursor + size;
471 
472 	ss = avl_find(t, &ssearch, &where);
473 	if (ss == NULL)
474 		ss = avl_nearest(t, where, AVL_AFTER);
475 
476 	while (ss != NULL) {
477 		uint64_t offset = P2ROUNDUP(ss->ss_start, align);
478 
479 		if (offset + size <= ss->ss_end) {
480 			*cursor = offset + size;
481 			return (offset);
482 		}
483 		ss = AVL_NEXT(t, ss);
484 	}
485 
486 	/* If we couldn't find a block after cursor, search again */
487 	if (tried_once == 0) {
488 		tried_once = 1;
489 		*cursor = 0;
490 		goto again;
491 	}
492 
493 	return (-1ULL);
494 }
495 
496 static uint64_t
497 metaslab_getblock(metaslab_t *msp, uint64_t size, uint64_t txg)
498 {
499 	space_map_t *sm = &msp->ms_map;
500 	vdev_t *vd = msp->ms_group->mg_vd;
501 	uint64_t offset;
502 
503 	ASSERT(MUTEX_HELD(&msp->ms_lock));
504 	ASSERT(msp->ms_map_incore);
505 	ASSERT(sm->sm_space != 0);
506 	ASSERT(P2PHASE(size, 1ULL << vd->vdev_ashift) == 0);
507 
508 	offset = metaslab_pick_block(sm, size,
509 	    &msp->ms_map_cursor[highbit(size & -size) - vd->vdev_ashift - 1]);
510 	if (offset != -1ULL) {
511 		space_map_remove(sm, offset, size);
512 		space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
513 	}
514 	return (offset);
515 }
516 
517 /*
518  * Intent log support: upon opening the pool after a crash, notify the SPA
519  * of blocks that the intent log has allocated for immediate write, but
520  * which are still considered free by the SPA because the last transaction
521  * group didn't commit yet.
522  */
523 int
524 metaslab_claim(spa_t *spa, dva_t *dva, uint64_t txg)
525 {
526 	uint64_t vdev = DVA_GET_VDEV(dva);
527 	uint64_t offset = DVA_GET_OFFSET(dva);
528 	uint64_t size = DVA_GET_ASIZE(dva);
529 	objset_t *os = spa->spa_meta_objset;
530 	vdev_t *vd;
531 	metaslab_t *msp;
532 	space_map_t *sm;
533 	space_map_obj_t *smo;
534 	int error;
535 
536 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL)
537 		return (ENXIO);
538 
539 	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count)
540 		return (ENXIO);
541 
542 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
543 	sm = &msp->ms_map;
544 	smo = msp->ms_smo;
545 
546 	if (DVA_GET_GANG(dva))
547 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
548 
549 	mutex_enter(&msp->ms_lock);
550 
551 	if (msp->ms_map_incore == 0) {
552 		error = space_map_load(sm, smo, SM_FREE, os,
553 		    msp->ms_usable_end, sm->sm_size - msp->ms_usable_space);
554 		ASSERT(error == 0);
555 		if (error) {
556 			mutex_exit(&msp->ms_lock);
557 			return (error);
558 		}
559 		msp->ms_map_incore = 1;
560 		/* XXX -- we'll need a call to picker_init here */
561 		bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
562 	}
563 
564 	space_map_remove(sm, offset, size);
565 	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
566 
567 	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
568 		msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
569 		msp->ms_last_alloc = txg;
570 		vdev_dirty(vd, VDD_ALLOC, txg);
571 		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
572 	}
573 
574 	mutex_exit(&msp->ms_lock);
575 
576 	return (0);
577 }
578 
579 static int
580 metaslab_usable(metaslab_t *msp, uint64_t size, uint64_t txg)
581 {
582 	/*
583 	 * Enforce segregation across transaction groups.
584 	 */
585 	/* XXX -- We should probably not assume we know what ms_weight means */
586 	if (msp->ms_last_alloc == txg)
587 		return (msp->ms_map.sm_space >= size && msp->ms_weight >= size);
588 
589 	if (msp->ms_last_alloc != 0)
590 		return (0);
591 
592 	if (msp->ms_map.sm_space >= size && msp->ms_weight >= size)
593 		return (1);
594 
595 	/* XXX -- the weight test should be in terms of MINFREE */
596 	return (msp->ms_usable_space >= size && msp->ms_weight >= size);
597 }
598 
599 static metaslab_t *
600 metaslab_pick(metaslab_group_t *mg, uint64_t size, uint64_t txg)
601 {
602 	metaslab_t *msp;
603 	avl_tree_t *t = &mg->mg_metaslab_tree;
604 
605 	mutex_enter(&mg->mg_lock);
606 	for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp))
607 		if (metaslab_usable(msp, size, txg))
608 			break;
609 	mutex_exit(&mg->mg_lock);
610 
611 	return (msp);
612 }
613 
/*
 * Try to allocate 'size' bytes in 'txg' from some metaslab of the group.
 *
 * Repeatedly asks metaslab_pick() for a candidate and attempts the
 * allocation under that metaslab's lock.  On success the offset is stored
 * in '*offp' and the metaslab is returned.  NULL is returned once
 * metaslab_pick() finds no usable metaslab; '*offp' is then either
 * untouched or -1ULL from a failed attempt.
 */
static metaslab_t *
metaslab_group_alloc(spa_t *spa, metaslab_group_t *mg, uint64_t size,
    uint64_t *offp, uint64_t txg)
{
	metaslab_t *msp;
	int error;

	while ((msp = metaslab_pick(mg, size, txg)) != NULL) {
		space_map_obj_t *smo = msp->ms_smo;
		mutex_enter(&msp->ms_lock);
		/*
		 * metaslab_pick() tested usability under the group lock
		 * only; re-test now that we hold the metaslab's lock.
		 */
		if (!metaslab_usable(msp, size, txg)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}
		/* Load the free map from disk if it is not in core. */
		if (msp->ms_map_incore == 0) {
			error = space_map_load(&msp->ms_map, smo, SM_FREE,
			    spa->spa_meta_objset, msp->ms_usable_end,
			    msp->ms_map.sm_size - msp->ms_usable_space);
			ASSERT(error == 0);
			if (error) {
				/*
				 * Give the metaslab weight 0 so that
				 * metaslab_usable() rejects it for any
				 * non-zero size, and try another one.
				 */
				mutex_exit(&msp->ms_lock);
				metaslab_group_sort(mg, msp, 0);
				continue;
			}
			msp->ms_map_incore = 1;
			/* XXX -- we'll need a call to picker_init here */
			bzero(msp->ms_map_cursor, sizeof (msp->ms_map_cursor));
		}
		*offp = metaslab_getblock(msp, size, txg);
		if (*offp != -1ULL) {
			/*
			 * First allocation from this metaslab in this txg:
			 * mark it dirty and queue it for syncing.
			 */
			if ((msp->ms_dirty[txg & TXG_MASK] & MSD_ALLOC) == 0) {
				vdev_t *vd = mg->mg_vd;
				msp->ms_dirty[txg & TXG_MASK] |= MSD_ALLOC;
				msp->ms_last_alloc = txg;
				vdev_dirty(vd, VDD_ALLOC, txg);
				(void) txg_list_add(&vd->vdev_ms_list,
				    msp, txg);
			}
			mutex_exit(&msp->ms_lock);
			return (msp);
		}
		mutex_exit(&msp->ms_lock);
		/*
		 * No segment could hold a block of this size.  Lower the
		 * weight to just below 'size' so that metaslab_usable()
		 * stops offering this metaslab for requests this large.
		 */
		metaslab_group_sort(msp->ms_group, msp, size - 1);
	}

	return (NULL);
}
661 
662 /*
663  * Allocate a block for the specified i/o.
664  */
665 int
666 metaslab_alloc(spa_t *spa, uint64_t psize, dva_t *dva, uint64_t txg)
667 {
668 	metaslab_t *msp;
669 	metaslab_group_t *mg, *rotor;
670 	metaslab_class_t *mc;
671 	vdev_t *vd;
672 	uint64_t offset = -1ULL;
673 	uint64_t asize;
674 
675 	mc = spa_metaslab_class_select(spa);
676 
677 	/*
678 	 * Start at the rotor and loop through all mgs until we find something.
679 	 * Note that there's no locking on mc_rotor or mc_allocated because
680 	 * nothing actually breaks if we miss a few updates -- we just won't
681 	 * allocate quite as evenly.  It all balances out over time.
682 	 */
683 	mg = rotor = mc->mc_rotor;
684 	do {
685 		vd = mg->mg_vd;
686 		asize = vdev_psize_to_asize(vd, psize);
687 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
688 
689 		msp = metaslab_group_alloc(spa, mg, asize, &offset, txg);
690 		if (msp != NULL) {
691 			ASSERT(offset != -1ULL);
692 
693 			/*
694 			 * If we've just selected this metaslab group,
695 			 * figure out whether the corresponding vdev is
696 			 * over- or under-used relative to the pool,
697 			 * and set an allocation bias to even it out.
698 			 */
699 			if (mc->mc_allocated == 0) {
700 				vdev_stat_t *vs = &vd->vdev_stat;
701 				uint64_t alloc, space;
702 				int64_t vu, su;
703 
704 				alloc = spa_get_alloc(spa);
705 				space = spa_get_space(spa);
706 
707 				/*
708 				 * Determine percent used in units of 0..1024.
709 				 * (This is just to avoid floating point.)
710 				 */
711 				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
712 				su = (alloc << 10) / (space + 1);
713 
714 				/*
715 				 * Bias by at most +/- 25% of the aliquot.
716 				 */
717 				mg->mg_bias = ((su - vu) *
718 				    (int64_t)mg->mg_aliquot) / (1024 * 4);
719 
720 				dprintf("bias = %lld\n", mg->mg_bias);
721 			}
722 
723 			if (atomic_add_64_nv(&mc->mc_allocated, asize) >=
724 			    mg->mg_aliquot + mg->mg_bias) {
725 				mc->mc_rotor = mg->mg_next;
726 				mc->mc_allocated = 0;
727 			}
728 
729 			DVA_SET_VDEV(dva, vd->vdev_id);
730 			DVA_SET_OFFSET(dva, offset);
731 			DVA_SET_GANG(dva, 0);
732 			DVA_SET_ASIZE(dva, asize);
733 
734 			return (0);
735 		}
736 		mc->mc_rotor = mg->mg_next;
737 		mc->mc_allocated = 0;
738 	} while ((mg = mg->mg_next) != rotor);
739 
740 	dprintf("spa=%p, psize=%llu, txg=%llu: no\n", spa, psize, txg);
741 
742 	DVA_SET_VDEV(dva, 0);
743 	DVA_SET_OFFSET(dva, 0);
744 	DVA_SET_GANG(dva, 0);
745 
746 	return (ENOSPC);
747 }
748 
749 /*
750  * Free the block represented by DVA in the context of the specified
751  * transaction group.
752  */
753 void
754 metaslab_free(spa_t *spa, dva_t *dva, uint64_t txg)
755 {
756 	uint64_t vdev = DVA_GET_VDEV(dva);
757 	uint64_t offset = DVA_GET_OFFSET(dva);
758 	uint64_t size = DVA_GET_ASIZE(dva);
759 	vdev_t *vd;
760 	metaslab_t *msp;
761 
762 	if (txg > spa_freeze_txg(spa))
763 		return;
764 
765 	if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
766 		cmn_err(CE_WARN, "metaslab_free(): bad vdev %llu",
767 		    (u_longlong_t)vdev);
768 		ASSERT(0);
769 		return;
770 	}
771 
772 	if ((offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
773 		cmn_err(CE_WARN, "metaslab_free(): bad offset %llu",
774 		    (u_longlong_t)offset);
775 		ASSERT(0);
776 		return;
777 	}
778 
779 	msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
780 
781 	if (DVA_GET_GANG(dva))
782 		size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
783 
784 	mutex_enter(&msp->ms_lock);
785 
786 	if ((msp->ms_dirty[txg & TXG_MASK] & MSD_FREE) == 0) {
787 		msp->ms_dirty[txg & TXG_MASK] |= MSD_FREE;
788 		vdev_dirty(vd, VDD_FREE, txg);
789 		(void) txg_list_add(&vd->vdev_ms_list, msp, txg);
790 	}
791 
792 	space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size);
793 
794 	mutex_exit(&msp->ms_lock);
795 }
796