/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 * Copyright (c) 2017, Intel Corporation.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/space_map.h>
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/spa_impl.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/zap.h>
#include <sys/btree.h>

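/*
 * True if the allocation is for any part of a gang block: either the gang
 * header or one of its children.
 */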
#define	GANG_ALLOCATION(flags) \
	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))

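/*
 * Rough amount of data (in bytes) that a metaslab group allots before the
 * rotor moves on to the next group; scaled by the number of children of the
 * group's top-level vdev in metaslab_group_activate().
 */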
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */

/*
 * In pools where the log space map feature is not enabled we touch
 * multiple metaslabs (and their respective space maps) with each
 * transaction group. Thus, we benefit from having a small space map
 * block size since it allows us to issue more I/O operations scattered
 * around the disk. So a sane default for the space map block size
 * is 8-16K.
 */
int zfs_metaslab_sm_blksz_no_log = (1 << 14);

/*
 * When the log space map feature is enabled, we accumulate a lot of
 * changes per metaslab that are flushed once in a while so we benefit
 * from a bigger block size like 128K for the metaslab space maps.
 */
int zfs_metaslab_sm_blksz_with_log = (1 << 17);

/*
 * The in-core space map representation is more compact than its on-disk form.
 * The zfs_condense_pct determines how much more compact the in-core
 * space map representation must be before we compact it on-disk.
 * Values should be greater than or equal to 100.
 */
int zfs_condense_pct = 200;

/*
 * Condensing a metaslab is not guaranteed to actually reduce the amount of
 * space used on disk. In particular, a space map uses data in increments of
 * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
 * same number of blocks after condensing. Since the goal of condensing is to
 * reduce the number of IOPs required to read the space map, we only want to
 * condense when we can be sure we will reduce the number of blocks used by the
 * space map. Unfortunately, we cannot precisely compute whether or not this is
 * the case in metaslab_should_condense since we are holding ms_lock. Instead,
 * we apply the following heuristic: do not condense a space map unless the
 * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
 * blocks.
 */
int zfs_metaslab_condense_block_threshold = 4;

/*
 * The zfs_mg_noalloc_threshold defines which metaslab groups should
 * be eligible for allocation. The value is defined as a percentage of
 * free space. Metaslab groups that have more free space than
 * zfs_mg_noalloc_threshold are always eligible for allocations. Once
 * a metaslab group's free space is less than or equal to the
 * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
 * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
 * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
 * groups are allowed to accept allocations. Gang blocks are always
 * eligible to allocate on any metaslab group. The default value of 0 means
 * no metaslab group will be excluded based on this criterion.
 */
int zfs_mg_noalloc_threshold = 0;

/*
 * Metaslab groups are considered eligible for allocations if their
 * fragmentation metric (measured as a percentage) is less than or
 * equal to zfs_mg_fragmentation_threshold. If a metaslab group
 * exceeds this threshold then it will be skipped unless all metaslab
 * groups within the metaslab class have also crossed this threshold.
 *
 * This tunable was introduced to avoid edge cases where we continue
 * allocating from very fragmented disks in our pool while other, less
 * fragmented disks, exist. On the other hand, if all disks in the
 * pool are uniformly approaching the threshold, the threshold can
 * be a speed bump in performance, where we keep switching the disks
 * that we allocate from (e.g. we allocate some segments from disk A,
 * pushing it over the threshold, while frees from disk B bring its
 * fragmentation below the threshold).
 *
 * Empirically, we've seen that our vdev selection for allocations is
 * good enough that fragmentation increases uniformly across all vdevs
 * the majority of the time. Thus we set the threshold percentage high
 * enough to avoid hitting the speed bump on pools that are being pushed
 * to the edge.
 */
int zfs_mg_fragmentation_threshold = 95;

/*
 * Allow metaslabs to keep their active state as long as their fragmentation
 * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
 * active metaslab that exceeds this threshold will no longer keep its active
 * status allowing better metaslabs to be selected.
 */
int zfs_metaslab_fragmentation_threshold = 70;

/*
 * When set, all metaslabs will be loaded when the pool is first opened.
 */
int metaslab_debug_load = 0;

/*
 * When set, prevents metaslabs from being unloaded.
 */
int metaslab_debug_unload = 0;

/*
 * Minimum size which forces the dynamic allocator to change
 * its allocation strategy. Once the space map cannot satisfy
 * an allocation of this size then it switches to using a more
 * aggressive strategy (i.e. search by size rather than offset).
 */
uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;

/*
 * The minimum free space, in percent, which must be available
 * in a space map to continue allocations in a first-fit fashion.
 * Once the space map's free space drops below this level we dynamically
 * switch to using best-fit allocations.
 */
int metaslab_df_free_pct = 4;

/*
 * Maximum distance to search forward from the last offset. Without this
 * limit, fragmented pools can see >100,000 iterations and
 * metaslab_block_picker() becomes the performance limiting factor on
 * high-performance storage.
 *
 * With the default setting of 16MB, we typically see less than 500
 * iterations, even with very fragmented, ashift=9 pools. The maximum number
 * of iterations possible is:
 *	metaslab_df_max_search / (2 * (1<<ashift))
 * With the default setting of 16MB this is 16*1024 (with ashift=9) or
 * 2048 (with ashift=12).
 */
int metaslab_df_max_search = 16 * 1024 * 1024;

/*
 * Forces the metaslab_block_picker function to search for at least this many
 * segments forward before giving up on finding a segment that the allocation
 * will fit into.
 */
uint32_t metaslab_min_search_count = 100;

/*
 * If we are not searching forward (due to metaslab_df_max_search,
 * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
 * controls what segment is used. If it is set, we will use the largest free
 * segment. If it is not set, we will use a segment of exactly the requested
 * size (or larger).
 */
int metaslab_df_use_largest_segment = B_FALSE;

/*
 * A metaslab is considered "free" if it contains a contiguous
 * segment which is greater than metaslab_min_alloc_size.
 */
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;

/*
 * Percentage of all CPUs that can be used by the metaslab taskq.
 */
int metaslab_load_pct = 50;

/*
 * These tunables control how long a metaslab will remain loaded after the
 * last allocation from it. A metaslab can't be unloaded until at least
 * metaslab_unload_delay TXGs and metaslab_unload_delay_ms milliseconds
 * have elapsed. However, zfs_metaslab_mem_limit may cause it to be
 * unloaded sooner. These settings are intended to be generous -- to keep
 * metaslabs loaded for a long time, reducing the rate of metaslab loading.
 */
int metaslab_unload_delay = 32;
int metaslab_unload_delay_ms = 10 * 60 * 1000;	/* ten minutes */

/*
 * Max number of metaslabs per group to preload.
 */
int metaslab_preload_limit = 10;

/*
 * Enable/disable preloading of metaslabs.
 */
boolean_t metaslab_preload_enabled = B_TRUE;

/*
 * Enable/disable fragmentation weighting on metaslabs.
 */
boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;

/*
 * Enable/disable lba weighting (i.e. outer tracks are given preference).
 */
boolean_t metaslab_lba_weighting_enabled = B_TRUE;

/*
 * Enable/disable metaslab group biasing.
 */
boolean_t metaslab_bias_enabled = B_TRUE;

/*
 * Enable/disable remapping of indirect DVAs to their concrete vdevs.
 */
boolean_t zfs_remap_blkptr_enable = B_TRUE;

/*
 * Enable/disable segment-based metaslab selection.
 */
boolean_t zfs_metaslab_segment_weight_enabled = B_TRUE;

/*
 * When using segment-based metaslab selection, we will continue
 * allocating from the active metaslab until we have exhausted
 * zfs_metaslab_switch_threshold of its buckets.
 */
int zfs_metaslab_switch_threshold = 2;

/*
 * Internal switch to enable/disable the metaslab allocation tracing
 * facility.
 */
boolean_t metaslab_trace_enabled = B_TRUE;

/*
 * Maximum entries that the metaslab allocation tracing facility will keep
 * in a given list when running in non-debug mode. We limit the number
 * of entries in non-debug mode to prevent us from using up too much memory.
 * The limit should be sufficiently large that we don't expect any allocation
 * to ever exceed this value. In debug mode, the system will panic if this
 * limit is ever reached, allowing for further investigation.
 */
uint64_t metaslab_trace_max_entries = 5000;

/*
 * Maximum number of metaslabs per group that can be disabled
 * simultaneously.
 */
int max_disabled_ms = 3;

/*
 * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
 * To avoid 64-bit overflow, don't set above UINT32_MAX.
 */
unsigned long zfs_metaslab_max_size_cache_sec = 3600;	/* 1 hour */

/*
 * Maximum percentage of memory to use on storing loaded metaslabs. If loading
 * a metaslab would take it over this percentage, the oldest selected metaslab
 * is automatically unloaded.
 */
int zfs_metaslab_mem_limit = 75;

/*
 * Force the per-metaslab range trees to use 64-bit integers to store
 * segments. Used for debugging purposes.
 */
boolean_t zfs_metaslab_force_large_segs = B_FALSE;

/*
 * By default we only store segments over a certain size in the size-sorted
 * metaslab trees (ms_allocatable_by_size and
 * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
 * improves load and unload times at the cost of causing us to use slightly
 * larger segments than we would otherwise in some cases.
 */
uint32_t metaslab_by_size_min_shift = 14;

static uint64_t metaslab_weight(metaslab_t *);
static void metaslab_set_fragmentation(metaslab_t *);
static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
static unsigned int metaslab_idx_func(multilist_t *, void *);
static void metaslab_evict(metaslab_t *, uint64_t);
static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);

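/* kmem cache for the entries used by the allocation tracing facility. */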
kmem_cache_t *metaslab_alloc_trace_cache;

typedef struct metaslab_stats {
	kstat_named_t metaslabstat_trace_over_limit;
	kstat_named_t metaslabstat_df_find_under_floor;
	kstat_named_t metaslabstat_reload_tree;
} metaslab_stats_t;

static metaslab_stats_t metaslab_stats = {
	{ "trace_over_limit", KSTAT_DATA_UINT64 },
	{ "df_find_under_floor", KSTAT_DATA_UINT64 },
	{ "reload_tree", KSTAT_DATA_UINT64 },
};

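/* Atomically bump one of the metaslab kstat counters defined above. */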
#define	METASLABSTAT_BUMP(stat) \
	atomic_inc_64(&metaslab_stats.stat.value.ui64);

kstat_t *metaslab_ksp;

void
metaslab_stat_init(void)
{
	ASSERT(metaslab_alloc_trace_cache == NULL);
	metaslab_alloc_trace_cache = kmem_cache_create(
	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);
	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (metaslab_ksp != NULL) {
		metaslab_ksp->ks_data = &metaslab_stats;
		kstat_install(metaslab_ksp);
	}
}

void
metaslab_stat_fini(void)
{
	if (metaslab_ksp != NULL) {
		kstat_delete(metaslab_ksp);
		metaslab_ksp = NULL;
	}

	kmem_cache_destroy(metaslab_alloc_trace_cache);
	metaslab_alloc_trace_cache = NULL;
}

/*
 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < spa->spa_alloc_count; i++)
		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (zfs_refcount_t));
	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
	    sizeof (uint64_t));
	mutex_destroy(&mc->mc_lock);
	multilist_destroy(mc->mc_metaslab_txg_list);
	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

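/*
 * Atomically apply the given deltas to the class-wide allocated, deferred,
 * total, and deflated space counts.
 */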
static void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

void
metaslab_class_histogram_verify(metaslab_class_t *mc)
{
	spa_t *spa = mc->mc_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t *mc_hist;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels, or
		 * vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
			mc_hist[i] += mg->mg_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);

	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

/*
 * Calculate the metaslab class's fragmentation metric. The metric
 * is weighted based on the space contribution of each metaslab group.
 * The return value will be a number between 0 and 100 (inclusive), or
 * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
 * zfs_frag_table for more information about the metric.
 */
uint64_t
metaslab_class_fragmentation(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t fragmentation = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip any holes, uninitialized top-levels,
		 * or vdevs that are not in this metaslab class.
		 */
		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * If a metaslab group does not contain a fragmentation
		 * metric then just bail out.
		 */
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
			return (ZFS_FRAG_INVALID);
		}

		/*
		 * Determine how much this metaslab_group is contributing
		 * to the overall pool fragmentation metric.
		 */
		fragmentation += mg->mg_fragmentation *
		    metaslab_group_get_space(mg);
	}
	fragmentation /= metaslab_class_get_space(mc);

	ASSERT3U(fragmentation, <=, 100);
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (fragmentation);
}

/*
 * Calculate the amount of expandable space that is available in
 * this metaslab class. If a device is expanded then its expandable
 * space will be the amount of allocatable space that is currently not
 * part of this metaslab class.
 */
uint64_t
metaslab_class_expandable_space(metaslab_class_t *mc)
{
	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
	uint64_t space = 0;

	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
	for (int c = 0; c < rvd->vdev_children; c++) {
		uint64_t tspace;
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
		    mg->mg_class != mc) {
			continue;
		}

		/*
		 * Calculate if we have enough space to add additional
		 * metaslabs. We report the expandable space in terms
		 * of the metaslab size since that's the unit of expansion.
		 * Adjust by the EFI system partition size.
		 */
		tspace = tvd->vdev_max_asize - tvd->vdev_asize;
		if (tspace > mc->mc_spa->spa_bootsize) {
			tspace -= mc->mc_spa->spa_bootsize;
		}
		space += P2ALIGN(tspace, 1ULL << tvd->vdev_ms_shift);
	}
	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
	return (space);
}

void
metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
{
	multilist_t *ml = mc->mc_metaslab_txg_list;
	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
		metaslab_t *msp = multilist_sublist_head(mls);
		multilist_sublist_unlock(mls);
		while (msp != NULL) {
			mutex_enter(&msp->ms_lock);

			/*
			 * If the metaslab has been removed from the list
			 * (which could happen if we were at the memory limit
			 * and it was evicted during this loop), then we can't
			 * proceed and we should restart the sublist.
			 */
			if (!multilist_link_active(&msp->ms_class_txg_node)) {
				mutex_exit(&msp->ms_lock);
				i--;
				break;
			}
			mls = multilist_sublist_lock(ml, i);
			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
			multilist_sublist_unlock(mls);
			if (txg >
			    msp->ms_selected_txg + metaslab_unload_delay &&
			    gethrtime() > msp->ms_selected_time +
			    (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
				metaslab_evict(msp, txg);
			} else {
				/*
				 * Once we've hit a metaslab selected too
				 * recently to evict, we're done evicting for
				 * now.
				 */
				mutex_exit(&msp->ms_lock);
				break;
			}
			mutex_exit(&msp->ms_lock);
			msp = next_msp;
		}
	}
}

static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = (const metaslab_t *)x1;
	const metaslab_t *m2 = (const metaslab_t *)x2;

	int sort1 = 0;
	int sort2 = 0;
	if (m1->ms_allocator != -1 && m1->ms_primary)
		sort1 = 1;
	else if (m1->ms_allocator != -1 && !m1->ms_primary)
		sort1 = 2;
	if (m2->ms_allocator != -1 && m2->ms_primary)
		sort2 = 1;
	else if (m2->ms_allocator != -1 && !m2->ms_primary)
		sort2 = 2;

	/*
	 * Sort inactive metaslabs first, then primaries, then secondaries.
	 * When selecting a metaslab to allocate from, an allocator first
	 * tries its primary, then secondary active metaslab. If it doesn't
	 * have active metaslabs, or can't allocate from them, it searches
	 * for an inactive metaslab to activate. If it can't find a suitable
	 * one, it will steal a primary or secondary metaslab from another
	 * allocator.
	 */
	if (sort1 < sort2)
		return (-1);
	if (sort1 > sort2)
		return (1);

	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
	if (likely(cmp))
		return (cmp);

	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);

	return (TREE_CMP(m1->ms_start, m2->ms_start));
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
/*
 * Update the allocatable flag and the metaslab group's capacity.
 * The allocatable flag is set to true if the capacity is below
 * the zfs_mg_noalloc_threshold or has a fragmentation value that is
 * greater than zfs_mg_fragmentation_threshold. If a metaslab group
 * transitions from allocatable to non-allocatable or vice versa then the
 * metaslab group's class is updated to reflect the transition.
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;
	boolean_t was_initialized;

	ASSERT(vd == vd->vdev_top);
	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==,
	    SCL_ALLOC);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;
	was_initialized = mg->mg_initialized;

	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mutex_enter(&mc->mc_lock);

	/*
	 * If the metaslab group was just added then it won't
	 * have any space until we finish syncing out this txg.
	 * At that point we will consider it initialized and available
	 * for allocations. We also don't consider non-activated
	 * metaslab groups (e.g. vdevs that are in the middle of being removed)
	 * to be initialized, because they can't be used for allocation.
	 */
	mg->mg_initialized = metaslab_group_initialized(mg);
	if (!was_initialized && mg->mg_initialized) {
		mc->mc_groups++;
	} else if (was_initialized && !mg->mg_initialized) {
		ASSERT3U(mc->mc_groups, >, 0);
		mc->mc_groups--;
	}
	if (mg->mg_initialized)
		mg->mg_no_free_space = B_FALSE;

	/*
	 * A metaslab group is considered allocatable if it has plenty
	 * of free space or is not heavily fragmented. We only take
	 * fragmentation into account if the metaslab group has a valid
	 * fragmentation metric (i.e. a value between 0 and 100).
	 */
	mg->mg_allocatable = (mg->mg_activation_count > 0 &&
	    mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
	    (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
	    mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold, making all
	 * groups eligible for allocations. This effectively means that all
	 * devices are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mc->mc_lock);

	mutex_exit(&mg->mg_lock);
}

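/*
 * Order metaslabs by their unflushed txg (oldest first), breaking ties by
 * vdev id and then metaslab id so that the ordering is total.
 */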
int
metaslab_sort_by_flushed(const void *va, const void *vb)
{
	const metaslab_t *a = va;
	const metaslab_t *b = vb;

	int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg);
	if (likely(cmp))
		return (cmp);

	uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id;
	uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id;
	cmp = TREE_CMP(a_vdev_id, b_vdev_id);
	if (cmp)
		return (cmp);

	return (TREE_CMP(a->ms_id, b->ms_id));
}

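/*
 * Allocate and initialize a metaslab group for vdev 'vd': the per-allocator
 * primary/secondary arrays, the weight-sorted AVL tree of metaslabs, and
 * the taskq used to load metaslabs in the background.
 */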
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
	    KM_SLEEP);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;
	mg->mg_initialized = B_FALSE;
	mg->mg_no_free_space = B_TRUE;
	mg->mg_allocators = allocators;

	mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (zfs_refcount_t), KM_SLEEP);
	mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
	    sizeof (uint64_t), KM_SLEEP);
	for (int i = 0; i < allocators; i++) {
		zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}

	mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
	    minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	taskq_destroy(mg->mg_taskq);
	avl_destroy(&mg->mg_metaslab_tree);
	kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
	kmem_free(mg->mg_secondaries, mg->mg_allocators *
	    sizeof (metaslab_t *));
	mutex_destroy(&mg->mg_lock);
	mutex_destroy(&mg->mg_ms_disabled_lock);
	cv_destroy(&mg->mg_ms_disabled_cv);

	for (int i = 0; i < mg->mg_allocators; i++) {
		zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
		mg->mg_cur_max_alloc_queue_depth[i] = 0;
	}
	kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
	    sizeof (zfs_refcount_t));
	kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
	    sizeof (uint64_t));

	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0);

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
	metaslab_group_alloc_update(mg);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

/*
 * Passivate a metaslab group and remove it from the allocation rotor.
 * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating
 * a metaslab group. This function will momentarily drop spa_config_locks
 * that are lower than the SCL_ALLOC lock (see comment below).
 */
void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	spa_t *spa = mc->mc_spa;
	metaslab_group_t *mgprev, *mgnext;
	int locks = spa_config_held(spa, SCL_ALL, RW_WRITER);

	ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==,
	    (SCL_ALLOC | SCL_ZIO));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	/*
	 * The spa_config_lock is an array of rwlocks, ordered as
	 * follows (from highest to lowest):
	 *	SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC >
	 *	SCL_ZIO > SCL_FREE > SCL_VDEV
	 * (For more information about the spa_config_lock see spa_misc.c)
	 * The higher the lock, the broader its coverage. When we passivate
	 * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO
	 * config locks. However, the metaslab group's taskq might be trying
	 * to preload metaslabs so we must drop the SCL_ZIO lock and any
	 * lower locks to allow the I/O to complete. At a minimum,
	 * we continue to hold the SCL_ALLOC lock, which prevents any future
	 * allocations from taking place and any changes to the vdev tree.
	 */
	spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
	taskq_wait(mg->mg_taskq);
	spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
	metaslab_group_alloc_update(mg);
	for (int i = 0; i < mg->mg_allocators; i++) {
		metaslab_t *msp = mg->mg_primaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
		msp = mg->mg_secondaries[i];
		if (msp != NULL) {
			mutex_enter(&msp->ms_lock);
			metaslab_passivate(msp,
			    metaslab_weight_from_range_tree(msp));
			mutex_exit(&msp->ms_lock);
		}
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}

boolean_t
metaslab_group_initialized(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	vdev_stat_t *vs = &vd->vdev_stat;

	return (vs->vs_space != 0 && mg->mg_activation_count > 0);
}

uint64_t
metaslab_group_get_space(metaslab_group_t *mg)
{
	return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
}

void
metaslab_group_histogram_verify(metaslab_group_t *mg)
{
	uint64_t *mg_hist;
	vdev_t *vd = mg->mg_vd;
	uint64_t ashift = vd->vdev_ashift;
	int i;

	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
		return;

	mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
	    KM_SLEEP);

	ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
	    SPACE_MAP_HISTOGRAM_SIZE + ashift);

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		/* skip if not active or not a member */
		if (msp->ms_sm == NULL || msp->ms_group != mg)
			continue;

		for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
			mg_hist[i + ashift] +=
			    msp->ms_sm->sm_phys->smp_histogram[i];
	}

	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
		VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);

	kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
}

static void
metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		mg->mg_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] +=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

void
metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	metaslab_class_t *mc = mg->mg_class;
	uint64_t ashift = mg->mg_vd->vdev_ashift;

	ASSERT(MUTEX_HELD(&msp->ms_lock));
	if (msp->ms_sm == NULL)
		return;

	mutex_enter(&mg->mg_lock);
	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
		ASSERT3U(mg->mg_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);
		ASSERT3U(mc->mc_histogram[i + ashift], >=,
		    msp->ms_sm->sm_phys->smp_histogram[i]);

		mg->mg_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
		mc->mc_histogram[i + ashift] -=
		    msp->ms_sm->sm_phys->smp_histogram[i];
	}
	mutex_exit(&mg->mg_lock);
}

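/*
 * Add a metaslab to its group's weight-sorted tree (with an initial weight
 * of zero) and fold its space map histogram into the group's.
 */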
static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	ASSERT(msp->ms_group == NULL);
	mutex_enter(&mg->mg_lock);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);

	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_add(mg, msp);
	mutex_exit(&msp->ms_lock);
}

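/*
 * Remove a metaslab from its group: subtract its histogram contribution,
 * pull it off the group's tree, and unlink it from the class's
 * selected-txg multilist if it is still linked there.
 */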
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&msp->ms_lock);
	metaslab_group_histogram_remove(mg, msp);
	mutex_exit(&msp->ms_lock);

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);

	metaslab_class_t *mc = msp->ms_group->mg_class;
	multilist_sublist_t *mls =
	    multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
	if (multilist_link_active(&msp->ms_class_txg_node))
		multilist_sublist_remove(mls, msp);
	multilist_sublist_unlock(mls);

	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

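/*
 * Re-sort a metaslab within its group's tree after a weight change. The
 * caller must hold both the metaslab's and the group's locks.
 */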
static void
metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp,
    uint64_t weight)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));
	ASSERT(MUTEX_HELD(&mg->mg_lock));
	ASSERT(msp->ms_group == mg);

	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 511].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	metaslab_group_sort_impl(mg, msp, weight);
	mutex_exit(&mg->mg_lock);
}

/*
 * Calculate the fragmentation for a given metaslab group. We can use
 * a simple average here since all metaslabs within the group must have
 * the same size. The return value will be a value between 0 and 100
 * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in
 * this group have a fragmentation metric.
 */
uint64_t
metaslab_group_fragmentation(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	uint64_t fragmentation = 0;
	uint64_t valid_ms = 0;

	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
			continue;
		if (msp->ms_group != mg)
			continue;

		valid_ms++;
		fragmentation += msp->ms_fragmentation;
	}

	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
		return (ZFS_FRAG_INVALID);

	fragmentation /= valid_ms;
	ASSERT3U(fragmentation, <=, 100);
	return (fragmentation);
}

/*
 * Determine if a given metaslab group should skip allocations. A metaslab
 * group should avoid allocations if its free capacity is less than the
 * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
 * zfs_mg_fragmentation_threshold and there is at least one metaslab group
 * that can still handle allocations. If the allocation throttle is enabled
 * then we skip allocations to devices that have reached their maximum
 * allocation queue depth unless the selected metaslab group is the only
 * eligible group remaining.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
    uint64_t psize, int allocator, int d)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/*
	 * We can only consider skipping this metaslab group if it's
	 * in the normal metaslab class and there are other metaslab
	 * groups to select from. Otherwise, we always consider it eligible
	 * for allocations.
	 */
	if ((mc != spa_normal_class(spa) &&
	    mc != spa_special_class(spa) &&
	    mc != spa_dedup_class(spa)) ||
	    mc->mc_groups <= 1)
		return (B_TRUE);

	/*
	 * If the metaslab group's mg_allocatable flag is set (see comments
	 * in metaslab_group_alloc_update() for more information) and
	 * the allocation throttle is disabled then allow allocations to this
	 * device. However, if the allocation throttle is enabled then
	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
	 * to determine if we should allow allocations to this metaslab group.
	 * If all metaslab groups are no longer considered allocatable
	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
	 * gang block size then we allow allocations on this metaslab group
	 * regardless of the mg_allocatable or throttle settings.
	 */
	if (mg->mg_allocatable) {
		metaslab_group_t *mgp;
		int64_t qdepth;
		uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];

		if (!mc->mc_alloc_throttle_enabled)
			return (B_TRUE);

		/*
		 * If this metaslab group does not have any free space, then
		 * there is no point in looking further.
		 */
		if (mg->mg_no_free_space)
			return (B_FALSE);

		/*
		 * Relax allocation throttling for ditto blocks. Due to
		 * random imbalances in allocation it tends to push copies
		 * to one vdev that looks a bit better at the moment.
		 */
		qmax = qmax * (4 + d) / 4;

		qdepth = zfs_refcount_count(
		    &mg->mg_alloc_queue_depth[allocator]);

		/*
		 * If this metaslab group is below its qmax or it's
		 * the only allocatable metaslab group, then attempt
		 * to allocate from it.
		 */
		if (qdepth < qmax || mc->mc_alloc_groups == 1)
			return (B_TRUE);
		ASSERT3U(mc->mc_alloc_groups, >, 1);

		/*
		 * Since this metaslab group is at or over its qmax, we
		 * need to determine if there are metaslab groups after this
		 * one that might be able to handle this allocation. This is
		 * racy since we can't hold the locks for all metaslab
		 * groups at the same time when we make this check.
		 */
		for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
			qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
			qmax = qmax * (4 + d) / 4;
			qdepth = zfs_refcount_count(
			    &mgp->mg_alloc_queue_depth[allocator]);

			/*
			 * If there is another metaslab group that
			 * might be able to handle the allocation, then
			 * we return false so that we skip this group.
			 */
			if (qdepth < qmax && !mgp->mg_no_free_space)
				return (B_FALSE);
		}

		/*
		 * We didn't find another group to handle the allocation
		 * so we can't skip this metaslab group even though
		 * we are at or over our qmax.
		 */
		return (B_TRUE);

	} else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * ==========================================================================
 * Range tree callbacks
 * ==========================================================================
 */

/*
 * Comparison function for the private size-ordered tree using 32-bit
 * ranges. Tree is sorted by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize32_compare(const void *x1, const void *x2)
{
	const range_seg32_t *r1 = x1;
	const range_seg32_t *r2 = x2;

	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	int cmp = TREE_CMP(rs_size1, rs_size2);
	if (likely(cmp))
		return (cmp);

	return (TREE_CMP(r1->rs_start, r2->rs_start));
}

/*
 * Comparison function for the private size-ordered tree using 64-bit
 * ranges. Tree is sorted by size, larger sizes at the end of the tree.
 */
static int
metaslab_rangesize64_compare(const void *x1, const void *x2)
{
	const range_seg64_t *r1 = x1;
	const range_seg64_t *r2 = x2;

	uint64_t rs_size1 = r1->rs_end - r1->rs_start;
	uint64_t rs_size2 = r2->rs_end - r2->rs_start;

	int cmp = TREE_CMP(rs_size1, rs_size2);
	if (likely(cmp))
		return (cmp);

	return (TREE_CMP(r1->rs_start, r2->rs_start));
}
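
/*
 * Argument for the range tree callbacks below: the size-sorted B-tree that
 * shadows the range tree, and the minimum segment size (expressed as a
 * shift) that the tree keeps track of.
 */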
typedef struct metaslab_rt_arg {
	zfs_btree_t *mra_bt;
	uint32_t mra_floor_shift;
} metaslab_rt_arg_t;

struct mssa_arg {
	range_tree_t *rt;
	metaslab_rt_arg_t *mra;
};

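/*
 * range_tree_walk() callback used by metaslab_size_tree_full_load() to
 * insert one segment into the size-sorted tree.
 */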
static void
metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size)
{
	struct mssa_arg *mssap = arg;
	range_tree_t *rt = mssap->rt;
	metaslab_rt_arg_t *mrap = mssap->mra;
	range_seg_max_t seg = {0};
	rs_set_start(&seg, rt, start);
	rs_set_end(&seg, rt, start + size);
	metaslab_rt_add(rt, &seg, mrap);
}

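/*
 * Populate the size-sorted tree with every segment in the range tree. The
 * floor shift is cleared first so that all segments, regardless of size,
 * are inserted.
 */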
static void
metaslab_size_tree_full_load(range_tree_t *rt)
{
	metaslab_rt_arg_t *mrap = rt->rt_arg;
#ifdef _METASLAB_TRACING
	METASLABSTAT_BUMP(metaslabstat_reload_tree);
#endif
	ASSERT0(zfs_btree_numnodes(mrap->mra_bt));
	mrap->mra_floor_shift = 0;
	struct mssa_arg arg = {0};
	arg.rt = rt;
	arg.mra = mrap;
	range_tree_walk(rt, metaslab_size_sorted_add, &arg);
}

/*
 * Create any block allocator specific components. The current allocators
 * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
 */
/* ARGSUSED */
static void
metaslab_rt_create(range_tree_t *rt, void *arg)
{
	metaslab_rt_arg_t *mrap = arg;
	zfs_btree_t *size_tree = mrap->mra_bt;

	size_t size;
	int (*compare) (const void *, const void *);
	switch (rt->rt_type) {
	case RANGE_SEG32:
		size = sizeof (range_seg32_t);
		compare = metaslab_rangesize32_compare;
		break;
	case RANGE_SEG64:
		size = sizeof (range_seg64_t);
		compare = metaslab_rangesize64_compare;
		break;
	default:
		panic("Invalid range seg type %d", rt->rt_type);
	}
	zfs_btree_create(size_tree, compare, size);
	mrap->mra_floor_shift = metaslab_by_size_min_shift;
}

/* ARGSUSED */
static void
metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
	metaslab_rt_arg_t *mrap = arg;
	zfs_btree_t *size_tree = mrap->mra_bt;

	zfs_btree_destroy(size_tree);
	kmem_free(mrap, sizeof (*mrap));
}

/* ARGSUSED */
static void
metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_rt_arg_t *mrap = arg;
	zfs_btree_t *size_tree = mrap->mra_bt;

	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) <
	    (1 << mrap->mra_floor_shift))
		return;

	zfs_btree_add(size_tree, rs);
}

/* ARGSUSED */
static void
metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
	metaslab_rt_arg_t *mrap = arg;
	zfs_btree_t *size_tree = mrap->mra_bt;

	if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 <<
	    mrap->mra_floor_shift))
		return;

	zfs_btree_remove(size_tree, rs);
}

/* ARGSUSED */
static void
metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
	metaslab_rt_arg_t *mrap = arg;
	zfs_btree_t *size_tree = mrap->mra_bt;
	zfs_btree_clear(size_tree);
	zfs_btree_destroy(size_tree);

	metaslab_rt_create(rt, arg);
}

static range_tree_ops_t metaslab_rt_ops = {
	.rtop_create = metaslab_rt_create,
	.rtop_destroy = metaslab_rt_destroy,
	.rtop_add = metaslab_rt_add,
	.rtop_remove = metaslab_rt_remove,
	.rtop_vacate = metaslab_rt_vacate
};

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_largest_allocatable(metaslab_t *msp)
{
	zfs_btree_t *t = &msp->ms_allocatable_by_size;
	range_seg_t *rs;

	if (t == NULL)
		return (0);
	if (zfs_btree_numnodes(t) == 0)
		metaslab_size_tree_full_load(msp->ms_allocatable);

	rs = zfs_btree_last(t, NULL);
	if (rs == NULL)
		return (0);

	return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs,
	    msp->ms_allocatable));
}

/*
 * Return the maximum contiguous segment within the unflushed frees of this
 * metaslab.
 */
uint64_t
metaslab_largest_unflushed_free(metaslab_t *msp)
{
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if (msp->ms_unflushed_frees == NULL)
		return (0);

	if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0)
		metaslab_size_tree_full_load(msp->ms_unflushed_frees);
	range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size,
	    NULL);
	if (rs == NULL)
		return (0);

	/*
	 * When a range is freed from the metaslab, that range is added to
	 * both the unflushed frees and the deferred frees. While the block
	 * will eventually be usable, if the metaslab were loaded the range
	 * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE
	 * txgs had passed. As a result, when attempting to estimate an upper
	 * bound for the largest currently-usable free segment in the
	 * metaslab, we need to not consider any ranges currently in the defer
	 * trees. This algorithm approximates the largest available chunk in
	 * the largest range in the unflushed_frees tree by taking the first
	 * chunk. While this may be a poor estimate, it should only remain so
	 * briefly and should eventually self-correct as frees are no longer
	 * deferred. Similar logic applies to the ms_freed tree. See
	 * metaslab_load() for more details.
	 *
	 * There are two primary sources of inaccuracy in this estimate. Both
	 * are tolerated for performance reasons. The first source is that we
	 * only check the largest segment for overlaps. Smaller segments may
	 * have more favorable overlaps with the other trees, resulting in
	 * larger usable chunks. Second, we only look at the first chunk in
	 * the largest segment; there may be other usable chunks in the
	 * largest segment, but we ignore them.
	 */
	uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees);
	uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart;
	for (int t = 0; t < TXG_DEFER_SIZE; t++) {
		uint64_t start = 0;
		uint64_t size = 0;
		boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart,
		    rsize, &start, &size);
		if (found) {
			if (rstart == start)
				return (0);
			rsize = start - rstart;
		}
	}

	uint64_t start = 0;
	uint64_t size = 0;
	boolean_t found = range_tree_find_in(msp->ms_freed, rstart,
	    rsize, &start, &size);
	if (found)
		rsize = start - rstart;

	return (rsize);
}

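/*
 * Find the first segment in the B-tree that could contain an allocation of
 * 'size' bytes starting at or after 'start'. If there is no exact match
 * for the search key, the next segment in tree order is returned (its
 * position is recorded in *where).
 */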
1536 static range_seg_t *
metaslab_block_find(zfs_btree_t * t,range_tree_t * rt,uint64_t start,uint64_t size,zfs_btree_index_t * where)1537 metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start,
1538 uint64_t size, zfs_btree_index_t *where)
1539 {
1540 range_seg_t *rs;
1541 range_seg_max_t rsearch;
1542
1543 rs_set_start(&rsearch, rt, start);
1544 rs_set_end(&rsearch, rt, start + size);
1545
1546 rs = zfs_btree_find(t, &rsearch, where);
1547 if (rs == NULL) {
1548 rs = zfs_btree_next(t, where, where);
1549 }
1550
1551 return (rs);
1552 }
1553
1554 /*
1555 * This is a helper function that can be used by the allocator to find a
1556 * suitable block to allocate. This will search the specified B-tree looking
1557 * for a block that matches the specified criteria.
1558 */
1559 static uint64_t
metaslab_block_picker(range_tree_t * rt,uint64_t * cursor,uint64_t size,uint64_t max_search)1560 metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size,
1561 uint64_t max_search)
1562 {
1563 if (*cursor == 0)
1564 *cursor = rt->rt_start;
1565 zfs_btree_t *bt = &rt->rt_root;
1566 zfs_btree_index_t where;
1567 range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where);
1568 uint64_t first_found;
1569 int count_searched = 0;
1570
1571 if (rs != NULL)
1572 first_found = rs_get_start(rs, rt);
1573
1574 while (rs != NULL && (rs_get_start(rs, rt) - first_found <=
1575 max_search || count_searched < metaslab_min_search_count)) {
1576 uint64_t offset = rs_get_start(rs, rt);
1577 if (offset + size <= rs_get_end(rs, rt)) {
1578 *cursor = offset + size;
1579 return (offset);
1580 }
1581 rs = zfs_btree_next(bt, &where, &where);
1582 count_searched++;
1583 }
1584
1585 *cursor = 0;
1586 return (-1ULL);
1587 }
1588
1589 /*
1590 * ==========================================================================
1591 * Dynamic Fit (df) block allocator
1592 *
1593 * Search for a free chunk of at least this size, starting from the last
1594 * offset (for this alignment of block) looking for up to
1595 * metaslab_df_max_search bytes (16MB). If a large enough free chunk is not
1596 * found within 16MB, then return a free chunk of exactly the requested size (or
1597 * larger).
1598 *
1599 * If it seems like searching from the last offset will be unproductive, skip
1600 * that and just return a free chunk of exactly the requested size (or larger).
1601 * This is based on metaslab_df_alloc_threshold and metaslab_df_free_pct. This
1602 * mechanism is probably not very useful and may be removed in the future.
1603 *
1604 * The behavior when not searching can be changed to return the largest free
1605 * chunk, instead of a free chunk of exactly the requested size, by setting
1606 * metaslab_df_use_largest_segment.
1607 * ==========================================================================
1608 */
1609 static uint64_t
1610 metaslab_df_alloc(metaslab_t *msp, uint64_t size)
1611 {
1612 /*
1613 * Find the largest power of 2 block size that evenly divides the
1614 * requested size. This is used to try to allocate blocks with similar
1615 * alignment from the same area of the metaslab (i.e. same cursor
1616 * bucket), though it does not prevent allocations of other sizes
1617 * from landing in the same region.
1618 */
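/*
 * For example (hypothetical request): a 24K (0x6000) allocation yields
 * align = 0x2000, so the 8K-alignment cursor bucket
 * (ms_lbas[highbit64(0x2000) - 1], i.e. ms_lbas[13]) is used.
 */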
1619 uint64_t align = size & -size;
1620 uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
1621 range_tree_t *rt = msp->ms_allocatable;
1622 int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
1623 uint64_t offset;
1624
1625 ASSERT(MUTEX_HELD(&msp->ms_lock));
1626
1627 /*
1628 * If we're running low on space, find a segment based on size,
1629 * rather than iterating based on offset.
1630 */
1631 if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
1632 free_pct < metaslab_df_free_pct) {
1633 offset = -1;
1634 } else {
1635 offset = metaslab_block_picker(rt,
1636 cursor, size, metaslab_df_max_search);
1637 }
1638
1639 if (offset == -1) {
1640 range_seg_t *rs;
1641 if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
1642 metaslab_size_tree_full_load(msp->ms_allocatable);
1643 if (metaslab_df_use_largest_segment) {
1644 /* use largest free segment */
1645 rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
1646 } else {
1647 zfs_btree_index_t where;
1648 /* use segment of this size, or next largest */
1649 #ifdef _METASLAB_TRACING
1650 metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
1651 if (size < (1 << mrap->mra_floor_shift)) {
1652 METASLABSTAT_BUMP(
1653 metaslabstat_df_find_under_floor);
1654 }
1655 #endif
1656 rs = metaslab_block_find(&msp->ms_allocatable_by_size,
1657 rt, msp->ms_start, size, &where);
1658 }
1659 if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
1660 rt)) {
1661 offset = rs_get_start(rs, rt);
1662 *cursor = offset + size;
1663 }
1664 }
1665
1666 return (offset);
1667 }
1668
1669 static metaslab_ops_t metaslab_df_ops = {
1670 metaslab_df_alloc
1671 };
1672
1673 /*
1674 * ==========================================================================
1675 * Cursor fit block allocator -
1676 * Select the largest region in the metaslab, set the cursor to the beginning
1677 * of the range and the cursor_end to the end of the range. As allocations
1678 * are made advance the cursor. Continue allocating from the cursor until
1679 * the range is exhausted and then find a new range.
1680 * ==========================================================================
1681 */
1682 static uint64_t
1683 metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
1684 {
1685 range_tree_t *rt = msp->ms_allocatable;
1686 zfs_btree_t *t = &msp->ms_allocatable_by_size;
1687 uint64_t *cursor = &msp->ms_lbas[0];
1688 uint64_t *cursor_end = &msp->ms_lbas[1];
1689 uint64_t offset = 0;
1690
1691 ASSERT(MUTEX_HELD(&msp->ms_lock));
1692
1693 ASSERT3U(*cursor_end, >=, *cursor);
1694
1695 if ((*cursor + size) > *cursor_end) {
1696 range_seg_t *rs;
1697
1698 if (zfs_btree_numnodes(t) == 0)
1699 metaslab_size_tree_full_load(msp->ms_allocatable);
1700 rs = zfs_btree_last(t, NULL);
1701 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) <
1702 size)
1703 return (-1ULL);
1704
1705 *cursor = rs_get_start(rs, rt);
1706 *cursor_end = rs_get_end(rs, rt);
1707 }
1708
1709 offset = *cursor;
1710 *cursor += size;
1711
1712 return (offset);
1713 }
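
/*
 * Worked example for the cursor-fit scheme above (hypothetical numbers):
 * with the largest free segment spanning [0x10000, 0x50000), the first
 * 0x8000-byte allocation sets *cursor = 0x10000 and *cursor_end = 0x50000,
 * returning 0x10000; subsequent calls return 0x18000, 0x20000, and so on,
 * until fewer than the requested bytes remain between the cursors, at
 * which point a new largest segment is selected.
 */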
1714
1715 static metaslab_ops_t metaslab_cf_ops = {
1716 metaslab_cf_alloc
1717 };
1718
1719 /*
1720 * ==========================================================================
1721 * New dynamic fit allocator -
1722 * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
1723 * contiguous blocks. If no region is found then just use the largest segment
1724 * that remains.
1725 * ==========================================================================
1726 */
1727
1728 /*
1729 * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
1730 * to request from the allocator.
1731 */
1732 uint64_t metaslab_ndf_clump_shift = 4;
1733
1734 static uint64_t
1735 metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
1736 {
1737 zfs_btree_t *t = &msp->ms_allocatable->rt_root;
1738 range_tree_t *rt = msp->ms_allocatable;
1739 zfs_btree_index_t where;
1740 range_seg_t *rs;
1741 range_seg_max_t rsearch;
1742 uint64_t hbit = highbit64(size);
1743 uint64_t *cursor = &msp->ms_lbas[hbit - 1];
1744 uint64_t max_size = metaslab_largest_allocatable(msp);
1745
1746 ASSERT(MUTEX_HELD(&msp->ms_lock));
1747
1748 if (max_size < size)
1749 return (-1ULL);
1750
1751 rs_set_start(&rsearch, rt, *cursor);
1752 rs_set_end(&rsearch, rt, *cursor + size);
1753
1754 rs = zfs_btree_find(t, &rsearch, &where);
1755 if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) {
1756 t = &msp->ms_allocatable_by_size;
1757
1758 rs_set_start(&rsearch, rt, 0);
1759 rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit +
1760 metaslab_ndf_clump_shift)));
1761
1762 rs = zfs_btree_find(t, &rsearch, &where);
1763 if (rs == NULL)
1764 rs = zfs_btree_next(t, &where, &where);
1765 ASSERT(rs != NULL);
1766 }
1767
1768 if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) {
1769 *cursor = rs_get_start(rs, rt) + size;
1770 return (rs_get_start(rs, rt));
1771 }
1772 return (-1ULL);
1773 }
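
/*
 * Example of the clump sizing above (assuming the default clump shift
 * of 4): an 8K request has hbit = highbit64(8192) = 14, so the fallback
 * search targets a segment of up to MIN(max_size, 1ULL << 18) = 256K,
 * enough to absorb a run of similar allocations from one region.
 */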
1774
1775 static metaslab_ops_t metaslab_ndf_ops = {
1776 metaslab_ndf_alloc
1777 };
1778
1779 metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
1780
1781 /*
1782 * ==========================================================================
1783 * Metaslabs
1784 * ==========================================================================
1785 */
1786
1787 /*
1788 * Wait for any in-progress metaslab loads to complete.
1789 */
1790 void
1791 metaslab_load_wait(metaslab_t *msp)
1792 {
1793 ASSERT(MUTEX_HELD(&msp->ms_lock));
1794
1795 while (msp->ms_loading) {
1796 ASSERT(!msp->ms_loaded);
1797 cv_wait(&msp->ms_load_cv, &msp->ms_lock);
1798 }
1799 }
1800
1801 /*
1802 * Wait for any in-progress flushing to complete.
1803 */
1804 void
1805 metaslab_flush_wait(metaslab_t *msp)
1806 {
1807 ASSERT(MUTEX_HELD(&msp->ms_lock));
1808
1809 while (msp->ms_flushing)
1810 cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
1811 }
1812
1813 static unsigned int
1814 metaslab_idx_func(multilist_t *ml, void *arg)
1815 {
1816 metaslab_t *msp = arg;
1817 return (msp->ms_id % multilist_get_num_sublists(ml));
1818 }
1819
1820 uint64_t
1821 metaslab_allocated_space(metaslab_t *msp)
1822 {
1823 return (msp->ms_allocated_space);
1824 }
1825
1826 /*
1827 * Verify that the space accounting on disk matches the in-core range_trees.
1828 */
1829 static void
1830 metaslab_verify_space(metaslab_t *msp, uint64_t txg)
1831 {
1832 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1833 uint64_t allocating = 0;
1834 uint64_t sm_free_space, msp_free_space;
1835
1836 ASSERT(MUTEX_HELD(&msp->ms_lock));
1837 ASSERT(!msp->ms_condensing);
1838
1839 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
1840 return;
1841
1842 /*
1843 * We can only verify the metaslab space when we're called
1844 * from syncing context with a loaded metaslab that has an
1845 * allocated space map. Calling this in non-syncing context
1846 * does not provide a consistent view of the metaslab since
1847 * we're performing allocations in the future.
1848 */
1849 if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL ||
1850 !msp->ms_loaded)
1851 return;
1852
1853 /*
1854 * Although the smp_alloc field can in general go negative,
1855 * that should never be the case for a metaslab's
1856 * space map.
1857 */
1858 ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0);
1859
1860 ASSERT3U(space_map_allocated(msp->ms_sm), >=,
1861 range_tree_space(msp->ms_unflushed_frees));
1862
1863 ASSERT3U(metaslab_allocated_space(msp), ==,
1864 space_map_allocated(msp->ms_sm) +
1865 range_tree_space(msp->ms_unflushed_allocs) -
1866 range_tree_space(msp->ms_unflushed_frees));
1867
1868 sm_free_space = msp->ms_size - metaslab_allocated_space(msp);
1869
1870 /*
1871 * Account for future allocations since we would have
1872 * already deducted that space from the ms_allocatable.
1873 */
1874 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
1875 allocating +=
1876 range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
1877 }
1878 ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
1879 msp->ms_allocating_total);
1880
1881 ASSERT3U(msp->ms_deferspace, ==,
1882 range_tree_space(msp->ms_defer[0]) +
1883 range_tree_space(msp->ms_defer[1]));
1884
1885 msp_free_space = range_tree_space(msp->ms_allocatable) + allocating +
1886 msp->ms_deferspace + range_tree_space(msp->ms_freed);
1887
1888 VERIFY3U(sm_free_space, ==, msp_free_space);
1889 }
1890
1891 static void
1892 metaslab_aux_histograms_clear(metaslab_t *msp)
1893 {
1894 /*
1895 * Auxiliary histograms are only cleared when resetting them,
1896 * which can only happen while the metaslab is loaded.
1897 */
1898 ASSERT(msp->ms_loaded);
1899
1900 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1901 for (int t = 0; t < TXG_DEFER_SIZE; t++)
1902 bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t]));
1903 }
1904
1905 static void
1906 metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift,
1907 range_tree_t *rt)
1908 {
1909 /*
1910 * This is modeled after space_map_histogram_add(), so refer to that
1911 * function for implementation details. We want this to work like
1912 * the space map histogram, and not the range tree histogram, as we
1913 * are essentially constructing a delta that will be later subtracted
1914 * from the space map histogram.
1915 */
1916 int idx = 0;
1917 for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
1918 ASSERT3U(i, >=, idx + shift);
1919 histogram[idx] += rt->rt_histogram[i] << (i - idx - shift);
1920
1921 if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
1922 ASSERT3U(idx + shift, ==, i);
1923 idx++;
1924 ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
1925 }
1926 }
1927 }
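
/*
 * Worked example for the downshifting above (hypothetical values): with
 * shift = 9, range tree bucket i = 12 (4K..8K segments) lands in
 * histogram[3]. Once idx reaches SPACE_MAP_HISTOGRAM_SIZE - 1 it stops
 * advancing and larger buckets are folded into it, scaled by
 * 2^(i - idx - shift), mirroring the saturation behavior of
 * space_map_histogram_add().
 */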
1928
1929 /*
1930 * Called at every sync pass that the metaslab gets synced.
1931 *
1932 * The reason is that we want our auxiliary histograms to be updated
1933 * wherever the metaslab's space map histogram is updated. This way
1934 * we stay consistent on which parts of the metaslab space map's
1935 * histogram are currently not available for allocations (e.g. because
1936 * they are in the defer, freed, and freeing trees).
1937 */
1938 static void
1939 metaslab_aux_histograms_update(metaslab_t *msp)
1940 {
1941 space_map_t *sm = msp->ms_sm;
1942 ASSERT(sm != NULL);
1943
1944 /*
1945 * This is similar to the metaslab's space map histogram updates
1946 * that take place in metaslab_sync(). The only difference is that
1947 * we only care about segments that haven't made it into the
1948 * ms_allocatable tree yet.
1949 */
1950 if (msp->ms_loaded) {
1951 metaslab_aux_histograms_clear(msp);
1952
1953 metaslab_aux_histogram_add(msp->ms_synchist,
1954 sm->sm_shift, msp->ms_freed);
1955
1956 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
1957 metaslab_aux_histogram_add(msp->ms_deferhist[t],
1958 sm->sm_shift, msp->ms_defer[t]);
1959 }
1960 }
1961
1962 metaslab_aux_histogram_add(msp->ms_synchist,
1963 sm->sm_shift, msp->ms_freeing);
1964 }
1965
1966 /*
1967 * Called every time we are done syncing (writing to) the metaslab,
1968 * i.e. at the end of each sync pass.
1969 * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist]
1970 */
1971 static void
1972 metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed)
1973 {
1974 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
1975 space_map_t *sm = msp->ms_sm;
1976
1977 if (sm == NULL) {
1978 /*
1979 * We came here from metaslab_init() when creating/opening a
1980 * pool, looking at a metaslab that hasn't had any allocations
1981 * yet.
1982 */
1983 return;
1984 }
1985
1986 /*
1987 * This is similar to the actions that we take for the ms_freed
1988 * and ms_defer trees in metaslab_sync_done().
1989 */
1990 uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE;
1991 if (defer_allowed) {
1992 bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index],
1993 sizeof (msp->ms_synchist));
1994 } else {
1995 bzero(msp->ms_deferhist[hist_index],
1996 sizeof (msp->ms_deferhist[hist_index]));
1997 }
1998 bzero(msp->ms_synchist, sizeof (msp->ms_synchist));
1999 }
2000
2001 /*
2002 * Ensure that the metaslab's weight and fragmentation are consistent
2003 * with the contents of the histogram (either the range tree's histogram
2004 * or the space map's depending whether the metaslab is loaded).
2005 */
2006 static void
2007 metaslab_verify_weight_and_frag(metaslab_t *msp)
2008 {
2009 ASSERT(MUTEX_HELD(&msp->ms_lock));
2010
2011 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
2012 return;
2013
2014 /*
2015 * We can end up here from vdev_remove_complete(), in which case we
2016 * cannot do these assertions because we hold spa config locks and
2017 * thus we are not allowed to read from the DMU.
2018 *
2019 * We check if the metaslab group has been removed and if that's
2020 * the case we return immediately as that would mean that we are
2021 * here from the aforementioned code path.
2022 */
2023 if (msp->ms_group == NULL)
2024 return;
2025
2026 /*
2027 * Devices being removed always return a weight of 0 and leave
2028 * fragmentation and ms_max_size as is - there is nothing for
2029 * us to verify here.
2030 */
2031 vdev_t *vd = msp->ms_group->mg_vd;
2032 if (vd->vdev_removing)
2033 return;
2034
2035 /*
2036 * If the metaslab is dirty it probably means that we've done
2037 * some allocations or frees that have changed our histograms
2038 * and thus the weight.
2039 */
2040 for (int t = 0; t < TXG_SIZE; t++) {
2041 if (txg_list_member(&vd->vdev_ms_list, msp, t))
2042 return;
2043 }
2044
2045 /*
2046 * This verification checks that our in-memory state is consistent
2047 * with what's on disk. If the pool is read-only then there aren't
2048 * any changes and we just have the initially-loaded state.
2049 */
2050 if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa))
2051 return;
2052
2053 /* Some extra verification on the in-core tree, when possible. */
2054 if (msp->ms_loaded) {
2055 range_tree_stat_verify(msp->ms_allocatable);
2056 VERIFY(space_map_histogram_verify(msp->ms_sm,
2057 msp->ms_allocatable));
2058 }
2059
2060 uint64_t weight = msp->ms_weight;
2061 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
2062 boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight);
2063 uint64_t frag = msp->ms_fragmentation;
2064 uint64_t max_segsize = msp->ms_max_size;
2065
2066 msp->ms_weight = 0;
2067 msp->ms_fragmentation = 0;
2068
2069 /*
2070 * This function is used for verification purposes. Regardless of
2071 * whether metaslab_weight() thinks this metaslab should be active or
2072 * not, we want to ensure that the actual weight (and therefore the
2073 * value of ms_weight) would be the same if it was to be recalculated
2074 * at this point.
2075 */
2076 msp->ms_weight = metaslab_weight(msp) | was_active;
2077
2078 VERIFY3U(max_segsize, ==, msp->ms_max_size);
2079
2080 /*
2081 * If the weight type changed then there is no point in doing
2082 * verification. Revert fields to their original values.
2083 */
2084 if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) ||
2085 (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) {
2086 msp->ms_fragmentation = frag;
2087 msp->ms_weight = weight;
2088 return;
2089 }
2090
2091 VERIFY3U(msp->ms_fragmentation, ==, frag);
2092 VERIFY3U(msp->ms_weight, ==, weight);
2093 }
2094
2095 /*
2096 * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
2097 * this class that was used longest ago, and attempt to unload it. To avoid
2098 * degrading performance we don't want to spend too much time in this
2099 * loop, and we expect that most of the time this operation will
2100 * succeed. Between that and the normal unloading process during txg sync,
2101 * we expect this to keep the metaslab memory usage under control.
2102 */
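/*
 * A sketch of the limit arithmetic used below, with hypothetical numbers:
 * if arc_all_memory() is 16GB and zfs_metaslab_mem_limit is 25, eviction
 * proceeds while the btree leaf cache (buf_inuse * buf_size) exceeds
 * 16GB * 25 / 100 = 4GB.
 */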
2103 static void
2104 metaslab_potentially_evict(metaslab_class_t *mc)
2105 {
2106 #ifdef _KERNEL
2107 uint64_t allmem = arc_all_memory();
2108 extern kmem_cache_t *zfs_btree_leaf_cache;
2109 uint64_t inuse = kmem_cache_stat(zfs_btree_leaf_cache, "buf_inuse");
2110 uint64_t size = kmem_cache_stat(zfs_btree_leaf_cache, "buf_size");
2111 int tries = 0;
2112 for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
2113 tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
2114 tries++) {
2115 unsigned int idx = multilist_get_random_index(
2116 mc->mc_metaslab_txg_list);
2117 multilist_sublist_t *mls =
2118 multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
2119 metaslab_t *msp = multilist_sublist_head(mls);
2120 multilist_sublist_unlock(mls);
2121 while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
2122 inuse * size) {
2123 VERIFY3P(mls, ==, multilist_sublist_lock(
2124 mc->mc_metaslab_txg_list, idx));
2125 ASSERT3U(idx, ==,
2126 metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
2127
2128 if (!multilist_link_active(&msp->ms_class_txg_node)) {
2129 multilist_sublist_unlock(mls);
2130 break;
2131 }
2132 metaslab_t *next_msp = multilist_sublist_next(mls, msp);
2133 multilist_sublist_unlock(mls);
2134 /*
2135 * If the metaslab is currently loading there are two
2136 * cases. If it's the metaslab we're evicting, we
2137 * can't continue on or we'll panic when we attempt to
2138 * recursively lock the mutex. If it's another
2139 * metaslab that's loading, it can be safely skipped,
2140 * since we know it's very new and therefore not a
2141 * good eviction candidate. We check later once the
2142 * lock is held that the metaslab is fully loaded
2143 * before actually unloading it.
2144 */
2145 if (msp->ms_loading) {
2146 msp = next_msp;
2147 inuse = kmem_cache_stat(zfs_btree_leaf_cache,
2148 "buf_inuse");
2149 continue;
2150 }
2151 /*
2152 * We can't unload metaslabs with no spacemap because
2153 * they're not ready to be unloaded yet. We can't
2154 * unload metaslabs with outstanding allocations
2155 * because doing so could cause the metaslab's weight
2156 * to decrease while it's unloaded, which violates an
2157 * invariant that we use to prevent unnecessary
2158 * loading. We also don't unload metaslabs that are
2159 * currently active because they are high-weight
2160 * metaslabs that are likely to be used in the near
2161 * future.
2162 */
2163 mutex_enter(&msp->ms_lock);
2164 if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
2165 msp->ms_allocating_total == 0) {
2166 metaslab_unload(msp);
2167 }
2168 mutex_exit(&msp->ms_lock);
2169 msp = next_msp;
2170 inuse = kmem_cache_stat(zfs_btree_leaf_cache,
2171 "buf_inuse");
2172 }
2173 }
2174 #endif
2175 }
2176
2177 static int
2178 metaslab_load_impl(metaslab_t *msp)
2179 {
2180 int error = 0;
2181
2182 ASSERT(MUTEX_HELD(&msp->ms_lock));
2183 ASSERT(msp->ms_loading);
2184 ASSERT(!msp->ms_condensing);
2185
2186 /*
2187 * We temporarily drop the lock to unblock other operations while we
2188 * are reading the space map. Therefore, metaslab_sync() and
2189 * metaslab_sync_done() can run at the same time as we do.
2190 *
2191 * If we are using the log space maps, metaslab_sync() can't write to
2192 * the metaslab's space map while we are loading as we only write to
2193 * it when we are flushing the metaslab, and that can't happen while
2194 * we are loading it.
2195 *
2196 * If we are not using log space maps though, metaslab_sync() can
2197 * append to the space map while we are loading. Therefore we load
2198 * only entries that existed when we started the load. Additionally,
2199 * metaslab_sync_done() has to wait for the load to complete because
2200 * there are potential races like metaslab_load() loading parts of the
2201 * space map that are currently being appended by metaslab_sync(). If
2202 * we didn't, the ms_allocatable would have entries that
2203 * metaslab_sync_done() would try to re-add later.
2204 *
2205 * That's why before dropping the lock we remember the synced length
2206 * of the metaslab and read up to that point of the space map,
2207 * ignoring entries appended by metaslab_sync() that happen after we
2208 * drop the lock.
2209 */
2210 uint64_t length = msp->ms_synced_length;
2211 mutex_exit(&msp->ms_lock);
2212
2213 hrtime_t load_start = gethrtime();
2214 metaslab_rt_arg_t *mrap;
2215 if (msp->ms_allocatable->rt_arg == NULL) {
2216 mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
2217 } else {
2218 mrap = msp->ms_allocatable->rt_arg;
2219 msp->ms_allocatable->rt_ops = NULL;
2220 msp->ms_allocatable->rt_arg = NULL;
2221 }
2222 mrap->mra_bt = &msp->ms_allocatable_by_size;
2223 mrap->mra_floor_shift = metaslab_by_size_min_shift;
2224
2225 if (msp->ms_sm != NULL) {
2226 error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
2227 SM_FREE, length);
2228
2229 /* Now, populate the size-sorted tree. */
2230 metaslab_rt_create(msp->ms_allocatable, mrap);
2231 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2232 msp->ms_allocatable->rt_arg = mrap;
2233
2234 struct mssa_arg arg = {0};
2235 arg.rt = msp->ms_allocatable;
2236 arg.mra = mrap;
2237 range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
2238 &arg);
2239 } else {
2240 /*
2241 * Add the size-sorted tree first, since we don't need to load
2242 * the metaslab from the spacemap.
2243 */
2244 metaslab_rt_create(msp->ms_allocatable, mrap);
2245 msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
2246 msp->ms_allocatable->rt_arg = mrap;
2247 /*
2248 * The space map has not been allocated yet, so treat
2249 * all the space in the metaslab as free and add it to the
2250 * ms_allocatable tree.
2251 */
2252 range_tree_add(msp->ms_allocatable,
2253 msp->ms_start, msp->ms_size);
2254
2255 if (msp->ms_freed != NULL) {
2256 /*
2257 * If the ms_sm doesn't exist, this means that this
2258 * metaslab hasn't gone through metaslab_sync() and
2259 * thus has never been dirtied. So we shouldn't
2260 * expect any unflushed allocs or frees from previous
2261 * TXGs.
2262 *
2263 * Note: ms_freed and all the other trees except for
2264 * the ms_allocatable, can be NULL at this point only
2265 * if this is a new metaslab of a vdev that just got
2266 * expanded.
2267 */
2268 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
2269 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
2270 }
2271 }
2272
2273 /*
2274 * We need to grab the ms_sync_lock to prevent metaslab_sync() from
2275 * changing the ms_sm (or log_sm) and the metaslab's range trees
2276 * while we are about to use them and populate the ms_allocatable.
2277 * The ms_lock is insufficient for this because metaslab_sync() doesn't
2278 * hold the ms_lock while writing the ms_checkpointing tree to disk.
2279 */
2280 mutex_enter(&msp->ms_sync_lock);
2281 mutex_enter(&msp->ms_lock);
2282
2283 ASSERT(!msp->ms_condensing);
2284 ASSERT(!msp->ms_flushing);
2285
2286 if (error != 0) {
2287 mutex_exit(&msp->ms_sync_lock);
2288 return (error);
2289 }
2290
2291 ASSERT3P(msp->ms_group, !=, NULL);
2292 msp->ms_loaded = B_TRUE;
2293
2294 /*
2295 * Apply all the unflushed changes to ms_allocatable right
2296 * away so any manipulations we do below have a clear view
2297 * of what is allocated and what is free.
2298 */
2299 range_tree_walk(msp->ms_unflushed_allocs,
2300 range_tree_remove, msp->ms_allocatable);
2301 range_tree_walk(msp->ms_unflushed_frees,
2302 range_tree_add, msp->ms_allocatable);
2303
2307 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2308 if (spa_syncing_log_sm(spa) != NULL) {
2309 ASSERT(spa_feature_is_enabled(spa,
2310 SPA_FEATURE_LOG_SPACEMAP));
2311
2312 /*
2313 * If we use a log space map we add all the segments
2314 * that are in ms_unflushed_frees so they are available
2315 * for allocation.
2316 *
2317 * ms_allocatable needs to contain all free segments
2318 * that are ready for allocations (thus not segments
2319 * from ms_freeing, ms_freed, and the ms_defer trees).
2320 * But if we grab the lock in this code path at a sync
2321 * pass later than 1, then it also contains the
2322 * segments of ms_freed (they were added to it earlier
2323 * in this path through ms_unflushed_frees). So we
2324 * need to remove all the segments that exist in
2325 * ms_freed from ms_allocatable as they will be added
2326 * later in metaslab_sync_done().
2327 *
2328 * When there's no log space map, the ms_allocatable
2329 * correctly doesn't contain any segments that exist
2330 * in ms_freed [see ms_synced_length].
2331 */
2332 range_tree_walk(msp->ms_freed,
2333 range_tree_remove, msp->ms_allocatable);
2334 }
2335
2336 /*
2337 * If we are not using the log space map, ms_allocatable
2338 * contains the segments that exist in the ms_defer trees
2339 * [see ms_synced_length]. Thus we need to remove them
2340 * from ms_allocatable as they will be added again in
2341 * metaslab_sync_done().
2342 *
2343 * If we are using the log space map, ms_allocatable still
2344 * contains the segments that exist in the ms_defer trees.
2345 * Not because it read them through the ms_sm though. But
2346 * because these segments are part of ms_unflushed_frees
2347 * whose segments we add to ms_allocatable earlier in this
2348 * code path.
2349 */
2350 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2351 range_tree_walk(msp->ms_defer[t],
2352 range_tree_remove, msp->ms_allocatable);
2353 }
2354
2355 /*
2356 * Call metaslab_recalculate_weight_and_sort() now that the
2357 * metaslab is loaded so we get the metaslab's real weight.
2358 *
2359 * Unless this metaslab was created with older software and
2360 * has not yet been converted to use segment-based weight, we
2361 * expect the new weight to be better or equal to the weight
2362 * that the metaslab had while it was not loaded. This is
2363 * because the old weight does not take into account the
2364 * consolidation of adjacent segments between TXGs. [see
2365 * comment for ms_synchist and ms_deferhist[] for more info]
2366 */
2367 uint64_t weight = msp->ms_weight;
2368 uint64_t max_size = msp->ms_max_size;
2369 metaslab_recalculate_weight_and_sort(msp);
2370 if (!WEIGHT_IS_SPACEBASED(weight))
2371 ASSERT3U(weight, <=, msp->ms_weight);
2372 msp->ms_max_size = metaslab_largest_allocatable(msp);
2373 ASSERT3U(max_size, <=, msp->ms_max_size);
2374 hrtime_t load_end = gethrtime();
2375 msp->ms_load_time = load_end;
2376 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
2377 zfs_dbgmsg("loading: txg %llu, spa %s, vdev_id %llu, "
2378 "ms_id %llu, smp_length %llu, "
2379 "unflushed_allocs %llu, unflushed_frees %llu, "
2380 "freed %llu, defer %llu + %llu, "
2381 "loading_time %lld ms, ms_max_size %llu, "
2382 "max size error %llu",
2383 spa_syncing_txg(spa), spa_name(spa),
2384 msp->ms_group->mg_vd->vdev_id, msp->ms_id,
2385 space_map_length(msp->ms_sm),
2386 range_tree_space(msp->ms_unflushed_allocs),
2387 range_tree_space(msp->ms_unflushed_frees),
2388 range_tree_space(msp->ms_freed),
2389 range_tree_space(msp->ms_defer[0]),
2390 range_tree_space(msp->ms_defer[1]),
2391 (longlong_t)((load_end - load_start) / 1000000),
2392 msp->ms_max_size, msp->ms_max_size - max_size);
2393 }
2394
2395 metaslab_verify_space(msp, spa_syncing_txg(spa));
2396 mutex_exit(&msp->ms_sync_lock);
2397 return (0);
2398 }
2399
2400 int
2401 metaslab_load(metaslab_t *msp)
2402 {
2403 ASSERT(MUTEX_HELD(&msp->ms_lock));
2404
2405 /*
2406 * There may be another thread loading the same metaslab, if that's
2407 * the case just wait until the other thread is done and return.
2408 */
2409 metaslab_load_wait(msp);
2410 if (msp->ms_loaded)
2411 return (0);
2412 VERIFY(!msp->ms_loading);
2413 ASSERT(!msp->ms_condensing);
2414
2415 /*
2416 * We set the loading flag BEFORE potentially dropping the lock to
2417 * wait for an ongoing flush (see ms_flushing below). This way other
2418 * threads know that there is already a thread that is loading this
2419 * metaslab.
2420 */
2421 msp->ms_loading = B_TRUE;
2422
2423 /*
2424 * Wait for any in-progress flushing to finish as we drop the ms_lock
2425 * both here (during space_map_load()) and in metaslab_flush() (when
2426 * we flush our changes to the ms_sm).
2427 */
2428 if (msp->ms_flushing)
2429 metaslab_flush_wait(msp);
2430
2431 /*
2432 * Given the possibility that we were waiting for the metaslab to be
2433 * flushed (where we temporarily dropped the ms_lock), ensure that
2434 * no one else loaded the metaslab somehow.
2435 */
2436 ASSERT(!msp->ms_loaded);
2437
2438 /*
2439 * If we're loading a metaslab in the normal class, consider evicting
2440 * another one to keep our memory usage under the limit defined by the
2441 * zfs_metaslab_mem_limit tunable.
2442 */
2443 if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
2444 msp->ms_group->mg_class) {
2445 metaslab_potentially_evict(msp->ms_group->mg_class);
2446 }
2447
2448 int error = metaslab_load_impl(msp);
2449
2450 ASSERT(MUTEX_HELD(&msp->ms_lock));
2451 msp->ms_loading = B_FALSE;
2452 cv_broadcast(&msp->ms_load_cv);
2453
2454 return (error);
2455 }
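
/*
 * Illustrative caller pattern (sketch, not compiled): loading is always
 * done with the metaslab lock held, and a nonzero return means the space
 * map could not be read.
 */
#if 0
	mutex_enter(&msp->ms_lock);
	error = metaslab_load(msp);
	if (error == 0) {
		/* ms_allocatable now reflects the on-disk free space. */
	}
	mutex_exit(&msp->ms_lock);
#endif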
2456
2457 void
2458 metaslab_unload(metaslab_t *msp)
2459 {
2460 ASSERT(MUTEX_HELD(&msp->ms_lock));
2461
2462 /*
2463 * This can happen if a metaslab is selected for eviction (in
2464 * metaslab_potentially_evict) and then unloaded during spa_sync (via
2465 * metaslab_class_evict_old).
2466 */
2467 if (!msp->ms_loaded)
2468 return;
2469
2470 range_tree_vacate(msp->ms_allocatable, NULL, NULL);
2471 msp->ms_loaded = B_FALSE;
2472 msp->ms_unload_time = gethrtime();
2473
2474 msp->ms_activation_weight = 0;
2475 msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
2476
2477 if (msp->ms_group != NULL) {
2478 metaslab_class_t *mc = msp->ms_group->mg_class;
2479 multilist_sublist_t *mls =
2480 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2481 if (multilist_link_active(&msp->ms_class_txg_node))
2482 multilist_sublist_remove(mls, msp);
2483 multilist_sublist_unlock(mls);
2484 }
2485
2486 /*
2487 * We explicitly recalculate the metaslab's weight based on its space
2488 * map (as it is now not loaded). We want unloaded metaslabs to always
2489 * have their weights calculated from the space map histograms, while
2490 * loaded ones have it calculated from their in-core range tree
2491 * [see metaslab_load()]. This way, the weight reflects the information
2492 * available in-core, whether it is loaded or not.
2493 *
2494 * If ms_group == NULL, we came here from metaslab_fini(),
2495 * at which point it doesn't make sense for us to do the recalculation
2496 * and the sorting.
2497 */
2498 if (msp->ms_group != NULL)
2499 metaslab_recalculate_weight_and_sort(msp);
2500 }
2501
2502 /*
2503 * We want to optimize the memory use of the per-metaslab range
2504 * trees. To do this, we store the segments in the range trees in
2505 * units of sectors, zero-indexing from the start of the metaslab. If
2506 * vdev_ms_shift - vdev_ashift is less than 32, we can store
2507 * the ranges using two uint32_ts, rather than two uint64_ts.
2508 */
2509 static range_seg_type_t
2510 metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
2511 uint64_t *start, uint64_t *shift)
2512 {
2513 if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
2514 !zfs_metaslab_force_large_segs) {
2515 *shift = vdev->vdev_ashift;
2516 *start = msp->ms_start;
2517 return (RANGE_SEG32);
2518 } else {
2519 *shift = 0;
2520 *start = 0;
2521 return (RANGE_SEG64);
2522 }
2523 }
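
/*
 * Example (hypothetical geometry): with vdev_ms_shift = 34 (16GB
 * metaslabs) and vdev_ashift = 9, the difference is 25 < 32, so
 * RANGE_SEG32 is chosen and each segment boundary is stored as a 32-bit
 * count of 512-byte sectors relative to ms_start.
 */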
2524
2525 void
2526 metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
2527 {
2528 ASSERT(MUTEX_HELD(&msp->ms_lock));
2529 metaslab_class_t *mc = msp->ms_group->mg_class;
2530 multilist_sublist_t *mls =
2531 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
2532 if (multilist_link_active(&msp->ms_class_txg_node))
2533 multilist_sublist_remove(mls, msp);
2534 msp->ms_selected_txg = txg;
2535 msp->ms_selected_time = gethrtime();
2536 multilist_sublist_insert_tail(mls, msp);
2537 multilist_sublist_unlock(mls);
2538 }
2539
2540 void
2541 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
2542 int64_t defer_delta, int64_t space_delta)
2543 {
2544 vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
2545
2546 ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
2547 ASSERT(vd->vdev_ms_count != 0);
2548
2549 metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
2550 vdev_deflated_space(vd, space_delta));
2551 }
2552
2553 int
2554 metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
2555 uint64_t txg, metaslab_t **msp)
2556 {
2557 vdev_t *vd = mg->mg_vd;
2558 spa_t *spa = vd->vdev_spa;
2559 objset_t *mos = spa->spa_meta_objset;
2560 metaslab_t *ms;
2561 int error;
2562
2563 ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
2564 mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
2565 mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
2566 cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
2567 cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
2568 multilist_link_init(&ms->ms_class_txg_node);
2569
2570 ms->ms_id = id;
2571 ms->ms_start = id << vd->vdev_ms_shift;
2572 ms->ms_size = 1ULL << vd->vdev_ms_shift;
2573 ms->ms_allocator = -1;
2574 ms->ms_new = B_TRUE;
2575
2576 /*
2577 * We only open space map objects that already exist. All others
2578 * will be opened when we finally allocate an object for it.
2579 *
2580 * Note:
2581 * When called from vdev_expand(), we can't call into the DMU as
2582 * we are holding the spa_config_lock as a writer and we would
2583 * deadlock [see relevant comment in vdev_metaslab_init()]. In
2584 * that case, the object parameter is zero though, so we won't
2585 * call into the DMU.
2586 */
2587 if (object != 0) {
2588 error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
2589 ms->ms_size, vd->vdev_ashift);
2590
2591 if (error != 0) {
2592 kmem_free(ms, sizeof (metaslab_t));
2593 return (error);
2594 }
2595
2596 ASSERT(ms->ms_sm != NULL);
2597 ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0);
2598 ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
2599 }
2600
2601 range_seg_type_t type;
2602 uint64_t shift, start;
2603 type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
2604
2605 /*
2606 * We create the ms_allocatable here, but we don't create the
2607 * other range trees until metaslab_sync_done(). This serves
2608 * two purposes: it allows metaslab_sync_done() to detect the
2609 * addition of new space; and for debugging, it ensures that
2610 * we'd data fault on any attempt to use this metaslab before
2611 * it's ready.
2612 */
2613 ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
2614
2615 ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
2616
2617 metaslab_group_add(mg, ms);
2618 metaslab_set_fragmentation(ms);
2619
2620 /*
2621 * If we're opening an existing pool (txg == 0) or creating
2622 * a new one (txg == TXG_INITIAL), all space is available now.
2623 * If we're adding space to an existing pool, the new space
2624 * does not become available until after this txg has synced.
2625 * The metaslab's weight will also be initialized when we sync
2626 * out this txg. This ensures that we don't attempt to allocate
2627 * from it before we have initialized it completely.
2628 */
2629 if (txg <= TXG_INITIAL) {
2630 metaslab_sync_done(ms, 0);
2631 metaslab_space_update(vd, mg->mg_class,
2632 metaslab_allocated_space(ms), 0, 0);
2633 }
2634
2635 if (txg != 0) {
2636 vdev_dirty(vd, 0, NULL, txg);
2637 vdev_dirty(vd, VDD_METASLAB, ms, txg);
2638 }
2639
2640 *msp = ms;
2641
2642 return (0);
2643 }
2644
2645 static void
2646 metaslab_fini_flush_data(metaslab_t *msp)
2647 {
2648 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2649
2650 if (metaslab_unflushed_txg(msp) == 0) {
2651 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL),
2652 ==, NULL);
2653 return;
2654 }
2655 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
2656
2657 mutex_enter(&spa->spa_flushed_ms_lock);
2658 avl_remove(&spa->spa_metaslabs_by_flushed, msp);
2659 mutex_exit(&spa->spa_flushed_ms_lock);
2660
2661 spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2662 spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp));
2663 }
2664
2665 uint64_t
2666 metaslab_unflushed_changes_memused(metaslab_t *ms)
2667 {
2668 return ((range_tree_numsegs(ms->ms_unflushed_allocs) +
2669 range_tree_numsegs(ms->ms_unflushed_frees)) *
2670 ms->ms_unflushed_allocs->rt_root.bt_elem_size);
2671 }
2672
2673 void
2674 metaslab_fini(metaslab_t *msp)
2675 {
2676 metaslab_group_t *mg = msp->ms_group;
2677 vdev_t *vd = mg->mg_vd;
2678 spa_t *spa = vd->vdev_spa;
2679
2680 metaslab_fini_flush_data(msp);
2681
2682 metaslab_group_remove(mg, msp);
2683
2684 mutex_enter(&msp->ms_lock);
2685 VERIFY(msp->ms_group == NULL);
2686 metaslab_space_update(vd, mg->mg_class,
2687 -metaslab_allocated_space(msp), 0, -msp->ms_size);
2688
2689 space_map_close(msp->ms_sm);
2690 msp->ms_sm = NULL;
2691
2692 metaslab_unload(msp);
2693 range_tree_destroy(msp->ms_allocatable);
2694 range_tree_destroy(msp->ms_freeing);
2695 range_tree_destroy(msp->ms_freed);
2696
2697 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
2698 metaslab_unflushed_changes_memused(msp));
2699 spa->spa_unflushed_stats.sus_memused -=
2700 metaslab_unflushed_changes_memused(msp);
2701 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
2702 range_tree_destroy(msp->ms_unflushed_allocs);
2703 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
2704 range_tree_destroy(msp->ms_unflushed_frees);
2705
2706 for (int t = 0; t < TXG_SIZE; t++) {
2707 range_tree_destroy(msp->ms_allocating[t]);
2708 }
2709
2710 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2711 range_tree_destroy(msp->ms_defer[t]);
2712 }
2713 ASSERT0(msp->ms_deferspace);
2714
2715 range_tree_destroy(msp->ms_checkpointing);
2716
2717 for (int t = 0; t < TXG_SIZE; t++)
2718 ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t));
2719
2720 range_tree_vacate(msp->ms_trim, NULL, NULL);
2721 range_tree_destroy(msp->ms_trim);
2722
2723 mutex_exit(&msp->ms_lock);
2724 cv_destroy(&msp->ms_load_cv);
2725 cv_destroy(&msp->ms_flush_cv);
2726 mutex_destroy(&msp->ms_lock);
2727 mutex_destroy(&msp->ms_sync_lock);
2728 ASSERT3U(msp->ms_allocator, ==, -1);
2729
2730 kmem_free(msp, sizeof (metaslab_t));
2731 }
2732
2733 #define FRAGMENTATION_TABLE_SIZE 17
2734
2735 /*
2736 * This table defines a segment size based fragmentation metric that will
2737 * allow each metaslab to derive its own fragmentation value. This is done
2738 * by calculating the space in each bucket of the spacemap histogram and
2739 * multiplying that by the fragmentation metric in this table. Doing
2740 * this for all buckets and dividing it by the total amount of free
2741 * space in this metaslab (i.e. the total free space in all buckets) gives
2742 * us the fragmentation metric. This means that a high fragmentation metric
2743 * equates to most of the free space being composed of small segments.
2744 * Conversely, if the metric is low, then most of the free space is in
2745 * large segments. A 10% change in fragmentation equates to approximately
2746 * double the number of segments.
2747 *
2748 * This table defines 0% fragmented space using 16MB segments. Testing has
2749 * shown that segments that are greater than or equal to 16MB do not suffer
2750 * from drastic performance problems. Using this value, we derive the rest
2751 * of the table. Since the fragmentation value is never stored on disk, it
2752 * is possible to change these calculations in the future.
2753 */
2754 int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
2755 100, /* 512B */
2756 100, /* 1K */
2757 98, /* 2K */
2758 95, /* 4K */
2759 90, /* 8K */
2760 80, /* 16K */
2761 70, /* 32K */
2762 60, /* 64K */
2763 50, /* 128K */
2764 40, /* 256K */
2765 30, /* 512K */
2766 20, /* 1M */
2767 15, /* 2M */
2768 10, /* 4M */
2769 5, /* 8M */
2770 0 /* 16M */
2771 };
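
/*
 * Worked example of the metric (hypothetical metaslab): if half of the
 * free space sits in 512K segments (factor 30) and half in 8M segments
 * (factor 5), the weighted sum divided by the total free space gives a
 * fragmentation of (30 + 5) / 2 = 17 in the integer math of
 * metaslab_set_fragmentation() below.
 */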
2772
2773 /*
2774 * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
2775 * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
2776 * been upgraded and does not support this metric. Otherwise, the value
2777 * set here should be in the range [0, 100].
2778 */
2779 static void
2780 metaslab_set_fragmentation(metaslab_t *msp)
2781 {
2782 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
2783 uint64_t fragmentation = 0;
2784 uint64_t total = 0;
2785 boolean_t feature_enabled = spa_feature_is_enabled(spa,
2786 SPA_FEATURE_SPACEMAP_HISTOGRAM);
2787
2788 if (!feature_enabled) {
2789 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2790 return;
2791 }
2792
2793 /*
2794 * A null space map means that the entire metaslab is free
2795 * and thus is not fragmented.
2796 */
2797 if (msp->ms_sm == NULL) {
2798 msp->ms_fragmentation = 0;
2799 return;
2800 }
2801
2802 /*
2803 * If this metaslab's space map has not been upgraded, flag it
2804 * so that we upgrade next time we encounter it.
2805 */
2806 if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
2807 uint64_t txg = spa_syncing_txg(spa);
2808 vdev_t *vd = msp->ms_group->mg_vd;
2809
2810 /*
2811 * If we've reached the final dirty txg, then we must
2812 * be shutting down the pool. We don't want to dirty
2813 * any data past this point so skip setting the condense
2814 * flag. We can retry this action the next time the pool
2815 * is imported.
2816 */
2817 if (spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
2818 msp->ms_condense_wanted = B_TRUE;
2819 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
2820 zfs_dbgmsg("txg %llu, requesting force condense: "
2821 "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
2822 vd->vdev_id);
2823 }
2824 msp->ms_fragmentation = ZFS_FRAG_INVALID;
2825 return;
2826 }
2827
2828 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2829 uint64_t space = 0;
2830 uint8_t shift = msp->ms_sm->sm_shift;
2831
2832 int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
2833 FRAGMENTATION_TABLE_SIZE - 1);
2834
2835 if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
2836 continue;
2837
2838 space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
2839 total += space;
2840
2841 ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
2842 fragmentation += space * zfs_frag_table[idx];
2843 }
2844
2845 if (total > 0)
2846 fragmentation /= total;
2847 ASSERT3U(fragmentation, <=, 100);
2848
2849 msp->ms_fragmentation = fragmentation;
2850 }
2851
2852 /*
2853 * Compute a weight -- a selection preference value -- for the given metaslab.
2854 * This is based on the amount of free space, the level of fragmentation,
2855 * the LBA range, and whether the metaslab is loaded.
2856 */
2857 static uint64_t
2858 metaslab_space_weight(metaslab_t *msp)
2859 {
2860 metaslab_group_t *mg = msp->ms_group;
2861 vdev_t *vd = mg->mg_vd;
2862 uint64_t weight, space;
2863
2864 ASSERT(MUTEX_HELD(&msp->ms_lock));
2865
2866 /*
2867 * The baseline weight is the metaslab's free space.
2868 */
2869 space = msp->ms_size - metaslab_allocated_space(msp);
2870
2871 if (metaslab_fragmentation_factor_enabled &&
2872 msp->ms_fragmentation != ZFS_FRAG_INVALID) {
2873 /*
2874 * Use the fragmentation information to inversely scale
2875 * down the baseline weight. We need to ensure that we
2876 * don't exclude this metaslab completely when it's 100%
2877 * fragmented. To avoid this we reduce the fragmented value
2878 * by 1.
2879 */
2880 space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
2881
2882 /*
2883 * If space < SPA_MINBLOCKSIZE, then we will not allocate from
2884 * this metaslab again. The fragmentation metric may have
2885 * decreased the space to something smaller than
2886 * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
2887 * so that we can consume any remaining space.
2888 */
2889 if (space > 0 && space < SPA_MINBLOCKSIZE)
2890 space = SPA_MINBLOCKSIZE;
2891 }
2892 weight = space;
2893
2894 /*
2895 * Modern disks have uniform bit density and constant angular velocity.
2896 * Therefore, the outer recording zones are faster (higher bandwidth)
2897 * than the inner zones by the ratio of outer to inner track diameter,
2898 * which is typically around 2:1. We account for this by assigning
2899 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
2900 * In effect, this means that we'll select the metaslab with the most
2901 * free bandwidth rather than simply the one with the most free space.
2902 */
2903 if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
2904 weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
2905 ASSERT(weight >= space && weight <= 2 * space);
2906 }
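/*
 * For example, on a vdev with 128 metaslabs, metaslab 0 keeps
 * weight = 2 * space while metaslab 127 ends up just above
 * weight = space, linearly interpolating the outer-to-inner
 * bandwidth ratio.
 */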
2907
2908 /*
2909 * If this metaslab is one we're actively using, adjust its
2910 * weight to make it preferable to any inactive metaslab so
2911 * we'll polish it off. If the fragmentation on this metaslab
2912 * has exceeded our threshold, then don't mark it active.
2913 */
2914 if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
2915 msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
2916 weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
2917 }
2918
2919 WEIGHT_SET_SPACEBASED(weight);
2920 return (weight);
2921 }
2922
2923 /*
2924 * Return the weight of the specified metaslab, according to the segment-based
2925 * weighting algorithm. The metaslab must be loaded. This function can
2926 * be called within a sync pass since it relies only on the metaslab's
2927 * range tree which is always accurate when the metaslab is loaded.
2928 */
2929 static uint64_t
2930 metaslab_weight_from_range_tree(metaslab_t *msp)
2931 {
2932 uint64_t weight = 0;
2933 uint32_t segments = 0;
2934
2935 ASSERT(msp->ms_loaded);
2936
2937 for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
2938 i--) {
2939 uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
2940 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
2941
2942 segments <<= 1;
2943 segments += msp->ms_allocatable->rt_histogram[i];
2944
2945 /*
2946 * The range tree provides more precision than the space map
2947 * and must be downgraded so that all values fit within the
2948 * space map's histogram. This allows us to compare loaded
2949 * vs. unloaded metaslabs to determine which metaslab is
2950 * considered "best".
2951 */
2952 if (i > max_idx)
2953 continue;
2954
2955 if (segments != 0) {
2956 WEIGHT_SET_COUNT(weight, segments);
2957 WEIGHT_SET_INDEX(weight, i);
2958 WEIGHT_SET_ACTIVE(weight, 0);
2959 break;
2960 }
2961 }
2962 return (weight);
2963 }
2964
2965 /*
2966 * Calculate the weight based on the on-disk histogram. Should be applied
2967 * only to unloaded metaslabs (i.e. no incoming allocations) in order to
2968 * give results consistent with the on-disk state.
2969 */
2970 static uint64_t
2971 metaslab_weight_from_spacemap(metaslab_t *msp)
2972 {
2973 space_map_t *sm = msp->ms_sm;
2974 ASSERT(!msp->ms_loaded);
2975 ASSERT(sm != NULL);
2976 ASSERT3U(space_map_object(sm), !=, 0);
2977 ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
2978
2979 /*
2980 * Create a joint histogram from all the segments that have made
2981 * it to the metaslab's space map histogram, that are not yet
2982 * available for allocation because they are still in the freeing
2983 * pipeline (e.g. freeing, freed, and defer trees). Then subtract
2984 * these segments from the space map's histogram to get a more
2985 * accurate weight.
2986 */
2987 uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
2988 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
2989 deferspace_histogram[i] += msp->ms_synchist[i];
2990 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
2991 for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
2992 deferspace_histogram[i] += msp->ms_deferhist[t][i];
2993 }
2994 }
2995
2996 uint64_t weight = 0;
2997 for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
2998 ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
2999 deferspace_histogram[i]);
3000 uint64_t count =
3001 sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
3002 if (count != 0) {
3003 WEIGHT_SET_COUNT(weight, count);
3004 WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
3005 WEIGHT_SET_ACTIVE(weight, 0);
3006 break;
3007 }
3008 }
3009 return (weight);
3010 }
3011
3012 /*
3013 * Compute a segment-based weight for the specified metaslab. The weight
3014 * is determined by the highest bucket in the histogram. The information
3015 * for the highest bucket is encoded into the weight value.
3016 */
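/*
 * For instance (hypothetical histogram): if the highest occupied bucket
 * is index 20 (segments of at least 1M and under 2M) and it holds 3
 * segments, the weight encodes count = 3 and index = 20, advertising at
 * least three free regions of at least 1M each.
 */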
3017 static uint64_t
3018 metaslab_segment_weight(metaslab_t *msp)
3019 {
3020 metaslab_group_t *mg = msp->ms_group;
3021 uint64_t weight = 0;
3022 uint8_t shift = mg->mg_vd->vdev_ashift;
3023
3024 ASSERT(MUTEX_HELD(&msp->ms_lock));
3025
3026 /*
3027 * The metaslab is completely free.
3028 */
3029 if (metaslab_allocated_space(msp) == 0) {
3030 int idx = highbit64(msp->ms_size) - 1;
3031 int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
3032
3033 if (idx < max_idx) {
3034 WEIGHT_SET_COUNT(weight, 1ULL);
3035 WEIGHT_SET_INDEX(weight, idx);
3036 } else {
3037 WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
3038 WEIGHT_SET_INDEX(weight, max_idx);
3039 }
3040 WEIGHT_SET_ACTIVE(weight, 0);
3041 ASSERT(!WEIGHT_IS_SPACEBASED(weight));
3042 return (weight);
3043 }
3044
3045 ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
3046
3047 /*
3048 * If the metaslab is fully allocated then just make the weight 0.
3049 */
3050 if (metaslab_allocated_space(msp) == msp->ms_size)
3051 return (0);
3052 /*
3053 * If the metaslab is already loaded, then use the range tree to
3054 * determine the weight. Otherwise, we rely on the space map information
3055 * to generate the weight.
3056 */
3057 if (msp->ms_loaded) {
3058 weight = metaslab_weight_from_range_tree(msp);
3059 } else {
3060 weight = metaslab_weight_from_spacemap(msp);
3061 }
3062
3063 /*
3064 * If the metaslab was active the last time we calculated its weight
3065 * then keep it active. We want to consume the entire region that
3066 * is associated with this weight.
3067 */
3068 if (msp->ms_activation_weight != 0 && weight != 0)
3069 WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight));
3070 return (weight);
3071 }
3072
3073 /*
3074 * Determine if we should attempt to allocate from this metaslab. If the
3075 * metaslab is loaded, then we can determine if the desired allocation
3076 * can be satisfied by looking at the size of the maximum free segment
3077 * on that metaslab. Otherwise, we make our decision based on the metaslab's
3078 * weight. For segment-based weighting we can determine the maximum
3079 * allocation based on the index encoded in its value. For space-based
3080 * weights we rely on the entire weight (excluding the weight-type bit).
3081 */
3082 boolean_t
3083 metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard)
3084 {
3085 /*
3086 * If the metaslab is loaded, ms_max_size is definitive and we can use
3087 * the fast check. If it's not, the ms_max_size is a lower bound (once
3088 * set), and we should use the fast check as long as we're not in
3089 * try_hard and it's been less than zfs_metaslab_max_size_cache_sec
3090 * seconds since the metaslab was unloaded.
3091 */
3092 if (msp->ms_loaded ||
3093 (msp->ms_max_size != 0 && !try_hard && gethrtime() <
3094 msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec)))
3095 return (msp->ms_max_size >= asize);
3096
3097 boolean_t should_allocate;
3098 if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
3099 /*
3100 * The metaslab segment weight indicates segments in the
3101 * range [2^i, 2^(i+1)), where i is the index in the weight.
3102 * Since the asize might be in the middle of the range, we
3103 * should attempt the allocation if asize < 2^(i+1).
3104 */
3105 should_allocate = (asize <
3106 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1));
3107 } else {
3108 should_allocate = (asize <=
3109 (msp->ms_weight & ~METASLAB_WEIGHT_TYPE));
3110 }
3111
3112 return (should_allocate);
3113 }
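
/*
 * Example of the segment-weight check above: a weight index of 20 means
 * the known free segments lie in [2^20, 2^21), so an asize of 1.5M is
 * worth attempting (1.5M < 2^21) while an asize of 3M is not.
 */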
3114
3115 static uint64_t
3116 metaslab_weight(metaslab_t *msp)
3117 {
3118 vdev_t *vd = msp->ms_group->mg_vd;
3119 spa_t *spa = vd->vdev_spa;
3120 uint64_t weight;
3121
3122 ASSERT(MUTEX_HELD(&msp->ms_lock));
3123
3124 metaslab_set_fragmentation(msp);
3125
3126 /*
3127 * Update the maximum size. If the metaslab is loaded, this will
3128 * ensure that we get an accurate maximum size if newly freed space
3129 * has been added back into the free tree. If the metaslab is
3130 * unloaded, we check if there's a larger free segment in the
3131 * unflushed frees. This is a lower bound on the largest allocatable
3132 * segment size. Coalescing of adjacent entries may reveal larger
3133 * allocatable segments, but we aren't aware of those until loading
3134 * the space map into a range tree.
3135 */
3136 if (msp->ms_loaded) {
3137 msp->ms_max_size = metaslab_largest_allocatable(msp);
3138 } else {
3139 msp->ms_max_size = MAX(msp->ms_max_size,
3140 metaslab_largest_unflushed_free(msp));
3141 }
3142
3143 /*
3144 * Segment-based weighting requires space map histogram support.
3145 */
3146 if (zfs_metaslab_segment_weight_enabled &&
3147 spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
3148 (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size ==
3149 sizeof (space_map_phys_t))) {
3150 weight = metaslab_segment_weight(msp);
3151 } else {
3152 weight = metaslab_space_weight(msp);
3153 }
3154 return (weight);
3155 }
3156
3157 void
3158 metaslab_recalculate_weight_and_sort(metaslab_t *msp)
3159 {
3160 ASSERT(MUTEX_HELD(&msp->ms_lock));
3161
3162 /* note: we preserve the mask (e.g. indication of primary, etc.) */
3163 uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
3164 metaslab_group_sort(msp->ms_group, msp,
3165 metaslab_weight(msp) | was_active);
3166 }
3167
3168 static int
3169 metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3170 int allocator, uint64_t activation_weight)
3171 {
3172 ASSERT(MUTEX_HELD(&msp->ms_lock));
3173
3174 /*
3175 * If we're activating for the claim code, we don't want to actually
3176 * set the metaslab up for a specific allocator.
3177 */
3178 if (activation_weight == METASLAB_WEIGHT_CLAIM) {
3179 ASSERT0(msp->ms_activation_weight);
3180 msp->ms_activation_weight = msp->ms_weight;
3181 metaslab_group_sort(mg, msp, msp->ms_weight |
3182 activation_weight);
3183 return (0);
3184 }
3185
3186 metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
3187 mg->mg_primaries : mg->mg_secondaries);
3188
3189 mutex_enter(&mg->mg_lock);
3190 if (arr[allocator] != NULL) {
3191 mutex_exit(&mg->mg_lock);
3192 return (EEXIST);
3193 }
3194
3195 arr[allocator] = msp;
3196 ASSERT3S(msp->ms_allocator, ==, -1);
3197 msp->ms_allocator = allocator;
3198 msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
3199
3200 ASSERT0(msp->ms_activation_weight);
3201 msp->ms_activation_weight = msp->ms_weight;
3202 metaslab_group_sort_impl(mg, msp,
3203 msp->ms_weight | activation_weight);
3204
3205 mutex_exit(&mg->mg_lock);
3206
3207 return (0);
3208 }
3209
3210 static int
3211 metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
3212 {
3213 ASSERT(MUTEX_HELD(&msp->ms_lock));
3214
3215 /*
3216 * The current metaslab is already activated for us so there
3217 * is nothing to do. Being activated, though, doesn't mean
3218 * that this metaslab is activated for our allocator or at our
3219 * requested activation weight. The metaslab could have started
3220 * as an active one for our allocator but changed allocators
3221 * while we were waiting to grab its ms_lock or we stole it
3222 * [see find_valid_metaslab()]. This means that there is a
3223 * possibility of passivating a metaslab of another allocator
3224 * or from a different activation mask, from this thread.
3225 */
3226 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3227 ASSERT(msp->ms_loaded);
3228 return (0);
3229 }
3230
3231 int error = metaslab_load(msp);
3232 if (error != 0) {
3233 metaslab_group_sort(msp->ms_group, msp, 0);
3234 return (error);
3235 }
3236
3237 /*
3238 * When entering metaslab_load() we may have dropped the
3239 * ms_lock because we were loading this metaslab, or we
3240 * were waiting for another thread to load it for us. In
3241 * that scenario, we recheck the weight of the metaslab
3242 * to see if it was activated by another thread.
3243 *
3244 * If the metaslab was activated for another allocator or
3245 * it was activated with a different activation weight (e.g.
3246 * we wanted to make it a primary but it was activated as
3247 * secondary) we return error (EBUSY).
3248 *
3249 * If the metaslab was activated for the same allocator
3250 * and requested activation mask, skip activating it.
3251 */
3252 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
3253 if (msp->ms_allocator != allocator)
3254 return (EBUSY);
3255
3256 if ((msp->ms_weight & activation_weight) == 0)
3257 return (EBUSY);
3258
3259 EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY),
3260 msp->ms_primary);
3261 return (0);
3262 }
3263
3264 /*
3265 * If the metaslab has literally 0 space, it will have weight 0. In
3266 * that case, don't bother activating it. This can happen if the
3267 * metaslab had space during find_valid_metaslab, but another thread
3268 * loaded it and used all that space while we were waiting to grab the
3269 * lock.
3270 */
3271 if (msp->ms_weight == 0) {
3272 ASSERT0(range_tree_space(msp->ms_allocatable));
3273 return (SET_ERROR(ENOSPC));
3274 }
3275
3276 if ((error = metaslab_activate_allocator(msp->ms_group, msp,
3277 allocator, activation_weight)) != 0) {
3278 return (error);
3279 }
3280
3281 ASSERT(msp->ms_loaded);
3282 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
3283
3284 return (0);
3285 }
3286
3287 static void
3288 metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
3289 uint64_t weight)
3290 {
3291 ASSERT(MUTEX_HELD(&msp->ms_lock));
3292 ASSERT(msp->ms_loaded);
3293
3294 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
3295 metaslab_group_sort(mg, msp, weight);
3296 return;
3297 }
3298
3299 mutex_enter(&mg->mg_lock);
3300 ASSERT3P(msp->ms_group, ==, mg);
3301 ASSERT3S(0, <=, msp->ms_allocator);
3302 ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
3303
3304 if (msp->ms_primary) {
3305 ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
3306 ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
3307 mg->mg_primaries[msp->ms_allocator] = NULL;
3308 } else {
3309 ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
3310 ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
3311 mg->mg_secondaries[msp->ms_allocator] = NULL;
3312 }
3313 msp->ms_allocator = -1;
3314 metaslab_group_sort_impl(mg, msp, weight);
3315 mutex_exit(&mg->mg_lock);
3316 }
3317
3318 static void
3319 metaslab_passivate(metaslab_t *msp, uint64_t weight)
3320 {
3321 uint64_t size = weight & ~METASLAB_WEIGHT_TYPE;
3322
3323 /*
3324 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
3325 * this metaslab again. In that case, it had better be empty,
3326 * or we would be leaving space on the table.
3327 */
3328 ASSERT(size >= SPA_MINBLOCKSIZE ||
3329 range_tree_is_empty(msp->ms_allocatable));
3330 ASSERT0(weight & METASLAB_ACTIVE_MASK);
3331
3332 ASSERT(msp->ms_activation_weight != 0);
3333 msp->ms_activation_weight = 0;
3334 metaslab_passivate_allocator(msp->ms_group, msp, weight);
3335 ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
3336 }
3337
3338 /*
3339 * Segment-based metaslabs are activated once and remain active until
3340 * we either fail an allocation attempt (similar to space-based metaslabs)
3341 * or have exhausted the free space in zfs_metaslab_switch_threshold
3342 * buckets since the metaslab was activated. This function checks to see
3343 * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
3344 * metaslab and passivates it proactively. This will allow us to select a
3345 * metaslab with a larger contiguous region, if any remains within this
3346 * metaslab group. If we're in sync pass > 1, then we continue using this
3347 * metaslab so that we don't dirty more blocks and cause more sync passes.
3348 */
3349 void
3350 metaslab_segment_may_passivate(metaslab_t *msp)
3351 {
3352 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3353
3354 if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
3355 return;
3356
3357 /*
3358 * Since we are in the middle of a sync pass, the most accurate
3359 * information that is accessible to us is the in-core range tree
3360 * histogram; calculate the new weight based on that information.
3361 */
3362 uint64_t weight = metaslab_weight_from_range_tree(msp);
3363 int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
3364 int current_idx = WEIGHT_GET_INDEX(weight);
3365
3366 if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
3367 metaslab_passivate(msp, weight);
3368 }
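/*
 * Worked example (editorial, assuming the default
 * zfs_metaslab_switch_threshold of 2): a metaslab activated when its
 * largest free segments were in bucket index 20 (~1MB) stays active
 * until the in-core histogram shows index 18 (~256K) or lower:
 *
 *	activation_idx = 20;
 *	current_idx <= 20 - 2	=> metaslab_passivate(msp, weight);
 *
 * i.e. we tolerate losing up to two power-of-two buckets of contiguity
 * before proactively switching to a better metaslab.
 */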
3369
3370 static void
3371 metaslab_preload(void *arg)
3372 {
3373 metaslab_t *msp = arg;
3374 metaslab_class_t *mc = msp->ms_group->mg_class;
3375 spa_t *spa = mc->mc_spa;
3376
3377 ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
3378
3379 mutex_enter(&msp->ms_lock);
3380 (void) metaslab_load(msp);
3381 metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
3382 mutex_exit(&msp->ms_lock);
3383 }
3384
3385 static void
3386 metaslab_group_preload(metaslab_group_t *mg)
3387 {
3388 spa_t *spa = mg->mg_vd->vdev_spa;
3389 metaslab_t *msp;
3390 avl_tree_t *t = &mg->mg_metaslab_tree;
3391 int m = 0;
3392
3393 if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
3394 taskq_wait(mg->mg_taskq);
3395 return;
3396 }
3397
3398 mutex_enter(&mg->mg_lock);
3399
3400 /*
3401 * Load the next potential metaslabs
3402 */
3403 for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
3404 ASSERT3P(msp->ms_group, ==, mg);
3405
3406 /*
3407 * We preload only the maximum number of metaslabs specified
3408 * by metaslab_preload_limit. If a metaslab is being forced
3409 * to condense then we preload it too. This will ensure
3410 * that force condensing happens in the next txg.
3411 */
3412 if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
3413 continue;
3414 }
3415
3416 VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
3417 msp, TQ_SLEEP) != TASKQID_INVALID);
3418 }
3419 mutex_exit(&mg->mg_lock);
3420 }
3421
3422 /*
3423 * Determine if the space map's on-disk footprint is past our tolerance for
3424 * inefficiency. We would like to use the following criteria to make our
3425 * decision:
3426 *
3427 * 1. Do not condense if the size of the space map object would dramatically
3428 * increase as a result of writing out the free space range tree.
3429 *
3430 * 2. Condense if the on-disk space map representation is at least
3431 * zfs_condense_pct/100 times the size of the optimal representation
3432 * (e.g. zfs_condense_pct = 110 and optimal = 1MB: condense at >= 1.1MB).
3433 *
3434 * 3. Do not condense if the on-disk size of the space map does not actually
3435 * decrease.
3436 *
3437 * Unfortunately, we cannot compute the on-disk size of the space map in this
3438 * context because we cannot accurately compute the effects of compression, etc.
3439 * Instead, we apply the heuristic described in the block comment for
3440 * zfs_metaslab_condense_block_threshold - we only condense if the space used
3441 * is greater than a threshold number of blocks.
3442 */
3443 static boolean_t
3444 metaslab_should_condense(metaslab_t *msp)
3445 {
3446 space_map_t *sm = msp->ms_sm;
3447 vdev_t *vd = msp->ms_group->mg_vd;
3448 uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
3449
3450 ASSERT(MUTEX_HELD(&msp->ms_lock));
3451 ASSERT(msp->ms_loaded);
3452 ASSERT(sm != NULL);
3453 ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
3454
3455 /*
3456 * We always condense metaslabs that are empty and metaslabs for
3457 * which a condense request has been made.
3458 */
3459 if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
3460 msp->ms_condense_wanted)
3461 return (B_TRUE);
3462
3463 uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
3464 uint64_t object_size = space_map_length(sm);
3465 uint64_t optimal_size = space_map_estimate_optimal_size(sm,
3466 msp->ms_allocatable, SM_NO_VDEVID);
3467
3468 return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
3469 object_size > zfs_metaslab_condense_block_threshold * record_size);
3470 }
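/*
 * Worked example (editorial): with the defaults zfs_condense_pct = 200
 * and zfs_metaslab_condense_block_threshold = 4, a 4K-sector vdev and
 * a 128K sm_blksz give record_size = 128K, so we condense only when
 *
 *	object_size >= 2 * optimal_size  &&  object_size > 4 * 128K
 *
 * e.g. a 600K space map whose optimal form is 250K condenses, while a
 * 400K map does not (it fails the 512K block threshold).
 */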
3471
3472 /*
3473 * Condense the on-disk space map representation to its minimized form.
3474 * The minimized form consists of a small number of allocations followed
3475 * by the entries of the free range tree (ms_allocatable). The condensed
3476 * spacemap contains all the entries of previous TXGs (including those in
3477 * the pool-wide log spacemaps; thus this is effectively a superset of
3478 * metaslab_flush()), but this TXG's entries still need to be written.
3479 */
3480 static void
3481 metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
3482 {
3483 range_tree_t *condense_tree;
3484 space_map_t *sm = msp->ms_sm;
3485 uint64_t txg = dmu_tx_get_txg(tx);
3486 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3487
3488 ASSERT(MUTEX_HELD(&msp->ms_lock));
3489 ASSERT(msp->ms_loaded);
3490 ASSERT(msp->ms_sm != NULL);
3491
3492 /*
3493 * In order to condense the space map, we need to change it so it
3494 * only describes which segments are currently allocated and free.
3495 *
3496 * All the current free space resides in the ms_allocatable, all
3497 * the ms_defer trees, and all the ms_allocating trees. We ignore
3498 * ms_freed because it is empty, as we're in sync pass 1. We
3499 * ignore ms_freeing because these changes are not yet reflected
3500 * in the spacemap (they will be written later this txg).
3501 *
3502 * So to truncate the space map to represent all the entries of
3503 * previous TXGs we do the following:
3504 *
3505 * 1] We create a range tree (condense tree) that is 100% empty.
3506 * 2] We add to it all segments found in the ms_defer trees
3507 * as those segments are marked as free in the original space
3508 * map. We do the same with the ms_allocating trees for the same
3509 * reason. Adding these segments should be a relatively
3510 * inexpensive operation since we expect these trees to have a
3511 * small number of nodes.
3512 * 3] We vacate any unflushed allocs, since they are not frees we
3513 * need to add to the condense tree. Then we vacate any
3514 * unflushed frees as they should already be part of ms_allocatable.
3515 * 4] At this point, we would ideally like to add all segments
3516 * in the ms_allocatable tree to the condense tree. This way
3517 * we would write all the entries of the condense tree as the
3518 * condensed space map, which would only contain freed
3519 * segments with everything else assumed to be allocated.
3520 *
3521 * Doing so can be prohibitively expensive as ms_allocatable can
3522 * be large, and therefore computationally expensive to add to
3523 * the condense_tree. Instead we first sync out an entry marking
3524 * everything as allocated, then the condense_tree and then the
3525 * ms_allocatable, in the condensed space map. While this is not
3526 * optimal, it is typically close to optimal and more importantly
3527 * much cheaper to compute.
3528 *
3529 * 5] Finally, as both of the unflushed trees were written to our
3530 * new and condensed metaslab space map, we have effectively
3531 * flushed all the unflushed changes to disk, so we call
3532 * metaslab_flush_update().
3533 */
3534 ASSERT3U(spa_sync_pass(spa), ==, 1);
3535 ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
3536
3537 zfs_dbgmsg("condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
3538 "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
3539 msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
3540 spa->spa_name, space_map_length(msp->ms_sm),
3541 range_tree_numsegs(msp->ms_allocatable),
3542 msp->ms_condense_wanted ? "TRUE" : "FALSE");
3543
3544 msp->ms_condense_wanted = B_FALSE;
3545
3546 range_seg_type_t type;
3547 uint64_t shift, start;
3548 type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
3549 &start, &shift);
3550
3551 condense_tree = range_tree_create(NULL, type, NULL, start, shift);
3552
3553 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3554 range_tree_walk(msp->ms_defer[t],
3555 range_tree_add, condense_tree);
3556 }
3557
3558 for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
3559 range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
3560 range_tree_add, condense_tree);
3561 }
3562
3563 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3564 metaslab_unflushed_changes_memused(msp));
3565 spa->spa_unflushed_stats.sus_memused -=
3566 metaslab_unflushed_changes_memused(msp);
3567 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3568 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3569
3570 /*
3571 * We're about to drop the metaslab's lock thus allowing other
3572 * consumers to change its content. Set the metaslab's ms_condensing
3573 * flag to ensure that allocations on this metaslab do not occur
3574 * while we're in the middle of committing it to disk. This is only
3575 * critical for ms_allocatable as all other range trees use per TXG
3576 * views of their content.
3577 */
3578 msp->ms_condensing = B_TRUE;
3579
3580 mutex_exit(&msp->ms_lock);
3581 uint64_t object = space_map_object(msp->ms_sm);
3582 space_map_truncate(sm,
3583 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3584 zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
3585
3586 /*
3587 * space_map_truncate() may have reallocated the spacemap object.
3588 * If so, update the vdev_ms_array.
3589 */
3590 if (space_map_object(msp->ms_sm) != object) {
3591 object = space_map_object(msp->ms_sm);
3592 dmu_write(spa->spa_meta_objset,
3593 msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
3594 msp->ms_id, sizeof (uint64_t), &object, tx);
3595 }
3596
3597 /*
3598 * Note:
3599 * When the log space map feature is enabled, each space map will
3600 * always have ALLOCS followed by FREES for each sync pass. This is
3601 * typically true even when the log space map feature is disabled,
3602 * except in the case where a metaslab goes through metaslab_sync()
3603 * and gets condensed. In that case the metaslab's space map will have
3604 * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
3605 * followed by FREES (due to space_map_write() in metaslab_sync()) for
3606 * sync pass 1.
3607 */
3608 range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
3609 shift);
3610 range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
3611 space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
3612 space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
3613 space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
3614
3615 range_tree_vacate(condense_tree, NULL, NULL);
3616 range_tree_destroy(condense_tree);
3617 range_tree_vacate(tmp_tree, NULL, NULL);
3618 range_tree_destroy(tmp_tree);
3619 mutex_enter(&msp->ms_lock);
3620
3621 msp->ms_condensing = B_FALSE;
3622 metaslab_flush_update(msp, tx);
3623 }
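/*
 * Editorial sketch of the condensed space map written above, for a
 * metaslab covering [start, start + size):
 *
 *	ALLOC [start, start + size)	(tmp_tree: everything allocated)
 *	FREE  <ms_allocatable segments>	(current free space)
 *	FREE  <condense_tree segments>	(defer + allocating ranges)
 *
 * Replaying these entries reconstructs exactly the free space of the
 * previous TXGs, per the big comment at the top of the function.
 */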
3624
3625 /*
3626 * Called when the metaslab has been flushed (its own spacemap now reflects
3627 * all the contents of the pool-wide spacemap log). Updates the metaslab's
3628 * metadata and any pool-wide related log space map data (e.g. summary,
3629 * obsolete logs, etc.) to reflect that.
3630 */
3631 static void
3632 metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
3633 {
3634 metaslab_group_t *mg = msp->ms_group;
3635 spa_t *spa = mg->mg_vd->vdev_spa;
3636
3637 ASSERT(MUTEX_HELD(&msp->ms_lock));
3638
3639 ASSERT3U(spa_sync_pass(spa), ==, 1);
3640 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3641 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3642
3643 /*
3644 * Just because a metaslab got flushed, that doesn't mean that
3645 * it will pass through metaslab_sync_done(). Thus, make sure to
3646 * update ms_synced_length here in case it doesn't.
3647 */
3648 msp->ms_synced_length = space_map_length(msp->ms_sm);
3649
3650 /*
3651 * We may end up here from metaslab_condense() without the
3652 * feature being active. In that case this is a no-op.
3653 */
3654 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
3655 return;
3656
3657 ASSERT(spa_syncing_log_sm(spa) != NULL);
3658 ASSERT(msp->ms_sm != NULL);
3659 ASSERT(metaslab_unflushed_txg(msp) != 0);
3660 ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
3661
3662 VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
3663
3664 /* update metaslab's position in our flushing tree */
3665 uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
3666 mutex_enter(&spa->spa_flushed_ms_lock);
3667 avl_remove(&spa->spa_metaslabs_by_flushed, msp);
3668 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3669 avl_add(&spa->spa_metaslabs_by_flushed, msp);
3670 mutex_exit(&spa->spa_flushed_ms_lock);
3671
3672 /* update metaslab counts of spa_log_sm_t nodes */
3673 spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
3674 spa_log_sm_increment_current_mscount(spa);
3675
3676 /* cleanup obsolete logs if any */
3677 uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
3678 spa_cleanup_old_sm_logs(spa, tx);
3679 uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
3680 VERIFY3U(log_blocks_after, <=, log_blocks_before);
3681
3682 /* update log space map summary */
3683 uint64_t blocks_gone = log_blocks_before - log_blocks_after;
3684 spa_log_summary_add_flushed_metaslab(spa);
3685 spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
3686 spa_log_summary_decrement_blkcount(spa, blocks_gone);
3687 }
3688
3689 boolean_t
3690 metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
3691 {
3692 spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
3693
3694 ASSERT(MUTEX_HELD(&msp->ms_lock));
3695 ASSERT3U(spa_sync_pass(spa), ==, 1);
3696 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
3697
3698 ASSERT(msp->ms_sm != NULL);
3699 ASSERT(metaslab_unflushed_txg(msp) != 0);
3700 ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
3701
3702 /*
3703 * There is nothing wrong with flushing the same metaslab twice, as
3704 * this codepath can handle that case. However, the current
3705 * flushing scheme makes sure to avoid this situation as we would be
3706 * making all these calls without having anything meaningful to write
3707 * to disk. We assert this behavior here.
3708 */
3709 ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
3710
3711 /*
3712 * We cannot flush while loading, because then we would
3713 * not load the ms_unflushed_{allocs,frees}.
3714 */
3715 if (msp->ms_loading)
3716 return (B_FALSE);
3717
3718 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3719 metaslab_verify_weight_and_frag(msp);
3720
3721 /*
3722 * Metaslab condensing is effectively flushing. Therefore if the
3723 * metaslab can be condensed we can just condense it instead of
3724 * flushing it.
3725 *
3726 * Note that metaslab_condense() does call metaslab_flush_update()
3727 * so we can just return immediately after condensing. We also
3728 * don't need to care about setting ms_flushing or broadcasting
3729 * ms_flush_cv, even if we temporarily drop the ms_lock in
3730 * metaslab_condense(), as the metaslab is already loaded.
3731 */
3732 if (msp->ms_loaded && metaslab_should_condense(msp)) {
3733 metaslab_group_t *mg = msp->ms_group;
3734
3735 /*
3736 * For all histogram operations below refer to the
3737 * comments of metaslab_sync() where we follow a
3738 * similar procedure.
3739 */
3740 metaslab_group_histogram_verify(mg);
3741 metaslab_class_histogram_verify(mg->mg_class);
3742 metaslab_group_histogram_remove(mg, msp);
3743
3744 metaslab_condense(msp, tx);
3745
3746 space_map_histogram_clear(msp->ms_sm);
3747 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
3748 ASSERT(range_tree_is_empty(msp->ms_freed));
3749 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
3750 space_map_histogram_add(msp->ms_sm,
3751 msp->ms_defer[t], tx);
3752 }
3753 metaslab_aux_histograms_update(msp);
3754
3755 metaslab_group_histogram_add(mg, msp);
3756 metaslab_group_histogram_verify(mg);
3757 metaslab_class_histogram_verify(mg->mg_class);
3758
3759 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3760
3761 /*
3762 * Since we recreated the histogram (and potentially
3763 * the ms_sm too while condensing) ensure that the
3764 * weight is updated too because we are not guaranteed
3765 * that this metaslab is dirty and will go through
3766 * metaslab_sync_done().
3767 */
3768 metaslab_recalculate_weight_and_sort(msp);
3769 return (B_TRUE);
3770 }
3771
3772 msp->ms_flushing = B_TRUE;
3773 uint64_t sm_len_before = space_map_length(msp->ms_sm);
3774
3775 mutex_exit(&msp->ms_lock);
3776 space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC,
3777 SM_NO_VDEVID, tx);
3778 space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE,
3779 SM_NO_VDEVID, tx);
3780 mutex_enter(&msp->ms_lock);
3781
3782 uint64_t sm_len_after = space_map_length(msp->ms_sm);
3783 if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) {
3784 zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, "
3785 "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, "
3786 "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa),
3787 msp->ms_group->mg_vd->vdev_id, msp->ms_id,
3788 range_tree_space(msp->ms_unflushed_allocs),
3789 range_tree_space(msp->ms_unflushed_frees),
3790 (sm_len_after - sm_len_before));
3791 }
3792
3793 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3794 metaslab_unflushed_changes_memused(msp));
3795 spa->spa_unflushed_stats.sus_memused -=
3796 metaslab_unflushed_changes_memused(msp);
3797 range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
3798 range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
3799
3800 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3801 metaslab_verify_weight_and_frag(msp);
3802
3803 metaslab_flush_update(msp, tx);
3804
3805 metaslab_verify_space(msp, dmu_tx_get_txg(tx));
3806 metaslab_verify_weight_and_frag(msp);
3807
3808 msp->ms_flushing = B_FALSE;
3809 cv_broadcast(&msp->ms_flush_cv);
3810 return (B_TRUE);
3811 }
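/*
 * Editorial summary of the flush path above, as pseudocode:
 *
 *	if (msp->ms_loading)
 *		return (B_FALSE);		// caller retries later
 *	if (msp->ms_loaded && metaslab_should_condense(msp))
 *		metaslab_condense(msp, tx);	// condensing == flushing
 *	else
 *		append ms_unflushed_{allocs,frees} to ms_sm;
 *	metaslab_flush_update(msp, tx);		// done inside condense too
 */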
3812
3813 /*
3814 * Write a metaslab to disk in the context of the specified transaction group.
3815 */
3816 void
3817 metaslab_sync(metaslab_t *msp, uint64_t txg)
3818 {
3819 metaslab_group_t *mg = msp->ms_group;
3820 vdev_t *vd = mg->mg_vd;
3821 spa_t *spa = vd->vdev_spa;
3822 objset_t *mos = spa_meta_objset(spa);
3823 range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK];
3824 dmu_tx_t *tx;
3825
3826 ASSERT(!vd->vdev_ishole);
3827
3828 /*
3829 * This metaslab has just been added so there's no work to do now.
3830 */
3831 if (msp->ms_freeing == NULL) {
3832 ASSERT3P(alloctree, ==, NULL);
3833 return;
3834 }
3835
3836 ASSERT3P(alloctree, !=, NULL);
3837 ASSERT3P(msp->ms_freeing, !=, NULL);
3838 ASSERT3P(msp->ms_freed, !=, NULL);
3839 ASSERT3P(msp->ms_checkpointing, !=, NULL);
3840 ASSERT3P(msp->ms_trim, !=, NULL);
3841
3842 /*
3843 * Normally, we don't want to process a metaslab if there are no
3844 * allocations or frees to perform. However, if the metaslab is being
3845 * forced to condense, it's loaded and we're not beyond the final
3846 * dirty txg, we need to let it through. Not condensing beyond the
3847 * final dirty txg prevents an issue where metaslabs that need to be
3848 * condensed but were loaded for other reasons could cause a panic
3849 * here. By only checking the txg in that branch of the conditional,
3850 * we preserve the utility of the VERIFY statements in all other
3851 * cases.
3852 */
3853 if (range_tree_is_empty(alloctree) &&
3854 range_tree_is_empty(msp->ms_freeing) &&
3855 range_tree_is_empty(msp->ms_checkpointing) &&
3856 !(msp->ms_loaded && msp->ms_condense_wanted &&
3857 txg <= spa_final_dirty_txg(spa)))
3858 return;
3859
3860
3861 VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
3862
3863 /*
3864 * The only state that can actually be changing concurrently
3865 * with metaslab_sync() is the metaslab's ms_allocatable. No
3866 * other thread can be modifying this txg's alloc, freeing,
3867 * freed, or space_map_phys_t. We drop ms_lock whenever we
3868 * could call into the DMU, because the DMU can call down to
3869 * us (e.g. via zio_free()) at any time.
3870 *
3871 * The spa_vdev_remove_thread() can be reading metaslab state
3872 * concurrently, and it is locked out by the ms_sync_lock.
3873 * Note that the ms_lock is insufficient for this, because it
3874 * is dropped by space_map_write().
3875 */
3876 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
3877
3878 /*
3879 * Generate a log space map if one doesn't exist already.
3880 */
3881 spa_generate_syncing_log_sm(spa, tx);
3882
3883 if (msp->ms_sm == NULL) {
3884 uint64_t new_object = space_map_alloc(mos,
3885 spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
3886 zfs_metaslab_sm_blksz_with_log :
3887 zfs_metaslab_sm_blksz_no_log, tx);
3888 VERIFY3U(new_object, !=, 0);
3889
3890 dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
3891 msp->ms_id, sizeof (uint64_t), &new_object, tx);
3892
3893 VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
3894 msp->ms_start, msp->ms_size, vd->vdev_ashift));
3895 ASSERT(msp->ms_sm != NULL);
3896
3897 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3898 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3899 ASSERT0(metaslab_allocated_space(msp));
3900 }
3901
3902 if (metaslab_unflushed_txg(msp) == 0 &&
3903 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
3904 ASSERT(spa_syncing_log_sm(spa) != NULL);
3905
3906 metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
3907 spa_log_sm_increment_current_mscount(spa);
3908 spa_log_summary_add_flushed_metaslab(spa);
3909
3910 ASSERT(msp->ms_sm != NULL);
3911 mutex_enter(&spa->spa_flushed_ms_lock);
3912 avl_add(&spa->spa_metaslabs_by_flushed, msp);
3913 mutex_exit(&spa->spa_flushed_ms_lock);
3914
3915 ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
3916 ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
3917 }
3918
3919 if (!range_tree_is_empty(msp->ms_checkpointing) &&
3920 vd->vdev_checkpoint_sm == NULL) {
3921 ASSERT(spa_has_checkpoint(spa));
3922
3923 uint64_t new_object = space_map_alloc(mos,
3924 zfs_vdev_standard_sm_blksz, tx);
3925 VERIFY3U(new_object, !=, 0);
3926
3927 VERIFY0(space_map_open(&vd->vdev_checkpoint_sm,
3928 mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift));
3929 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3930
3931 /*
3932 * We save the space map object as an entry in vdev_top_zap
3933 * so it can be retrieved when the pool is reopened after an
3934 * export or through zdb.
3935 */
3936 VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset,
3937 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
3938 sizeof (new_object), 1, &new_object, tx));
3939 }
3940
3941 mutex_enter(&msp->ms_sync_lock);
3942 mutex_enter(&msp->ms_lock);
3943
3944 /*
3945 * Note: metaslab_condense() clears the space map's histogram.
3946 * Therefore we must verify and remove this histogram before
3947 * condensing.
3948 */
3949 metaslab_group_histogram_verify(mg);
3950 metaslab_class_histogram_verify(mg->mg_class);
3951 metaslab_group_histogram_remove(mg, msp);
3952
3953 if (spa->spa_sync_pass == 1 && msp->ms_loaded &&
3954 metaslab_should_condense(msp))
3955 metaslab_condense(msp, tx);
3956
3957 /*
3958 * We'll be going to disk to sync our space accounting, thus we
3959 * drop the ms_lock during that time so allocations coming from
3960 * open-context (ZIL) for future TXGs do not block.
3961 */
3962 mutex_exit(&msp->ms_lock);
3963 space_map_t *log_sm = spa_syncing_log_sm(spa);
3964 if (log_sm != NULL) {
3965 ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3966
3967 space_map_write(log_sm, alloctree, SM_ALLOC,
3968 vd->vdev_id, tx);
3969 space_map_write(log_sm, msp->ms_freeing, SM_FREE,
3970 vd->vdev_id, tx);
3971 mutex_enter(&msp->ms_lock);
3972
3973 ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
3974 metaslab_unflushed_changes_memused(msp));
3975 spa->spa_unflushed_stats.sus_memused -=
3976 metaslab_unflushed_changes_memused(msp);
3977 range_tree_remove_xor_add(alloctree,
3978 msp->ms_unflushed_frees, msp->ms_unflushed_allocs);
3979 range_tree_remove_xor_add(msp->ms_freeing,
3980 msp->ms_unflushed_allocs, msp->ms_unflushed_frees);
3981 spa->spa_unflushed_stats.sus_memused +=
3982 metaslab_unflushed_changes_memused(msp);
3983 } else {
3984 ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP));
3985
3986 space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
3987 SM_NO_VDEVID, tx);
3988 space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
3989 SM_NO_VDEVID, tx);
3990 mutex_enter(&msp->ms_lock);
3991 }
3992
3993 msp->ms_allocated_space += range_tree_space(alloctree);
3994 ASSERT3U(msp->ms_allocated_space, >=,
3995 range_tree_space(msp->ms_freeing));
3996 msp->ms_allocated_space -= range_tree_space(msp->ms_freeing);
3997
3998 if (!range_tree_is_empty(msp->ms_checkpointing)) {
3999 ASSERT(spa_has_checkpoint(spa));
4000 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
4001
4002 /*
4003 * Since we are doing writes to disk and the ms_checkpointing
4004 * tree won't be changing during that time, we drop the
4005 * ms_lock while writing to the checkpoint space map, for the
4006 * same reason mentioned above.
4007 */
4008 mutex_exit(&msp->ms_lock);
4009 space_map_write(vd->vdev_checkpoint_sm,
4010 msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
4011 mutex_enter(&msp->ms_lock);
4012
4013 spa->spa_checkpoint_info.sci_dspace +=
4014 range_tree_space(msp->ms_checkpointing);
4015 vd->vdev_stat.vs_checkpoint_space +=
4016 range_tree_space(msp->ms_checkpointing);
4017 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==,
4018 -space_map_allocated(vd->vdev_checkpoint_sm));
4019
4020 range_tree_vacate(msp->ms_checkpointing, NULL, NULL);
4021 }
4022
4023 if (msp->ms_loaded) {
4024 /*
4025 * When the space map is loaded, we have an accurate
4026 * histogram in the range tree. This gives us an opportunity
4027 * to bring the space map's histogram up-to-date so we clear
4028 * it first before updating it.
4029 */
4030 space_map_histogram_clear(msp->ms_sm);
4031 space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx);
4032
4033 /*
4034 * Since we've cleared the histogram we need to add back
4035 * any free space that has already been processed, plus
4036 * any deferred space. This allows the on-disk histogram
4037 * to accurately reflect all free space even if some space
4038 * is not yet available for allocation (i.e. deferred).
4039 */
4040 space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx);
4041
4042 /*
4043 * Add back any deferred free space that has not been
4044 * added back into the in-core free tree yet. This will
4045 * ensure that we don't end up with a space map histogram
4046 * that is completely empty unless the metaslab is fully
4047 * allocated.
4048 */
4049 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
4050 space_map_histogram_add(msp->ms_sm,
4051 msp->ms_defer[t], tx);
4052 }
4053 }
4054
4055 /*
4056 * Always add the free space from this sync pass to the space
4057 * map histogram. We want to make sure that the on-disk histogram
4058 * accounts for all free space. If the space map is not loaded,
4059 * then we will lose some accuracy but will correct it the next
4060 * time we load the space map.
4061 */
4062 space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx);
4063 metaslab_aux_histograms_update(msp);
4064
4065 metaslab_group_histogram_add(mg, msp);
4066 metaslab_group_histogram_verify(mg);
4067 metaslab_class_histogram_verify(mg->mg_class);
4068
4069 /*
4070 * For sync pass 1, we avoid traversing this txg's free range tree
4071 * and instead will just swap the pointers for freeing and freed.
4072 * We can safely do this since the freed_tree is guaranteed to be
4073 * empty on the initial pass.
4074 *
4075 * Keep in mind that even if we are currently using a log spacemap
4076 * we want current frees to end up in the ms_allocatable (but not
4077 * get appended to the ms_sm) so their ranges can be reused as usual.
4078 */
4079 if (spa_sync_pass(spa) == 1) {
4080 range_tree_swap(&msp->ms_freeing, &msp->ms_freed);
4081 ASSERT0(msp->ms_allocated_this_txg);
4082 } else {
4083 range_tree_vacate(msp->ms_freeing,
4084 range_tree_add, msp->ms_freed);
4085 }
4086 msp->ms_allocated_this_txg += range_tree_space(alloctree);
4087 range_tree_vacate(alloctree, NULL, NULL);
4088
4089 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
4090 ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg)
4091 & TXG_MASK]));
4092 ASSERT0(range_tree_space(msp->ms_freeing));
4093 ASSERT0(range_tree_space(msp->ms_checkpointing));
4094
4095 mutex_exit(&msp->ms_lock);
4096
4097 /*
4098 * Verify that the space map object ID has been recorded in the
4099 * vdev_ms_array.
4100 */
4101 uint64_t object;
4102 VERIFY0(dmu_read(mos, vd->vdev_ms_array,
4103 msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0));
4104 VERIFY3U(object, ==, space_map_object(msp->ms_sm));
4105
4106 mutex_exit(&msp->ms_sync_lock);
4107 dmu_tx_commit(tx);
4108 }
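/*
 * Editorial example of the remove_xor_add step above: each segment of
 * this txg's allocs first cancels any overlap in ms_unflushed_frees,
 * and only the remainder lands in ms_unflushed_allocs (frees work
 * symmetrically). E.g. with [4M, 5M) in ms_unflushed_frees, allocating
 * [4M, 4M+512K) this txg yields:
 *
 *	ms_unflushed_frees:  [4M+512K, 5M)
 *	ms_unflushed_allocs: unchanged (the alloc was fully absorbed)
 */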
4109
4110 static void
4111 metaslab_evict(metaslab_t *msp, uint64_t txg)
4112 {
4113 if (!msp->ms_loaded || msp->ms_disabled != 0)
4114 return;
4115
4116 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
4117 VERIFY0(range_tree_space(
4118 msp->ms_allocating[(txg + t) & TXG_MASK]));
4119 }
4120 if (msp->ms_allocator != -1)
4121 metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
4122
4123 if (!metaslab_debug_unload)
4124 metaslab_unload(msp);
4125 }
4126
4127 /*
4128 * Called after a transaction group has completely synced to mark
4129 * all of the metaslab's free space as usable.
4130 */
4131 void
4132 metaslab_sync_done(metaslab_t *msp, uint64_t txg)
4133 {
4134 metaslab_group_t *mg = msp->ms_group;
4135 vdev_t *vd = mg->mg_vd;
4136 spa_t *spa = vd->vdev_spa;
4137 range_tree_t **defer_tree;
4138 int64_t alloc_delta, defer_delta;
4139 boolean_t defer_allowed = B_TRUE;
4140
4141 ASSERT(!vd->vdev_ishole);
4142
4143 mutex_enter(&msp->ms_lock);
4144
4145 /*
4146 * If this metaslab is just becoming available, initialize its
4147 * range trees and add its capacity to the vdev.
4148 */
4149 if (msp->ms_freed == NULL) {
4150 range_seg_type_t type;
4151 uint64_t shift, start;
4152 type = metaslab_calculate_range_tree_type(vd, msp, &start,
4153 &shift);
4154
4155 for (int t = 0; t < TXG_SIZE; t++) {
4156 ASSERT(msp->ms_allocating[t] == NULL);
4157
4158 msp->ms_allocating[t] = range_tree_create(NULL, type,
4159 NULL, start, shift);
4160 }
4161
4162 ASSERT3P(msp->ms_freeing, ==, NULL);
4163 msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
4164 shift);
4165
4166 ASSERT3P(msp->ms_freed, ==, NULL);
4167 msp->ms_freed = range_tree_create(NULL, type, NULL, start,
4168 shift);
4169
4170 for (int t = 0; t < TXG_DEFER_SIZE; t++) {
4171 ASSERT3P(msp->ms_defer[t], ==, NULL);
4172 msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
4173 start, shift);
4174 }
4175
4176 ASSERT3P(msp->ms_checkpointing, ==, NULL);
4177 msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
4178 start, shift);
4179
4180 ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
4181 msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
4182 start, shift);
4183
4184 metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
4185 mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
4186 mrap->mra_floor_shift = metaslab_by_size_min_shift;
4187 ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
4188 msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
4189 type, mrap, start, shift);
4190
4191 metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
4192 }
4193 ASSERT0(range_tree_space(msp->ms_freeing));
4194 ASSERT0(range_tree_space(msp->ms_checkpointing));
4195
4196 defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE];
4197
4198 uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) -
4199 metaslab_class_get_alloc(spa_normal_class(spa));
4200 if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) {
4201 defer_allowed = B_FALSE;
4202 }
4203
4204 defer_delta = 0;
4205 alloc_delta = msp->ms_allocated_this_txg -
4206 range_tree_space(msp->ms_freed);
4207
4208 if (defer_allowed) {
4209 defer_delta = range_tree_space(msp->ms_freed) -
4210 range_tree_space(*defer_tree);
4211 } else {
4212 defer_delta -= range_tree_space(*defer_tree);
4213 }
4214 metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta,
4215 defer_delta, 0);
4216
4217 if (spa_syncing_log_sm(spa) == NULL) {
4218 /*
4219 * If there's a metaslab_load() in progress and we don't have
4220 * a log space map, it means that we probably wrote to the
4221 * metaslab's space map. If this is the case, we need to
4222 * make sure that we wait for the load to complete so that we
4223 * have a consistent view of the in-core side of the metaslab.
4224 */
4225 metaslab_load_wait(msp);
4226 } else {
4227 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
4228 }
4229
4230 /*
4231 * When auto-trimming is enabled, free ranges which are added to
4232 * ms_allocatable are also added to ms_trim. The ms_trim tree is
4233 * periodically consumed by the vdev_autotrim_thread() which issues
4234 * trims for all ranges and then vacates the tree. The ms_trim tree
4235 * can be discarded at any time with the sole consequence of recent
4236 * frees not being trimmed.
4237 */
4238 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) {
4239 range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim);
4240 if (!defer_allowed) {
4241 range_tree_walk(msp->ms_freed, range_tree_add,
4242 msp->ms_trim);
4243 }
4244 } else {
4245 range_tree_vacate(msp->ms_trim, NULL, NULL);
4246 }
4247
4248 /*
4249 * Move the frees from the defer_tree back to the free
4250 * range tree (if it's loaded). Swap the freed_tree and
4251 * the defer_tree -- this is safe to do because we've
4252 * just emptied out the defer_tree.
4253 */
4254 range_tree_vacate(*defer_tree,
4255 msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable);
4256 if (defer_allowed) {
4257 range_tree_swap(&msp->ms_freed, defer_tree);
4258 } else {
4259 range_tree_vacate(msp->ms_freed,
4260 msp->ms_loaded ? range_tree_add : NULL,
4261 msp->ms_allocatable);
4262 }
4263
4264 msp->ms_synced_length = space_map_length(msp->ms_sm);
4265
4266 msp->ms_deferspace += defer_delta;
4267 ASSERT3S(msp->ms_deferspace, >=, 0);
4268 ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
4269 if (msp->ms_deferspace != 0) {
4270 /*
4271 * Keep syncing this metaslab until all deferred frees
4272 * are back in circulation.
4273 */
4274 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
4275 }
4276 metaslab_aux_histograms_update_done(msp, defer_allowed);
4277
4278 if (msp->ms_new) {
4279 msp->ms_new = B_FALSE;
4280 mutex_enter(&mg->mg_lock);
4281 mg->mg_ms_ready++;
4282 mutex_exit(&mg->mg_lock);
4283 }
4284
4285 /*
4286 * Re-sort metaslab within its group now that we've adjusted
4287 * its allocatable space.
4288 */
4289 metaslab_recalculate_weight_and_sort(msp);
4290
4291 /*
4292 * If the metaslab is loaded and we've not tried to load or allocate
4293 * from it in 'metaslab_unload_delay' txgs, then unload it.
4294 */
4295 if (msp->ms_loaded &&
4296 msp->ms_disabled == 0 &&
4297 msp->ms_selected_txg + metaslab_unload_delay < txg) {
4298
4299 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
4300 VERIFY0(range_tree_space(
4301 msp->ms_allocating[(txg + t) & TXG_MASK]));
4302 }
4303 if (msp->ms_allocator != -1) {
4304 metaslab_passivate(msp, msp->ms_weight &
4305 ~METASLAB_ACTIVE_MASK);
4306 }
4307
4308 if (!metaslab_debug_unload)
4309 metaslab_unload(msp);
4310 }
4311
4312 ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK]));
4313 ASSERT0(range_tree_space(msp->ms_freeing));
4314 ASSERT0(range_tree_space(msp->ms_freed));
4315 ASSERT0(range_tree_space(msp->ms_checkpointing));
4316 msp->ms_allocating_total -= msp->ms_allocated_this_txg;
4317 msp->ms_allocated_this_txg = 0;
4318 mutex_exit(&msp->ms_lock);
4319 }
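/*
 * Editorial sketch of the defer rotation above, with TXG_DEFER_SIZE
 * of 2: frees from txg T move from ms_freed into ms_defer[T % 2] here,
 * and only re-enter ms_allocatable (and ms_trim) when txg T + 2 syncs:
 *
 *	txg T:     ms_freed         -> ms_defer[T % 2]
 *	txg T + 2: ms_defer[T % 2]  -> ms_allocatable
 *
 * so space freed in txg T cannot be reallocated before txg T + 2,
 * which keeps recovery imports of recent txgs safe.
 */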
4320
4321 void
4322 metaslab_sync_reassess(metaslab_group_t *mg)
4323 {
4324 spa_t *spa = mg->mg_class->mc_spa;
4325
4326 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
4327 metaslab_group_alloc_update(mg);
4328 mg->mg_fragmentation = metaslab_group_fragmentation(mg);
4329
4330 /*
4331 * Preload the next potential metaslabs but only on active
4332 * metaslab groups. We can get into a state where the metaslab
4333 * is no longer active since we dirty metaslabs as we remove a
4334 * device, thus potentially making the metaslab group eligible
4335 * for preloading.
4336 */
4337 if (mg->mg_activation_count > 0) {
4338 metaslab_group_preload(mg);
4339 }
4340 spa_config_exit(spa, SCL_ALLOC, FTAG);
4341 }
4342
4343 /*
4344 * When writing a ditto block (i.e. more than one DVA for a given BP) on
4345 * the same vdev as an existing DVA of this BP, then try to allocate it
4346 * on a different metaslab than existing DVAs (i.e. a unique metaslab).
4347 */
4348 static boolean_t
4349 metaslab_is_unique(metaslab_t *msp, dva_t *dva)
4350 {
4351 uint64_t dva_ms_id;
4352
4353 if (DVA_GET_ASIZE(dva) == 0)
4354 return (B_TRUE);
4355
4356 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
4357 return (B_TRUE);
4358
4359 dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift;
4360
4361 return (msp->ms_id != dva_ms_id);
4362 }
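/*
 * Editorial example: on a vdev whose vdev_ms_shift is 29 (512M
 * metaslabs), a DVA at offset 0x60000000 (1.5G) maps to metaslab 3:
 *
 *	dva_ms_id = 0x60000000 >> 29;	// == 3
 *
 * so two DVAs are "unique" whenever they differ in vdev id or in the
 * offset bits above vdev_ms_shift.
 */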
4363
4364 /*
4365 * ==========================================================================
4366 * Metaslab allocation tracing facility
4367 * ==========================================================================
4368 */
4369
4370 /*
4371 * Add an allocation trace element to the allocation tracing list.
4372 */
4373 static void
4374 metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg,
4375 metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset,
4376 int allocator)
4377 {
4378 if (!metaslab_trace_enabled)
4379 return;
4380
4381 /*
4382 * When the tracing list reaches its maximum we remove
4383 * the second element in the list before adding a new one.
4384 * By removing the second element we preserve the original
4385 * entry as a clue to what allocation steps have already been
4386 * performed.
4387 */
4388 if (zal->zal_size == metaslab_trace_max_entries) {
4389 metaslab_alloc_trace_t *mat_next;
4390 #ifdef DEBUG
4391 panic("too many entries in allocation list");
4392 #endif
4393 METASLABSTAT_BUMP(metaslabstat_trace_over_limit);
4394 zal->zal_size--;
4395 mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list));
4396 list_remove(&zal->zal_list, mat_next);
4397 kmem_cache_free(metaslab_alloc_trace_cache, mat_next);
4398 }
4399
4400 metaslab_alloc_trace_t *mat =
4401 kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP);
4402 list_link_init(&mat->mat_list_node);
4403 mat->mat_mg = mg;
4404 mat->mat_msp = msp;
4405 mat->mat_size = psize;
4406 mat->mat_dva_id = dva_id;
4407 mat->mat_offset = offset;
4408 mat->mat_weight = 0;
4409 mat->mat_allocator = allocator;
4410
4411 if (msp != NULL)
4412 mat->mat_weight = msp->ms_weight;
4413
4414 /*
4415 * The list is part of the zio so locking is not required. Only
4416 * a single thread will perform allocations for a given zio.
4417 */
4418 list_insert_tail(&zal->zal_list, mat);
4419 zal->zal_size++;
4420
4421 ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries);
4422 }
4423
4424 void
4425 metaslab_trace_init(zio_alloc_list_t *zal)
4426 {
4427 list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t),
4428 offsetof(metaslab_alloc_trace_t, mat_list_node));
4429 zal->zal_size = 0;
4430 }
4431
4432 void
4433 metaslab_trace_fini(zio_alloc_list_t *zal)
4434 {
4435 metaslab_alloc_trace_t *mat;
4436
4437 while ((mat = list_remove_head(&zal->zal_list)) != NULL)
4438 kmem_cache_free(metaslab_alloc_trace_cache, mat);
4439 list_destroy(&zal->zal_list);
4440 zal->zal_size = 0;
4441 }
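/*
 * Editorial sketch of the tracing lifecycle; the exact metaslab_alloc()
 * signature is elided since it varies, but the shape a DVA-allocating
 * caller follows is:
 *
 *	zio_alloc_list_t zal;
 *	metaslab_trace_init(&zal);
 *	error = metaslab_alloc(..., &zal, allocator);
 *	...	// on failure, zal lists every attempted allocation step
 *	metaslab_trace_fini(&zal);
 */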
4442
4443 /*
4444 * ==========================================================================
4445 * Metaslab block operations
4446 * ==========================================================================
4447 */
4448
4449 static void
4450 metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
4451 int allocator)
4452 {
4453 if (!(flags & METASLAB_ASYNC_ALLOC) ||
4454 (flags & METASLAB_DONT_THROTTLE))
4455 return;
4456
4457 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4458 if (!mg->mg_class->mc_alloc_throttle_enabled)
4459 return;
4460
4461 (void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
4462 }
4463
4464 static void
4465 metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
4466 {
4467 uint64_t max = mg->mg_max_alloc_queue_depth;
4468 uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4469 while (cur < max) {
4470 if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
4471 cur, cur + 1) == cur) {
4472 atomic_inc_64(
4473 &mg->mg_class->mc_alloc_max_slots[allocator]);
4474 return;
4475 }
4476 cur = mg->mg_cur_max_alloc_queue_depth[allocator];
4477 }
4478 }
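/*
 * The loop above is a standard lock-free bounded increment: bump the
 * counter only if nobody raced us, and stop once the cap is reached.
 * The same pattern in isolation (editorial sketch):
 *
 *	uint64_t cur = *counter;
 *	while (cur < max) {
 *		if (atomic_cas_64(counter, cur, cur + 1) == cur)
 *			break;			// we won the race
 *		cur = *counter;			// lost; reread and retry
 *	}
 */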
4479
4480 void
4481 metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
4482 int allocator, boolean_t io_complete)
4483 {
4484 if (!(flags & METASLAB_ASYNC_ALLOC) ||
4485 (flags & METASLAB_DONT_THROTTLE))
4486 return;
4487
4488 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4489 if (!mg->mg_class->mc_alloc_throttle_enabled)
4490 return;
4491
4492 (void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
4493 if (io_complete)
4494 metaslab_group_increment_qdepth(mg, allocator);
4495 }
4496
4497 void
4498 metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
4499 int allocator)
4500 {
4501 #ifdef ZFS_DEBUG
4502 const dva_t *dva = bp->blk_dva;
4503 int ndvas = BP_GET_NDVAS(bp);
4504
4505 for (int d = 0; d < ndvas; d++) {
4506 uint64_t vdev = DVA_GET_VDEV(&dva[d]);
4507 metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
4508 VERIFY(zfs_refcount_not_held(
4509 &mg->mg_alloc_queue_depth[allocator], tag));
4510 }
4511 #endif
4512 }
4513
4514 static uint64_t
4515 metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
4516 {
4517 uint64_t start;
4518 range_tree_t *rt = msp->ms_allocatable;
4519 metaslab_class_t *mc = msp->ms_group->mg_class;
4520
4521 ASSERT(MUTEX_HELD(&msp->ms_lock));
4522 VERIFY(!msp->ms_condensing);
4523 VERIFY0(msp->ms_disabled);
4524
4525 start = mc->mc_ops->msop_alloc(msp, size);
4526 if (start != -1ULL) {
4527 metaslab_group_t *mg = msp->ms_group;
4528 vdev_t *vd = mg->mg_vd;
4529
4530 VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
4531 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
4532 VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
4533 range_tree_remove(rt, start, size);
4534 range_tree_clear(msp->ms_trim, start, size);
4535
4536 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
4537 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
4538
4539 range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
4540 msp->ms_allocating_total += size;
4541
4542 /* Track the last successful allocation */
4543 msp->ms_alloc_txg = txg;
4544 metaslab_verify_space(msp, txg);
4545 }
4546
4547 /*
4548 * Now that we've attempted the allocation we need to update the
4549 * metaslab's maximum block size since it may have changed.
4550 */
4551 msp->ms_max_size = metaslab_largest_allocatable(msp);
4552 return (start);
4553 }
4554
4555 /*
4556 * Find the metaslab with the highest weight that is less than what we've
4557 * already tried. In the common case, this means that we will examine each
4558 * metaslab at most once. Note that concurrent callers could reorder metaslabs
4559 * by activation/passivation once we have dropped the mg_lock. If a metaslab is
4560 * activated by another thread, and we fail to allocate from the metaslab we
4561 * have selected, we may not try the newly-activated metaslab, and instead
4562 * activate another metaslab. This is not optimal, but generally does not cause
4563 * any problems (a possible exception being if every metaslab is completely full
4564 * except for the newly-activated metaslab which we fail to examine).
4565 */
4566 static metaslab_t *
4567 find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight,
4568 dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator,
4569 boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search,
4570 boolean_t *was_active)
4571 {
4572 avl_index_t idx;
4573 avl_tree_t *t = &mg->mg_metaslab_tree;
4574 metaslab_t *msp = avl_find(t, search, &idx);
4575 if (msp == NULL)
4576 msp = avl_nearest(t, idx, AVL_AFTER);
4577
4578 for (; msp != NULL; msp = AVL_NEXT(t, msp)) {
4579 int i;
4580 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4581 metaslab_trace_add(zal, mg, msp, asize, d,
4582 TRACE_TOO_SMALL, allocator);
4583 continue;
4584 }
4585
4586 /*
4587 * If the selected metaslab is condensing or disabled,
4588 * skip it.
4589 */
4590 if (msp->ms_condensing || msp->ms_disabled > 0)
4591 continue;
4592
4593 *was_active = msp->ms_allocator != -1;
4594 /*
4595 * If we're activating as primary, this is our first allocation
4596 * from this disk, so we don't need to check how close we are.
4597 * If the metaslab under consideration was already active,
4598 * we're getting desperate enough to steal another allocator's
4599 * metaslab, so we still don't care about distances.
4600 */
4601 if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active)
4602 break;
4603
4604 for (i = 0; i < d; i++) {
4605 if (want_unique &&
4606 !metaslab_is_unique(msp, &dva[i]))
4607 break; /* try another metaslab */
4608 }
4609 if (i == d)
4610 break;
4611 }
4612
4613 if (msp != NULL) {
4614 search->ms_weight = msp->ms_weight;
4615 search->ms_start = msp->ms_start + 1;
4616 search->ms_allocator = msp->ms_allocator;
4617 search->ms_primary = msp->ms_primary;
4618 }
4619 return (msp);
4620 }
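/*
 * Editorial note on the search key updated above: the metaslab tree is
 * sorted by weight (and then by allocator/start), so copying the
 * winner's sort fields and advancing ms_start by one makes the next
 * avl_find()/avl_nearest() resume strictly after this metaslab:
 *
 *	search->ms_start = msp->ms_start + 1;	// strictly after msp
 *
 * letting metaslab_group_alloc_normal() loop without revisiting
 * metaslabs it already rejected.
 */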
4621
4622 void
4623 metaslab_active_mask_verify(metaslab_t *msp)
4624 {
4625 ASSERT(MUTEX_HELD(&msp->ms_lock));
4626
4627 if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0)
4628 return;
4629
4630 if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0)
4631 return;
4632
4633 if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) {
4634 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4635 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4636 VERIFY3S(msp->ms_allocator, !=, -1);
4637 VERIFY(msp->ms_primary);
4638 return;
4639 }
4640
4641 if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) {
4642 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4643 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM);
4644 VERIFY3S(msp->ms_allocator, !=, -1);
4645 VERIFY(!msp->ms_primary);
4646 return;
4647 }
4648
4649 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
4650 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
4651 VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
4652 VERIFY3S(msp->ms_allocator, ==, -1);
4653 return;
4654 }
4655 }
4656
4657 /* ARGSUSED */
4658 static uint64_t
4659 metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
4660 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4661 int allocator, boolean_t try_hard)
4662 {
4663 metaslab_t *msp = NULL;
4664 uint64_t offset = -1ULL;
4665
4666 uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY;
4667 for (int i = 0; i < d; i++) {
4668 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4669 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4670 activation_weight = METASLAB_WEIGHT_SECONDARY;
4671 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4672 DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
4673 activation_weight = METASLAB_WEIGHT_CLAIM;
4674 break;
4675 }
4676 }
4677
4678 /*
4679 * If we don't have enough metaslabs active to fill the entire array, we
4680 * just use the 0th slot.
4681 */
4682 if (mg->mg_ms_ready < mg->mg_allocators * 3)
4683 allocator = 0;
4684
4685 ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
4686
4687 metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP);
4688 search->ms_weight = UINT64_MAX;
4689 search->ms_start = 0;
4690 /*
4691 * At the end of the metaslab tree are the already-active metaslabs,
4692 * first the primaries, then the secondaries. When we resume searching
4693 * through the tree, we need to consider ms_allocator and ms_primary so
4694 * we start in the location right after where we left off, and don't
4695 * accidentally loop forever considering the same metaslabs.
4696 */
4697 search->ms_allocator = -1;
4698 search->ms_primary = B_TRUE;
4699 for (;;) {
4700 boolean_t was_active = B_FALSE;
4701
4702 mutex_enter(&mg->mg_lock);
4703
4704 if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
4705 mg->mg_primaries[allocator] != NULL) {
4706 msp = mg->mg_primaries[allocator];
4707
4708 /*
4709 * Even though we don't hold the ms_lock for the
4710 * primary metaslab, those fields should not
4711 * change while we hold the mg_lock. Thus it is
4712 * safe to make assertions on them.
4713 */
4714 ASSERT(msp->ms_primary);
4715 ASSERT3S(msp->ms_allocator, ==, allocator);
4716 ASSERT(msp->ms_loaded);
4717
4718 was_active = B_TRUE;
4719 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4720 } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
4721 mg->mg_secondaries[allocator] != NULL) {
4722 msp = mg->mg_secondaries[allocator];
4723
4724 /*
4725 * See comment above about the similar assertions
4726 * for the primary metaslab.
4727 */
4728 ASSERT(!msp->ms_primary);
4729 ASSERT3S(msp->ms_allocator, ==, allocator);
4730 ASSERT(msp->ms_loaded);
4731
4732 was_active = B_TRUE;
4733 ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
4734 } else {
4735 msp = find_valid_metaslab(mg, activation_weight, dva, d,
4736 want_unique, asize, allocator, try_hard, zal,
4737 search, &was_active);
4738 }
4739
4740 mutex_exit(&mg->mg_lock);
4741 if (msp == NULL) {
4742 kmem_free(search, sizeof (*search));
4743 return (-1ULL);
4744 }
4745 mutex_enter(&msp->ms_lock);
4746
4747 metaslab_active_mask_verify(msp);
4748
4749 /*
4750 * This code is disabled because of issues with
4751 * tracepoints in non-gpl kernel modules.
4752 */
4753 #if 0
4754 DTRACE_PROBE3(ms__activation__attempt,
4755 metaslab_t *, msp, uint64_t, activation_weight,
4756 boolean_t, was_active);
4757 #endif
4758
4759 /*
4760 * Ensure that the metaslab we have selected is still
4761 * capable of handling our request. It's possible that
4762 * another thread may have changed the weight while we
4763 * were blocked on the metaslab lock. We check the
4764 * active status first to see if we need to reselect
4765 * a new metaslab.
4766 */
4767 if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
4768 ASSERT3S(msp->ms_allocator, ==, -1);
4769 mutex_exit(&msp->ms_lock);
4770 continue;
4771 }
4772
4773 /*
4774 * If the metaslab was activated for another allocator
4775 * while we were waiting in the ms_lock above, or it's
4776 * a primary and we're seeking a secondary (or vice versa),
4777 * we go back and select a new metaslab.
4778 */
4779 if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) &&
4780 (msp->ms_allocator != -1) &&
4781 (msp->ms_allocator != allocator || ((activation_weight ==
4782 METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) {
4783 ASSERT(msp->ms_loaded);
4784 ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) ||
4785 msp->ms_allocator != -1);
4786 mutex_exit(&msp->ms_lock);
4787 continue;
4788 }
4789
4790 /*
4791 * This metaslab was used for claiming regions allocated
4792 * by the ZIL during pool import. Once these regions are
4793 * claimed we don't need to keep the CLAIM bit set
4794 * anymore. Passivate this metaslab to zero its activation
4795 * mask.
4796 */
4797 if (msp->ms_weight & METASLAB_WEIGHT_CLAIM &&
4798 activation_weight != METASLAB_WEIGHT_CLAIM) {
4799 ASSERT(msp->ms_loaded);
4800 ASSERT3S(msp->ms_allocator, ==, -1);
4801 metaslab_passivate(msp, msp->ms_weight &
4802 ~METASLAB_WEIGHT_CLAIM);
4803 mutex_exit(&msp->ms_lock);
4804 continue;
4805 }
4806
4807 metaslab_set_selected_txg(msp, txg);
4808
4809 int activation_error =
4810 metaslab_activate(msp, allocator, activation_weight);
4811 metaslab_active_mask_verify(msp);
4812
4813 /*
4814 * If the metaslab was activated by another thread for
4815 * another allocator or activation_weight (EBUSY), or it
4816 * failed because another metaslab was assigned as primary
4817 * for this allocator (EEXIST) we continue using this
4818 * metaslab for our allocation, rather than going on to a
4819 * worse metaslab (we waited for that metaslab to be loaded
4820 * after all).
4821 *
4822 * If the activation failed due to an I/O error or ENOSPC we
4823 * skip to the next metaslab.
4824 */
4825 boolean_t activated;
4826 if (activation_error == 0) {
4827 activated = B_TRUE;
4828 } else if (activation_error == EBUSY ||
4829 activation_error == EEXIST) {
4830 activated = B_FALSE;
4831 } else {
4832 mutex_exit(&msp->ms_lock);
4833 continue;
4834 }
4835 ASSERT(msp->ms_loaded);
4836
4837 /*
4838 * Now that we have the lock, recheck to see if we should
4839 * continue to use this metaslab for this allocation. The
4840 * metaslab is now loaded so metaslab_should_allocate()
4841 * can accurately determine if the allocation attempt should
4842 * proceed.
4843 */
4844 if (!metaslab_should_allocate(msp, asize, try_hard)) {
4845 /* Passivate this metaslab and select a new one. */
4846 metaslab_trace_add(zal, mg, msp, asize, d,
4847 TRACE_TOO_SMALL, allocator);
4848 goto next;
4849 }
4850
4851 /*
4852 * If this metaslab is currently condensing then pick again
4853 * as we can't manipulate this metaslab until it's committed
4854 * to disk. If this metaslab is being initialized, we shouldn't
4855 * allocate from it since the allocated region might be
4856 * overwritten after allocation.
4857 */
4858 if (msp->ms_condensing) {
4859 metaslab_trace_add(zal, mg, msp, asize, d,
4860 TRACE_CONDENSING, allocator);
4861 if (activated) {
4862 metaslab_passivate(msp, msp->ms_weight &
4863 ~METASLAB_ACTIVE_MASK);
4864 }
4865 mutex_exit(&msp->ms_lock);
4866 continue;
4867 } else if (msp->ms_disabled > 0) {
4868 metaslab_trace_add(zal, mg, msp, asize, d,
4869 TRACE_DISABLED, allocator);
4870 if (activated) {
4871 metaslab_passivate(msp, msp->ms_weight &
4872 ~METASLAB_ACTIVE_MASK);
4873 }
4874 mutex_exit(&msp->ms_lock);
4875 continue;
4876 }
4877
4878 offset = metaslab_block_alloc(msp, asize, txg);
4879 metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator);
4880
4881 if (offset != -1ULL) {
4882 /* Proactively passivate the metaslab, if needed */
4883 if (activated)
4884 metaslab_segment_may_passivate(msp);
4885 break;
4886 }
4887 next:
4888 ASSERT(msp->ms_loaded);
4889
4890 /*
4891 * This code is disabled because of issues with
4892 * tracepoints in non-GPL kernel modules.
4893 */
4894 #if 0
4895 DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp,
4896 uint64_t, asize);
4897 #endif
4898
4899 /*
4900 * We were unable to allocate from this metaslab so determine
4901 * a new weight for this metaslab. Now that we have loaded
4902 * the metaslab we can provide a better hint to the metaslab
4903 * selector.
4904 *
4905 * For space-based metaslabs, we use the maximum block size.
4906 * This information is only available when the metaslab
4907 * is loaded and is more accurate than the generic free
4908 * space weight that was calculated by metaslab_weight().
4909 * This information allows us to quickly compare the maximum
4910 * available allocation in the metaslab to the allocation
4911 * size being requested.
4912 *
4913 * For segment-based metaslabs, determine the new weight
4914 * based on the highest bucket in the range tree. We
4915 * explicitly use the loaded segment weight (i.e. the range
4916 * tree histogram) since it contains the space that is
4917 * currently available for allocation and is accurate
4918 * even within a sync pass.
4919 */
4920 uint64_t weight;
4921 if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) {
4922 weight = metaslab_largest_allocatable(msp);
4923 WEIGHT_SET_SPACEBASED(weight);
4924 } else {
4925 weight = metaslab_weight_from_range_tree(msp);
4926 }
4927
4928 if (activated) {
4929 metaslab_passivate(msp, weight);
4930 } else {
4931 /*
4932 * For the case where we use the metaslab that is
4933 * active for another allocator we want to make
4934 * sure that we retain the activation mask.
4935 *
4936 * Note that we could attempt to use something like
4937 * metaslab_recalculate_weight_and_sort() that
4938 * retains the activation mask here. That function
4939 * uses metaslab_weight() to set the weight though
4940 * which is not as accurate as the calculations
4941 * above.
4942 */
4943 weight |= msp->ms_weight & METASLAB_ACTIVE_MASK;
4944 metaslab_group_sort(mg, msp, weight);
4945 }
4946 metaslab_active_mask_verify(msp);
4947
4948 /*
4949 * We have just failed an allocation attempt, check
4950 * that metaslab_should_allocate() agrees. Otherwise,
4951 * we may end up in an infinite loop retrying the same
4952 * metaslab.
4953 */
4954 ASSERT(!metaslab_should_allocate(msp, asize, try_hard));
4955
4956 mutex_exit(&msp->ms_lock);
4957 }
4958 mutex_exit(&msp->ms_lock);
4959 kmem_free(search, sizeof (*search));
4960 return (offset);
4961 }
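
/*
 * A compiled-out sketch (like the tracepoints above) of the
 * activation-error triage performed in metaslab_group_alloc_normal().
 * metaslab_activation_usable() is a hypothetical helper, not part of
 * this file; it assumes only the error codes documented above.
 */
#if 0
static boolean_t
metaslab_activation_usable(int activation_error, boolean_t *activated)
{
	if (activation_error == 0) {
		/* We performed the activation, so we must passivate. */
		*activated = B_TRUE;
		return (B_TRUE);
	}
	if (activation_error == EBUSY || activation_error == EEXIST) {
		/*
		 * Another thread activated this loaded metaslab;
		 * keep using it, but leave passivation to its owner.
		 */
		*activated = B_FALSE;
		return (B_TRUE);
	}
	/* I/O error or ENOSPC: skip to the next metaslab. */
	return (B_FALSE);
}
#endif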
4962
4963 static uint64_t
4964 metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal,
4965 uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d,
4966 int allocator, boolean_t try_hard)
4967 {
4968 uint64_t offset;
4969 ASSERT(mg->mg_initialized);
4970
4971 offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique,
4972 dva, d, allocator, try_hard);
4973
4974 mutex_enter(&mg->mg_lock);
4975 if (offset == -1ULL) {
4976 mg->mg_failed_allocations++;
4977 metaslab_trace_add(zal, mg, NULL, asize, d,
4978 TRACE_GROUP_FAILURE, allocator);
4979 if (asize == SPA_GANGBLOCKSIZE) {
4980 /*
4981 * This metaslab group was unable to allocate
4982 * the minimum gang block size so it must be out of
4983 * space. We must notify the allocation throttle
4984 * to start skipping allocation attempts to this
4985 * metaslab group until more space becomes available.
4986 * Note: this failure cannot be caused by the
4987 * allocation throttle since the allocation throttle
4988 * is only responsible for skipping devices and
4989 * not failing block allocations.
4990 */
4991 mg->mg_no_free_space = B_TRUE;
4992 }
4993 }
4994 mg->mg_allocations++;
4995 mutex_exit(&mg->mg_lock);
4996 return (offset);
4997 }
4998
4999 /*
5000 * Allocate a block for the specified i/o.
5001 */
5002 int
5003 metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
5004 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags,
5005 zio_alloc_list_t *zal, int allocator)
5006 {
5007 metaslab_group_t *mg, *rotor;
5008 vdev_t *vd;
5009 boolean_t try_hard = B_FALSE;
5010
5011 ASSERT(!DVA_IS_VALID(&dva[d]));
5012
5013 /*
5014 * For testing, make some blocks above a certain size be gang blocks.
5015 * This will also test spilling from special to normal.
5016 */
5017 if (psize >= metaslab_force_ganging && (ddi_get_lbolt() & 3) == 0) {
5018 metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG,
5019 allocator);
5020 return (SET_ERROR(ENOSPC));
5021 }
5022
5023 /*
5024 * Start at the rotor and loop through all mgs until we find something.
5025 * Note that there's no locking on mc_rotor or mc_aliquot because
5026 * nothing actually breaks if we miss a few updates -- we just won't
5027 * allocate quite as evenly. It all balances out over time.
5028 *
5029 * If we are doing ditto or log blocks, try to spread them across
5030 * consecutive vdevs. If we're forced to reuse a vdev before we've
5031 * allocated all of our ditto blocks, then try and spread them out on
5032 * that vdev as much as possible. If it turns out to not be possible,
5033 * gradually lower our standards until anything becomes acceptable.
5034 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
5035 * gives us hope of containing our fault domains to something we're
5036 * able to reason about. Otherwise, any two top-level vdev failures
5037 * will guarantee the loss of data. With consecutive allocation,
5038 * only two adjacent top-level vdev failures will result in data loss.
5039 *
5040 * If we are doing gang blocks (hintdva is non-NULL), try to keep
5041 * ourselves on the same vdev as our gang block header. That
5042 * way, we can hope for locality in vdev_cache, plus it makes our
5043 * fault domains something tractable.
5044 */
5045 if (hintdva) {
5046 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));
5047
5048 /*
5049 * It's possible the vdev we're using as the hint no
5050 * longer exists or its mg has been closed (e.g. by
5051 * device removal). Consult the rotor when
5052 * all else fails.
5053 */
5054 if (vd != NULL && vd->vdev_mg != NULL) {
5055 mg = vd->vdev_mg;
5056
5057 if (flags & METASLAB_HINTBP_AVOID &&
5058 mg->mg_next != NULL)
5059 mg = mg->mg_next;
5060 } else {
5061 mg = mc->mc_rotor;
5062 }
5063 } else if (d != 0) {
5064 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
5065 mg = vd->vdev_mg->mg_next;
5066 } else {
5067 ASSERT(mc->mc_rotor != NULL);
5068 mg = mc->mc_rotor;
5069 }
5070
5071 /*
5072 * If the hint put us into the wrong metaslab class, or into a
5073 * metaslab group that has been passivated, just follow the rotor.
5074 */
5075 if (mg->mg_class != mc || mg->mg_activation_count <= 0)
5076 mg = mc->mc_rotor;
5077
5078 rotor = mg;
5079 top:
5080 do {
5081 boolean_t allocatable;
5082
5083 ASSERT(mg->mg_activation_count == 1);
5084 vd = mg->mg_vd;
5085
5086 /*
5087 * Don't allocate from faulted devices.
5088 */
5089 if (try_hard) {
5090 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
5091 allocatable = vdev_allocatable(vd);
5092 spa_config_exit(spa, SCL_ZIO, FTAG);
5093 } else {
5094 allocatable = vdev_allocatable(vd);
5095 }
5096
5097 /*
5098 * Determine if the selected metaslab group is eligible
5099 * for allocations. If we're ganging then don't allow
5100 * this metaslab group to skip allocations since that would
5101 * inadvertently return ENOSPC and suspend the pool
5102 * even though space is still available.
5103 */
5104 if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) {
5105 allocatable = metaslab_group_allocatable(mg, rotor,
5106 psize, allocator, d);
5107 }
5108
5109 if (!allocatable) {
5110 metaslab_trace_add(zal, mg, NULL, psize, d,
5111 TRACE_NOT_ALLOCATABLE, allocator);
5112 goto next;
5113 }
5114
5115 ASSERT(mg->mg_initialized);
5116
5117 /*
5118 * Avoid writing single-copy data to a failing,
5119 * non-redundant vdev, unless we've already tried all
5120 * other vdevs.
5121 */
5122 if ((vd->vdev_stat.vs_write_errors > 0 ||
5123 vd->vdev_state < VDEV_STATE_HEALTHY) &&
5124 d == 0 && !try_hard && vd->vdev_children == 0) {
5125 metaslab_trace_add(zal, mg, NULL, psize, d,
5126 TRACE_VDEV_ERROR, allocator);
5127 goto next;
5128 }
5129
5130 ASSERT(mg->mg_class == mc);
5131
5132 uint64_t asize = vdev_psize_to_asize(vd, psize);
5133 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
5134
5135 /*
5136 * If we don't need to try hard, then require that the
5137 * block be on a different metaslab from any other DVAs
5138 * in this BP (unique=true). If we are trying hard, then
5139 * allow any metaslab to be used (unique=false).
5140 */
5141 uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg,
5142 !try_hard, dva, d, allocator, try_hard);
5143
5144 if (offset != -1ULL) {
5145 /*
5146 * If we've just selected this metaslab group,
5147 * figure out whether the corresponding vdev is
5148 * over- or under-used relative to the pool,
5149 * and set an allocation bias to even it out.
5150 */
5151 if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
5152 vdev_stat_t *vs = &vd->vdev_stat;
5153 int64_t vu, cu;
5154
5155 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
5156 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
5157
5158 /*
5159 * Calculate how much more or less we should
5160 * try to allocate from this device during
5161 * this iteration around the rotor.
5162 * For example, if a device is 80% full
5163 * and the pool is 20% full then we should
5164 * reduce allocations by 60% on this device.
5165 *
5166 * mg_bias = (20 - 80) * 512K / 100 = -307K
5167 *
5168 * This reduces allocations by 307K for this
5169 * iteration.
5170 */
5171 mg->mg_bias = ((cu - vu) *
5172 (int64_t)mg->mg_aliquot) / 100;
5173 } else if (!metaslab_bias_enabled) {
5174 mg->mg_bias = 0;
5175 }
5176
5177 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
5178 mg->mg_aliquot + mg->mg_bias) {
5179 mc->mc_rotor = mg->mg_next;
5180 mc->mc_aliquot = 0;
5181 }
5182
5183 DVA_SET_VDEV(&dva[d], vd->vdev_id);
5184 DVA_SET_OFFSET(&dva[d], offset);
5185 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER));
5186 DVA_SET_ASIZE(&dva[d], asize);
5187
5188 return (0);
5189 }
5190 next:
5191 mc->mc_rotor = mg->mg_next;
5192 mc->mc_aliquot = 0;
5193 } while ((mg = mg->mg_next) != rotor);
5194
5195 /*
5196 * If we haven't tried hard, do so now.
5197 */
5198 if (!try_hard) {
5199 try_hard = B_TRUE;
5200 goto top;
5201 }
5202
5203 bzero(&dva[d], sizeof (dva_t));
5204
5205 metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator);
5206 return (SET_ERROR(ENOSPC));
5207 }
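
/*
 * A compiled-out sketch of the rotor bias arithmetic used in
 * metaslab_alloc_dva(). example_mg_bias() is hypothetical; with
 * vu = 80, cu = 20, and a 512K aliquot it reproduces the -307K
 * figure from the comment above (the result is in bytes).
 */
#if 0
static int64_t
example_mg_bias(int64_t vu, int64_t cu, int64_t aliquot)
{
	/* (20 - 80) * 512K / 100 = -314572 bytes, i.e. about -307K */
	return ((cu - vu) * aliquot / 100);
}
#endif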
5208
5209 void
5210 metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize,
5211 boolean_t checkpoint)
5212 {
5213 metaslab_t *msp;
5214 spa_t *spa = vd->vdev_spa;
5215
5216 ASSERT(vdev_is_concrete(vd));
5217 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5218 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5219
5220 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5221
5222 VERIFY(!msp->ms_condensing);
5223 VERIFY3U(offset, >=, msp->ms_start);
5224 VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size);
5225 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5226 VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift));
5227
5228 metaslab_check_free_impl(vd, offset, asize);
5229
5230 mutex_enter(&msp->ms_lock);
5231 if (range_tree_is_empty(msp->ms_freeing) &&
5232 range_tree_is_empty(msp->ms_checkpointing)) {
5233 vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa));
5234 }
5235
5236 if (checkpoint) {
5237 ASSERT(spa_has_checkpoint(spa));
5238 range_tree_add(msp->ms_checkpointing, offset, asize);
5239 } else {
5240 range_tree_add(msp->ms_freeing, offset, asize);
5241 }
5242 mutex_exit(&msp->ms_lock);
5243 }
5244
5245 /* ARGSUSED */
5246 void
5247 metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5248 uint64_t size, void *arg)
5249 {
5250 boolean_t *checkpoint = arg;
5251
5252 ASSERT3P(checkpoint, !=, NULL);
5253
5254 if (vd->vdev_ops->vdev_op_remap != NULL)
5255 vdev_indirect_mark_obsolete(vd, offset, size);
5256 else
5257 metaslab_free_impl(vd, offset, size, *checkpoint);
5258 }
5259
5260 static void
5261 metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size,
5262 boolean_t checkpoint)
5263 {
5264 spa_t *spa = vd->vdev_spa;
5265
5266 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5267
5268 if (spa_syncing_txg(spa) > spa_freeze_txg(spa))
5269 return;
5270
5271 if (spa->spa_vdev_removal != NULL &&
5272 spa->spa_vdev_removal->svr_vdev_id == vd->vdev_id &&
5273 vdev_is_concrete(vd)) {
5274 /*
5275 * Note: we check if the vdev is concrete because when
5276 * we complete the removal, we first change the vdev to be
5277 * an indirect vdev (in open context), and then (in syncing
5278 * context) clear spa_vdev_removal.
5279 */
5280 free_from_removing_vdev(vd, offset, size);
5281 } else if (vd->vdev_ops->vdev_op_remap != NULL) {
5282 vdev_indirect_mark_obsolete(vd, offset, size);
5283 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5284 metaslab_free_impl_cb, &checkpoint);
5285 } else {
5286 metaslab_free_concrete(vd, offset, size, checkpoint);
5287 }
5288 }
5289
5290 typedef struct remap_blkptr_cb_arg {
5291 blkptr_t *rbca_bp;
5292 spa_remap_cb_t rbca_cb;
5293 vdev_t *rbca_remap_vd;
5294 uint64_t rbca_remap_offset;
5295 void *rbca_cb_arg;
5296 } remap_blkptr_cb_arg_t;
5297
5298 void
5299 remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5300 uint64_t size, void *arg)
5301 {
5302 remap_blkptr_cb_arg_t *rbca = arg;
5303 blkptr_t *bp = rbca->rbca_bp;
5304
5305 /* We can not remap split blocks. */
5306 if (size != DVA_GET_ASIZE(&bp->blk_dva[0]))
5307 return;
5308 ASSERT0(inner_offset);
5309
5310 if (rbca->rbca_cb != NULL) {
5311 /*
5312 * At this point we know that we are not handling split
5313 * blocks and we invoke the callback on the previous
5314 * vdev which must be indirect.
5315 */
5316 ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops);
5317
5318 rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id,
5319 rbca->rbca_remap_offset, size, rbca->rbca_cb_arg);
5320
5321 /* set up remap_blkptr_cb_arg for the next call */
5322 rbca->rbca_remap_vd = vd;
5323 rbca->rbca_remap_offset = offset;
5324 }
5325
5326 /*
5327 * The phys birth time is that of dva[0]. This ensures that we know
5328 * when each dva was written, so that resilver can determine which
5329 * blocks need to be scrubbed (i.e. those written during the time
5330 * the vdev was offline). It also ensures that the key used in
5331 * the ARC hash table is unique (i.e. dva[0] + phys_birth). If
5332 * we didn't change the phys_birth, a lookup in the ARC for a
5333 * remapped BP could find the data that was previously stored at
5334 * this vdev + offset.
5335 */
5336 vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa,
5337 DVA_GET_VDEV(&bp->blk_dva[0]));
5338 vdev_indirect_births_t *vib = oldvd->vdev_indirect_births;
5339 bp->blk_phys_birth = vdev_indirect_births_physbirth(vib,
5340 DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0]));
5341
5342 DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
5343 DVA_SET_OFFSET(&bp->blk_dva[0], offset);
5344 }
5345
5346 /*
5347 * If the block pointer contains any indirect DVAs, modify them to refer to
5348 * concrete DVAs. Note that this will sometimes not be possible, leaving
5349 * the indirect DVA in place. This happens if the indirect DVA spans multiple
5350 * segments in the mapping (i.e. it is a "split block").
5351 *
5352 * If the BP was remapped, calls the callback on the original dva (note the
5353 * callback can be called multiple times if the original indirect DVA refers
5354 * to another indirect DVA, etc).
5355 *
5356 * Returns TRUE if the BP was remapped.
5357 */
5358 boolean_t
5359 spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg)
5360 {
5361 remap_blkptr_cb_arg_t rbca;
5362
5363 if (!zfs_remap_blkptr_enable)
5364 return (B_FALSE);
5365
5366 if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS))
5367 return (B_FALSE);
5368
5369 /*
5370 * Dedup BP's can not be remapped, because ddt_phys_select() depends
5371 * on DVA[0] being the same in the BP as in the DDT (dedup table).
5372 */
5373 if (BP_GET_DEDUP(bp))
5374 return (B_FALSE);
5375
5376 /*
5377 * Gang blocks can not be remapped, because
5378 * zio_checksum_gang_verifier() depends on the DVA[0] that's in
5379 * the BP used to read the gang block header (GBH) being the same
5380 * as the DVA[0] that we allocated for the GBH.
5381 */
5382 if (BP_IS_GANG(bp))
5383 return (B_FALSE);
5384
5385 /*
5386 * Embedded BP's have no DVA to remap.
5387 */
5388 if (BP_GET_NDVAS(bp) < 1)
5389 return (B_FALSE);
5390
5391 /*
5392 * Note: we only remap dva[0]. If we remapped other dvas, we
5393 * would no longer know what their phys birth txg is.
5394 */
5395 dva_t *dva = &bp->blk_dva[0];
5396
5397 uint64_t offset = DVA_GET_OFFSET(dva);
5398 uint64_t size = DVA_GET_ASIZE(dva);
5399 vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
5400
5401 if (vd->vdev_ops->vdev_op_remap == NULL)
5402 return (B_FALSE);
5403
5404 rbca.rbca_bp = bp;
5405 rbca.rbca_cb = callback;
5406 rbca.rbca_remap_vd = vd;
5407 rbca.rbca_remap_offset = offset;
5408 rbca.rbca_cb_arg = arg;
5409
5410 /*
5411 * remap_blkptr_cb() will be called in order for each level of
5412 * indirection, until a concrete vdev is reached or a split block is
5413 * encountered. rbca_remap_vd and rbca_remap_offset are updated within
5414 * the callback as we go from one indirect vdev to the next (either
5415 * concrete or indirect again) in that order.
5416 */
5417 vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca);
5418
5419 /* Check if the DVA wasn't remapped because it is a split block */
5420 if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id)
5421 return (B_FALSE);
5422
5423 return (B_TRUE);
5424 }
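
/*
 * A compiled-out usage sketch for spa_remap_blkptr(). Both functions
 * below are hypothetical; the callback signature matches
 * spa_remap_cb_t and is invoked once per level of indirection
 * traversed, as described above.
 */
#if 0
static void
example_record_old_dva(uint64_t vdev_id, uint64_t offset, uint64_t size,
    void *arg)
{
	zfs_dbgmsg("remapped from vdev %llu offset %llu size %llu",
	    (u_longlong_t)vdev_id, (u_longlong_t)offset,
	    (u_longlong_t)size);
}

static void
example_remap(spa_t *spa, blkptr_t *bp)
{
	if (spa_remap_blkptr(spa, bp, example_record_old_dva, NULL)) {
		/* bp->blk_dva[0] now refers to a concrete vdev. */
	}
}
#endif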
5425
5426 /*
5427 * Undo the allocation of a DVA which happened in the given transaction group.
5428 */
5429 void
5430 metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5431 {
5432 metaslab_t *msp;
5433 vdev_t *vd;
5434 uint64_t vdev = DVA_GET_VDEV(dva);
5435 uint64_t offset = DVA_GET_OFFSET(dva);
5436 uint64_t size = DVA_GET_ASIZE(dva);
5437
5438 ASSERT(DVA_IS_VALID(dva));
5439 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5440
5441 if (txg > spa_freeze_txg(spa))
5442 return;
5443
5444 if ((vd = vdev_lookup_top(spa, vdev)) == NULL ||
5445 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) {
5446 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu",
5447 (u_longlong_t)vdev, (u_longlong_t)offset);
5448 ASSERT(0);
5449 return;
5450 }
5451
5452 ASSERT(!vd->vdev_removing);
5453 ASSERT(vdev_is_concrete(vd));
5454 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
5455 ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);
5456
5457 if (DVA_GET_GANG(dva))
5458 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5459
5460 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5461
5462 mutex_enter(&msp->ms_lock);
5463 range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
5464 offset, size);
5465 msp->ms_allocating_total -= size;
5466
5467 VERIFY(!msp->ms_condensing);
5468 VERIFY3U(offset, >=, msp->ms_start);
5469 VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
5470 VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=,
5471 msp->ms_size);
5472 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5473 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5474 range_tree_add(msp->ms_allocatable, offset, size);
5475 mutex_exit(&msp->ms_lock);
5476 }
5477
5478 /*
5479 * Free the block represented by the given DVA.
5480 */
5481 void
5482 metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint)
5483 {
5484 uint64_t vdev = DVA_GET_VDEV(dva);
5485 uint64_t offset = DVA_GET_OFFSET(dva);
5486 uint64_t size = DVA_GET_ASIZE(dva);
5487 vdev_t *vd = vdev_lookup_top(spa, vdev);
5488
5489 ASSERT(DVA_IS_VALID(dva));
5490 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5491
5492 if (DVA_GET_GANG(dva)) {
5493 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5494 }
5495
5496 metaslab_free_impl(vd, offset, size, checkpoint);
5497 }
5498
5499 /*
5500 * Reserve some allocation slots. The reservation system must be called
5501 * before we call into the allocator. If there aren't any available slots
5502 * then the I/O will be throttled until an I/O completes and its slots are
5503 * freed up. The function returns true if it was successful in placing
5504 * the reservation.
5505 */
5506 boolean_t
5507 metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
5508 zio_t *zio, int flags)
5509 {
5510 uint64_t available_slots = 0;
5511 boolean_t slot_reserved = B_FALSE;
5512 uint64_t max = mc->mc_alloc_max_slots[allocator];
5513
5514 ASSERT(mc->mc_alloc_throttle_enabled);
5515 mutex_enter(&mc->mc_lock);
5516
5517 uint64_t reserved_slots =
5518 zfs_refcount_count(&mc->mc_alloc_slots[allocator]);
5519 if (reserved_slots < max)
5520 available_slots = max - reserved_slots;
5521
5522 if (slots <= available_slots || GANG_ALLOCATION(flags) ||
5523 flags & METASLAB_MUST_RESERVE) {
5524 /*
5525 * We reserve the slots individually so that we can unreserve
5526 * them individually when an I/O completes.
5527 */
5528 zfs_refcount_add_few(&mc->mc_alloc_slots[allocator], slots,
5529 zio);
5530 zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
5531 slot_reserved = B_TRUE;
5532 }
5533
5534 mutex_exit(&mc->mc_lock);
5535 return (slot_reserved);
5536 }
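
/*
 * A compiled-out sketch of how a reservation pairs with an
 * unreservation once the I/O completes. example_throttled_alloc() is
 * hypothetical and elides the actual allocation and error handling.
 */
#if 0
static void
example_throttled_alloc(metaslab_class_t *mc, zio_t *zio, int allocator)
{
	int slots = zio->io_prop.zp_copies;

	if (!metaslab_class_throttle_reserve(mc, slots, allocator,
	    zio, 0)) {
		/* No slots available; the I/O must be throttled. */
		return;
	}

	/* ... perform the allocation on behalf of the zio ... */

	metaslab_class_throttle_unreserve(mc, slots, allocator, zio);
}
#endif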
5537
5538 void
5539 metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
5540 int allocator, zio_t *zio)
5541 {
5542 ASSERT(mc->mc_alloc_throttle_enabled);
5543 mutex_enter(&mc->mc_lock);
5544 zfs_refcount_remove_few(&mc->mc_alloc_slots[allocator], slots, zio);
5545 mutex_exit(&mc->mc_lock);
5546 }
5547
5548 static int
5549 metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
5550 uint64_t txg)
5551 {
5552 metaslab_t *msp;
5553 spa_t *spa = vd->vdev_spa;
5554 int error = 0;
5555
5556 if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count)
5557 return (ENXIO);
5558
5559 ASSERT3P(vd->vdev_ms, !=, NULL);
5560 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5561
5562 mutex_enter(&msp->ms_lock);
5563
5564 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
5565 error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM);
5566 /*
5567 * No need to fail in that case; someone else has activated the
5568 * metaslab, but that doesn't preclude us from using it.
5569 */
5570 if (error == EBUSY)
5571 error = 0;
5572
5573 if (error == 0 &&
5574 !range_tree_contains(msp->ms_allocatable, offset, size))
5575 error = SET_ERROR(ENOENT);
5576
5577 if (error || txg == 0) { /* txg == 0 indicates dry run */
5578 mutex_exit(&msp->ms_lock);
5579 return (error);
5580 }
5581
5582 VERIFY(!msp->ms_condensing);
5583 VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
5584 VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
5585 VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=,
5586 msp->ms_size);
5587 range_tree_remove(msp->ms_allocatable, offset, size);
5588 range_tree_clear(msp->ms_trim, offset, size);
5589
5590 if (spa_writeable(spa)) { /* don't dirty if we're zdb(8) */
5591 metaslab_class_t *mc = msp->ms_group->mg_class;
5592 multilist_sublist_t *mls =
5593 multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
5594 if (!multilist_link_active(&msp->ms_class_txg_node)) {
5595 msp->ms_selected_txg = txg;
5596 multilist_sublist_insert_head(mls, msp);
5597 }
5598 multilist_sublist_unlock(mls);
5599
5600 if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
5601 vdev_dirty(vd, VDD_METASLAB, msp, txg);
5602 range_tree_add(msp->ms_allocating[txg & TXG_MASK],
5603 offset, size);
5604 msp->ms_allocating_total += size;
5605 }
5606
5607 mutex_exit(&msp->ms_lock);
5608
5609 return (0);
5610 }
5611
5612 typedef struct metaslab_claim_cb_arg_t {
5613 uint64_t mcca_txg;
5614 int mcca_error;
5615 } metaslab_claim_cb_arg_t;
5616
5617 /* ARGSUSED */
5618 static void
5619 metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset,
5620 uint64_t size, void *arg)
5621 {
5622 metaslab_claim_cb_arg_t *mcca_arg = arg;
5623
5624 if (mcca_arg->mcca_error == 0) {
5625 mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset,
5626 size, mcca_arg->mcca_txg);
5627 }
5628 }
5629
5630 int
5631 metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg)
5632 {
5633 if (vd->vdev_ops->vdev_op_remap != NULL) {
5634 metaslab_claim_cb_arg_t arg;
5635
5636 /*
5637 * Only zdb(8) can claim on indirect vdevs. This is used
5638 * to detect leaks of mapped space (that are not accounted
5639 * for in the obsolete counts, spacemap, or bpobj).
5640 */
5641 ASSERT(!spa_writeable(vd->vdev_spa));
5642 arg.mcca_error = 0;
5643 arg.mcca_txg = txg;
5644
5645 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5646 metaslab_claim_impl_cb, &arg);
5647
5648 if (arg.mcca_error == 0) {
5649 arg.mcca_error = metaslab_claim_concrete(vd,
5650 offset, size, txg);
5651 }
5652 return (arg.mcca_error);
5653 } else {
5654 return (metaslab_claim_concrete(vd, offset, size, txg));
5655 }
5656 }
5657
5658 /*
5659 * Intent log support: upon opening the pool after a crash, notify the SPA
5660 * of blocks that the intent log has allocated for immediate write, but
5661 * which are still considered free by the SPA because the last transaction
5662 * group didn't commit yet.
5663 */
5664 static int
5665 metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
5666 {
5667 uint64_t vdev = DVA_GET_VDEV(dva);
5668 uint64_t offset = DVA_GET_OFFSET(dva);
5669 uint64_t size = DVA_GET_ASIZE(dva);
5670 vdev_t *vd;
5671
5672 if ((vd = vdev_lookup_top(spa, vdev)) == NULL) {
5673 return (SET_ERROR(ENXIO));
5674 }
5675
5676 ASSERT(DVA_IS_VALID(dva));
5677
5678 if (DVA_GET_GANG(dva))
5679 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5680
5681 return (metaslab_claim_impl(vd, offset, size, txg));
5682 }
5683
5684 int
5685 metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
5686 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags,
5687 zio_alloc_list_t *zal, zio_t *zio, int allocator)
5688 {
5689 dva_t *dva = bp->blk_dva;
5690 dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL;
5691 int error = 0;
5692
5693 ASSERT(bp->blk_birth == 0);
5694 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0);
5695
5696 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5697
5698 if (mc->mc_rotor == NULL) { /* no vdevs in this class */
5699 spa_config_exit(spa, SCL_ALLOC, FTAG);
5700 return (SET_ERROR(ENOSPC));
5701 }
5702
5703 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa));
5704 ASSERT(BP_GET_NDVAS(bp) == 0);
5705 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp));
5706 ASSERT3P(zal, !=, NULL);
5707
5708 for (int d = 0; d < ndvas; d++) {
5709 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
5710 txg, flags, zal, allocator);
5711 if (error != 0) {
5712 for (d--; d >= 0; d--) {
5713 metaslab_unalloc_dva(spa, &dva[d], txg);
5714 metaslab_group_alloc_decrement(spa,
5715 DVA_GET_VDEV(&dva[d]), zio, flags,
5716 allocator, B_FALSE);
5717 bzero(&dva[d], sizeof (dva_t));
5718 }
5719 spa_config_exit(spa, SCL_ALLOC, FTAG);
5720 return (error);
5721 } else {
5722 /*
5723 * Update the metaslab group's queue depth
5724 * based on the newly allocated dva.
5725 */
5726 metaslab_group_alloc_increment(spa,
5727 DVA_GET_VDEV(&dva[d]), zio, flags, allocator);
5728 }
5729
5730 }
5731 ASSERT(error == 0);
5732 ASSERT(BP_GET_NDVAS(bp) == ndvas);
5733
5734 spa_config_exit(spa, SCL_ALLOC, FTAG);
5735
5736 BP_SET_BIRTH(bp, txg, txg);
5737
5738 return (0);
5739 }
5740
5741 void
5742 metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now)
5743 {
5744 const dva_t *dva = bp->blk_dva;
5745 int ndvas = BP_GET_NDVAS(bp);
5746
5747 ASSERT(!BP_IS_HOLE(bp));
5748 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa));
5749
5750 /*
5751 * If we have a checkpoint for the pool we need to make sure that
5752 * the blocks that we free that are part of the checkpoint won't be
5753 * reused until the checkpoint is discarded or we revert to it.
5754 *
5755 * The checkpoint flag is passed down the metaslab_free code path
5756 * and is set whenever we want to add a block to the checkpoint's
5757 * accounting. That is, we "checkpoint" blocks that existed at the
5758 * time the checkpoint was created and are therefore referenced by
5759 * the checkpointed uberblock.
5760 *
5761 * Note that, we don't checkpoint any blocks if the current
5762 * syncing txg <= spa_checkpoint_txg. We want these frees to sync
5763 * normally as they will be referenced by the checkpointed uberblock.
5764 */
5765 boolean_t checkpoint = B_FALSE;
5766 if (bp->blk_birth <= spa->spa_checkpoint_txg &&
5767 spa_syncing_txg(spa) > spa->spa_checkpoint_txg) {
5768 /*
5769 * At this point, if the block is part of the checkpoint
5770 * there is no way it was created in the current txg.
5771 */
5772 ASSERT(!now);
5773 ASSERT3U(spa_syncing_txg(spa), ==, txg);
5774 checkpoint = B_TRUE;
5775 }
5776
5777 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER);
5778
5779 for (int d = 0; d < ndvas; d++) {
5780 if (now) {
5781 metaslab_unalloc_dva(spa, &dva[d], txg);
5782 } else {
5783 ASSERT3U(txg, ==, spa_syncing_txg(spa));
5784 metaslab_free_dva(spa, &dva[d], checkpoint);
5785 }
5786 }
5787
5788 spa_config_exit(spa, SCL_FREE, FTAG);
5789 }
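
/*
 * A compiled-out restatement of the checkpoint test above: a freed
 * block is added to the checkpoint accounting only if it predates the
 * checkpoint and the current syncing txg is past the checkpoint txg.
 * example_should_checkpoint() is hypothetical.
 */
#if 0
static boolean_t
example_should_checkpoint(spa_t *spa, const blkptr_t *bp)
{
	return (bp->blk_birth <= spa->spa_checkpoint_txg &&
	    spa_syncing_txg(spa) > spa->spa_checkpoint_txg);
}
#endif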
5790
5791 int
5792 metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg)
5793 {
5794 const dva_t *dva = bp->blk_dva;
5795 int ndvas = BP_GET_NDVAS(bp);
5796 int error = 0;
5797
5798 ASSERT(!BP_IS_HOLE(bp));
5799
5800 if (txg != 0) {
5801 /*
5802 * First do a dry run to make sure all DVAs are claimable,
5803 * so we don't have to unwind from partial failures below.
5804 */
5805 if ((error = metaslab_claim(spa, bp, 0)) != 0)
5806 return (error);
5807 }
5808
5809 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER);
5810
5811 for (int d = 0; d < ndvas; d++) {
5812 error = metaslab_claim_dva(spa, &dva[d], txg);
5813 if (error != 0)
5814 break;
5815 }
5816
5817 spa_config_exit(spa, SCL_ALLOC, FTAG);
5818
5819 ASSERT(error == 0 || txg == 0);
5820
5821 return (error);
5822 }
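
/*
 * A compiled-out sketch of the dry-run convention: a txg of 0 only
 * verifies that every DVA is claimable, while a nonzero txg moves the
 * claimed extents back into the allocating tree. example_claim() is
 * hypothetical.
 */
#if 0
static void
example_claim(spa_t *spa, const blkptr_t *bp)
{
	/* Dry run: no metaslab state is modified. */
	VERIFY0(metaslab_claim(spa, bp, 0));

	/* Real claim in the pool's first txg after import. */
	VERIFY0(metaslab_claim(spa, bp, spa_first_txg(spa)));
}
#endif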
5823
5824 /* ARGSUSED */
5825 static void
5826 metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset,
5827 uint64_t size, void *arg)
5828 {
5829 if (vd->vdev_ops == &vdev_indirect_ops)
5830 return;
5831
5832 metaslab_check_free_impl(vd, offset, size);
5833 }
5834
5835 static void
5836 metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size)
5837 {
5838 metaslab_t *msp;
5839 spa_t *spa = vd->vdev_spa;
5840
5841 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5842 return;
5843
5844 if (vd->vdev_ops->vdev_op_remap != NULL) {
5845 vd->vdev_ops->vdev_op_remap(vd, offset, size,
5846 metaslab_check_free_impl_cb, NULL);
5847 return;
5848 }
5849
5850 ASSERT(vdev_is_concrete(vd));
5851 ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count);
5852 ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0);
5853
5854 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
5855
5856 mutex_enter(&msp->ms_lock);
5857 if (msp->ms_loaded) {
5858 range_tree_verify_not_present(msp->ms_allocatable,
5859 offset, size);
5860 }
5861
5862 /*
5863 * Check all segments that currently exist in the freeing pipeline.
5864 *
5865 * It would intuitively make sense to also check the current allocating
5866 * tree since metaslab_unalloc_dva() exists for extents that are
5867 * allocated and freed in the same sync pass within the same txg.
5868 * Unfortunately there are places (e.g. the ZIL) where we allocate a
5869 * segment but then we free part of it within the same txg
5870 * [see zil_sync()]. Thus, we don't call range_tree_verify() in the
5871 * current allocating tree.
5872 */
5873 range_tree_verify_not_present(msp->ms_freeing, offset, size);
5874 range_tree_verify_not_present(msp->ms_checkpointing, offset, size);
5875 range_tree_verify_not_present(msp->ms_freed, offset, size);
5876 for (int j = 0; j < TXG_DEFER_SIZE; j++)
5877 range_tree_verify_not_present(msp->ms_defer[j], offset, size);
5878 range_tree_verify_not_present(msp->ms_trim, offset, size);
5879 mutex_exit(&msp->ms_lock);
5880 }
5881
5882 void
5883 metaslab_check_free(spa_t *spa, const blkptr_t *bp)
5884 {
5885 if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0)
5886 return;
5887
5888 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
5889 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
5890 uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
5891 vdev_t *vd = vdev_lookup_top(spa, vdev);
5892 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
5893 uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
5894
5895 if (DVA_GET_GANG(&bp->blk_dva[i]))
5896 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
5897
5898 ASSERT3P(vd, !=, NULL);
5899
5900 metaslab_check_free_impl(vd, offset, size);
5901 }
5902 spa_config_exit(spa, SCL_VDEV, FTAG);
5903 }
5904
5905 static void
5906 metaslab_group_disable_wait(metaslab_group_t *mg)
5907 {
5908 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5909 while (mg->mg_disabled_updating) {
5910 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5911 }
5912 }
5913
5914 static void
5915 metaslab_group_disabled_increment(metaslab_group_t *mg)
5916 {
5917 ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock));
5918 ASSERT(mg->mg_disabled_updating);
5919
5920 while (mg->mg_ms_disabled >= max_disabled_ms) {
5921 cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock);
5922 }
5923 mg->mg_ms_disabled++;
5924 ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms);
5925 }
5926
5927 /*
5928 * Mark the metaslab as disabled to prevent any allocations on this metaslab.
5929 * We must also track how many metaslabs are currently disabled within a
5930 * metaslab group and limit them to prevent allocation failures from
5931 * occurring because all metaslabs are disabled.
5932 */
5933 void
5934 metaslab_disable(metaslab_t *msp)
5935 {
5936 ASSERT(!MUTEX_HELD(&msp->ms_lock));
5937 metaslab_group_t *mg = msp->ms_group;
5938
5939 mutex_enter(&mg->mg_ms_disabled_lock);
5940
5941 /*
5942 * To keep an accurate count of how many threads have disabled
5943 * a specific metaslab group, we only allow one thread to mark
5944 * the metaslab group at a time. This ensures that the value of
5945 * ms_disabled will be accurate when we decide to mark a metaslab
5946 * group as disabled. To do this we force all other threads
5947 * to wait until the metaslab group's mg_disabled_updating flag
5948 * is no longer set.
5949 */
5950 metaslab_group_disable_wait(mg);
5951 mg->mg_disabled_updating = B_TRUE;
5952 if (msp->ms_disabled == 0) {
5953 metaslab_group_disabled_increment(mg);
5954 }
5955 mutex_enter(&msp->ms_lock);
5956 msp->ms_disabled++;
5957 mutex_exit(&msp->ms_lock);
5958
5959 mg->mg_disabled_updating = B_FALSE;
5960 cv_broadcast(&mg->mg_ms_disabled_cv);
5961 mutex_exit(&mg->mg_ms_disabled_lock);
5962 }
5963
5964 void
5965 metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
5966 {
5967 metaslab_group_t *mg = msp->ms_group;
5968 spa_t *spa = mg->mg_vd->vdev_spa;
5969
5970 /*
5971 * Wait for the outstanding IO to be synced to prevent newly
5972 * allocated blocks from being overwritten. This is used by
5973 * initialize and TRIM, which are modifying unallocated space.
5974 */
5975 if (sync)
5976 txg_wait_synced(spa_get_dsl(spa), 0);
5977
5978 mutex_enter(&mg->mg_ms_disabled_lock);
5979 mutex_enter(&msp->ms_lock);
5980 if (--msp->ms_disabled == 0) {
5981 mg->mg_ms_disabled--;
5982 cv_broadcast(&mg->mg_ms_disabled_cv);
5983 if (unload)
5984 metaslab_unload(msp);
5985 }
5986 mutex_exit(&msp->ms_lock);
5987 mutex_exit(&mg->mg_ms_disabled_lock);
5988 }
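
/*
 * A compiled-out sketch of the disable/enable protocol described
 * above, as used by initialize and TRIM: allocations on the metaslab
 * are blocked while we operate on unallocated space.
 * example_trim_like_work() is hypothetical.
 */
#if 0
static void
example_trim_like_work(metaslab_t *msp)
{
	metaslab_disable(msp);

	/* ... modify space that is not currently allocated ... */

	/* Wait for outstanding I/O to sync; keep the metaslab loaded. */
	metaslab_enable(msp, B_TRUE, B_FALSE);
}
#endif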
5989
5990 static void
5991 metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx)
5992 {
5993 vdev_t *vd = ms->ms_group->mg_vd;
5994 spa_t *spa = vd->vdev_spa;
5995 objset_t *mos = spa_meta_objset(spa);
5996
5997 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
5998
5999 metaslab_unflushed_phys_t entry = {
6000 .msp_unflushed_txg = metaslab_unflushed_txg(ms),
6001 };
6002 uint64_t entry_size = sizeof (entry);
6003 uint64_t entry_offset = ms->ms_id * entry_size;
6004
6005 uint64_t object = 0;
6006 int err = zap_lookup(mos, vd->vdev_top_zap,
6007 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6008 &object);
6009 if (err == ENOENT) {
6010 object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA,
6011 SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
6012 VERIFY0(zap_add(mos, vd->vdev_top_zap,
6013 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
6014 &object, tx));
6015 } else {
6016 VERIFY0(err);
6017 }
6018
6019 dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size,
6020 &entry, tx);
6021 }
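
/*
 * A compiled-out sketch of reading back the record written above:
 * entries are fixed size and indexed by ms_id within the
 * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS object.
 * example_read_flush_data() is hypothetical and assumes the ZAP
 * entry already exists.
 */
#if 0
static uint64_t
example_read_flush_data(spa_t *spa, vdev_t *vd, uint64_t ms_id)
{
	objset_t *mos = spa_meta_objset(spa);
	metaslab_unflushed_phys_t entry;
	uint64_t object;

	VERIFY0(zap_lookup(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1,
	    &object));
	VERIFY0(dmu_read(mos, object, ms_id * sizeof (entry),
	    sizeof (entry), &entry, DMU_READ_PREFETCH));
	return (entry.msp_unflushed_txg);
}
#endif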
6022
6023 void
6024 metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx)
6025 {
6026 spa_t *spa = ms->ms_group->mg_vd->vdev_spa;
6027
6028 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
6029 return;
6030
6031 ms->ms_unflushed_txg = txg;
6032 metaslab_update_ondisk_flush_data(ms, tx);
6033 }
6034
6035 uint64_t
6036 metaslab_unflushed_txg(metaslab_t *ms)
6037 {
6038 return (ms->ms_unflushed_txg);
6039 }
6040