// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 */

#ifndef _SYS_METASLAB_IMPL_H
#define	_SYS_METASLAB_IMPL_H

#include <sys/metaslab.h>
#include <sys/space_map.h>
#include <sys/range_tree.h>
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/multilist.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Metaslab allocation tracing record.
 */
typedef struct metaslab_alloc_trace {
	list_node_t		mat_list_node;
	metaslab_group_t	*mat_mg;
	metaslab_t		*mat_msp;
	uint64_t		mat_size;
	uint64_t		mat_weight;
	uint32_t		mat_dva_id;
	uint64_t		mat_offset;
	int			mat_allocator;
} metaslab_alloc_trace_t;

/*
 * Used by the metaslab allocation tracing facility to indicate
 * error conditions. These errors are stored in the offset member
 * of the metaslab_alloc_trace_t record and displayed by mdb.
 */
typedef enum trace_alloc_type {
	TRACE_ALLOC_FAILURE	= -1ULL,
	TRACE_TOO_SMALL		= -2ULL,
	TRACE_FORCE_GANG	= -3ULL,
	TRACE_NOT_ALLOCATABLE	= -4ULL,
	TRACE_GROUP_FAILURE	= -5ULL,
	TRACE_ENOSPC		= -6ULL,
	TRACE_CONDENSING	= -7ULL,
	TRACE_VDEV_ERROR	= -8ULL,
	TRACE_DISABLED		= -9ULL,
} trace_alloc_type_t;
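/*
 * Illustrative sketch only (the helper below is hypothetical, not part of
 * the tracing facility): a trace record's mat_offset holds either the
 * offset that was allocated or one of the trace_alloc_type_t codes above.
 * The codes occupy the very top of the uint64_t range, so a consumer can
 * tell the two cases apart with a single comparison.
 */
static inline boolean_t
metaslab_trace_is_error(const metaslab_alloc_trace_t *mat)
{
	/* TRACE_DISABLED (-9ULL) is the smallest of the error codes. */
	return (mat->mat_offset >= (uint64_t)TRACE_DISABLED);
}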
#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_WEIGHT_CLAIM		(1ULL << 61)
#define	METASLAB_WEIGHT_TYPE		(1ULL << 60)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
	METASLAB_WEIGHT_CLAIM)

/*
 * The metaslab weight is used to encode the amount of free space in a
 * metaslab, such that the "best" metaslab appears first when sorting the
 * metaslabs by weight. The weight (and therefore the "best" metaslab) can
 * be determined in two different ways: by computing a weighted sum of all
 * the free space in the metaslab (a space based weight) or by counting only
 * the free segments of the largest size (a segment based weight). We prefer
 * the segment based weight because it reflects how the free space is
 * composed, but we cannot always use it -- legacy pools do not have the
 * space map histogram information necessary to determine the largest
 * contiguous regions. Pools that have the space map histogram determine
 * the segment weight by looking at each bucket in the histogram and
 * determining the free space whose size in bytes is in the range:
 *	[2^i, 2^(i+1))
 * We then encode the largest index, i, that contains regions into the
 * segment-weighted value.
 *
 * Space-based weight:
 *
 *	64	56	48	40	32	24	16	8	0
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 *	|PSC1|                  weighted-free space                     |
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 *
 *	PS - indicates primary and secondary activation
 *	C - indicates activation for claimed block zio
 *	space - the fragmentation-weighted space
 *
 * Segment-based weight:
 *
 *	64	56	48	40	32	24	16	8	0
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 *	|PSC0| idx|            count of segments in region              |
 *	+-------+-------+-------+-------+-------+-------+-------+-------+
 *
 *	PS - indicates primary and secondary activation
 *	C - indicates activation for claimed block zio
 *	idx - index for the highest bucket in the histogram
 *	count - number of segments in the specified bucket
 */
#define	WEIGHT_GET_ACTIVE(weight)	BF64_GET((weight), 61, 3)
#define	WEIGHT_SET_ACTIVE(weight, x)	BF64_SET((weight), 61, 3, x)

#define	WEIGHT_IS_SPACEBASED(weight)		\
	((weight) == 0 || BF64_GET((weight), 60, 1))
#define	WEIGHT_SET_SPACEBASED(weight)	BF64_SET((weight), 60, 1, 1)

/*
 * These macros are only applicable to segment-based weighting.
 */
#define	WEIGHT_GET_INDEX(weight)	BF64_GET((weight), 54, 6)
#define	WEIGHT_SET_INDEX(weight, x)	BF64_SET((weight), 54, 6, x)
#define	WEIGHT_GET_COUNT(weight)	BF64_GET((weight), 0, 54)
#define	WEIGHT_SET_COUNT(weight, x)	BF64_SET((weight), 0, 54, x)
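/*
 * Illustrative sketch only (the helper below is hypothetical): building a
 * segment-based weight with the macros above. The values are made up; the
 * real weights are computed by the weighting code in metaslab.c.
 */
static inline uint64_t
metaslab_example_segment_weight(void)
{
	uint64_t weight = 0;

	WEIGHT_SET_COUNT(weight, 113);	/* 113 free segments ... */
	WEIGHT_SET_INDEX(weight, 17);	/* ... in the 2^17 byte bucket */
	/*
	 * Bit 60 is clear and weight != 0, so WEIGHT_IS_SPACEBASED() is
	 * false; WEIGHT_GET_INDEX() yields 17 and WEIGHT_GET_COUNT() 113.
	 */
	return (weight);
}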
/*
 * Per-allocator data structure.
 */
typedef struct metaslab_class_allocator {
	kmutex_t	mca_lock;
	avl_tree_t	mca_tree;

	metaslab_group_t *mca_rotor;
	uint64_t	mca_aliquot;

	/*
	 * The allocation throttle works on a reservation system. Whenever
	 * an asynchronous zio wants to perform an allocation it must
	 * first reserve the number of bytes that it wants to allocate.
	 * If there aren't sufficient slots available for the pending zio
	 * then that I/O is throttled until more slots free up. The current
	 * size of reserved allocations is maintained by mca_reserved.
	 * The maximum total size of reserved allocations is determined by
	 * mc_alloc_max in the metaslab_class_t. Gang blocks are allowed
	 * to reserve for their headers even if we've reached the maximum.
	 */
	uint64_t	mca_reserved;
} ____cacheline_aligned metaslab_class_allocator_t;

/*
 * A metaslab class encompasses a category of allocatable top-level vdevs.
 * Each top-level vdev is associated with a metaslab group which defines
 * the allocatable region for that vdev. Examples of these categories include
 * "normal" for data block allocations (i.e. main pool allocations) or "log"
 * for allocations designated for intent log devices (i.e. slog devices).
 * When a block allocation is requested from the SPA it is associated with a
 * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
 * to the class can be used to satisfy that request. Allocations are done
 * by traversing the metaslab groups that are linked off of the mca_rotor
 * field. This rotor points to the next metaslab group where allocations will
 * be attempted. Allocating a block is a 3-step process -- select the metaslab
 * group, select the metaslab, and then allocate the block. The metaslab
 * class defines the low-level block allocator that will be used as the
 * final step in allocation. These allocators are pluggable, allowing each
 * class to use a block allocator that best suits that class.
 */
struct metaslab_class {
	kmutex_t		mc_lock;
	spa_t			*mc_spa;
	const metaslab_ops_t	*mc_ops;

	/*
	 * Track the number of metaslab groups that have been initialized
	 * and can accept allocations. An initialized metaslab group is
	 * one that has been completely added to the config (i.e. we have
	 * updated the MOS config and the space has been added to the pool).
	 */
	uint64_t		mc_groups;

	boolean_t		mc_is_log;
	boolean_t		mc_alloc_throttle_enabled;
	uint64_t		mc_alloc_io_size;
	uint64_t		mc_alloc_max;

	uint64_t		mc_alloc_groups; /* # of allocatable groups */

	uint64_t		mc_alloc;	/* total allocated space */
	uint64_t		mc_deferred;	/* total deferred frees */
	uint64_t		mc_space;	/* total space (alloc + free) */
	uint64_t		mc_dspace;	/* total deflated space */
	uint64_t		mc_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE];

	/*
	 * List of all loaded metaslabs in the class, sorted in order of most
	 * recent use.
	 */
	multilist_t		mc_metaslab_txg_list;

	metaslab_class_allocator_t	mc_allocator[];
};
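/*
 * Illustrative sketch only (hypothetical helper, not the real throttle
 * code in metaslab.c): an allocator's reservation is admitted while its
 * mca_reserved total stays within the class-wide mc_alloc_max bound, and
 * gang block headers are always allowed through. The real code updates
 * mca_reserved atomically; this only shows the bound being applied.
 */
static inline boolean_t
metaslab_example_can_reserve(metaslab_class_t *mc, int allocator,
    uint64_t size, boolean_t is_gang_header)
{
	metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];

	return (is_gang_header ||
	    mca->mca_reserved + size <= mc->mc_alloc_max);
}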
/*
 * Per-allocator data structure.
 */
typedef struct metaslab_group_allocator {
	zfs_refcount_t	mga_queue_depth;
	metaslab_t	*mga_primary;
	metaslab_t	*mga_secondary;
} ____cacheline_aligned metaslab_group_allocator_t;

/*
 * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
 * of a top-level vdev. They are linked together to form a circular linked
 * list and can belong to only one metaslab class. Metaslab groups may become
 * ineligible for allocations for a number of reasons such as limited free
 * space, fragmentation, or going offline. When this happens the allocator will
 * simply find the next metaslab group in the linked list and attempt
 * to allocate from that group instead.
 */
struct metaslab_group {
	kmutex_t		mg_lock;
	avl_tree_t		mg_metaslab_tree;
	uint64_t		mg_aliquot;
	uint64_t		mg_queue_target;
	boolean_t		mg_allocatable;		/* can we allocate? */
	uint64_t		mg_ms_ready;

	/*
	 * A metaslab group is considered to be initialized only after
	 * we have updated the MOS config and added the space to the pool.
	 * We only allow allocation attempts to a metaslab group if it
	 * has been initialized.
	 */
	boolean_t		mg_initialized;

	int64_t			mg_activation_count;
	metaslab_class_t	*mg_class;
	vdev_t			*mg_vd;
	metaslab_group_t	*mg_prev;
	metaslab_group_t	*mg_next;

	/*
	 * A metaslab group that can no longer allocate the minimum block
	 * size will set mg_no_free_space. Once a metaslab group is out
	 * of space then its share of work must be distributed to other
	 * groups.
	 */
	boolean_t		mg_no_free_space;

	uint64_t		mg_fragmentation;
	uint64_t		mg_histogram[ZFS_RANGE_TREE_HISTOGRAM_SIZE];

	int			mg_ms_disabled;
	boolean_t		mg_disabled_updating;
	kmutex_t		mg_ms_disabled_lock;
	kcondvar_t		mg_ms_disabled_cv;

	int			mg_allocators;
	metaslab_group_allocator_t	mg_allocator[];
};

/*
 * This value defines the number of elements in the ms_lbas array. The value
 * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
 * This is the equivalent of highbit(UINT64_MAX).
 */
#define	MAX_LBAS	64

/*
 * Each metaslab maintains a set of in-core trees to track metaslab
 * operations. The in-core free tree (ms_allocatable) contains the list of
 * free segments which are eligible for allocation. As blocks are
 * allocated, the allocated segments are removed from the ms_allocatable and
 * added to a per txg allocation tree (ms_allocating). As blocks are
 * freed, they are added to the free tree (ms_freeing). These trees
 * allow us to process all allocations and frees in syncing context
 * where it is safe to update the on-disk space maps. An additional set
 * of in-core trees is maintained to track deferred frees
 * (ms_defer). Once a block is freed it will move from the
 * ms_freed to the ms_defer tree. A deferred free means that a block
 * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
 * transaction groups later. For example, a block that is freed in txg
 * 50 will not be available for reallocation until txg 52 (50 +
 * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
 * A pool could be safely rolled back TXG_DEFER_SIZE transaction groups
 * with the assurance that no block has been reallocated.
 *
 * The simplified transition diagram looks like this:
 *
 *
 *      ALLOCATE
 *         |
 *         V
 *    free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
 *         ^
 *         |                        ms_freeing <--- FREE
 *         |                              |
 *         |                              v
 *         |                          ms_freed
 *         |                              |
 *         +-------- ms_defer[2] <-------+-------> (write to space map)
 *
 *
 * Each metaslab's space is tracked in a single space map in the MOS,
 * which is only updated in syncing context. Each time we sync a txg,
 * we append the allocs and frees from that txg to the space map. The
 * pool space is only updated once all metaslabs have finished syncing.
 *
 * To load the in-core free tree we read the space map from disk. This
 * object contains a series of alloc and free records that are combined
 * to make up the list of all free segments in this metaslab. These
 * segments are represented in-core by the ms_allocatable and are stored
 * in an AVL tree.
 *
 * As the space map grows (as a result of the appends) it will
 * eventually become space-inefficient. When the metaslab's in-core
 * free tree is zfs_condense_pct/100 times the size of the minimal
 * on-disk representation, we rewrite it in its minimized form. If a
 * metaslab needs to condense then we must set the ms_condensing flag to
 * ensure that allocations are not performed on the metaslab that is
 * being written.
 */
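/*
 * Illustrative sketch only (hypothetical helper), assuming the
 * zfs_range_tree_add()/zfs_range_tree_remove() primitives from
 * <sys/range_tree.h>: the tree-to-tree hops a freed segment takes in the
 * diagram above. The real transitions are driven by metaslab_sync() and
 * metaslab_sync_done(); locking and the space map writes are omitted here.
 */
static inline void
metaslab_example_free_transition(zfs_range_tree_t *freeing,
    zfs_range_tree_t *freed, zfs_range_tree_t *defer,
    uint64_t offset, uint64_t size)
{
	/* FREE: the segment first lands in ms_freeing. */
	zfs_range_tree_add(freeing, offset, size);

	/* Syncing: appended to the space map, then moved to ms_freed. */
	zfs_range_tree_remove(freeing, offset, size);
	zfs_range_tree_add(freed, offset, size);

	/* End of sync: held in ms_defer[] for TXG_DEFER_SIZE txgs. */
	zfs_range_tree_remove(freed, offset, size);
	zfs_range_tree_add(defer, offset, size);
}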
struct metaslab {
	/*
	 * This is the main lock of the metaslab and its purpose is to
	 * coordinate our allocations and frees [e.g., metaslab_block_alloc(),
	 * metaslab_free_concrete(), ..etc] with our various syncing
	 * procedures [e.g., metaslab_sync(), metaslab_sync_done(), ..etc].
	 *
	 * The lock is also used during some miscellaneous operations like
	 * using the metaslab's histogram for the metaslab group's histogram
	 * aggregation, or marking the metaslab for initialization.
	 */
	kmutex_t	ms_lock;

	/*
	 * Acquired together with the ms_lock whenever we expect to
	 * write to metaslab data on-disk (i.e. flushing entries to
	 * the metaslab's space map). It helps coordinate readers of
	 * the metaslab's space map [see spa_vdev_remove_thread()]
	 * with writers [see metaslab_sync() or metaslab_flush()].
	 *
	 * Note that metaslab_load(), even though a reader, uses
	 * a completely different mechanism to deal with the reading
	 * of the metaslab's space map based on ms_synced_length. That
	 * said, the function still uses the ms_sync_lock after it
	 * has read the ms_sm [see relevant comment in metaslab_load()
	 * as to why].
	 */
	kmutex_t	ms_sync_lock;

	kcondvar_t	ms_load_cv;
	space_map_t	*ms_sm;
	uint64_t	ms_id;
	uint64_t	ms_start;
	uint64_t	ms_size;
	uint64_t	ms_fragmentation;

	zfs_range_tree_t	*ms_allocating[TXG_SIZE];
	zfs_range_tree_t	*ms_allocatable;
	uint64_t	ms_allocated_this_txg;
	uint64_t	ms_allocating_total;

	/*
	 * The following range trees are accessed only from syncing context.
	 * The ms_free* trees only have entries while syncing, and are empty
	 * between syncs.
	 */
	zfs_range_tree_t	*ms_freeing;	/* to free this syncing txg */
	/* already freed this syncing txg */
	zfs_range_tree_t	*ms_freed;
	zfs_range_tree_t	*ms_defer[TXG_DEFER_SIZE];
	/* to add to the checkpoint */
	zfs_range_tree_t	*ms_checkpointing;

	/*
	 * The ms_trim tree is the set of allocatable segments which are
	 * eligible for trimming. (When the metaslab is loaded, it's a
	 * subset of ms_allocatable.) It's kept in-core as long as the
	 * autotrim property is set and is not vacated when the metaslab
	 * is unloaded. Its purpose is to aggregate freed ranges to
	 * facilitate efficient trimming.
	 */
	zfs_range_tree_t	*ms_trim;

	boolean_t	ms_condensing;	/* condensing? */
	boolean_t	ms_condense_wanted;

	/*
	 * The number of consumers which have disabled the metaslab.
	 */
	uint64_t	ms_disabled;

	/*
	 * We must always hold the ms_lock when modifying ms_loaded
	 * and ms_loading.
	 */
	boolean_t	ms_loaded;
	boolean_t	ms_loading;
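	/*
	 * Illustrative sketch only: a thread that needs the metaslab's
	 * contents waits out a concurrent load while holding ms_lock,
	 * conceptually
	 *
	 *	while (msp->ms_loading)
	 *		cv_wait(&msp->ms_load_cv, &msp->ms_lock);
	 *
	 * before checking ms_loaded. The full protocol (including the
	 * ms_flush_cv/ms_flushing pair below) lives in metaslab.c.
	 */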
	kcondvar_t	ms_flush_cv;
	boolean_t	ms_flushing;

	/*
	 * The following histograms count entries that are in the
	 * metaslab's space map (and its histogram) but are not in
	 * ms_allocatable yet, because they are in ms_freed, ms_freeing,
	 * or ms_defer[].
	 *
	 * When the metaslab is not loaded, its ms_weight needs to
	 * reflect what is allocatable (i.e. what will be part of
	 * ms_allocatable if it is loaded). The weight is computed from
	 * the spacemap histogram, but that includes ranges that are
	 * not yet allocatable (because they are in ms_freed,
	 * ms_freeing, or ms_defer[]). Therefore, when calculating the
	 * weight, we need to remove those ranges.
	 *
	 * The ranges in the ms_freed and ms_defer[] range trees are all
	 * present in the spacemap. However, the spacemap may have
	 * multiple entries to represent a contiguous range, because it
	 * is written across multiple sync passes, but the changes of
	 * all sync passes are consolidated into the range trees.
	 * Adjacent ranges that are freed in different sync passes of
	 * one txg will be represented separately (as 2 or more entries)
	 * in the space map (and its histogram), but these adjacent
	 * ranges will be consolidated (represented as one entry) in the
	 * ms_freed/ms_defer[] range trees (and their histograms).
	 *
	 * When calculating the weight, we cannot simply subtract the
	 * range trees' histograms from the spacemap's histogram,
	 * because the range trees' histograms may have entries in
	 * higher buckets than the spacemap, due to consolidation.
	 * Instead we must subtract the exact entries that were added to
	 * the spacemap's histogram. ms_synchist and ms_deferhist[]
	 * represent these exact entries, so we can subtract them from
	 * the spacemap's histogram when calculating ms_weight.
	 *
	 * ms_synchist represents the same ranges as ms_freeing +
	 * ms_freed, but without consolidation across sync passes.
	 *
	 * ms_deferhist[i] represents the same ranges as ms_defer[i],
	 * but without consolidation across sync passes.
	 */
	uint64_t	ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
	uint64_t	ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
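	/*
	 * Illustrative sketch only: with the histograms above, the
	 * not-yet-allocatable entries can be backed out of the space map
	 * histogram bucket by bucket, conceptually
	 *
	 *	allocatable[i] = sm_histogram[i] - ms_synchist[i] -
	 *	    SUM_t(ms_deferhist[t][i]);
	 *
	 * and the weight of an unloaded metaslab is derived from that
	 * adjusted histogram in metaslab.c.
	 */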
	/*
	 * Tracks the exact amount of allocated space of this metaslab
	 * (and specifically the metaslab's space map) up to the most
	 * recently completed sync pass [see usage in metaslab_sync()].
	 */
	uint64_t	ms_allocated_space;
	int64_t		ms_deferspace;	/* sum of ms_defermap[] space */
	uint64_t	ms_weight;	/* weight vs. others in group */
	uint64_t	ms_activation_weight;	/* activation weight */

	/*
	 * Tracks the txg in which the metaslab was last selected for
	 * loading or allocation. We use this value to determine how
	 * long the metaslab should stay cached.
	 */
	uint64_t	ms_selected_txg;
	/*
	 * ms_load/unload_time can be used for performance monitoring
	 * (e.g. by dtrace or mdb).
	 */
	hrtime_t	ms_load_time;	/* time last loaded */
	hrtime_t	ms_unload_time;	/* time last unloaded */
	uint64_t	ms_selected_time; /* time last allocated from (secs) */

	uint64_t	ms_alloc_txg;	/* last successful alloc (debug only) */
	uint64_t	ms_max_size;	/* maximum allocatable size */

	/*
	 * -1 if it's not active in an allocator, otherwise set to the
	 * allocator this metaslab is active for.
	 */
	int		ms_allocator;
	boolean_t	ms_primary;	/* Only valid if ms_allocator is not -1 */

	/*
	 * The metaslab block allocators can optionally use a size-ordered
	 * range tree and/or an array of LBAs. Not all allocators use
	 * this functionality. The ms_allocatable_by_size should always
	 * contain the same number of segments as the ms_allocatable. The
	 * only difference is that the ms_allocatable_by_size is ordered by
	 * segment sizes.
	 */
	zfs_btree_t	ms_allocatable_by_size;
	zfs_btree_t	ms_unflushed_frees_by_size;
	uint64_t	ms_lbas[MAX_LBAS];

	metaslab_group_t *ms_group;	/* metaslab group */
	avl_node_t	ms_group_node;	/* node in metaslab group tree */
	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links */
	avl_node_t	ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
	/*
	 * Node in metaslab class's selected txg list
	 */
	multilist_node_t	ms_class_txg_node;

	/*
	 * Allocs and frees that are committed to the vdev log spacemap but
	 * not yet to this metaslab's spacemap.
	 */
	zfs_range_tree_t	*ms_unflushed_allocs;
	zfs_range_tree_t	*ms_unflushed_frees;

	/*
	 * We have flushed entries up to but not including this TXG. In
	 * other words, all changes from this TXG and onward should not
	 * be in this metaslab's space map and must be read from the
	 * log space maps.
	 */
	uint64_t	ms_unflushed_txg;
	boolean_t	ms_unflushed_dirty;

	/* updated every time we are done syncing the metaslab's space map */
	uint64_t	ms_synced_length;

	boolean_t	ms_new;
};

typedef struct metaslab_unflushed_phys {
	/* on-disk counterpart of ms_unflushed_txg */
	uint64_t	msp_unflushed_txg;
} metaslab_unflushed_phys_t;

#ifdef __cplusplus
}
#endif

#endif /* _SYS_METASLAB_IMPL_H */