/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_SLAB_DEPOT_H
#define VDO_SLAB_DEPOT_H

#include <linux/atomic.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "data-vio.h"
#include "encodings.h"
#include "physical-zone.h"
#include "priority-table.h"
#include "recovery-journal.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
 * a single array of slabs in order to eliminate the need for additional math in order to compute
 * which physical zone a PBN is in. It also has a block_allocator per zone.
 *
 * Each physical zone has a single dedicated queue and thread for performing all updates to the
 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
 * code to omit more fine-grained locking for the various slab structures. Each physical zone
 * maintains a separate copy of the slab summary to remove the need for explicit locking on that
 * structure as well.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as allocations
 * and reference count updates, must be performed on the appropriate physical zone thread. Requests
 * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
 * journal thread to run on the appropriate physical zone thread. Save operations must be launched
 * from the same admin thread as the original load operation.
 */

enum {
	/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,

	/*
	 * The number of vios in the vio pool used for loading reference count data. A slab's
	 * reference count data is capped at ~8MB, and we process one at a time in a zone, so 9
	 * should be plenty.
	 */
	BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9,
};

/*
 * Represents the possible status of a block.
 */
enum reference_status {
	RS_FREE,	/* this block is free */
	RS_SINGLE,	/* this block is singly-referenced */
	RS_SHARED,	/* this block is shared */
	RS_PROVISIONAL	/* this block is provisionally allocated */
};

struct vdo_slab;

/* A lock on a single recovery journal block, reference-counted per slab journal block. */
struct journal_lock {
	/* Number of outstanding references to the locked recovery journal block */
	u16 count;
	/* The sequence number of the recovery journal block being locked */
	sequence_number_t recovery_start;
};

struct slab_journal {
	/* A waiter object for getting a VIO pool entry */
	struct vdo_waiter resource_waiter;
	/* A waiter object for updating the slab summary */
	struct vdo_waiter slab_summary_waiter;
	/* A waiter object for getting a vio with which to flush */
	struct vdo_waiter flush_waiter;
	/* The queue of VIOs waiting to make an entry */
	struct vdo_wait_queue entry_waiters;
	/* The parent slab reference of this journal */
	struct vdo_slab *slab;

	/* Whether a tail block commit is pending */
	bool waiting_to_commit;
	/* Whether the journal is updating the slab summary */
	bool updating_slab_summary;
	/* Whether the journal is adding entries from the entry_waiters queue */
	bool adding_entries;
	/* Whether a partial write is in progress */
	bool partial_write_in_progress;

	/* The oldest block in the journal on disk */
	sequence_number_t head;
	/* The oldest block in the journal which may not be reaped */
	sequence_number_t unreapable;
	/* The end of the half-open interval of the active journal */
	sequence_number_t tail;
	/* The next journal block to be committed */
	sequence_number_t next_commit;
	/* The tail sequence number that is written in the slab summary */
	sequence_number_t summarized;
	/* The tail sequence number that was last summarized in slab summary */
	sequence_number_t last_summarized;

	/* The sequence number of the recovery journal lock */
	sequence_number_t recovery_lock;

	/*
	 * The number of entries which fit in a single block. Can't use the constant because unit
	 * tests change this number.
	 */
	journal_entry_count_t entries_per_block;
	/*
	 * The number of full entries which fit in a single block. Can't use the constant because
	 * unit tests change this number.
	 */
	journal_entry_count_t full_entries_per_block;

	/* The recovery journal of the VDO (slab journal holds locks on it) */
	struct recovery_journal *recovery_journal;

	/* The statistics shared by all slab journals in our physical zone */
	struct slab_journal_statistics *events;
	/* A list of the VIO pool entries for outstanding journal block writes */
	struct list_head uncommitted_blocks;

	/*
	 * The current tail block header state. This will be packed into the block just before it
	 * is written.
	 */
	struct slab_journal_block_header tail_header;
	/* A pointer to a block-sized buffer holding the packed block data */
	struct packed_slab_journal_block *block;

	/* The number of blocks in the on-disk journal */
	block_count_t size;
	/* The number of blocks at which to start pushing reference blocks */
	block_count_t flushing_threshold;
	/* The number of blocks at which all reference blocks should be writing */
	block_count_t flushing_deadline;
	/* The number of blocks at which to wait for reference blocks to write */
	block_count_t blocking_threshold;
	/* The number of blocks at which to scrub the slab before coming online */
	block_count_t scrubbing_threshold;

	/* This list entry is for block_allocator to keep a queue of dirty journals */
	struct list_head dirty_entry;

	/* The lock for the oldest unreaped block of the journal */
	struct journal_lock *reap_lock;
	/* The locks for each on disk block */
	struct journal_lock *locks;
};

/*
 * Reference_block structure
 *
 * Blocks are used as a proxy, permitting saves of partial refcounts.
 */
struct reference_block {
	/* This block waits on the ref_counts to tell it to write */
	struct vdo_waiter waiter;
	/* The slab to which this reference_block belongs */
	struct vdo_slab *slab;
	/* The number of references in this block that represent allocations */
	block_size_t allocated_count;
	/* The slab journal block on which this block must hold a lock */
	sequence_number_t slab_journal_lock;
	/* The slab journal block which should be released when this block is committed */
	sequence_number_t slab_journal_lock_to_release;
	/* The point up to which each sector is accurate on disk */
	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
	/* Whether this block has been modified since it was written to disk */
	bool is_dirty;
	/* Whether this block is currently writing */
	bool is_writing;
};

/* The search_cursor represents the saved position of a free block search. */
struct search_cursor {
	/* The reference block containing the current search index */
	struct reference_block *block;
	/* The position at which to start searching for the next free counter */
	slab_block_number index;
	/* The position just past the last valid counter in the current block */
	slab_block_number end_index;

	/* A pointer to the first reference block in the slab */
	struct reference_block *first_block;
	/* A pointer to the last reference block in the slab */
	struct reference_block *last_block;
};

/* The possible rebuild/scrub states of a slab, in increasing order of required work. */
enum slab_rebuild_status {
	VDO_SLAB_REBUILT,
	VDO_SLAB_REPLAYING,
	VDO_SLAB_REQUIRES_SCRUBBING,
	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
	VDO_SLAB_REBUILDING,
};

/*
 * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
 * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for
 * metadata storage for the reference counts and slab journal for the slab.
 *
 * A reference count is maintained for each physical block number. The vast majority of blocks have
 * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
 * (254) the reference count is stored in counters[pbn].
 */
struct vdo_slab {
	/* A list entry to queue this slab in a block_allocator list */
	struct list_head allocq_entry;

	/* The struct block_allocator that owns this slab */
	struct block_allocator *allocator;

	/* The journal for this slab */
	struct slab_journal journal;

	/* The slab number of this slab */
	slab_count_t slab_number;
	/* The offset in the allocator partition of the first block in this slab */
	physical_block_number_t start;
	/* The offset of the first block past the end of this slab */
	physical_block_number_t end;
	/* The starting translated PBN of the slab journal */
	physical_block_number_t journal_origin;
	/* The starting translated PBN of the reference counts */
	physical_block_number_t ref_counts_origin;

	/* The administrative state of the slab */
	struct admin_state state;
	/* The status of the slab */
	enum slab_rebuild_status status;
	/* Whether the slab was ever queued for scrubbing */
	bool was_queued_for_scrubbing;

	/* The priority at which this slab has been queued for allocation */
	u8 priority;

	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
	/* The size of the counters array */
	u32 block_count;
	/* The number of free blocks */
	u32 free_blocks;
	/* The array of reference counts */
	vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */

	/* The saved block pointer and array indexes for the free block search */
	struct search_cursor search_cursor;

	/* A list of the dirty blocks waiting to be written out */
	struct vdo_wait_queue dirty_blocks;
	/* The number of blocks which are currently reading or writing */
	size_t active_count;

	/* A waiter object for updating the slab summary */
	struct vdo_waiter summary_waiter;

	/* The latest slab journal for which there has been a reference count update */
	struct journal_point slab_journal_point;

	/* The number of reference count blocks */
	u32 reference_block_count;
	/* reference count block array */
	struct reference_block *reference_blocks;
};

/* The ordered steps of draining a block_allocator during a close or suspend. */
enum block_allocator_drain_step {
	VDO_DRAIN_ALLOCATOR_START,
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
};

struct slab_scrubber {
	/* The queue of slabs to scrub first */
	struct list_head high_priority_slabs;
	/* The queue of slabs to scrub once there are no high_priority_slabs */
	struct list_head slabs;
	/* The queue of VIOs waiting for a slab to be scrubbed */
	struct vdo_wait_queue waiters;

	/*
	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
	 * the physical zone thread, but is queried by other threads.
	 */
	slab_count_t slab_count;

	/* The administrative state of the scrubber */
	struct admin_state admin_state;
	/* Whether to only scrub high-priority slabs */
	bool high_priority_only;
	/* The slab currently being scrubbed */
	struct vdo_slab *slab;
	/* The vio for loading slab journal blocks */
	struct vio vio;
};

/* A sub-structure for applying actions in parallel to all of an allocator's slabs. */
struct slab_actor {
	/* The number of slabs performing a slab action */
	slab_count_t slab_action_count;
	/* The method to call when a slab action has been completed by all slabs */
	vdo_action_fn callback;
};

/* A slab_iterator is a structure for iterating over a set of slabs. */
struct slab_iterator {
	/* The array of all slabs being iterated over */
	struct vdo_slab **slabs;
	/* The next slab to be returned by the iterator */
	struct vdo_slab *next;
	/* The lowest slab number at which iteration stops */
	slab_count_t end;
	/* The amount by which the slab number decreases on each step */
	slab_count_t stride;
};

/*
 * The slab_summary provides hints during load and recovery about the state of the slabs in order
 * to avoid the need to read the slab journals in their entirety before a VDO can come online.
 *
 * The information in the summary for each slab includes the rough number of free blocks (which is
 * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
 * space will be used on restart), and the location of the tail block of the slab's journal.
 *
 * The slab_summary has its own partition at the end of the volume which is sized to allow for a
 * complete copy of the summary for each of up to 16 physical zones.
 *
 * During resize, the slab_summary moves its backing partition and is saved once moved; the
 * slab_summary is not permitted to overwrite the previous recovery journal space.
 *
 * The slab_summary does not have its own version information, but relies on the VDO volume version
 * number.
 */

/*
 * A slab status is a very small structure for use in determining the ordering of slabs in the
 * scrubbing process.
 */
struct slab_status {
	slab_count_t slab_number;
	bool is_clean;
	u8 emptiness;
};

struct slab_summary_block {
	/* The block_allocator to which this block belongs */
	struct block_allocator *allocator;
	/* The index of this block in its zone's summary */
	block_count_t index;
	/* Whether this block has a write outstanding */
	bool writing;
	/* Ring of updates waiting on the outstanding write */
	struct vdo_wait_queue current_update_waiters;
	/* Ring of updates waiting on the next write */
	struct vdo_wait_queue next_update_waiters;
	/* The active slab_summary_entry array for this block */
	struct slab_summary_entry *entries;
	/* The vio used to write this block */
	struct vio vio;
	/* The packed entries, one block long, backing the vio */
	char *outgoing_entries;
};

/*
 * The statistics for all the slab summary zones owned by this slab summary. These fields are all
 * mutated only by their physical zone threads, but are read by other threads when gathering
 * statistics for the entire depot.
 */
struct atomic_slab_summary_statistics {
	/* Number of blocks written */
	atomic64_t blocks_written;
};

struct block_allocator {
	struct vdo_completion completion;
	/* The slab depot for this allocator */
	struct slab_depot *depot;
	/* The nonce of the VDO */
	nonce_t nonce;
	/* The physical zone number of this allocator */
	zone_count_t zone_number;
	/* The thread ID for this allocator's physical zone */
	thread_id_t thread_id;
	/* The number of slabs in this allocator */
	slab_count_t slab_count;
	/* The number of the last slab owned by this allocator */
	slab_count_t last_slab;
	/* The reduced priority level used to preserve unopened slabs */
	unsigned int unopened_slab_priority;
	/* The state of this allocator */
	struct admin_state state;
	/* The actor for applying an action to all slabs */
	struct slab_actor slab_actor;

	/* The slab from which blocks are currently being allocated */
	struct vdo_slab *open_slab;
	/* A priority queue containing all slabs available for allocation */
	struct priority_table *prioritized_slabs;
	/* The slab scrubber */
	struct slab_scrubber scrubber;
	/* What phase of the close operation the allocator is to perform */
	enum block_allocator_drain_step drain_step;

	/*
	 * These statistics are all mutated only by the physical zone thread, but are read by other
	 * threads when gathering statistics for the entire depot.
	 */
	/*
	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
	 * historical reasons.
	 */
	u64 allocated_blocks;
	/* Statistics for this block allocator */
	struct block_allocator_statistics statistics;
	/* Cumulative statistics for the slab journals in this zone */
	struct slab_journal_statistics slab_journal_statistics;
	/* Cumulative statistics for the reference counters in this zone */
	struct ref_counts_statistics ref_counts_statistics;

	/*
	 * This is the head of a queue of slab journals which have entries in their tail blocks
	 * which have not yet started to commit. When the recovery journal is under space pressure,
	 * slab journals which have uncommitted entries holding a lock on the recovery journal head
	 * are forced to commit their blocks early. This list is kept in order, with the tail
	 * containing the slab journal holding the most recent recovery journal lock.
	 */
	struct list_head dirty_slab_journals;

	/* The vio pool for reading and writing block allocator metadata */
	struct vio_pool *vio_pool;
	/* The vio pool for large initial reads of ref count areas */
	struct vio_pool *refcount_big_vio_pool;
	/* How many ref count blocks are read per vio at initial load */
	u32 refcount_blocks_per_big_vio;
	/* The dm_kcopyd client for erasing slab journals */
	struct dm_kcopyd_client *eraser;
	/* Iterator over the slabs to be erased */
	struct slab_iterator slabs_to_erase;

	/* The portion of the slab summary managed by this allocator */
	/* The state of the slab summary */
	struct admin_state summary_state;
	/* The number of outstanding summary writes */
	block_count_t summary_write_count;
	/* The array (owned by the blocks) of all entries */
	struct slab_summary_entry *summary_entries;
	/* The array of slab_summary_blocks */
	struct slab_summary_block *summary_blocks;
};

/* How slabs should be queued for scrubbing when the depot is loaded. */
enum slab_depot_load_type {
	VDO_SLAB_DEPOT_NORMAL_LOAD,
	VDO_SLAB_DEPOT_RECOVERY_LOAD,
	VDO_SLAB_DEPOT_REBUILD_LOAD
};

struct slab_depot {
	/* The number of physical zones (and hence block allocators) */
	zone_count_t zone_count;
	/* The zone count from before a resume, used to validate config changes */
	zone_count_t old_zone_count;
	/* The vdo to which this depot belongs */
	struct vdo *vdo;
	/* The geometry shared by all slabs in the depot */
	struct slab_config slab_config;
	/* The manager for scheduling cross-zone actions */
	struct action_manager *action_manager;

	/* The first block of the depot's partition */
	physical_block_number_t first_block;
	/* The block just past the end of the depot's partition */
	physical_block_number_t last_block;
	/* The origin of the depot's partition */
	physical_block_number_t origin;

	/* slab_size == (1 << slab_size_shift) */
	unsigned int slab_size_shift;

	/* Determines how slabs should be queued during load */
	enum slab_depot_load_type load_type;

	/* The state for notifying slab journals to release recovery journal */
	sequence_number_t active_release_request;
	sequence_number_t new_release_request;

	/* State variables for scrubbing complete handling */
	atomic_t zones_to_scrub;

	/* Array of pointers to individually allocated slabs */
	struct vdo_slab **slabs;
	/* The number of slabs currently allocated and stored in 'slabs' */
	slab_count_t slab_count;

	/* Array of pointers to a larger set of slabs (used during resize) */
	struct vdo_slab **new_slabs;
	/* The number of slabs currently allocated and stored in 'new_slabs' */
	slab_count_t new_slab_count;
	/* The size that 'new_slabs' was allocated for */
	block_count_t new_size;

	/* The last block before resize, for rollback */
	physical_block_number_t old_last_block;
	/* The last block after resize, for resize */
	physical_block_number_t new_last_block;

	/* The statistics for the slab summary */
	struct atomic_slab_summary_statistics summary_statistics;
	/* The start of the slab summary partition */
	physical_block_number_t summary_origin;
	/* The number of bits to shift to get a 7-bit fullness hint */
	unsigned int hint_shift;
	/* The slab summary entries for all of the zones the partition can hold */
	struct slab_summary_entry *summary_entries;

	/* The block allocators for this depot */
	struct block_allocator allocators[];
};

struct reference_updater;

bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
					       physical_block_number_t pbn,
					       enum journal_operation operation,
					       bool increment,
					       struct journal_point *recovery_point,
					       struct vdo_completion *parent);

int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
							physical_block_number_t pbn,
							enum journal_operation operation);

/*
 * Convert a generic vdo_completion to its enclosing block_allocator, asserting that the
 * completion really is an allocator completion.
 */
static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
	return container_of(completion, struct block_allocator, completion);
}

int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   struct pbn_lock *lock);

int __must_check vdo_allocate_block(struct block_allocator *allocator,
				    physical_block_number_t *block_number_ptr);

int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter);

void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater);

int __must_check vdo_release_block_reference(struct block_allocator *allocator,
					     physical_block_number_t pbn);

void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);

void vdo_dump_block_allocator(const struct block_allocator *allocator);

int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
				       struct vdo *vdo,
				       struct partition *summary_partition,
				       struct slab_depot **depot_ptr);

void vdo_free_slab_depot(struct slab_depot *depot);

struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);

int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);

struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
					    physical_block_number_t pbn);

u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
					physical_block_number_t pbn);

bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
					     physical_block_number_t pbn);

block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);

block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);

void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats);

void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context);

void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent);

void vdo_update_slab_depot_size(struct slab_depot *depot);

int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
						const struct partition *partition);

void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_abandon_new_slabs(struct slab_depot *depot);

void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent);

void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number);

void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent);

void vdo_dump_slab_depot(const struct slab_depot *depot);

#endif /* VDO_SLAB_DEPOT_H */