1 /* SPDX-License-Identifier: GPL-2.0-only */ 2 /* 3 * Copyright 2023 Red Hat 4 */ 5 6 #ifndef VDO_SLAB_DEPOT_H 7 #define VDO_SLAB_DEPOT_H 8 9 #include <linux/atomic.h> 10 #include <linux/dm-kcopyd.h> 11 #include <linux/list.h> 12 13 #include "numeric.h" 14 15 #include "admin-state.h" 16 #include "completion.h" 17 #include "data-vio.h" 18 #include "encodings.h" 19 #include "physical-zone.h" 20 #include "priority-table.h" 21 #include "recovery-journal.h" 22 #include "statistics.h" 23 #include "types.h" 24 #include "vio.h" 25 #include "wait-queue.h" 26 27 /* 28 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has 29 * a single array of slabs in order to eliminate the need for additional math in order to compute 30 * which physical zone a PBN is in. It also has a block_allocator per zone. 31 * 32 * Each physical zone has a single dedicated queue and thread for performing all updates to the 33 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the 34 * code to omit more fine-grained locking for the various slab structures. Each physical zone 35 * maintains a separate copy of the slab summary to remove the need for explicit locking on that 36 * structure as well. 37 * 38 * Load operations must be performed on the admin thread. Normal operations, such as allocations 39 * and reference count updates, must be performed on the appropriate physical zone thread. Requests 40 * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery 41 * journal thread to run on the appropriate physical zone thread. Save operations must be launched 42 * from the same admin thread as the original load operation. 43 */ 44 45 enum { 46 /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ 47 BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, 48 }; 49 50 /* 51 * Represents the possible status of a block. 52 */ 53 enum reference_status { 54 RS_FREE, /* this block is free */ 55 RS_SINGLE, /* this block is singly-referenced */ 56 RS_SHARED, /* this block is shared */ 57 RS_PROVISIONAL /* this block is provisionally allocated */ 58 }; 59 60 struct vdo_slab; 61 62 struct journal_lock { 63 u16 count; 64 sequence_number_t recovery_start; 65 }; 66 67 struct slab_journal { 68 /* A waiter object for getting a VIO pool entry */ 69 struct vdo_waiter resource_waiter; 70 /* A waiter object for updating the slab summary */ 71 struct vdo_waiter slab_summary_waiter; 72 /* A waiter object for getting a vio with which to flush */ 73 struct vdo_waiter flush_waiter; 74 /* The queue of VIOs waiting to make an entry */ 75 struct vdo_wait_queue entry_waiters; 76 /* The parent slab reference of this journal */ 77 struct vdo_slab *slab; 78 79 /* Whether a tail block commit is pending */ 80 bool waiting_to_commit; 81 /* Whether the journal is updating the slab summary */ 82 bool updating_slab_summary; 83 /* Whether the journal is adding entries from the entry_waiters queue */ 84 bool adding_entries; 85 /* Whether a partial write is in progress */ 86 bool partial_write_in_progress; 87 88 /* The oldest block in the journal on disk */ 89 sequence_number_t head; 90 /* The oldest block in the journal which may not be reaped */ 91 sequence_number_t unreapable; 92 /* The end of the half-open interval of the active journal */ 93 sequence_number_t tail; 94 /* The next journal block to be committed */ 95 sequence_number_t next_commit; 96 /* The tail sequence number that is written in the slab summary */ 97 sequence_number_t summarized; 98 /* The tail sequence number that was last summarized in slab summary */ 99 sequence_number_t last_summarized; 100 101 /* The sequence number of the recovery journal lock */ 102 sequence_number_t recovery_lock; 103 104 /* 105 * The number of entries which fit in a single block. Can't use the constant because unit 106 * tests change this number. 107 */ 108 journal_entry_count_t entries_per_block; 109 /* 110 * The number of full entries which fit in a single block. Can't use the constant because 111 * unit tests change this number. 112 */ 113 journal_entry_count_t full_entries_per_block; 114 115 /* The recovery journal of the VDO (slab journal holds locks on it) */ 116 struct recovery_journal *recovery_journal; 117 118 /* The statistics shared by all slab journals in our physical zone */ 119 struct slab_journal_statistics *events; 120 /* A list of the VIO pool entries for outstanding journal block writes */ 121 struct list_head uncommitted_blocks; 122 123 /* 124 * The current tail block header state. This will be packed into the block just before it 125 * is written. 126 */ 127 struct slab_journal_block_header tail_header; 128 /* A pointer to a block-sized buffer holding the packed block data */ 129 struct packed_slab_journal_block *block; 130 131 /* The number of blocks in the on-disk journal */ 132 block_count_t size; 133 /* The number of blocks at which to start pushing reference blocks */ 134 block_count_t flushing_threshold; 135 /* The number of blocks at which all reference blocks should be writing */ 136 block_count_t flushing_deadline; 137 /* The number of blocks at which to wait for reference blocks to write */ 138 block_count_t blocking_threshold; 139 /* The number of blocks at which to scrub the slab before coming online */ 140 block_count_t scrubbing_threshold; 141 142 /* This list entry is for block_allocator to keep a queue of dirty journals */ 143 struct list_head dirty_entry; 144 145 /* The lock for the oldest unreaped block of the journal */ 146 struct journal_lock *reap_lock; 147 /* The locks for each on disk block */ 148 struct journal_lock *locks; 149 }; 150 151 /* 152 * Reference_block structure 153 * 154 * Blocks are used as a proxy, permitting saves of partial refcounts. 155 */ 156 struct reference_block { 157 /* This block waits on the ref_counts to tell it to write */ 158 struct vdo_waiter waiter; 159 /* The slab to which this reference_block belongs */ 160 struct vdo_slab *slab; 161 /* The number of references in this block that represent allocations */ 162 block_size_t allocated_count; 163 /* The slab journal block on which this block must hold a lock */ 164 sequence_number_t slab_journal_lock; 165 /* The slab journal block which should be released when this block is committed */ 166 sequence_number_t slab_journal_lock_to_release; 167 /* The point up to which each sector is accurate on disk */ 168 struct journal_point commit_points[VDO_SECTORS_PER_BLOCK]; 169 /* Whether this block has been modified since it was written to disk */ 170 bool is_dirty; 171 /* Whether this block is currently writing */ 172 bool is_writing; 173 }; 174 175 /* The search_cursor represents the saved position of a free block search. */ 176 struct search_cursor { 177 /* The reference block containing the current search index */ 178 struct reference_block *block; 179 /* The position at which to start searching for the next free counter */ 180 slab_block_number index; 181 /* The position just past the last valid counter in the current block */ 182 slab_block_number end_index; 183 184 /* A pointer to the first reference block in the slab */ 185 struct reference_block *first_block; 186 /* A pointer to the last reference block in the slab */ 187 struct reference_block *last_block; 188 }; 189 190 enum slab_rebuild_status { 191 VDO_SLAB_REBUILT, 192 VDO_SLAB_REPLAYING, 193 VDO_SLAB_REQUIRES_SCRUBBING, 194 VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING, 195 VDO_SLAB_REBUILDING, 196 }; 197 198 /* 199 * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of 200 * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks for 201 * metadata storage for the reference counts and slab journal for the slab. 202 * 203 * A reference count is maintained for each physical block number. The vast majority of blocks have 204 * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS 205 * (254) the reference count is stored in counters[pbn]. 206 */ 207 struct vdo_slab { 208 /* A list entry to queue this slab in a block_allocator list */ 209 struct list_head allocq_entry; 210 211 /* The struct block_allocator that owns this slab */ 212 struct block_allocator *allocator; 213 214 /* The journal for this slab */ 215 struct slab_journal journal; 216 217 /* The slab number of this slab */ 218 slab_count_t slab_number; 219 /* The offset in the allocator partition of the first block in this slab */ 220 physical_block_number_t start; 221 /* The offset of the first block past the end of this slab */ 222 physical_block_number_t end; 223 /* The starting translated PBN of the slab journal */ 224 physical_block_number_t journal_origin; 225 /* The starting translated PBN of the reference counts */ 226 physical_block_number_t ref_counts_origin; 227 228 /* The administrative state of the slab */ 229 struct admin_state state; 230 /* The status of the slab */ 231 enum slab_rebuild_status status; 232 /* Whether the slab was ever queued for scrubbing */ 233 bool was_queued_for_scrubbing; 234 235 /* The priority at which this slab has been queued for allocation */ 236 u8 priority; 237 238 /* Fields beyond this point are the reference counts for the data blocks in this slab. */ 239 /* The size of the counters array */ 240 u32 block_count; 241 /* The number of free blocks */ 242 u32 free_blocks; 243 /* The array of reference counts */ 244 vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */ 245 246 /* The saved block pointer and array indexes for the free block search */ 247 struct search_cursor search_cursor; 248 249 /* A list of the dirty blocks waiting to be written out */ 250 struct vdo_wait_queue dirty_blocks; 251 /* The number of blocks which are currently writing */ 252 size_t active_count; 253 254 /* A waiter object for updating the slab summary */ 255 struct vdo_waiter summary_waiter; 256 257 /* The latest slab journal for which there has been a reference count update */ 258 struct journal_point slab_journal_point; 259 260 /* The number of reference count blocks */ 261 u32 reference_block_count; 262 /* reference count block array */ 263 struct reference_block *reference_blocks; 264 }; 265 266 enum block_allocator_drain_step { 267 VDO_DRAIN_ALLOCATOR_START, 268 VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER, 269 VDO_DRAIN_ALLOCATOR_STEP_SLABS, 270 VDO_DRAIN_ALLOCATOR_STEP_SUMMARY, 271 VDO_DRAIN_ALLOCATOR_STEP_FINISHED, 272 }; 273 274 struct slab_scrubber { 275 /* The queue of slabs to scrub first */ 276 struct list_head high_priority_slabs; 277 /* The queue of slabs to scrub once there are no high_priority_slabs */ 278 struct list_head slabs; 279 /* The queue of VIOs waiting for a slab to be scrubbed */ 280 struct vdo_wait_queue waiters; 281 282 /* 283 * The number of slabs that are unrecovered or being scrubbed. This field is modified by 284 * the physical zone thread, but is queried by other threads. 285 */ 286 slab_count_t slab_count; 287 288 /* The administrative state of the scrubber */ 289 struct admin_state admin_state; 290 /* Whether to only scrub high-priority slabs */ 291 bool high_priority_only; 292 /* The slab currently being scrubbed */ 293 struct vdo_slab *slab; 294 /* The vio for loading slab journal blocks */ 295 struct vio vio; 296 }; 297 298 /* A sub-structure for applying actions in parallel to all an allocator's slabs. */ 299 struct slab_actor { 300 /* The number of slabs performing a slab action */ 301 slab_count_t slab_action_count; 302 /* The method to call when a slab action has been completed by all slabs */ 303 vdo_action_fn callback; 304 }; 305 306 /* A slab_iterator is a structure for iterating over a set of slabs. */ 307 struct slab_iterator { 308 struct vdo_slab **slabs; 309 struct vdo_slab *next; 310 slab_count_t end; 311 slab_count_t stride; 312 }; 313 314 /* 315 * The slab_summary provides hints during load and recovery about the state of the slabs in order 316 * to avoid the need to read the slab journals in their entirety before a VDO can come online. 317 * 318 * The information in the summary for each slab includes the rough number of free blocks (which is 319 * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free 320 * space will be used on restart), and the location of the tail block of the slab's journal. 321 * 322 * The slab_summary has its own partition at the end of the volume which is sized to allow for a 323 * complete copy of the summary for each of up to 16 physical zones. 324 * 325 * During resize, the slab_summary moves its backing partition and is saved once moved; the 326 * slab_summary is not permitted to overwrite the previous recovery journal space. 327 * 328 * The slab_summary does not have its own version information, but relies on the VDO volume version 329 * number. 330 */ 331 332 /* 333 * A slab status is a very small structure for use in determining the ordering of slabs in the 334 * scrubbing process. 335 */ 336 struct slab_status { 337 slab_count_t slab_number; 338 bool is_clean; 339 u8 emptiness; 340 }; 341 342 struct slab_summary_block { 343 /* The block_allocator to which this block belongs */ 344 struct block_allocator *allocator; 345 /* The index of this block in its zone's summary */ 346 block_count_t index; 347 /* Whether this block has a write outstanding */ 348 bool writing; 349 /* Ring of updates waiting on the outstanding write */ 350 struct vdo_wait_queue current_update_waiters; 351 /* Ring of updates waiting on the next write */ 352 struct vdo_wait_queue next_update_waiters; 353 /* The active slab_summary_entry array for this block */ 354 struct slab_summary_entry *entries; 355 /* The vio used to write this block */ 356 struct vio vio; 357 /* The packed entries, one block long, backing the vio */ 358 char *outgoing_entries; 359 }; 360 361 /* 362 * The statistics for all the slab summary zones owned by this slab summary. These fields are all 363 * mutated only by their physical zone threads, but are read by other threads when gathering 364 * statistics for the entire depot. 365 */ 366 struct atomic_slab_summary_statistics { 367 /* Number of blocks written */ 368 atomic64_t blocks_written; 369 }; 370 371 struct block_allocator { 372 struct vdo_completion completion; 373 /* The slab depot for this allocator */ 374 struct slab_depot *depot; 375 /* The nonce of the VDO */ 376 nonce_t nonce; 377 /* The physical zone number of this allocator */ 378 zone_count_t zone_number; 379 /* The thread ID for this allocator's physical zone */ 380 thread_id_t thread_id; 381 /* The number of slabs in this allocator */ 382 slab_count_t slab_count; 383 /* The number of the last slab owned by this allocator */ 384 slab_count_t last_slab; 385 /* The reduced priority level used to preserve unopened slabs */ 386 unsigned int unopened_slab_priority; 387 /* The state of this allocator */ 388 struct admin_state state; 389 /* The actor for applying an action to all slabs */ 390 struct slab_actor slab_actor; 391 392 /* The slab from which blocks are currently being allocated */ 393 struct vdo_slab *open_slab; 394 /* A priority queue containing all slabs available for allocation */ 395 struct priority_table *prioritized_slabs; 396 /* The slab scrubber */ 397 struct slab_scrubber scrubber; 398 /* What phase of the close operation the allocator is to perform */ 399 enum block_allocator_drain_step drain_step; 400 401 /* 402 * These statistics are all mutated only by the physical zone thread, but are read by other 403 * threads when gathering statistics for the entire depot. 404 */ 405 /* 406 * The count of allocated blocks in this zone. Not in block_allocator_statistics for 407 * historical reasons. 408 */ 409 u64 allocated_blocks; 410 /* Statistics for this block allocator */ 411 struct block_allocator_statistics statistics; 412 /* Cumulative statistics for the slab journals in this zone */ 413 struct slab_journal_statistics slab_journal_statistics; 414 /* Cumulative statistics for the reference counters in this zone */ 415 struct ref_counts_statistics ref_counts_statistics; 416 417 /* 418 * This is the head of a queue of slab journals which have entries in their tail blocks 419 * which have not yet started to commit. When the recovery journal is under space pressure, 420 * slab journals which have uncommitted entries holding a lock on the recovery journal head 421 * are forced to commit their blocks early. This list is kept in order, with the tail 422 * containing the slab journal holding the most recent recovery journal lock. 423 */ 424 struct list_head dirty_slab_journals; 425 426 /* The vio pool for reading and writing block allocator metadata */ 427 struct vio_pool *vio_pool; 428 /* The dm_kcopyd client for erasing slab journals */ 429 struct dm_kcopyd_client *eraser; 430 /* Iterator over the slabs to be erased */ 431 struct slab_iterator slabs_to_erase; 432 433 /* The portion of the slab summary managed by this allocator */ 434 /* The state of the slab summary */ 435 struct admin_state summary_state; 436 /* The number of outstanding summary writes */ 437 block_count_t summary_write_count; 438 /* The array (owned by the blocks) of all entries */ 439 struct slab_summary_entry *summary_entries; 440 /* The array of slab_summary_blocks */ 441 struct slab_summary_block *summary_blocks; 442 }; 443 444 enum slab_depot_load_type { 445 VDO_SLAB_DEPOT_NORMAL_LOAD, 446 VDO_SLAB_DEPOT_RECOVERY_LOAD, 447 VDO_SLAB_DEPOT_REBUILD_LOAD 448 }; 449 450 struct slab_depot { 451 zone_count_t zone_count; 452 zone_count_t old_zone_count; 453 struct vdo *vdo; 454 struct slab_config slab_config; 455 struct action_manager *action_manager; 456 457 physical_block_number_t first_block; 458 physical_block_number_t last_block; 459 physical_block_number_t origin; 460 461 /* slab_size == (1 << slab_size_shift) */ 462 unsigned int slab_size_shift; 463 464 /* Determines how slabs should be queued during load */ 465 enum slab_depot_load_type load_type; 466 467 /* The state for notifying slab journals to release recovery journal */ 468 sequence_number_t active_release_request; 469 sequence_number_t new_release_request; 470 471 /* State variables for scrubbing complete handling */ 472 atomic_t zones_to_scrub; 473 474 /* Array of pointers to individually allocated slabs */ 475 struct vdo_slab **slabs; 476 /* The number of slabs currently allocated and stored in 'slabs' */ 477 slab_count_t slab_count; 478 479 /* Array of pointers to a larger set of slabs (used during resize) */ 480 struct vdo_slab **new_slabs; 481 /* The number of slabs currently allocated and stored in 'new_slabs' */ 482 slab_count_t new_slab_count; 483 /* The size that 'new_slabs' was allocated for */ 484 block_count_t new_size; 485 486 /* The last block before resize, for rollback */ 487 physical_block_number_t old_last_block; 488 /* The last block after resize, for resize */ 489 physical_block_number_t new_last_block; 490 491 /* The statistics for the slab summary */ 492 struct atomic_slab_summary_statistics summary_statistics; 493 /* The start of the slab summary partition */ 494 physical_block_number_t summary_origin; 495 /* The number of bits to shift to get a 7-bit fullness hint */ 496 unsigned int hint_shift; 497 /* The slab summary entries for all of the zones the partition can hold */ 498 struct slab_summary_entry *summary_entries; 499 500 /* The block allocators for this depot */ 501 struct block_allocator allocators[]; 502 }; 503 504 struct reference_updater; 505 506 bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab, 507 physical_block_number_t pbn, 508 enum journal_operation operation, 509 bool increment, 510 struct journal_point *recovery_point, 511 struct vdo_completion *parent); 512 513 int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot, 514 physical_block_number_t pbn, 515 enum journal_operation operation); 516 517 static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion) 518 { 519 vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION); 520 return container_of(completion, struct block_allocator, completion); 521 } 522 523 int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab, 524 physical_block_number_t pbn, 525 struct pbn_lock *lock); 526 527 int __must_check vdo_allocate_block(struct block_allocator *allocator, 528 physical_block_number_t *block_number_ptr); 529 530 int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator, 531 struct vdo_waiter *waiter); 532 533 void vdo_modify_reference_count(struct vdo_completion *completion, 534 struct reference_updater *updater); 535 536 int __must_check vdo_release_block_reference(struct block_allocator *allocator, 537 physical_block_number_t pbn); 538 539 void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion); 540 541 void vdo_dump_block_allocator(const struct block_allocator *allocator); 542 543 int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state, 544 struct vdo *vdo, 545 struct partition *summary_partition, 546 struct slab_depot **depot_ptr); 547 548 void vdo_free_slab_depot(struct slab_depot *depot); 549 550 struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot); 551 552 int __must_check vdo_allocate_reference_counters(struct slab_depot *depot); 553 554 struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot, 555 physical_block_number_t pbn); 556 557 u8 __must_check vdo_get_increment_limit(struct slab_depot *depot, 558 physical_block_number_t pbn); 559 560 bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot, 561 physical_block_number_t pbn); 562 563 block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot); 564 565 block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot); 566 567 void vdo_get_slab_depot_statistics(const struct slab_depot *depot, 568 struct vdo_statistics *stats); 569 570 void vdo_load_slab_depot(struct slab_depot *depot, 571 const struct admin_state_code *operation, 572 struct vdo_completion *parent, void *context); 573 574 void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot, 575 enum slab_depot_load_type load_type, 576 struct vdo_completion *parent); 577 578 void vdo_update_slab_depot_size(struct slab_depot *depot); 579 580 int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot, 581 const struct partition *partition); 582 583 void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent); 584 585 void vdo_abandon_new_slabs(struct slab_depot *depot); 586 587 void vdo_drain_slab_depot(struct slab_depot *depot, 588 const struct admin_state_code *operation, 589 struct vdo_completion *parent); 590 591 void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent); 592 593 void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot, 594 sequence_number_t recovery_block_number); 595 596 void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot, 597 struct vdo_completion *parent); 598 599 void vdo_dump_slab_depot(const struct slab_depot *depot); 600 601 #endif /* VDO_SLAB_DEPOT_H */ 602