/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_SLAB_DEPOT_H
#define VDO_SLAB_DEPOT_H

#include <linux/atomic.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "data-vio.h"
#include "encodings.h"
#include "physical-zone.h"
#include "priority-table.h"
#include "recovery-journal.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It
 * has a single array of slabs in order to eliminate the need for additional math to compute
 * which physical zone a PBN is in. It also has a block_allocator per zone.
 *
 * Each physical zone has a single dedicated queue and thread for performing all updates to the
 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow
 * the code to omit more fine-grained locking for the various slab structures. Each physical
 * zone maintains a separate copy of the slab summary to remove the need for explicit locking
 * on that structure as well.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as
 * allocations and reference count updates, must be performed on the appropriate physical zone
 * thread. Requests from the recovery journal to commit slab journal tail blocks must be
 * scheduled from the recovery journal thread to run on the appropriate physical zone thread.
 * Save operations must be launched from the same admin thread as the original load operation.
 */
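
/*
 * An illustrative sketch, not upstream code: because the depot keeps a single flat slab array
 * and slab_size is a power of two (see slab_size_shift in struct slab_depot below), mapping a
 * data PBN to its slab and physical zone needs no more math than a subtraction and a shift:
 *
 *	slab_count_t slab_number = (pbn - depot->first_block) >> depot->slab_size_shift;
 *	struct vdo_slab *slab = depot->slabs[slab_number];
 *	zone_count_t zone = slab->allocator->zone_number;
 */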

enum {
	/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,

	/*
	 * The number of vios in the vio pool used for loading reference count data. A slab's
	 * refcount data is capped at ~8MB, and we process one slab at a time in a zone, so 9
	 * should be plenty.
	 */
	BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9,
};

/*
 * Represents the possible status of a block.
 */
enum reference_status {
	RS_FREE,        /* this block is free */
	RS_SINGLE,      /* this block is singly-referenced */
	RS_SHARED,      /* this block is shared */
	RS_PROVISIONAL  /* this block is provisionally allocated */
};

struct vdo_slab;

struct journal_lock {
	u16 count;
	sequence_number_t recovery_start;
};

struct slab_journal {
	/* A waiter object for getting a VIO pool entry */
	struct vdo_waiter resource_waiter;
	/* A waiter object for updating the slab summary */
	struct vdo_waiter slab_summary_waiter;
	/* A waiter object for getting a vio with which to flush */
	struct vdo_waiter flush_waiter;
	/* The queue of VIOs waiting to make an entry */
	struct vdo_wait_queue entry_waiters;
	/* The slab to which this journal belongs */
	struct vdo_slab *slab;

	/* Whether a tail block commit is pending */
	bool waiting_to_commit;
	/* Whether the journal is updating the slab summary */
	bool updating_slab_summary;
	/* Whether the journal is adding entries from the entry_waiters queue */
	bool adding_entries;
	/* Whether a partial write is in progress */
	bool partial_write_in_progress;

	/* The oldest block in the journal on disk */
	sequence_number_t head;
	/* The oldest block in the journal which may not be reaped */
	sequence_number_t unreapable;
	/* The end of the half-open interval of the active journal */
	sequence_number_t tail;
	/* The next journal block to be committed */
	sequence_number_t next_commit;
	/* The tail sequence number that is written in the slab summary */
	sequence_number_t summarized;
	/* The tail sequence number that was last summarized in the slab summary */
	sequence_number_t last_summarized;

	/* The sequence number of the recovery journal lock */
	sequence_number_t recovery_lock;

	/*
	 * The number of entries which fit in a single block. Can't use the constant because unit
	 * tests change this number.
	 */
	journal_entry_count_t entries_per_block;
	/*
	 * The number of full entries which fit in a single block. Can't use the constant because
	 * unit tests change this number.
	 */
	journal_entry_count_t full_entries_per_block;

	/* The recovery journal of the VDO (slab journal holds locks on it) */
	struct recovery_journal *recovery_journal;

	/* The statistics shared by all slab journals in our physical zone */
	struct slab_journal_statistics *events;
	/* A list of the VIO pool entries for outstanding journal block writes */
	struct list_head uncommitted_blocks;

	/*
	 * The current tail block header state. This will be packed into the block just before it
	 * is written.
	 */
	struct slab_journal_block_header tail_header;
	/* A pointer to a block-sized buffer holding the packed block data */
	struct packed_slab_journal_block *block;

	/* The number of blocks in the on-disk journal */
	block_count_t size;
	/* The number of blocks at which to start pushing reference blocks */
	block_count_t flushing_threshold;
	/* The number of blocks at which all reference blocks should be writing */
	block_count_t flushing_deadline;
	/* The number of blocks at which to wait for reference blocks to write */
	block_count_t blocking_threshold;
	/* The number of blocks at which to scrub the slab before coming online */
	block_count_t scrubbing_threshold;

	/* This list entry is for block_allocator to keep a queue of dirty journals */
	struct list_head dirty_entry;

	/* The lock for the oldest unreaped block of the journal */
	struct journal_lock *reap_lock;
	/* The locks for each on-disk block */
	struct journal_lock *locks;
};
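
/*
 * A minimal sketch of how the journal thresholds above relate; this is an assumption drawn
 * from the field comments, not a quote of the implementation. Fullness is the length of the
 * on-disk journal:
 *
 *	block_count_t journal_length = journal->tail - journal->head;
 *
 * The thresholds are expected to satisfy
 * flushing_threshold <= flushing_deadline <= blocking_threshold, so that reference block
 * writes begin, then become urgent, before new entries are ever forced to wait.
 */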

/*
 * Reference_block structure
 *
 * Blocks are used as a proxy, permitting saves of partial refcounts.
 */
struct reference_block {
	/* This block waits on the ref_counts to tell it to write */
	struct vdo_waiter waiter;
	/* The slab to which this reference_block belongs */
	struct vdo_slab *slab;
	/* The number of references in this block that represent allocations */
	block_size_t allocated_count;
	/* The slab journal block on which this block must hold a lock */
	sequence_number_t slab_journal_lock;
	/* The slab journal block which should be released when this block is committed */
	sequence_number_t slab_journal_lock_to_release;
	/* The point up to which each sector is accurate on disk */
	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
	/* Whether this block has been modified since it was written to disk */
	bool is_dirty;
	/* Whether this block is currently writing */
	bool is_writing;
};
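
/*
 * An illustrative sketch: since each reference_block acts as a proxy for a fixed-size run of
 * counters, the block covering a given slab block number can be found by simple division.
 * COUNTS_PER_BLOCK is a hypothetical stand-in for however many vdo_refcount_t counters fit in
 * one on-disk block; the real constant lives in the implementation.
 *
 *	struct reference_block *block =
 *		&slab->reference_blocks[slab_block_number / COUNTS_PER_BLOCK];
 */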

/* The search_cursor represents the saved position of a free block search. */
struct search_cursor {
	/* The reference block containing the current search index */
	struct reference_block *block;
	/* The position at which to start searching for the next free counter */
	slab_block_number index;
	/* The position just past the last valid counter in the current block */
	slab_block_number end_index;

	/* A pointer to the first reference block in the slab */
	struct reference_block *first_block;
	/* A pointer to the last reference block in the slab */
	struct reference_block *last_block;
};

enum slab_rebuild_status {
	VDO_SLAB_REBUILT,
	VDO_SLAB_REPLAYING,
	VDO_SLAB_REQUIRES_SCRUBBING,
	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
	VDO_SLAB_REBUILDING,
};

/*
 * A vdo_slab currently consists of a run of 2^23 data blocks, but that will soon change to
 * dedicate a small number of those blocks for metadata storage for the reference counts and
 * slab journal for the slab.
 *
 * A reference count is maintained for each physical block number. The vast majority of blocks
 * have a very small reference count (usually 0 or 1). For references less than or equal to
 * MAXIMUM_REFS (254), the count is stored in the counters array, indexed by the block's offset
 * within its slab.
 */
struct vdo_slab {
	/* A list entry to queue this slab in a block_allocator list */
	struct list_head allocq_entry;

	/* The struct block_allocator that owns this slab */
	struct block_allocator *allocator;

	/* The journal for this slab */
	struct slab_journal journal;

	/* The slab number of this slab */
	slab_count_t slab_number;
	/* The offset in the allocator partition of the first block in this slab */
	physical_block_number_t start;
	/* The offset of the first block past the end of this slab */
	physical_block_number_t end;
	/* The starting translated PBN of the slab journal */
	physical_block_number_t journal_origin;
	/* The starting translated PBN of the reference counts */
	physical_block_number_t ref_counts_origin;

	/* The administrative state of the slab */
	struct admin_state state;
	/* The status of the slab */
	enum slab_rebuild_status status;
	/* Whether the slab was ever queued for scrubbing */
	bool was_queued_for_scrubbing;

	/* The priority at which this slab has been queued for allocation */
	u8 priority;

	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
	/* The size of the counters array */
	u32 block_count;
	/* The number of free blocks */
	u32 free_blocks;
	/* The array of reference counts */
	vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */

	/* The saved block pointer and array indexes for the free block search */
	struct search_cursor search_cursor;

	/* A list of the dirty blocks waiting to be written out */
	struct vdo_wait_queue dirty_blocks;
	/* The number of blocks which are currently reading or writing */
	size_t active_count;

	/* A waiter object for updating the slab summary */
	struct vdo_waiter summary_waiter;

	/* The latest slab journal for which there has been a reference count update */
	struct journal_point slab_journal_point;

	/* The number of reference count blocks */
	u32 reference_block_count;
	/* The array of reference count blocks */
	struct reference_block *reference_blocks;
};
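
/*
 * An illustrative sketch, not upstream code: counters is indexed by a block's offset within
 * its slab rather than by absolute PBN, so a reference count lookup is:
 *
 *	slab_block_number sbn = pbn - slab->start;
 *	vdo_refcount_t refs = slab->counters[sbn];
 */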

enum block_allocator_drain_step {
	VDO_DRAIN_ALLOCATOR_START,
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
};

struct slab_scrubber {
	/* The queue of slabs to scrub first */
	struct list_head high_priority_slabs;
	/* The queue of slabs to scrub once there are no high_priority_slabs */
	struct list_head slabs;
	/* The queue of VIOs waiting for a slab to be scrubbed */
	struct vdo_wait_queue waiters;

	/*
	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
	 * the physical zone thread, but is queried by other threads.
	 */
	slab_count_t slab_count;

	/* The administrative state of the scrubber */
	struct admin_state admin_state;
	/* Whether to only scrub high-priority slabs */
	bool high_priority_only;
	/* The slab currently being scrubbed */
	struct vdo_slab *slab;
	/* The vio for loading slab journal blocks */
	struct vio vio;
};

/* A sub-structure for applying actions in parallel to all of an allocator's slabs. */
struct slab_actor {
	/* The number of slabs performing a slab action */
	slab_count_t slab_action_count;
	/* The method to call when a slab action has been completed by all slabs */
	vdo_action_fn callback;
};

/* A slab_iterator is a structure for iterating over a set of slabs. */
struct slab_iterator {
	struct vdo_slab **slabs;
	struct vdo_slab *next;
	slab_count_t end;
	slab_count_t stride;
};
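
/*
 * A sketch of how these fields can be walked; this mirrors what the implementation is assumed
 * to do, since slabs are striped across the physical zones and an allocator visits only its
 * own slabs by starting at its highest-numbered slab and stepping down by stride:
 *
 *	static struct vdo_slab *next_slab(struct slab_iterator *iterator)
 *	{
 *		struct vdo_slab *slab = iterator->next;
 *
 *		if ((slab == NULL) || (slab->slab_number < iterator->end + iterator->stride))
 *			iterator->next = NULL;
 *		else
 *			iterator->next = iterator->slabs[slab->slab_number - iterator->stride];
 *		return slab;
 *	}
 */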

/*
 * The slab_summary provides hints during load and recovery about the state of the slabs in
 * order to avoid the need to read the slab journals in their entirety before a VDO can come
 * online.
 *
 * The information in the summary for each slab includes the rough number of free blocks (which
 * is used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing
 * free space will be used on restart), and the location of the tail block of the slab's
 * journal.
 *
 * The slab_summary has its own partition at the end of the volume which is sized to allow for
 * a complete copy of the summary for each of up to 16 physical zones.
 *
 * During resize, the slab_summary moves its backing partition and is saved once moved; the
 * slab_summary is not permitted to overwrite the previous recovery journal space.
 *
 * The slab_summary does not have its own version information, but relies on the VDO volume
 * version number.
 */
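
/*
 * An illustrative sketch of the fullness hint mentioned above (see hint_shift in struct
 * slab_depot below); assumed, not a quote of the implementation. The free block count is
 * compressed to 7 bits by shifting, taking care not to round a partially-free slab down to
 * "completely full":
 *
 *	u8 hint = free_blocks >> depot->hint_shift;
 *
 *	if ((hint == 0) && (free_blocks > 0))
 *		hint = 1; // so a partially-free slab is never reported as full
 */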

/*
 * A slab status is a very small structure for use in determining the ordering of slabs in the
 * scrubbing process.
 */
struct slab_status {
	slab_count_t slab_number;
	bool is_clean;
	u8 emptiness;
};

struct slab_summary_block {
	/* The block_allocator to which this block belongs */
	struct block_allocator *allocator;
	/* The index of this block in its zone's summary */
	block_count_t index;
	/* Whether this block has a write outstanding */
	bool writing;
	/* Ring of updates waiting on the outstanding write */
	struct vdo_wait_queue current_update_waiters;
	/* Ring of updates waiting on the next write */
	struct vdo_wait_queue next_update_waiters;
	/* The active slab_summary_entry array for this block */
	struct slab_summary_entry *entries;
	/* The vio used to write this block */
	struct vio vio;
	/* The packed entries, one block long, backing the vio */
	char *outgoing_entries;
};

/*
 * The statistics for all the slab summary zones owned by this slab summary. These fields are
 * all mutated only by their physical zone threads, but are read by other threads when
 * gathering statistics for the entire depot.
 */
struct atomic_slab_summary_statistics {
	/* Number of blocks written */
	atomic64_t blocks_written;
};

struct block_allocator {
	struct vdo_completion completion;
	/* The slab depot for this allocator */
	struct slab_depot *depot;
	/* The nonce of the VDO */
	nonce_t nonce;
	/* The physical zone number of this allocator */
	zone_count_t zone_number;
	/* The thread ID for this allocator's physical zone */
	thread_id_t thread_id;
	/* The number of slabs in this allocator */
	slab_count_t slab_count;
	/* The number of the last slab owned by this allocator */
	slab_count_t last_slab;
	/* The reduced priority level used to preserve unopened slabs */
	unsigned int unopened_slab_priority;
	/* The state of this allocator */
	struct admin_state state;
	/* The actor for applying an action to all slabs */
	struct slab_actor slab_actor;

	/* The slab from which blocks are currently being allocated */
	struct vdo_slab *open_slab;
	/* A priority queue containing all slabs available for allocation */
	struct priority_table *prioritized_slabs;
	/* The slab scrubber */
	struct slab_scrubber scrubber;
	/* What phase of the close operation the allocator is to perform */
	enum block_allocator_drain_step drain_step;

	/*
	 * These statistics are all mutated only by the physical zone thread, but are read by
	 * other threads when gathering statistics for the entire depot.
	 */
	/*
	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
	 * historical reasons.
	 */
	u64 allocated_blocks;
	/* Statistics for this block allocator */
	struct block_allocator_statistics statistics;
	/* Cumulative statistics for the slab journals in this zone */
	struct slab_journal_statistics slab_journal_statistics;
	/* Cumulative statistics for the reference counters in this zone */
	struct ref_counts_statistics ref_counts_statistics;

	/*
	 * This is the head of a queue of slab journals which have entries in their tail blocks
	 * which have not yet started to commit. When the recovery journal is under space
	 * pressure, slab journals which have uncommitted entries holding a lock on the recovery
	 * journal head are forced to commit their blocks early. This list is kept in order, with
	 * the tail containing the slab journal holding the most recent recovery journal lock
	 * (see the sketch after this struct).
	 */
	struct list_head dirty_slab_journals;

	/* The vio pool for reading and writing block allocator metadata */
	struct vio_pool *vio_pool;
	/* The vio pool for large initial reads of ref count areas */
	struct vio_pool *refcount_big_vio_pool;
	/* How many ref count blocks are read per vio at initial load */
	u32 refcount_blocks_per_big_vio;
	/* The dm_kcopyd client for erasing slab journals */
	struct dm_kcopyd_client *eraser;
	/* Iterator over the slabs to be erased */
	struct slab_iterator slabs_to_erase;

	/* The portion of the slab summary managed by this allocator */
	/* The state of the slab summary */
	struct admin_state summary_state;
	/* The number of outstanding summary writes */
	block_count_t summary_write_count;
	/* The array (owned by the blocks) of all entries */
	struct slab_summary_entry *summary_entries;
	/* The array of slab_summary_blocks */
	struct slab_summary_block *summary_blocks;
};
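
/*
 * A minimal sketch of how the dirty_slab_journals list above is expected to be drained under
 * recovery journal space pressure; assumed, since the real logic lives in the implementation.
 * Walking from the head, which holds the oldest recovery journal lock, stop at the first
 * journal whose lock is newer than the recovery journal block being released:
 *
 *	struct slab_journal *journal, *tmp;
 *
 *	list_for_each_entry_safe(journal, tmp, &allocator->dirty_slab_journals, dirty_entry) {
 *		if (journal->recovery_lock > recovery_block_number)
 *			break;
 *		// otherwise, force this journal to commit its tail block
 *	}
 */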

enum slab_depot_load_type {
	VDO_SLAB_DEPOT_NORMAL_LOAD,
	VDO_SLAB_DEPOT_RECOVERY_LOAD,
	VDO_SLAB_DEPOT_REBUILD_LOAD
};

struct slab_depot {
	zone_count_t zone_count;
	zone_count_t old_zone_count;
	struct vdo *vdo;
	struct slab_config slab_config;
	struct action_manager *action_manager;

	physical_block_number_t first_block;
	physical_block_number_t last_block;
	physical_block_number_t origin;

	/* slab_size == (1 << slab_size_shift) */
	unsigned int slab_size_shift;

	/* Determines how slabs should be queued during load */
	enum slab_depot_load_type load_type;

	/* The state for notifying slab journals to release recovery journal locks */
	sequence_number_t active_release_request;
	sequence_number_t new_release_request;

	/* State variables for scrubbing complete handling */
	atomic_t zones_to_scrub;

	/* Array of pointers to individually allocated slabs */
	struct vdo_slab **slabs;
	/* The number of slabs currently allocated and stored in 'slabs' */
	slab_count_t slab_count;

	/* Array of pointers to a larger set of slabs (used during resize) */
	struct vdo_slab **new_slabs;
	/* The number of slabs currently allocated and stored in 'new_slabs' */
	slab_count_t new_slab_count;
	/* The size that 'new_slabs' was allocated for */
	block_count_t new_size;

	/* The last block before resize, for rollback */
	physical_block_number_t old_last_block;
	/* The last block after resize, for resize */
	physical_block_number_t new_last_block;

	/* The statistics for the slab summary */
	struct atomic_slab_summary_statistics summary_statistics;
	/* The start of the slab summary partition */
	physical_block_number_t summary_origin;
	/* The number of bits to shift to get a 7-bit fullness hint */
	unsigned int hint_shift;
	/* The slab summary entries for all of the zones the partition can hold */
	struct slab_summary_entry *summary_entries;

	/* The block allocators for this depot */
	struct block_allocator allocators[];
};

struct reference_updater;

bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
					       physical_block_number_t pbn,
					       enum journal_operation operation,
					       bool increment,
					       struct journal_point *recovery_point,
					       struct vdo_completion *parent);

int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
							physical_block_number_t pbn,
							enum journal_operation operation);

static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
	return container_of(completion, struct block_allocator, completion);
}

int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   struct pbn_lock *lock);

int __must_check vdo_allocate_block(struct block_allocator *allocator,
				    physical_block_number_t *block_number_ptr);

int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter);

void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater);

int __must_check vdo_release_block_reference(struct block_allocator *allocator,
					     physical_block_number_t pbn);

void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);

void vdo_dump_block_allocator(const struct block_allocator *allocator);

int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
				       struct vdo *vdo,
				       struct partition *summary_partition,
				       struct slab_depot **depot_ptr);

void vdo_free_slab_depot(struct slab_depot *depot);

struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);

int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);

struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
					    physical_block_number_t pbn);

u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
					physical_block_number_t pbn);

bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
					     physical_block_number_t pbn);

block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);

block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);

void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats);

void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context);

void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent);

void vdo_update_slab_depot_size(struct slab_depot *depot);

int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
						const struct partition *partition);

void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_abandon_new_slabs(struct slab_depot *depot);

void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent);

void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number);

void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent);

void vdo_dump_slab_depot(const struct slab_depot *depot);

#endif /* VDO_SLAB_DEPOT_H */