/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Red Hat
 */

#ifndef VDO_SLAB_DEPOT_H
#define VDO_SLAB_DEPOT_H

#include <linux/atomic.h>
#include <linux/dm-kcopyd.h>
#include <linux/list.h>

#include "numeric.h"

#include "admin-state.h"
#include "completion.h"
#include "data-vio.h"
#include "encodings.h"
#include "physical-zone.h"
#include "priority-table.h"
#include "recovery-journal.h"
#include "statistics.h"
#include "types.h"
#include "vio.h"
#include "wait-queue.h"

/*
 * A slab_depot is responsible for managing all of the slabs and block allocators of a VDO. It has
 * a single array of slabs in order to eliminate the need for additional math to compute which
 * physical zone a PBN is in. It also has a block_allocator per zone.
 *
 * Each physical zone has a single dedicated queue and thread for performing all updates to the
 * slabs assigned to that zone. The concurrency guarantees of this single-threaded model allow the
 * code to omit more fine-grained locking for the various slab structures. Each physical zone
 * maintains a separate copy of the slab summary to remove the need for explicit locking on that
 * structure as well.
 *
 * Load operations must be performed on the admin thread. Normal operations, such as allocations
 * and reference count updates, must be performed on the appropriate physical zone thread. Requests
 * from the recovery journal to commit slab journal tail blocks must be scheduled from the recovery
 * journal thread to run on the appropriate physical zone thread. Save operations must be launched
 * from the same admin thread as the original load operation.
 */
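
/*
 * A minimal sketch (illustrative, not part of the driver): because the depot keeps one flat
 * array of slabs, mapping a PBN to its slab number takes only a subtraction and a shift. The
 * parameters mirror the first_block and slab_size_shift fields of struct slab_depot below;
 * the driver's real lookup is vdo_get_slab(), declared at the end of this header.
 */
static inline slab_count_t sketch_pbn_to_slab_number(physical_block_number_t pbn,
						     physical_block_number_t first_block,
						     unsigned int slab_size_shift)
{
	return (pbn - first_block) >> slab_size_shift;
}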

enum {
	/* The number of vios in the vio pool is proportional to the throughput of the VDO. */
	BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128,

	/*
	 * The number of vios in the vio pool used for loading reference count data. A slab's
	 * reference count data is capped at ~8MB, and we process one slab at a time in a zone,
	 * so 9 vios should be plenty.
	 */
	BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9,
};

/*
 * Represents the possible status of a block.
 */
enum reference_status {
	RS_FREE, /* this block is free */
	RS_SINGLE, /* this block is singly-referenced */
	RS_SHARED, /* this block is shared */
	RS_PROVISIONAL /* this block is provisionally allocated */
};
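
/*
 * A minimal sketch (assumption, not driver code) of how a raw vdo_refcount_t value could map
 * onto the statuses above. The specific encodings (0 for a free block, 255 for a provisional
 * reference) are assumptions for illustration; the driver defines named constants for them in
 * its implementation.
 */
static inline enum reference_status sketch_reference_status(vdo_refcount_t count)
{
	if (count == 0)
		return RS_FREE;
	if (count == 255)
		return RS_PROVISIONAL;
	return (count == 1) ? RS_SINGLE : RS_SHARED;
}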

struct vdo_slab;

struct journal_lock {
	u16 count;
	sequence_number_t recovery_start;
};

struct slab_journal {
	/* A waiter object for getting a VIO pool entry */
	struct vdo_waiter resource_waiter;
	/* A waiter object for updating the slab summary */
	struct vdo_waiter slab_summary_waiter;
	/* A waiter object for getting a vio with which to flush */
	struct vdo_waiter flush_waiter;
	/* The queue of VIOs waiting to make an entry */
	struct vdo_wait_queue entry_waiters;
	/* The parent slab reference of this journal */
	struct vdo_slab *slab;

	/* Whether a tail block commit is pending */
	bool waiting_to_commit;
	/* Whether the journal is updating the slab summary */
	bool updating_slab_summary;
	/* Whether the journal is adding entries from the entry_waiters queue */
	bool adding_entries;
	/* Whether a partial write is in progress */
	bool partial_write_in_progress;

	/* The oldest block in the journal on disk */
	sequence_number_t head;
	/* The oldest block in the journal which may not be reaped */
	sequence_number_t unreapable;
	/* The end of the half-open interval of the active journal */
	sequence_number_t tail;
	/* The next journal block to be committed */
	sequence_number_t next_commit;
	/* The tail sequence number that is written in the slab summary */
	sequence_number_t summarized;
	/* The tail sequence number that was last summarized in the slab summary */
	sequence_number_t last_summarized;

	/* The sequence number of the recovery journal lock */
	sequence_number_t recovery_lock;

	/*
	 * The number of entries which fit in a single block. Can't use the constant because unit
	 * tests change this number.
	 */
	journal_entry_count_t entries_per_block;
	/*
	 * The number of full entries which fit in a single block. Can't use the constant because
	 * unit tests change this number.
	 */
	journal_entry_count_t full_entries_per_block;

	/* The recovery journal of the VDO (slab journal holds locks on it) */
	struct recovery_journal *recovery_journal;

	/* The statistics shared by all slab journals in our physical zone */
	struct slab_journal_statistics *events;
	/* A list of the VIO pool entries for outstanding journal block writes */
	struct list_head uncommitted_blocks;

	/*
	 * The current tail block header state. This will be packed into the block just before it
	 * is written.
	 */
	struct slab_journal_block_header tail_header;
	/* A pointer to a block-sized buffer holding the packed block data */
	struct packed_slab_journal_block *block;

	/* The number of blocks in the on-disk journal */
	block_count_t size;
	/* The number of blocks at which to start pushing reference blocks */
	block_count_t flushing_threshold;
	/* The number of blocks at which all reference blocks should be writing */
	block_count_t flushing_deadline;
	/* The number of blocks at which to wait for reference blocks to write */
	block_count_t blocking_threshold;
	/* The number of blocks at which to scrub the slab before coming online */
	block_count_t scrubbing_threshold;

	/* This list entry is for block_allocator to keep a queue of dirty journals */
	struct list_head dirty_entry;

	/* The lock for the oldest unreaped block of the journal */
	struct journal_lock *reap_lock;
	/* The locks for each on-disk block */
	struct journal_lock *locks;
};
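
/*
 * A minimal sketch (not part of the driver): the active portion of a slab journal is the
 * half-open interval [head, tail), so the pressure decisions driven by flushing_threshold,
 * flushing_deadline, and blocking_threshold above compare against this length.
 */
static inline block_count_t sketch_slab_journal_length(const struct slab_journal *journal)
{
	return journal->tail - journal->head;
}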

/*
 * Reference_block structure
 *
 * Blocks are used as a proxy, permitting saves of partial refcounts.
 */
struct reference_block {
	/* This block waits on the ref_counts to tell it to write */
	struct vdo_waiter waiter;
	/* The slab to which this reference_block belongs */
	struct vdo_slab *slab;
	/* The number of references in this block that represent allocations */
	block_size_t allocated_count;
	/* The slab journal block on which this block must hold a lock */
	sequence_number_t slab_journal_lock;
	/* The slab journal block which should be released when this block is committed */
	sequence_number_t slab_journal_lock_to_release;
	/* The point up to which each sector is accurate on disk */
	struct journal_point commit_points[VDO_SECTORS_PER_BLOCK];
	/* Whether this block has been modified since it was written to disk */
	bool is_dirty;
	/* Whether this block is currently writing */
	bool is_writing;
};

/* The search_cursor represents the saved position of a free block search. */
struct search_cursor {
	/* The reference block containing the current search index */
	struct reference_block *block;
	/* The position at which to start searching for the next free counter */
	slab_block_number index;
	/* The position just past the last valid counter in the current block */
	slab_block_number end_index;

	/* A pointer to the first reference block in the slab */
	struct reference_block *first_block;
	/* A pointer to the last reference block in the slab */
	struct reference_block *last_block;
};
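
/*
 * A minimal sketch (assumption): scanning the range covered by a search_cursor for a free
 * (zero) reference count. The counters array is passed in because struct vdo_slab is declared
 * further down; the driver's real search also handles wrapping and restarting the cursor.
 */
static inline bool sketch_search_range(const vdo_refcount_t *counters,
				       const struct search_cursor *cursor,
				       slab_block_number *free_index_ptr)
{
	slab_block_number i;

	for (i = cursor->index; i < cursor->end_index; i++) {
		if (counters[i] == 0) {
			*free_index_ptr = i;
			return true;
		}
	}

	return false;
}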

enum slab_rebuild_status {
	VDO_SLAB_REBUILT,
	VDO_SLAB_REPLAYING,
	VDO_SLAB_REQUIRES_SCRUBBING,
	VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING,
	VDO_SLAB_REBUILDING,
};

/*
 * This is the type declaration for the vdo_slab type. A vdo_slab currently consists of a run of
 * 2^23 data blocks, but that will soon change to dedicate a small number of those blocks to
 * metadata storage for the slab's reference counts and slab journal.
 *
 * A reference count is maintained for each physical block number. The vast majority of blocks have
 * a very small reference count (usually 0 or 1). For references less than or equal to MAXIMUM_REFS
 * (254) the reference count is stored in counters[pbn].
 */
struct vdo_slab {
	/* A list entry to queue this slab in a block_allocator list */
	struct list_head allocq_entry;

	/* The struct block_allocator that owns this slab */
	struct block_allocator *allocator;

	/* The journal for this slab */
	struct slab_journal journal;

	/* The slab number of this slab */
	slab_count_t slab_number;
	/* The offset in the allocator partition of the first block in this slab */
	physical_block_number_t start;
	/* The offset of the first block past the end of this slab */
	physical_block_number_t end;
	/* The starting translated PBN of the slab journal */
	physical_block_number_t journal_origin;
	/* The starting translated PBN of the reference counts */
	physical_block_number_t ref_counts_origin;

	/* The administrative state of the slab */
	struct admin_state state;
	/* The status of the slab */
	enum slab_rebuild_status status;
	/* Whether the slab was ever queued for scrubbing */
	bool was_queued_for_scrubbing;

	/* The priority at which this slab has been queued for allocation */
	u8 priority;

	/* Fields beyond this point are the reference counts for the data blocks in this slab. */
	/* The size of the counters array */
	u32 block_count;
	/* The number of free blocks */
	u32 free_blocks;
	/* The array of reference counts */
	vdo_refcount_t *counters; /* use vdo_allocate() to align data ptr */

	/* The saved block pointer and array indexes for the free block search */
	struct search_cursor search_cursor;

	/* A list of the dirty blocks waiting to be written out */
	struct vdo_wait_queue dirty_blocks;
	/* The number of blocks which are currently reading or writing */
	size_t active_count;

	/* A waiter object for updating the slab summary */
	struct vdo_waiter summary_waiter;

	/* The latest slab journal for which there has been a reference count update */
	struct journal_point slab_journal_point;

	/* The number of reference count blocks */
	u32 reference_block_count;
	/* reference count block array */
	struct reference_block *reference_blocks;
};
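
/*
 * A minimal sketch (assumption, not the driver's update path): taking the first reference to
 * a free block touches both the per-block counter and the slab's free count. The real driver
 * additionally makes a slab journal entry and marks the owning reference_block dirty.
 */
static inline void sketch_take_first_reference(struct vdo_slab *slab,
					       slab_block_number block)
{
	slab->counters[block] = 1;
	slab->free_blocks--;
}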

enum block_allocator_drain_step {
	VDO_DRAIN_ALLOCATOR_START,
	VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER,
	VDO_DRAIN_ALLOCATOR_STEP_SLABS,
	VDO_DRAIN_ALLOCATOR_STEP_SUMMARY,
	VDO_DRAIN_ALLOCATOR_STEP_FINISHED,
};

struct slab_scrubber {
	/* The queue of slabs to scrub first */
	struct list_head high_priority_slabs;
	/* The queue of slabs to scrub once there are no high_priority_slabs */
	struct list_head slabs;
	/* The queue of VIOs waiting for a slab to be scrubbed */
	struct vdo_wait_queue waiters;

	/*
	 * The number of slabs that are unrecovered or being scrubbed. This field is modified by
	 * the physical zone thread, but is queried by other threads.
	 */
	slab_count_t slab_count;

	/* The administrative state of the scrubber */
	struct admin_state admin_state;
	/* Whether to only scrub high-priority slabs */
	bool high_priority_only;
	/* The slab currently being scrubbed */
	struct vdo_slab *slab;
	/* The vio for loading slab journal blocks */
	struct vio vio;
};
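
/*
 * A minimal sketch (assumption): the scrubber drains high_priority_slabs before falling back
 * to the regular queue, so selecting the next queue to service is just an emptiness check.
 */
static inline struct list_head *sketch_scrub_queue(struct slab_scrubber *scrubber)
{
	return (list_empty(&scrubber->high_priority_slabs)
		? &scrubber->slabs
		: &scrubber->high_priority_slabs);
}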

/* A sub-structure for applying actions in parallel to all of an allocator's slabs. */
struct slab_actor {
	/* The number of slabs performing a slab action */
	slab_count_t slab_action_count;
	/* The method to call when a slab action has been completed by all slabs */
	vdo_action_fn callback;
};

/* A slab_iterator is a structure for iterating over a set of slabs. */
struct slab_iterator {
	struct vdo_slab **slabs;
	struct vdo_slab *next;
	slab_count_t end;
	slab_count_t stride;
};
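
/*
 * A minimal sketch (assumption) of advancing a slab_iterator: each allocator owns every
 * 'stride'-th slab, counting down from the highest slab number, so stepping the iterator
 * subtracts the stride until the iteration passes 'end'.
 */
static inline struct vdo_slab *sketch_next_slab(struct slab_iterator *iterator)
{
	struct vdo_slab *slab = iterator->next;

	if ((slab == NULL) || (slab->slab_number < iterator->end + iterator->stride))
		iterator->next = NULL;
	else
		iterator->next = iterator->slabs[slab->slab_number - iterator->stride];

	return slab;
}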

/*
 * The slab_summary provides hints during load and recovery about the state of the slabs in order
 * to avoid the need to read the slab journals in their entirety before a VDO can come online.
 *
 * The information in the summary for each slab includes the rough number of free blocks (which is
 * used to prioritize scrubbing), the cleanliness of a slab (so that clean slabs containing free
 * space will be used on restart), and the location of the tail block of the slab's journal.
 *
 * The slab_summary has its own partition at the end of the volume which is sized to allow for a
 * complete copy of the summary for each of up to 16 physical zones.
 *
 * During resize, the slab_summary moves its backing partition and is saved once moved; the
 * slab_summary is not permitted to overwrite the previous recovery journal space.
 *
 * The slab_summary does not have its own version information, but relies on the VDO volume version
 * number.
 */
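
/*
 * A minimal sketch (assumption): the summary stores only a rough fullness hint for each slab,
 * computed by shifting the free block count down by the depot's hint_shift (declared below).
 * Zero is reserved to mean "completely full", so a nonzero free count is assumed to always
 * yield a nonzero hint.
 */
static inline u8 sketch_compute_fullness_hint(block_count_t free_blocks,
					      unsigned int hint_shift)
{
	block_count_t hint;

	if (free_blocks == 0)
		return 0;

	hint = free_blocks >> hint_shift;
	return (hint == 0) ? 1 : (u8) hint;
}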

/*
 * A slab status is a very small structure for use in determining the ordering of slabs in the
 * scrubbing process.
 */
struct slab_status {
	slab_count_t slab_number;
	bool is_clean;
	u8 emptiness;
};
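
/*
 * A minimal sketch (assumption): one plausible ordering of slab_status records consistent
 * with the fields above, in which clean slabs sort ahead of dirty ones, emptier slabs come
 * first so that scrubbing frees usable space sooner, and the slab number breaks ties for a
 * stable order. The driver's actual comparison policy lives in its implementation file.
 */
static inline bool sketch_status_is_less_than(const struct slab_status *a,
					      const struct slab_status *b)
{
	if (a->is_clean != b->is_clean)
		return a->is_clean;
	if (a->emptiness != b->emptiness)
		return a->emptiness > b->emptiness;
	return a->slab_number < b->slab_number;
}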

struct slab_summary_block {
	/* The block_allocator to which this block belongs */
	struct block_allocator *allocator;
	/* The index of this block in its zone's summary */
	block_count_t index;
	/* Whether this block has a write outstanding */
	bool writing;
	/* Ring of updates waiting on the outstanding write */
	struct vdo_wait_queue current_update_waiters;
	/* Ring of updates waiting on the next write */
	struct vdo_wait_queue next_update_waiters;
	/* The active slab_summary_entry array for this block */
	struct slab_summary_entry *entries;
	/* The vio used to write this block */
	struct vio vio;
	/* The packed entries, one block long, backing the vio */
	char *outgoing_entries;
};

/*
 * The statistics for all the slab summary zones owned by this slab summary. These fields are all
 * mutated only by their physical zone threads, but are read by other threads when gathering
 * statistics for the entire depot.
 */
struct atomic_slab_summary_statistics {
	/* Number of blocks written */
	atomic64_t blocks_written;
};

struct block_allocator {
	struct vdo_completion completion;
	/* The slab depot for this allocator */
	struct slab_depot *depot;
	/* The nonce of the VDO */
	nonce_t nonce;
	/* The physical zone number of this allocator */
	zone_count_t zone_number;
	/* The thread ID for this allocator's physical zone */
	thread_id_t thread_id;
	/* The number of slabs in this allocator */
	slab_count_t slab_count;
	/* The number of the last slab owned by this allocator */
	slab_count_t last_slab;
	/* The reduced priority level used to preserve unopened slabs */
	unsigned int unopened_slab_priority;
	/* The state of this allocator */
	struct admin_state state;
	/* The actor for applying an action to all slabs */
	struct slab_actor slab_actor;

	/* The slab from which blocks are currently being allocated */
	struct vdo_slab *open_slab;
	/* A priority queue containing all slabs available for allocation */
	struct priority_table *prioritized_slabs;
	/* The slab scrubber */
	struct slab_scrubber scrubber;
	/* What phase of the close operation the allocator is to perform */
	enum block_allocator_drain_step drain_step;

	/*
	 * These statistics are all mutated only by the physical zone thread, but are read by other
	 * threads when gathering statistics for the entire depot.
	 */
	/*
	 * The count of allocated blocks in this zone. Not in block_allocator_statistics for
	 * historical reasons.
	 */
	u64 allocated_blocks;
	/* Statistics for this block allocator */
	struct block_allocator_statistics statistics;
	/* Cumulative statistics for the slab journals in this zone */
	struct slab_journal_statistics slab_journal_statistics;
	/* Cumulative statistics for the reference counters in this zone */
	struct ref_counts_statistics ref_counts_statistics;

	/*
	 * This is the head of a queue of slab journals which have entries in their tail blocks
	 * which have not yet started to commit. When the recovery journal is under space pressure,
	 * slab journals which have uncommitted entries holding a lock on the recovery journal head
	 * are forced to commit their blocks early. This list is kept in order, with the tail
	 * containing the slab journal holding the most recent recovery journal lock (see the
	 * illustrative sketch after this struct).
	 */
	struct list_head dirty_slab_journals;

	/* The vio pool for reading and writing block allocator metadata */
	struct vio_pool *vio_pool;
	/* The vio pool for large initial reads of ref count areas */
	struct vio_pool *refcount_big_vio_pool;
	/* How many ref count blocks are read per vio at initial load */
	u32 refcount_blocks_per_big_vio;
	/* The dm_kcopyd client for erasing slab journals */
	struct dm_kcopyd_client *eraser;
	/* Iterator over the slabs to be erased */
	struct slab_iterator slabs_to_erase;

	/* The portion of the slab summary managed by this allocator */
	/* The state of the slab summary */
	struct admin_state summary_state;
	/* The number of outstanding summary writes */
	block_count_t summary_write_count;
	/* The array (owned by the blocks) of all entries */
	struct slab_summary_entry *summary_entries;
	/* The array of slab_summary_blocks */
	struct slab_summary_block *summary_blocks;
};
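
/*
 * A minimal sketch (assumption): since dirty_slab_journals is kept in recovery-journal-lock
 * order, the journal holding the oldest recovery journal lock is simply the head of the list,
 * and committing tail blocks in list order releases the oldest locks first.
 */
static inline struct slab_journal *sketch_oldest_dirty_journal(struct block_allocator *allocator)
{
	if (list_empty(&allocator->dirty_slab_journals))
		return NULL;

	return list_first_entry(&allocator->dirty_slab_journals,
				struct slab_journal, dirty_entry);
}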

enum slab_depot_load_type {
	VDO_SLAB_DEPOT_NORMAL_LOAD,
	VDO_SLAB_DEPOT_RECOVERY_LOAD,
	VDO_SLAB_DEPOT_REBUILD_LOAD
};

struct slab_depot {
	zone_count_t zone_count;
	zone_count_t old_zone_count;
	struct vdo *vdo;
	struct slab_config slab_config;
	struct action_manager *action_manager;

	physical_block_number_t first_block;
	physical_block_number_t last_block;
	physical_block_number_t origin;

	/* slab_size == (1 << slab_size_shift) */
	unsigned int slab_size_shift;

	/* Determines how slabs should be queued during load */
	enum slab_depot_load_type load_type;

	/* The state for notifying slab journals to release recovery journal locks */
	sequence_number_t active_release_request;
	sequence_number_t new_release_request;

	/* State variables for scrubbing complete handling */
	atomic_t zones_to_scrub;

	/* Array of pointers to individually allocated slabs */
	struct vdo_slab **slabs;
	/* The number of slabs currently allocated and stored in 'slabs' */
	slab_count_t slab_count;

	/* Array of pointers to a larger set of slabs (used during resize) */
	struct vdo_slab **new_slabs;
	/* The number of slabs currently allocated and stored in 'new_slabs' */
	slab_count_t new_slab_count;
	/* The size that 'new_slabs' was allocated for */
	block_count_t new_size;

	/* The last block before resize, for rollback */
	physical_block_number_t old_last_block;
	/* The last block after resize, for resize */
	physical_block_number_t new_last_block;

	/* The statistics for the slab summary */
	struct atomic_slab_summary_statistics summary_statistics;
	/* The start of the slab summary partition */
	physical_block_number_t summary_origin;
	/* The number of bits to shift to get a 7-bit fullness hint */
	unsigned int hint_shift;
	/* The slab summary entries for all of the zones the partition can hold */
	struct slab_summary_entry *summary_entries;

	/* The block allocators for this depot */
	struct block_allocator allocators[];
};
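
/*
 * A minimal sketch (assumption): a bounds check of a PBN against the depot's data area using
 * the first_block and last_block fields above. The driver's exported predicate,
 * vdo_is_physical_data_block() below, additionally admits VDO_ZERO_BLOCK and accounts for
 * each slab's metadata blocks.
 */
static inline bool sketch_pbn_in_depot(const struct slab_depot *depot,
				       physical_block_number_t pbn)
{
	return (pbn >= depot->first_block) && (pbn < depot->last_block);
}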

struct reference_updater;

bool __must_check vdo_attempt_replay_into_slab(struct vdo_slab *slab,
					       physical_block_number_t pbn,
					       enum journal_operation operation,
					       bool increment,
					       struct journal_point *recovery_point,
					       struct vdo_completion *parent);

int __must_check vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
							physical_block_number_t pbn,
							enum journal_operation operation);

static inline struct block_allocator *vdo_as_block_allocator(struct vdo_completion *completion)
{
	vdo_assert_completion_type(completion, VDO_BLOCK_ALLOCATOR_COMPLETION);
	return container_of(completion, struct block_allocator, completion);
}

int __must_check vdo_acquire_provisional_reference(struct vdo_slab *slab,
						   physical_block_number_t pbn,
						   struct pbn_lock *lock);

int __must_check vdo_allocate_block(struct block_allocator *allocator,
				    physical_block_number_t *block_number_ptr);

int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
				  struct vdo_waiter *waiter);

void vdo_modify_reference_count(struct vdo_completion *completion,
				struct reference_updater *updater);

int __must_check vdo_release_block_reference(struct block_allocator *allocator,
					     physical_block_number_t pbn);

void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion);

void vdo_dump_block_allocator(const struct block_allocator *allocator);

int __must_check vdo_decode_slab_depot(struct slab_depot_state_2_0 state,
				       struct vdo *vdo,
				       struct partition *summary_partition,
				       struct slab_depot **depot_ptr);

void vdo_free_slab_depot(struct slab_depot *depot);

struct slab_depot_state_2_0 __must_check vdo_record_slab_depot(const struct slab_depot *depot);

int __must_check vdo_allocate_reference_counters(struct slab_depot *depot);

struct vdo_slab * __must_check vdo_get_slab(const struct slab_depot *depot,
					    physical_block_number_t pbn);

u8 __must_check vdo_get_increment_limit(struct slab_depot *depot,
					physical_block_number_t pbn);

bool __must_check vdo_is_physical_data_block(const struct slab_depot *depot,
					     physical_block_number_t pbn);

block_count_t __must_check vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot);

block_count_t __must_check vdo_get_slab_depot_data_blocks(const struct slab_depot *depot);

void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
				   struct vdo_statistics *stats);

void vdo_load_slab_depot(struct slab_depot *depot,
			 const struct admin_state_code *operation,
			 struct vdo_completion *parent, void *context);

void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
					enum slab_depot_load_type load_type,
					struct vdo_completion *parent);

void vdo_update_slab_depot_size(struct slab_depot *depot);

int __must_check vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
						const struct partition *partition);

void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_abandon_new_slabs(struct slab_depot *depot);

void vdo_drain_slab_depot(struct slab_depot *depot,
			  const struct admin_state_code *operation,
			  struct vdo_completion *parent);

void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent);

void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
						sequence_number_t recovery_block_number);

void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
				     struct vdo_completion *parent);

void vdo_dump_slab_depot(const struct slab_depot *depot);

#endif /* VDO_SLAB_DEPOT_H */