xref: /linux/drivers/md/dm-vdo/slab-depot.c (revision 0637a68b9c6c1dfffcc1fca003cb7cd3257c3c03)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright 2023 Red Hat
4  */
5 
6 #include "slab-depot.h"
7 
8 #include <linux/atomic.h>
9 #include <linux/bio.h>
10 #include <linux/err.h>
11 #include <linux/log2.h>
12 #include <linux/min_heap.h>
13 #include <linux/minmax.h>
14 
15 #include "logger.h"
16 #include "memory-alloc.h"
17 #include "numeric.h"
18 #include "permassert.h"
19 #include "string-utils.h"
20 
21 #include "action-manager.h"
22 #include "admin-state.h"
23 #include "completion.h"
24 #include "constants.h"
25 #include "data-vio.h"
26 #include "encodings.h"
27 #include "io-submitter.h"
28 #include "physical-zone.h"
29 #include "priority-table.h"
30 #include "recovery-journal.h"
31 #include "repair.h"
32 #include "status-codes.h"
33 #include "types.h"
34 #include "vdo.h"
35 #include "vio.h"
36 #include "wait-queue.h"
37 
38 static const u64 BYTES_PER_WORD = sizeof(u64);
39 static const bool NORMAL_OPERATION = true;
40 
41 /**
42  * get_lock() - Get the lock object for a slab journal block by sequence number.
43  * @journal: vdo_slab journal to retrieve from.
44  * @sequence_number: Sequence number of the block.
45  *
46  * Return: The lock object for the given sequence number.
47  */
48 static inline struct journal_lock * __must_check get_lock(struct slab_journal *journal,
49 							  sequence_number_t sequence_number)
50 {
51 	return &journal->locks[sequence_number % journal->size];
52 }
53 
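/**
 * is_slab_open() - Check whether a slab is open, i.e. neither quiescing nor quiescent.
 * @slab: The slab to check.
 *
 * Return: true if the slab is open.
 */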
54 static bool is_slab_open(struct vdo_slab *slab)
55 {
56 	return (!vdo_is_state_quiescing(&slab->state) &&
57 		!vdo_is_state_quiescent(&slab->state));
58 }
59 
60 /**
61  * must_make_entries_to_flush() - Check whether there are entry waiters which should delay a flush.
62  * @journal: The journal to check.
63  *
64  * Return: true if there are entry waiters and the slab is not being rebuilt.
65  */
66 static inline bool __must_check must_make_entries_to_flush(struct slab_journal *journal)
67 {
68 	return ((journal->slab->status != VDO_SLAB_REBUILDING) &&
69 		vdo_waitq_has_waiters(&journal->entry_waiters));
70 }
71 
72 /**
73  * is_reaping() - Check whether a reap is currently in progress.
74  * @journal: The journal which may be reaping.
75  *
76  * Return: true if the journal is reaping.
77  */
78 static inline bool __must_check is_reaping(struct slab_journal *journal)
79 {
80 	return (journal->head != journal->unreapable);
81 }
82 
83 /**
84  * initialize_tail_block() - Initialize tail block as a new block.
85  * @journal: The journal whose tail block is being initialized.
86  */
87 static void initialize_tail_block(struct slab_journal *journal)
88 {
89 	struct slab_journal_block_header *header = &journal->tail_header;
90 
91 	header->sequence_number = journal->tail;
92 	header->entry_count = 0;
93 	header->has_block_map_increments = false;
94 }
95 
96 /**
97  * initialize_journal_state() - Set all journal fields appropriately to start journaling.
98  * @journal: The journal to be reset, based on its tail sequence number.
99  */
100 static void initialize_journal_state(struct slab_journal *journal)
101 {
102 	journal->unreapable = journal->head;
103 	journal->reap_lock = get_lock(journal, journal->unreapable);
104 	journal->next_commit = journal->tail;
105 	journal->summarized = journal->last_summarized = journal->tail;
106 	initialize_tail_block(journal);
107 }
108 
109 /**
110  * block_is_full() - Check whether a journal block is full.
111  * @journal: The slab journal for the block.
112  *
113  * Return: true if the tail block is full.
114  */
115 static bool __must_check block_is_full(struct slab_journal *journal)
116 {
117 	journal_entry_count_t count = journal->tail_header.entry_count;
118 
119 	return (journal->tail_header.has_block_map_increments ?
120 		(journal->full_entries_per_block == count) :
121 		(journal->entries_per_block == count));
122 }
123 
124 static void add_entries(struct slab_journal *journal);
125 static void update_tail_block_location(struct slab_journal *journal);
126 static void release_journal_locks(struct vdo_waiter *waiter, void *context);
127 
128 /**
129  * is_slab_journal_blank() - Check whether a slab's journal is blank.
 * @slab: The slab to check.
130  *
131  * A slab journal is blank if it has never had any entries recorded in it.
132  *
133  * Return: true if the slab's journal has never been modified.
134  */
135 static bool is_slab_journal_blank(const struct vdo_slab *slab)
136 {
137 	return ((slab->journal.tail == 1) &&
138 		(slab->journal.tail_header.entry_count == 0));
139 }
140 
141 /**
142  * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct
143  *                             order.
144  * @journal: The journal to be marked dirty.
145  * @lock: The recovery journal lock held by the slab journal.
146  */
147 static void mark_slab_journal_dirty(struct slab_journal *journal, sequence_number_t lock)
148 {
149 	struct slab_journal *dirty_journal;
150 	struct list_head *dirty_list = &journal->slab->allocator->dirty_slab_journals;
151 
152 	VDO_ASSERT_LOG_ONLY(journal->recovery_lock == 0, "slab journal was clean");
153 
154 	journal->recovery_lock = lock;
155 	list_for_each_entry_reverse(dirty_journal, dirty_list, dirty_entry) {
156 		if (dirty_journal->recovery_lock <= journal->recovery_lock)
157 			break;
158 	}
159 
160 	list_move_tail(&journal->dirty_entry, dirty_journal->dirty_entry.next);
161 }
162 
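/**
 * mark_slab_journal_clean() - Clear a slab journal's recovery journal lock and remove it from its
 *                             allocator's dirty ring.
 * @journal: The journal to mark clean.
 */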
163 static void mark_slab_journal_clean(struct slab_journal *journal)
164 {
165 	journal->recovery_lock = 0;
166 	list_del_init(&journal->dirty_entry);
167 }
168 
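/**
 * check_if_slab_drained() - Finish draining a slab once it is draining and has no outstanding
 *                           journal, summary, or reference block work.
 * @slab: The slab to check.
 */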
169 static void check_if_slab_drained(struct vdo_slab *slab)
170 {
171 	bool read_only;
172 	struct slab_journal *journal = &slab->journal;
173 	const struct admin_state_code *code;
174 
175 	if (!vdo_is_state_draining(&slab->state) ||
176 	    must_make_entries_to_flush(journal) ||
177 	    is_reaping(journal) ||
178 	    journal->waiting_to_commit ||
179 	    !list_empty(&journal->uncommitted_blocks) ||
180 	    journal->updating_slab_summary ||
181 	    (slab->active_count > 0))
182 		return;
183 
184 	/* When not suspending or recovering, the slab must be clean. */
185 	code = vdo_get_admin_state_code(&slab->state);
186 	read_only = vdo_is_read_only(slab->allocator->depot->vdo);
187 	if (!read_only &&
188 	    vdo_waitq_has_waiters(&slab->dirty_blocks) &&
189 	    (code != VDO_ADMIN_STATE_SUSPENDING) &&
190 	    (code != VDO_ADMIN_STATE_RECOVERING))
191 		return;
192 
193 	vdo_finish_draining_with_result(&slab->state,
194 					(read_only ? VDO_READ_ONLY : VDO_SUCCESS));
195 }
196 
197 /* FULLNESS HINT COMPUTATION */
198 
199 /**
200  * compute_fullness_hint() - Translate a slab's free block count into a 'fullness hint' that can be
201  *                           stored in a slab_summary_entry's 7 bits that are dedicated to its free
202  *                           count.
203  * @depot: The depot whose summary is being updated.
204  * @free_blocks: The number of free blocks.
205  *
206  * Note: the number of free blocks must be strictly less than 2^23 blocks, even though
207  * theoretically slabs could contain precisely 2^23 blocks; there is an assumption that at least
208  * one block is used by metadata. This assumption is necessary; otherwise, the fullness hint might
209  * overflow. The fullness hint formula is roughly (fullness >> 16) & 0x7f, but (2^23 >> 16) & 0x7f
210  * is 0, which would make it impossible to distinguish completely full from completely empty.
211  *
212  * Return: A fullness hint, which can be stored in 7 bits.
213  */
214 static u8 __must_check compute_fullness_hint(struct slab_depot *depot,
215 					     block_count_t free_blocks)
216 {
217 	block_count_t hint;
218 
219 	VDO_ASSERT_LOG_ONLY((free_blocks < (1 << 23)), "free blocks must be less than 2^23");
220 
221 	if (free_blocks == 0)
222 		return 0;
223 
224 	hint = free_blocks >> depot->hint_shift;
225 	return ((hint == 0) ? 1 : hint);
226 }
227 
228 /**
229  * check_summary_drain_complete() - Check whether an allocator's summary has finished draining.
 * @allocator: The allocator to check.
230  */
231 static void check_summary_drain_complete(struct block_allocator *allocator)
232 {
233 	if (!vdo_is_state_draining(&allocator->summary_state) ||
234 	    (allocator->summary_write_count > 0))
235 		return;
236 
237 	vdo_finish_operation(&allocator->summary_state,
238 			     (vdo_is_read_only(allocator->depot->vdo) ?
239 			      VDO_READ_ONLY : VDO_SUCCESS));
240 }
241 
242 /**
243  * notify_summary_waiters() - Wake all the waiters in a given queue.
244  * @allocator: The block allocator summary which owns the queue.
245  * @queue: The queue to notify.
246  */
247 static void notify_summary_waiters(struct block_allocator *allocator,
248 				   struct vdo_wait_queue *queue)
249 {
250 	int result = (vdo_is_read_only(allocator->depot->vdo) ?
251 		      VDO_READ_ONLY : VDO_SUCCESS);
252 
253 	vdo_waitq_notify_all_waiters(queue, NULL, &result);
254 }
255 
256 static void launch_write(struct slab_summary_block *summary_block);
257 
258 /**
259  * finish_updating_slab_summary_block() - Finish processing a block which attempted to write,
260  *                                        whether or not the attempt succeeded.
261  * @block: The summary block which was being written.
262  */
263 static void finish_updating_slab_summary_block(struct slab_summary_block *block)
264 {
265 	notify_summary_waiters(block->allocator, &block->current_update_waiters);
266 	block->writing = false;
267 	block->allocator->summary_write_count--;
268 	if (vdo_waitq_has_waiters(&block->next_update_waiters))
269 		launch_write(block);
270 	else
271 		check_summary_drain_complete(block->allocator);
272 }
273 
274 /**
275  * finish_update() - This is the callback for a successful summary block write.
276  * @completion: The write vio.
277  */
278 static void finish_update(struct vdo_completion *completion)
279 {
280 	struct slab_summary_block *block =
281 		container_of(as_vio(completion), struct slab_summary_block, vio);
282 
283 	atomic64_inc(&block->allocator->depot->summary_statistics.blocks_written);
284 	finish_updating_slab_summary_block(block);
285 }
286 
287 /**
288  * handle_write_error() - Handle an error writing a slab summary block.
289  * @completion: The write VIO.
290  */
291 static void handle_write_error(struct vdo_completion *completion)
292 {
293 	struct slab_summary_block *block =
294 		container_of(as_vio(completion), struct slab_summary_block, vio);
295 
296 	vio_record_metadata_io_error(as_vio(completion));
297 	vdo_enter_read_only_mode(completion->vdo, completion->result);
298 	finish_updating_slab_summary_block(block);
299 }
300 
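/**
 * write_slab_summary_endio() - The bio completion for a slab summary block write; resumes
 *                              processing on the allocator's thread.
 * @bio: The completed bio.
 */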
301 static void write_slab_summary_endio(struct bio *bio)
302 {
303 	struct vio *vio = bio->bi_private;
304 	struct slab_summary_block *block =
305 		container_of(vio, struct slab_summary_block, vio);
306 
307 	continue_vio_after_io(vio, finish_update, block->allocator->thread_id);
308 }
309 
310 /**
311  * launch_write() - Write a slab summary block unless it is currently out for writing.
312  * @block: The block that needs to be committed.
313  */
314 static void launch_write(struct slab_summary_block *block)
315 {
316 	struct block_allocator *allocator = block->allocator;
317 	struct slab_depot *depot = allocator->depot;
318 	physical_block_number_t pbn;
319 
320 	if (block->writing)
321 		return;
322 
323 	allocator->summary_write_count++;
324 	vdo_waitq_transfer_all_waiters(&block->next_update_waiters,
325 				       &block->current_update_waiters);
326 	block->writing = true;
327 
328 	if (vdo_is_read_only(depot->vdo)) {
329 		finish_updating_slab_summary_block(block);
330 		return;
331 	}
332 
333 	memcpy(block->outgoing_entries, block->entries, VDO_BLOCK_SIZE);
334 
335 	/*
336 	 * Flush before writing to ensure that the slab journal tail blocks and reference updates
337 	 * covered by this summary update are stable. Otherwise, a subsequent recovery could
338 	 * encounter a slab summary update that refers to a slab journal tail block that has not
339 	 * actually been written. In such cases, the slab journal referenced will be treated as
340 	 * empty, causing any data within the slab which predates the existing recovery journal
341 	 * entries to be lost.
342 	 */
343 	pbn = (depot->summary_origin +
344 	       (VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE * allocator->zone_number) +
345 	       block->index);
346 	vdo_submit_metadata_vio(&block->vio, pbn, write_slab_summary_endio,
347 				handle_write_error, REQ_OP_WRITE | REQ_PREFLUSH);
348 }
349 
350 /**
351  * update_slab_summary_entry() - Update the entry for a slab.
352  * @slab: The slab whose entry is to be updated.
353  * @waiter: The waiter that is updating the summary.
354  * @tail_block_offset: The offset of the slab journal's tail block.
355  * @load_ref_counts: Whether the reference counts must be loaded from disk on the vdo load.
356  * @is_clean: Whether the slab is clean.
357  * @free_blocks: The number of free blocks.
358  */
359 static void update_slab_summary_entry(struct vdo_slab *slab, struct vdo_waiter *waiter,
360 				      tail_block_offset_t tail_block_offset,
361 				      bool load_ref_counts, bool is_clean,
362 				      block_count_t free_blocks)
363 {
364 	u8 index = slab->slab_number / VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK;
365 	struct block_allocator *allocator = slab->allocator;
366 	struct slab_summary_block *block = &allocator->summary_blocks[index];
367 	int result;
368 	struct slab_summary_entry *entry;
369 
370 	if (vdo_is_read_only(block->vio.completion.vdo)) {
371 		result = VDO_READ_ONLY;
372 		waiter->callback(waiter, &result);
373 		return;
374 	}
375 
376 	if (vdo_is_state_draining(&allocator->summary_state) ||
377 	    vdo_is_state_quiescent(&allocator->summary_state)) {
378 		result = VDO_INVALID_ADMIN_STATE;
379 		waiter->callback(waiter, &result);
380 		return;
381 	}
382 
383 	entry = &allocator->summary_entries[slab->slab_number];
384 	*entry = (struct slab_summary_entry) {
385 		.tail_block_offset = tail_block_offset,
386 		.load_ref_counts = (entry->load_ref_counts || load_ref_counts),
387 		.is_dirty = !is_clean,
388 		.fullness_hint = compute_fullness_hint(allocator->depot, free_blocks),
389 	};
390 	vdo_waitq_enqueue_waiter(&block->next_update_waiters, waiter);
391 	launch_write(block);
392 }
393 
394 /**
395  * finish_reaping() - Actually advance the head of the journal now that any necessary flushes are
396  *                    complete.
397  * @journal: The journal to be reaped.
398  */
399 static void finish_reaping(struct slab_journal *journal)
400 {
401 	journal->head = journal->unreapable;
402 	add_entries(journal);
403 	check_if_slab_drained(journal->slab);
404 }
405 
406 static void reap_slab_journal(struct slab_journal *journal);
407 
408 /**
409  * complete_reaping() - Finish reaping now that we have flushed the lower layer and then try
410  *                      reaping again in case we deferred reaping due to an outstanding vio.
411  * @completion: The flush vio.
412  */
413 static void complete_reaping(struct vdo_completion *completion)
414 {
415 	struct slab_journal *journal = completion->parent;
416 
417 	return_vio_to_pool(journal->slab->allocator->vio_pool,
418 			   vio_as_pooled_vio(as_vio(vdo_forget(completion))));
419 	finish_reaping(journal);
420 	reap_slab_journal(journal);
421 }
422 
423 /**
424  * handle_flush_error() - Handle an error flushing the lower layer.
425  * @completion: The flush vio.
426  */
427 static void handle_flush_error(struct vdo_completion *completion)
428 {
429 	vio_record_metadata_io_error(as_vio(completion));
430 	vdo_enter_read_only_mode(completion->vdo, completion->result);
431 	complete_reaping(completion);
432 }
433 
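/**
 * flush_endio() - The bio completion for the flush issued before reaping; resumes reaping on the
 *                 allocator's thread.
 * @bio: The completed flush bio.
 */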
434 static void flush_endio(struct bio *bio)
435 {
436 	struct vio *vio = bio->bi_private;
437 	struct slab_journal *journal = vio->completion.parent;
438 
439 	continue_vio_after_io(vio, complete_reaping,
440 			      journal->slab->allocator->thread_id);
441 }
442 
443 /**
444  * flush_for_reaping() - A waiter callback for getting a vio with which to flush the lower layer
445  *                       prior to reaping.
446  * @waiter: The journal as a flush waiter.
447  * @context: The newly acquired flush vio.
448  */
449 static void flush_for_reaping(struct vdo_waiter *waiter, void *context)
450 {
451 	struct slab_journal *journal =
452 		container_of(waiter, struct slab_journal, flush_waiter);
453 	struct pooled_vio *pooled = context;
454 	struct vio *vio = &pooled->vio;
455 
456 	vio->completion.parent = journal;
457 	vdo_submit_flush_vio(vio, flush_endio, handle_flush_error);
458 }
459 
460 /**
461  * reap_slab_journal() - Conduct a reap on a slab journal to reclaim unreferenced blocks.
462  * @journal: The slab journal.
463  */
464 static void reap_slab_journal(struct slab_journal *journal)
465 {
466 	bool reaped = false;
467 
468 	if (is_reaping(journal)) {
469 		/* We already have a reap in progress so wait for it to finish. */
470 		return;
471 	}
472 
473 	if ((journal->slab->status != VDO_SLAB_REBUILT) ||
474 	    !vdo_is_state_normal(&journal->slab->state) ||
475 	    vdo_is_read_only(journal->slab->allocator->depot->vdo)) {
476 		/*
477 		 * We must not reap in the first two cases, and there's no point in read-only mode.
478 		 */
479 		return;
480 	}
481 
482 	/*
483 	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
484 	 * block is referenced or the reap reaches the most recently written block, referenced by the
485 	 * slab summary, which has the sequence number just before the tail.
486 	 */
487 	while ((journal->unreapable < journal->tail) && (journal->reap_lock->count == 0)) {
488 		reaped = true;
489 		journal->unreapable++;
490 		journal->reap_lock++;
491 		if (journal->reap_lock == &journal->locks[journal->size])
492 			journal->reap_lock = &journal->locks[0];
493 	}
494 
495 	if (!reaped)
496 		return;
497 
498 	/*
499 	 * It is never safe to reap a slab journal block without first issuing a flush, regardless
500 	 * of whether a user flush has been received or not. In the absence of the flush, the
501 	 * reference block write which released the locks allowing the slab journal to reap may not
502 	 * be persisted. Although slab summary writes will eventually issue flushes, multiple slab
503 	 * journal block writes can be issued while previous slab summary updates have not yet been
504 	 * made. Even though those slab journal block writes will be ignored if the slab summary
505 	 * update is not persisted, they may still overwrite the to-be-reaped slab journal block
506 	 * resulting in a loss of reference count updates.
507 	 */
508 	journal->flush_waiter.callback = flush_for_reaping;
509 	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
510 			      &journal->flush_waiter);
511 }
512 
513 /**
514  * adjust_slab_journal_block_reference() - Adjust the reference count for a slab journal block.
515  * @journal: The slab journal.
516  * @sequence_number: The journal sequence number of the referenced block.
517  * @adjustment: Amount to adjust the reference counter.
518  *
519  * Note that when a negative adjustment drops a block's lock count to zero, the slab journal
 * will be reaped.
520  */
521 static void adjust_slab_journal_block_reference(struct slab_journal *journal,
522 						sequence_number_t sequence_number,
523 						int adjustment)
524 {
525 	struct journal_lock *lock;
526 
527 	if (sequence_number == 0)
528 		return;
529 
530 	if (journal->slab->status == VDO_SLAB_REPLAYING) {
531 		/* Locks should not be used during offline replay. */
532 		return;
533 	}
534 
535 	VDO_ASSERT_LOG_ONLY((adjustment != 0), "adjustment must be non-zero");
536 	lock = get_lock(journal, sequence_number);
537 	if (adjustment < 0) {
538 		VDO_ASSERT_LOG_ONLY((-adjustment <= lock->count),
539 				    "adjustment %d of lock count %u for slab journal block %llu must not underflow",
540 				    adjustment, lock->count,
541 				    (unsigned long long) sequence_number);
542 	}
543 
544 	lock->count += adjustment;
545 	if (lock->count == 0)
546 		reap_slab_journal(journal);
547 }
548 
549 /**
550  * release_journal_locks() - Callback invoked after a slab summary update completes.
551  * @waiter: The slab summary waiter that has just been notified.
552  * @context: The result code of the update.
553  *
554  * Registered in the constructor on behalf of update_tail_block_location().
555  *
556  * Implements waiter_callback_fn.
557  */
558 static void release_journal_locks(struct vdo_waiter *waiter, void *context)
559 {
560 	sequence_number_t first, i;
561 	struct slab_journal *journal =
562 		container_of(waiter, struct slab_journal, slab_summary_waiter);
563 	int result = *((int *) context);
564 
565 	if (result != VDO_SUCCESS) {
566 		if (result != VDO_READ_ONLY) {
567 			/*
568 			 * Don't bother logging what might be lots of errors if we are already in
569 			 * read-only mode.
570 			 */
571 			vdo_log_error_strerror(result, "failed slab summary update %llu",
572 					       (unsigned long long) journal->summarized);
573 		}
574 
575 		journal->updating_slab_summary = false;
576 		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
577 		check_if_slab_drained(journal->slab);
578 		return;
579 	}
580 
581 	if (journal->partial_write_in_progress && (journal->summarized == journal->tail)) {
582 		journal->partial_write_in_progress = false;
583 		add_entries(journal);
584 	}
585 
586 	first = journal->last_summarized;
587 	journal->last_summarized = journal->summarized;
588 	for (i = journal->summarized - 1; i >= first; i--) {
589 		/*
590 		 * Release the lock the summarized block held on the recovery journal. (During
591 		 * replay, recovery_start will always be 0.)
592 		 */
593 		if (journal->recovery_journal != NULL) {
594 			zone_count_t zone_number = journal->slab->allocator->zone_number;
595 			struct journal_lock *lock = get_lock(journal, i);
596 
597 			vdo_release_recovery_journal_block_reference(journal->recovery_journal,
598 								     lock->recovery_start,
599 								     VDO_ZONE_TYPE_PHYSICAL,
600 								     zone_number);
601 		}
602 
603 		/*
604 		 * Release our own lock against reaping for blocks that are committed. (This
605 		 * function will not change locks during replay.)
606 		 */
607 		adjust_slab_journal_block_reference(journal, i, -1);
608 	}
609 
610 	journal->updating_slab_summary = false;
611 
612 	reap_slab_journal(journal);
613 
614 	/* Check if the slab summary needs to be updated again. */
615 	update_tail_block_location(journal);
616 }
617 
618 /**
619  * update_tail_block_location() - Update the tail block location in the slab summary, if necessary.
620  * @journal: The slab journal that is updating its tail block location.
621  */
622 static void update_tail_block_location(struct slab_journal *journal)
623 {
624 	block_count_t free_block_count;
625 	struct vdo_slab *slab = journal->slab;
626 
627 	if (journal->updating_slab_summary ||
628 	    vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
629 	    (journal->last_summarized >= journal->next_commit)) {
630 		check_if_slab_drained(slab);
631 		return;
632 	}
633 
634 	if (slab->status != VDO_SLAB_REBUILT) {
635 		u8 hint = slab->allocator->summary_entries[slab->slab_number].fullness_hint;
636 
637 		free_block_count = ((block_count_t) hint) << slab->allocator->depot->hint_shift;
638 	} else {
639 		free_block_count = slab->free_blocks;
640 	}
641 
642 	journal->summarized = journal->next_commit;
643 	journal->updating_slab_summary = true;
644 
645 	/*
646 	 * Update slab summary as dirty.
647 	 * vdo_slab journal can only reap past sequence number 1 when all the ref counts for this
648 	 * slab have been written to the layer. Therefore, indicate that the ref counts must be
649 	 * loaded when the journal head has reaped past sequence number 1.
650 	 */
651 	update_slab_summary_entry(slab, &journal->slab_summary_waiter,
652 				  journal->summarized % journal->size,
653 				  (journal->head > 1), false, free_block_count);
654 }
655 
656 /**
657  * reopen_slab_journal() - Reopen a slab's journal by emptying it and then adding pending entries.
 * @slab: The slab whose journal is to be reopened.
658  */
659 static void reopen_slab_journal(struct vdo_slab *slab)
660 {
661 	struct slab_journal *journal = &slab->journal;
662 	sequence_number_t block;
663 
664 	VDO_ASSERT_LOG_ONLY(journal->tail_header.entry_count == 0,
665 			    "vdo_slab journal's active block empty before reopening");
666 	journal->head = journal->tail;
667 	initialize_journal_state(journal);
668 
669 	/* Ensure no locks are spuriously held on an empty journal. */
670 	for (block = 1; block <= journal->size; block++) {
671 		VDO_ASSERT_LOG_ONLY((get_lock(journal, block)->count == 0),
672 				    "Scrubbed journal's block %llu is not locked",
673 				    (unsigned long long) block);
674 	}
675 
676 	add_entries(journal);
677 }
678 
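/**
 * get_committing_sequence_number() - Get the sequence number of the slab journal block being
 *                                    committed by a given vio.
 * @vio: The pooled vio containing the packed journal block.
 *
 * Return: The sequence number from the block's header.
 */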
679 static sequence_number_t get_committing_sequence_number(const struct pooled_vio *vio)
680 {
681 	const struct packed_slab_journal_block *block =
682 		(const struct packed_slab_journal_block *) vio->vio.data;
683 
684 	return __le64_to_cpu(block->header.sequence_number);
685 }
686 
687 /**
688  * complete_write() - Handle post-commit processing.
689  * @completion: The write vio as a completion.
690  *
691  * This is the callback registered by write_slab_journal_block().
692  */
693 static void complete_write(struct vdo_completion *completion)
694 {
695 	int result = completion->result;
696 	struct pooled_vio *pooled = vio_as_pooled_vio(as_vio(completion));
697 	struct slab_journal *journal = completion->parent;
698 	sequence_number_t committed = get_committing_sequence_number(pooled);
699 
700 	list_del_init(&pooled->list_entry);
701 	return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled));
702 
703 	if (result != VDO_SUCCESS) {
704 		vio_record_metadata_io_error(as_vio(completion));
705 		vdo_log_error_strerror(result, "cannot write slab journal block %llu",
706 				       (unsigned long long) committed);
707 		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
708 		check_if_slab_drained(journal->slab);
709 		return;
710 	}
711 
712 	WRITE_ONCE(journal->events->blocks_written, journal->events->blocks_written + 1);
713 
714 	if (list_empty(&journal->uncommitted_blocks)) {
715 		/* If no blocks are outstanding, then the commit point is at the tail. */
716 		journal->next_commit = journal->tail;
717 	} else {
718 		/* The commit point is always the beginning of the oldest incomplete block. */
719 		pooled = container_of(journal->uncommitted_blocks.next,
720 				      struct pooled_vio, list_entry);
721 		journal->next_commit = get_committing_sequence_number(pooled);
722 	}
723 
724 	update_tail_block_location(journal);
725 }
726 
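/**
 * write_slab_journal_endio() - The bio completion for a slab journal block write; resumes
 *                              processing on the allocator's thread.
 * @bio: The completed bio.
 */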
727 static void write_slab_journal_endio(struct bio *bio)
728 {
729 	struct vio *vio = bio->bi_private;
730 	struct slab_journal *journal = vio->completion.parent;
731 
732 	continue_vio_after_io(vio, complete_write, journal->slab->allocator->thread_id);
733 }
734 
735 /**
736  * write_slab_journal_block() - Write a slab journal block.
737  * @waiter: The vio pool waiter which was just notified.
738  * @context: The vio pool entry for the write.
739  *
740  * Callback from acquire_vio_from_pool() registered in commit_tail().
741  */
742 static void write_slab_journal_block(struct vdo_waiter *waiter, void *context)
743 {
744 	struct pooled_vio *pooled = context;
745 	struct vio *vio = &pooled->vio;
746 	struct slab_journal *journal =
747 		container_of(waiter, struct slab_journal, resource_waiter);
748 	struct slab_journal_block_header *header = &journal->tail_header;
749 	int unused_entries = journal->entries_per_block - header->entry_count;
750 	physical_block_number_t block_number;
751 	const struct admin_state_code *operation;
752 
753 	header->head = journal->head;
754 	list_add_tail(&pooled->list_entry, &journal->uncommitted_blocks);
755 	vdo_pack_slab_journal_block_header(header, &journal->block->header);
756 
757 	/* Copy the tail block into the vio. */
758 	memcpy(pooled->vio.data, journal->block, VDO_BLOCK_SIZE);
759 
760 	VDO_ASSERT_LOG_ONLY(unused_entries >= 0, "vdo_slab journal block is not overfull");
761 	if (unused_entries > 0) {
762 		/*
763 		 * Release the per-entry locks for any unused entries in the block we are about to
764 		 * write.
765 		 */
766 		adjust_slab_journal_block_reference(journal, header->sequence_number,
767 						    -unused_entries);
768 		journal->partial_write_in_progress = !block_is_full(journal);
769 	}
770 
771 	block_number = journal->slab->journal_origin +
772 		(header->sequence_number % journal->size);
773 	vio->completion.parent = journal;
774 
775 	/*
776 	 * This block won't be read in recovery until the slab summary is updated to refer to it.
777 	 * The slab summary update does a flush which is sufficient to protect us from corruption
778 	 * due to out of order slab journal, reference block, or block map writes.
779 	 */
780 	vdo_submit_metadata_vio(vdo_forget(vio), block_number, write_slab_journal_endio,
781 				complete_write, REQ_OP_WRITE);
782 
783 	/* Since the write is submitted, the tail block structure can be reused. */
784 	journal->tail++;
785 	initialize_tail_block(journal);
786 	journal->waiting_to_commit = false;
787 
788 	operation = vdo_get_admin_state_code(&journal->slab->state);
789 	if (operation == VDO_ADMIN_STATE_WAITING_FOR_RECOVERY) {
790 		vdo_finish_operation(&journal->slab->state,
791 				     (vdo_is_read_only(journal->slab->allocator->depot->vdo) ?
792 				      VDO_READ_ONLY : VDO_SUCCESS));
793 		return;
794 	}
795 
796 	add_entries(journal);
797 }
798 
799 /**
800  * commit_tail() - Commit the tail block of the slab journal.
801  * @journal: The journal whose tail block should be committed.
802  */
803 static void commit_tail(struct slab_journal *journal)
804 {
805 	if ((journal->tail_header.entry_count == 0) && must_make_entries_to_flush(journal)) {
806 		/*
807 		 * There are no entries at the moment, but there are some waiters, so defer
808 		 * initiating the flush until those entries are ready to write.
809 		 */
810 		return;
811 	}
812 
813 	if (vdo_is_read_only(journal->slab->allocator->depot->vdo) ||
814 	    journal->waiting_to_commit ||
815 	    (journal->tail_header.entry_count == 0)) {
816 		/*
817 		 * There is nothing to do since the tail block is empty, or writing, or the journal
818 		 * is in read-only mode.
819 		 */
820 		return;
821 	}
822 
823 	/*
824 	 * Since we are about to commit the tail block, this journal no longer needs to be on the
825 	 * ring of journals which the recovery journal might ask to commit.
826 	 */
827 	mark_slab_journal_clean(journal);
828 
829 	journal->waiting_to_commit = true;
830 
831 	journal->resource_waiter.callback = write_slab_journal_block;
832 	acquire_vio_from_pool(journal->slab->allocator->vio_pool,
833 			      &journal->resource_waiter);
834 }
835 
836 /**
837  * encode_slab_journal_entry() - Encode a slab journal entry.
838  * @tail_header: The unpacked header for the block.
839  * @payload: The journal block payload to hold the entry.
840  * @sbn: The slab block number of the entry to encode.
841  * @operation: The type of the entry.
842  * @increment: True if this is an increment.
843  *
844  * Exposed for unit tests.
845  */
846 static void encode_slab_journal_entry(struct slab_journal_block_header *tail_header,
847 				      slab_journal_payload *payload,
848 				      slab_block_number sbn,
849 				      enum journal_operation operation,
850 				      bool increment)
851 {
852 	journal_entry_count_t entry_number = tail_header->entry_count++;
853 
854 	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
855 		if (!tail_header->has_block_map_increments) {
856 			memset(payload->full_entries.entry_types, 0,
857 			       VDO_SLAB_JOURNAL_ENTRY_TYPES_SIZE);
858 			tail_header->has_block_map_increments = true;
859 		}
860 
861 		payload->full_entries.entry_types[entry_number / 8] |=
862 			((u8)1 << (entry_number % 8));
863 	}
864 
865 	vdo_pack_slab_journal_entry(&payload->entries[entry_number], sbn, increment);
866 }
867 
868 /**
869  * expand_journal_point() - Convert a recovery journal journal_point which refers to both an
870  *                          increment and a decrement to a single point which refers to one or the
871  *                          other.
872  * @recovery_point: The journal point to convert.
873  * @increment: Whether the current entry is an increment.
874  *
875  * Return: The expanded journal point.
876  *
877  * Each data_vio has only a single recovery journal point, but it may need to make both an
878  * increment and a decrement entry in the same slab journal. In order to distinguish the two
879  * entries, the entry count of the expanded journal point is twice the actual recovery journal
880  * entry count for increments, and one more than that for decrements. For example, recovery
 * journal point 3.5 expands to 3.10 for an increment and to 3.11 for a decrement.
881  */
882 static struct journal_point expand_journal_point(struct journal_point recovery_point,
883 						 bool increment)
884 {
885 	recovery_point.entry_count *= 2;
886 	if (!increment)
887 		recovery_point.entry_count++;
888 
889 	return recovery_point;
890 }
891 
892 /**
893  * add_entry() - Actually add an entry to the slab journal, potentially firing off a write if a
894  *               block becomes full.
895  * @journal: The slab journal to append to.
896  * @pbn: The pbn being adjusted.
897  * @operation: The type of entry to make.
898  * @increment: True if this is an increment.
899  * @recovery_point: The expanded recovery point.
900  *
901  * This function is synchronous.
902  */
903 static void add_entry(struct slab_journal *journal, physical_block_number_t pbn,
904 		      enum journal_operation operation, bool increment,
905 		      struct journal_point recovery_point)
906 {
907 	struct packed_slab_journal_block *block = journal->block;
908 	int result;
909 
910 	result = VDO_ASSERT(vdo_before_journal_point(&journal->tail_header.recovery_point,
911 						     &recovery_point),
912 			    "recovery journal point is monotonically increasing, recovery point: %llu.%u, block recovery point: %llu.%u",
913 			    (unsigned long long) recovery_point.sequence_number,
914 			    recovery_point.entry_count,
915 			    (unsigned long long) journal->tail_header.recovery_point.sequence_number,
916 			    journal->tail_header.recovery_point.entry_count);
917 	if (result != VDO_SUCCESS) {
918 		vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo, result);
919 		return;
920 	}
921 
922 	if (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
923 		result = VDO_ASSERT((journal->tail_header.entry_count <
924 				     journal->full_entries_per_block),
925 				    "block has room for full entries");
926 		if (result != VDO_SUCCESS) {
927 			vdo_enter_read_only_mode(journal->slab->allocator->depot->vdo,
928 						 result);
929 			return;
930 		}
931 	}
932 
933 	encode_slab_journal_entry(&journal->tail_header, &block->payload,
934 				  pbn - journal->slab->start, operation, increment);
935 	journal->tail_header.recovery_point = recovery_point;
936 	if (block_is_full(journal))
937 		commit_tail(journal);
938 }
939 
940 static inline block_count_t journal_length(const struct slab_journal *journal)
941 {
942 	return journal->tail - journal->head;
943 }
944 
945 /**
946  * vdo_attempt_replay_into_slab() - Replay a recovery journal entry into a slab's journal.
947  * @slab: The slab to play into.
948  * @pbn: The PBN for the entry.
949  * @operation: The type of entry to add.
950  * @increment: True if this entry is an increment.
951  * @recovery_point: The recovery journal point corresponding to this entry.
952  * @parent: The completion to notify when there is space to add the entry if the entry could not be
953  *          added immediately.
954  *
955  * Return: true if the entry was added immediately.
956  */
957 bool vdo_attempt_replay_into_slab(struct vdo_slab *slab, physical_block_number_t pbn,
958 				  enum journal_operation operation, bool increment,
959 				  struct journal_point *recovery_point,
960 				  struct vdo_completion *parent)
961 {
962 	struct slab_journal *journal = &slab->journal;
963 	struct slab_journal_block_header *header = &journal->tail_header;
964 	struct journal_point expanded = expand_journal_point(*recovery_point, increment);
965 
966 	/* Only accept entries after the current recovery point. */
967 	if (!vdo_before_journal_point(&journal->tail_header.recovery_point, &expanded))
968 		return true;
969 
970 	if ((header->entry_count >= journal->full_entries_per_block) &&
971 	    (header->has_block_map_increments || (operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING))) {
972 		/*
973 		 * The tail block does not have room for the entry we are attempting to add so
974 		 * commit the tail block now.
975 		 */
976 		commit_tail(journal);
977 	}
978 
979 	if (journal->waiting_to_commit) {
980 		vdo_start_operation_with_waiter(&journal->slab->state,
981 						VDO_ADMIN_STATE_WAITING_FOR_RECOVERY,
982 						parent, NULL);
983 		return false;
984 	}
985 
986 	if (journal_length(journal) >= journal->size) {
987 		/*
988 		 * We must have reaped the current head before the crash, since the blocked
989 		 * threshold keeps us from having more entries than fit in a slab journal; hence we
990 		 * can just advance the head (and unreapable block), as needed.
991 		 */
992 		journal->head++;
993 		journal->unreapable++;
994 	}
995 
996 	if (journal->slab->status == VDO_SLAB_REBUILT)
997 		journal->slab->status = VDO_SLAB_REPLAYING;
998 
999 	add_entry(journal, pbn, operation, increment, expanded);
1000 	return true;
1001 }
1002 
1003 /**
1004  * requires_reaping() - Check whether the journal must be reaped before adding new entries.
1005  * @journal: The journal to check.
1006  *
1007  * Return: true if the journal must be reaped.
1008  */
1009 static bool requires_reaping(const struct slab_journal *journal)
1010 {
1011 	return (journal_length(journal) >= journal->blocking_threshold);
1012 }
1013 
1014 /** finish_summary_update() - A waiter callback that resets the writing state of a slab. */
1015 static void finish_summary_update(struct vdo_waiter *waiter, void *context)
1016 {
1017 	struct vdo_slab *slab = container_of(waiter, struct vdo_slab, summary_waiter);
1018 	int result = *((int *) context);
1019 
1020 	slab->active_count--;
1021 
1022 	if ((result != VDO_SUCCESS) && (result != VDO_READ_ONLY)) {
1023 		vdo_log_error_strerror(result, "failed to update slab summary");
1024 		vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
1025 	}
1026 
1027 	check_if_slab_drained(slab);
1028 }
1029 
1030 static void write_reference_block(struct vdo_waiter *waiter, void *context);
1031 
1032 /**
1033  * launch_reference_block_write() - Launch the write of a dirty reference block by first acquiring
1034  *                                  a VIO for it from the pool.
1035  * @waiter: The waiter of the block which is starting to write.
1036  * @context: The parent slab of the block.
1037  *
1038  * This can be asynchronous since the writer will have to wait if all VIOs in the pool are
1039  * currently in use.
1040  */
1041 static void launch_reference_block_write(struct vdo_waiter *waiter, void *context)
1042 {
1043 	struct vdo_slab *slab = context;
1044 
1045 	if (vdo_is_read_only(slab->allocator->depot->vdo))
1046 		return;
1047 
1048 	slab->active_count++;
1049 	container_of(waiter, struct reference_block, waiter)->is_writing = true;
1050 	waiter->callback = write_reference_block;
1051 	acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
1052 }
1053 
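/**
 * save_dirty_reference_blocks() - Launch writes of all dirty reference blocks in a slab, then
 *                                 check whether the slab has finished draining.
 * @slab: The slab whose dirty reference blocks should be written.
 */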
1054 static void save_dirty_reference_blocks(struct vdo_slab *slab)
1055 {
1056 	vdo_waitq_notify_all_waiters(&slab->dirty_blocks,
1057 				     launch_reference_block_write, slab);
1058 	check_if_slab_drained(slab);
1059 }
1060 
1061 /**
1062  * finish_reference_block_write() - After a reference block has been written, clean it, release
1063  *                                  its locks, and return its VIO to the pool.
1064  * @completion: The VIO that just finished writing.
1065  */
1066 static void finish_reference_block_write(struct vdo_completion *completion)
1067 {
1068 	struct vio *vio = as_vio(completion);
1069 	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
1070 	struct reference_block *block = completion->parent;
1071 	struct vdo_slab *slab = block->slab;
1072 	tail_block_offset_t offset;
1073 
1074 	slab->active_count--;
1075 
1076 	/* Release the slab journal lock. */
1077 	adjust_slab_journal_block_reference(&slab->journal,
1078 					    block->slab_journal_lock_to_release, -1);
1079 	return_vio_to_pool(slab->allocator->vio_pool, pooled);
1080 
1081 	/*
1082 	 * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause
1083 	 * us to be dirtied again, but we don't want to double enqueue.
1084 	 */
1085 	block->is_writing = false;
1086 
1087 	if (vdo_is_read_only(completion->vdo)) {
1088 		check_if_slab_drained(slab);
1089 		return;
1090 	}
1091 
1092 	/* Re-queue the block if it was re-dirtied while it was writing. */
1093 	if (block->is_dirty) {
1094 		vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
1095 		if (vdo_is_state_draining(&slab->state)) {
1096 			/* We must be saving, and this block will otherwise not be relaunched. */
1097 			save_dirty_reference_blocks(slab);
1098 		}
1099 
1100 		return;
1101 	}
1102 
1103 	/*
1104 	 * Mark the slab as clean in the slab summary if there are no dirty or writing blocks
1105 	 * and no summary update in progress.
1106 	 */
1107 	if ((slab->active_count > 0) || vdo_waitq_has_waiters(&slab->dirty_blocks)) {
1108 		check_if_slab_drained(slab);
1109 		return;
1110 	}
1111 
1112 	offset = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
1113 	slab->active_count++;
1114 	slab->summary_waiter.callback = finish_summary_update;
1115 	update_slab_summary_entry(slab, &slab->summary_waiter, offset,
1116 				  true, true, slab->free_blocks);
1117 }
1118 
1119 /**
1120  * get_reference_counters_for_block() - Find the reference counters for a given block.
1121  * @block: The reference_block in question.
1122  *
1123  * Return: A pointer to the reference counters for this block.
1124  */
1125 static vdo_refcount_t * __must_check get_reference_counters_for_block(struct reference_block *block)
1126 {
1127 	size_t block_index = block - block->slab->reference_blocks;
1128 
1129 	return &block->slab->counters[block_index * COUNTS_PER_BLOCK];
1130 }
1131 
1132 /**
1133  * pack_reference_block() - Copy data from a reference block to a buffer ready to be written out.
1134  * @block: The block to copy.
1135  * @buffer: The char buffer to fill with the packed block.
1136  */
1137 static void pack_reference_block(struct reference_block *block, void *buffer)
1138 {
1139 	struct packed_reference_block *packed = buffer;
1140 	vdo_refcount_t *counters = get_reference_counters_for_block(block);
1141 	sector_count_t i;
1142 	struct packed_journal_point commit_point;
1143 
1144 	vdo_pack_journal_point(&block->slab->slab_journal_point, &commit_point);
1145 
1146 	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
1147 		packed->sectors[i].commit_point = commit_point;
1148 		memcpy(packed->sectors[i].counts, counters + (i * COUNTS_PER_SECTOR),
1149 		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
1150 	}
1151 }
1152 
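/**
 * write_reference_block_endio() - The bio completion for a reference block write; resumes
 *                                 processing on the owning allocator's thread.
 * @bio: The completed bio.
 */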
1153 static void write_reference_block_endio(struct bio *bio)
1154 {
1155 	struct vio *vio = bio->bi_private;
1156 	struct reference_block *block = vio->completion.parent;
1157 	thread_id_t thread_id = block->slab->allocator->thread_id;
1158 
1159 	continue_vio_after_io(vio, finish_reference_block_write, thread_id);
1160 }
1161 
1162 /**
1163  * handle_io_error() - Handle an I/O error reading or writing a reference count block.
1164  * @completion: The VIO doing the I/O as a completion.
1165  */
1166 static void handle_io_error(struct vdo_completion *completion)
1167 {
1168 	int result = completion->result;
1169 	struct vio *vio = as_vio(completion);
1170 	struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab;
1171 
1172 	vio_record_metadata_io_error(vio);
1173 	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
1174 	slab->active_count--;
1175 	vdo_enter_read_only_mode(slab->allocator->depot->vdo, result);
1176 	check_if_slab_drained(slab);
1177 }
1178 
1179 /**
1180  * write_reference_block() - After a dirty block waiter has gotten a VIO from the VIO pool, copy
1181  *                           its counters and associated data into the VIO, and launch the write.
1182  * @waiter: The waiter of the dirty block.
1183  * @context: The VIO returned by the pool.
1184  */
1185 static void write_reference_block(struct vdo_waiter *waiter, void *context)
1186 {
1187 	size_t block_offset;
1188 	physical_block_number_t pbn;
1189 	struct pooled_vio *pooled = context;
1190 	struct vdo_completion *completion = &pooled->vio.completion;
1191 	struct reference_block *block = container_of(waiter, struct reference_block,
1192 						     waiter);
1193 
1194 	pack_reference_block(block, pooled->vio.data);
1195 	block_offset = (block - block->slab->reference_blocks);
1196 	pbn = (block->slab->ref_counts_origin + block_offset);
1197 	block->slab_journal_lock_to_release = block->slab_journal_lock;
1198 	completion->parent = block;
1199 
1200 	/*
1201 	 * Mark the block as clean, since we won't be committing any updates that happen after this
1202 	 * moment. As long as VIO order is preserved, two VIOs updating this block at once will not
1203 	 * cause complications.
1204 	 */
1205 	block->is_dirty = false;
1206 
1207 	/*
1208 	 * Flush before writing to ensure that the recovery journal and slab journal entries which
1209 	 * cover this reference update are stable. This prevents data corruption that can be caused
1210 	 * by out of order writes.
1211 	 */
1212 	WRITE_ONCE(block->slab->allocator->ref_counts_statistics.blocks_written,
1213 		   block->slab->allocator->ref_counts_statistics.blocks_written + 1);
1214 
1215 	completion->callback_thread_id = ((struct block_allocator *) pooled->context)->thread_id;
1216 	vdo_submit_metadata_vio(&pooled->vio, pbn, write_reference_block_endio,
1217 				handle_io_error, REQ_OP_WRITE | REQ_PREFLUSH);
1218 }
1219 
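/**
 * reclaim_journal_space() - Write out dirty reference blocks when the slab journal is over its
 *                           flushing threshold so that the journal locks they hold can be
 *                           released, scheduling more writes the closer the journal gets to its
 *                           flushing deadline.
 * @journal: The journal to check.
 */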
1220 static void reclaim_journal_space(struct slab_journal *journal)
1221 {
1222 	block_count_t length = journal_length(journal);
1223 	struct vdo_slab *slab = journal->slab;
1224 	block_count_t write_count = vdo_waitq_num_waiters(&slab->dirty_blocks);
1225 	block_count_t written;
1226 
1227 	if ((length < journal->flushing_threshold) || (write_count == 0))
1228 		return;
1229 
1230 	/* The slab journal is over the first threshold, schedule some reference block writes. */
1231 	WRITE_ONCE(journal->events->flush_count, journal->events->flush_count + 1);
1232 	if (length < journal->flushing_deadline) {
1233 		/* Schedule more writes the closer to the deadline we get. */
1234 		write_count /= journal->flushing_deadline - length + 1;
1235 		write_count = max_t(block_count_t, write_count, 1);
1236 	}
1237 
1238 	for (written = 0; written < write_count; written++) {
1239 		vdo_waitq_notify_next_waiter(&slab->dirty_blocks,
1240 					     launch_reference_block_write, slab);
1241 	}
1242 }
1243 
1244 /**
1245  * reference_count_to_status() - Convert a reference count to a reference status.
1246  * @count: The count to convert.
1247  *
1248  * Return: The appropriate reference status.
1249  */
1250 static enum reference_status __must_check reference_count_to_status(vdo_refcount_t count)
1251 {
1252 	if (count == EMPTY_REFERENCE_COUNT)
1253 		return RS_FREE;
1254 	else if (count == 1)
1255 		return RS_SINGLE;
1256 	else if (count == PROVISIONAL_REFERENCE_COUNT)
1257 		return RS_PROVISIONAL;
1258 	else
1259 		return RS_SHARED;
1260 }
1261 
1262 /**
1263  * dirty_block() - Mark a reference count block as dirty, potentially adding it to the dirty queue
1264  *                 if it wasn't already dirty.
1265  * @block: The reference block to mark as dirty.
1266  */
1267 static void dirty_block(struct reference_block *block)
1268 {
1269 	if (block->is_dirty)
1270 		return;
1271 
1272 	block->is_dirty = true;
1273 	if (!block->is_writing)
1274 		vdo_waitq_enqueue_waiter(&block->slab->dirty_blocks, &block->waiter);
1275 }
1276 
1277 /**
1278  * get_reference_block() - Get the reference block that covers the given block index.
 * @slab: The slab containing the block.
 * @index: The slab block number of the block in question.
1279  */
1280 static struct reference_block * __must_check get_reference_block(struct vdo_slab *slab,
1281 								 slab_block_number index)
1282 {
1283 	return &slab->reference_blocks[index / COUNTS_PER_BLOCK];
1284 }
1285 
1286 /**
1287  * slab_block_number_from_pbn() - Determine the index within the slab of a particular physical
1288  *                                block number.
1289  * @slab: The slab.
1290  * @pbn: The physical block number.
1291  * @slab_block_number_ptr: A pointer to the slab block number.
1292  *
1293  * Return: VDO_SUCCESS or an error code.
1294  */
1295 static int __must_check slab_block_number_from_pbn(struct vdo_slab *slab,
1296 						   physical_block_number_t pbn,
1297 						   slab_block_number *slab_block_number_ptr)
1298 {
1299 	u64 slab_block_number;
1300 
1301 	if (pbn < slab->start)
1302 		return VDO_OUT_OF_RANGE;
1303 
1304 	slab_block_number = pbn - slab->start;
1305 	if (slab_block_number >= slab->allocator->depot->slab_config.data_blocks)
1306 		return VDO_OUT_OF_RANGE;
1307 
1308 	*slab_block_number_ptr = slab_block_number;
1309 	return VDO_SUCCESS;
1310 }
1311 
1312 /**
1313  * get_reference_counter() - Get the reference counter that covers the given physical block number.
1314  * @slab: The slab to query.
1315  * @pbn: The physical block number.
1316  * @counter_ptr: A pointer to hold the address of the reference counter.
 *
 * Return: VDO_SUCCESS, or VDO_OUT_OF_RANGE if the pbn is not within the slab's data blocks.
1317  */
1318 static int __must_check get_reference_counter(struct vdo_slab *slab,
1319 					      physical_block_number_t pbn,
1320 					      vdo_refcount_t **counter_ptr)
1321 {
1322 	slab_block_number index;
1323 	int result = slab_block_number_from_pbn(slab, pbn, &index);
1324 
1325 	if (result != VDO_SUCCESS)
1326 		return result;
1327 
1328 	*counter_ptr = &slab->counters[index];
1329 
1330 	return VDO_SUCCESS;
1331 }
1332 
1333 static unsigned int calculate_slab_priority(struct vdo_slab *slab)
1334 {
1335 	block_count_t free_blocks = slab->free_blocks;
1336 	unsigned int unopened_slab_priority = slab->allocator->unopened_slab_priority;
1337 	unsigned int priority;
1338 
1339 	/*
1340 	 * Wholly full slabs must be the only ones with lowest priority, 0.
1341 	 *
1342 	 * Slabs that have never been opened (empty, newly initialized, and never been written to)
1343 	 * have lower priority than previously opened slabs that have a significant number of free
1344 	 * blocks. This ranking causes VDO to avoid writing physical blocks for the first time
1345 	 * unless there are very few free blocks that have been previously written to.
1346 	 *
1347 	 * Since VDO doesn't discard blocks currently, reusing previously written blocks makes VDO
1348 	 * a better client of any underlying storage that is thinly-provisioned (though discarding
1349 	 * would be better).
1350 	 *
1351 	 * For all other slabs, the priority is derived from the logarithm of the number of free
1352 	 * blocks. Slabs with the same order of magnitude of free blocks have the same priority.
1353 	 * With 2^23 blocks, the priority will range from 1 to 25. The reserved
1354 	 * unopened_slab_priority divides the range and is skipped by the logarithmic mapping.
1355 	 */
1356 
1357 	if (free_blocks == 0)
1358 		return 0;
1359 
1360 	if (is_slab_journal_blank(slab))
1361 		return unopened_slab_priority;
1362 
1363 	priority = (1 + ilog2(free_blocks));
1364 	return ((priority < unopened_slab_priority) ? priority : priority + 1);
1365 }
1366 
1367 /*
1368  * Slabs are essentially prioritized by an approximation of the number of free blocks in the slab
1369  * so slabs with lots of free blocks will be opened for allocation before slabs that have few free
1370  * blocks.
1371  */
1372 static void prioritize_slab(struct vdo_slab *slab)
1373 {
1374 	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
1375 			    "a slab must not already be on a ring when prioritizing");
1376 	slab->priority = calculate_slab_priority(slab);
1377 	vdo_priority_table_enqueue(slab->allocator->prioritized_slabs,
1378 				   slab->priority, &slab->allocq_entry);
1379 }
1380 
1381 /**
1382  * adjust_free_block_count() - Adjust the free block count and (if needed) reprioritize the slab.
 * @slab: The slab whose free block count changed.
1383  * @incremented: true if the free block count went up.
1384  */
1385 static void adjust_free_block_count(struct vdo_slab *slab, bool incremented)
1386 {
1387 	struct block_allocator *allocator = slab->allocator;
1388 
1389 	WRITE_ONCE(allocator->allocated_blocks,
1390 		   allocator->allocated_blocks + (incremented ? -1 : 1));
1391 
1392 	/* The open slab doesn't need to be reprioritized until it is closed. */
1393 	if (slab == allocator->open_slab)
1394 		return;
1395 
1396 	/* Don't bother adjusting the priority table if unneeded. */
1397 	if (slab->priority == calculate_slab_priority(slab))
1398 		return;
1399 
1400 	/*
1401 	 * Reprioritize the slab to reflect the new free block count by removing it from the table
1402 	 * and re-enqueuing it with the new priority.
1403 	 */
1404 	vdo_priority_table_remove(allocator->prioritized_slabs, &slab->allocq_entry);
1405 	prioritize_slab(slab);
1406 }
1407 
1408 /**
1409  * increment_for_data() - Increment the reference count for a data block.
1410  * @slab: The slab which owns the block.
1411  * @block: The reference block which contains the block being updated.
1412  * @block_number: The block to update.
1413  * @old_status: The reference status of the data block before this increment.
1414  * @lock: The pbn_lock associated with this increment (may be NULL).
1415  * @counter_ptr: A pointer to the count for the data block (in, out).
1416  * @adjust_block_count: Whether to update the allocator's free block count.
1417  *
1418  * Return: VDO_SUCCESS or an error.
1419  */
1420 static int increment_for_data(struct vdo_slab *slab, struct reference_block *block,
1421 			      slab_block_number block_number,
1422 			      enum reference_status old_status,
1423 			      struct pbn_lock *lock, vdo_refcount_t *counter_ptr,
1424 			      bool adjust_block_count)
1425 {
1426 	switch (old_status) {
1427 	case RS_FREE:
1428 		*counter_ptr = 1;
1429 		block->allocated_count++;
1430 		slab->free_blocks--;
1431 		if (adjust_block_count)
1432 			adjust_free_block_count(slab, false);
1433 
1434 		break;
1435 
1436 	case RS_PROVISIONAL:
1437 		*counter_ptr = 1;
1438 		break;
1439 
1440 	default:
1441 		/* Single or shared */
1442 		if (*counter_ptr >= MAXIMUM_REFERENCE_COUNT) {
1443 			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
1444 						      "Incrementing a block already having 254 references (slab %u, offset %u)",
1445 						      slab->slab_number, block_number);
1446 		}
1447 		(*counter_ptr)++;
1448 	}
1449 
1450 	if (lock != NULL)
1451 		vdo_unassign_pbn_lock_provisional_reference(lock);
1452 	return VDO_SUCCESS;
1453 }
1454 
1455 /**
1456  * decrement_for_data() - Decrement the reference count for a data block.
1457  * @slab: The slab which owns the block.
1458  * @block: The reference block which contains the block being updated.
1459  * @block_number: The block to update.
1460  * @old_status: The reference status of the data block before this decrement.
1461  * @updater: The reference updater doing this operation in case we need to look up the pbn lock.
1463  * @counter_ptr: A pointer to the count for the data block (in, out).
1464  * @adjust_block_count: Whether to update the allocator's free block count.
1465  *
1466  * Return: VDO_SUCCESS or an error.
1467  */
1468 static int decrement_for_data(struct vdo_slab *slab, struct reference_block *block,
1469 			      slab_block_number block_number,
1470 			      enum reference_status old_status,
1471 			      struct reference_updater *updater,
1472 			      vdo_refcount_t *counter_ptr, bool adjust_block_count)
1473 {
1474 	switch (old_status) {
1475 	case RS_FREE:
1476 		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
1477 					      "Decrementing free block at offset %u in slab %u",
1478 					      block_number, slab->slab_number);
1479 
1480 	case RS_PROVISIONAL:
1481 	case RS_SINGLE:
1482 		if (updater->zpbn.zone != NULL) {
1483 			struct pbn_lock *lock = vdo_get_physical_zone_pbn_lock(updater->zpbn.zone,
1484 									       updater->zpbn.pbn);
1485 
1486 			if (lock != NULL) {
1487 				/*
1488 				 * There is a read lock on this block, so the block must not become
1489 				 * unreferenced.
1490 				 */
1491 				*counter_ptr = PROVISIONAL_REFERENCE_COUNT;
1492 				vdo_assign_pbn_lock_provisional_reference(lock);
1493 				break;
1494 			}
1495 		}
1496 
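		/* No read lock was found, so the block genuinely becomes unreferenced. */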
1497 		*counter_ptr = EMPTY_REFERENCE_COUNT;
1498 		block->allocated_count--;
1499 		slab->free_blocks++;
1500 		if (adjust_block_count)
1501 			adjust_free_block_count(slab, true);
1502 
1503 		break;
1504 
1505 	default:
1506 		/* Shared */
1507 		(*counter_ptr)--;
1508 	}
1509 
1510 	return VDO_SUCCESS;
1511 }
1512 
1513 /**
1514  * increment_for_block_map() - Increment the reference count for a block map page.
1515  * @slab: The slab which owns the block.
1516  * @block: The reference block which contains the block being updated.
1517  * @block_number: The block to update.
1518  * @old_status: The reference status of the block before this increment.
1519  * @lock: The pbn_lock associated with this increment (may be NULL).
1520  * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
1521  * @counter_ptr: A pointer to the count for the block (in, out).
1522  * @adjust_block_count: Whether to update the allocator's free block count.
1523  *
1524  * All block map increments should be from provisional to MAXIMUM_REFERENCE_COUNT. Since block map
1525  * blocks never dedupe they should never be adjusted from any other state. The adjustment always
1526  * results in MAXIMUM_REFERENCE_COUNT as this value is used to prevent dedupe against block map
1527  * blocks.
1528  *
1529  * Return: VDO_SUCCESS or an error.
1530  */
1531 static int increment_for_block_map(struct vdo_slab *slab, struct reference_block *block,
1532 				   slab_block_number block_number,
1533 				   enum reference_status old_status,
1534 				   struct pbn_lock *lock, bool normal_operation,
1535 				   vdo_refcount_t *counter_ptr, bool adjust_block_count)
1536 {
1537 	switch (old_status) {
1538 	case RS_FREE:
1539 		if (normal_operation) {
1540 			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
1541 						      "Incrementing unallocated block map block (slab %u, offset %u)",
1542 						      slab->slab_number, block_number);
1543 		}
1544 
1545 		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
1546 		block->allocated_count++;
1547 		slab->free_blocks--;
1548 		if (adjust_block_count)
1549 			adjust_free_block_count(slab, false);
1550 
1551 		return VDO_SUCCESS;
1552 
1553 	case RS_PROVISIONAL:
1554 		if (!normal_operation)
1555 			return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
1556 						      "Block map block had provisional reference during replay (slab %u, offset %u)",
1557 						      slab->slab_number, block_number);
1558 
1559 		*counter_ptr = MAXIMUM_REFERENCE_COUNT;
1560 		if (lock != NULL)
1561 			vdo_unassign_pbn_lock_provisional_reference(lock);
1562 		return VDO_SUCCESS;
1563 
1564 	default:
1565 		return vdo_log_error_strerror(VDO_REF_COUNT_INVALID,
1566 					      "Incrementing a block map block which is already referenced %u times (slab %u, offset %u)",
1567 					      *counter_ptr, slab->slab_number,
1568 					      block_number);
1569 	}
1570 }
1571 
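/**
 * is_valid_journal_point() - Check whether a journal point refers to a real journal entry.
 * @point: The journal point to check (may be NULL).
 *
 * Return: true if the point is non-NULL and has a nonzero sequence number (0 is never used).
 */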
1572 static bool __must_check is_valid_journal_point(const struct journal_point *point)
1573 {
1574 	return ((point != NULL) && (point->sequence_number > 0));
1575 }
1576 
1577 /**
1578  * update_reference_count() - Update the reference count of a block.
1579  * @slab: The slab which owns the block.
1580  * @block: The reference block which contains the block being updated.
1581  * @block_number: The block to update.
1582  * @slab_journal_point: The slab journal point at which this update is journaled.
1583  * @updater: The reference updater.
1584  * @normal_operation: Whether we are in normal operation vs. recovery or rebuild.
1585  * @adjust_block_count: Whether to update the slab's free block count.
1586  * @provisional_decrement_ptr: A pointer which will be set to true if this update was a decrement
1587  *                             of a provisional reference.
1588  *
1589  * Return: VDO_SUCCESS or an error.
1590  */
1591 static int update_reference_count(struct vdo_slab *slab, struct reference_block *block,
1592 				  slab_block_number block_number,
1593 				  const struct journal_point *slab_journal_point,
1594 				  struct reference_updater *updater,
1595 				  bool normal_operation, bool adjust_block_count,
1596 				  bool *provisional_decrement_ptr)
1597 {
1598 	vdo_refcount_t *counter_ptr = &slab->counters[block_number];
1599 	enum reference_status old_status = reference_count_to_status(*counter_ptr);
1600 	int result;
1601 
1602 	if (!updater->increment) {
1603 		result = decrement_for_data(slab, block, block_number, old_status,
1604 					    updater, counter_ptr, adjust_block_count);
1605 		if ((result == VDO_SUCCESS) && (old_status == RS_PROVISIONAL)) {
1606 			if (provisional_decrement_ptr != NULL)
1607 				*provisional_decrement_ptr = true;
1608 			return VDO_SUCCESS;
1609 		}
1610 	} else if (updater->operation == VDO_JOURNAL_DATA_REMAPPING) {
1611 		result = increment_for_data(slab, block, block_number, old_status,
1612 					    updater->lock, counter_ptr, adjust_block_count);
1613 	} else {
1614 		result = increment_for_block_map(slab, block, block_number, old_status,
1615 						 updater->lock, normal_operation,
1616 						 counter_ptr, adjust_block_count);
1617 	}
1618 
1619 	if (result != VDO_SUCCESS)
1620 		return result;
1621 
1622 	if (is_valid_journal_point(slab_journal_point))
1623 		slab->slab_journal_point = *slab_journal_point;
1624 
1625 	return VDO_SUCCESS;
1626 }
1627 
1628 static int __must_check adjust_reference_count(struct vdo_slab *slab,
1629 					       struct reference_updater *updater,
1630 					       const struct journal_point *slab_journal_point)
1631 {
1632 	slab_block_number block_number;
1633 	int result;
1634 	struct reference_block *block;
1635 	bool provisional_decrement = false;
1636 
1637 	if (!is_slab_open(slab))
1638 		return VDO_INVALID_ADMIN_STATE;
1639 
1640 	result = slab_block_number_from_pbn(slab, updater->zpbn.pbn, &block_number);
1641 	if (result != VDO_SUCCESS)
1642 		return result;
1643 
1644 	block = get_reference_block(slab, block_number);
1645 	result = update_reference_count(slab, block, block_number, slab_journal_point,
1646 					updater, NORMAL_OPERATION, true,
1647 					&provisional_decrement);
1648 	if ((result != VDO_SUCCESS) || provisional_decrement)
1649 		return result;
1650 
1651 	if (block->is_dirty && (block->slab_journal_lock > 0)) {
1652 		sequence_number_t entry_lock = slab_journal_point->sequence_number;
1653 		/*
1654 		 * This block is already dirty and a slab journal entry has been made for it since
1655 		 * the last time it was clean. We must release the per-entry slab journal lock for
1656 		 * the entry associated with the update we are now doing.
1657 		 */
1658 		result = VDO_ASSERT(is_valid_journal_point(slab_journal_point),
1659 				    "Reference count adjustments need slab journal points.");
1660 		if (result != VDO_SUCCESS)
1661 			return result;
1662 
1663 		adjust_slab_journal_block_reference(&slab->journal, entry_lock, -1);
1664 		return VDO_SUCCESS;
1665 	}
1666 
1667 	/*
1668 	 * This may be the first time we are applying an update for which there is a slab journal
1669 	 * entry to this block since the block was cleaned. Therefore, we convert the per-entry
1670 	 * slab journal lock to an uncommitted reference block lock, if there is a per-entry lock.
1671 	 */
1672 	if (is_valid_journal_point(slab_journal_point))
1673 		block->slab_journal_lock = slab_journal_point->sequence_number;
1674 	else
1675 		block->slab_journal_lock = 0;
1676 
1677 	dirty_block(block);
1678 	return VDO_SUCCESS;
1679 }
1680 
1681 /**
1682  * add_entry_from_waiter() - Add an entry to the slab journal.
1683  * @waiter: The vio which should make an entry now.
1684  * @context: The slab journal to make an entry in.
1685  *
1686  * This callback is invoked by add_entries() once it has determined that we are ready to make
1687  * another entry in the slab journal. Implements waiter_callback_fn.
1688  */
1689 static void add_entry_from_waiter(struct vdo_waiter *waiter, void *context)
1690 {
1691 	int result;
1692 	struct reference_updater *updater =
1693 		container_of(waiter, struct reference_updater, waiter);
1694 	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
1695 	struct slab_journal *journal = context;
1696 	struct slab_journal_block_header *header = &journal->tail_header;
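	/* The journal point at which this entry will be added to the tail block. */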
1697 	struct journal_point slab_journal_point = {
1698 		.sequence_number = header->sequence_number,
1699 		.entry_count = header->entry_count,
1700 	};
1701 	sequence_number_t recovery_block = data_vio->recovery_journal_point.sequence_number;
1702 
1703 	if (header->entry_count == 0) {
1704 		/*
1705 		 * This is the first entry in the current tail block, so get a lock on the recovery
1706 		 * journal which we will hold until this tail block is committed.
1707 		 */
1708 		get_lock(journal, header->sequence_number)->recovery_start = recovery_block;
1709 		if (journal->recovery_journal != NULL) {
1710 			zone_count_t zone_number = journal->slab->allocator->zone_number;
1711 
1712 			vdo_acquire_recovery_journal_block_reference(journal->recovery_journal,
1713 								     recovery_block,
1714 								     VDO_ZONE_TYPE_PHYSICAL,
1715 								     zone_number);
1716 		}
1717 
1718 		mark_slab_journal_dirty(journal, recovery_block);
1719 		reclaim_journal_space(journal);
1720 	}
1721 
1722 	add_entry(journal, updater->zpbn.pbn, updater->operation, updater->increment,
1723 		  expand_journal_point(data_vio->recovery_journal_point,
1724 				       updater->increment));
1725 
1726 	if (journal->slab->status != VDO_SLAB_REBUILT) {
1727 		/*
1728 		 * If the slab is unrecovered, scrubbing will take care of the count since the
1729 		 * update is now recorded in the journal.
1730 		 */
1731 		adjust_slab_journal_block_reference(journal,
1732 						    slab_journal_point.sequence_number, -1);
1733 		result = VDO_SUCCESS;
1734 	} else {
1735 		/* Now that an entry has been made in the slab journal, update the counter. */
1736 		result = adjust_reference_count(journal->slab, updater,
1737 						&slab_journal_point);
1738 	}
1739 
1740 	if (updater->increment)
1741 		continue_data_vio_with_error(data_vio, result);
1742 	else
1743 		vdo_continue_completion(&data_vio->decrement_completion, result);
1744 }
1745 
1746 /**
1747  * is_next_entry_a_block_map_increment() - Check whether the next entry to be made is a block map
1748  *                                         increment.
1749  * @journal: The journal.
1750  *
1751  * Return: true if the first entry waiter's operation is a block map increment.
1752  */
1753 static inline bool is_next_entry_a_block_map_increment(struct slab_journal *journal)
1754 {
1755 	struct vdo_waiter *waiter = vdo_waitq_get_first_waiter(&journal->entry_waiters);
1756 	struct reference_updater *updater =
1757 		container_of(waiter, struct reference_updater, waiter);
1758 
1759 	return (updater->operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING);
1760 }
1761 
1762 /**
1763  * add_entries() - Add as many entries as possible from the queue of vios waiting to make entries.
1764  * @journal: The journal to which entries may be added.
1765  *
1766  * By processing the queue in order, we ensure that slab journal entries are made in the same order
1767  * as recovery journal entries for the same increment or decrement.
1768  */
1769 static void add_entries(struct slab_journal *journal)
1770 {
1771 	if (journal->adding_entries) {
1772 		/* Protect against re-entrancy. */
1773 		return;
1774 	}
1775 
1776 	journal->adding_entries = true;
1777 	while (vdo_waitq_has_waiters(&journal->entry_waiters)) {
1778 		struct slab_journal_block_header *header = &journal->tail_header;
1779 
1780 		if (journal->partial_write_in_progress ||
1781 		    (journal->slab->status == VDO_SLAB_REBUILDING)) {
1782 			/*
1783 			 * Don't add entries while rebuilding or while a partial write is
1784 			 * outstanding, as it could result in reference count corruption.
1785 			 */
1786 			break;
1787 		}
1788 
1789 		if (journal->waiting_to_commit) {
1790 			/*
1791 			 * If we are waiting for resources to write the tail block, and the tail
1792 			 * block is full, we can't make another entry.
1793 			 */
1794 			WRITE_ONCE(journal->events->tail_busy_count,
1795 				   journal->events->tail_busy_count + 1);
1796 			break;
1797 		} else if (is_next_entry_a_block_map_increment(journal) &&
1798 			   (header->entry_count >= journal->full_entries_per_block)) {
1799 			/*
1800 			 * The tail block does not have room for a block map increment, so commit
1801 			 * it now.
1802 			 */
1803 			commit_tail(journal);
1804 			if (journal->waiting_to_commit) {
1805 				WRITE_ONCE(journal->events->tail_busy_count,
1806 					   journal->events->tail_busy_count + 1);
1807 				break;
1808 			}
1809 		}
1810 
1811 		/* If the slab is over the blocking threshold, make the vio wait. */
1812 		if (requires_reaping(journal)) {
1813 			WRITE_ONCE(journal->events->blocked_count,
1814 				   journal->events->blocked_count + 1);
1815 			save_dirty_reference_blocks(journal->slab);
1816 			break;
1817 		}
1818 
1819 		if (header->entry_count == 0) {
1820 			struct journal_lock *lock =
1821 				get_lock(journal, header->sequence_number);
1822 
1823 			/*
1824 			 * Check if the on disk slab journal is full. Because of the blocking and
1825 			 * scrubbing thresholds, this should never happen.
1826 			 */
1827 			if (lock->count > 0) {
1828 				VDO_ASSERT_LOG_ONLY((journal->head + journal->size) == journal->tail,
1829 						    "New block has locks, but journal is not full");
1830 
1831 				/*
1832 				 * The blocking threshold must let the journal fill up if the new
1833 				 * block has locks; if the blocking threshold is smaller than the
1834 				 * journal size, the new block cannot possibly have locks already.
1835 				 */
1836 				VDO_ASSERT_LOG_ONLY((journal->blocking_threshold >= journal->size),
1837 						    "New block can have locks already iff blocking threshold is at the end of the journal");
1838 
1839 				WRITE_ONCE(journal->events->disk_full_count,
1840 					   journal->events->disk_full_count + 1);
1841 				save_dirty_reference_blocks(journal->slab);
1842 				break;
1843 			}
1844 
1845 			/*
1846 			 * Don't allow the new block to be reaped until all of the reference count
1847 			 * blocks are written and the journal block has been fully committed as
1848 			 * well.
1849 			 */
1850 			lock->count = journal->entries_per_block + 1;
1851 
1852 			if (header->sequence_number == 1) {
1853 				struct vdo_slab *slab = journal->slab;
1854 				block_count_t i;
1855 
1856 				/*
1857 				 * This is the first entry in this slab journal, ever. Dirty all of
1858 				 * the reference count blocks. Each will acquire a lock on the tail
1859 				 * block so that the journal won't be reaped until the reference
1860 				 * counts are initialized. The journal block's lock count is
1861 				 * raised just below by one for each of those reference block
1862 				 * locks.
1863 				 */
1864 				for (i = 0; i < slab->reference_block_count; i++) {
1865 					slab->reference_blocks[i].slab_journal_lock = 1;
1866 					dirty_block(&slab->reference_blocks[i]);
1867 				}
1868 
1869 				adjust_slab_journal_block_reference(journal, 1,
1870 								    slab->reference_block_count);
1871 			}
1872 		}
1873 
1874 		vdo_waitq_notify_next_waiter(&journal->entry_waiters,
1875 					     add_entry_from_waiter, journal);
1876 	}
1877 
1878 	journal->adding_entries = false;
1879 
1880 	/* If there are no waiters, and we are flushing or saving, commit the tail block. */
1881 	if (vdo_is_state_draining(&journal->slab->state) &&
1882 	    !vdo_is_state_suspending(&journal->slab->state) &&
1883 	    !vdo_waitq_has_waiters(&journal->entry_waiters))
1884 		commit_tail(journal);
1885 }
1886 
1887 /**
1888  * reset_search_cursor() - Reset the free block search back to the first reference counter in the
1889  *                         first reference block of a slab.
1890  */
1891 static void reset_search_cursor(struct vdo_slab *slab)
1892 {
1893 	struct search_cursor *cursor = &slab->search_cursor;
1894 
1895 	cursor->block = cursor->first_block;
1896 	cursor->index = 0;
1897 	/* Unit tests have slabs with only one reference block (and it's a runt). */
1898 	cursor->end_index = min_t(u32, COUNTS_PER_BLOCK, slab->block_count);
1899 }
1900 
1901 /**
1902  * advance_search_cursor() - Advance the search cursor to the start of the next reference block in
1903  *                           a slab.
1904  *
1905  * Wraps around to the first reference block if the current block is the last reference block.
1906  *
1907  * Return: true unless the cursor was at the last reference block.
1908  */
1909 static bool advance_search_cursor(struct vdo_slab *slab)
1910 {
1911 	struct search_cursor *cursor = &slab->search_cursor;
1912 
1913 	/*
1914 	 * If we just finished searching the last reference block, then wrap back around to the
1915 	 * start of the array.
1916 	 */
1917 	if (cursor->block == cursor->last_block) {
1918 		reset_search_cursor(slab);
1919 		return false;
1920 	}
1921 
1922 	/* We're not already at the end, so advance the cursor to the next block. */
1923 	cursor->block++;
1924 	cursor->index = cursor->end_index;
1925 
1926 	if (cursor->block == cursor->last_block) {
1927 		/* The last reference block will usually be a runt. */
1928 		cursor->end_index = slab->block_count;
1929 	} else {
1930 		cursor->end_index += COUNTS_PER_BLOCK;
1931 	}
1932 
1933 	return true;
1934 }
1935 
1936 /**
1937  * vdo_adjust_reference_count_for_rebuild() - Adjust the reference count of a block during rebuild.
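 * @depot: The slab depot.
 * @pbn: The physical block number of the block to adjust.
 * @operation: The journal operation requiring the adjustment.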
1938  *
1939  * Return: VDO_SUCCESS or an error.
1940  */
1941 int vdo_adjust_reference_count_for_rebuild(struct slab_depot *depot,
1942 					   physical_block_number_t pbn,
1943 					   enum journal_operation operation)
1944 {
1945 	int result;
1946 	slab_block_number block_number;
1947 	struct reference_block *block;
1948 	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
1949 	struct reference_updater updater = {
1950 		.operation = operation,
1951 		.increment = true,
1952 	};
1953 
1954 	result = slab_block_number_from_pbn(slab, pbn, &block_number);
1955 	if (result != VDO_SUCCESS)
1956 		return result;
1957 
1958 	block = get_reference_block(slab, block_number);
1959 	result = update_reference_count(slab, block, block_number, NULL,
1960 					&updater, !NORMAL_OPERATION, false, NULL);
1961 	if (result != VDO_SUCCESS)
1962 		return result;
1963 
1964 	dirty_block(block);
1965 	return VDO_SUCCESS;
1966 }
1967 
1968 /**
1969  * replay_reference_count_change() - Replay the reference count adjustment from a slab journal
1970  *                                   entry into the reference count for a block.
1971  * @slab: The slab.
1972  * @entry_point: The slab journal point for the entry.
1973  * @entry: The slab journal entry being replayed.
1974  *
1975  * The adjustment will be ignored if it was already recorded in the reference count.
1976  *
1977  * Return: VDO_SUCCESS or an error code.
1978  */
1979 static int replay_reference_count_change(struct vdo_slab *slab,
1980 					 const struct journal_point *entry_point,
1981 					 struct slab_journal_entry entry)
1982 {
1983 	int result;
1984 	struct reference_block *block = get_reference_block(slab, entry.sbn);
1985 	sector_count_t sector = (entry.sbn % COUNTS_PER_BLOCK) / COUNTS_PER_SECTOR;
1986 	struct reference_updater updater = {
1987 		.operation = entry.operation,
1988 		.increment = entry.increment,
1989 	};
1990 
1991 	if (!vdo_before_journal_point(&block->commit_points[sector], entry_point)) {
1992 		/* This entry is already reflected in the existing counts, so do nothing. */
1993 		return VDO_SUCCESS;
1994 	}
1995 
1996 	/* This entry is not yet counted in the reference counts. */
1997 	result = update_reference_count(slab, block, entry.sbn, entry_point,
1998 					&updater, !NORMAL_OPERATION, false, NULL);
1999 	if (result != VDO_SUCCESS)
2000 		return result;
2001 
2002 	dirty_block(block);
2003 	return VDO_SUCCESS;
2004 }
2005 
2006 /**
2007  * find_zero_byte_in_word() - Find the array index of the first zero byte in a word-sized range of
2008  *                            reference counters.
2009  * @word_ptr: A pointer to the eight counter bytes to check.
2010  * @start_index: The array index corresponding to word_ptr[0].
2011  * @fail_index: The array index to return if no zero byte is found.
2012  *
2013  * The search does no bounds checking; the function relies on the array being sufficiently padded.
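 *
 * For example, with counter bytes {3, 1, 0, 2, 5, 5, 5, 5}, the result is start_index + 2.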
2014  *
2015  * Return: The array index of the first zero byte in the word, or the value passed as fail_index if
2016  *         no zero byte was found.
2017  */
2018 static inline slab_block_number find_zero_byte_in_word(const u8 *word_ptr,
2019 						       slab_block_number start_index,
2020 						       slab_block_number fail_index)
2021 {
2022 	u64 word = get_unaligned_le64(word_ptr);
2023 
2024 	/* This looks like a loop, but GCC will unroll the eight iterations for us. */
2025 	unsigned int offset;
2026 
2027 	for (offset = 0; offset < BYTES_PER_WORD; offset++) {
2028 		/* Assumes little-endian byte order, which we have on X86. */
2029 		if ((word & 0xFF) == 0)
2030 			return (start_index + offset);
2031 		word >>= 8;
2032 	}
2033 
2034 	return fail_index;
2035 }
2036 
2037 /**
2038  * find_free_block() - Find the first block with a reference count of zero in the specified
2039  *                     range of reference counter indexes.
2040  * @slab: The slab counters to scan.
2041  * @index_ptr: A pointer to hold the array index of the free block.
2042  *
2043  * Exposed for unit testing.
2044  *
2045  * Return: true if a free block was found in the specified range.
2046  */
2047 static bool find_free_block(const struct vdo_slab *slab, slab_block_number *index_ptr)
2048 {
2049 	slab_block_number zero_index;
2050 	slab_block_number next_index = slab->search_cursor.index;
2051 	slab_block_number end_index = slab->search_cursor.end_index;
2052 	u8 *next_counter = &slab->counters[next_index];
2053 	u8 *end_counter = &slab->counters[end_index];
2054 
2055 	/*
2056 	 * Search every byte of the first unaligned word. (Array is padded so reading past end is
2057 	 * safe.)
2058 	 */
2059 	zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
2060 	if (zero_index < end_index) {
2061 		*index_ptr = zero_index;
2062 		return true;
2063 	}
2064 
2065 	/*
2066 	 * On architectures where unaligned word access is expensive, this would be a good place to
2067 	 * advance to an alignment boundary.
2068 	 */
2069 	next_index += BYTES_PER_WORD;
2070 	next_counter += BYTES_PER_WORD;
2071 
2072 	/*
2073 	 * Now continue checking one word at a time until we find a word containing a zero.
2074 	 * (Array is padded so reading past end is safe.)
2075 	 */
2076 	while (next_counter < end_counter) {
2077 		/*
2078 		 * The following code is currently an exact copy of the code preceding the loop,
2079 		 * but if you try to merge them by using a do loop, it runs slower because a jump
2080 		 * instruction gets added at the start of the iteration.
2081 		 */
2082 		zero_index = find_zero_byte_in_word(next_counter, next_index, end_index);
2083 		if (zero_index < end_index) {
2084 			*index_ptr = zero_index;
2085 			return true;
2086 		}
2087 
2088 		next_index += BYTES_PER_WORD;
2089 		next_counter += BYTES_PER_WORD;
2090 	}
2091 
2092 	return false;
2093 }
2094 
2095 /**
2096  * search_current_reference_block() - Search the reference block currently saved in the search
2097  *                                    cursor for a reference count of zero, starting at the saved
2098  *                                    counter index.
2099  * @slab: The slab to search.
2100  * @free_index_ptr: A pointer to receive the array index of the zero reference count.
2101  *
2102  * Return: true if an unreferenced counter was found.
2103  */
2104 static bool search_current_reference_block(const struct vdo_slab *slab,
2105 					   slab_block_number *free_index_ptr)
2106 {
2107 	/* Don't bother searching if the current block is known to be full. */
2108 	return ((slab->search_cursor.block->allocated_count < COUNTS_PER_BLOCK) &&
2109 		find_free_block(slab, free_index_ptr));
2110 }
2111 
2112 /**
2113  * search_reference_blocks() - Search each reference block for a reference count of zero.
2114  * @slab: The slab to search.
2115  * @free_index_ptr: A pointer to receive the array index of the zero reference count.
2116  *
2117  * Searches each reference block for a reference count of zero, starting at the reference block and
2118  * counter index saved in the search cursor and searching up to the end of the last reference
2119  * block. The search does not wrap.
2120  *
2121  * Return: true if an unreferenced counter was found.
2122  */
2123 static bool search_reference_blocks(struct vdo_slab *slab,
2124 				    slab_block_number *free_index_ptr)
2125 {
2126 	/* Start searching at the saved search position in the current block. */
2127 	if (search_current_reference_block(slab, free_index_ptr))
2128 		return true;
2129 
2130 	/* Search each reference block up to the end of the slab. */
2131 	while (advance_search_cursor(slab)) {
2132 		if (search_current_reference_block(slab, free_index_ptr))
2133 			return true;
2134 	}
2135 
2136 	return false;
2137 }
2138 
2139 /**
2140  * make_provisional_reference() - Do the bookkeeping for making a provisional reference.
2141  */
2142 static void make_provisional_reference(struct vdo_slab *slab,
2143 				       slab_block_number block_number)
2144 {
2145 	struct reference_block *block = get_reference_block(slab, block_number);
2146 
2147 	/*
2148 	 * Make the initial transition from an unreferenced block to a
2149 	 * provisionally allocated block.
2150 	 */
2151 	slab->counters[block_number] = PROVISIONAL_REFERENCE_COUNT;
2152 
2153 	/* Account for the allocation. */
2154 	block->allocated_count++;
2155 	slab->free_blocks--;
2156 }
2157 
2158 /**
2159  * dirty_all_reference_blocks() - Mark all reference count blocks in a slab as dirty.
2160  */
2161 static void dirty_all_reference_blocks(struct vdo_slab *slab)
2162 {
2163 	block_count_t i;
2164 
2165 	for (i = 0; i < slab->reference_block_count; i++)
2166 		dirty_block(&slab->reference_blocks[i]);
2167 }
2168 
2169 /**
2170  * clear_provisional_references() - Clear the provisional reference counts from a reference block.
2171  * @block: The block to clear.
2172  */
2173 static void clear_provisional_references(struct reference_block *block)
2174 {
2175 	vdo_refcount_t *counters = get_reference_counters_for_block(block);
2176 	block_count_t j;
2177 
2178 	for (j = 0; j < COUNTS_PER_BLOCK; j++) {
2179 		if (counters[j] == PROVISIONAL_REFERENCE_COUNT) {
2180 			counters[j] = EMPTY_REFERENCE_COUNT;
2181 			block->allocated_count--;
2182 		}
2183 	}
2184 }
2185 
2186 static inline bool journal_points_equal(struct journal_point first,
2187 					struct journal_point second)
2188 {
2189 	return ((first.sequence_number == second.sequence_number) &&
2190 		(first.entry_count == second.entry_count));
2191 }
2192 
2193 /**
2194  * unpack_reference_block() - Unpack a packed reference block into the internal memory structure.
2195  * @packed: The written reference block to be unpacked.
2196  * @block: The internal reference block to be loaded.
2197  */
2198 static void unpack_reference_block(struct packed_reference_block *packed,
2199 				   struct reference_block *block)
2200 {
2201 	block_count_t index;
2202 	sector_count_t i;
2203 	struct vdo_slab *slab = block->slab;
2204 	vdo_refcount_t *counters = get_reference_counters_for_block(block);
2205 
2206 	for (i = 0; i < VDO_SECTORS_PER_BLOCK; i++) {
2207 		struct packed_reference_sector *sector = &packed->sectors[i];
2208 
2209 		vdo_unpack_journal_point(&sector->commit_point, &block->commit_points[i]);
2210 		memcpy(counters + (i * COUNTS_PER_SECTOR), sector->counts,
2211 		       (sizeof(vdo_refcount_t) * COUNTS_PER_SECTOR));
2212 		/* The slab_journal_point must be the latest point found in any sector. */
2213 		if (vdo_before_journal_point(&slab->slab_journal_point,
2214 					     &block->commit_points[i]))
2215 			slab->slab_journal_point = block->commit_points[i];
2216 
2217 		if ((i > 0) &&
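		/*
		 * If a later sector's commit point differs from the first sector's, only part of
		 * this reference block reached storage before an interruption.
		 */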
2218 		    !journal_points_equal(block->commit_points[0],
2219 					  block->commit_points[i])) {
2220 			size_t block_index = block - block->slab->reference_blocks;
2221 
2222 			vdo_log_warning("Torn write detected in sector %u of reference block %zu of slab %u",
2223 					i, block_index, block->slab->slab_number);
2224 		}
2225 	}
2226 
2227 	block->allocated_count = 0;
2228 	for (index = 0; index < COUNTS_PER_BLOCK; index++) {
2229 		if (counters[index] != EMPTY_REFERENCE_COUNT)
2230 			block->allocated_count++;
2231 	}
2232 }
2233 
2234 /**
2235  * finish_reference_block_load() - After a reference block has been read, unpack it.
2236  * @completion: The VIO that just finished reading.
2237  */
2238 static void finish_reference_block_load(struct vdo_completion *completion)
2239 {
2240 	struct vio *vio = as_vio(completion);
2241 	struct pooled_vio *pooled = vio_as_pooled_vio(vio);
2242 	struct reference_block *block = completion->parent;
2243 	struct vdo_slab *slab = block->slab;
2244 
2245 	unpack_reference_block((struct packed_reference_block *) vio->data, block);
2246 	return_vio_to_pool(slab->allocator->vio_pool, pooled);
2247 	slab->active_count--;
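	/* Provisional references found on disk belong to operations which never completed. */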
2248 	clear_provisional_references(block);
2249 
2250 	slab->free_blocks -= block->allocated_count;
2251 	check_if_slab_drained(slab);
2252 }
2253 
2254 static void load_reference_block_endio(struct bio *bio)
2255 {
2256 	struct vio *vio = bio->bi_private;
2257 	struct reference_block *block = vio->completion.parent;
2258 
2259 	continue_vio_after_io(vio, finish_reference_block_load,
2260 			      block->slab->allocator->thread_id);
2261 }
2262 
2263 /**
2264  * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the
2265  *                          block.
2266  * @waiter: The waiter of the block to load.
2267  * @context: The VIO returned by the pool.
2268  */
2269 static void load_reference_block(struct vdo_waiter *waiter, void *context)
2270 {
2271 	struct pooled_vio *pooled = context;
2272 	struct vio *vio = &pooled->vio;
2273 	struct reference_block *block =
2274 		container_of(waiter, struct reference_block, waiter);
2275 	size_t block_offset = (block - block->slab->reference_blocks);
2276 
2277 	vio->completion.parent = block;
2278 	vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset,
2279 				load_reference_block_endio, handle_io_error,
2280 				REQ_OP_READ);
2281 }
2282 
2283 /**
2284  * load_reference_blocks() - Load a slab's reference blocks from the underlying storage into the
2285  *                           slab's pre-allocated reference counters.
2286  */
2287 static void load_reference_blocks(struct vdo_slab *slab)
2288 {
2289 	block_count_t i;
2290 
2291 	slab->free_blocks = slab->block_count;
2292 	slab->active_count = slab->reference_block_count;
2293 	for (i = 0; i < slab->reference_block_count; i++) {
2294 		struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter;
2295 
2296 		waiter->callback = load_reference_block;
2297 		acquire_vio_from_pool(slab->allocator->vio_pool, waiter);
2298 	}
2299 }
2300 
2301 /**
2302  * drain_slab() - Drain all reference count I/O.
2303  *
2304  * Depending upon the type of drain being performed (as recorded in the slab's admin state), the
2305  * reference blocks may be loaded from disk or dirty reference blocks may be written out.
2306  */
2307 static void drain_slab(struct vdo_slab *slab)
2308 {
2309 	bool save;
2310 	bool load;
2311 	const struct admin_state_code *state = vdo_get_admin_state_code(&slab->state);
2312 
2313 	if (state == VDO_ADMIN_STATE_SUSPENDING)
2314 		return;
2315 
2316 	if ((state != VDO_ADMIN_STATE_REBUILDING) &&
2317 	    (state != VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING))
2318 		commit_tail(&slab->journal);
2319 
2320 	if ((state == VDO_ADMIN_STATE_RECOVERING) || (slab->counters == NULL))
2321 		return;
2322 
2323 	save = false;
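	/* The summary's load_ref_counts flag records whether these counters were ever written out. */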
2324 	load = slab->allocator->summary_entries[slab->slab_number].load_ref_counts;
2325 	if (state == VDO_ADMIN_STATE_SCRUBBING) {
2326 		if (load) {
2327 			load_reference_blocks(slab);
2328 			return;
2329 		}
2330 	} else if (state == VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING) {
2331 		if (!load) {
2332 			/* These reference counts were never written, so mark them all dirty. */
2333 			dirty_all_reference_blocks(slab);
2334 		}
2335 		save = true;
2336 	} else if (state == VDO_ADMIN_STATE_REBUILDING) {
2337 		/*
2338 		 * Write out the counters if the slab has written them before, or it has any
2339 		 * non-zero reference counts, or there are any slab journal blocks.
2340 		 */
2341 		block_count_t data_blocks = slab->allocator->depot->slab_config.data_blocks;
2342 
2343 		if (load || (slab->free_blocks != data_blocks) ||
2344 		    !is_slab_journal_blank(slab)) {
2345 			dirty_all_reference_blocks(slab);
2346 			save = true;
2347 		}
2348 	} else if (state == VDO_ADMIN_STATE_SAVING) {
2349 		save = (slab->status == VDO_SLAB_REBUILT);
2350 	} else {
2351 		vdo_finish_draining_with_result(&slab->state, VDO_SUCCESS);
2352 		return;
2353 	}
2354 
2355 	if (save)
2356 		save_dirty_reference_blocks(slab);
2357 }
2358 
2359 static int allocate_slab_counters(struct vdo_slab *slab)
2360 {
2361 	int result;
2362 	size_t index, bytes;
2363 
2364 	result = VDO_ASSERT(slab->reference_blocks == NULL,
2365 			    "vdo_slab %u doesn't allocate refcounts twice",
2366 			    slab->slab_number);
2367 	if (result != VDO_SUCCESS)
2368 		return result;
2369 
2370 	result = vdo_allocate(slab->reference_block_count, struct reference_block,
2371 			      __func__, &slab->reference_blocks);
2372 	if (result != VDO_SUCCESS)
2373 		return result;
2374 
2375 	/*
2376 	 * Allocate such that the runt slab has a full-length memory array, plus a little padding
2377 	 * so we can word-search even at the very end.
2378 	 */
2379 	bytes = (slab->reference_block_count * COUNTS_PER_BLOCK) + (2 * BYTES_PER_WORD);
2380 	result = vdo_allocate(bytes, vdo_refcount_t, "ref counts array",
2381 			      &slab->counters);
2382 	if (result != VDO_SUCCESS) {
2383 		vdo_free(vdo_forget(slab->reference_blocks));
2384 		return result;
2385 	}
2386 
2387 	slab->search_cursor.first_block = slab->reference_blocks;
2388 	slab->search_cursor.last_block = &slab->reference_blocks[slab->reference_block_count - 1];
2389 	reset_search_cursor(slab);
2390 
2391 	for (index = 0; index < slab->reference_block_count; index++) {
2392 		slab->reference_blocks[index] = (struct reference_block) {
2393 			.slab = slab,
2394 		};
2395 	}
2396 
2397 	return VDO_SUCCESS;
2398 }
2399 
2400 static int allocate_counters_if_clean(struct vdo_slab *slab)
2401 {
2402 	if (vdo_is_state_clean_load(&slab->state))
2403 		return allocate_slab_counters(slab);
2404 
2405 	return VDO_SUCCESS;
2406 }
2407 
2408 static void finish_loading_journal(struct vdo_completion *completion)
2409 {
2410 	struct vio *vio = as_vio(completion);
2411 	struct slab_journal *journal = completion->parent;
2412 	struct vdo_slab *slab = journal->slab;
2413 	struct packed_slab_journal_block *block = (struct packed_slab_journal_block *) vio->data;
2414 	struct slab_journal_block_header header;
2415 
2416 	vdo_unpack_slab_journal_block_header(&block->header, &header);
2417 
2418 	/* FIXME: should it be an error if the following conditional fails? */
2419 	if ((header.metadata_type == VDO_METADATA_SLAB_JOURNAL) &&
2420 	    (header.nonce == slab->allocator->nonce)) {
2421 		journal->tail = header.sequence_number + 1;
2422 
2423 		/*
2424 		 * If the slab is clean, this implies the slab journal is empty, so advance the
2425 		 * head appropriately.
2426 		 */
2427 		journal->head = (slab->allocator->summary_entries[slab->slab_number].is_dirty ?
2428 				 header.head : journal->tail);
2429 		journal->tail_header = header;
2430 		initialize_journal_state(journal);
2431 	}
2432 
2433 	return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio));
2434 	vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab));
2435 }
2436 
2437 static void read_slab_journal_tail_endio(struct bio *bio)
2438 {
2439 	struct vio *vio = bio->bi_private;
2440 	struct slab_journal *journal = vio->completion.parent;
2441 
2442 	continue_vio_after_io(vio, finish_loading_journal,
2443 			      journal->slab->allocator->thread_id);
2444 }
2445 
2446 static void handle_load_error(struct vdo_completion *completion)
2447 {
2448 	int result = completion->result;
2449 	struct slab_journal *journal = completion->parent;
2450 	struct vio *vio = as_vio(completion);
2451 
2452 	vio_record_metadata_io_error(vio);
2453 	return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio));
2454 	vdo_finish_loading_with_result(&journal->slab->state, result);
2455 }
2456 
2457 /**
2458  * read_slab_journal_tail() - Read the slab journal tail block by using a vio acquired from the vio
2459  *                            pool.
2460  * @waiter: The vio pool waiter which has just been notified.
2461  * @context: The vio pool entry given to the waiter.
2462  *
2463  * This is the success callback from acquire_vio_from_pool() when loading a slab journal.
2464  */
2465 static void read_slab_journal_tail(struct vdo_waiter *waiter, void *context)
2466 {
2467 	struct slab_journal *journal =
2468 		container_of(waiter, struct slab_journal, resource_waiter);
2469 	struct vdo_slab *slab = journal->slab;
2470 	struct pooled_vio *pooled = context;
2471 	struct vio *vio = &pooled->vio;
2472 	tail_block_offset_t last_commit_point =
2473 		slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
2474 
2475 	/*
2476 	 * Slab summary keeps the commit point offset, so the tail block is the block before that.
2477 	 * Calculation supports small journals in unit tests.
2478 	 */
2479 	tail_block_offset_t tail_block = ((last_commit_point == 0) ?
2480 					  (tail_block_offset_t)(journal->size - 1) :
2481 					  (last_commit_point - 1));
2482 
2483 	vio->completion.parent = journal;
2484 	vio->completion.callback_thread_id = slab->allocator->thread_id;
2485 	vdo_submit_metadata_vio(vio, slab->journal_origin + tail_block,
2486 				read_slab_journal_tail_endio, handle_load_error,
2487 				REQ_OP_READ);
2488 }
2489 
2490 /**
2491  * load_slab_journal() - Load a slab's journal by reading the journal's tail.
2492  */
2493 static void load_slab_journal(struct vdo_slab *slab)
2494 {
2495 	struct slab_journal *journal = &slab->journal;
2496 	tail_block_offset_t last_commit_point;
2497 
2498 	last_commit_point = slab->allocator->summary_entries[slab->slab_number].tail_block_offset;
2499 	if ((last_commit_point == 0) &&
2500 	    !slab->allocator->summary_entries[slab->slab_number].load_ref_counts) {
2501 		/*
2502 		 * This slab claims that it has a tail block at (journal->size - 1), but a head of
2503 		 * 1. This is impossible, due to the scrubbing threshold, on a real system, so
2504 		 * don't bother reading the (bogus) data off disk.
2505 		 */
2506 		VDO_ASSERT_LOG_ONLY(((journal->size < 16) ||
2507 				     (journal->scrubbing_threshold < (journal->size - 1))),
2508 				    "Scrubbing threshold protects against reads of unwritten slab journal blocks");
2509 		vdo_finish_loading_with_result(&slab->state,
2510 					       allocate_counters_if_clean(slab));
2511 		return;
2512 	}
2513 
2514 	journal->resource_waiter.callback = read_slab_journal_tail;
2515 	acquire_vio_from_pool(slab->allocator->vio_pool, &journal->resource_waiter);
2516 }
2517 
2518 static void register_slab_for_scrubbing(struct vdo_slab *slab, bool high_priority)
2519 {
2520 	struct slab_scrubber *scrubber = &slab->allocator->scrubber;
2521 
2522 	VDO_ASSERT_LOG_ONLY((slab->status != VDO_SLAB_REBUILT),
2523 			    "slab to be scrubbed is unrecovered");
2524 
2525 	if (slab->status != VDO_SLAB_REQUIRES_SCRUBBING)
2526 		return;
2527 
2528 	list_del_init(&slab->allocq_entry);
2529 	if (!slab->was_queued_for_scrubbing) {
2530 		WRITE_ONCE(scrubber->slab_count, scrubber->slab_count + 1);
2531 		slab->was_queued_for_scrubbing = true;
2532 	}
2533 
2534 	if (high_priority) {
2535 		slab->status = VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING;
2536 		list_add_tail(&slab->allocq_entry, &scrubber->high_priority_slabs);
2537 		return;
2538 	}
2539 
2540 	list_add_tail(&slab->allocq_entry, &scrubber->slabs);
2541 }
2542 
2543 /* Queue a slab for allocation or scrubbing. */
2544 static void queue_slab(struct vdo_slab *slab)
2545 {
2546 	struct block_allocator *allocator = slab->allocator;
2547 	block_count_t free_blocks;
2548 	int result;
2549 
2550 	VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry),
2551 			"a requeued slab must not already be on a ring");
2552 
2553 	if (vdo_is_read_only(allocator->depot->vdo))
2554 		return;
2555 
2556 	free_blocks = slab->free_blocks;
2557 	result = VDO_ASSERT((free_blocks <= allocator->depot->slab_config.data_blocks),
2558 			    "rebuilt slab %u must have a valid free block count (has %llu, expected maximum %llu)",
2559 			    slab->slab_number, (unsigned long long) free_blocks,
2560 			    (unsigned long long) allocator->depot->slab_config.data_blocks);
2561 	if (result != VDO_SUCCESS) {
2562 		vdo_enter_read_only_mode(allocator->depot->vdo, result);
2563 		return;
2564 	}
2565 
2566 	if (slab->status != VDO_SLAB_REBUILT) {
2567 		register_slab_for_scrubbing(slab, false);
2568 		return;
2569 	}
2570 
2571 	if (!vdo_is_state_resuming(&slab->state)) {
2572 		/*
2573 		 * If the slab is resuming, we've already accounted for it here, so don't do it
2574 		 * again.
2575 		 * FIXME: under what situation would the slab be resuming here?
2576 		 */
2577 		WRITE_ONCE(allocator->allocated_blocks,
2578 			   allocator->allocated_blocks - free_blocks);
2579 		if (!is_slab_journal_blank(slab)) {
2580 			WRITE_ONCE(allocator->statistics.slabs_opened,
2581 				   allocator->statistics.slabs_opened + 1);
2582 		}
2583 	}
2584 
2585 	if (allocator->depot->vdo->suspend_type == VDO_ADMIN_STATE_SAVING)
2586 		reopen_slab_journal(slab);
2587 
2588 	prioritize_slab(slab);
2589 }
2590 
2591 /**
2592  * initiate_slab_action() - Initiate a slab action.
2593  *
2594  * Implements vdo_admin_initiator_fn.
2595  */
2596 static void initiate_slab_action(struct admin_state *state)
2597 {
2598 	struct vdo_slab *slab = container_of(state, struct vdo_slab, state);
2599 
2600 	if (vdo_is_state_draining(state)) {
2601 		const struct admin_state_code *operation = vdo_get_admin_state_code(state);
2602 
2603 		if (operation == VDO_ADMIN_STATE_SCRUBBING)
2604 			slab->status = VDO_SLAB_REBUILDING;
2605 
2606 		drain_slab(slab);
2607 		check_if_slab_drained(slab);
2608 		return;
2609 	}
2610 
2611 	if (vdo_is_state_loading(state)) {
2612 		load_slab_journal(slab);
2613 		return;
2614 	}
2615 
2616 	if (vdo_is_state_resuming(state)) {
2617 		queue_slab(slab);
2618 		vdo_finish_resuming(state);
2619 		return;
2620 	}
2621 
2622 	vdo_finish_operation(state, VDO_INVALID_ADMIN_STATE);
2623 }
2624 
2625 /**
2626  * get_next_slab() - Get the next slab to scrub.
2627  * @scrubber: The slab scrubber.
2628  *
2629  * Return: The next slab to scrub or NULL if there are none.
2630  */
2631 static struct vdo_slab *get_next_slab(struct slab_scrubber *scrubber)
2632 {
2633 	struct vdo_slab *slab;
2634 
2635 	slab = list_first_entry_or_null(&scrubber->high_priority_slabs,
2636 					struct vdo_slab, allocq_entry);
2637 	if (slab != NULL)
2638 		return slab;
2639 
2640 	return list_first_entry_or_null(&scrubber->slabs, struct vdo_slab,
2641 					allocq_entry);
2642 }
2643 
2644 /**
2645  * has_slabs_to_scrub() - Check whether a scrubber has slabs to scrub.
2646  * @scrubber: The scrubber to check.
2647  *
2648  * Return: true if the scrubber has slabs to scrub.
2649  */
2650 static inline bool __must_check has_slabs_to_scrub(struct slab_scrubber *scrubber)
2651 {
2652 	return (get_next_slab(scrubber) != NULL);
2653 }
2654 
2655 /**
2656  * uninitialize_scrubber_vio() - Clean up the slab_scrubber's vio.
2657  * @scrubber: The scrubber.
2658  */
2659 static void uninitialize_scrubber_vio(struct slab_scrubber *scrubber)
2660 {
2661 	vdo_free(vdo_forget(scrubber->vio.data));
2662 	free_vio_components(&scrubber->vio);
2663 }
2664 
2665 /**
2666  * finish_scrubbing() - Stop scrubbing, either because there are no more slabs to scrub or because
2667  *                      there's been an error.
2668  * @scrubber: The scrubber.
2669  */
2670 static void finish_scrubbing(struct slab_scrubber *scrubber, int result)
2671 {
2672 	bool notify = vdo_waitq_has_waiters(&scrubber->waiters);
2673 	bool done = !has_slabs_to_scrub(scrubber);
2674 	struct block_allocator *allocator =
2675 		container_of(scrubber, struct block_allocator, scrubber);
2676 
2677 	if (done)
2678 		uninitialize_scrubber_vio(scrubber);
2679 
2680 	if (scrubber->high_priority_only) {
2681 		scrubber->high_priority_only = false;
2682 		vdo_fail_completion(vdo_forget(scrubber->vio.completion.parent), result);
2683 	} else if (done && (atomic_add_return(-1, &allocator->depot->zones_to_scrub) == 0)) {
2684 		/* All of our slabs were scrubbed, and we're the last allocator to finish. */
2685 		enum vdo_state prior_state =
2686 			atomic_cmpxchg(&allocator->depot->vdo->state, VDO_RECOVERING,
2687 				       VDO_DIRTY);
2688 
2689 		/*
2690 		 * To be safe, even if the CAS failed, ensure anything that follows is ordered with
2691 		 * respect to whatever state change did happen.
2692 		 */
2693 		smp_mb__after_atomic();
2694 
2695 		/*
2696 		 * We must check the VDO state here and not the depot's read_only_notifier since
2697 		 * the compare-swap-above could have failed due to a read-only entry which our own
2698 		 * thread does not yet know about.
2699 		 */
2700 		if (prior_state == VDO_DIRTY)
2701 			vdo_log_info("VDO commencing normal operation");
2702 		else if (prior_state == VDO_RECOVERING)
2703 			vdo_log_info("Exiting recovery mode");
2704 	}
2705 
2706 	/*
2707 	 * Note that the scrubber has stopped, and inform anyone who might be waiting for that to
2708 	 * happen.
2709 	 */
2710 	if (!vdo_finish_draining(&scrubber->admin_state))
2711 		WRITE_ONCE(scrubber->admin_state.current_state,
2712 			   VDO_ADMIN_STATE_SUSPENDED);
2713 
2714 	/*
2715 	 * We can't notify waiters until after we've finished draining or they'll just requeue.
2716 	 * Fortunately if there were waiters, we can't have been freed yet.
2717 	 */
2718 	if (notify)
2719 		vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
2720 }
2721 
2722 static void scrub_next_slab(struct slab_scrubber *scrubber);
2723 
2724 /**
2725  * slab_scrubbed() - Notify the scrubber that a slab has been scrubbed.
2726  * @completion: The slab rebuild completion.
2727  *
2728  * This callback is registered in apply_journal_entries().
2729  */
2730 static void slab_scrubbed(struct vdo_completion *completion)
2731 {
2732 	struct slab_scrubber *scrubber =
2733 		container_of(as_vio(completion), struct slab_scrubber, vio);
2734 	struct vdo_slab *slab = scrubber->slab;
2735 
2736 	slab->status = VDO_SLAB_REBUILT;
2737 	queue_slab(slab);
2738 	reopen_slab_journal(slab);
2739 	WRITE_ONCE(scrubber->slab_count, scrubber->slab_count - 1);
2740 	scrub_next_slab(scrubber);
2741 }
2742 
2743 /**
2744  * abort_scrubbing() - Abort scrubbing due to an error.
2745  * @scrubber: The slab scrubber.
2746  * @result: The error.
2747  */
2748 static void abort_scrubbing(struct slab_scrubber *scrubber, int result)
2749 {
2750 	vdo_enter_read_only_mode(scrubber->vio.completion.vdo, result);
2751 	finish_scrubbing(scrubber, result);
2752 }
2753 
2754 /**
2755  * handle_scrubber_error() - Handle errors while rebuilding a slab.
2756  * @completion: The slab rebuild completion.
2757  */
2758 static void handle_scrubber_error(struct vdo_completion *completion)
2759 {
2760 	struct vio *vio = as_vio(completion);
2761 
2762 	vio_record_metadata_io_error(vio);
2763 	abort_scrubbing(container_of(vio, struct slab_scrubber, vio),
2764 			completion->result);
2765 }
2766 
2767 /**
2768  * apply_block_entries() - Apply all the entries in a block to the reference counts.
2769  * @block: A block with entries to apply.
2770  * @entry_count: The number of entries to apply.
2771  * @block_number: The sequence number of the block.
2772  * @slab: The slab to apply the entries to.
2773  *
2774  * Return: VDO_SUCCESS or an error code.
2775  */
2776 static int apply_block_entries(struct packed_slab_journal_block *block,
2777 			       journal_entry_count_t entry_count,
2778 			       sequence_number_t block_number, struct vdo_slab *slab)
2779 {
2780 	struct journal_point entry_point = {
2781 		.sequence_number = block_number,
2782 		.entry_count = 0,
2783 	};
2784 	int result;
2785 	slab_block_number max_sbn = slab->end - slab->start;
2786 
2787 	while (entry_point.entry_count < entry_count) {
2788 		struct slab_journal_entry entry =
2789 			vdo_decode_slab_journal_entry(block, entry_point.entry_count);
2790 
2791 		if (entry.sbn > max_sbn) {
2792 			/* This entry is out of bounds. */
2793 			return vdo_log_error_strerror(VDO_CORRUPT_JOURNAL,
2794 						      "vdo_slab journal entry (%llu, %u) had invalid offset %u in slab (size %u blocks)",
2795 						      (unsigned long long) block_number,
2796 						      entry_point.entry_count,
2797 						      entry.sbn, max_sbn);
2798 		}
2799 
2800 		result = replay_reference_count_change(slab, &entry_point, entry);
2801 		if (result != VDO_SUCCESS) {
2802 			vdo_log_error_strerror(result,
2803 					       "vdo_slab journal entry (%llu, %u) (%s of offset %u) could not be applied in slab %u",
2804 					       (unsigned long long) block_number,
2805 					       entry_point.entry_count,
2806 					       vdo_get_journal_operation_name(entry.operation),
2807 					       entry.sbn, slab->slab_number);
2808 			return result;
2809 		}
2810 		entry_point.entry_count++;
2811 	}
2812 
2813 	return VDO_SUCCESS;
2814 }
2815 
2816 /**
2817  * apply_journal_entries() - Find the relevant vio of the slab journal and apply all valid entries.
2818  * @completion: The metadata read vio completion.
2819  *
2820  * This is a callback registered in start_scrubbing().
2821  */
2822 static void apply_journal_entries(struct vdo_completion *completion)
2823 {
2824 	int result;
2825 	struct slab_scrubber *scrubber =
2826 		container_of(as_vio(completion), struct slab_scrubber, vio);
2827 	struct vdo_slab *slab = scrubber->slab;
2828 	struct slab_journal *journal = &slab->journal;
2829 
2830 	/* Find the boundaries of the useful part of the journal. */
2831 	sequence_number_t tail = journal->tail;
2832 	tail_block_offset_t end_index = (tail - 1) % journal->size;
2833 	char *end_data = scrubber->vio.data + (end_index * VDO_BLOCK_SIZE);
2834 	struct packed_slab_journal_block *end_block =
2835 		(struct packed_slab_journal_block *) end_data;
2836 
2837 	sequence_number_t head = __le64_to_cpu(end_block->header.head);
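	/* The head recorded in the tail block's header is where replay must begin. */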
2838 	tail_block_offset_t head_index = head % journal->size;
2839 	block_count_t index = head_index;
2840 
2841 	struct journal_point ref_counts_point = slab->slab_journal_point;
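	/* The latest journal point already reflected in any of the loaded reference count sectors. */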
2842 	struct journal_point last_entry_applied = ref_counts_point;
2843 	sequence_number_t sequence;
2844 
2845 	for (sequence = head; sequence < tail; sequence++) {
2846 		char *block_data = scrubber->vio.data + (index * VDO_BLOCK_SIZE);
2847 		struct packed_slab_journal_block *block =
2848 			(struct packed_slab_journal_block *) block_data;
2849 		struct slab_journal_block_header header;
2850 
2851 		vdo_unpack_slab_journal_block_header(&block->header, &header);
2852 
2853 		if ((header.nonce != slab->allocator->nonce) ||
2854 		    (header.metadata_type != VDO_METADATA_SLAB_JOURNAL) ||
2855 		    (header.sequence_number != sequence) ||
2856 		    (header.entry_count > journal->entries_per_block) ||
2857 		    (header.has_block_map_increments &&
2858 		     (header.entry_count > journal->full_entries_per_block))) {
2859 			/* The block is not what we expect it to be. */
2860 			vdo_log_error("vdo_slab journal block for slab %u was invalid",
2861 				      slab->slab_number);
2862 			abort_scrubbing(scrubber, VDO_CORRUPT_JOURNAL);
2863 			return;
2864 		}
2865 
2866 		result = apply_block_entries(block, header.entry_count, sequence, slab);
2867 		if (result != VDO_SUCCESS) {
2868 			abort_scrubbing(scrubber, result);
2869 			return;
2870 		}
2871 
2872 		last_entry_applied.sequence_number = sequence;
2873 		last_entry_applied.entry_count = header.entry_count - 1;
2874 		index++;
2875 		if (index == journal->size)
2876 			index = 0;
2877 	}
2878 
2879 	/*
2880 	 * At the end of rebuild, the reference counters should be accurate to the end of the
2881 	 * journal we just applied.
2882 	 */
2883 	result = VDO_ASSERT(!vdo_before_journal_point(&last_entry_applied,
2884 						      &ref_counts_point),
2885 			    "Refcounts are not more accurate than the slab journal");
2886 	if (result != VDO_SUCCESS) {
2887 		abort_scrubbing(scrubber, result);
2888 		return;
2889 	}
2890 
2891 	/* Save out the rebuilt reference blocks. */
2892 	vdo_prepare_completion(completion, slab_scrubbed, handle_scrubber_error,
2893 			       slab->allocator->thread_id, completion->parent);
2894 	vdo_start_operation_with_waiter(&slab->state,
2895 					VDO_ADMIN_STATE_SAVE_FOR_SCRUBBING,
2896 					completion, initiate_slab_action);
2897 }
2898 
2899 static void read_slab_journal_endio(struct bio *bio)
2900 {
2901 	struct vio *vio = bio->bi_private;
2902 	struct slab_scrubber *scrubber = container_of(vio, struct slab_scrubber, vio);
2903 
2904 	continue_vio_after_io(vio, apply_journal_entries,
2905 			      scrubber->slab->allocator->thread_id);
2906 }
2907 
2908 /**
2909  * start_scrubbing() - Read the current slab's journal from disk now that it has been flushed.
2910  * @completion: The scrubber's vio completion.
2911  *
2912  * This callback is registered in scrub_next_slab().
2913  */
2914 static void start_scrubbing(struct vdo_completion *completion)
2915 {
2916 	struct slab_scrubber *scrubber =
2917 		container_of(as_vio(completion), struct slab_scrubber, vio);
2918 	struct vdo_slab *slab = scrubber->slab;
2919 
2920 	if (!slab->allocator->summary_entries[slab->slab_number].is_dirty) {
2921 		slab_scrubbed(completion);
2922 		return;
2923 	}
2924 
2925 	vdo_submit_metadata_vio(&scrubber->vio, slab->journal_origin,
2926 				read_slab_journal_endio, handle_scrubber_error,
2927 				REQ_OP_READ);
2928 }
2929 
2930 /**
2931  * scrub_next_slab() - Scrub the next slab if there is one.
2932  * @scrubber: The scrubber.
2933  */
2934 static void scrub_next_slab(struct slab_scrubber *scrubber)
2935 {
2936 	struct vdo_completion *completion = &scrubber->vio.completion;
2937 	struct vdo_slab *slab;
2938 
2939 	/*
2940 	 * Note: this notify call is only safe because scrubbing can only be started when
2941 	 * the VDO is quiescent.
2942 	 */
2943 	vdo_waitq_notify_all_waiters(&scrubber->waiters, NULL, NULL);
2944 
2945 	if (vdo_is_read_only(completion->vdo)) {
2946 		finish_scrubbing(scrubber, VDO_READ_ONLY);
2947 		return;
2948 	}
2949 
2950 	slab = get_next_slab(scrubber);
2951 	if ((slab == NULL) ||
2952 	    (scrubber->high_priority_only && list_empty(&scrubber->high_priority_slabs))) {
2953 		finish_scrubbing(scrubber, VDO_SUCCESS);
2954 		return;
2955 	}
2956 
2957 	if (vdo_finish_draining(&scrubber->admin_state))
2958 		return;
2959 
2960 	list_del_init(&slab->allocq_entry);
2961 	scrubber->slab = slab;
2962 	vdo_prepare_completion(completion, start_scrubbing, handle_scrubber_error,
2963 			       slab->allocator->thread_id, completion->parent);
2964 	vdo_start_operation_with_waiter(&slab->state, VDO_ADMIN_STATE_SCRUBBING,
2965 					completion, initiate_slab_action);
2966 }
2967 
2968 /**
2969  * scrub_slabs() - Scrub all of an allocator's slabs that are eligible for scrubbing.
2970  * @allocator: The block_allocator to scrub.
2971  * @parent: The completion to notify when scrubbing is done, implies high_priority, may be NULL.
2972  */
2973 static void scrub_slabs(struct block_allocator *allocator, struct vdo_completion *parent)
2974 {
2975 	struct slab_scrubber *scrubber = &allocator->scrubber;
2976 
2977 	scrubber->vio.completion.parent = parent;
2978 	scrubber->high_priority_only = (parent != NULL);
2979 	if (!has_slabs_to_scrub(scrubber)) {
2980 		finish_scrubbing(scrubber, VDO_SUCCESS);
2981 		return;
2982 	}
2983 
2984 	if (scrubber->high_priority_only &&
2985 	    vdo_is_priority_table_empty(allocator->prioritized_slabs) &&
2986 	    list_empty(&scrubber->high_priority_slabs))
2987 		register_slab_for_scrubbing(get_next_slab(scrubber), true);
2988 
2989 	vdo_resume_if_quiescent(&scrubber->admin_state);
2990 	scrub_next_slab(scrubber);
2991 }
2992 
2993 static inline void assert_on_allocator_thread(thread_id_t thread_id,
2994 					      const char *function_name)
2995 {
2996 	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == thread_id),
2997 			    "%s called on correct thread", function_name);
2998 }
2999 
3000 static void register_slab_with_allocator(struct block_allocator *allocator,
3001 					 struct vdo_slab *slab)
3002 {
3003 	allocator->slab_count++;
3004 	allocator->last_slab = slab->slab_number;
3005 }
3006 
3007 /**
3008  * get_depot_slab_iterator() - Return a slab_iterator over the slabs in a slab_depot.
3009  * @depot: The depot over which to iterate.
3010  * @start: The number of the slab to start iterating from.
3011  * @end: The number of the last slab which may be returned.
3012  * @stride: The difference in slab number between successive slabs.
3013  *
3014  * Iteration always occurs from higher to lower numbered slabs.
3015  *
3016  * Return: An initialized iterator structure.
3017  */
3018 static struct slab_iterator get_depot_slab_iterator(struct slab_depot *depot,
3019 						    slab_count_t start, slab_count_t end,
3020 						    slab_count_t stride)
3021 {
3022 	struct vdo_slab **slabs = depot->slabs;
3023 
3024 	return (struct slab_iterator) {
3025 		.slabs = slabs,
3026 		.next = (((slabs == NULL) || (start < end)) ? NULL : slabs[start]),
3027 		.end = end,
3028 		.stride = stride,
3029 	};
3030 }
3031 
3032 static struct slab_iterator get_slab_iterator(const struct block_allocator *allocator)
3033 {
3034 	return get_depot_slab_iterator(allocator->depot, allocator->last_slab,
3035 				       allocator->zone_number,
3036 				       allocator->depot->zone_count);
3037 }
3038 
3039 /**
3040  * next_slab() - Get the next slab from a slab_iterator and advance the iterator.
3041  * @iterator: The slab_iterator.
3042  *
3043  * Return: The next slab or NULL if the iterator is exhausted.
3044  */
3045 static struct vdo_slab *next_slab(struct slab_iterator *iterator)
3046 {
3047 	struct vdo_slab *slab = iterator->next;
3048 
3049 	if ((slab == NULL) || (slab->slab_number < iterator->end + iterator->stride))
3050 		iterator->next = NULL;
3051 	else
3052 		iterator->next = iterator->slabs[slab->slab_number - iterator->stride];
3053 
3054 	return slab;
3055 }
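
/*
 * Illustrative sketch (hypothetical numbers, not part of the driver): the iterator is typically
 * consumed as
 *
 *	struct slab_iterator iterator = get_slab_iterator(allocator);
 *
 *	while (iterator.next != NULL) {
 *		struct vdo_slab *slab = next_slab(&iterator);
 *
 *		...
 *	}
 *
 * With a depot of 6 slabs and zone_count == 2, the allocator for zone 0 owns slabs 0, 2, and 4,
 * its last_slab is 4, and the loop above visits slabs 4, 2, and 0 in that order (stride 2,
 * counting down toward the zone number).
 */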
3056 
3057 /**
3058  * abort_waiter() - Abort vios waiting to make journal entries when read-only.
3059  *
3060  * This callback is invoked on all vios waiting to make slab journal entries after the VDO has gone
3061  * into read-only mode. Implements waiter_callback_fn.
3062  */
3063 static void abort_waiter(struct vdo_waiter *waiter, void *context __always_unused)
3064 {
3065 	struct reference_updater *updater =
3066 		container_of(waiter, struct reference_updater, waiter);
3067 	struct data_vio *data_vio = data_vio_from_reference_updater(updater);
3068 
3069 	if (updater->increment) {
3070 		continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
3071 		return;
3072 	}
3073 
3074 	vdo_continue_completion(&data_vio->decrement_completion, VDO_READ_ONLY);
3075 }
3076 
3077 /* Implements vdo_read_only_notification_fn. */
3078 static void notify_block_allocator_of_read_only_mode(void *listener,
3079 						     struct vdo_completion *parent)
3080 {
3081 	struct block_allocator *allocator = listener;
3082 	struct slab_iterator iterator;
3083 
3084 	assert_on_allocator_thread(allocator->thread_id, __func__);
3085 	iterator = get_slab_iterator(allocator);
3086 	while (iterator.next != NULL) {
3087 		struct vdo_slab *slab = next_slab(&iterator);
3088 
3089 		vdo_waitq_notify_all_waiters(&slab->journal.entry_waiters,
3090 					     abort_waiter, &slab->journal);
3091 		check_if_slab_drained(slab);
3092 	}
3093 
3094 	vdo_finish_completion(parent);
3095 }
3096 
3097 /**
3098  * vdo_acquire_provisional_reference() - Acquire a provisional reference on behalf of a PBN lock if
3099  *                                       the block it locks is unreferenced.
3100  * @slab: The slab which contains the block.
3101  * @pbn: The physical block to reference.
3102  * @lock: The lock.
3103  *
3104  * Return: VDO_SUCCESS or an error.
3105  */
3106 int vdo_acquire_provisional_reference(struct vdo_slab *slab, physical_block_number_t pbn,
3107 				      struct pbn_lock *lock)
3108 {
3109 	slab_block_number block_number;
3110 	int result;
3111 
3112 	if (vdo_pbn_lock_has_provisional_reference(lock))
3113 		return VDO_SUCCESS;
3114 
3115 	if (!is_slab_open(slab))
3116 		return VDO_INVALID_ADMIN_STATE;
3117 
3118 	result = slab_block_number_from_pbn(slab, pbn, &block_number);
3119 	if (result != VDO_SUCCESS)
3120 		return result;
3121 
3122 	if (slab->counters[block_number] == EMPTY_REFERENCE_COUNT) {
3123 		make_provisional_reference(slab, block_number);
3124 		if (lock != NULL)
3125 			vdo_assign_pbn_lock_provisional_reference(lock);
3126 	}
3127 
3128 	if (vdo_pbn_lock_has_provisional_reference(lock))
3129 		adjust_free_block_count(slab, false);
3130 
3131 	return VDO_SUCCESS;
3132 }
3133 
3134 static int __must_check allocate_slab_block(struct vdo_slab *slab,
3135 					    physical_block_number_t *block_number_ptr)
3136 {
3137 	slab_block_number free_index;
3138 
3139 	if (!is_slab_open(slab))
3140 		return VDO_INVALID_ADMIN_STATE;
3141 
3142 	if (!search_reference_blocks(slab, &free_index))
3143 		return VDO_NO_SPACE;
3144 
3145 	VDO_ASSERT_LOG_ONLY((slab->counters[free_index] == EMPTY_REFERENCE_COUNT),
3146 			    "free block must have ref count of zero");
3147 	make_provisional_reference(slab, free_index);
3148 	adjust_free_block_count(slab, false);
3149 
3150 	/*
3151 	 * Update the search hint so the next search will start at the array index just past the
3152 	 * free block we just found.
3153 	 */
3154 	slab->search_cursor.index = (free_index + 1);
3155 
3156 	*block_number_ptr = slab->start + free_index;
3157 	return VDO_SUCCESS;
3158 }
3159 
3160 /**
3161  * open_slab() - Prepare a slab to be allocated from.
3162  * @slab: The slab.
3163  */
3164 static void open_slab(struct vdo_slab *slab)
3165 {
3166 	reset_search_cursor(slab);
3167 	if (is_slab_journal_blank(slab)) {
3168 		WRITE_ONCE(slab->allocator->statistics.slabs_opened,
3169 			   slab->allocator->statistics.slabs_opened + 1);
3170 		dirty_all_reference_blocks(slab);
3171 	} else {
3172 		WRITE_ONCE(slab->allocator->statistics.slabs_reopened,
3173 			   slab->allocator->statistics.slabs_reopened + 1);
3174 	}
3175 
3176 	slab->allocator->open_slab = slab;
3177 }
3178 
3179 
3180 /*
3181  * The block allocated will have a provisional reference and the reference must be either confirmed
3182  * with a subsequent increment or vacated with a subsequent decrement via
3183  * vdo_release_block_reference().
3184  */
3185 int vdo_allocate_block(struct block_allocator *allocator,
3186 		       physical_block_number_t *block_number_ptr)
3187 {
3188 	int result;
3189 
3190 	if (allocator->open_slab != NULL) {
3191 		/* Try to allocate the next block in the currently open slab. */
3192 		result = allocate_slab_block(allocator->open_slab, block_number_ptr);
3193 		if ((result == VDO_SUCCESS) || (result != VDO_NO_SPACE))
3194 			return result;
3195 
3196 		/* Put the exhausted open slab back into the priority table. */
3197 		prioritize_slab(allocator->open_slab);
3198 	}
3199 
3200 	/* Remove the highest priority slab from the priority table and make it the open slab. */
3201 	open_slab(list_entry(vdo_priority_table_dequeue(allocator->prioritized_slabs),
3202 			     struct vdo_slab, allocq_entry));
3203 
3204 	/*
3205 	 * Try allocating again. If we're out of space immediately after opening a slab, then every
3206 	 * slab must be fully allocated.
3207 	 */
3208 	return allocate_slab_block(allocator->open_slab, block_number_ptr);
3209 }
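
/*
 * A hedged sketch of the expected calling pattern (illustrative only; the surrounding error
 * handling is a placeholder, not taken from any real caller):
 *
 *	physical_block_number_t pbn;
 *	int result = vdo_allocate_block(allocator, &pbn);
 *
 *	if (result != VDO_SUCCESS)
 *		return result;
 *
 * At this point the block holds a provisional reference. The caller must either confirm it with
 * a subsequent increment (as vdo_modify_reference_count() does on behalf of a data_vio) or
 * vacate it with vdo_release_block_reference(allocator, pbn).
 */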
3210 
3211 /**
3212  * vdo_enqueue_clean_slab_waiter() - Wait for a clean slab.
3213  * @allocator: The block_allocator on which to wait.
3214  * @waiter: The waiter.
3215  *
3216  * Return: VDO_SUCCESS if the waiter was queued, VDO_NO_SPACE if there are no slabs to scrub, or
3217  *         some other error otherwise.
3218  */
3219 int vdo_enqueue_clean_slab_waiter(struct block_allocator *allocator,
3220 				  struct vdo_waiter *waiter)
3221 {
3222 	if (vdo_is_read_only(allocator->depot->vdo))
3223 		return VDO_READ_ONLY;
3224 
3225 	if (vdo_is_state_quiescent(&allocator->scrubber.admin_state))
3226 		return VDO_NO_SPACE;
3227 
3228 	vdo_waitq_enqueue_waiter(&allocator->scrubber.waiters, waiter);
3229 	return VDO_SUCCESS;
3230 }
3231 
3232 /**
3233  * vdo_modify_reference_count() - Modify the reference count of a block by first making a slab
3234  *                                journal entry and then updating the reference counter.
3235  *
3236  * @completion: The completion of the data_vio for which to add the entry.
3237  * @updater: Which of the data_vio's reference updaters is being submitted.
3238  */
3239 void vdo_modify_reference_count(struct vdo_completion *completion,
3240 				struct reference_updater *updater)
3241 {
3242 	struct vdo_slab *slab = vdo_get_slab(completion->vdo->depot, updater->zpbn.pbn);
3243 
3244 	if (!is_slab_open(slab)) {
3245 		vdo_continue_completion(completion, VDO_INVALID_ADMIN_STATE);
3246 		return;
3247 	}
3248 
3249 	if (vdo_is_read_only(completion->vdo)) {
3250 		vdo_continue_completion(completion, VDO_READ_ONLY);
3251 		return;
3252 	}
3253 
3254 	vdo_waitq_enqueue_waiter(&slab->journal.entry_waiters, &updater->waiter);
3255 	if ((slab->status != VDO_SLAB_REBUILT) && requires_reaping(&slab->journal))
3256 		register_slab_for_scrubbing(slab, true);
3257 
3258 	add_entries(&slab->journal);
3259 }
3260 
3261 /* Release an unused provisional reference. */
3262 int vdo_release_block_reference(struct block_allocator *allocator,
3263 				physical_block_number_t pbn)
3264 {
3265 	struct reference_updater updater;
3266 
3267 	if (pbn == VDO_ZERO_BLOCK)
3268 		return VDO_SUCCESS;
3269 
3270 	updater = (struct reference_updater) {
3271 		.operation = VDO_JOURNAL_DATA_REMAPPING,
3272 		.increment = false,
3273 		.zpbn = {
3274 			.pbn = pbn,
3275 		},
3276 	};
3277 
3278 	return adjust_reference_count(vdo_get_slab(allocator->depot, pbn),
3279 				      &updater, NULL);
3280 }
3281 
3282 /*
3283  * This is a min_heap callback function that orders slab_status structures using the 'is_clean' field as
3284  * the primary key and the 'emptiness' field as the secondary key.
3285  *
3286  * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping
3287  * should always get the most empty first, so pushing should be from most empty to least empty.
3288  * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements
3289  * before larger ones.
3290  */
3291 static bool slab_status_is_less_than(const void *item1, const void *item2,
3292 					void __always_unused *args)
3293 {
3294 	const struct slab_status *info1 = item1;
3295 	const struct slab_status *info2 = item2;
3296 
3297 	if (info1->is_clean != info2->is_clean)
3298 		return info1->is_clean;
3299 	if (info1->emptiness != info2->emptiness)
3300 		return info1->emptiness > info2->emptiness;
3301 	return info1->slab_number < info2->slab_number;
3302 }
3303 
3304 static const struct min_heap_callbacks slab_status_min_heap = {
3305 	.less = slab_status_is_less_than,
3306 	.swp = NULL,
3307 };
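
/*
 * A worked example of the ordering above, with hypothetical values:
 *
 *	a = { .is_clean = true,  .emptiness = 10, .slab_number = 7 }
 *	b = { .is_clean = false, .emptiness = 50, .slab_number = 3 }
 *	c = { .is_clean = true,  .emptiness = 40, .slab_number = 2 }
 *
 * c compares less than a (both clean, but c is emptier) and a compares less than b (clean beats
 * dirty), so the heap pops c, then a, then b: the cleanest, emptiest slabs come off first.
 */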
3308 
3309 /* Inform the slab actor that an action has finished on some slab; used by apply_to_slabs(). */
3310 static void slab_action_callback(struct vdo_completion *completion)
3311 {
3312 	struct block_allocator *allocator = vdo_as_block_allocator(completion);
3313 	struct slab_actor *actor = &allocator->slab_actor;
3314 
3315 	if (--actor->slab_action_count == 0) {
3316 		actor->callback(completion);
3317 		return;
3318 	}
3319 
3320 	vdo_reset_completion(completion);
3321 }
3322 
3323 /* Preserve the error from part of an action and continue. */
3324 static void handle_operation_error(struct vdo_completion *completion)
3325 {
3326 	struct block_allocator *allocator = vdo_as_block_allocator(completion);
3327 
3328 	if (allocator->state.waiter != NULL)
3329 		vdo_set_completion_result(allocator->state.waiter, completion->result);
3330 	completion->callback(completion);
3331 }
3332 
3333 /* Perform an action on each of an allocator's slabs in parallel. */
3334 static void apply_to_slabs(struct block_allocator *allocator, vdo_action_fn callback)
3335 {
3336 	struct slab_iterator iterator;
3337 
3338 	vdo_prepare_completion(&allocator->completion, slab_action_callback,
3339 			       handle_operation_error, allocator->thread_id, NULL);
3340 	allocator->completion.requeue = false;
3341 
3342 	/*
3343 	 * Since we are going to dequeue all of the slabs, the open slab will become invalid, so
3344 	 * clear it.
3345 	 */
3346 	allocator->open_slab = NULL;
3347 
3348 	/* Ensure that we don't finish before we're done starting. */
3349 	allocator->slab_actor = (struct slab_actor) {
3350 		.slab_action_count = 1,
3351 		.callback = callback,
3352 	};
3353 
3354 	iterator = get_slab_iterator(allocator);
3355 	while (iterator.next != NULL) {
3356 		const struct admin_state_code *operation =
3357 			vdo_get_admin_state_code(&allocator->state);
3358 		struct vdo_slab *slab = next_slab(&iterator);
3359 
3360 		list_del_init(&slab->allocq_entry);
3361 		allocator->slab_actor.slab_action_count++;
3362 		vdo_start_operation_with_waiter(&slab->state, operation,
3363 						&allocator->completion,
3364 						initiate_slab_action);
3365 	}
3366 
3367 	slab_action_callback(&allocator->completion);
3368 }
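
/*
 * The initial slab_action_count of 1 acts as a guard reference. With three slabs, for example,
 * the count climbs from 1 to 4 as the loop launches each operation, the explicit
 * slab_action_callback() call after the loop removes the guard, and each per-slab completion
 * removes one more, so the final callback fires exactly once, only after every slab has been
 * started, even if some operations complete synchronously during the loop.
 */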
3369 
3370 static void finish_loading_allocator(struct vdo_completion *completion)
3371 {
3372 	struct block_allocator *allocator = vdo_as_block_allocator(completion);
3373 	const struct admin_state_code *operation =
3374 		vdo_get_admin_state_code(&allocator->state);
3375 
3376 	if (allocator->eraser != NULL)
3377 		dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));
3378 
3379 	if (operation == VDO_ADMIN_STATE_LOADING_FOR_RECOVERY) {
3380 		void *context =
3381 			vdo_get_current_action_context(allocator->depot->action_manager);
3382 
3383 		vdo_replay_into_slab_journals(allocator, context);
3384 		return;
3385 	}
3386 
3387 	vdo_finish_loading(&allocator->state);
3388 }
3389 
3390 static void erase_next_slab_journal(struct block_allocator *allocator);
3391 
3392 static void copy_callback(int read_err, unsigned long write_err, void *context)
3393 {
3394 	struct block_allocator *allocator = context;
3395 	int result = (((read_err == 0) && (write_err == 0)) ? VDO_SUCCESS : -EIO);
3396 
3397 	if (result != VDO_SUCCESS) {
3398 		vdo_fail_completion(&allocator->completion, result);
3399 		return;
3400 	}
3401 
3402 	erase_next_slab_journal(allocator);
3403 }
3404 
3405 /* erase_next_slab_journal() - Erase the next slab journal. */
3406 static void erase_next_slab_journal(struct block_allocator *allocator)
3407 {
3408 	struct vdo_slab *slab;
3409 	physical_block_number_t pbn;
3410 	struct dm_io_region regions[1];
3411 	struct slab_depot *depot = allocator->depot;
3412 	block_count_t blocks = depot->slab_config.slab_journal_blocks;
3413 
3414 	if (allocator->slabs_to_erase.next == NULL) {
3415 		vdo_finish_completion(&allocator->completion);
3416 		return;
3417 	}
3418 
3419 	slab = next_slab(&allocator->slabs_to_erase);
3420 	pbn = slab->journal_origin - depot->vdo->geometry.bio_offset;
3421 	regions[0] = (struct dm_io_region) {
3422 		.bdev = vdo_get_backing_device(depot->vdo),
3423 		.sector = pbn * VDO_SECTORS_PER_BLOCK,
3424 		.count = blocks * VDO_SECTORS_PER_BLOCK,
3425 	};
3426 	dm_kcopyd_zero(allocator->eraser, 1, regions, 0, copy_callback, allocator);
3427 }
3428 
3429 /* Implements vdo_admin_initiator_fn. */
3430 static void initiate_load(struct admin_state *state)
3431 {
3432 	struct block_allocator *allocator =
3433 		container_of(state, struct block_allocator, state);
3434 	const struct admin_state_code *operation = vdo_get_admin_state_code(state);
3435 
3436 	if (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD) {
3437 		/*
3438 		 * Must requeue because the kcopyd client cannot be freed in the same stack frame
3439 		 * as the kcopyd callback, lest it deadlock.
3440 		 */
3441 		vdo_prepare_completion_for_requeue(&allocator->completion,
3442 						   finish_loading_allocator,
3443 						   handle_operation_error,
3444 						   allocator->thread_id, NULL);
3445 		allocator->eraser = dm_kcopyd_client_create(NULL);
3446 		if (IS_ERR(allocator->eraser)) {
3447 			vdo_fail_completion(&allocator->completion,
3448 					    PTR_ERR(allocator->eraser));
3449 			allocator->eraser = NULL;
3450 			return;
3451 		}
3452 		allocator->slabs_to_erase = get_slab_iterator(allocator);
3453 
3454 		erase_next_slab_journal(allocator);
3455 		return;
3456 	}
3457 
3458 	apply_to_slabs(allocator, finish_loading_allocator);
3459 }
3460 
3461 /**
3462  * vdo_notify_slab_journals_are_recovered() - Inform a block allocator that its slab journals have
3463  *                                            been recovered from the recovery journal.
3464  * @completion: The allocator completion.
3465  */
3466 void vdo_notify_slab_journals_are_recovered(struct vdo_completion *completion)
3467 {
3468 	struct block_allocator *allocator = vdo_as_block_allocator(completion);
3469 
3470 	vdo_finish_loading_with_result(&allocator->state, completion->result);
3471 }
3472 
3473 static int get_slab_statuses(struct block_allocator *allocator,
3474 			     struct slab_status **statuses_ptr)
3475 {
3476 	int result;
3477 	struct slab_status *statuses;
3478 	struct slab_iterator iterator = get_slab_iterator(allocator);
3479 
3480 	result = vdo_allocate(allocator->slab_count, struct slab_status, __func__,
3481 			      &statuses);
3482 	if (result != VDO_SUCCESS)
3483 		return result;
3484 
3485 	*statuses_ptr = statuses;
3486 
3487 	while (iterator.next != NULL)  {
3488 		slab_count_t slab_number = next_slab(&iterator)->slab_number;
3489 
3490 		*statuses++ = (struct slab_status) {
3491 			.slab_number = slab_number,
3492 			.is_clean = !allocator->summary_entries[slab_number].is_dirty,
3493 			.emptiness = allocator->summary_entries[slab_number].fullness_hint,
3494 		};
3495 	}
3496 
3497 	return VDO_SUCCESS;
3498 }
3499 
3500 /* Prepare slabs for allocation or scrubbing. */
3501 static int __must_check vdo_prepare_slabs_for_allocation(struct block_allocator *allocator)
3502 {
3503 	struct slab_status current_slab_status;
3504 	DEFINE_MIN_HEAP(struct slab_status, heap) heap;
3505 	int result;
3506 	struct slab_status *slab_statuses;
3507 	struct slab_depot *depot = allocator->depot;
3508 
3509 	WRITE_ONCE(allocator->allocated_blocks,
3510 		   allocator->slab_count * depot->slab_config.data_blocks);
3511 	result = get_slab_statuses(allocator, &slab_statuses);
3512 	if (result != VDO_SUCCESS)
3513 		return result;
3514 
3515 	/* Sort the slabs by cleanliness, then by emptiness hint. */
3516 	heap = (struct heap) {
3517 		.data = slab_statuses,
3518 		.nr = allocator->slab_count,
3519 		.size = allocator->slab_count,
3520 	};
3521 	min_heapify_all(&heap, &slab_status_min_heap, NULL);
3522 
3523 	while (heap.nr > 0) {
3524 		bool high_priority;
3525 		struct vdo_slab *slab;
3526 		struct slab_journal *journal;
3527 
3528 		current_slab_status = slab_statuses[0];
3529 		min_heap_pop(&heap, &slab_status_min_heap, NULL);
3530 		slab = depot->slabs[current_slab_status.slab_number];
3531 
3532 		if ((depot->load_type == VDO_SLAB_DEPOT_REBUILD_LOAD) ||
3533 		    (!allocator->summary_entries[slab->slab_number].load_ref_counts &&
3534 		     current_slab_status.is_clean)) {
3535 			queue_slab(slab);
3536 			continue;
3537 		}
3538 
3539 		slab->status = VDO_SLAB_REQUIRES_SCRUBBING;
3540 		journal = &slab->journal;
3541 		high_priority = ((current_slab_status.is_clean &&
3542 				 (depot->load_type == VDO_SLAB_DEPOT_NORMAL_LOAD)) ||
3543 				 (journal_length(journal) >= journal->scrubbing_threshold));
3544 		register_slab_for_scrubbing(slab, high_priority);
3545 	}
3546 
3547 	vdo_free(slab_statuses);
3548 	return VDO_SUCCESS;
3549 }
3550 
3551 static const char *status_to_string(enum slab_rebuild_status status)
3552 {
3553 	switch (status) {
3554 	case VDO_SLAB_REBUILT:
3555 		return "REBUILT";
3556 	case VDO_SLAB_REQUIRES_SCRUBBING:
3557 		return "SCRUBBING";
3558 	case VDO_SLAB_REQUIRES_HIGH_PRIORITY_SCRUBBING:
3559 		return "PRIORITY_SCRUBBING";
3560 	case VDO_SLAB_REBUILDING:
3561 		return "REBUILDING";
3562 	case VDO_SLAB_REPLAYING:
3563 		return "REPLAYING";
3564 	default:
3565 		return "UNKNOWN";
3566 	}
3567 }
3568 
3569 void vdo_dump_block_allocator(const struct block_allocator *allocator)
3570 {
3571 	unsigned int pause_counter = 0;
3572 	struct slab_iterator iterator = get_slab_iterator(allocator);
3573 	const struct slab_scrubber *scrubber = &allocator->scrubber;
3574 
3575 	vdo_log_info("block_allocator zone %u", allocator->zone_number);
3576 	while (iterator.next != NULL) {
3577 		struct vdo_slab *slab = next_slab(&iterator);
3578 		struct slab_journal *journal = &slab->journal;
3579 
3580 		if (slab->reference_blocks != NULL) {
3581 			/* Terse because there are a lot of slabs to dump and syslog is lossy. */
3582 			vdo_log_info("slab %u: P%u, %llu free", slab->slab_number,
3583 				     slab->priority,
3584 				     (unsigned long long) slab->free_blocks);
3585 		} else {
3586 			vdo_log_info("slab %u: status %s", slab->slab_number,
3587 				     status_to_string(slab->status));
3588 		}
3589 
3590 		vdo_log_info("  slab journal: entry_waiters=%zu waiting_to_commit=%s updating_slab_summary=%s head=%llu unreapable=%llu tail=%llu next_commit=%llu summarized=%llu last_summarized=%llu recovery_lock=%llu dirty=%s",
3591 			     vdo_waitq_num_waiters(&journal->entry_waiters),
3592 			     vdo_bool_to_string(journal->waiting_to_commit),
3593 			     vdo_bool_to_string(journal->updating_slab_summary),
3594 			     (unsigned long long) journal->head,
3595 			     (unsigned long long) journal->unreapable,
3596 			     (unsigned long long) journal->tail,
3597 			     (unsigned long long) journal->next_commit,
3598 			     (unsigned long long) journal->summarized,
3599 			     (unsigned long long) journal->last_summarized,
3600 			     (unsigned long long) journal->recovery_lock,
3601 			     vdo_bool_to_string(journal->recovery_lock != 0));
3602 		/*
3603 		 * Given the frequency with which the locks are just a tiny bit off, it might be
3604 		 * worth dumping all the locks, but that might be too much logging.
3605 		 */
3606 
3607 		if (slab->counters != NULL) {
3608 			/* Terse because there are a lot of slabs to dump and syslog is lossy. */
3609 			vdo_log_info("  slab: free=%u/%u blocks=%u dirty=%zu active=%zu journal@(%llu,%u)",
3610 				     slab->free_blocks, slab->block_count,
3611 				     slab->reference_block_count,
3612 				     vdo_waitq_num_waiters(&slab->dirty_blocks),
3613 				     slab->active_count,
3614 				     (unsigned long long) slab->slab_journal_point.sequence_number,
3615 				     slab->slab_journal_point.entry_count);
3616 		} else {
3617 			vdo_log_info("  no counters");
3618 		}
3619 
3620 		/*
3621 		 * Wait for a while after each batch of 32 slabs dumped, an arbitrary number,
3622 		 * allowing the kernel log a chance to be flushed instead of being overrun.
3623 		 */
3624 		if (pause_counter++ == 31) {
3625 			pause_counter = 0;
3626 			vdo_pause_for_logger();
3627 		}
3628 	}
3629 
3630 	vdo_log_info("slab_scrubber slab_count %u waiters %zu %s%s",
3631 		     READ_ONCE(scrubber->slab_count),
3632 		     vdo_waitq_num_waiters(&scrubber->waiters),
3633 		     vdo_get_admin_state_code(&scrubber->admin_state)->name,
3634 		     scrubber->high_priority_only ? ", high_priority_only " : "");
3635 }
3636 
3637 static void free_slab(struct vdo_slab *slab)
3638 {
3639 	if (slab == NULL)
3640 		return;
3641 
3642 	list_del(&slab->allocq_entry);
3643 	vdo_free(vdo_forget(slab->journal.block));
3644 	vdo_free(vdo_forget(slab->journal.locks));
3645 	vdo_free(vdo_forget(slab->counters));
3646 	vdo_free(vdo_forget(slab->reference_blocks));
3647 	vdo_free(slab);
3648 }
3649 
3650 static int initialize_slab_journal(struct vdo_slab *slab)
3651 {
3652 	struct slab_journal *journal = &slab->journal;
3653 	const struct slab_config *slab_config = &slab->allocator->depot->slab_config;
3654 	int result;
3655 
3656 	result = vdo_allocate(slab_config->slab_journal_blocks, struct journal_lock,
3657 			      __func__, &journal->locks);
3658 	if (result != VDO_SUCCESS)
3659 		return result;
3660 
3661 	result = vdo_allocate(VDO_BLOCK_SIZE, char, "struct packed_slab_journal_block",
3662 			      (char **) &journal->block);
3663 	if (result != VDO_SUCCESS)
3664 		return result;
3665 
3666 	journal->slab = slab;
3667 	journal->size = slab_config->slab_journal_blocks;
3668 	journal->flushing_threshold = slab_config->slab_journal_flushing_threshold;
3669 	journal->blocking_threshold = slab_config->slab_journal_blocking_threshold;
3670 	journal->scrubbing_threshold = slab_config->slab_journal_scrubbing_threshold;
3671 	journal->entries_per_block = VDO_SLAB_JOURNAL_ENTRIES_PER_BLOCK;
3672 	journal->full_entries_per_block = VDO_SLAB_JOURNAL_FULL_ENTRIES_PER_BLOCK;
3673 	journal->events = &slab->allocator->slab_journal_statistics;
3674 	journal->recovery_journal = slab->allocator->depot->vdo->recovery_journal;
3675 	journal->tail = 1;
3676 	journal->head = 1;
3677 
3678 	journal->flushing_deadline = journal->flushing_threshold;
3679 	/*
3680 	 * Set there to be some time between the deadline and the blocking threshold, so that
3681 	 * hopefully all are done before blocking.
3682 	 */
3683 	if ((journal->blocking_threshold - journal->flushing_threshold) > 5)
3684 		journal->flushing_deadline = journal->blocking_threshold - 5;
3685 
3686 	journal->slab_summary_waiter.callback = release_journal_locks;
3687 
3688 	INIT_LIST_HEAD(&journal->dirty_entry);
3689 	INIT_LIST_HEAD(&journal->uncommitted_blocks);
3690 
3691 	journal->tail_header.nonce = slab->allocator->nonce;
3692 	journal->tail_header.metadata_type = VDO_METADATA_SLAB_JOURNAL;
3693 	initialize_journal_state(journal);
3694 	return VDO_SUCCESS;
3695 }
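
/*
 * Hypothetical numbers for the deadline adjustment above: with a flushing threshold of 60 and a
 * blocking threshold of 100, the gap exceeds 5 so the flushing deadline becomes 95; with
 * thresholds of 60 and 63 the gap is only 3, and the deadline stays at 60.
 */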
3696 
3697 /**
3698  * make_slab() - Construct a new, empty slab.
3699  * @slab_origin: The physical block number within the block allocator partition of the first block
3700  *               in the slab.
3701  * @allocator: The block allocator to which the slab belongs.
3702  * @slab_number: The slab number of the slab.
3703  * @is_new: true if this slab is being allocated as part of a resize.
3704  * @slab_ptr: A pointer to receive the new slab.
3705  *
3706  * Return: VDO_SUCCESS or an error code.
3707  */
3708 static int __must_check make_slab(physical_block_number_t slab_origin,
3709 				  struct block_allocator *allocator,
3710 				  slab_count_t slab_number, bool is_new,
3711 				  struct vdo_slab **slab_ptr)
3712 {
3713 	const struct slab_config *slab_config = &allocator->depot->slab_config;
3714 	struct vdo_slab *slab;
3715 	int result;
3716 
3717 	result = vdo_allocate(1, struct vdo_slab, __func__, &slab);
3718 	if (result != VDO_SUCCESS)
3719 		return result;
3720 
3721 	*slab = (struct vdo_slab) {
3722 		.allocator = allocator,
3723 		.start = slab_origin,
3724 		.end = slab_origin + slab_config->slab_blocks,
3725 		.slab_number = slab_number,
3726 		.ref_counts_origin = slab_origin + slab_config->data_blocks,
3727 		.journal_origin =
3728 			vdo_get_slab_journal_start_block(slab_config, slab_origin),
3729 		.block_count = slab_config->data_blocks,
3730 		.free_blocks = slab_config->data_blocks,
3731 		.reference_block_count =
3732 			vdo_get_saved_reference_count_size(slab_config->data_blocks),
3733 	};
3734 	INIT_LIST_HEAD(&slab->allocq_entry);
3735 
3736 	result = initialize_slab_journal(slab);
3737 	if (result != VDO_SUCCESS) {
3738 		free_slab(slab);
3739 		return result;
3740 	}
3741 
3742 	if (is_new) {
3743 		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NEW);
3744 		result = allocate_slab_counters(slab);
3745 		if (result != VDO_SUCCESS) {
3746 			free_slab(slab);
3747 			return result;
3748 		}
3749 	} else {
3750 		vdo_set_admin_state_code(&slab->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
3751 	}
3752 
3753 	*slab_ptr = slab;
3754 	return VDO_SUCCESS;
3755 }
3756 
3757 /**
3758  * allocate_slabs() - Allocate a new slab pointer array.
3759  * @depot: The depot.
3760  * @slab_count: The number of slabs the depot should have in the new array.
3761  *
3762  * Any existing slab pointers will be copied into the new array, and slabs will be allocated as
3763  * needed. The newly allocated slabs will not be distributed for use by the block allocators.
3764  *
3765  * Return: VDO_SUCCESS or an error code.
3766  */
3767 static int allocate_slabs(struct slab_depot *depot, slab_count_t slab_count)
3768 {
3769 	block_count_t slab_size;
3770 	bool resizing = false;
3771 	physical_block_number_t slab_origin;
3772 	int result;
3773 
3774 	result = vdo_allocate(slab_count, struct vdo_slab *,
3775 			      "slab pointer array", &depot->new_slabs);
3776 	if (result != VDO_SUCCESS)
3777 		return result;
3778 
3779 	if (depot->slabs != NULL) {
3780 		memcpy(depot->new_slabs, depot->slabs,
3781 		       depot->slab_count * sizeof(struct vdo_slab *));
3782 		resizing = true;
3783 	}
3784 
3785 	slab_size = depot->slab_config.slab_blocks;
3786 	slab_origin = depot->first_block + (depot->slab_count * slab_size);
3787 
3788 	for (depot->new_slab_count = depot->slab_count;
3789 	     depot->new_slab_count < slab_count;
3790 	     depot->new_slab_count++, slab_origin += slab_size) {
3791 		struct block_allocator *allocator =
3792 			&depot->allocators[depot->new_slab_count % depot->zone_count];
3793 		struct vdo_slab **slab_ptr = &depot->new_slabs[depot->new_slab_count];
3794 
3795 		result = make_slab(slab_origin, allocator, depot->new_slab_count,
3796 				   resizing, slab_ptr);
3797 		if (result != VDO_SUCCESS)
3798 			return result;
3799 	}
3800 
3801 	return VDO_SUCCESS;
3802 }
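
/*
 * A small worked example of the loop above, with hypothetical geometry: starting from an empty
 * depot with zone_count == 2, slab_blocks == 8192, and first_block == 1024, new slab 5 is
 * assigned to allocator 5 % 2 == 1 and begins at physical block 1024 + (5 * 8192) == 41984.
 * Successive slabs therefore alternate between the allocators while marching forward through
 * the depot partition.
 */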
3803 
3804 /**
3805  * vdo_abandon_new_slabs() - Abandon any new slabs in this depot, freeing them as needed.
3806  * @depot: The depot.
3807  */
3808 void vdo_abandon_new_slabs(struct slab_depot *depot)
3809 {
3810 	slab_count_t i;
3811 
3812 	if (depot->new_slabs == NULL)
3813 		return;
3814 
3815 	for (i = depot->slab_count; i < depot->new_slab_count; i++)
3816 		free_slab(vdo_forget(depot->new_slabs[i]));
3817 	depot->new_slab_count = 0;
3818 	depot->new_size = 0;
3819 	vdo_free(vdo_forget(depot->new_slabs));
3820 }
3821 
3822 /**
3823  * get_allocator_thread_id() - Get the ID of the thread on which a given allocator operates.
3824  *
3825  * Implements vdo_zone_thread_getter_fn.
3826  */
3827 static thread_id_t get_allocator_thread_id(void *context, zone_count_t zone_number)
3828 {
3829 	return ((struct slab_depot *) context)->allocators[zone_number].thread_id;
3830 }
3831 
3832 /**
3833  * release_recovery_journal_lock() - Request the slab journal to release the recovery journal lock
3834  *                                   it may hold on a specified recovery journal block.
3835  * @journal: The slab journal.
3836  * @recovery_lock: The sequence number of the recovery journal block whose locks should be
3837  *                 released.
3838  *
3839  * Return: true if the journal does hold a lock on the specified block (which it will release).
3840  */
3841 static bool __must_check release_recovery_journal_lock(struct slab_journal *journal,
3842 						       sequence_number_t recovery_lock)
3843 {
3844 	if (recovery_lock > journal->recovery_lock) {
3845 		VDO_ASSERT_LOG_ONLY((recovery_lock < journal->recovery_lock),
3846 				    "slab journal recovery lock is not older than the recovery journal head");
3847 		return false;
3848 	}
3849 
3850 	if ((recovery_lock < journal->recovery_lock) ||
3851 	    vdo_is_read_only(journal->slab->allocator->depot->vdo))
3852 		return false;
3853 
3854 	/* All locks are held by the block which is in progress; write it. */
3855 	commit_tail(journal);
3856 	return true;
3857 }
3858 
3859 /*
3860  * Request a commit of all dirty tail blocks which are locking the recovery journal block the depot
3861  * is seeking to release.
3862  *
3863  * Implements vdo_zone_action_fn.
3864  */
3865 static void release_tail_block_locks(void *context, zone_count_t zone_number,
3866 				     struct vdo_completion *parent)
3867 {
3868 	struct slab_journal *journal, *tmp;
3869 	struct slab_depot *depot = context;
3870 	struct list_head *list = &depot->allocators[zone_number].dirty_slab_journals;
3871 
3872 	list_for_each_entry_safe(journal, tmp, list, dirty_entry) {
3873 		if (!release_recovery_journal_lock(journal,
3874 						   depot->active_release_request))
3875 			break;
3876 	}
3877 
3878 	vdo_finish_completion(parent);
3879 }
3880 
3881 /**
3882  * prepare_for_tail_block_commit() - Prepare to commit oldest tail blocks.
3883  *
3884  * Implements vdo_action_preamble_fn.
3885  */
3886 static void prepare_for_tail_block_commit(void *context, struct vdo_completion *parent)
3887 {
3888 	struct slab_depot *depot = context;
3889 
3890 	depot->active_release_request = depot->new_release_request;
3891 	vdo_finish_completion(parent);
3892 }
3893 
3894 /**
3895  * schedule_tail_block_commit() - Schedule a tail block commit if necessary.
3896  *
3897  * This method should not be called directly. Rather, call vdo_schedule_default_action() on the
3898  * depot's action manager.
3899  *
3900  * Implements vdo_action_scheduler_fn.
3901  */
3902 static bool schedule_tail_block_commit(void *context)
3903 {
3904 	struct slab_depot *depot = context;
3905 
3906 	if (depot->new_release_request == depot->active_release_request)
3907 		return false;
3908 
3909 	return vdo_schedule_action(depot->action_manager,
3910 				   prepare_for_tail_block_commit,
3911 				   release_tail_block_locks,
3912 				   NULL, NULL);
3913 }
3914 
3915 /**
3916  * initialize_slab_scrubber() - Initialize an allocator's slab scrubber.
3917  * @allocator: The allocator being initialized
3918  *
3919  * Return: VDO_SUCCESS or an error.
3920  */
3921 static int initialize_slab_scrubber(struct block_allocator *allocator)
3922 {
3923 	struct slab_scrubber *scrubber = &allocator->scrubber;
3924 	block_count_t slab_journal_size =
3925 		allocator->depot->slab_config.slab_journal_blocks;
3926 	char *journal_data;
3927 	int result;
3928 
3929 	result = vdo_allocate(VDO_BLOCK_SIZE * slab_journal_size,
3930 			      char, __func__, &journal_data);
3931 	if (result != VDO_SUCCESS)
3932 		return result;
3933 
3934 	result = allocate_vio_components(allocator->completion.vdo,
3935 					 VIO_TYPE_SLAB_JOURNAL,
3936 					 VIO_PRIORITY_METADATA,
3937 					 allocator, slab_journal_size,
3938 					 journal_data, &scrubber->vio);
3939 	if (result != VDO_SUCCESS) {
3940 		vdo_free(journal_data);
3941 		return result;
3942 	}
3943 
3944 	INIT_LIST_HEAD(&scrubber->high_priority_slabs);
3945 	INIT_LIST_HEAD(&scrubber->slabs);
3946 	vdo_set_admin_state_code(&scrubber->admin_state, VDO_ADMIN_STATE_SUSPENDED);
3947 	return VDO_SUCCESS;
3948 }
3949 
3950 /**
3951  * initialize_slab_summary_block() - Initialize a slab_summary_block.
3952  * @allocator: The allocator which owns the block.
3953  * @index: The index of this block in its zone's summary.
3954  *
3955  * Return: VDO_SUCCESS or an error.
3956  */
3957 static int __must_check initialize_slab_summary_block(struct block_allocator *allocator,
3958 						      block_count_t index)
3959 {
3960 	struct slab_summary_block *block = &allocator->summary_blocks[index];
3961 	int result;
3962 
3963 	result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &block->outgoing_entries);
3964 	if (result != VDO_SUCCESS)
3965 		return result;
3966 
3967 	result = allocate_vio_components(allocator->depot->vdo, VIO_TYPE_SLAB_SUMMARY,
3968 					 VIO_PRIORITY_METADATA, NULL, 1,
3969 					 block->outgoing_entries, &block->vio);
3970 	if (result != VDO_SUCCESS)
3971 		return result;
3972 
3973 	block->allocator = allocator;
3974 	block->entries = &allocator->summary_entries[VDO_SLAB_SUMMARY_ENTRIES_PER_BLOCK * index];
3975 	block->index = index;
3976 	return VDO_SUCCESS;
3977 }
3978 
3979 static int __must_check initialize_block_allocator(struct slab_depot *depot,
3980 						   zone_count_t zone)
3981 {
3982 	int result;
3983 	block_count_t i;
3984 	struct block_allocator *allocator = &depot->allocators[zone];
3985 	struct vdo *vdo = depot->vdo;
3986 	block_count_t max_free_blocks = depot->slab_config.data_blocks;
3987 	unsigned int max_priority = (2 + ilog2(max_free_blocks));
3988 
3989 	*allocator = (struct block_allocator) {
3990 		.depot = depot,
3991 		.zone_number = zone,
3992 		.thread_id = vdo->thread_config.physical_threads[zone],
3993 		.nonce = vdo->states.vdo.nonce,
3994 	};
3995 
3996 	INIT_LIST_HEAD(&allocator->dirty_slab_journals);
3997 	vdo_set_admin_state_code(&allocator->state, VDO_ADMIN_STATE_NORMAL_OPERATION);
3998 	result = vdo_register_read_only_listener(vdo, allocator,
3999 						 notify_block_allocator_of_read_only_mode,
4000 						 allocator->thread_id);
4001 	if (result != VDO_SUCCESS)
4002 		return result;
4003 
4004 	vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION);
4005 	result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id,
4006 			       VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA,
4007 			       allocator, &allocator->vio_pool);
4008 	if (result != VDO_SUCCESS)
4009 		return result;
4010 
4011 	result = initialize_slab_scrubber(allocator);
4012 	if (result != VDO_SUCCESS)
4013 		return result;
4014 
4015 	result = vdo_make_priority_table(max_priority, &allocator->prioritized_slabs);
4016 	if (result != VDO_SUCCESS)
4017 		return result;
4018 
4019 	result = vdo_allocate(VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE,
4020 			      struct slab_summary_block, __func__,
4021 			      &allocator->summary_blocks);
4022 	if (result != VDO_SUCCESS)
4023 		return result;
4024 
4025 	vdo_set_admin_state_code(&allocator->summary_state,
4026 				 VDO_ADMIN_STATE_NORMAL_OPERATION);
4027 	allocator->summary_entries = depot->summary_entries + (MAX_VDO_SLABS * zone);
4028 
4029 	/* Initialize each summary block. */
4030 	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
4031 		result = initialize_slab_summary_block(allocator, i);
4032 		if (result != VDO_SUCCESS)
4033 			return result;
4034 	}
4035 
4036 	/*
4037 	 * Performing well atop thin provisioned storage requires either that VDO discards freed
4038 	 * blocks, or that the block allocator try to use slabs that already have allocated blocks
4039 	 * in preference to slabs that have never been opened. For reasons we have not been able to
4040 	 * fully understand, some SSD machines have been very sensitive (50% reduction in
4041 	 * test throughput) to very slight differences in the timing and locality of block
4042 	 * allocation. Assigning a low priority to unopened slabs (max_priority/2, say) would be
4043 	 * ideal in principle, but anything less than a very high threshold (max_priority - 1)
4044 	 * hurts on these machines.
4045 	 *
4046 	 * This sets the free block threshold for preferring to open an unopened slab to the binary
4047 	 * floor of 3/4ths the total number of data blocks in a slab, which will generally evaluate
4048 	 * to about half the slab size.
4049 	 */
4050 	allocator->unopened_slab_priority = (1 + ilog2((max_free_blocks * 3) / 4));
4051 
4052 	return VDO_SUCCESS;
4053 }
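
/*
 * Worked numbers for the two priority computations above, assuming a hypothetical slab with
 * data_blocks == 32768: max_priority is 2 + ilog2(32768) == 17, and unopened_slab_priority is
 * 1 + ilog2((32768 * 3) / 4) == 1 + ilog2(24576) == 15.
 */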
4054 
4055 static int allocate_components(struct slab_depot *depot,
4056 			       struct partition *summary_partition)
4057 {
4058 	int result;
4059 	zone_count_t zone;
4060 	slab_count_t slab_count;
4061 	u8 hint;
4062 	u32 i;
4063 	const struct thread_config *thread_config = &depot->vdo->thread_config;
4064 
4065 	result = vdo_make_action_manager(depot->zone_count, get_allocator_thread_id,
4066 					 thread_config->journal_thread, depot,
4067 					 schedule_tail_block_commit,
4068 					 depot->vdo, &depot->action_manager);
4069 	if (result != VDO_SUCCESS)
4070 		return result;
4071 
4072 	depot->origin = depot->first_block;
4073 
4074 	/* block size must be a multiple of entry size */
4075 	BUILD_BUG_ON((VDO_BLOCK_SIZE % sizeof(struct slab_summary_entry)) != 0);
4076 
4077 	depot->summary_origin = summary_partition->offset;
4078 	depot->hint_shift = vdo_get_slab_summary_hint_shift(depot->slab_size_shift);
4079 	result = vdo_allocate(MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES,
4080 			      struct slab_summary_entry, __func__,
4081 			      &depot->summary_entries);
4082 	if (result != VDO_SUCCESS)
4083 		return result;
4084 
4085 
4086 	/* Initialize all the entries. */
4087 	hint = compute_fullness_hint(depot, depot->slab_config.data_blocks);
4088 	for (i = 0; i < MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES; i++) {
4089 		/*
4090 		 * This default tail block offset must be reflected in
4091 		 * slabJournal.c::read_slab_journal_tail().
4092 		 */
4093 		depot->summary_entries[i] = (struct slab_summary_entry) {
4094 			.tail_block_offset = 0,
4095 			.fullness_hint = hint,
4096 			.load_ref_counts = false,
4097 			.is_dirty = false,
4098 		};
4099 	}
4100 
4101 	slab_count = vdo_compute_slab_count(depot->first_block, depot->last_block,
4102 					    depot->slab_size_shift);
4103 	if (thread_config->physical_zone_count > slab_count) {
4104 		return vdo_log_error_strerror(VDO_BAD_CONFIGURATION,
4105 					      "%u physical zones exceeds slab count %u",
4106 					      thread_config->physical_zone_count,
4107 					      slab_count);
4108 	}
4109 
4110 	/* Initialize the block allocators. */
4111 	for (zone = 0; zone < depot->zone_count; zone++) {
4112 		result = initialize_block_allocator(depot, zone);
4113 		if (result != VDO_SUCCESS)
4114 			return result;
4115 	}
4116 
4117 	/* Allocate slabs. */
4118 	result = allocate_slabs(depot, slab_count);
4119 	if (result != VDO_SUCCESS)
4120 		return result;
4121 
4122 	/* Use the new slabs. */
4123 	for (i = depot->slab_count; i < depot->new_slab_count; i++) {
4124 		struct vdo_slab *slab = depot->new_slabs[i];
4125 
4126 		register_slab_with_allocator(slab->allocator, slab);
4127 		WRITE_ONCE(depot->slab_count, depot->slab_count + 1);
4128 	}
4129 
4130 	depot->slabs = depot->new_slabs;
4131 	depot->new_slabs = NULL;
4132 	depot->new_slab_count = 0;
4133 
4134 	return VDO_SUCCESS;
4135 }
4136 
4137 /**
4138  * vdo_decode_slab_depot() - Make a slab depot and configure it with the state read from the super
4139  *                           block.
4140  * @state: The slab depot state from the super block.
4141  * @vdo: The VDO which will own the depot.
4142  * @summary_partition: The partition which holds the slab summary.
4143  * @depot_ptr: A pointer to hold the depot.
4144  *
4145  * Return: A success or error code.
4146  */
4147 int vdo_decode_slab_depot(struct slab_depot_state_2_0 state, struct vdo *vdo,
4148 			  struct partition *summary_partition,
4149 			  struct slab_depot **depot_ptr)
4150 {
4151 	unsigned int slab_size_shift;
4152 	struct slab_depot *depot;
4153 	int result;
4154 
4155 	/*
4156 	 * Calculate the bit shift for efficiently mapping block numbers to slabs. Using a shift
4157 	 * requires that the slab size be a power of two.
4158 	 */
4159 	block_count_t slab_size = state.slab_config.slab_blocks;
4160 
4161 	if (!is_power_of_2(slab_size)) {
4162 		return vdo_log_error_strerror(UDS_INVALID_ARGUMENT,
4163 					      "slab size must be a power of two");
4164 	}
4165 	slab_size_shift = ilog2(slab_size);
4166 
4167 	result = vdo_allocate_extended(struct slab_depot,
4168 				       vdo->thread_config.physical_zone_count,
4169 				       struct block_allocator, __func__, &depot);
4170 	if (result != VDO_SUCCESS)
4171 		return result;
4172 
4173 	depot->vdo = vdo;
4174 	depot->old_zone_count = state.zone_count;
4175 	depot->zone_count = vdo->thread_config.physical_zone_count;
4176 	depot->slab_config = state.slab_config;
4177 	depot->first_block = state.first_block;
4178 	depot->last_block = state.last_block;
4179 	depot->slab_size_shift = slab_size_shift;
4180 
4181 	result = allocate_components(depot, summary_partition);
4182 	if (result != VDO_SUCCESS) {
4183 		vdo_free_slab_depot(depot);
4184 		return result;
4185 	}
4186 
4187 	*depot_ptr = depot;
4188 	return VDO_SUCCESS;
4189 }
4190 
4191 static void uninitialize_allocator_summary(struct block_allocator *allocator)
4192 {
4193 	block_count_t i;
4194 
4195 	if (allocator->summary_blocks == NULL)
4196 		return;
4197 
4198 	for (i = 0; i < VDO_SLAB_SUMMARY_BLOCKS_PER_ZONE; i++) {
4199 		free_vio_components(&allocator->summary_blocks[i].vio);
4200 		vdo_free(vdo_forget(allocator->summary_blocks[i].outgoing_entries));
4201 	}
4202 
4203 	vdo_free(vdo_forget(allocator->summary_blocks));
4204 }
4205 
4206 /**
4207  * vdo_free_slab_depot() - Destroy a slab depot.
4208  * @depot: The depot to destroy.
4209  */
4210 void vdo_free_slab_depot(struct slab_depot *depot)
4211 {
4212 	zone_count_t zone = 0;
4213 
4214 	if (depot == NULL)
4215 		return;
4216 
4217 	vdo_abandon_new_slabs(depot);
4218 
4219 	for (zone = 0; zone < depot->zone_count; zone++) {
4220 		struct block_allocator *allocator = &depot->allocators[zone];
4221 
4222 		if (allocator->eraser != NULL)
4223 			dm_kcopyd_client_destroy(vdo_forget(allocator->eraser));
4224 
4225 		uninitialize_allocator_summary(allocator);
4226 		uninitialize_scrubber_vio(&allocator->scrubber);
4227 		free_vio_pool(vdo_forget(allocator->vio_pool));
4228 		vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs));
4229 	}
4230 
4231 	if (depot->slabs != NULL) {
4232 		slab_count_t i;
4233 
4234 		for (i = 0; i < depot->slab_count; i++)
4235 			free_slab(vdo_forget(depot->slabs[i]));
4236 	}
4237 
4238 	vdo_free(vdo_forget(depot->slabs));
4239 	vdo_free(vdo_forget(depot->action_manager));
4240 	vdo_free(vdo_forget(depot->summary_entries));
4241 	vdo_free(depot);
4242 }
4243 
4244 /**
4245  * vdo_record_slab_depot() - Record the state of a slab depot for encoding into the super block.
4246  * @depot: The depot to encode.
4247  *
4248  * Return: The depot state.
4249  */
4250 struct slab_depot_state_2_0 vdo_record_slab_depot(const struct slab_depot *depot)
4251 {
4252 	/*
4253 	 * If this depot is currently using 0 zones, it must have been synchronously loaded by a
4254 	 * tool and is now being saved. We did not load and combine the slab summary, so we still
4255 	 * need to do that next time we load with the old zone count rather than 0.
4256 	 */
4257 	struct slab_depot_state_2_0 state;
4258 	zone_count_t zones_to_record = depot->zone_count;
4259 
4260 	if (depot->zone_count == 0)
4261 		zones_to_record = depot->old_zone_count;
4262 
4263 	state = (struct slab_depot_state_2_0) {
4264 		.slab_config = depot->slab_config,
4265 		.first_block = depot->first_block,
4266 		.last_block = depot->last_block,
4267 		.zone_count = zones_to_record,
4268 	};
4269 
4270 	return state;
4271 }
4272 
4273 /**
4274  * vdo_allocate_reference_counters() - Allocate the reference counters for all slabs in the depot.
4275  *
4276  * Context: This method may be called only before entering normal operation from the load thread.
4277  *
4278  * Return: VDO_SUCCESS or an error.
4279  */
4280 int vdo_allocate_reference_counters(struct slab_depot *depot)
4281 {
4282 	struct slab_iterator iterator =
4283 		get_depot_slab_iterator(depot, depot->slab_count - 1, 0, 1);
4284 
4285 	while (iterator.next != NULL) {
4286 		int result = allocate_slab_counters(next_slab(&iterator));
4287 
4288 		if (result != VDO_SUCCESS)
4289 			return result;
4290 	}
4291 
4292 	return VDO_SUCCESS;
4293 }
4294 
4295 /**
4296  * get_slab_number() - Get the number of the slab that contains a specified block.
4297  * @depot: The slab depot.
4298  * @pbn: The physical block number.
4299  * @slab_number_ptr: A pointer to hold the slab number.
4300  *
4301  * Return: VDO_SUCCESS or an error.
4302  */
4303 static int __must_check get_slab_number(const struct slab_depot *depot,
4304 					physical_block_number_t pbn,
4305 					slab_count_t *slab_number_ptr)
4306 {
4307 	slab_count_t slab_number;
4308 
4309 	if (pbn < depot->first_block)
4310 		return VDO_OUT_OF_RANGE;
4311 
4312 	slab_number = (pbn - depot->first_block) >> depot->slab_size_shift;
4313 	if (slab_number >= depot->slab_count)
4314 		return VDO_OUT_OF_RANGE;
4315 
4316 	*slab_number_ptr = slab_number;
4317 	return VDO_SUCCESS;
4318 }
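
/*
 * A hypothetical example of the arithmetic above: with first_block == 1024 and
 * slab_size_shift == 15 (32768-block slabs), pbn 101376 maps to slab
 * (101376 - 1024) >> 15 == 3. Any pbn below first_block, or one whose computed slab number is
 * at or beyond slab_count, yields VDO_OUT_OF_RANGE.
 */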
4319 
4320 /**
4321  * vdo_get_slab() - Get the slab object for the slab that contains a specified block.
4322  * @depot: The slab depot.
4323  * @pbn: The physical block number.
4324  *
4325  * Will put the VDO in read-only mode if the PBN is neither a valid data block nor the zero block.
4326  *
4327  * Return: The slab containing the block, or NULL if the block number is the zero block or
4328  * otherwise out of range.
4329  */
4330 struct vdo_slab *vdo_get_slab(const struct slab_depot *depot,
4331 			      physical_block_number_t pbn)
4332 {
4333 	slab_count_t slab_number;
4334 	int result;
4335 
4336 	if (pbn == VDO_ZERO_BLOCK)
4337 		return NULL;
4338 
4339 	result = get_slab_number(depot, pbn, &slab_number);
4340 	if (result != VDO_SUCCESS) {
4341 		vdo_enter_read_only_mode(depot->vdo, result);
4342 		return NULL;
4343 	}
4344 
4345 	return depot->slabs[slab_number];
4346 }
4347 
4348 /**
4349  * vdo_get_increment_limit() - Determine how many new references a block can acquire.
4350  * @depot: The slab depot.
4351  * @pbn: The physical block number that is being queried.
4352  *
4353  * Context: This method must be called from the physical zone thread of the PBN.
4354  *
4355  * Return: The number of available references.
4356  */
4357 u8 vdo_get_increment_limit(struct slab_depot *depot, physical_block_number_t pbn)
4358 {
4359 	struct vdo_slab *slab = vdo_get_slab(depot, pbn);
4360 	vdo_refcount_t *counter_ptr = NULL;
4361 	int result;
4362 
4363 	if ((slab == NULL) || (slab->status != VDO_SLAB_REBUILT))
4364 		return 0;
4365 
4366 	result = get_reference_counter(slab, pbn, &counter_ptr);
4367 	if (result != VDO_SUCCESS)
4368 		return 0;
4369 
4370 	if (*counter_ptr == PROVISIONAL_REFERENCE_COUNT)
4371 		return (MAXIMUM_REFERENCE_COUNT - 1);
4372 
4373 	return (MAXIMUM_REFERENCE_COUNT - *counter_ptr);
4374 }
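
/*
 * In other words, a block already holding N references can accept at most
 * MAXIMUM_REFERENCE_COUNT - N further increments, a provisionally referenced block is treated
 * as though it held exactly one, and a block in a slab that has not been rebuilt (or whose
 * counter cannot be located) reports no available references at all.
 */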
4375 
4376 /**
4377  * vdo_is_physical_data_block() - Determine whether the given PBN refers to a data block.
4378  * @depot: The depot.
4379  * @pbn: The physical block number to ask about.
4380  *
4381  * Return: True if the PBN corresponds to a data block.
4382  */
4383 bool vdo_is_physical_data_block(const struct slab_depot *depot,
4384 				physical_block_number_t pbn)
4385 {
4386 	slab_count_t slab_number;
4387 	slab_block_number sbn;
4388 
4389 	return ((pbn == VDO_ZERO_BLOCK) ||
4390 		((get_slab_number(depot, pbn, &slab_number) == VDO_SUCCESS) &&
4391 		 (slab_block_number_from_pbn(depot->slabs[slab_number], pbn, &sbn) ==
4392 		  VDO_SUCCESS)));
4393 }
4394 
4395 /**
4396  * vdo_get_slab_depot_allocated_blocks() - Get the total number of data blocks allocated across all
4397  * the slabs in the depot.
4398  * @depot: The slab depot.
4399  *
4400  * This is the total number of blocks with a non-zero reference count.
4401  *
4402  * Context: This may be called from any thread.
4403  *
4404  * Return: The total number of blocks with a non-zero reference count.
4405  */
4406 block_count_t vdo_get_slab_depot_allocated_blocks(const struct slab_depot *depot)
4407 {
4408 	block_count_t total = 0;
4409 	zone_count_t zone;
4410 
4411 	for (zone = 0; zone < depot->zone_count; zone++) {
4412 		/* The allocators are responsible for thread safety. */
4413 		total += READ_ONCE(depot->allocators[zone].allocated_blocks);
4414 	}
4415 
4416 	return total;
4417 }
4418 
4419 /**
4420  * vdo_get_slab_depot_data_blocks() - Get the total number of data blocks in all the slabs in the
4421  *                                    depot.
4422  * @depot: The slab depot.
4423  *
4424  * Context: This may be called from any thread.
4425  *
4426  * Return: The total number of data blocks in all slabs.
4427  */
4428 block_count_t vdo_get_slab_depot_data_blocks(const struct slab_depot *depot)
4429 {
4430 	return (READ_ONCE(depot->slab_count) * depot->slab_config.data_blocks);
4431 }
4432 
4433 /**
4434  * finish_combining_zones() - Clean up after saving out the combined slab summary.
4435  * @completion: The vio which was used to write the summary data.
4436  */
4437 static void finish_combining_zones(struct vdo_completion *completion)
4438 {
4439 	int result = completion->result;
4440 	struct vdo_completion *parent = completion->parent;
4441 
4442 	free_vio(as_vio(vdo_forget(completion)));
4443 	vdo_fail_completion(parent, result);
4444 }
4445 
4446 static void handle_combining_error(struct vdo_completion *completion)
4447 {
4448 	vio_record_metadata_io_error(as_vio(completion));
4449 	finish_combining_zones(completion);
4450 }
4451 
4452 static void write_summary_endio(struct bio *bio)
4453 {
4454 	struct vio *vio = bio->bi_private;
4455 	struct vdo *vdo = vio->completion.vdo;
4456 
4457 	continue_vio_after_io(vio, finish_combining_zones,
4458 			      vdo->thread_config.admin_thread);
4459 }
4460 
4461 /**
4462  * combine_summaries() - Treating the current entries buffer as the on-disk value of all zones,
4463  *                       update every zone to the correct values for every slab.
4464  * @depot: The depot whose summary entries should be combined.
4465  */
4466 static void combine_summaries(struct slab_depot *depot)
4467 {
4468 	/*
4469 	 * Combine all the old summary data into the portion of the buffer corresponding to the
4470 	 * first zone.
4471 	 */
4472 	zone_count_t zone = 0;
4473 	struct slab_summary_entry *entries = depot->summary_entries;
4474 
4475 	if (depot->old_zone_count > 1) {
4476 		slab_count_t entry_number;
4477 
4478 		for (entry_number = 0; entry_number < MAX_VDO_SLABS; entry_number++) {
4479 			if (zone != 0) {
4480 				memcpy(entries + entry_number,
4481 				       entries + (zone * MAX_VDO_SLABS) + entry_number,
4482 				       sizeof(struct slab_summary_entry));
4483 			}
4484 
4485 			zone++;
4486 			if (zone == depot->old_zone_count)
4487 				zone = 0;
4488 		}
4489 	}
4490 
4491 	/* Copy the combined data to each zones's region of the buffer. */
4492 	for (zone = 1; zone < MAX_VDO_PHYSICAL_ZONES; zone++) {
4493 		memcpy(entries + (zone * MAX_VDO_SLABS), entries,
4494 		       MAX_VDO_SLABS * sizeof(struct slab_summary_entry));
4495 	}
4496 }
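
/*
 * A hedged walk-through, assuming old_zone_count was 2: on disk, zone 0's region holds the
 * authoritative entries for slabs 0, 2, 4, ... and zone 1's region holds those for slabs
 * 1, 3, 5, ... The first loop gathers them so that zone 0's region is correct for every slab
 * (entries 1, 3, 5, ... are copied in from zone 1's region), and the second loop then
 * replicates the combined region so that every possible zone's copy matches.
 */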
4497 
4498 /**
4499  * finish_loading_summary() - Finish loading slab summary data.
4500  * @completion: The vio which was used to read the summary data.
4501  *
4502  * Combines the slab summary data from all the previously written zones and copies the combined
4503  * summary to each partition's data region. Then writes the combined summary back out to disk. This
4504  * callback is registered in load_summary_endio().
4505  */
4506 static void finish_loading_summary(struct vdo_completion *completion)
4507 {
4508 	struct slab_depot *depot = completion->vdo->depot;
4509 
4510 	/* Combine the summary from each zone so each zone is correct for all slabs. */
4511 	combine_summaries(depot);
4512 
4513 	/* Write the combined summary back out. */
4514 	vdo_submit_metadata_vio(as_vio(completion), depot->summary_origin,
4515 				write_summary_endio, handle_combining_error,
4516 				REQ_OP_WRITE);
4517 }
4518 
4519 static void load_summary_endio(struct bio *bio)
4520 {
4521 	struct vio *vio = bio->bi_private;
4522 	struct vdo *vdo = vio->completion.vdo;
4523 
4524 	continue_vio_after_io(vio, finish_loading_summary,
4525 			      vdo->thread_config.admin_thread);
4526 }
4527 
4528 /**
4529  * load_slab_summary() - The preamble of a load operation.
4530  *
4531  * Implements vdo_action_preamble_fn.
4532  */
4533 static void load_slab_summary(void *context, struct vdo_completion *parent)
4534 {
4535 	int result;
4536 	struct vio *vio;
4537 	struct slab_depot *depot = context;
4538 	const struct admin_state_code *operation =
4539 		vdo_get_current_manager_operation(depot->action_manager);
4540 
4541 	result = create_multi_block_metadata_vio(depot->vdo, VIO_TYPE_SLAB_SUMMARY,
4542 						 VIO_PRIORITY_METADATA, parent,
4543 						 VDO_SLAB_SUMMARY_BLOCKS,
4544 						 (char *) depot->summary_entries, &vio);
4545 	if (result != VDO_SUCCESS) {
4546 		vdo_fail_completion(parent, result);
4547 		return;
4548 	}
4549 
4550 	if ((operation == VDO_ADMIN_STATE_FORMATTING) ||
4551 	    (operation == VDO_ADMIN_STATE_LOADING_FOR_REBUILD)) {
4552 		finish_loading_summary(&vio->completion);
4553 		return;
4554 	}
4555 
4556 	vdo_submit_metadata_vio(vio, depot->summary_origin, load_summary_endio,
4557 				handle_combining_error, REQ_OP_READ);
4558 }
4559 
4560 /* Implements vdo_zone_action_fn. */
4561 static void load_allocator(void *context, zone_count_t zone_number,
4562 			   struct vdo_completion *parent)
4563 {
4564 	struct slab_depot *depot = context;
4565 
4566 	vdo_start_loading(&depot->allocators[zone_number].state,
4567 			  vdo_get_current_manager_operation(depot->action_manager),
4568 			  parent, initiate_load);
4569 }
4570 
4571 /**
4572  * vdo_load_slab_depot() - Asynchronously load any slab depot state that isn't included in the
4573  *                         super_block component.
4574  * @depot: The depot to load.
4575  * @operation: The type of load to perform.
4576  * @parent: The completion to notify when the load is complete.
4577  * @context: Additional context for the load operation; may be NULL.
4578  *
4579  * This method may be called only before entering normal operation from the load thread.
4580  */
4581 void vdo_load_slab_depot(struct slab_depot *depot,
4582 			 const struct admin_state_code *operation,
4583 			 struct vdo_completion *parent, void *context)
4584 {
4585 	if (!vdo_assert_load_operation(operation, parent))
4586 		return;
4587 
4588 	vdo_schedule_operation_with_context(depot->action_manager, operation,
4589 					    load_slab_summary, load_allocator,
4590 					    NULL, context, parent);
4591 }
4592 
4593 /* Implements vdo_zone_action_fn. */
4594 static void prepare_to_allocate(void *context, zone_count_t zone_number,
4595 				struct vdo_completion *parent)
4596 {
4597 	struct slab_depot *depot = context;
4598 	struct block_allocator *allocator = &depot->allocators[zone_number];
4599 	int result;
4600 
4601 	result = vdo_prepare_slabs_for_allocation(allocator);
4602 	if (result != VDO_SUCCESS) {
4603 		vdo_fail_completion(parent, result);
4604 		return;
4605 	}
4606 
4607 	scrub_slabs(allocator, parent);
4608 }
4609 
4610 /**
4611  * vdo_prepare_slab_depot_to_allocate() - Prepare the slab depot to come online and start
4612  *                                        allocating blocks.
4613  * @depot: The depot to prepare.
4614  * @load_type: The load type.
4615  * @parent: The completion to notify when the operation is complete.
4616  *
4617  * This method may be called only before entering normal operation from the load thread. It must be
4618  * called before allocation may proceed.
4619  */
4620 void vdo_prepare_slab_depot_to_allocate(struct slab_depot *depot,
4621 					enum slab_depot_load_type load_type,
4622 					struct vdo_completion *parent)
4623 {
4624 	depot->load_type = load_type;
4625 	atomic_set(&depot->zones_to_scrub, depot->zone_count);
4626 	vdo_schedule_action(depot->action_manager, NULL,
4627 			    prepare_to_allocate, NULL, parent);
4628 }
4629 
4630 /**
4631  * vdo_update_slab_depot_size() - Update the slab depot to reflect its new size in memory.
4632  * @depot: The depot to update.
4633  *
4634  * This size is saved to disk as part of the super block.
4635  */
4636 void vdo_update_slab_depot_size(struct slab_depot *depot)
4637 {
4638 	depot->last_block = depot->new_last_block;
4639 }
4640 
4641 /**
4642  * vdo_prepare_to_grow_slab_depot() - Allocate new memory needed for a resize of a slab depot to
4643  *                                    the given size.
4644  * @depot: The depot to prepare to resize.
4645  * @partition: The new depot partition.
4646  *
4647  * Return: VDO_SUCCESS or an error.
4648  */
4649 int vdo_prepare_to_grow_slab_depot(struct slab_depot *depot,
4650 				   const struct partition *partition)
4651 {
4652 	struct slab_depot_state_2_0 new_state;
4653 	int result;
4654 	slab_count_t new_slab_count;
4655 
4656 	if ((partition->count >> depot->slab_size_shift) <= depot->slab_count)
4657 		return VDO_INCREMENT_TOO_SMALL;
4658 
4659 	/* Generate the depot configuration for the new block count. */
4660 	VDO_ASSERT_LOG_ONLY(depot->first_block == partition->offset,
4661 			    "New slab depot partition doesn't change origin");
4662 	result = vdo_configure_slab_depot(partition, depot->slab_config,
4663 					  depot->zone_count, &new_state);
4664 	if (result != VDO_SUCCESS)
4665 		return result;
4666 
4667 	new_slab_count = vdo_compute_slab_count(depot->first_block,
4668 						new_state.last_block,
4669 						depot->slab_size_shift);
4670 	if (new_slab_count <= depot->slab_count)
4671 		return vdo_log_error_strerror(VDO_INCREMENT_TOO_SMALL,
4672 					      "Depot can only grow");
4673 	if (new_slab_count == depot->new_slab_count) {
4674 		/* Check it out, we've already got all the new slabs allocated! */
4675 		return VDO_SUCCESS;
4676 	}
4677 
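	/*
	 * Any slabs left over from an earlier prepare-to-grow were sized for a different
	 * target; discard them before allocating the full new set.
	 */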
4678 	vdo_abandon_new_slabs(depot);
4679 	result = allocate_slabs(depot, new_slab_count);
4680 	if (result != VDO_SUCCESS) {
4681 		vdo_abandon_new_slabs(depot);
4682 		return result;
4683 	}
4684 
4685 	depot->new_size = partition->count;
4686 	depot->old_last_block = depot->last_block;
4687 	depot->new_last_block = new_state.last_block;
4688 
4689 	return VDO_SUCCESS;
4690 }
4691 
4692 /**
4693  * finish_registration() - Finish registering new slabs now that all of the allocators have
4694  *                         received their new slabs.
4695  *
4696  * Implements vdo_action_conclusion_fn.
4697  */
4698 static int finish_registration(void *context)
4699 {
4700 	struct slab_depot *depot = context;
4701 
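	/*
	 * All zones have registered their new slabs, so the new array can replace the old
	 * one; only the old pointer array is freed, since the slabs it referenced are also
	 * in the new array.
	 */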
4702 	WRITE_ONCE(depot->slab_count, depot->new_slab_count);
4703 	vdo_free(depot->slabs);
4704 	depot->slabs = depot->new_slabs;
4705 	depot->new_slabs = NULL;
4706 	depot->new_slab_count = 0;
4707 	return VDO_SUCCESS;
4708 }
4709 
4710 /* Implements vdo_zone_action_fn. */
4711 static void register_new_slabs(void *context, zone_count_t zone_number,
4712 			       struct vdo_completion *parent)
4713 {
4714 	struct slab_depot *depot = context;
4715 	struct block_allocator *allocator = &depot->allocators[zone_number];
4716 	slab_count_t i;
4717 
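	/* Register only the new slabs which belong to this allocator's zone. */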
4718 	for (i = depot->slab_count; i < depot->new_slab_count; i++) {
4719 		struct vdo_slab *slab = depot->new_slabs[i];
4720 
4721 		if (slab->allocator == allocator)
4722 			register_slab_with_allocator(allocator, slab);
4723 	}
4724 
4725 	vdo_finish_completion(parent);
4726 }
4727 
4728 /**
4729  * vdo_use_new_slabs() - Use the new slabs allocated for resize.
4730  * @depot: The depot.
4731  * @parent: The object to notify when complete.
4732  */
4733 void vdo_use_new_slabs(struct slab_depot *depot, struct vdo_completion *parent)
4734 {
4735 	VDO_ASSERT_LOG_ONLY(depot->new_slabs != NULL, "Must have new slabs to use");
4736 	vdo_schedule_operation(depot->action_manager,
4737 			       VDO_ADMIN_STATE_SUSPENDED_OPERATION,
4738 			       NULL, register_new_slabs,
4739 			       finish_registration, parent);
4740 }
4741 
4742 /**
4743  * stop_scrubbing() - Tell the scrubber to stop scrubbing after it finishes the slab it is
4744  *                    currently working on.
4745  * @allocator: The allocator whose scrubber should stop; its completion is notified when
4746  *             scrubbing has stopped.
4747  */
4748 static void stop_scrubbing(struct block_allocator *allocator)
4749 {
4750 	struct slab_scrubber *scrubber = &allocator->scrubber;
4751 
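	/* If the scrubber is already quiescent, there is nothing to wait for. */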
4752 	if (vdo_is_state_quiescent(&scrubber->admin_state)) {
4753 		vdo_finish_completion(&allocator->completion);
4754 	} else {
4755 		vdo_start_draining(&scrubber->admin_state,
4756 				   VDO_ADMIN_STATE_SUSPENDING,
4757 				   &allocator->completion, NULL);
4758 	}
4759 }
4760 
4761 /* Implements vdo_admin_initiator_fn. */
4762 static void initiate_summary_drain(struct admin_state *state)
4763 {
4764 	check_summary_drain_complete(container_of(state, struct block_allocator,
4765 						  summary_state));
4766 }
4767 
4768 static void do_drain_step(struct vdo_completion *completion)
4769 {
4770 	struct block_allocator *allocator = vdo_as_block_allocator(completion);
4771 
4772 	vdo_prepare_completion_for_requeue(&allocator->completion, do_drain_step,
4773 					   handle_operation_error, allocator->thread_id,
4774 					   NULL);
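	/*
	 * Each step requeues this completion when it finishes, advancing the drain one
	 * phase at a time: scrubber, slabs, summary, then finished.
	 */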
4775 	switch (++allocator->drain_step) {
4776 	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
4777 		stop_scrubbing(allocator);
4778 		return;
4779 
4780 	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
4781 		apply_to_slabs(allocator, do_drain_step);
4782 		return;
4783 
4784 	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
4785 		vdo_start_draining(&allocator->summary_state,
4786 				   vdo_get_admin_state_code(&allocator->state),
4787 				   completion, initiate_summary_drain);
4788 		return;
4789 
4790 	case VDO_DRAIN_ALLOCATOR_STEP_FINISHED:
4791 		VDO_ASSERT_LOG_ONLY(!is_vio_pool_busy(allocator->vio_pool),
4792 				    "vio pool not busy");
4793 		vdo_finish_draining_with_result(&allocator->state, completion->result);
4794 		return;
4795 
4796 	default:
4797 		vdo_finish_draining_with_result(&allocator->state, UDS_BAD_STATE);
4798 	}
4799 }
4800 
4801 /* Implements vdo_admin_initiator_fn. */
4802 static void initiate_drain(struct admin_state *state)
4803 {
4804 	struct block_allocator *allocator =
4805 		container_of(state, struct block_allocator, state);
4806 
4807 	allocator->drain_step = VDO_DRAIN_ALLOCATOR_START;
4808 	do_drain_step(&allocator->completion);
4809 }
4810 
4811 /*
4812  * Drain all allocator I/O. Depending upon the type of drain, some or all dirty metadata may be
4813  * written to disk. The type of drain will be determined from the state of the allocator's depot.
4814  *
4815  * Implements vdo_zone_action_fn.
4816  */
4817 static void drain_allocator(void *context, zone_count_t zone_number,
4818 			    struct vdo_completion *parent)
4819 {
4820 	struct slab_depot *depot = context;
4821 
4822 	vdo_start_draining(&depot->allocators[zone_number].state,
4823 			   vdo_get_current_manager_operation(depot->action_manager),
4824 			   parent, initiate_drain);
4825 }
4826 
4827 /**
4828  * vdo_drain_slab_depot() - Drain all slab depot I/O.
4829  * @depot: The depot to drain.
4830  * @operation: The drain operation (flush, rebuild, suspend, or save).
4831  * @parent: The completion to finish when the drain is complete.
4832  *
4833  * If saving or flushing, all dirty depot metadata will be written out. If saving or suspending,
4834  * the depot will be left in a suspended state.
4835  */
4836 void vdo_drain_slab_depot(struct slab_depot *depot,
4837 			  const struct admin_state_code *operation,
4838 			  struct vdo_completion *parent)
4839 {
4840 	vdo_schedule_operation(depot->action_manager, operation,
4841 			       NULL, drain_allocator, NULL, parent);
4842 }
4843 
4844 /**
4845  * resume_scrubbing() - Tell the scrubber to resume scrubbing if it has been stopped.
4846  * @allocator: The allocator being resumed.
4847  */
4848 static void resume_scrubbing(struct block_allocator *allocator)
4849 {
4850 	int result;
4851 	struct slab_scrubber *scrubber = &allocator->scrubber;
4852 
4853 	if (!has_slabs_to_scrub(scrubber)) {
4854 		vdo_finish_completion(&allocator->completion);
4855 		return;
4856 	}
4857 
4858 	result = vdo_resume_if_quiescent(&scrubber->admin_state);
4859 	if (result != VDO_SUCCESS) {
4860 		vdo_fail_completion(&allocator->completion, result);
4861 		return;
4862 	}
4863 
4864 	scrub_next_slab(scrubber);
4865 	vdo_finish_completion(&allocator->completion);
4866 }
4867 
4868 static void do_resume_step(struct vdo_completion *completion)
4869 {
4870 	struct block_allocator *allocator = vdo_as_block_allocator(completion);
4871 
4872 	vdo_prepare_completion_for_requeue(&allocator->completion, do_resume_step,
4873 					   handle_operation_error,
4874 					   allocator->thread_id, NULL);
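	/* Resume retraces the drain steps in reverse: summary, slabs, scrubber, then finish. */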
4875 	switch (--allocator->drain_step) {
4876 	case VDO_DRAIN_ALLOCATOR_STEP_SUMMARY:
4877 		vdo_fail_completion(completion,
4878 				    vdo_resume_if_quiescent(&allocator->summary_state));
4879 		return;
4880 
4881 	case VDO_DRAIN_ALLOCATOR_STEP_SLABS:
4882 		apply_to_slabs(allocator, do_resume_step);
4883 		return;
4884 
4885 	case VDO_DRAIN_ALLOCATOR_STEP_SCRUBBER:
4886 		resume_scrubbing(allocator);
4887 		return;
4888 
4889 	case VDO_DRAIN_ALLOCATOR_START:
4890 		vdo_finish_resuming_with_result(&allocator->state, completion->result);
4891 		return;
4892 
4893 	default:
4894 		vdo_finish_resuming_with_result(&allocator->state, UDS_BAD_STATE);
4895 	}
4896 }
4897 
4898 /* Implements vdo_admin_initiator_fn. */
4899 static void initiate_resume(struct admin_state *state)
4900 {
4901 	struct block_allocator *allocator =
4902 		container_of(state, struct block_allocator, state);
4903 
4904 	allocator->drain_step = VDO_DRAIN_ALLOCATOR_STEP_FINISHED;
4905 	do_resume_step(&allocator->completion);
4906 }
4907 
4908 /* Implements vdo_zone_action_fn. */
4909 static void resume_allocator(void *context, zone_count_t zone_number,
4910 			     struct vdo_completion *parent)
4911 {
4912 	struct slab_depot *depot = context;
4913 
4914 	vdo_start_resuming(&depot->allocators[zone_number].state,
4915 			   vdo_get_current_manager_operation(depot->action_manager),
4916 			   parent, initiate_resume);
4917 }
4918 
4919 /**
4920  * vdo_resume_slab_depot() - Resume a suspended slab depot.
4921  * @depot: The depot to resume.
4922  * @parent: The completion to finish when the depot has resumed.
4923  */
4924 void vdo_resume_slab_depot(struct slab_depot *depot, struct vdo_completion *parent)
4925 {
4926 	if (vdo_is_read_only(depot->vdo)) {
4927 		vdo_continue_completion(parent, VDO_READ_ONLY);
4928 		return;
4929 	}
4930 
4931 	vdo_schedule_operation(depot->action_manager, VDO_ADMIN_STATE_RESUMING,
4932 			       NULL, resume_allocator, NULL, parent);
4933 }
4934 
4935 /**
4936  * vdo_commit_oldest_slab_journal_tail_blocks() - Commit all dirty tail blocks which are locking a
4937  *                                                given recovery journal block.
4938  * @depot: The depot.
4939  * @recovery_block_number: The sequence number of the recovery journal block whose locks should be
4940  *                         released.
4941  *
4942  * Context: This method must be called from the journal zone thread.
4943  */
4944 void vdo_commit_oldest_slab_journal_tail_blocks(struct slab_depot *depot,
4945 						sequence_number_t recovery_block_number)
4946 {
4947 	if (depot == NULL)
4948 		return;
4949 
4950 	depot->new_release_request = recovery_block_number;
4951 	vdo_schedule_default_action(depot->action_manager);
4952 }
4953 
4954 /* Implements vdo_zone_action_fn. */
4955 static void scrub_all_unrecovered_slabs(void *context, zone_count_t zone_number,
4956 					struct vdo_completion *parent)
4957 {
4958 	struct slab_depot *depot = context;
4959 
4960 	scrub_slabs(&depot->allocators[zone_number], NULL);
4961 	vdo_launch_completion(parent);
4962 }
4963 
4964 /**
4965  * vdo_scrub_all_unrecovered_slabs() - Scrub all unrecovered slabs.
4966  * @depot: The depot to scrub.
4967  * @parent: The object to notify when scrubbing has been launched for all zones.
4968  */
4969 void vdo_scrub_all_unrecovered_slabs(struct slab_depot *depot,
4970 				     struct vdo_completion *parent)
4971 {
4972 	vdo_schedule_action(depot->action_manager, NULL,
4973 			    scrub_all_unrecovered_slabs,
4974 			    NULL, parent);
4975 }
4976 
4977 /**
4978  * get_block_allocator_statistics() - Get the total of the statistics from all the block allocators
4979  *                                    in the depot.
4980  * @depot: The slab depot.
4981  *
4982  * Return: The statistics from all block allocators in the depot.
4983  */
4984 static struct block_allocator_statistics __must_check
4985 get_block_allocator_statistics(const struct slab_depot *depot)
4986 {
4987 	struct block_allocator_statistics totals;
4988 	zone_count_t zone;
4989 
4990 	memset(&totals, 0, sizeof(totals));
4991 
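	/*
	 * The counters are updated on the allocator threads; READ_ONCE gives an untorn
	 * read of each one while summing here.
	 */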
4992 	for (zone = 0; zone < depot->zone_count; zone++) {
4993 		const struct block_allocator *allocator = &depot->allocators[zone];
4994 		const struct block_allocator_statistics *stats = &allocator->statistics;
4995 
4996 		totals.slab_count += allocator->slab_count;
4997 		totals.slabs_opened += READ_ONCE(stats->slabs_opened);
4998 		totals.slabs_reopened += READ_ONCE(stats->slabs_reopened);
4999 	}
5000 
5001 	return totals;
5002 }
5003 
5004 /**
5005  * get_ref_counts_statistics() - Get the cumulative ref_counts statistics for the depot.
5006  * @depot: The slab depot.
5007  *
5008  * Return: The cumulative statistics for all ref_counts in the depot.
5009  */
5010 static struct ref_counts_statistics __must_check
5011 get_ref_counts_statistics(const struct slab_depot *depot)
5012 {
5013 	struct ref_counts_statistics totals;
5014 	zone_count_t zone;
5015 
5016 	memset(&totals, 0, sizeof(totals));
5017 
5018 	for (zone = 0; zone < depot->zone_count; zone++) {
5019 		totals.blocks_written +=
5020 			READ_ONCE(depot->allocators[zone].ref_counts_statistics.blocks_written);
5021 	}
5022 
5023 	return totals;
5024 }
5025 
5026 /**
5027  * get_slab_journal_statistics() - Get the aggregated slab journal statistics for the depot.
5028  * @depot: The slab depot.
5029  *
5030  * Return: The aggregated statistics for all slab journals in the depot.
5031  */
5032 static struct slab_journal_statistics __must_check
5033 get_slab_journal_statistics(const struct slab_depot *depot)
5034 {
5035 	struct slab_journal_statistics totals;
5036 	zone_count_t zone;
5037 
5038 	memset(&totals, 0, sizeof(totals));
5039 
5040 	for (zone = 0; zone < depot->zone_count; zone++) {
5041 		const struct slab_journal_statistics *stats =
5042 			&depot->allocators[zone].slab_journal_statistics;
5043 
5044 		totals.disk_full_count += READ_ONCE(stats->disk_full_count);
5045 		totals.flush_count += READ_ONCE(stats->flush_count);
5046 		totals.blocked_count += READ_ONCE(stats->blocked_count);
5047 		totals.blocks_written += READ_ONCE(stats->blocks_written);
5048 		totals.tail_busy_count += READ_ONCE(stats->tail_busy_count);
5049 	}
5050 
5051 	return totals;
5052 }
5053 
5054 /**
5055  * vdo_get_slab_depot_statistics() - Get all the vdo_statistics fields that are properties of the
5056  *                                   slab depot.
5057  * @depot: The slab depot.
5058  * @stats: The vdo statistics structure to partially fill.
5059  */
5060 void vdo_get_slab_depot_statistics(const struct slab_depot *depot,
5061 				   struct vdo_statistics *stats)
5062 {
5063 	slab_count_t slab_count = READ_ONCE(depot->slab_count);
5064 	slab_count_t unrecovered = 0;
5065 	zone_count_t zone;
5066 
5067 	for (zone = 0; zone < depot->zone_count; zone++) {
5068 		/* The allocators are responsible for thread safety. */
5069 		unrecovered += READ_ONCE(depot->allocators[zone].scrubber.slab_count);
5070 	}
5071 
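	/* Report the percentage of slabs which no longer await scrubbing. */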
5072 	stats->recovery_percentage = (slab_count - unrecovered) * 100 / slab_count;
5073 	stats->allocator = get_block_allocator_statistics(depot);
5074 	stats->ref_counts = get_ref_counts_statistics(depot);
5075 	stats->slab_journal = get_slab_journal_statistics(depot);
5076 	stats->slab_summary = (struct slab_summary_statistics) {
5077 		.blocks_written = atomic64_read(&depot->summary_statistics.blocks_written),
5078 	};
5079 }
5080 
5081 /**
5082  * vdo_dump_slab_depot() - Dump the slab depot, in a thread-unsafe fashion.
5083  * @depot: The slab depot.
5084  */
5085 void vdo_dump_slab_depot(const struct slab_depot *depot)
5086 {
5087 	vdo_log_info("vdo slab depot");
5088 	vdo_log_info("  zone_count=%u old_zone_count=%u slab_count=%u active_release_request=%llu new_release_request=%llu",
5089 		     (unsigned int) depot->zone_count,
5090 		     (unsigned int) depot->old_zone_count, READ_ONCE(depot->slab_count),
5091 		     (unsigned long long) depot->active_release_request,
5092 		     (unsigned long long) depot->new_release_request);
5093 }
5094