recovery-journal.c (revision d358e5254674b70f34c847715ca509e46eb81e6f) - OpenGrok cross reference for /linux/drivers/md/dm-vdo/recovery-journal.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "recovery-journal.h"

#include <linux/atomic.h>
#include <linux/bio.h>

#include "logger.h"
#include "memory-alloc.h"
#include "permassert.h"

#include "block-map.h"
#include "completion.h"
#include "constants.h"
#include "data-vio.h"
#include "encodings.h"
#include "io-submitter.h"
#include "slab-depot.h"
#include "types.h"
#include "vdo.h"
#include "vio.h"
#include "wait-queue.h"

static const u64 RECOVERY_COUNT_MASK = 0xff;

/*
 * The number of reserved blocks must be large enough to prevent a new recovery journal
 * block write from overwriting a block which appears to still be a valid head block of the
 * journal. Currently, that means reserving enough space for all 2048 data_vios.
 */
#define RECOVERY_JOURNAL_RESERVED_BLOCKS				\
	((MAXIMUM_VDO_USER_VIOS / RECOVERY_JOURNAL_ENTRIES_PER_BLOCK) + 2)

/**
 * DOC: Lock Counters.
 *
 * A lock_counter is intended to keep all of the locks for the blocks in the recovery journal. The
 * per-zone counters are all kept in a single array which is arranged by zone (i.e. zone 0's lock 0
 * is at index 0, zone 0's lock 1 is at index 1, and zone 1's lock 0 is at index 'locks'. This
 * arrangement is intended to minimize cache-line contention for counters from different zones.
 *
 * The locks are implemented as a single object instead of as a lock counter per lock both to
 * afford this opportunity to reduce cache line contention and also to eliminate the need to have a
 * completion per lock.
 *
 * Lock sets are laid out with the set for recovery journal first, followed by the logical zones,
 * and then the physical zones.
 */

enum lock_counter_state {
	LOCK_COUNTER_STATE_NOT_NOTIFYING,
	LOCK_COUNTER_STATE_NOTIFYING,
	LOCK_COUNTER_STATE_SUSPENDED,
};

/**
 * get_zone_count_ptr() - Get a pointer to the zone count for a given lock on a given zone.
 * @journal: The recovery journal.
 * @lock_number: The lock to get.
 * @zone_type: The zone type whose count is desired.
 *
 * Return: A pointer to the zone count for the given lock and zone.
 */
static inline atomic_t *get_zone_count_ptr(struct recovery_journal *journal,
					   block_count_t lock_number,
					   enum vdo_zone_type zone_type)
{
	return ((zone_type == VDO_ZONE_TYPE_LOGICAL)
		? &journal->lock_counter.logical_zone_counts[lock_number]
		: &journal->lock_counter.physical_zone_counts[lock_number]);
}

/**
 * get_counter() - Get the zone counter for a given lock on a given zone.
 * @journal: The recovery journal.
 * @lock_number: The lock to get.
 * @zone_type: The zone type whose count is desired.
 * @zone_id: The zone index whose count is desired.
 *
 * Return: The counter for the given lock and zone.
 */
static inline u16 *get_counter(struct recovery_journal *journal,
			       block_count_t lock_number, enum vdo_zone_type zone_type,
			       zone_count_t zone_id)
{
	struct lock_counter *counter = &journal->lock_counter;
	block_count_t zone_counter = (counter->locks * zone_id) + lock_number;

	if (zone_type == VDO_ZONE_TYPE_JOURNAL)
		return &counter->journal_counters[zone_counter];

	if (zone_type == VDO_ZONE_TYPE_LOGICAL)
		return &counter->logical_counters[zone_counter];

	return &counter->physical_counters[zone_counter];
}

static atomic_t *get_decrement_counter(struct recovery_journal *journal,
				       block_count_t lock_number)
{
	return &journal->lock_counter.journal_decrement_counts[lock_number];
}

/**
 * is_journal_zone_locked() - Check whether the journal zone is locked for a given lock.
 * @journal: The recovery journal.
 * @lock_number: The lock to check.
 *
 * Return: True if the journal zone is locked.
 */
static bool is_journal_zone_locked(struct recovery_journal *journal,
				   block_count_t lock_number)
{
	u16 journal_value = *get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0);
	u32 decrements = atomic_read(get_decrement_counter(journal, lock_number));

	/* Pairs with barrier in vdo_release_journal_entry_lock() */
	smp_rmb();
	VDO_ASSERT_LOG_ONLY((decrements <= journal_value),
			    "journal zone lock counter must not underflow");
	return (journal_value != decrements);
}

/**
 * vdo_release_recovery_journal_block_reference() - Release a reference to a recovery journal
 *                                                  block.
 * @journal: The recovery journal.
 * @sequence_number: The journal sequence number of the referenced block.
 * @zone_type: The type of the zone making the adjustment.
 * @zone_id: The ID of the zone making the adjustment.
 *
 * If this is the last reference for a given zone type, an attempt will be made to reap the
 * journal.
 */
void vdo_release_recovery_journal_block_reference(struct recovery_journal *journal,
						  sequence_number_t sequence_number,
						  enum vdo_zone_type zone_type,
						  zone_count_t zone_id)
{
	u16 *current_value;
	block_count_t lock_number;
	int prior_state;

	if (sequence_number == 0)
		return;

	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
	current_value = get_counter(journal, lock_number, zone_type, zone_id);

	VDO_ASSERT_LOG_ONLY((*current_value >= 1),
			    "decrement of lock counter must not underflow");
	*current_value -= 1;

	if (zone_type == VDO_ZONE_TYPE_JOURNAL) {
		if (is_journal_zone_locked(journal, lock_number))
			return;
	} else {
		atomic_t *zone_count;

		if (*current_value != 0)
			return;

		zone_count = get_zone_count_ptr(journal, lock_number, zone_type);

		if (atomic_add_return(-1, zone_count) > 0)
			return;
	}

	/*
	 * Extra barriers because this was original developed using a CAS operation that implicitly
	 * had them.
	 */
	smp_mb__before_atomic();
	prior_state = atomic_cmpxchg(&journal->lock_counter.state,
				     LOCK_COUNTER_STATE_NOT_NOTIFYING,
				     LOCK_COUNTER_STATE_NOTIFYING);
	/* same as before_atomic */
	smp_mb__after_atomic();

	if (prior_state != LOCK_COUNTER_STATE_NOT_NOTIFYING)
		return;

	vdo_launch_completion(&journal->lock_counter.completion);
}

static inline struct recovery_journal_block * __must_check get_journal_block(struct list_head *list)
{
	return list_first_entry_or_null(list, struct recovery_journal_block, list_node);
}

/**
 * pop_free_list() - Get a block from the end of the free list.
 * @journal: The journal.
 *
 * Return: The block or NULL if the list is empty.
 */
static struct recovery_journal_block * __must_check pop_free_list(struct recovery_journal *journal)
{
	struct recovery_journal_block *block;

	if (list_empty(&journal->free_tail_blocks))
		return NULL;

	block = list_last_entry(&journal->free_tail_blocks,
				struct recovery_journal_block, list_node);
	list_del_init(&block->list_node);
	return block;
}

/**
 * is_block_dirty() - Check whether a recovery block is dirty.
 * @block: The block to check.
 *
 * Indicates it has any uncommitted entries, which includes both entries not written and entries
 * written but not yet acknowledged.
 *
 * Return: True if the block has any uncommitted entries.
 */
static inline bool __must_check is_block_dirty(const struct recovery_journal_block *block)
{
	return (block->uncommitted_entry_count > 0);
}

/**
 * is_block_empty() - Check whether a journal block is empty.
 * @block: The block to check.
 *
 * Return: True if the block has no entries.
 */
static inline bool __must_check is_block_empty(const struct recovery_journal_block *block)
{
	return (block->entry_count == 0);
}

/**
 * is_block_full() - Check whether a journal block is full.
 * @block: The block to check.
 *
 * Return: True if the block is full.
 */
static inline bool __must_check is_block_full(const struct recovery_journal_block *block)
{
	return ((block == NULL) || (block->journal->entries_per_block == block->entry_count));
}

/**
 * assert_on_journal_thread() - Assert that we are running on the journal thread.
 * @journal: The journal.
 * @function_name: The function doing the check (for logging).
 */
static void assert_on_journal_thread(struct recovery_journal *journal,
				     const char *function_name)
{
	VDO_ASSERT_LOG_ONLY((vdo_get_callback_thread_id() == journal->thread_id),
			    "%s() called on journal thread", function_name);
}

/**
 * continue_waiter() - Release a data_vio from the journal.
 * @waiter: The data_vio waiting on journal activity.
 * @context: The result of the journal operation.
 *
 * Invoked whenever a data_vio is to be released from the journal, either because its entry was
 * committed to disk, or because there was an error. Implements waiter_callback_fn.
 */
static void continue_waiter(struct vdo_waiter *waiter, void *context)
{
	continue_data_vio_with_error(vdo_waiter_as_data_vio(waiter), *((int *) context));
}

/**
 * has_block_waiters() - Check whether the journal has any waiters on any blocks.
 * @journal: The journal in question.
 *
 * Return: True if any block has a waiter.
 */
static inline bool has_block_waiters(struct recovery_journal *journal)
{
	struct recovery_journal_block *block = get_journal_block(&journal->active_tail_blocks);

	/*
	 * Either the first active tail block (if it exists) has waiters, or no active tail block
	 * has waiters.
	 */
	return ((block != NULL) &&
		(vdo_waitq_has_waiters(&block->entry_waiters) ||
		 vdo_waitq_has_waiters(&block->commit_waiters)));
}

static void recycle_journal_blocks(struct recovery_journal *journal);
static void recycle_journal_block(struct recovery_journal_block *block);
static void notify_commit_waiters(struct recovery_journal *journal);

/**
 * suspend_lock_counter() - Prevent the lock counter from notifying.
 * @counter: The counter.
 *
 * Return: True if the lock counter was not notifying and hence the suspend was efficacious.
 */
static bool suspend_lock_counter(struct lock_counter *counter)
{
	int prior_state;

	/*
	 * Extra barriers because this was originally developed using a CAS operation that
	 * implicitly had them.
	 */
	smp_mb__before_atomic();
	prior_state = atomic_cmpxchg(&counter->state, LOCK_COUNTER_STATE_NOT_NOTIFYING,
				     LOCK_COUNTER_STATE_SUSPENDED);
	/* same as before_atomic */
	smp_mb__after_atomic();

	return ((prior_state == LOCK_COUNTER_STATE_SUSPENDED) ||
		(prior_state == LOCK_COUNTER_STATE_NOT_NOTIFYING));
}

static inline bool is_read_only(struct recovery_journal *journal)
{
	return vdo_is_read_only(journal->flush_vio->completion.vdo);
}

/**
 * check_for_drain_complete() - Check whether the journal has drained.
 * @journal: The journal which may have just drained.
 */
static void check_for_drain_complete(struct recovery_journal *journal)
{
	int result = VDO_SUCCESS;

	if (is_read_only(journal)) {
		result = VDO_READ_ONLY;
		/*
		 * Clean up any full active blocks which were not written due to read-only mode.
		 *
		 * FIXME: This would probably be better as a short-circuit in write_block().
		 */
		notify_commit_waiters(journal);
		recycle_journal_blocks(journal);

		/* Release any data_vios waiting to be assigned entries. */
		vdo_waitq_notify_all_waiters(&journal->entry_waiters,
					     continue_waiter, &result);
	}

	if (!vdo_is_state_draining(&journal->state) ||
	    journal->reaping ||
	    has_block_waiters(journal) ||
	    vdo_waitq_has_waiters(&journal->entry_waiters) ||
	    !suspend_lock_counter(&journal->lock_counter))
		return;

	if (vdo_is_state_saving(&journal->state)) {
		if (journal->active_block != NULL) {
			VDO_ASSERT_LOG_ONLY(((result == VDO_READ_ONLY) ||
					     !is_block_dirty(journal->active_block)),
					    "journal being saved has clean active block");
			recycle_journal_block(journal->active_block);
		}

		VDO_ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
				    "all blocks in a journal being saved must be inactive");
	}

	vdo_finish_draining_with_result(&journal->state, result);
}

/**
 * notify_recovery_journal_of_read_only_mode() - Notify a recovery journal that the VDO has gone
 *                                               read-only.
 * @listener: The journal.
 * @parent: The completion to notify in order to acknowledge the notification.
 *
 * Implements vdo_read_only_notification_fn.
 */
static void notify_recovery_journal_of_read_only_mode(void *listener,
						      struct vdo_completion *parent)
{
	check_for_drain_complete(listener);
	vdo_finish_completion(parent);
}

/**
 * enter_journal_read_only_mode() - Put the journal in read-only mode.
 * @journal: The journal which has failed.
 * @error_code: The error result triggering this call.
 *
 * All attempts to add entries after this function is called will fail. All VIOs waiting for
 * commits will be awakened with an error.
 */
static void enter_journal_read_only_mode(struct recovery_journal *journal,
					 int error_code)
{
	vdo_enter_read_only_mode(journal->flush_vio->completion.vdo, error_code);
	check_for_drain_complete(journal);
}

/**
 * vdo_get_recovery_journal_current_sequence_number() - Obtain the recovery journal's current
 *                                                      sequence number.
 * @journal: The journal in question.
 *
 * Exposed only so the block map can be initialized therefrom.
 *
 * Return: The sequence number of the tail block.
 */
sequence_number_t vdo_get_recovery_journal_current_sequence_number(struct recovery_journal *journal)
{
	return journal->tail;
}

/**
 * get_recovery_journal_head() - Get the head of the recovery journal.
 * @journal: The journal.
 *
 * The head is the lowest sequence number of the block map head and the slab journal head.
 *
 * Return: The head of the journal.
 */
static inline sequence_number_t get_recovery_journal_head(const struct recovery_journal *journal)
{
	return min(journal->block_map_head, journal->slab_journal_head);
}

/**
 * compute_recovery_count_byte() - Compute the recovery count byte for a given recovery count.
 * @recovery_count: The recovery count.
 *
 * Return: The byte corresponding to the recovery count.
 */
static inline u8 __must_check compute_recovery_count_byte(u64 recovery_count)
{
	return (u8)(recovery_count & RECOVERY_COUNT_MASK);
}

/**
 * check_slab_journal_commit_threshold() - Check whether the journal is over the threshold, and if
 *                                         so, force the oldest slab journal tail block to commit.
 * @journal: The journal.
 */
static void check_slab_journal_commit_threshold(struct recovery_journal *journal)
{
	block_count_t current_length = journal->tail - journal->slab_journal_head;

	if (current_length > journal->slab_journal_commit_threshold) {
		journal->events.slab_journal_commits_requested++;
		vdo_commit_oldest_slab_journal_tail_blocks(journal->depot,
							   journal->slab_journal_head);
	}
}

static void reap_recovery_journal(struct recovery_journal *journal);
static void assign_entries(struct recovery_journal *journal);

/**
 * finish_reaping() - Finish reaping the journal.
 * @journal: The journal being reaped.
 */
static void finish_reaping(struct recovery_journal *journal)
{
	block_count_t blocks_reaped;
	sequence_number_t old_head = get_recovery_journal_head(journal);

	journal->block_map_head = journal->block_map_reap_head;
	journal->slab_journal_head = journal->slab_journal_reap_head;
	blocks_reaped = get_recovery_journal_head(journal) - old_head;
	journal->available_space += blocks_reaped * journal->entries_per_block;
	journal->reaping = false;
	check_slab_journal_commit_threshold(journal);
	assign_entries(journal);
	check_for_drain_complete(journal);
}

/**
 * complete_reaping() - Finish reaping the journal after flushing the lower layer.
 * @completion: The journal's flush VIO.
 *
 * This is the callback registered in reap_recovery_journal().
 */
static void complete_reaping(struct vdo_completion *completion)
{
	struct recovery_journal *journal = completion->parent;

	finish_reaping(journal);

	/* Try reaping again in case more locks were released while flush was out. */
	reap_recovery_journal(journal);
}

/**
 * handle_flush_error() - Handle an error when flushing the lower layer due to reaping.
 * @completion: The journal's flush VIO.
 */
static void handle_flush_error(struct vdo_completion *completion)
{
	struct recovery_journal *journal = completion->parent;

	vio_record_metadata_io_error(as_vio(completion));
	journal->reaping = false;
	enter_journal_read_only_mode(journal, completion->result);
}

static void flush_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct recovery_journal *journal = vio->completion.parent;

	continue_vio_after_io(vio, complete_reaping, journal->thread_id);
}

/**
 * initialize_journal_state() - Set all journal fields appropriately to start journaling from the
 *                              current active block.
 * @journal: The journal to be reset based on its active block.
 */
static void initialize_journal_state(struct recovery_journal *journal)
{
	journal->append_point.sequence_number = journal->tail;
	journal->last_write_acknowledged = journal->tail;
	journal->block_map_head = journal->tail;
	journal->slab_journal_head = journal->tail;
	journal->block_map_reap_head = journal->tail;
	journal->slab_journal_reap_head = journal->tail;
	journal->block_map_head_block_number =
		vdo_get_recovery_journal_block_number(journal, journal->block_map_head);
	journal->slab_journal_head_block_number =
		vdo_get_recovery_journal_block_number(journal,
						      journal->slab_journal_head);
	journal->available_space =
		(journal->entries_per_block * vdo_get_recovery_journal_length(journal->size));
}

/**
 * vdo_get_recovery_journal_length() - Get the number of usable recovery journal blocks.
 * @journal_size: The size of the recovery journal in blocks.
 *
 * Return: The number of recovery journal blocks usable for entries.
 */
block_count_t vdo_get_recovery_journal_length(block_count_t journal_size)
{
	block_count_t reserved_blocks = journal_size / 4;

	if (reserved_blocks > RECOVERY_JOURNAL_RESERVED_BLOCKS)
		reserved_blocks = RECOVERY_JOURNAL_RESERVED_BLOCKS;
	return (journal_size - reserved_blocks);
}

/**
 * reap_recovery_journal_callback() - Attempt to reap the journal.
 * @completion: The lock counter completion.
 *
 * Attempts to reap the journal now that all the locks on some journal block have been released.
 * This is the callback registered with the lock counter.
 */
static void reap_recovery_journal_callback(struct vdo_completion *completion)
{
	struct recovery_journal *journal = (struct recovery_journal *) completion->parent;
	/*
	 * The acknowledgment must be done before reaping so that there is no race between
	 * acknowledging the notification and unlocks wishing to notify.
	 */
	smp_wmb();
	atomic_set(&journal->lock_counter.state, LOCK_COUNTER_STATE_NOT_NOTIFYING);

	if (vdo_is_state_quiescing(&journal->state)) {
		/*
		 * Don't start reaping when the journal is trying to quiesce. Do check if this
		 * notification is the last thing the is waiting on.
		 */
		check_for_drain_complete(journal);
		return;
	}

	reap_recovery_journal(journal);
	check_slab_journal_commit_threshold(journal);
}

/**
 * initialize_lock_counter() - Initialize a lock counter.
 *
 * @journal: The recovery journal.
 * @vdo: The vdo.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int __must_check initialize_lock_counter(struct recovery_journal *journal,
						struct vdo *vdo)
{
	int result;
	struct thread_config *config = &vdo->thread_config;
	struct lock_counter *counter = &journal->lock_counter;

	result = vdo_allocate(journal->size, u16, __func__, &counter->journal_counters);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(journal->size, atomic_t, __func__,
			      &counter->journal_decrement_counts);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(journal->size * config->logical_zone_count, u16, __func__,
			      &counter->logical_counters);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(journal->size, atomic_t, __func__,
			      &counter->logical_zone_counts);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(journal->size * config->physical_zone_count, u16, __func__,
			      &counter->physical_counters);
	if (result != VDO_SUCCESS)
		return result;

	result = vdo_allocate(journal->size, atomic_t, __func__,
			      &counter->physical_zone_counts);
	if (result != VDO_SUCCESS)
		return result;

	vdo_initialize_completion(&counter->completion, vdo,
				  VDO_LOCK_COUNTER_COMPLETION);
	vdo_prepare_completion(&counter->completion, reap_recovery_journal_callback,
			       reap_recovery_journal_callback, config->journal_thread,
			       journal);
	counter->logical_zones = config->logical_zone_count;
	counter->physical_zones = config->physical_zone_count;
	counter->locks = journal->size;
	return VDO_SUCCESS;
}

/**
 * set_journal_tail() - Set the journal's tail sequence number.
 * @journal: The journal whose tail is to be set.
 * @tail: The new tail value.
 */
static void set_journal_tail(struct recovery_journal *journal, sequence_number_t tail)
{
	/* VDO does not support sequence numbers above 1 << 48 in the slab journal. */
	if (tail >= (1ULL << 48))
		enter_journal_read_only_mode(journal, VDO_JOURNAL_OVERFLOW);

	journal->tail = tail;
}

/**
 * initialize_recovery_block() - Initialize a journal block.
 * @vdo: The vdo from which to construct vios.
 * @journal: The journal to which the block will belong.
 * @block: The block to initialize.
 *
 * Return: VDO_SUCCESS or an error.
 */
static int initialize_recovery_block(struct vdo *vdo, struct recovery_journal *journal,
				     struct recovery_journal_block *block)
{
	char *data;
	int result;

	/*
	 * Ensure that a block is large enough to store RECOVERY_JOURNAL_ENTRIES_PER_BLOCK entries.
	 */
	BUILD_BUG_ON(RECOVERY_JOURNAL_ENTRIES_PER_BLOCK >
		     ((VDO_BLOCK_SIZE - sizeof(struct packed_journal_header)) /
		      sizeof(struct packed_recovery_journal_entry)));

	/*
	 * Allocate a full block for the journal block even though not all of the space is used
	 * since the VIO needs to write a full disk block.
	 */
	result = vdo_allocate(VDO_BLOCK_SIZE, char, __func__, &data);
	if (result != VDO_SUCCESS)
		return result;

	result = allocate_vio_components(vdo, VIO_TYPE_RECOVERY_JOURNAL,
					 VIO_PRIORITY_HIGH, block, 1, data, &block->vio);
	if (result != VDO_SUCCESS) {
		vdo_free(data);
		return result;
	}

	list_add_tail(&block->list_node, &journal->free_tail_blocks);
	block->journal = journal;
	return VDO_SUCCESS;
}

/**
 * vdo_decode_recovery_journal() - Make a recovery journal and initialize it with the state that
 *                                 was decoded from the super block.
 *
 * @state: The decoded state of the journal.
 * @nonce: The nonce of the VDO.
 * @vdo: The VDO.
 * @partition: The partition for the journal.
 * @recovery_count: The VDO's number of completed recoveries.
 * @journal_size: The number of blocks in the journal on disk.
 * @journal_ptr: The pointer to hold the new recovery journal.
 *
 * Return: A success or error code.
 */
int vdo_decode_recovery_journal(struct recovery_journal_state_7_0 state, nonce_t nonce,
				struct vdo *vdo, struct partition *partition,
				u64 recovery_count, block_count_t journal_size,
				struct recovery_journal **journal_ptr)
{
	block_count_t i;
	struct recovery_journal *journal;
	int result;

	result = vdo_allocate_extended(struct recovery_journal,
				       RECOVERY_JOURNAL_RESERVED_BLOCKS,
				       struct recovery_journal_block, __func__,
				       &journal);
	if (result != VDO_SUCCESS)
		return result;

	INIT_LIST_HEAD(&journal->free_tail_blocks);
	INIT_LIST_HEAD(&journal->active_tail_blocks);
	vdo_waitq_init(&journal->pending_writes);

	journal->thread_id = vdo->thread_config.journal_thread;
	journal->origin = partition->offset;
	journal->nonce = nonce;
	journal->recovery_count = compute_recovery_count_byte(recovery_count);
	journal->size = journal_size;
	journal->slab_journal_commit_threshold = (journal_size * 2) / 3;
	journal->logical_blocks_used = state.logical_blocks_used;
	journal->block_map_data_blocks = state.block_map_data_blocks;
	journal->entries_per_block = RECOVERY_JOURNAL_ENTRIES_PER_BLOCK;
	set_journal_tail(journal, state.journal_start);
	initialize_journal_state(journal);
	/* TODO: this will have to change if we make initial resume of a VDO a real resume */
	vdo_set_admin_state_code(&journal->state, VDO_ADMIN_STATE_SUSPENDED);

	for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) {
		struct recovery_journal_block *block = &journal->blocks[i];

		result = initialize_recovery_block(vdo, journal, block);
		if (result != VDO_SUCCESS) {
			vdo_free_recovery_journal(journal);
			return result;
		}
	}

	result = initialize_lock_counter(journal, vdo);
	if (result != VDO_SUCCESS) {
		vdo_free_recovery_journal(journal);
		return result;
	}

	result = create_metadata_vio(vdo, VIO_TYPE_RECOVERY_JOURNAL, VIO_PRIORITY_HIGH,
				     journal, NULL, &journal->flush_vio);
	if (result != VDO_SUCCESS) {
		vdo_free_recovery_journal(journal);
		return result;
	}

	result = vdo_register_read_only_listener(vdo, journal,
						 notify_recovery_journal_of_read_only_mode,
						 journal->thread_id);
	if (result != VDO_SUCCESS) {
		vdo_free_recovery_journal(journal);
		return result;
	}

	result = vdo_make_default_thread(vdo, journal->thread_id);
	if (result != VDO_SUCCESS) {
		vdo_free_recovery_journal(journal);
		return result;
	}

	journal->flush_vio->completion.callback_thread_id = journal->thread_id;
	*journal_ptr = journal;
	return VDO_SUCCESS;
}

/**
 * vdo_free_recovery_journal() - Free a recovery journal.
 * @journal: The recovery journal to free.
 */
void vdo_free_recovery_journal(struct recovery_journal *journal)
{
	block_count_t i;

	if (journal == NULL)
		return;

	vdo_free(vdo_forget(journal->lock_counter.logical_zone_counts));
	vdo_free(vdo_forget(journal->lock_counter.physical_zone_counts));
	vdo_free(vdo_forget(journal->lock_counter.journal_counters));
	vdo_free(vdo_forget(journal->lock_counter.journal_decrement_counts));
	vdo_free(vdo_forget(journal->lock_counter.logical_counters));
	vdo_free(vdo_forget(journal->lock_counter.physical_counters));
	free_vio(vdo_forget(journal->flush_vio));

	/*
	 * FIXME: eventually, the journal should be constructed in a quiescent state which
	 *        requires opening before use.
	 */
	if (!vdo_is_state_quiescent(&journal->state)) {
		VDO_ASSERT_LOG_ONLY(list_empty(&journal->active_tail_blocks),
				    "journal being freed has no active tail blocks");
	} else if (!vdo_is_state_saved(&journal->state) &&
		   !list_empty(&journal->active_tail_blocks)) {
		vdo_log_warning("journal being freed has uncommitted entries");
	}

	for (i = 0; i < RECOVERY_JOURNAL_RESERVED_BLOCKS; i++) {
		struct recovery_journal_block *block = &journal->blocks[i];

		vdo_free(vdo_forget(block->vio.data));
		free_vio_components(&block->vio);
	}

	vdo_free(journal);
}

/**
 * vdo_initialize_recovery_journal_post_repair() - Initialize the journal after a repair.
 * @journal: The journal in question.
 * @recovery_count: The number of completed recoveries.
 * @tail: The new tail block sequence number.
 * @logical_blocks_used: The new number of logical blocks used.
 * @block_map_data_blocks: The new number of block map data blocks.
 */
void vdo_initialize_recovery_journal_post_repair(struct recovery_journal *journal,
						 u64 recovery_count,
						 sequence_number_t tail,
						 block_count_t logical_blocks_used,
						 block_count_t block_map_data_blocks)
{
	set_journal_tail(journal, tail + 1);
	journal->recovery_count = compute_recovery_count_byte(recovery_count);
	initialize_journal_state(journal);
	journal->logical_blocks_used = logical_blocks_used;
	journal->block_map_data_blocks = block_map_data_blocks;
}

/**
 * vdo_get_journal_block_map_data_blocks_used() - Get the number of block map pages, allocated from
 *                                                data blocks, currently in use.
 * @journal: The journal in question.
 *
 * Return: The number of block map pages allocated from slabs.
 */
block_count_t vdo_get_journal_block_map_data_blocks_used(struct recovery_journal *journal)
{
	return journal->block_map_data_blocks;
}

/**
 * vdo_get_recovery_journal_thread_id() - Get the ID of a recovery journal's thread.
 * @journal: The journal to query.
 *
 * Return: The ID of the journal's thread.
 */
thread_id_t vdo_get_recovery_journal_thread_id(struct recovery_journal *journal)
{
	return journal->thread_id;
}

/**
 * vdo_open_recovery_journal() - Prepare the journal for new entries.
 * @journal: The journal in question.
 * @depot: The slab depot for this VDO.
 * @block_map: The block map for this VDO.
 */
void vdo_open_recovery_journal(struct recovery_journal *journal,
			       struct slab_depot *depot, struct block_map *block_map)
{
	journal->depot = depot;
	journal->block_map = block_map;
	WRITE_ONCE(journal->state.current_state, VDO_ADMIN_STATE_NORMAL_OPERATION);
}

/**
 * vdo_record_recovery_journal() - Record the state of a recovery journal for encoding in the super
 *                                 block.
 * @journal: the recovery journal.
 *
 * Return: the state of the journal.
 */
struct recovery_journal_state_7_0
vdo_record_recovery_journal(const struct recovery_journal *journal)
{
	struct recovery_journal_state_7_0 state = {
		.logical_blocks_used = journal->logical_blocks_used,
		.block_map_data_blocks = journal->block_map_data_blocks,
	};

	if (vdo_is_state_saved(&journal->state)) {
		/*
		 * If the journal is saved, we should start one past the active block (since the
		 * active block is not guaranteed to be empty).
		 */
		state.journal_start = journal->tail;
	} else {
		/*
		 * When we're merely suspended or have gone read-only, we must record the first
		 * block that might have entries that need to be applied.
		 */
		state.journal_start = get_recovery_journal_head(journal);
	}

	return state;
}

/**
 * get_block_header() - Get a pointer to the packed journal block header in the block buffer.
 * @block: The recovery block.
 *
 * Return: The block's header.
 */
static inline struct packed_journal_header *
get_block_header(const struct recovery_journal_block *block)
{
	return (struct packed_journal_header *) block->vio.data;
}

/**
 * set_active_sector() - Set the current sector of the current block and initialize it.
 * @block: The block to update.
 * @sector: A pointer to the first byte of the new sector.
 */
static void set_active_sector(struct recovery_journal_block *block, void *sector)
{
	block->sector = sector;
	block->sector->check_byte = get_block_header(block)->check_byte;
	block->sector->recovery_count = block->journal->recovery_count;
	block->sector->entry_count = 0;
}

/**
 * advance_tail() - Advance the tail of the journal.
 * @journal: The journal whose tail should be advanced.
 *
 * Return: true if the tail was advanced.
 */
static bool advance_tail(struct recovery_journal *journal)
{
	struct recovery_block_header unpacked;
	struct packed_journal_header *header;
	struct recovery_journal_block *block;

	block = journal->active_block = pop_free_list(journal);
	if (block == NULL)
		return false;

	list_move_tail(&block->list_node, &journal->active_tail_blocks);

	unpacked = (struct recovery_block_header) {
		.metadata_type = VDO_METADATA_RECOVERY_JOURNAL_2,
		.block_map_data_blocks = journal->block_map_data_blocks,
		.logical_blocks_used = journal->logical_blocks_used,
		.nonce = journal->nonce,
		.recovery_count = journal->recovery_count,
		.sequence_number = journal->tail,
		.check_byte = vdo_compute_recovery_journal_check_byte(journal,
								      journal->tail),
	};

	header = get_block_header(block);
	memset(block->vio.data, 0x0, VDO_BLOCK_SIZE);
	block->sequence_number = journal->tail;
	block->entry_count = 0;
	block->uncommitted_entry_count = 0;
	block->block_number = vdo_get_recovery_journal_block_number(journal,
								    journal->tail);

	vdo_pack_recovery_block_header(&unpacked, header);
	set_active_sector(block, vdo_get_journal_block_sector(header, 1));
	set_journal_tail(journal, journal->tail + 1);
	vdo_advance_block_map_era(journal->block_map, journal->tail);
	return true;
}

/**
 * initialize_lock_count() - Initialize the value of the journal zone's counter for a given lock.
 * @journal: The recovery journal.
 *
 * Context: This must be called from the journal zone.
 */
static void initialize_lock_count(struct recovery_journal *journal)
{
	u16 *journal_value;
	block_count_t lock_number = journal->active_block->block_number;
	atomic_t *decrement_counter = get_decrement_counter(journal, lock_number);

	journal_value = get_counter(journal, lock_number, VDO_ZONE_TYPE_JOURNAL, 0);
	VDO_ASSERT_LOG_ONLY((*journal_value == atomic_read(decrement_counter)),
			    "count to be initialized not in use");
	*journal_value = journal->entries_per_block + 1;
	atomic_set(decrement_counter, 0);
}

/**
 * prepare_to_assign_entry() - Prepare the currently active block to receive an entry and check
 *			       whether an entry of the given type may be assigned at this time.
 * @journal: The journal receiving an entry.
 *
 * Return: true if there is space in the journal to store an entry of the specified type.
 */
static bool prepare_to_assign_entry(struct recovery_journal *journal)
{
	if (journal->available_space == 0)
		return false;

	if (is_block_full(journal->active_block) && !advance_tail(journal))
		return false;

	if (!is_block_empty(journal->active_block))
		return true;

	if ((journal->tail - get_recovery_journal_head(journal)) > journal->size) {
		/* Cannot use this block since the journal is full. */
		journal->events.disk_full++;
		return false;
	}

	/*
	 * Don't allow the new block to be reaped until all of its entries have been committed to
	 * the block map and until the journal block has been fully committed as well. Because the
	 * block map update is done only after any slab journal entries have been made, the
	 * per-entry lock for the block map entry serves to protect those as well.
	 */
	initialize_lock_count(journal);
	return true;
}

static void write_blocks(struct recovery_journal *journal);

/**
 * schedule_block_write() - Queue a block for writing.
 * @journal: The journal in question.
 * @block: The block which is now ready to write.
 *
 * The block is expected to be full. If the block is currently writing, this is a noop as the block
 * will be queued for writing when the write finishes. The block must not currently be queued for
 * writing.
 */
static void schedule_block_write(struct recovery_journal *journal,
				 struct recovery_journal_block *block)
{
	if (!block->committing)
		vdo_waitq_enqueue_waiter(&journal->pending_writes, &block->write_waiter);
	/*
	 * At the end of adding entries, or discovering this partial block is now full and ready to
	 * rewrite, we will call write_blocks() and write a whole batch.
	 */
}

/**
 * release_journal_block_reference() - Release a reference to a journal block.
 * @block: The journal block from which to release a reference.
 */
static void release_journal_block_reference(struct recovery_journal_block *block)
{
	vdo_release_recovery_journal_block_reference(block->journal,
						     block->sequence_number,
						     VDO_ZONE_TYPE_JOURNAL, 0);
}

static void update_usages(struct recovery_journal *journal, struct data_vio *data_vio)
{
	if (data_vio->increment_updater.operation == VDO_JOURNAL_BLOCK_MAP_REMAPPING) {
		journal->block_map_data_blocks++;
		return;
	}

	if (data_vio->new_mapped.state != VDO_MAPPING_STATE_UNMAPPED)
		journal->logical_blocks_used++;

	if (data_vio->mapped.state != VDO_MAPPING_STATE_UNMAPPED)
		journal->logical_blocks_used--;
}

/**
 * assign_entry() - Assign an entry waiter to the active block.
 * @waiter: The data_vio.
 * @context: The recovery journal block.
 *
 * Implements waiter_callback_fn.
 */
static void assign_entry(struct vdo_waiter *waiter, void *context)
{
	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
	struct recovery_journal_block *block = context;
	struct recovery_journal *journal = block->journal;

	/* Record the point at which we will make the journal entry. */
	data_vio->recovery_journal_point = (struct journal_point) {
		.sequence_number = block->sequence_number,
		.entry_count = block->entry_count,
	};

	update_usages(journal, data_vio);
	journal->available_space--;

	if (!vdo_waitq_has_waiters(&block->entry_waiters))
		journal->events.blocks.started++;

	vdo_waitq_enqueue_waiter(&block->entry_waiters, &data_vio->waiter);
	block->entry_count++;
	block->uncommitted_entry_count++;
	journal->events.entries.started++;

	if (is_block_full(block)) {
		/*
		 * The block is full, so we can write it anytime henceforth. If it is already
		 * committing, we'll queue it for writing when it comes back.
		 */
		schedule_block_write(journal, block);
	}

	/* Force out slab journal tail blocks when threshold is reached. */
	check_slab_journal_commit_threshold(journal);
}

static void assign_entries(struct recovery_journal *journal)
{
	if (journal->adding_entries) {
		/* Protect against re-entrancy. */
		return;
	}

	journal->adding_entries = true;
	while (vdo_waitq_has_waiters(&journal->entry_waiters) &&
	       prepare_to_assign_entry(journal)) {
		vdo_waitq_notify_next_waiter(&journal->entry_waiters,
					     assign_entry, journal->active_block);
	}

	/* Now that we've finished with entries, see if we have a batch of blocks to write. */
	write_blocks(journal);
	journal->adding_entries = false;
}

/**
 * recycle_journal_block() - Prepare an in-memory journal block to be reused now that it has been
 *                           fully committed.
 * @block: The block to be recycled.
 */
static void recycle_journal_block(struct recovery_journal_block *block)
{
	struct recovery_journal *journal = block->journal;
	block_count_t i;

	list_move_tail(&block->list_node, &journal->free_tail_blocks);

	/* Release any unused entry locks. */
	for (i = block->entry_count; i < journal->entries_per_block; i++)
		release_journal_block_reference(block);

	/*
	 * Release our own lock against reaping now that the block is completely committed, or
	 * we're giving up because we're in read-only mode.
	 */
	if (block->entry_count > 0)
		release_journal_block_reference(block);

	if (block == journal->active_block)
		journal->active_block = NULL;
}

/**
 * continue_committed_waiter() - invoked whenever a VIO is to be released from the journal because
 *                               its entry was committed to disk.
 * @waiter: The data_vio waiting on a journal write.
 * @context: A pointer to the recovery journal.
 *
 * Implements waiter_callback_fn.
 */
static void continue_committed_waiter(struct vdo_waiter *waiter, void *context)
{
	struct data_vio *data_vio = vdo_waiter_as_data_vio(waiter);
	struct recovery_journal *journal = context;
	int result = (is_read_only(journal) ? VDO_READ_ONLY : VDO_SUCCESS);
	bool has_decrement;

	VDO_ASSERT_LOG_ONLY(vdo_before_journal_point(&journal->commit_point,
						     &data_vio->recovery_journal_point),
			    "DataVIOs released from recovery journal in order. Recovery journal point is (%llu, %u), but commit waiter point is (%llu, %u)",
			    (unsigned long long) journal->commit_point.sequence_number,
			    journal->commit_point.entry_count,
			    (unsigned long long) data_vio->recovery_journal_point.sequence_number,
			    data_vio->recovery_journal_point.entry_count);

	journal->commit_point = data_vio->recovery_journal_point;
	data_vio->last_async_operation = VIO_ASYNC_OP_UPDATE_REFERENCE_COUNTS;
	if (result != VDO_SUCCESS) {
		continue_data_vio_with_error(data_vio, result);
		return;
	}

	/*
	 * The increment must be launched first since it must come before the
	 * decrement if they are in the same slab.
	 */
	has_decrement = (data_vio->decrement_updater.zpbn.pbn != VDO_ZERO_BLOCK);
	if ((data_vio->increment_updater.zpbn.pbn != VDO_ZERO_BLOCK) || !has_decrement)
		continue_data_vio(data_vio);

	if (has_decrement)
		vdo_launch_completion(&data_vio->decrement_completion);
}

/**
 * notify_commit_waiters() - Notify any VIOs whose entries have now committed.
 * @journal: The recovery journal to update.
 */
static void notify_commit_waiters(struct recovery_journal *journal)
{
	struct recovery_journal_block *block;

	list_for_each_entry(block, &journal->active_tail_blocks, list_node) {
		if (block->committing)
			return;

		vdo_waitq_notify_all_waiters(&block->commit_waiters,
					     continue_committed_waiter, journal);
		if (is_read_only(journal)) {
			vdo_waitq_notify_all_waiters(&block->entry_waiters,
						     continue_committed_waiter,
						     journal);
		} else if (is_block_dirty(block) || !is_block_full(block)) {
			/* Stop at partially-committed or partially-filled blocks. */
			return;
		}
	}
}

/**
 * recycle_journal_blocks() - Recycle any journal blocks which have been fully committed.
 * @journal: The recovery journal to update.
 */
static void recycle_journal_blocks(struct recovery_journal *journal)
{
	struct recovery_journal_block *block, *tmp;

	list_for_each_entry_safe(block, tmp, &journal->active_tail_blocks, list_node) {
		if (block->committing) {
			/* Don't recycle committing blocks. */
			return;
		}

		if (!is_read_only(journal) &&
		    (is_block_dirty(block) || !is_block_full(block))) {
			/*
			 * Don't recycle partially written or partially full blocks, except in
			 * read-only mode.
			 */
			return;
		}

		recycle_journal_block(block);
	}
}

/**
 * complete_write() - Handle post-commit processing.
 * @completion: The completion of the VIO writing this block.
 *
 * This is the callback registered by write_block(). If more entries accumulated in the block being
 * committed while the commit was in progress, another commit will be initiated.
 */
static void complete_write(struct vdo_completion *completion)
{
	struct recovery_journal_block *block = completion->parent;
	struct recovery_journal *journal = block->journal;
	struct recovery_journal_block *last_active_block;

	assert_on_journal_thread(journal, __func__);

	journal->pending_write_count -= 1;
	journal->events.blocks.committed += 1;
	journal->events.entries.committed += block->entries_in_commit;
	block->uncommitted_entry_count -= block->entries_in_commit;
	block->entries_in_commit = 0;
	block->committing = false;

	/* If this block is the latest block to be acknowledged, record that fact. */
	if (block->sequence_number > journal->last_write_acknowledged)
		journal->last_write_acknowledged = block->sequence_number;

	last_active_block = get_journal_block(&journal->active_tail_blocks);
	VDO_ASSERT_LOG_ONLY((block->sequence_number >= last_active_block->sequence_number),
			    "completed journal write is still active");

	notify_commit_waiters(journal);

	/*
	 * Is this block now full? Reaping, and adding entries, might have already sent it off for
	 * rewriting; else, queue it for rewrite.
	 */
	if (is_block_dirty(block) && is_block_full(block))
		schedule_block_write(journal, block);

	recycle_journal_blocks(journal);
	write_blocks(journal);

	check_for_drain_complete(journal);
}

static void handle_write_error(struct vdo_completion *completion)
{
	struct recovery_journal_block *block = completion->parent;
	struct recovery_journal *journal = block->journal;

	vio_record_metadata_io_error(as_vio(completion));
	vdo_log_error_strerror(completion->result,
			       "cannot write recovery journal block %llu",
			       (unsigned long long) block->sequence_number);
	enter_journal_read_only_mode(journal, completion->result);
	complete_write(completion);
}

static void complete_write_endio(struct bio *bio)
{
	struct vio *vio = bio->bi_private;
	struct recovery_journal_block *block = vio->completion.parent;
	struct recovery_journal *journal = block->journal;

	continue_vio_after_io(vio, complete_write, journal->thread_id);
}

/**
 * add_queued_recovery_entries() - Actually add entries from the queue to the given block.
 * @block: The journal block.
 */
static void add_queued_recovery_entries(struct recovery_journal_block *block)
{
	while (vdo_waitq_has_waiters(&block->entry_waiters)) {
		struct data_vio *data_vio =
			vdo_waiter_as_data_vio(vdo_waitq_dequeue_waiter(&block->entry_waiters));
		struct tree_lock *lock = &data_vio->tree_lock;
		struct packed_recovery_journal_entry *packed_entry;
		struct recovery_journal_entry new_entry;

		if (block->sector->entry_count == RECOVERY_JOURNAL_ENTRIES_PER_SECTOR)
			set_active_sector(block,
					  (char *) block->sector + VDO_SECTOR_SIZE);

		/* Compose and encode the entry. */
		packed_entry = &block->sector->entries[block->sector->entry_count++];
		new_entry = (struct recovery_journal_entry) {
			.mapping = {
				.pbn = data_vio->increment_updater.zpbn.pbn,
				.state = data_vio->increment_updater.zpbn.state,
			},
			.unmapping = {
				.pbn = data_vio->decrement_updater.zpbn.pbn,
				.state = data_vio->decrement_updater.zpbn.state,
			},
			.operation = data_vio->increment_updater.operation,
			.slot = lock->tree_slots[lock->height].block_map_slot,
		};
		*packed_entry = vdo_pack_recovery_journal_entry(&new_entry);
		data_vio->recovery_sequence_number = block->sequence_number;

		/* Enqueue the data_vio to wait for its entry to commit. */
		vdo_waitq_enqueue_waiter(&block->commit_waiters, &data_vio->waiter);
	}
}

/**
 * write_block() - Issue a block for writing.
 * @waiter: The recovery journal block to write.
 * @context: Not used.
 *
 * Implements waiter_callback_fn.
 */
static void write_block(struct vdo_waiter *waiter, void __always_unused *context)
{
	struct recovery_journal_block *block =
		container_of(waiter, struct recovery_journal_block, write_waiter);
	struct recovery_journal *journal = block->journal;
	struct packed_journal_header *header = get_block_header(block);

	if (block->committing || !vdo_waitq_has_waiters(&block->entry_waiters) ||
	    is_read_only(journal))
		return;

	block->entries_in_commit = vdo_waitq_num_waiters(&block->entry_waiters);
	add_queued_recovery_entries(block);

	journal->pending_write_count += 1;
	journal->events.blocks.written += 1;
	journal->events.entries.written += block->entries_in_commit;

	header->block_map_head = __cpu_to_le64(journal->block_map_head);
	header->slab_journal_head = __cpu_to_le64(journal->slab_journal_head);
	header->entry_count = __cpu_to_le16(block->entry_count);

	block->committing = true;

	/*
	 * We must issue a flush and a FUA for every commit. The flush is necessary to ensure that
	 * the data being referenced is stable. The FUA is necessary to ensure that the journal
	 * block itself is stable before allowing overwrites of the lbn's previous data.
	 */
	vdo_submit_metadata_vio(&block->vio, journal->origin + block->block_number,
				complete_write_endio, handle_write_error,
				REQ_OP_WRITE | REQ_PRIO | REQ_PREFLUSH | REQ_SYNC | REQ_FUA);
}


/**
 * write_blocks() - Attempt to commit blocks, according to write policy.
 * @journal: The recovery journal.
 */
static void write_blocks(struct recovery_journal *journal)
{
	assert_on_journal_thread(journal, __func__);
	/*
	 * We call this function after adding entries to the journal and after finishing a block
	 * write. Thus, when this function terminates we must either have no VIOs waiting in the
	 * journal or have some outstanding IO to provide a future wakeup.
	 *
	 * We want to only issue full blocks if there are no pending writes. However, if there are
	 * no outstanding writes and some unwritten entries, we must issue a block, even if it's
	 * the active block and it isn't full.
	 */
	if (journal->pending_write_count > 0)
		return;

	/* Write all the full blocks. */
	vdo_waitq_notify_all_waiters(&journal->pending_writes, write_block, NULL);

	/*
	 * Do we need to write the active block? Only if we have no outstanding writes, even after
	 * issuing all of the full writes.
	 */
	if ((journal->pending_write_count == 0) && (journal->active_block != NULL))
		write_block(&journal->active_block->write_waiter, NULL);
}

/**
 * vdo_add_recovery_journal_entry() - Add an entry to a recovery journal.
 * @journal: The journal in which to make an entry.
 * @data_vio: The data_vio for which to add the entry. The entry will be taken
 *	      from the logical and new_mapped fields of the data_vio. The
 *	      data_vio's recovery_sequence_number field will be set to the
 *	      sequence number of the journal block in which the entry was
 *	      made.
 *
 * This method is asynchronous. The data_vio will not be called back until the entry is committed
 * to the on-disk journal.
 */
void vdo_add_recovery_journal_entry(struct recovery_journal *journal,
				    struct data_vio *data_vio)
{
	assert_on_journal_thread(journal, __func__);
	if (!vdo_is_state_normal(&journal->state)) {
		continue_data_vio_with_error(data_vio, VDO_INVALID_ADMIN_STATE);
		return;
	}

	if (is_read_only(journal)) {
		continue_data_vio_with_error(data_vio, VDO_READ_ONLY);
		return;
	}

	VDO_ASSERT_LOG_ONLY(data_vio->recovery_sequence_number == 0,
			    "journal lock not held for new entry");

	vdo_advance_journal_point(&journal->append_point, journal->entries_per_block);
	vdo_waitq_enqueue_waiter(&journal->entry_waiters, &data_vio->waiter);
	assign_entries(journal);
}

/**
 * is_lock_locked() - Check whether a lock is locked for a zone type.
 * @journal: The recovery journal.
 * @lock_number: The lock to check.
 * @zone_type: The type of the zone.
 *
 * If the recovery journal has a lock on the lock number, both logical and physical zones are
 * considered locked.
 *
 * Return: true if the specified lock has references (is locked).
 */
static bool is_lock_locked(struct recovery_journal *journal, block_count_t lock_number,
			   enum vdo_zone_type zone_type)
{
	atomic_t *zone_count;
	bool locked;

	if (is_journal_zone_locked(journal, lock_number))
		return true;

	zone_count = get_zone_count_ptr(journal, lock_number, zone_type);
	locked = (atomic_read(zone_count) != 0);
	/* Pairs with implicit barrier in vdo_release_recovery_journal_block_reference() */
	smp_rmb();
	return locked;
}

/**
 * reap_recovery_journal() - Conduct a sweep on a recovery journal to reclaim unreferenced blocks.
 * @journal: The recovery journal.
 */
static void reap_recovery_journal(struct recovery_journal *journal)
{
	if (journal->reaping) {
		/*
		 * We already have an outstanding reap in progress. We need to wait for it to
		 * finish.
		 */
		return;
	}

	if (vdo_is_state_quiescent(&journal->state)) {
		/* We are supposed to not do IO. Don't botch it by reaping. */
		return;
	}

	/*
	 * Start reclaiming blocks only when the journal head has no references. Then stop when a
	 * block is referenced.
	 */
	while ((journal->block_map_reap_head < journal->last_write_acknowledged) &&
		!is_lock_locked(journal, journal->block_map_head_block_number,
				VDO_ZONE_TYPE_LOGICAL)) {
		journal->block_map_reap_head++;
		if (++journal->block_map_head_block_number == journal->size)
			journal->block_map_head_block_number = 0;
	}

	while ((journal->slab_journal_reap_head < journal->last_write_acknowledged) &&
		!is_lock_locked(journal, journal->slab_journal_head_block_number,
				VDO_ZONE_TYPE_PHYSICAL)) {
		journal->slab_journal_reap_head++;
		if (++journal->slab_journal_head_block_number == journal->size)
			journal->slab_journal_head_block_number = 0;
	}

	if ((journal->block_map_reap_head == journal->block_map_head) &&
	    (journal->slab_journal_reap_head == journal->slab_journal_head)) {
		/* Nothing happened. */
		return;
	}

	/*
	 * If the block map head will advance, we must flush any block map page modified by the
	 * entries we are reaping. If the slab journal head will advance, we must flush the slab
	 * summary update covering the slab journal that just released some lock.
	 */
	journal->reaping = true;
	vdo_submit_flush_vio(journal->flush_vio, flush_endio, handle_flush_error);
}

/**
 * vdo_acquire_recovery_journal_block_reference() - Acquire a reference to a recovery journal block
 *                                                  from somewhere other than the journal itself.
 * @journal: The recovery journal.
 * @sequence_number: The journal sequence number of the referenced block.
 * @zone_type: The type of the zone making the adjustment.
 * @zone_id: The ID of the zone making the adjustment.
 */
void vdo_acquire_recovery_journal_block_reference(struct recovery_journal *journal,
						  sequence_number_t sequence_number,
						  enum vdo_zone_type zone_type,
						  zone_count_t zone_id)
{
	block_count_t lock_number;
	u16 *current_value;

	if (sequence_number == 0)
		return;

	VDO_ASSERT_LOG_ONLY((zone_type != VDO_ZONE_TYPE_JOURNAL),
			    "invalid lock count increment from journal zone");

	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
	current_value = get_counter(journal, lock_number, zone_type, zone_id);
	VDO_ASSERT_LOG_ONLY(*current_value < U16_MAX,
			    "increment of lock counter must not overflow");

	if (*current_value == 0) {
		/*
		 * This zone is acquiring this lock for the first time. Extra barriers because this
		 * was original developed using an atomic add operation that implicitly had them.
		 */
		smp_mb__before_atomic();
		atomic_inc(get_zone_count_ptr(journal, lock_number, zone_type));
		/* same as before_atomic */
		smp_mb__after_atomic();
	}

	*current_value += 1;
}

/**
 * vdo_release_journal_entry_lock() - Release a single per-entry reference count for a recovery
 *                                    journal block.
 * @journal: The recovery journal.
 * @sequence_number: The journal sequence number of the referenced block.
 */
void vdo_release_journal_entry_lock(struct recovery_journal *journal,
				    sequence_number_t sequence_number)
{
	block_count_t lock_number;

	if (sequence_number == 0)
		return;

	lock_number = vdo_get_recovery_journal_block_number(journal, sequence_number);
	/*
	 * Extra barriers because this was originally developed using an atomic add operation that
	 * implicitly had them.
	 */
	smp_mb__before_atomic();
	atomic_inc(get_decrement_counter(journal, lock_number));
	/* same as before_atomic */
	smp_mb__after_atomic();
}

/** Implements vdo_admin_initiator_fn. */
static void initiate_drain(struct admin_state *state)
{
	check_for_drain_complete(container_of(state, struct recovery_journal, state));
}

/**
 * vdo_drain_recovery_journal() - Drain recovery journal I/O.
 * @journal: The journal to drain.
 * @operation: The drain operation (suspend or save).
 * @parent: The completion to notify once the journal is drained.
 *
 * All uncommitted entries will be written out.
 */
void vdo_drain_recovery_journal(struct recovery_journal *journal,
				const struct admin_state_code *operation,
				struct vdo_completion *parent)
{
	assert_on_journal_thread(journal, __func__);
	vdo_start_draining(&journal->state, operation, parent, initiate_drain);
}

/**
 * resume_lock_counter() - Re-allow notifications from a suspended lock counter.
 * @counter: The counter.
 *
 * Return: true if the lock counter was suspended.
 */
static bool resume_lock_counter(struct lock_counter *counter)
{
	int prior_state;

	/*
	 * Extra barriers because this was original developed using a CAS operation that implicitly
	 * had them.
	 */
	smp_mb__before_atomic();
	prior_state = atomic_cmpxchg(&counter->state, LOCK_COUNTER_STATE_SUSPENDED,
				     LOCK_COUNTER_STATE_NOT_NOTIFYING);
	/* same as before_atomic */
	smp_mb__after_atomic();

	return (prior_state == LOCK_COUNTER_STATE_SUSPENDED);
}

/**
 * vdo_resume_recovery_journal() - Resume a recovery journal which has been drained.
 * @journal: The journal to resume.
 * @parent: The completion to finish once the journal is resumed.
 */
void vdo_resume_recovery_journal(struct recovery_journal *journal,
				 struct vdo_completion *parent)
{
	bool saved;

	assert_on_journal_thread(journal, __func__);
	saved = vdo_is_state_saved(&journal->state);
	vdo_set_completion_result(parent, vdo_resume_if_quiescent(&journal->state));
	if (is_read_only(journal)) {
		vdo_continue_completion(parent, VDO_READ_ONLY);
		return;
	}

	if (saved)
		initialize_journal_state(journal);

	if (resume_lock_counter(&journal->lock_counter)) {
		/* We might have missed a notification. */
		reap_recovery_journal(journal);
	}

	vdo_launch_completion(parent);
}

/**
 * vdo_get_recovery_journal_logical_blocks_used() - Get the number of logical blocks in use by the
 *                                                  VDO.
 * @journal: The journal.
 *
 * Return: The number of logical blocks in use by the VDO.
 */
block_count_t vdo_get_recovery_journal_logical_blocks_used(const struct recovery_journal *journal)
{
	return journal->logical_blocks_used;
}

/**
 * vdo_get_recovery_journal_statistics() - Get the current statistics from the recovery journal.
 * @journal: The recovery journal to query.
 *
 * Return: A copy of the current statistics for the journal.
 */
struct recovery_journal_statistics
vdo_get_recovery_journal_statistics(const struct recovery_journal *journal)
{
	return journal->events;
}

/**
 * dump_recovery_block() - Dump the contents of the recovery block to the log.
 * @block: The block to dump.
 */
static void dump_recovery_block(const struct recovery_journal_block *block)
{
	vdo_log_info("    sequence number %llu; entries %u; %s; %zu entry waiters; %zu commit waiters",
		     (unsigned long long) block->sequence_number, block->entry_count,
		     (block->committing ? "committing" : "waiting"),
		     vdo_waitq_num_waiters(&block->entry_waiters),
		     vdo_waitq_num_waiters(&block->commit_waiters));
}

/**
 * vdo_dump_recovery_journal_statistics() - Dump some current statistics and other debug info from
 *                                          the recovery journal.
 * @journal: The recovery journal to dump.
 */
void vdo_dump_recovery_journal_statistics(const struct recovery_journal *journal)
{
	const struct recovery_journal_block *block;
	struct recovery_journal_statistics stats = vdo_get_recovery_journal_statistics(journal);

	vdo_log_info("Recovery Journal");
	vdo_log_info("	block_map_head=%llu slab_journal_head=%llu last_write_acknowledged=%llu tail=%llu block_map_reap_head=%llu slab_journal_reap_head=%llu disk_full=%llu slab_journal_commits_requested=%llu entry_waiters=%zu",
		     (unsigned long long) journal->block_map_head,
		     (unsigned long long) journal->slab_journal_head,
		     (unsigned long long) journal->last_write_acknowledged,
		     (unsigned long long) journal->tail,
		     (unsigned long long) journal->block_map_reap_head,
		     (unsigned long long) journal->slab_journal_reap_head,
		     (unsigned long long) stats.disk_full,
		     (unsigned long long) stats.slab_journal_commits_requested,
		     vdo_waitq_num_waiters(&journal->entry_waiters));
	vdo_log_info("	entries: started=%llu written=%llu committed=%llu",
		     (unsigned long long) stats.entries.started,
		     (unsigned long long) stats.entries.written,
		     (unsigned long long) stats.entries.committed);
	vdo_log_info("	blocks: started=%llu written=%llu committed=%llu",
		     (unsigned long long) stats.blocks.started,
		     (unsigned long long) stats.blocks.written,
		     (unsigned long long) stats.blocks.committed);

	vdo_log_info("	active blocks:");
	list_for_each_entry(block, &journal->active_tail_blocks, list_node)
		dump_recovery_block(block);
}