// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"

/*
 * Note: the zoned allocator does not support a rtextsize > 1, so this code and
 * the allocator itself use file system blocks interchangeably with realtime
 * extents without doing the otherwise required conversions.
 */

/*
 * Per-task space reservation.
 *
 * Tasks that need to wait for GC to free up space allocate one of these
 * on-stack and add it to the per-mount zi_reclaim_reservations list.
 * The GC thread will then wake the tasks in order when space becomes available.
 */
struct xfs_zone_reservation {
	struct list_head	entry;
	struct task_struct	*task;
	xfs_filblks_t		count_fsb;
};

/*
 * Calculate the number of reserved blocks.
 *
 * XC_FREE_RTEXTENTS counts the user-available capacity, i.e. the space up to
 * which the file system can be filled, while XC_FREE_RTAVAILABLE counts the
 * blocks instantly available for writes without waiting for GC.
 *
 * For XC_FREE_RTAVAILABLE, only the smaller reservation required for GC and
 * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
 * is further restricted by at least one additional zone as well as the
 * optional persistently reserved blocks.  This allows the allocator to run
 * more smoothly by not always triggering GC.
 */
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		return (uint64_t)XFS_RESERVED_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		return (uint64_t)XFS_GC_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks;
	default:
		ASSERT(0);
		return 0;
	}
}

void
xfs_zoned_resv_wake_all(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

void
xfs_zoned_add_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;
	}
	spin_unlock(&zi->zi_reservation_lock);
}

static int
xfs_zoned_space_wait_error(
	struct xfs_mount	*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

static int
xfs_zoned_reserve_available(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags)
{
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int				error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool, also bypass the
	 * wait list.  This relies on the fact that we have a very generously
	 * sized reserved pool that always has enough space.  If the reserved
	 * allocations fail we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * Make sure to start GC if it is not running already.  As we
		 * check the rtavailable count when filling up zones, GC is
		 * normally already running at this point, but in some setups
		 * with very few zones we may completely run out of
		 * non-reserved blocks in between filling zones.
		 */
		if (!xfs_is_zonegc_running(mp))
			wake_up_process(zi->zi_gc_thread);

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request, give up as we're fully out
		 * of space.
		 */
		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
		    !xfs_is_zonegc_running(mp))
			break;

		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}

/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_inode	*ip,
	xfs_filblks_t		*count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	s64			len = *count_fsb;
	int			error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}

int
xfs_zoned_space_reserve(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx	*ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(ip, count_fsb, flags);
	if (error) {
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}

void
xfs_zoned_space_unreserve(
	struct xfs_inode	*ip,
	struct xfs_zone_alloc_ctx	*ac)
{
	if (ac->reserved_blocks > 0) {
		struct xfs_mount	*mp = ip->i_mount;

		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}
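
/*
 * Illustrative only, not part of the file above: a minimal sketch of how a
 * caller might pair xfs_zoned_space_reserve() with xfs_zoned_space_unreserve()
 * around a write.  The helper name, its parameters, and the flag choices are
 * made up for the example; real write paths pick flags based on their own
 * context.
 */
static int __maybe_unused
xfs_zoned_space_reserve_example(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	bool			nowait)
{
	struct xfs_zone_alloc_ctx ac = { };	/* must start out zeroed */
	unsigned int		flags = XFS_ZR_GREEDY;
	int			error;

	if (nowait)
		flags |= XFS_ZR_NOWAIT;	/* -EAGAIN instead of waiting on GC */

	/*
	 * Takes blocks from XC_FREE_RTEXTENTS and XC_FREE_RTAVAILABLE.  With
	 * XFS_ZR_GREEDY the reservation may be shortened on ENOSPC, so check
	 * ac.reserved_blocks rather than assuming count_fsb was granted.
	 */
	error = xfs_zoned_space_reserve(ip, count_fsb, flags, &ac);
	if (error)
		return error;

	/*
	 * ... map extents / perform the write here, consuming up to
	 * ac.reserved_blocks ...
	 */

	/* Return the unused reservation and drop the open zone reference. */
	xfs_zoned_space_unreserve(ip, &ac);
	return 0;
}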