// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "%s() " fmt "\n", __func__

#include <linux/generic-radix-tree.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/vmalloc.h>

#include "rcu_pending.h"
#include "darray.h"
#include "util.h"

#define static_array_for_each(_a, _i)			\
	for (typeof(&(_a)[0]) _i = _a;			\
	     _i < (_a) + ARRAY_SIZE(_a);		\
	     _i++)

enum rcu_pending_special {
	RCU_PENDING_KVFREE	= 1,
	RCU_PENDING_CALL_RCU	= 2,
};

#define RCU_PENDING_KVFREE_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)

#ifdef __KERNEL__
typedef unsigned long			rcu_gp_poll_state_t;

static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
{
	return l == r;
}
#else
typedef struct urcu_gp_poll_state	rcu_gp_poll_state_t;

static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
{
	return l.grace_period_id == r.grace_period_id;
}
#endif

static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? get_state_synchronize_srcu(ssp)
		: get_state_synchronize_rcu();
}

static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? start_poll_synchronize_srcu(ssp)
		: start_poll_synchronize_rcu();
}

static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
{
	return ssp
		? poll_state_synchronize_srcu(ssp, cookie)
		: poll_state_synchronize_rcu(cookie);
}

static inline void __rcu_barrier(struct srcu_struct *ssp)
{
	return ssp
		? srcu_barrier(ssp)
		: rcu_barrier();
}

static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			      rcu_callback_t func)
{
	if (ssp)
		call_srcu(ssp, rhp, func);
	else
		call_rcu(rhp, func);
}

struct rcu_pending_seq {
	/*
	 * We're using a radix tree like a vector - we're just pushing elements
	 * onto the end; we're using a radix tree instead of an actual vector to
	 * avoid reallocation overhead
	 */
	GENRADIX(struct rcu_head *)	objs;
	size_t				nr;
	struct rcu_head			**cursor;
	rcu_gp_poll_state_t		seq;
};

struct rcu_pending_list {
	struct rcu_head		*head;
	struct rcu_head		*tail;
	rcu_gp_poll_state_t	seq;
};

struct rcu_pending_pcpu {
	struct rcu_pending	*parent;
	spinlock_t		lock;
	int			cpu;

	/*
	 * We can't bound the number of unprocessed gp sequence numbers, and we
	 * can't efficiently merge radix trees for expired grace periods, so we
	 * need darray/vector:
	 */
	DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;

	/* Third entry is for expired objects: */
	struct rcu_pending_list	lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];

	struct rcu_head		cb;
	bool			cb_armed;
	struct work_struct	work;
};

static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
{
	if (p->objs.nr)
		return true;

	static_array_for_each(p->lists, i)
		if (i->head)
			return true;

	return false;
}

static void rcu_pending_list_merge(struct rcu_pending_list *l1,
				   struct rcu_pending_list *l2)
{
#ifdef __KERNEL__
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next = l2->head;
#else
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next.next = (void *) l2->head;
#endif

	l1->tail = l2->tail;
	l2->head = l2->tail = NULL;
}

static void rcu_pending_list_add(struct rcu_pending_list *l,
				 struct rcu_head *n)
{
#ifdef __KERNEL__
	if (!l->head)
		l->head = n;
	else
		l->tail->next = n;
	l->tail = n;
	n->next = NULL;
#else
	if (!l->head)
		l->head = n;
	else
		l->tail->next.next = (void *) n;
	l->tail = n;
	n->next.next = NULL;
#endif
}

static void merge_expired_lists(struct rcu_pending_pcpu *p)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];

	for (struct rcu_pending_list *i = p->lists; i < expired; i++)
		if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
			rcu_pending_list_merge(expired, i);
}

#ifndef __KERNEL__
/* userspace shim: free each pointer in the array */
static inline void kfree_bulk(size_t nr, void **p)
{
	while (nr--)
		kfree(*p++);
}
#endif

static noinline void __process_finished_items(struct rcu_pending *pending,
					      struct rcu_pending_pcpu *p,
					      unsigned long flags)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
	struct rcu_pending_seq objs = {};
	struct rcu_head *list = NULL;

	if (p->objs.nr &&
	    __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
		objs = p->objs.data[0];
		darray_remove_item(&p->objs, p->objs.data);
	}

	merge_expired_lists(p);

	list = expired->head;
	expired->head = expired->tail = NULL;

	spin_unlock_irqrestore(&p->lock, flags);

	switch ((ulong) pending->process) {
	case RCU_PENDING_KVFREE:
		for (size_t i = 0; i < objs.nr; ) {
			size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);

			kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
			i += nr_this_node;
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif

			/*
			 * low bit of pointer indicates whether rcu_head needs
			 * to be freed - kvfree_rcu_mightsleep()
			 */
			BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);

			void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
			bool free_head = ((unsigned long) obj->func) & 1UL;

			kvfree(ptr);
			if (free_head)
				kfree(obj);
		}

		break;

	case RCU_PENDING_CALL_RCU:
		for (size_t i = 0; i < objs.nr; i++) {
			struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
			obj->func(obj);
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			obj->func(obj);
		}
		break;

	default:
		for (size_t i = 0; i < objs.nr; i++)
			pending->process(pending, *genradix_ptr(&objs.objs, i));
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			pending->process(pending, obj);
		}
		break;
	}
}

static bool process_finished_items(struct rcu_pending *pending,
				   struct rcu_pending_pcpu *p,
				   unsigned long flags)
{
	/*
	 * XXX: we should grab the gp seq once and avoid multiple function
	 * calls, this is called from __rcu_pending_enqueue() fastpath in
	 * may_sleep==true mode
	 */
	if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
	    (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
	    (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
	    p->lists[2].head) {
		__process_finished_items(pending, p, flags);
		return true;
	}

	return false;
}

static void rcu_pending_work(struct work_struct *work)
{
	struct rcu_pending_pcpu *p =
		container_of(work, struct rcu_pending_pcpu, work);
	struct rcu_pending *pending = p->parent;
	unsigned long flags;

	do {
		spin_lock_irqsave(&p->lock, flags);
	} while (process_finished_items(pending, p, flags));

	spin_unlock_irqrestore(&p->lock, flags);
}

static void rcu_pending_rcu_cb(struct rcu_head *rcu)
{
	struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);

	schedule_work_on(p->cpu, &p->work);

	unsigned long flags;
	spin_lock_irqsave(&p->lock, flags);
	if (__rcu_pending_has_pending(p)) {
		spin_unlock_irqrestore(&p->lock, flags);
		__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
	} else {
		p->cb_armed = false;
		spin_unlock_irqrestore(&p->lock, flags);
	}
}

static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
{
	darray_for_each_reverse(p->objs, objs)
		if (rcu_gp_poll_cookie_eq(objs->seq, seq))
			return objs;

	if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
		return NULL;

	return &darray_last(p->objs);
}

static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
			 struct rcu_head *head, void *ptr,
			 unsigned long *flags)
{
	if (ptr) {
		if (!head) {
			/*
			 * kvfree_rcu_mightsleep(): we weren't passed an
			 * rcu_head, but we need one: use the low bit of the
			 * pointer to free to flag that the head needs to be
			 * freed as well:
			 */
			ptr = (void *)(((unsigned long) ptr)|1UL);
			head = kmalloc(sizeof(*head), __GFP_NOWARN);
			if (!head) {
				spin_unlock_irqrestore(&p->lock, *flags);
				head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
				/*
				 * dropped lock, did GFP_KERNEL allocation,
				 * check for gp expiration
				 */
				if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
					kvfree(--ptr);
					kfree(head);
					spin_lock_irqsave(&p->lock, *flags);
					return false;
				}
			}
		}

		head->func = ptr;
	}
again:
	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
			rcu_pending_list_add(i, head);
			return false;
		}
	}

	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (!i->head) {
			i->seq = seq;
			rcu_pending_list_add(i, head);
			return true;
		}
	}

	merge_expired_lists(p);
	goto again;
}

/*
 * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
 * pending->process) once a grace period elapses.
 *
 * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
 * back to a linked list.
 *
 * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
 *   process callback
 *
 * - If @ptr and @head are both not NULL, we're kvfree_rcu()
 *
 * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
 *
 * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
 *   expired items.
 */
static __always_inline void
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
		      void *ptr, bool may_sleep)
{
	struct rcu_pending_pcpu *p;
	struct rcu_pending_seq *objs;
	struct genradix_node *new_node = NULL;
	unsigned long flags;
	bool start_gp = false;

	BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));

	/* We could technically be scheduled before taking the lock and end up
	 * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock
	 * anyways
	 *
	 * And we have to do it this way to avoid breaking PREEMPT_RT, which
	 * redefines how spinlocks work:
	 */
	p = raw_cpu_ptr(pending->p);
	spin_lock_irqsave(&p->lock, flags);
	rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
restart:
	if (may_sleep &&
	    unlikely(process_finished_items(pending, p, flags)))
		goto check_expired;

	/*
	 * In kvfree_rcu() mode, the radix tree is only for slab pointers so
	 * that we can do kfree_bulk() - vmalloc pointers always use the linked
	 * list:
	 */
	if (ptr && unlikely(is_vmalloc_addr(ptr)))
		goto list_add;

	objs = get_object_radix(p, seq);
	if (unlikely(!objs))
		goto list_add;

	if (unlikely(!objs->cursor)) {
		/*
		 * New radix tree nodes must be added under @p->lock because the
		 * tree root is in a darray that can be resized (typically,
		 * genradix supports concurrent unlocked allocation of new
		 * nodes) - hence preallocation and the retry loop:
		 */
		objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
						objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
		if (unlikely(!objs->cursor)) {
			if (may_sleep) {
				spin_unlock_irqrestore(&p->lock, flags);

				gfp_t gfp = GFP_KERNEL;
				if (!head)
					gfp |= __GFP_NOFAIL;

				new_node = genradix_alloc_node(gfp);
				if (!new_node)
					may_sleep = false;
				goto check_expired;
			}
list_add:
			start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
			goto start_gp;
		}
	}

	*objs->cursor++ = ptr ?: head;
	/* zero cursor if we hit the end of a radix tree node: */
	if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
		objs->cursor = NULL;
	start_gp = !objs->nr;
	objs->nr++;
start_gp:
	if (unlikely(start_gp)) {
		/*
		 * We only have one callback (ideally, we would have one for
		 * every outstanding grace period) - so if our callback is
		 * already in flight, we may still have to start a grace period
		 * (since we used get_state() above, not start_poll())
		 */
		if (!p->cb_armed) {
			p->cb_armed = true;
			__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
		} else {
			__start_poll_synchronize_rcu(pending->srcu);
		}
	}
	spin_unlock_irqrestore(&p->lock, flags);
free_node:
	if (new_node)
		genradix_free_node(new_node);
	return;
check_expired:
	if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
		switch ((ulong) pending->process) {
		case RCU_PENDING_KVFREE:
			kvfree(ptr);
			break;
		case RCU_PENDING_CALL_RCU:
			head->func(head);
			break;
		default:
			pending->process(pending, head);
			break;
		}
		goto free_node;
	}

	p = raw_cpu_ptr(pending->p);
	spin_lock_irqsave(&p->lock, flags);
	goto restart;
}

void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
	__rcu_pending_enqueue(pending, obj, NULL, true);
}

static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
{
	struct rcu_head *ret = NULL;

	spin_lock_irq(&p->lock);
	darray_for_each(p->objs, objs)
		if (objs->nr) {
			ret = *genradix_ptr(&objs->objs, --objs->nr);
			objs->cursor = NULL;
			if (!objs->nr)
				genradix_free(&objs->objs);
			goto out;
		}

	static_array_for_each(p->lists, i)
		if (i->head) {
			ret = i->head;
#ifdef __KERNEL__
			i->head = ret->next;
#else
			i->head = (void *) ret->next.next;
#endif
			if (!i->head)
				i->tail = NULL;
			goto out;
		}
out:
	spin_unlock_irq(&p->lock);

	return ret;
}

struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
{
	return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
}

struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
{
	struct rcu_head *ret = rcu_pending_dequeue(pending);

	if (ret)
		return ret;

	int cpu;
	for_each_possible_cpu(cpu) {
		ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
		if (ret)
			break;
	}
	return ret;
}

static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
{
	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		spin_lock_irq(&p->lock);
		if (__rcu_pending_has_pending(p) || p->cb_armed) {
			spin_unlock_irq(&p->lock);
			return true;
		}
		spin_unlock_irq(&p->lock);
	}

	return false;
}

void rcu_pending_exit(struct rcu_pending *pending)
{
	int cpu;

	if (!pending->p)
		return;

	while (rcu_pending_has_pending_or_armed(pending)) {
		__rcu_barrier(pending->srcu);

		for_each_possible_cpu(cpu) {
			struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
			flush_work(&p->work);
		}
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		flush_work(&p->work);
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);

		static_array_for_each(p->lists, i)
			WARN_ON(i->head);
		WARN_ON(p->objs.nr);
		darray_exit(&p->objs);
	}
	free_percpu(pending->p);
}

/**
 * rcu_pending_init: - initialize a rcu_pending
 *
 * @pending:	Object to init
 * @srcu:	May optionally be used with an srcu_struct; if NULL, uses normal
 *		RCU flavor
 * @process:	Callback function invoked on objects once a grace period has
 *		elapsed; if NULL, kvfree() is used.
 */
int rcu_pending_init(struct rcu_pending *pending,
		     struct srcu_struct *srcu,
		     rcu_pending_process_fn process)
{
	pending->p = alloc_percpu(struct rcu_pending_pcpu);
	if (!pending->p)
		return -ENOMEM;

	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		p->parent	= pending;
		p->cpu		= cpu;
		spin_lock_init(&p->lock);
		darray_init(&p->objs);
		INIT_WORK(&p->work, rcu_pending_work);
	}

	pending->srcu = srcu;
	pending->process = process;

	return 0;
}

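/*
 * Illustrative usage sketch (not part of the original file): one way a caller
 * might wire up an rcu_pending instance with a custom process callback, using
 * only the public entry points above. The names example_obj, example_process()
 * and example_usage() are hypothetical and exist only to demonstrate the API.
 */
struct example_obj {
	struct rcu_head	rcu;
	int		payload;
};

static void example_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	/* Runs once a grace period has elapsed for the enqueued object: */
	struct example_obj *obj = container_of(rcu, struct example_obj, rcu);

	kfree(obj);
}

static void __maybe_unused example_usage(void)
{
	struct rcu_pending pending;

	/* NULL srcu_struct: use the normal RCU flavor */
	if (rcu_pending_init(&pending, NULL, example_process))
		return;

	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (obj)
		rcu_pending_enqueue(&pending, &obj->rcu);

	/* Waits for and processes all pending objects before teardown: */
	rcu_pending_exit(&pending);
}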