xref: /linux/fs/bcachefs/rcu_pending.c (revision 3a39d672e7f48b8d6b91a09afa4b55352773b4b5)
1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) "%s() " fmt "\n", __func__
3 
4 #include <linux/generic-radix-tree.h>
5 #include <linux/mm.h>
6 #include <linux/percpu.h>
7 #include <linux/slab.h>
8 #include <linux/srcu.h>
9 #include <linux/vmalloc.h>
10 
11 #include "rcu_pending.h"
12 #include "darray.h"
13 #include "util.h"
14 
15 #define static_array_for_each(_a, _i)			\
16 	for (typeof(&(_a)[0]) _i = _a;			\
17 	     _i < (_a) + ARRAY_SIZE(_a);		\
18 	     _i++)
19 
20 enum rcu_pending_special {
21 	RCU_PENDING_KVFREE	= 1,
22 	RCU_PENDING_CALL_RCU	= 2,
23 };
24 
25 #define RCU_PENDING_KVFREE_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
26 #define RCU_PENDING_CALL_RCU_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
27 
__get_state_synchronize_rcu(struct srcu_struct * ssp)28 static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp)
29 {
30 	return ssp
31 		? get_state_synchronize_srcu(ssp)
32 		: get_state_synchronize_rcu();
33 }
34 
__start_poll_synchronize_rcu(struct srcu_struct * ssp)35 static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp)
36 {
37 	return ssp
38 		? start_poll_synchronize_srcu(ssp)
39 		: start_poll_synchronize_rcu();
40 }
41 
__poll_state_synchronize_rcu(struct srcu_struct * ssp,unsigned long cookie)42 static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie)
43 {
44 	return ssp
45 		? poll_state_synchronize_srcu(ssp, cookie)
46 		: poll_state_synchronize_rcu(cookie);
47 }
48 
__rcu_barrier(struct srcu_struct * ssp)49 static inline void __rcu_barrier(struct srcu_struct *ssp)
50 {
51 	return ssp
52 		? srcu_barrier(ssp)
53 		: rcu_barrier();
54 }
55 
__call_rcu(struct srcu_struct * ssp,struct rcu_head * rhp,rcu_callback_t func)56 static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
57 			      rcu_callback_t func)
58 {
59 	if (ssp)
60 		call_srcu(ssp, rhp, func);
61 	else
62 		call_rcu(rhp, func);
63 }
64 
65 struct rcu_pending_seq {
66 	/*
67 	 * We're using a radix tree like a vector - we're just pushing elements
68 	 * onto the end; we're using a radix tree instead of an actual vector to
69 	 * avoid reallocation overhead
70 	 */
71 	GENRADIX(struct rcu_head *)	objs;
72 	size_t				nr;
73 	struct rcu_head			**cursor;
74 	unsigned long			seq;
75 };
76 
77 struct rcu_pending_list {
78 	struct rcu_head			*head;
79 	struct rcu_head			*tail;
80 	unsigned long			seq;
81 };
82 
83 struct rcu_pending_pcpu {
84 	struct rcu_pending		*parent;
85 	spinlock_t			lock;
86 	int				cpu;
87 
88 	/*
89 	 * We can't bound the number of unprocessed gp sequence numbers, and we
90 	 * can't efficiently merge radix trees for expired grace periods, so we
91 	 * need darray/vector:
92 	 */
93 	DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
94 
95 	/* Third entry is for expired objects: */
96 	struct rcu_pending_list		lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
97 
98 	struct rcu_head			cb;
99 	bool				cb_armed;
100 	struct work_struct		work;
101 };
102 
__rcu_pending_has_pending(struct rcu_pending_pcpu * p)103 static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
104 {
105 	if (p->objs.nr)
106 		return true;
107 
108 	static_array_for_each(p->lists, i)
109 		if (i->head)
110 			return true;
111 
112 	return false;
113 }
114 
rcu_pending_list_merge(struct rcu_pending_list * l1,struct rcu_pending_list * l2)115 static void rcu_pending_list_merge(struct rcu_pending_list *l1,
116 				   struct rcu_pending_list *l2)
117 {
118 #ifdef __KERNEL__
119 	if (!l1->head)
120 		l1->head = l2->head;
121 	else
122 		l1->tail->next = l2->head;
123 #else
124 	if (!l1->head)
125 		l1->head = l2->head;
126 	else
127 		l1->tail->next.next = (void *) l2->head;
128 #endif
129 
130 	l1->tail = l2->tail;
131 	l2->head = l2->tail = NULL;
132 }
133 
rcu_pending_list_add(struct rcu_pending_list * l,struct rcu_head * n)134 static void rcu_pending_list_add(struct rcu_pending_list *l,
135 				 struct rcu_head *n)
136 {
137 #ifdef __KERNEL__
138 	if (!l->head)
139 		l->head = n;
140 	else
141 		l->tail->next = n;
142 	l->tail = n;
143 	n->next = NULL;
144 #else
145 	if (!l->head)
146 		l->head = n;
147 	else
148 		l->tail->next.next = (void *) n;
149 	l->tail = n;
150 	n->next.next = NULL;
151 #endif
152 }
153 
merge_expired_lists(struct rcu_pending_pcpu * p)154 static void merge_expired_lists(struct rcu_pending_pcpu *p)
155 {
156 	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
157 
158 	for (struct rcu_pending_list *i = p->lists; i < expired; i++)
159 		if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
160 			rcu_pending_list_merge(expired, i);
161 }
162 
163 #ifndef __KERNEL__
kfree_bulk(size_t nr,void ** p)164 static inline void kfree_bulk(size_t nr, void ** p)
165 {
166 	while (nr--)
167 		kfree(*p);
168 }
169 
170 #define local_irq_save(flags)		\
171 do {					\
172 	flags = 0;			\
173 } while (0)
174 #endif
175 
__process_finished_items(struct rcu_pending * pending,struct rcu_pending_pcpu * p,unsigned long flags)176 static noinline void __process_finished_items(struct rcu_pending *pending,
177 					      struct rcu_pending_pcpu *p,
178 					      unsigned long flags)
179 {
180 	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
181 	struct rcu_pending_seq objs = {};
182 	struct rcu_head *list = NULL;
183 
184 	if (p->objs.nr &&
185 	    __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
186 		objs = p->objs.data[0];
187 		darray_remove_item(&p->objs, p->objs.data);
188 	}
189 
190 	merge_expired_lists(p);
191 
192 	list = expired->head;
193 	expired->head = expired->tail = NULL;
194 
195 	spin_unlock_irqrestore(&p->lock, flags);
196 
197 	switch ((ulong) pending->process) {
198 	case RCU_PENDING_KVFREE:
199 		for (size_t i = 0; i < objs.nr; ) {
200 			size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
201 
202 			kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
203 			i += nr_this_node;
204 		}
205 		genradix_free(&objs.objs);
206 
207 		while (list) {
208 			struct rcu_head *obj = list;
209 #ifdef __KERNEL__
210 			list = obj->next;
211 #else
212 			list = (void *) obj->next.next;
213 #endif
214 
215 			/*
216 			 * low bit of pointer indicates whether rcu_head needs
217 			 * to be freed - kvfree_rcu_mightsleep()
218 			 */
219 			BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
220 
221 			void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
222 			bool free_head = ((unsigned long) obj->func) & 1UL;
223 
224 			kvfree(ptr);
225 			if (free_head)
226 				kfree(obj);
227 		}
228 
229 		break;
230 
231 	case RCU_PENDING_CALL_RCU:
232 		for (size_t i = 0; i < objs.nr; i++) {
233 			struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
234 			obj->func(obj);
235 		}
236 		genradix_free(&objs.objs);
237 
238 		while (list) {
239 			struct rcu_head *obj = list;
240 #ifdef __KERNEL__
241 			list = obj->next;
242 #else
243 			list = (void *) obj->next.next;
244 #endif
245 			obj->func(obj);
246 		}
247 		break;
248 
249 	default:
250 		for (size_t i = 0; i < objs.nr; i++)
251 			pending->process(pending, *genradix_ptr(&objs.objs, i));
252 		genradix_free(&objs.objs);
253 
254 		while (list) {
255 			struct rcu_head *obj = list;
256 #ifdef __KERNEL__
257 			list = obj->next;
258 #else
259 			list = (void *) obj->next.next;
260 #endif
261 			pending->process(pending, obj);
262 		}
263 		break;
264 	}
265 }
266 
process_finished_items(struct rcu_pending * pending,struct rcu_pending_pcpu * p,unsigned long flags)267 static bool process_finished_items(struct rcu_pending *pending,
268 				   struct rcu_pending_pcpu *p,
269 				   unsigned long flags)
270 {
271 	/*
272 	 * XXX: we should grab the gp seq once and avoid multiple function
273 	 * calls, this is called from __rcu_pending_enqueue() fastpath in
274 	 * may_sleep==true mode
275 	 */
276 	if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
277 	    (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
278 	    (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
279 	    p->lists[2].head) {
280 		__process_finished_items(pending, p, flags);
281 		return true;
282 	}
283 
284 	return false;
285 }
286 
rcu_pending_work(struct work_struct * work)287 static void rcu_pending_work(struct work_struct *work)
288 {
289 	struct rcu_pending_pcpu *p =
290 		container_of(work, struct rcu_pending_pcpu, work);
291 	struct rcu_pending *pending = p->parent;
292 	unsigned long flags;
293 
294 	do {
295 		spin_lock_irqsave(&p->lock, flags);
296 	} while (process_finished_items(pending, p, flags));
297 
298 	spin_unlock_irqrestore(&p->lock, flags);
299 }
300 
rcu_pending_rcu_cb(struct rcu_head * rcu)301 static void rcu_pending_rcu_cb(struct rcu_head *rcu)
302 {
303 	struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
304 
305 	schedule_work_on(p->cpu, &p->work);
306 
307 	unsigned long flags;
308 	spin_lock_irqsave(&p->lock, flags);
309 	if (__rcu_pending_has_pending(p)) {
310 		spin_unlock_irqrestore(&p->lock, flags);
311 		__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
312 	} else {
313 		p->cb_armed = false;
314 		spin_unlock_irqrestore(&p->lock, flags);
315 	}
316 }
317 
318 static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu * p,unsigned long seq)319 get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq)
320 {
321 	darray_for_each_reverse(p->objs, objs)
322 		if (objs->seq == seq)
323 			return objs;
324 
325 	if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
326 		return NULL;
327 
328 	return &darray_last(p->objs);
329 }
330 
331 static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu * p,unsigned long seq,struct rcu_head * head,void * ptr,unsigned long * flags)332 rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq,
333 			 struct rcu_head *head, void *ptr,
334 			 unsigned long *flags)
335 {
336 	if (ptr) {
337 		if (!head) {
338 			/*
339 			 * kvfree_rcu_mightsleep(): we weren't passed an
340 			 * rcu_head, but we need one: use the low bit of the
341 			 * ponter to free to flag that the head needs to be
342 			 * freed as well:
343 			 */
344 			ptr = (void *)(((unsigned long) ptr)|1UL);
345 			head = kmalloc(sizeof(*head), __GFP_NOWARN);
346 			if (!head) {
347 				spin_unlock_irqrestore(&p->lock, *flags);
348 				head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
349 				/*
350 				 * dropped lock, did GFP_KERNEL allocation,
351 				 * check for gp expiration
352 				 */
353 				if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
354 					kvfree(--ptr);
355 					kfree(head);
356 					spin_lock_irqsave(&p->lock, *flags);
357 					return false;
358 				}
359 			}
360 		}
361 
362 		head->func = ptr;
363 	}
364 again:
365 	for (struct rcu_pending_list *i = p->lists;
366 	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
367 		if (i->seq == seq) {
368 			rcu_pending_list_add(i, head);
369 			return false;
370 		}
371 	}
372 
373 	for (struct rcu_pending_list *i = p->lists;
374 	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
375 		if (!i->head) {
376 			i->seq = seq;
377 			rcu_pending_list_add(i, head);
378 			return true;
379 		}
380 	}
381 
382 	merge_expired_lists(p);
383 	goto again;
384 }
385 
386 /*
387  * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
388  * pending->pracess) once grace period elapses.
389  *
390  * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
391  * back to a linked list.
392  *
393  * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
394  *   process callback
395  *
396  * - If @ptr and @head are both not NULL, we're kvfree_rcu()
397  *
398  * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
399  *
400  * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
401  *   expired items.
402  */
403 static __always_inline void
__rcu_pending_enqueue(struct rcu_pending * pending,struct rcu_head * head,void * ptr,bool may_sleep)404 __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
405 		      void *ptr, bool may_sleep)
406 {
407 
408 	struct rcu_pending_pcpu *p;
409 	struct rcu_pending_seq *objs;
410 	struct genradix_node *new_node = NULL;
411 	unsigned long seq, flags;
412 	bool start_gp = false;
413 
414 	BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
415 
416 	local_irq_save(flags);
417 	p = this_cpu_ptr(pending->p);
418 	spin_lock(&p->lock);
419 	seq = __get_state_synchronize_rcu(pending->srcu);
420 restart:
421 	if (may_sleep &&
422 	    unlikely(process_finished_items(pending, p, flags)))
423 		goto check_expired;
424 
425 	/*
426 	 * In kvfree_rcu() mode, the radix tree is only for slab pointers so
427 	 * that we can do kfree_bulk() - vmalloc pointers always use the linked
428 	 * list:
429 	 */
430 	if (ptr && unlikely(is_vmalloc_addr(ptr)))
431 		goto list_add;
432 
433 	objs = get_object_radix(p, seq);
434 	if (unlikely(!objs))
435 		goto list_add;
436 
437 	if (unlikely(!objs->cursor)) {
438 		/*
439 		 * New radix tree nodes must be added under @p->lock because the
440 		 * tree root is in a darray that can be resized (typically,
441 		 * genradix supports concurrent unlocked allocation of new
442 		 * nodes) - hence preallocation and the retry loop:
443 		 */
444 		objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
445 						objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
446 		if (unlikely(!objs->cursor)) {
447 			if (may_sleep) {
448 				spin_unlock_irqrestore(&p->lock, flags);
449 
450 				gfp_t gfp = GFP_KERNEL;
451 				if (!head)
452 					gfp |= __GFP_NOFAIL;
453 
454 				new_node = genradix_alloc_node(gfp);
455 				if (!new_node)
456 					may_sleep = false;
457 				goto check_expired;
458 			}
459 list_add:
460 			start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
461 			goto start_gp;
462 		}
463 	}
464 
465 	*objs->cursor++ = ptr ?: head;
466 	/* zero cursor if we hit the end of a radix tree node: */
467 	if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
468 		objs->cursor = NULL;
469 	start_gp = !objs->nr;
470 	objs->nr++;
471 start_gp:
472 	if (unlikely(start_gp)) {
473 		/*
474 		 * We only have one callback (ideally, we would have one for
475 		 * every outstanding graceperiod) - so if our callback is
476 		 * already in flight, we may still have to start a grace period
477 		 * (since we used get_state() above, not start_poll())
478 		 */
479 		if (!p->cb_armed) {
480 			p->cb_armed = true;
481 			__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
482 		} else {
483 			__start_poll_synchronize_rcu(pending->srcu);
484 		}
485 	}
486 	spin_unlock_irqrestore(&p->lock, flags);
487 free_node:
488 	if (new_node)
489 		genradix_free_node(new_node);
490 	return;
491 check_expired:
492 	if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
493 		switch ((ulong) pending->process) {
494 		case RCU_PENDING_KVFREE:
495 			kvfree(ptr);
496 			break;
497 		case RCU_PENDING_CALL_RCU:
498 			head->func(head);
499 			break;
500 		default:
501 			pending->process(pending, head);
502 			break;
503 		}
504 		goto free_node;
505 	}
506 
507 	local_irq_save(flags);
508 	p = this_cpu_ptr(pending->p);
509 	spin_lock(&p->lock);
510 	goto restart;
511 }
512 
rcu_pending_enqueue(struct rcu_pending * pending,struct rcu_head * obj)513 void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
514 {
515 	__rcu_pending_enqueue(pending, obj, NULL, true);
516 }
517 
rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu * p)518 static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
519 {
520 	struct rcu_head *ret = NULL;
521 
522 	spin_lock_irq(&p->lock);
523 	darray_for_each(p->objs, objs)
524 		if (objs->nr) {
525 			ret = *genradix_ptr(&objs->objs, --objs->nr);
526 			objs->cursor = NULL;
527 			if (!objs->nr)
528 				genradix_free(&objs->objs);
529 			goto out;
530 		}
531 
532 	static_array_for_each(p->lists, i)
533 		if (i->head) {
534 			ret = i->head;
535 #ifdef __KERNEL__
536 			i->head = ret->next;
537 #else
538 			i->head = (void *) ret->next.next;
539 #endif
540 			if (!i->head)
541 				i->tail = NULL;
542 			goto out;
543 		}
544 out:
545 	spin_unlock_irq(&p->lock);
546 
547 	return ret;
548 }
549 
rcu_pending_dequeue(struct rcu_pending * pending)550 struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
551 {
552 	return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
553 }
554 
rcu_pending_dequeue_from_all(struct rcu_pending * pending)555 struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
556 {
557 	struct rcu_head *ret = rcu_pending_dequeue(pending);
558 
559 	if (ret)
560 		return ret;
561 
562 	int cpu;
563 	for_each_possible_cpu(cpu) {
564 		ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
565 		if (ret)
566 			break;
567 	}
568 	return ret;
569 }
570 
rcu_pending_has_pending_or_armed(struct rcu_pending * pending)571 static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
572 {
573 	int cpu;
574 	for_each_possible_cpu(cpu) {
575 		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
576 		spin_lock_irq(&p->lock);
577 		if (__rcu_pending_has_pending(p) || p->cb_armed) {
578 			spin_unlock_irq(&p->lock);
579 			return true;
580 		}
581 		spin_unlock_irq(&p->lock);
582 	}
583 
584 	return false;
585 }
586 
rcu_pending_exit(struct rcu_pending * pending)587 void rcu_pending_exit(struct rcu_pending *pending)
588 {
589 	int cpu;
590 
591 	if (!pending->p)
592 		return;
593 
594 	while (rcu_pending_has_pending_or_armed(pending)) {
595 		__rcu_barrier(pending->srcu);
596 
597 		for_each_possible_cpu(cpu) {
598 			struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
599 			flush_work(&p->work);
600 		}
601 	}
602 
603 	for_each_possible_cpu(cpu) {
604 		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
605 		flush_work(&p->work);
606 	}
607 
608 	for_each_possible_cpu(cpu) {
609 		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
610 
611 		static_array_for_each(p->lists, i)
612 			WARN_ON(i->head);
613 		WARN_ON(p->objs.nr);
614 		darray_exit(&p->objs);
615 	}
616 	free_percpu(pending->p);
617 }
618 
619 /**
620  * rcu_pending_init: - initialize a rcu_pending
621  *
622  * @pending:	Object to init
623  * @srcu:	May optionally be used with an srcu_struct; if NULL, uses normal
624  *		RCU flavor
625  * @process:	Callback function invoked on objects once their RCU barriers
626  *		have completed; if NULL, kvfree() is used.
627  */
rcu_pending_init(struct rcu_pending * pending,struct srcu_struct * srcu,rcu_pending_process_fn process)628 int rcu_pending_init(struct rcu_pending *pending,
629 		     struct srcu_struct *srcu,
630 		     rcu_pending_process_fn process)
631 {
632 	pending->p = alloc_percpu(struct rcu_pending_pcpu);
633 	if (!pending->p)
634 		return -ENOMEM;
635 
636 	int cpu;
637 	for_each_possible_cpu(cpu) {
638 		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
639 		p->parent	= pending;
640 		p->cpu		= cpu;
641 		spin_lock_init(&p->lock);
642 		darray_init(&p->objs);
643 		INIT_WORK(&p->work, rcu_pending_work);
644 	}
645 
646 	pending->srcu = srcu;
647 	pending->process = process;
648 
649 	return 0;
650 }
651