1 // SPDX-License-Identifier: GPL-2.0
2 #define pr_fmt(fmt) "%s() " fmt "\n", __func__
3
4 #include <linux/generic-radix-tree.h>
5 #include <linux/mm.h>
6 #include <linux/percpu.h>
7 #include <linux/slab.h>
8 #include <linux/srcu.h>
9 #include <linux/vmalloc.h>
10
11 #include "rcu_pending.h"
12 #include "darray.h"
13 #include "util.h"
14
15 #define static_array_for_each(_a, _i) \
16 for (typeof(&(_a)[0]) _i = _a; \
17 _i < (_a) + ARRAY_SIZE(_a); \
18 _i++)
19
20 enum rcu_pending_special {
21 RCU_PENDING_KVFREE = 1,
22 RCU_PENDING_CALL_RCU = 2,
23 };
24
25 #define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
26 #define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)
27
28 #ifdef __KERNEL__
29 typedef unsigned long rcu_gp_poll_state_t;
30
rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l,rcu_gp_poll_state_t r)31 static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
32 {
33 return l == r;
34 }
35 #else
36 typedef struct urcu_gp_poll_state rcu_gp_poll_state_t;
37
rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l,rcu_gp_poll_state_t r)38 static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
39 {
40 return l.grace_period_id == r.grace_period_id;
41 }
42 #endif
43
__get_state_synchronize_rcu(struct srcu_struct * ssp)44 static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
45 {
46 return ssp
47 ? get_state_synchronize_srcu(ssp)
48 : get_state_synchronize_rcu();
49 }
50
__start_poll_synchronize_rcu(struct srcu_struct * ssp)51 static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
52 {
53 return ssp
54 ? start_poll_synchronize_srcu(ssp)
55 : start_poll_synchronize_rcu();
56 }
57
__poll_state_synchronize_rcu(struct srcu_struct * ssp,rcu_gp_poll_state_t cookie)58 static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
59 {
60 return ssp
61 ? poll_state_synchronize_srcu(ssp, cookie)
62 : poll_state_synchronize_rcu(cookie);
63 }
64
__rcu_barrier(struct srcu_struct * ssp)65 static inline void __rcu_barrier(struct srcu_struct *ssp)
66 {
67 return ssp
68 ? srcu_barrier(ssp)
69 : rcu_barrier();
70 }
71
__call_rcu(struct srcu_struct * ssp,struct rcu_head * rhp,rcu_callback_t func)72 static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
73 rcu_callback_t func)
74 {
75 if (ssp)
76 call_srcu(ssp, rhp, func);
77 else
78 call_rcu(rhp, func);
79 }
80
81 struct rcu_pending_seq {
82 /*
83 * We're using a radix tree like a vector - we're just pushing elements
84 * onto the end; we're using a radix tree instead of an actual vector to
85 * avoid reallocation overhead
86 */
87 GENRADIX(struct rcu_head *) objs;
88 size_t nr;
89 struct rcu_head **cursor;
90 rcu_gp_poll_state_t seq;
91 };
92
93 struct rcu_pending_list {
94 struct rcu_head *head;
95 struct rcu_head *tail;
96 rcu_gp_poll_state_t seq;
97 };
98
99 struct rcu_pending_pcpu {
100 struct rcu_pending *parent;
101 spinlock_t lock;
102 int cpu;
103
104 /*
105 * We can't bound the number of unprocessed gp sequence numbers, and we
106 * can't efficiently merge radix trees for expired grace periods, so we
107 * need darray/vector:
108 */
109 DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;
110
111 /* Third entry is for expired objects: */
112 struct rcu_pending_list lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];
113
114 struct rcu_head cb;
115 bool cb_armed;
116 struct work_struct work;
117 };
118
__rcu_pending_has_pending(struct rcu_pending_pcpu * p)119 static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
120 {
121 if (p->objs.nr)
122 return true;
123
124 static_array_for_each(p->lists, i)
125 if (i->head)
126 return true;
127
128 return false;
129 }
130
rcu_pending_list_merge(struct rcu_pending_list * l1,struct rcu_pending_list * l2)131 static void rcu_pending_list_merge(struct rcu_pending_list *l1,
132 struct rcu_pending_list *l2)
133 {
134 #ifdef __KERNEL__
135 if (!l1->head)
136 l1->head = l2->head;
137 else
138 l1->tail->next = l2->head;
139 #else
140 if (!l1->head)
141 l1->head = l2->head;
142 else
143 l1->tail->next.next = (void *) l2->head;
144 #endif
145
146 l1->tail = l2->tail;
147 l2->head = l2->tail = NULL;
148 }
149
rcu_pending_list_add(struct rcu_pending_list * l,struct rcu_head * n)150 static void rcu_pending_list_add(struct rcu_pending_list *l,
151 struct rcu_head *n)
152 {
153 #ifdef __KERNEL__
154 if (!l->head)
155 l->head = n;
156 else
157 l->tail->next = n;
158 l->tail = n;
159 n->next = NULL;
160 #else
161 if (!l->head)
162 l->head = n;
163 else
164 l->tail->next.next = (void *) n;
165 l->tail = n;
166 n->next.next = NULL;
167 #endif
168 }
169
merge_expired_lists(struct rcu_pending_pcpu * p)170 static void merge_expired_lists(struct rcu_pending_pcpu *p)
171 {
172 struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
173
174 for (struct rcu_pending_list *i = p->lists; i < expired; i++)
175 if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
176 rcu_pending_list_merge(expired, i);
177 }
178
179 #ifndef __KERNEL__
kfree_bulk(size_t nr,void ** p)180 static inline void kfree_bulk(size_t nr, void ** p)
181 {
182 while (nr--)
183 kfree(*p);
184 }
185 #endif
186
__process_finished_items(struct rcu_pending * pending,struct rcu_pending_pcpu * p,unsigned long flags)187 static noinline void __process_finished_items(struct rcu_pending *pending,
188 struct rcu_pending_pcpu *p,
189 unsigned long flags)
190 {
191 struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
192 struct rcu_pending_seq objs = {};
193 struct rcu_head *list = NULL;
194
195 if (p->objs.nr &&
196 __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
197 objs = p->objs.data[0];
198 darray_remove_item(&p->objs, p->objs.data);
199 }
200
201 merge_expired_lists(p);
202
203 list = expired->head;
204 expired->head = expired->tail = NULL;
205
206 spin_unlock_irqrestore(&p->lock, flags);
207
208 switch ((ulong) pending->process) {
209 case RCU_PENDING_KVFREE:
210 for (size_t i = 0; i < objs.nr; ) {
211 size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);
212
213 kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
214 i += nr_this_node;
215 }
216 genradix_free(&objs.objs);
217
218 while (list) {
219 struct rcu_head *obj = list;
220 #ifdef __KERNEL__
221 list = obj->next;
222 #else
223 list = (void *) obj->next.next;
224 #endif
225
226 /*
227 * low bit of pointer indicates whether rcu_head needs
228 * to be freed - kvfree_rcu_mightsleep()
229 */
230 BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);
231
232 void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
233 bool free_head = ((unsigned long) obj->func) & 1UL;
234
235 kvfree(ptr);
236 if (free_head)
237 kfree(obj);
238 }
239
240 break;
241
242 case RCU_PENDING_CALL_RCU:
243 for (size_t i = 0; i < objs.nr; i++) {
244 struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
245 obj->func(obj);
246 }
247 genradix_free(&objs.objs);
248
249 while (list) {
250 struct rcu_head *obj = list;
251 #ifdef __KERNEL__
252 list = obj->next;
253 #else
254 list = (void *) obj->next.next;
255 #endif
256 obj->func(obj);
257 }
258 break;
259
260 default:
261 for (size_t i = 0; i < objs.nr; i++)
262 pending->process(pending, *genradix_ptr(&objs.objs, i));
263 genradix_free(&objs.objs);
264
265 while (list) {
266 struct rcu_head *obj = list;
267 #ifdef __KERNEL__
268 list = obj->next;
269 #else
270 list = (void *) obj->next.next;
271 #endif
272 pending->process(pending, obj);
273 }
274 break;
275 }
276 }
277
process_finished_items(struct rcu_pending * pending,struct rcu_pending_pcpu * p,unsigned long flags)278 static bool process_finished_items(struct rcu_pending *pending,
279 struct rcu_pending_pcpu *p,
280 unsigned long flags)
281 {
282 /*
283 * XXX: we should grab the gp seq once and avoid multiple function
284 * calls, this is called from __rcu_pending_enqueue() fastpath in
285 * may_sleep==true mode
286 */
287 if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
288 (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
289 (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
290 p->lists[2].head) {
291 __process_finished_items(pending, p, flags);
292 return true;
293 }
294
295 return false;
296 }
297
rcu_pending_work(struct work_struct * work)298 static void rcu_pending_work(struct work_struct *work)
299 {
300 struct rcu_pending_pcpu *p =
301 container_of(work, struct rcu_pending_pcpu, work);
302 struct rcu_pending *pending = p->parent;
303 unsigned long flags;
304
305 do {
306 spin_lock_irqsave(&p->lock, flags);
307 } while (process_finished_items(pending, p, flags));
308
309 spin_unlock_irqrestore(&p->lock, flags);
310 }
311
rcu_pending_rcu_cb(struct rcu_head * rcu)312 static void rcu_pending_rcu_cb(struct rcu_head *rcu)
313 {
314 struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);
315
316 schedule_work_on(p->cpu, &p->work);
317
318 unsigned long flags;
319 spin_lock_irqsave(&p->lock, flags);
320 if (__rcu_pending_has_pending(p)) {
321 spin_unlock_irqrestore(&p->lock, flags);
322 __call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
323 } else {
324 p->cb_armed = false;
325 spin_unlock_irqrestore(&p->lock, flags);
326 }
327 }
328
329 static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu * p,rcu_gp_poll_state_t seq)330 get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
331 {
332 darray_for_each_reverse(p->objs, objs)
333 if (rcu_gp_poll_cookie_eq(objs->seq, seq))
334 return objs;
335
336 if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
337 return NULL;
338
339 return &darray_last(p->objs);
340 }
341
342 static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu * p,rcu_gp_poll_state_t seq,struct rcu_head * head,void * ptr,unsigned long * flags)343 rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
344 struct rcu_head *head, void *ptr,
345 unsigned long *flags)
346 {
347 if (ptr) {
348 if (!head) {
349 /*
350 * kvfree_rcu_mightsleep(): we weren't passed an
351 * rcu_head, but we need one: use the low bit of the
352 * ponter to free to flag that the head needs to be
353 * freed as well:
354 */
355 ptr = (void *)(((unsigned long) ptr)|1UL);
356 head = kmalloc(sizeof(*head), __GFP_NOWARN);
357 if (!head) {
358 spin_unlock_irqrestore(&p->lock, *flags);
359 head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
360 /*
361 * dropped lock, did GFP_KERNEL allocation,
362 * check for gp expiration
363 */
364 if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
365 kvfree(--ptr);
366 kfree(head);
367 spin_lock_irqsave(&p->lock, *flags);
368 return false;
369 }
370 }
371 }
372
373 head->func = ptr;
374 }
375 again:
376 for (struct rcu_pending_list *i = p->lists;
377 i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
378 if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
379 rcu_pending_list_add(i, head);
380 return false;
381 }
382 }
383
384 for (struct rcu_pending_list *i = p->lists;
385 i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
386 if (!i->head) {
387 i->seq = seq;
388 rcu_pending_list_add(i, head);
389 return true;
390 }
391 }
392
393 merge_expired_lists(p);
394 goto again;
395 }
396
397 /*
398 * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
399 * pending->pracess) once grace period elapses.
400 *
401 * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
402 * back to a linked list.
403 *
404 * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
405 * process callback
406 *
407 * - If @ptr and @head are both not NULL, we're kvfree_rcu()
408 *
409 * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
410 *
411 * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
412 * expired items.
413 */
414 static __always_inline void
__rcu_pending_enqueue(struct rcu_pending * pending,struct rcu_head * head,void * ptr,bool may_sleep)415 __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
416 void *ptr, bool may_sleep)
417 {
418
419 struct rcu_pending_pcpu *p;
420 struct rcu_pending_seq *objs;
421 struct genradix_node *new_node = NULL;
422 unsigned long flags;
423 bool start_gp = false;
424
425 BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));
426
427 /* We could technically be scheduled before taking the lock and end up
428 * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock
429 * anyways
430 *
431 * And we have to do it this way to avoid breaking PREEMPT_RT, which
432 * redefines how spinlocks work:
433 */
434 p = raw_cpu_ptr(pending->p);
435 spin_lock_irqsave(&p->lock, flags);
436 rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
437 restart:
438 if (may_sleep &&
439 unlikely(process_finished_items(pending, p, flags)))
440 goto check_expired;
441
442 /*
443 * In kvfree_rcu() mode, the radix tree is only for slab pointers so
444 * that we can do kfree_bulk() - vmalloc pointers always use the linked
445 * list:
446 */
447 if (ptr && unlikely(is_vmalloc_addr(ptr)))
448 goto list_add;
449
450 objs = get_object_radix(p, seq);
451 if (unlikely(!objs))
452 goto list_add;
453
454 if (unlikely(!objs->cursor)) {
455 /*
456 * New radix tree nodes must be added under @p->lock because the
457 * tree root is in a darray that can be resized (typically,
458 * genradix supports concurrent unlocked allocation of new
459 * nodes) - hence preallocation and the retry loop:
460 */
461 objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
462 objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
463 if (unlikely(!objs->cursor)) {
464 if (may_sleep) {
465 spin_unlock_irqrestore(&p->lock, flags);
466
467 gfp_t gfp = GFP_KERNEL;
468 if (!head)
469 gfp |= __GFP_NOFAIL;
470
471 new_node = genradix_alloc_node(gfp);
472 if (!new_node)
473 may_sleep = false;
474 goto check_expired;
475 }
476 list_add:
477 start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
478 goto start_gp;
479 }
480 }
481
482 *objs->cursor++ = ptr ?: head;
483 /* zero cursor if we hit the end of a radix tree node: */
484 if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
485 objs->cursor = NULL;
486 start_gp = !objs->nr;
487 objs->nr++;
488 start_gp:
489 if (unlikely(start_gp)) {
490 /*
491 * We only have one callback (ideally, we would have one for
492 * every outstanding graceperiod) - so if our callback is
493 * already in flight, we may still have to start a grace period
494 * (since we used get_state() above, not start_poll())
495 */
496 if (!p->cb_armed) {
497 p->cb_armed = true;
498 __call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
499 } else {
500 __start_poll_synchronize_rcu(pending->srcu);
501 }
502 }
503 spin_unlock_irqrestore(&p->lock, flags);
504 free_node:
505 if (new_node)
506 genradix_free_node(new_node);
507 return;
508 check_expired:
509 if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
510 switch ((ulong) pending->process) {
511 case RCU_PENDING_KVFREE:
512 kvfree(ptr);
513 break;
514 case RCU_PENDING_CALL_RCU:
515 head->func(head);
516 break;
517 default:
518 pending->process(pending, head);
519 break;
520 }
521 goto free_node;
522 }
523
524 p = raw_cpu_ptr(pending->p);
525 spin_lock_irqsave(&p->lock, flags);
526 goto restart;
527 }
528
rcu_pending_enqueue(struct rcu_pending * pending,struct rcu_head * obj)529 void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
530 {
531 __rcu_pending_enqueue(pending, obj, NULL, true);
532 }
533
rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu * p)534 static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
535 {
536 struct rcu_head *ret = NULL;
537
538 spin_lock_irq(&p->lock);
539 darray_for_each(p->objs, objs)
540 if (objs->nr) {
541 ret = *genradix_ptr(&objs->objs, --objs->nr);
542 objs->cursor = NULL;
543 if (!objs->nr)
544 genradix_free(&objs->objs);
545 goto out;
546 }
547
548 static_array_for_each(p->lists, i)
549 if (i->head) {
550 ret = i->head;
551 #ifdef __KERNEL__
552 i->head = ret->next;
553 #else
554 i->head = (void *) ret->next.next;
555 #endif
556 if (!i->head)
557 i->tail = NULL;
558 goto out;
559 }
560 out:
561 spin_unlock_irq(&p->lock);
562
563 return ret;
564 }
565
rcu_pending_dequeue(struct rcu_pending * pending)566 struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
567 {
568 return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
569 }
570
rcu_pending_dequeue_from_all(struct rcu_pending * pending)571 struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
572 {
573 struct rcu_head *ret = rcu_pending_dequeue(pending);
574
575 if (ret)
576 return ret;
577
578 int cpu;
579 for_each_possible_cpu(cpu) {
580 ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
581 if (ret)
582 break;
583 }
584 return ret;
585 }
586
rcu_pending_has_pending_or_armed(struct rcu_pending * pending)587 static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
588 {
589 int cpu;
590 for_each_possible_cpu(cpu) {
591 struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
592 spin_lock_irq(&p->lock);
593 if (__rcu_pending_has_pending(p) || p->cb_armed) {
594 spin_unlock_irq(&p->lock);
595 return true;
596 }
597 spin_unlock_irq(&p->lock);
598 }
599
600 return false;
601 }
602
rcu_pending_exit(struct rcu_pending * pending)603 void rcu_pending_exit(struct rcu_pending *pending)
604 {
605 int cpu;
606
607 if (!pending->p)
608 return;
609
610 while (rcu_pending_has_pending_or_armed(pending)) {
611 __rcu_barrier(pending->srcu);
612
613 for_each_possible_cpu(cpu) {
614 struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
615 flush_work(&p->work);
616 }
617 }
618
619 for_each_possible_cpu(cpu) {
620 struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
621 flush_work(&p->work);
622 }
623
624 for_each_possible_cpu(cpu) {
625 struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
626
627 static_array_for_each(p->lists, i)
628 WARN_ON(i->head);
629 WARN_ON(p->objs.nr);
630 darray_exit(&p->objs);
631 }
632 free_percpu(pending->p);
633 }
634
635 /**
636 * rcu_pending_init: - initialize a rcu_pending
637 *
638 * @pending: Object to init
639 * @srcu: May optionally be used with an srcu_struct; if NULL, uses normal
640 * RCU flavor
641 * @process: Callback function invoked on objects once their RCU barriers
642 * have completed; if NULL, kvfree() is used.
643 */
rcu_pending_init(struct rcu_pending * pending,struct srcu_struct * srcu,rcu_pending_process_fn process)644 int rcu_pending_init(struct rcu_pending *pending,
645 struct srcu_struct *srcu,
646 rcu_pending_process_fn process)
647 {
648 pending->p = alloc_percpu(struct rcu_pending_pcpu);
649 if (!pending->p)
650 return -ENOMEM;
651
652 int cpu;
653 for_each_possible_cpu(cpu) {
654 struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
655 p->parent = pending;
656 p->cpu = cpu;
657 spin_lock_init(&p->lock);
658 darray_init(&p->objs);
659 INIT_WORK(&p->work, rcu_pending_work);
660 }
661
662 pending->srcu = srcu;
663 pending->process = process;
664
665 return 0;
666 }
667