// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) "%s() " fmt "\n", __func__

#include <linux/generic-radix-tree.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/srcu.h>
#include <linux/vmalloc.h>

#include "rcu_pending.h"
#include "darray.h"
#include "util.h"

#define static_array_for_each(_a, _i)			\
	for (typeof(&(_a)[0]) _i = _a;			\
	     _i < (_a) + ARRAY_SIZE(_a);		\
	     _i++)

enum rcu_pending_special {
	RCU_PENDING_KVFREE	= 1,
	RCU_PENDING_CALL_RCU	= 2,
};

#define RCU_PENDING_KVFREE_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE)
#define RCU_PENDING_CALL_RCU_FN		((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU)

#ifdef __KERNEL__
typedef unsigned long			rcu_gp_poll_state_t;

static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
{
	return l == r;
}
#else
typedef struct urcu_gp_poll_state	rcu_gp_poll_state_t;

static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r)
{
	return l.grace_period_id == r.grace_period_id;
}
#endif

static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? get_state_synchronize_srcu(ssp)
		: get_state_synchronize_rcu();
}

static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp)
{
	return ssp
		? start_poll_synchronize_srcu(ssp)
		: start_poll_synchronize_rcu();
}

static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie)
{
	return ssp
		? poll_state_synchronize_srcu(ssp, cookie)
		: poll_state_synchronize_rcu(cookie);
}

static inline void __rcu_barrier(struct srcu_struct *ssp)
{
	return ssp
		? srcu_barrier(ssp)
		: rcu_barrier();
}

static inline void __call_rcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			      rcu_callback_t func)
{
	if (ssp)
		call_srcu(ssp, rhp, func);
	else
		call_rcu(rhp, func);
}

struct rcu_pending_seq {
	/*
	 * We're using a radix tree like a vector - we're just pushing elements
	 * onto the end; we're using a radix tree instead of an actual vector to
	 * avoid reallocation overhead
	 */
	GENRADIX(struct rcu_head *)	objs;
	size_t				nr;
	struct rcu_head			**cursor;
	rcu_gp_poll_state_t		seq;
};

struct rcu_pending_list {
	struct rcu_head		*head;
	struct rcu_head		*tail;
	rcu_gp_poll_state_t	seq;
};

struct rcu_pending_pcpu {
	struct rcu_pending	*parent;
	spinlock_t		lock;
	int			cpu;

	/*
	 * We can't bound the number of unprocessed gp sequence numbers, and we
	 * can't efficiently merge radix trees for expired grace periods, so we
	 * need darray/vector:
	 */
	DARRAY_PREALLOCATED(struct rcu_pending_seq, 4) objs;

	/* Third entry is for expired objects: */
	struct rcu_pending_list	lists[NUM_ACTIVE_RCU_POLL_OLDSTATE + 1];

	struct rcu_head		cb;
	bool			cb_armed;
	struct work_struct	work;
};

static bool __rcu_pending_has_pending(struct rcu_pending_pcpu *p)
{
	if (p->objs.nr)
		return true;

	static_array_for_each(p->lists, i)
		if (i->head)
			return true;

	return false;
}

static void rcu_pending_list_merge(struct rcu_pending_list *l1,
				   struct rcu_pending_list *l2)
{
#ifdef __KERNEL__
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next = l2->head;
#else
	if (!l1->head)
		l1->head = l2->head;
	else
		l1->tail->next.next = (void *) l2->head;
#endif

	l1->tail = l2->tail;
	l2->head = l2->tail = NULL;
}

static void rcu_pending_list_add(struct rcu_pending_list *l,
				 struct rcu_head *n)
{
#ifdef __KERNEL__
	if (!l->head)
		l->head = n;
	else
		l->tail->next = n;
	l->tail = n;
	n->next = NULL;
#else
	if (!l->head)
		l->head = n;
	else
		l->tail->next.next = (void *) n;
	l->tail = n;
	n->next.next = NULL;
#endif
}

static void merge_expired_lists(struct rcu_pending_pcpu *p)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];

	for (struct rcu_pending_list *i = p->lists; i < expired; i++)
		if (i->head && __poll_state_synchronize_rcu(p->parent->srcu, i->seq))
			rcu_pending_list_merge(expired, i);
}

#ifndef __KERNEL__
/* userspace shim: free each pointer in the array */
static inline void kfree_bulk(size_t nr, void **p)
{
	while (nr--)
		kfree(*p++);
}
#endif

static noinline void __process_finished_items(struct rcu_pending *pending,
					      struct rcu_pending_pcpu *p,
					      unsigned long flags)
{
	struct rcu_pending_list *expired = &p->lists[NUM_ACTIVE_RCU_POLL_OLDSTATE];
	struct rcu_pending_seq objs = {};
	struct rcu_head *list = NULL;

	if (p->objs.nr &&
	    __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) {
		objs = p->objs.data[0];
		darray_remove_item(&p->objs, p->objs.data);
	}

	merge_expired_lists(p);

	list = expired->head;
	expired->head = expired->tail = NULL;

	spin_unlock_irqrestore(&p->lock, flags);

	switch ((ulong) pending->process) {
	case RCU_PENDING_KVFREE:
		for (size_t i = 0; i < objs.nr; ) {
			size_t nr_this_node = min(GENRADIX_NODE_SIZE / sizeof(void *), objs.nr - i);

			kfree_bulk(nr_this_node, (void **) genradix_ptr(&objs.objs, i));
			i += nr_this_node;
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif

			/*
			 * low bit of pointer indicates whether rcu_head needs
			 * to be freed - kvfree_rcu_mightsleep()
			 */
			BUILD_BUG_ON(ARCH_SLAB_MINALIGN == 0);

			void *ptr = (void *)(((unsigned long) obj->func) & ~1UL);
			bool free_head = ((unsigned long) obj->func) & 1UL;

			kvfree(ptr);
			if (free_head)
				kfree(obj);
		}

		break;

	case RCU_PENDING_CALL_RCU:
		for (size_t i = 0; i < objs.nr; i++) {
			struct rcu_head *obj = *genradix_ptr(&objs.objs, i);
			obj->func(obj);
		}
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			obj->func(obj);
		}
		break;

	default:
		for (size_t i = 0; i < objs.nr; i++)
			pending->process(pending, *genradix_ptr(&objs.objs, i));
		genradix_free(&objs.objs);

		while (list) {
			struct rcu_head *obj = list;
#ifdef __KERNEL__
			list = obj->next;
#else
			list = (void *) obj->next.next;
#endif
			pending->process(pending, obj);
		}
		break;
	}
}

static bool process_finished_items(struct rcu_pending *pending,
				   struct rcu_pending_pcpu *p,
				   unsigned long flags)
{
	/*
	 * XXX: we should grab the gp seq once and avoid multiple function
	 * calls, this is called from __rcu_pending_enqueue() fastpath in
	 * may_sleep==true mode
	 */
	if ((p->objs.nr && __poll_state_synchronize_rcu(pending->srcu, p->objs.data[0].seq)) ||
	    (p->lists[0].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[0].seq)) ||
	    (p->lists[1].head && __poll_state_synchronize_rcu(pending->srcu, p->lists[1].seq)) ||
	    p->lists[2].head) {
		__process_finished_items(pending, p, flags);
		return true;
	}

	return false;
}

static void rcu_pending_work(struct work_struct *work)
{
	struct rcu_pending_pcpu *p =
		container_of(work, struct rcu_pending_pcpu, work);
	struct rcu_pending *pending = p->parent;
	unsigned long flags;

	do {
		spin_lock_irqsave(&p->lock, flags);
	} while (process_finished_items(pending, p, flags));

	spin_unlock_irqrestore(&p->lock, flags);
}

static void rcu_pending_rcu_cb(struct rcu_head *rcu)
{
	struct rcu_pending_pcpu *p = container_of(rcu, struct rcu_pending_pcpu, cb);

	schedule_work_on(p->cpu, &p->work);

	unsigned long flags;
	spin_lock_irqsave(&p->lock, flags);
	if (__rcu_pending_has_pending(p)) {
		spin_unlock_irqrestore(&p->lock, flags);
		__call_rcu(p->parent->srcu, &p->cb, rcu_pending_rcu_cb);
	} else {
		p->cb_armed = false;
		spin_unlock_irqrestore(&p->lock, flags);
	}
}

static __always_inline struct rcu_pending_seq *
get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq)
{
	darray_for_each_reverse(p->objs, objs)
		if (rcu_gp_poll_cookie_eq(objs->seq, seq))
			return objs;

	if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC))
		return NULL;

	return &darray_last(p->objs);
}

static noinline bool
rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq,
			 struct rcu_head *head, void *ptr,
			 unsigned long *flags)
{
	if (ptr) {
		if (!head) {
			/*
			 * kvfree_rcu_mightsleep(): we weren't passed an
			 * rcu_head, but we need one: use the low bit of the
			 * pointer to free to flag that the head needs to be
			 * freed as well:
			 */
			ptr = (void *)(((unsigned long) ptr)|1UL);
			head = kmalloc(sizeof(*head), __GFP_NOWARN);
			if (!head) {
				spin_unlock_irqrestore(&p->lock, *flags);
				head = kmalloc(sizeof(*head), GFP_KERNEL|__GFP_NOFAIL);
				/*
				 * dropped lock, did GFP_KERNEL allocation,
				 * check for gp expiration
				 */
				if (unlikely(__poll_state_synchronize_rcu(p->parent->srcu, seq))) {
					kvfree(--ptr);
					kfree(head);
					spin_lock_irqsave(&p->lock, *flags);
					return false;
				}
			}
		}

		head->func = ptr;
	}
again:
	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (rcu_gp_poll_cookie_eq(i->seq, seq)) {
			rcu_pending_list_add(i, head);
			return false;
		}
	}

	for (struct rcu_pending_list *i = p->lists;
	     i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) {
		if (!i->head) {
			i->seq = seq;
			rcu_pending_list_add(i, head);
			return true;
		}
	}

	merge_expired_lists(p);
	goto again;
}

/*
 * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via
 * pending->process) once a grace period elapses.
 *
 * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall
 * back to a linked list.
 *
 * - If @ptr is NULL, we're enqueuing an item for a generic @pending with a
 *   process callback
 *
 * - If @ptr and @head are both not NULL, we're kvfree_rcu()
 *
 * - If @ptr is not NULL and @head is, we're kvfree_rcu_mightsleep()
 *
 * - If @may_sleep is true, will do GFP_KERNEL memory allocations and process
 *   expired items.
 */
static __always_inline void
__rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head,
		      void *ptr, bool may_sleep)
{
	struct rcu_pending_pcpu *p;
	struct rcu_pending_seq *objs;
	struct genradix_node *new_node = NULL;
	unsigned long flags;
	bool start_gp = false;

	BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN));

	/* We could technically be scheduled before taking the lock and end up
	 * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock
	 * anyways
	 *
	 * And we have to do it this way to avoid breaking PREEMPT_RT, which
	 * redefines how spinlocks work:
	 */
	p = raw_cpu_ptr(pending->p);
	spin_lock_irqsave(&p->lock, flags);
	rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu);
restart:
	if (may_sleep &&
	    unlikely(process_finished_items(pending, p, flags)))
		goto check_expired;

	/*
	 * In kvfree_rcu() mode, the radix tree is only for slab pointers so
	 * that we can do kfree_bulk() - vmalloc pointers always use the linked
	 * list:
	 */
	if (ptr && unlikely(is_vmalloc_addr(ptr)))
		goto list_add;

	objs = get_object_radix(p, seq);
	if (unlikely(!objs))
		goto list_add;

	if (unlikely(!objs->cursor)) {
		/*
		 * New radix tree nodes must be added under @p->lock because the
		 * tree root is in a darray that can be resized (typically,
		 * genradix supports concurrent unlocked allocation of new
		 * nodes) - hence preallocation and the retry loop:
		 */
		objs->cursor = genradix_ptr_alloc_preallocated_inlined(&objs->objs,
						objs->nr, &new_node, GFP_ATOMIC|__GFP_NOWARN);
		if (unlikely(!objs->cursor)) {
			if (may_sleep) {
				spin_unlock_irqrestore(&p->lock, flags);

				gfp_t gfp = GFP_KERNEL;
				if (!head)
					gfp |= __GFP_NOFAIL;

				new_node = genradix_alloc_node(gfp);
				if (!new_node)
					may_sleep = false;
				goto check_expired;
			}
list_add:
			start_gp = rcu_pending_enqueue_list(p, seq, head, ptr, &flags);
			goto start_gp;
		}
	}

	*objs->cursor++ = ptr ?: head;
	/* zero cursor if we hit the end of a radix tree node: */
	if (!(((ulong) objs->cursor) & (GENRADIX_NODE_SIZE - 1)))
		objs->cursor = NULL;
	start_gp = !objs->nr;
	objs->nr++;
start_gp:
	if (unlikely(start_gp)) {
		/*
		 * We only have one callback (ideally, we would have one for
		 * every outstanding grace period) - so if our callback is
		 * already in flight, we may still have to start a grace period
		 * (since we used get_state() above, not start_poll())
		 */
		if (!p->cb_armed) {
			p->cb_armed = true;
			__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
		} else {
			__start_poll_synchronize_rcu(pending->srcu);
		}
	}
	spin_unlock_irqrestore(&p->lock, flags);
free_node:
	if (new_node)
		genradix_free_node(new_node);
	return;
check_expired:
	if (unlikely(__poll_state_synchronize_rcu(pending->srcu, seq))) {
		switch ((ulong) pending->process) {
		case RCU_PENDING_KVFREE:
			kvfree(ptr);
			break;
		case RCU_PENDING_CALL_RCU:
			head->func(head);
			break;
		default:
			pending->process(pending, head);
			break;
		}
		goto free_node;
	}

	p = raw_cpu_ptr(pending->p);
	spin_lock_irqsave(&p->lock, flags);
	goto restart;
}

void rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *obj)
{
	__rcu_pending_enqueue(pending, obj, NULL, true);
}

static struct rcu_head *rcu_pending_pcpu_dequeue(struct rcu_pending_pcpu *p)
{
	struct rcu_head *ret = NULL;

	spin_lock_irq(&p->lock);
	darray_for_each(p->objs, objs)
		if (objs->nr) {
			ret = *genradix_ptr(&objs->objs, --objs->nr);
			objs->cursor = NULL;
			if (!objs->nr)
				genradix_free(&objs->objs);
			goto out;
		}

	static_array_for_each(p->lists, i)
		if (i->head) {
			ret = i->head;
#ifdef __KERNEL__
			i->head = ret->next;
#else
			i->head = (void *) ret->next.next;
#endif
			if (!i->head)
				i->tail = NULL;
			goto out;
		}
out:
	spin_unlock_irq(&p->lock);

	return ret;
}

struct rcu_head *rcu_pending_dequeue(struct rcu_pending *pending)
{
	return rcu_pending_pcpu_dequeue(raw_cpu_ptr(pending->p));
}

struct rcu_head *rcu_pending_dequeue_from_all(struct rcu_pending *pending)
{
	struct rcu_head *ret = rcu_pending_dequeue(pending);

	if (ret)
		return ret;

	int cpu;
	for_each_possible_cpu(cpu) {
		ret = rcu_pending_pcpu_dequeue(per_cpu_ptr(pending->p, cpu));
		if (ret)
			break;
	}
	return ret;
}

static bool rcu_pending_has_pending_or_armed(struct rcu_pending *pending)
{
	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		spin_lock_irq(&p->lock);
		if (__rcu_pending_has_pending(p) || p->cb_armed) {
			spin_unlock_irq(&p->lock);
			return true;
		}
		spin_unlock_irq(&p->lock);
	}

	return false;
}

void rcu_pending_exit(struct rcu_pending *pending)
{
	int cpu;

	if (!pending->p)
		return;

	while (rcu_pending_has_pending_or_armed(pending)) {
		__rcu_barrier(pending->srcu);

		for_each_possible_cpu(cpu) {
			struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
			flush_work(&p->work);
		}
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		flush_work(&p->work);
	}

	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);

		static_array_for_each(p->lists, i)
			WARN_ON(i->head);
		WARN_ON(p->objs.nr);
		darray_exit(&p->objs);
	}
	free_percpu(pending->p);
}

/**
 * rcu_pending_init: - initialize a rcu_pending
 *
 * @pending:	Object to init
 * @srcu:	May optionally be used with an srcu_struct; if NULL, uses normal
 *		RCU flavor
 * @process:	Callback function invoked on objects once a grace period has
 *		elapsed; if NULL, kvfree() is used.
 */
int rcu_pending_init(struct rcu_pending *pending,
		     struct srcu_struct *srcu,
		     rcu_pending_process_fn process)
{
	pending->p = alloc_percpu(struct rcu_pending_pcpu);
	if (!pending->p)
		return -ENOMEM;

	int cpu;
	for_each_possible_cpu(cpu) {
		struct rcu_pending_pcpu *p = per_cpu_ptr(pending->p, cpu);
		p->parent	= pending;
		p->cpu		= cpu;
		spin_lock_init(&p->lock);
		darray_init(&p->objs);
		INIT_WORK(&p->work, rcu_pending_work);
	}

	pending->srcu = srcu;
	pending->process = process;

	return 0;
}

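/*
 * Illustrative usage sketch (not part of the original file): one way a caller
 * might wire up an rcu_pending instance with a custom process callback, using
 * only the public entry points above. The names example_obj, example_process()
 * and example_usage() are hypothetical and exist only to demonstrate the API.
 */
struct example_obj {
	struct rcu_head	rcu;
	int		payload;
};

static void example_process(struct rcu_pending *pending, struct rcu_head *rcu)
{
	/* Runs once a grace period has elapsed for the enqueued object: */
	struct example_obj *obj = container_of(rcu, struct example_obj, rcu);

	kfree(obj);
}

static void __maybe_unused example_usage(void)
{
	struct rcu_pending pending;

	/* NULL srcu_struct: use the normal RCU flavor */
	if (rcu_pending_init(&pending, NULL, example_process))
		return;

	struct example_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	if (obj)
		rcu_pending_enqueue(&pending, &obj->rcu);

	/* Waits for and processes all pending objects before teardown: */
	rcu_pending_exit(&pending);
}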