// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 *		CPU_0 buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 * ...
 *		CPU_N buckets
 * 16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Free-ing is always done into bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contains
 * struct llist_node.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)

/* Largest request any bucket serves; bigger sizes are rejected. */
#define BPF_MEM_ALLOC_SIZE_MAX 4096

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

/* Map an allocation size to a bucket index into sizes[] / bpf_mem_caches.
 * Returns -1 when size is zero or exceeds BPF_MEM_ALLOC_SIZE_MAX.
 */
static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}

#define NUM_CACHES 11

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	/* non-zero for per-cpu allocations: size of the kmalloc-ed header
	 * (llist_node + pointer to the per-cpu data area), see __alloc()
	 */
	int percpu_size;
	bool draining;
	/* inexact hint of which cache allocated the most recently freed
	 * object, see unit_free()
	 */
	struct bpf_mem_cache *tgt;
	/* optional destructor run on each object (past its llist_node header)
	 * just before the object is freed, see free_all()
	 */
	void (*dtor)(void *obj, void *ctx);
	void *dtor_ctx;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

/* Bucket sizes; indexed by bpf_mem_cache_idx() result. */
static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

/* Lock-less pop of the first node. Unlike llist_del_first() this is only
 * safe under the 'active' counter protection (single consumer).
 */
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

/* Allocate one object for cache 'c'. For per-cpu caches the returned object
 * is a small kmalloc-ed header whose slot [1] points at the separately
 * allocated per-cpu data area; otherwise it is one zeroed kmalloc chunk of
 * unit_size.
 */
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
		void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			/* free_percpu()/kfree() tolerate NULL, so one failed
			 * allocation cleanly unwinds the other.
			 */
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

/* Resolve the memory cgroup to charge allocations to. Returns a reference
 * the caller must drop with mem_cgroup_put(), or NULL without CONFIG_MEMCG.
 */
static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

/* Pair of inc_active(): drop the counter and restore IRQs on RT. */
static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

/* Push one object onto free_llist under 'active' counter protection. */
static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Refill free_llist with up to 'cnt' objects, preferring objects already
 * parked for (or past) a tasks-trace GP before falling back to __alloc().
 * Mostly runs from irq_work except __init phase.
 */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	/* Still short: allocate fresh objects charged to the memcg. */
	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

/* Release one object; per-cpu objects also free their data area. */
static void free_one(void *obj, bool percpu)
{
	if (percpu)
		free_percpu(((void __percpu **)obj)[1]);

	kfree(obj);
}

/* Free every object on the 'llnode' list, running the optional destructor
 * on the user-visible part (past the llist_node header) first.
 * Returns the number of objects freed.
 */
static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		if (c->dtor)
			c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx);
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

/* RCU callback: free everything parked on waiting_for_gp_ttrace and mark
 * the ttrace state machine idle.
 */
static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

/* Move free_by_rcu_ttrace to waiting_for_gp_ttrace and kick off a tasks
 * trace GP, unless one is already in flight. When the cache is draining,
 * skip the GP machinery and free immediately.
 */
static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(c, llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

/* Trim free_llist down to the midpoint of the watermarks, handing the
 * excess (plus anything on free_llist_extra) to the tgt cache for
 * tasks-trace-RCU deferred freeing.
 */
static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

/* RCU callback: a regular GP elapsed for objects on waiting_for_gp; pass
 * them on to the tasks-trace stage of the tgt cache.
 */
static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

/* Advance the regular-RCU free pipeline: collect free_llist_extra_rcu into
 * free_by_rcu, then move free_by_rcu to waiting_for_gp and start one GP at
 * a time.
 */
static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

/* irq_work handler: keep free_cnt between the watermarks and advance the
 * RCU-deferred freeing pipeline.
 */
static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
 * the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using below heuristic.
 * Initialized, but unused bpf allocator (not bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * Typical case will be between 11K and 116K closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potential unnecessary large
 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}

/* Seed a freshly initialized cache with a few objects (blocking allocation,
 * called from process context during init).
 */
static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}

/* When size != 0 bpf_mem_cache for each cpu.
 * This is typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is bpf_dynptr and bpf_kptr use case.
512 */ 513 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) 514 { 515 struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; 516 struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc; 517 struct obj_cgroup *objcg = NULL; 518 int cpu, i, unit_size, percpu_size = 0; 519 520 if (percpu && size == 0) 521 return -EINVAL; 522 523 /* room for llist_node and per-cpu pointer */ 524 if (percpu) 525 percpu_size = LLIST_NODE_SZ + sizeof(void *); 526 ma->percpu = percpu; 527 528 if (size) { 529 pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); 530 if (!pc) 531 return -ENOMEM; 532 533 if (!percpu) 534 size += LLIST_NODE_SZ; /* room for llist_node */ 535 unit_size = size; 536 537 #ifdef CONFIG_MEMCG 538 if (memcg_bpf_enabled()) 539 objcg = get_obj_cgroup_from_current(); 540 #endif 541 ma->objcg = objcg; 542 543 for_each_possible_cpu(cpu) { 544 c = per_cpu_ptr(pc, cpu); 545 c->unit_size = unit_size; 546 c->objcg = objcg; 547 c->percpu_size = percpu_size; 548 c->tgt = c; 549 init_refill_work(c); 550 prefill_mem_cache(c, cpu); 551 } 552 ma->cache = pc; 553 return 0; 554 } 555 556 pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); 557 if (!pcc) 558 return -ENOMEM; 559 #ifdef CONFIG_MEMCG 560 objcg = get_obj_cgroup_from_current(); 561 #endif 562 ma->objcg = objcg; 563 for_each_possible_cpu(cpu) { 564 cc = per_cpu_ptr(pcc, cpu); 565 for (i = 0; i < NUM_CACHES; i++) { 566 c = &cc->cache[i]; 567 c->unit_size = sizes[i]; 568 c->objcg = objcg; 569 c->percpu_size = percpu_size; 570 c->tgt = c; 571 572 init_refill_work(c); 573 prefill_mem_cache(c, cpu); 574 } 575 } 576 577 ma->caches = pcc; 578 return 0; 579 } 580 581 int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) 582 { 583 struct bpf_mem_caches __percpu *pcc; 584 585 pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); 586 if (!pcc) 587 return -ENOMEM; 588 589 ma->caches = pcc; 590 ma->objcg = objcg; 591 ma->percpu = true; 592 return 0; 593 } 
/* Lazily initialize the bucket matching 'size' in an allocator that was set
 * up with bpf_mem_alloc_percpu_init(). A bucket whose unit_size is already
 * non-zero (on the first cpu inspected) was initialized earlier and is left
 * untouched.
 */
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}

/* Free every object still sitting on any of the cache's lists. */
static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(c, __llist_del_all(&c->free_llist), percpu);
	free_all(c, __llist_del_all(&c->free_llist_extra), percpu);
	free_all(c, __llist_del_all(&c->free_by_rcu), percpu);
	free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(c, llist_del_all(&c->waiting_for_gp), percpu);
}

/* Warn if any list is non-empty at teardown: that would be a leaked object. */
static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

/* Run check_mem_cache() on every cache owned by 'ma'. */
static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

/* Final release of the per-cpu cache arrays. Callers must ensure no RCU
 * callbacks can still touch them (see free_mem_alloc()).
 */
static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	/* We can free dtor ctx only once all callbacks are done using it. */
	if (ma->dtor_ctx_free)
		ma->dtor_ctx_free(ma->dtor_ctx);
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

/* Workqueue callback for the deferred-destroy path of destroy_mem_alloc(). */
static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

/* Tear down 'ma'. With no RCU callbacks pending, free immediately;
 * otherwise defer the expensive rcu barriers to a workqueue (falling back
 * to inline barriers if the kmemdup fails).
 */
static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of map memory to be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_dfl_wq, &copy->work);
}

/* Public teardown: mark every cache draining, wait for in-flight irq_work,
 * drain all lists, then free the allocator (possibly deferred).
 */
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
/* Pop one object off the current cpu's free_llist, or return NULL when the
 * list is empty or the cache is busy (re-entered from NMI). The popped
 * object's first word is stamped with the owning cache pointer.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			/* remember the allocating cache for unit_free() */
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before other task preempts current task.
	 */
	local_irq_restore(flags);

	return llnode;
}

/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when current
	 * task is preempted by task which does unit_alloc(), unit_alloc() may
	 * return NULL unexpectedly because irq work is already pending but can
	 * not been triggered and free_llist can not be refilled timely.
	 */
	local_irq_restore(flags);
}

/* Like unit_free(), but route the object through a regular RCU GP (then a
 * tasks-trace GP) before it is reused or kfree-d.
 */
static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	/* Non-percpu objects need the llist_node header in front. */
	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	/* return the pointer past the hidden llist_node header */
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

/* Free an object obtained from bpf_mem_alloc(). The bucket is derived from
 * the unit_size recorded in the cache pointer stamped into the header.
 */
void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

/* RCU-deferred variant of bpf_mem_free(). */
void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

/* Fixed-size allocator entry point (ma was set up with size != 0). */
void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

/* Fixed-size counterpart of bpf_mem_free(). */
void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

/* Fixed-size counterpart of bpf_mem_free_rcu(). */
void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			/* stamp the owning cache like unit_alloc() does */
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

/* Validate that a requested size fits in the largest bucket. */
int bpf_mem_alloc_check_size(bool percpu, size_t size)
{
	/* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
	if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
	    (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
		return -E2BIG;

	return 0;
}

/* Register a destructor (and its context) to be run on every object before
 * it is finally freed, plus an optional release function for the context
 * invoked at allocator teardown. NOTE(review): there is no synchronization
 * here with concurrent freeing paths reading c->dtor — presumably callers
 * set the dtor before any objects are freed; confirm against callers.
 */
void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx),
			    void (*dtor_ctx_free)(void *ctx), void *ctx)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	ma->dtor_ctx_free = dtor_ctx_free;
	ma->dtor_ctx = ctx;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			c->dtor = dtor;
			c->dtor_ctx = ctx;
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				c->dtor = dtor;
				c->dtor_ctx = ctx;
			}
		}
	}
}