1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ 3 #include <linux/mm.h> 4 #include <linux/llist.h> 5 #include <linux/bpf.h> 6 #include <linux/irq_work.h> 7 #include <linux/bpf_mem_alloc.h> 8 #include <linux/memcontrol.h> 9 #include <asm/local.h> 10 11 /* Any context (including NMI) BPF specific memory allocator. 12 * 13 * Tracing BPF programs can attach to kprobe and fentry. Hence they 14 * run in unknown context where calling plain kmalloc() might not be safe. 15 * 16 * Front-end kmalloc() with per-cpu per-bucket cache of free elements. 17 * Refill this cache asynchronously from irq_work. 18 * 19 * CPU_0 buckets 20 * 16 32 64 96 128 196 256 512 1024 2048 4096 21 * ... 22 * CPU_N buckets 23 * 16 32 64 96 128 196 256 512 1024 2048 4096 24 * 25 * The buckets are prefilled at the start. 26 * BPF programs always run with migration disabled. 27 * It's safe to allocate from cache of the current cpu with irqs disabled. 28 * Free-ing is always done into bucket of the current cpu as well. 29 * irq_work trims extra free elements from buckets with kfree 30 * and refills them with kmalloc, so global kmalloc logic takes care 31 * of freeing objects allocated by one cpu and freed on another. 32 * 33 * Every allocated objected is padded with extra 8 bytes that contains 34 * struct llist_node. 35 */ 36 #define LLIST_NODE_SZ sizeof(struct llist_node) 37 38 #define BPF_MEM_ALLOC_SIZE_MAX 4096 39 40 /* similar to kmalloc, but sizeof == 8 bucket is gone */ 41 static u8 size_index[24] __ro_after_init = { 42 3, /* 8 */ 43 3, /* 16 */ 44 4, /* 24 */ 45 4, /* 32 */ 46 5, /* 40 */ 47 5, /* 48 */ 48 5, /* 56 */ 49 5, /* 64 */ 50 1, /* 72 */ 51 1, /* 80 */ 52 1, /* 88 */ 53 1, /* 96 */ 54 6, /* 104 */ 55 6, /* 112 */ 56 6, /* 120 */ 57 6, /* 128 */ 58 2, /* 136 */ 59 2, /* 144 */ 60 2, /* 152 */ 61 2, /* 160 */ 62 2, /* 168 */ 63 2, /* 176 */ 64 2, /* 184 */ 65 2 /* 192 */ 66 }; 67 68 static int bpf_mem_cache_idx(size_t size) 69 { 70 if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) 71 return -1; 72 73 if (size <= 192) 74 return size_index[(size - 1) / 8] - 1; 75 76 return fls(size - 1) - 2; 77 } 78 79 #define NUM_CACHES 11 80 81 struct bpf_mem_cache { 82 /* per-cpu list of free objects of size 'unit_size'. 83 * All accesses are done with interrupts disabled and 'active' counter 84 * protection with __llist_add() and __llist_del_first(). 85 */ 86 struct llist_head free_llist; 87 local_t active; 88 89 /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill 90 * are sequenced by per-cpu 'active' counter. But unit_free() cannot 91 * fail. When 'active' is busy the unit_free() will add an object to 92 * free_llist_extra. 93 */ 94 struct llist_head free_llist_extra; 95 96 struct irq_work refill_work; 97 struct obj_cgroup *objcg; 98 int unit_size; 99 /* count of objects in free_llist */ 100 int free_cnt; 101 int low_watermark, high_watermark, batch; 102 int percpu_size; 103 bool draining; 104 struct bpf_mem_cache *tgt; 105 void (*dtor)(void *obj, void *ctx); 106 void *dtor_ctx; 107 108 /* list of objects to be freed after RCU GP */ 109 struct llist_head free_by_rcu; 110 struct llist_node *free_by_rcu_tail; 111 struct llist_head waiting_for_gp; 112 struct llist_node *waiting_for_gp_tail; 113 struct rcu_head rcu; 114 atomic_t call_rcu_in_progress; 115 struct llist_head free_llist_extra_rcu; 116 117 /* list of objects to be freed after RCU tasks trace GP */ 118 struct llist_head free_by_rcu_ttrace; 119 struct llist_head waiting_for_gp_ttrace; 120 struct rcu_head rcu_ttrace; 121 atomic_t call_rcu_ttrace_in_progress; 122 }; 123 124 struct bpf_mem_caches { 125 struct bpf_mem_cache cache[NUM_CACHES]; 126 }; 127 128 static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; 129 130 static struct llist_node notrace *__llist_del_first(struct llist_head *head) 131 { 132 struct llist_node *entry, *next; 133 134 entry = head->first; 135 if (!entry) 136 return NULL; 137 next = entry->next; 138 head->first = next; 139 return entry; 140 } 141 142 static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) 143 { 144 if (c->percpu_size) { 145 void __percpu **obj = kmalloc_node(c->percpu_size, flags, node); 146 void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); 147 148 if (!obj || !pptr) { 149 free_percpu(pptr); 150 kfree(obj); 151 return NULL; 152 } 153 obj[1] = pptr; 154 return obj; 155 } 156 157 return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node); 158 } 159 160 static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) 161 { 162 #ifdef CONFIG_MEMCG 163 if (c->objcg) 164 return get_mem_cgroup_from_objcg(c->objcg); 165 return root_mem_cgroup; 166 #else 167 return NULL; 168 #endif 169 } 170 171 static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) 172 { 173 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 174 /* In RT irq_work runs in per-cpu kthread, so disable 175 * interrupts to avoid preemption and interrupts and 176 * reduce the chance of bpf prog executing on this cpu 177 * when active counter is busy. 178 */ 179 local_irq_save(*flags); 180 /* alloc_bulk runs from irq_work which will not preempt a bpf 181 * program that does unit_alloc/unit_free since IRQs are 182 * disabled there. There is no race to increment 'active' 183 * counter. It protects free_llist from corruption in case NMI 184 * bpf prog preempted this loop. 185 */ 186 WARN_ON_ONCE(local_inc_return(&c->active) != 1); 187 } 188 189 static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) 190 { 191 local_dec(&c->active); 192 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 193 local_irq_restore(*flags); 194 } 195 196 static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) 197 { 198 unsigned long flags; 199 200 inc_active(c, &flags); 201 __llist_add(obj, &c->free_llist); 202 c->free_cnt++; 203 dec_active(c, &flags); 204 } 205 206 /* Mostly runs from irq_work except __init phase. */ 207 static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) 208 { 209 struct mem_cgroup *memcg = NULL, *old_memcg; 210 gfp_t gfp; 211 void *obj; 212 int i; 213 214 gfp = __GFP_NOWARN | __GFP_ACCOUNT; 215 gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL; 216 217 for (i = 0; i < cnt; i++) { 218 /* 219 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is 220 * done only by one CPU == current CPU. Other CPUs might 221 * llist_add() and llist_del_all() in parallel. 222 */ 223 obj = llist_del_first(&c->free_by_rcu_ttrace); 224 if (!obj) 225 break; 226 add_obj_to_free_list(c, obj); 227 } 228 if (i >= cnt) 229 return; 230 231 for (; i < cnt; i++) { 232 obj = llist_del_first(&c->waiting_for_gp_ttrace); 233 if (!obj) 234 break; 235 add_obj_to_free_list(c, obj); 236 } 237 if (i >= cnt) 238 return; 239 240 memcg = get_memcg(c); 241 old_memcg = set_active_memcg(memcg); 242 for (; i < cnt; i++) { 243 /* Allocate, but don't deplete atomic reserves that typical 244 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc 245 * will allocate from the current numa node which is what we 246 * want here. 247 */ 248 obj = __alloc(c, node, gfp); 249 if (!obj) 250 break; 251 add_obj_to_free_list(c, obj); 252 } 253 set_active_memcg(old_memcg); 254 mem_cgroup_put(memcg); 255 } 256 257 static void free_one(void *obj, bool percpu) 258 { 259 if (percpu) 260 free_percpu(((void __percpu **)obj)[1]); 261 262 kfree(obj); 263 } 264 265 static int free_all(struct bpf_mem_cache *c, struct llist_node *llnode, bool percpu) 266 { 267 struct llist_node *pos, *t; 268 int cnt = 0; 269 270 llist_for_each_safe(pos, t, llnode) { 271 if (c->dtor) 272 c->dtor((void *)pos + LLIST_NODE_SZ, c->dtor_ctx); 273 free_one(pos, percpu); 274 cnt++; 275 } 276 return cnt; 277 } 278 279 static void __free_rcu(struct rcu_head *head) 280 { 281 struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); 282 283 free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); 284 atomic_set(&c->call_rcu_ttrace_in_progress, 0); 285 } 286 287 static void enque_to_free(struct bpf_mem_cache *c, void *obj) 288 { 289 struct llist_node *llnode = obj; 290 291 /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. 292 * Nothing races to add to free_by_rcu_ttrace list. 293 */ 294 llist_add(llnode, &c->free_by_rcu_ttrace); 295 } 296 297 static void do_call_rcu_ttrace(struct bpf_mem_cache *c) 298 { 299 struct llist_node *llnode, *t; 300 301 if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { 302 if (unlikely(READ_ONCE(c->draining))) { 303 llnode = llist_del_all(&c->free_by_rcu_ttrace); 304 free_all(c, llnode, !!c->percpu_size); 305 } 306 return; 307 } 308 309 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); 310 llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace)) 311 llist_add(llnode, &c->waiting_for_gp_ttrace); 312 313 if (unlikely(READ_ONCE(c->draining))) { 314 __free_rcu(&c->rcu_ttrace); 315 return; 316 } 317 318 /* 319 * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. 320 * RCU Tasks Trace grace period implies RCU grace period, so pass 321 * __free_rcu directly as the callback. 322 */ 323 call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu); 324 } 325 326 static void free_bulk(struct bpf_mem_cache *c) 327 { 328 struct bpf_mem_cache *tgt = c->tgt; 329 struct llist_node *llnode, *t; 330 unsigned long flags; 331 int cnt; 332 333 WARN_ON_ONCE(tgt->unit_size != c->unit_size); 334 WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); 335 336 do { 337 inc_active(c, &flags); 338 llnode = __llist_del_first(&c->free_llist); 339 if (llnode) 340 cnt = --c->free_cnt; 341 else 342 cnt = 0; 343 dec_active(c, &flags); 344 if (llnode) 345 enque_to_free(tgt, llnode); 346 } while (cnt > (c->high_watermark + c->low_watermark) / 2); 347 348 /* and drain free_llist_extra */ 349 llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) 350 enque_to_free(tgt, llnode); 351 do_call_rcu_ttrace(tgt); 352 } 353 354 static void __free_by_rcu(struct rcu_head *head) 355 { 356 struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); 357 struct bpf_mem_cache *tgt = c->tgt; 358 struct llist_node *llnode; 359 360 WARN_ON_ONCE(tgt->unit_size != c->unit_size); 361 WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); 362 363 llnode = llist_del_all(&c->waiting_for_gp); 364 if (!llnode) 365 goto out; 366 367 llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace); 368 369 /* Objects went through regular RCU GP. Send them to RCU tasks trace */ 370 do_call_rcu_ttrace(tgt); 371 out: 372 atomic_set(&c->call_rcu_in_progress, 0); 373 } 374 375 static void check_free_by_rcu(struct bpf_mem_cache *c) 376 { 377 struct llist_node *llnode, *t; 378 unsigned long flags; 379 380 /* drain free_llist_extra_rcu */ 381 if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) { 382 inc_active(c, &flags); 383 llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) 384 if (__llist_add(llnode, &c->free_by_rcu)) 385 c->free_by_rcu_tail = llnode; 386 dec_active(c, &flags); 387 } 388 389 if (llist_empty(&c->free_by_rcu)) 390 return; 391 392 if (atomic_xchg(&c->call_rcu_in_progress, 1)) { 393 /* 394 * Instead of kmalloc-ing new rcu_head and triggering 10k 395 * call_rcu() to hit rcutree.qhimark and force RCU to notice 396 * the overload just ask RCU to hurry up. There could be many 397 * objects in free_by_rcu list. 398 * This hint reduces memory consumption for an artificial 399 * benchmark from 2 Gbyte to 150 Mbyte. 400 */ 401 rcu_request_urgent_qs_task(current); 402 return; 403 } 404 405 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); 406 407 inc_active(c, &flags); 408 WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); 409 c->waiting_for_gp_tail = c->free_by_rcu_tail; 410 dec_active(c, &flags); 411 412 if (unlikely(READ_ONCE(c->draining))) { 413 free_all(c, llist_del_all(&c->waiting_for_gp), !!c->percpu_size); 414 atomic_set(&c->call_rcu_in_progress, 0); 415 } else { 416 call_rcu_hurry(&c->rcu, __free_by_rcu); 417 } 418 } 419 420 static void bpf_mem_refill(struct irq_work *work) 421 { 422 struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); 423 int cnt; 424 425 /* Racy access to free_cnt. It doesn't need to be 100% accurate */ 426 cnt = c->free_cnt; 427 if (cnt < c->low_watermark) 428 /* irq_work runs on this cpu and kmalloc will allocate 429 * from the current numa node which is what we want here. 430 */ 431 alloc_bulk(c, c->batch, NUMA_NO_NODE, true); 432 else if (cnt > c->high_watermark) 433 free_bulk(c); 434 435 check_free_by_rcu(c); 436 } 437 438 static void notrace irq_work_raise(struct bpf_mem_cache *c) 439 { 440 irq_work_queue(&c->refill_work); 441 } 442 443 /* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket 444 * the freelist cache will be elem_size * 64 (or less) on each cpu. 445 * 446 * For bpf programs that don't have statically known allocation sizes and 447 * assuming (low_mark + high_mark) / 2 as an average number of elements per 448 * bucket and all buckets are used the total amount of memory in freelists 449 * on each cpu will be: 450 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 451 * == ~ 116 Kbyte using below heuristic. 452 * Initialized, but unused bpf allocator (not bpf map specific one) will 453 * consume ~ 11 Kbyte per cpu. 454 * Typical case will be between 11K and 116K closer to 11K. 455 * bpf progs can and should share bpf_mem_cache when possible. 456 * 457 * Percpu allocation is typically rare. To avoid potential unnecessary large 458 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. 459 */ 460 static void init_refill_work(struct bpf_mem_cache *c) 461 { 462 init_irq_work(&c->refill_work, bpf_mem_refill); 463 if (c->percpu_size) { 464 c->low_watermark = 1; 465 c->high_watermark = 3; 466 } else if (c->unit_size <= 256) { 467 c->low_watermark = 32; 468 c->high_watermark = 96; 469 } else { 470 /* When page_size == 4k, order-0 cache will have low_mark == 2 471 * and high_mark == 6 with batch alloc of 3 individual pages at 472 * a time. 473 * 8k allocs and above low == 1, high == 3, batch == 1. 474 */ 475 c->low_watermark = max(32 * 256 / c->unit_size, 1); 476 c->high_watermark = max(96 * 256 / c->unit_size, 3); 477 } 478 c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); 479 } 480 481 static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) 482 { 483 int cnt = 1; 484 485 /* To avoid consuming memory, for non-percpu allocation, assume that 486 * 1st run of bpf prog won't be doing more than 4 map_update_elem from 487 * irq disabled region if unit size is less than or equal to 256. 488 * For all other cases, let us just do one allocation. 489 */ 490 if (!c->percpu_size && c->unit_size <= 256) 491 cnt = 4; 492 alloc_bulk(c, cnt, cpu_to_node(cpu), false); 493 } 494 495 /* When size != 0 bpf_mem_cache for each cpu. 496 * This is typical bpf hash map use case when all elements have equal size. 497 * 498 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on 499 * kmalloc/kfree. Max allocation size is 4096 in this case. 500 * This is bpf_dynptr and bpf_kptr use case. 501 */ 502 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) 503 { 504 struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; 505 struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc; 506 struct obj_cgroup *objcg = NULL; 507 int cpu, i, unit_size, percpu_size = 0; 508 509 if (percpu && size == 0) 510 return -EINVAL; 511 512 /* room for llist_node and per-cpu pointer */ 513 if (percpu) 514 percpu_size = LLIST_NODE_SZ + sizeof(void *); 515 ma->percpu = percpu; 516 517 if (size) { 518 pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); 519 if (!pc) 520 return -ENOMEM; 521 522 if (!percpu) 523 size += LLIST_NODE_SZ; /* room for llist_node */ 524 unit_size = size; 525 526 #ifdef CONFIG_MEMCG 527 if (memcg_bpf_enabled()) 528 objcg = get_obj_cgroup_from_current(); 529 #endif 530 ma->objcg = objcg; 531 532 for_each_possible_cpu(cpu) { 533 c = per_cpu_ptr(pc, cpu); 534 c->unit_size = unit_size; 535 c->objcg = objcg; 536 c->percpu_size = percpu_size; 537 c->tgt = c; 538 init_refill_work(c); 539 prefill_mem_cache(c, cpu); 540 } 541 ma->cache = pc; 542 return 0; 543 } 544 545 pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); 546 if (!pcc) 547 return -ENOMEM; 548 #ifdef CONFIG_MEMCG 549 objcg = get_obj_cgroup_from_current(); 550 #endif 551 ma->objcg = objcg; 552 for_each_possible_cpu(cpu) { 553 cc = per_cpu_ptr(pcc, cpu); 554 for (i = 0; i < NUM_CACHES; i++) { 555 c = &cc->cache[i]; 556 c->unit_size = sizes[i]; 557 c->objcg = objcg; 558 c->percpu_size = percpu_size; 559 c->tgt = c; 560 561 init_refill_work(c); 562 prefill_mem_cache(c, cpu); 563 } 564 } 565 566 ma->caches = pcc; 567 return 0; 568 } 569 570 int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) 571 { 572 struct bpf_mem_caches __percpu *pcc; 573 574 pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); 575 if (!pcc) 576 return -ENOMEM; 577 578 ma->caches = pcc; 579 ma->objcg = objcg; 580 ma->percpu = true; 581 return 0; 582 } 583 584 int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) 585 { 586 struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; 587 int cpu, i, unit_size, percpu_size; 588 struct obj_cgroup *objcg; 589 struct bpf_mem_cache *c; 590 591 i = bpf_mem_cache_idx(size); 592 if (i < 0) 593 return -EINVAL; 594 595 /* room for llist_node and per-cpu pointer */ 596 percpu_size = LLIST_NODE_SZ + sizeof(void *); 597 598 unit_size = sizes[i]; 599 objcg = ma->objcg; 600 pcc = ma->caches; 601 602 for_each_possible_cpu(cpu) { 603 cc = per_cpu_ptr(pcc, cpu); 604 c = &cc->cache[i]; 605 if (c->unit_size) 606 break; 607 608 c->unit_size = unit_size; 609 c->objcg = objcg; 610 c->percpu_size = percpu_size; 611 c->tgt = c; 612 613 init_refill_work(c); 614 prefill_mem_cache(c, cpu); 615 } 616 617 return 0; 618 } 619 620 static void drain_mem_cache(struct bpf_mem_cache *c) 621 { 622 bool percpu = !!c->percpu_size; 623 624 /* No progs are using this bpf_mem_cache, but htab_map_free() called 625 * bpf_mem_cache_free() for all remaining elements and they can be in 626 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now. 627 * 628 * Except for waiting_for_gp_ttrace list, there are no concurrent operations 629 * on these lists, so it is safe to use __llist_del_all(). 630 */ 631 free_all(c, llist_del_all(&c->free_by_rcu_ttrace), percpu); 632 free_all(c, llist_del_all(&c->waiting_for_gp_ttrace), percpu); 633 free_all(c, __llist_del_all(&c->free_llist), percpu); 634 free_all(c, __llist_del_all(&c->free_llist_extra), percpu); 635 free_all(c, __llist_del_all(&c->free_by_rcu), percpu); 636 free_all(c, __llist_del_all(&c->free_llist_extra_rcu), percpu); 637 free_all(c, llist_del_all(&c->waiting_for_gp), percpu); 638 } 639 640 static void check_mem_cache(struct bpf_mem_cache *c) 641 { 642 WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace)); 643 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); 644 WARN_ON_ONCE(!llist_empty(&c->free_llist)); 645 WARN_ON_ONCE(!llist_empty(&c->free_llist_extra)); 646 WARN_ON_ONCE(!llist_empty(&c->free_by_rcu)); 647 WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu)); 648 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); 649 } 650 651 static void check_leaked_objs(struct bpf_mem_alloc *ma) 652 { 653 struct bpf_mem_caches *cc; 654 struct bpf_mem_cache *c; 655 int cpu, i; 656 657 if (ma->cache) { 658 for_each_possible_cpu(cpu) { 659 c = per_cpu_ptr(ma->cache, cpu); 660 check_mem_cache(c); 661 } 662 } 663 if (ma->caches) { 664 for_each_possible_cpu(cpu) { 665 cc = per_cpu_ptr(ma->caches, cpu); 666 for (i = 0; i < NUM_CACHES; i++) { 667 c = &cc->cache[i]; 668 check_mem_cache(c); 669 } 670 } 671 } 672 } 673 674 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) 675 { 676 /* We can free dtor ctx only once all callbacks are done using it. */ 677 if (ma->dtor_ctx_free) 678 ma->dtor_ctx_free(ma->dtor_ctx); 679 check_leaked_objs(ma); 680 free_percpu(ma->cache); 681 free_percpu(ma->caches); 682 ma->cache = NULL; 683 ma->caches = NULL; 684 } 685 686 static void free_mem_alloc(struct bpf_mem_alloc *ma) 687 { 688 /* 689 * waiting_for_gp[_ttrace] lists were drained, but RCU callbacks 690 * might still execute. Wait for them. 691 * 692 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), 693 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used 694 * to wait for the pending __free_by_rcu(), and __free_rcu(). RCU Tasks 695 * Trace grace period implies RCU grace period, so all __free_rcu don't 696 * need extra call_rcu() (and thus extra rcu_barrier() here). 697 */ 698 rcu_barrier(); /* wait for __free_by_rcu */ 699 rcu_barrier_tasks_trace(); /* wait for __free_rcu */ 700 free_mem_alloc_no_barrier(ma); 701 } 702 703 static void free_mem_alloc_deferred(struct work_struct *work) 704 { 705 struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); 706 707 free_mem_alloc(ma); 708 kfree(ma); 709 } 710 711 static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) 712 { 713 struct bpf_mem_alloc *copy; 714 715 if (!rcu_in_progress) { 716 /* Fast path. No callbacks are pending, hence no need to do 717 * rcu_barrier-s. 718 */ 719 free_mem_alloc_no_barrier(ma); 720 return; 721 } 722 723 copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL); 724 if (!copy) { 725 /* Slow path with inline barrier-s */ 726 free_mem_alloc(ma); 727 return; 728 } 729 730 /* Defer barriers into worker to let the rest of map memory to be freed */ 731 memset(ma, 0, sizeof(*ma)); 732 INIT_WORK(©->work, free_mem_alloc_deferred); 733 queue_work(system_dfl_wq, ©->work); 734 } 735 736 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) 737 { 738 struct bpf_mem_caches *cc; 739 struct bpf_mem_cache *c; 740 int cpu, i, rcu_in_progress; 741 742 if (ma->cache) { 743 rcu_in_progress = 0; 744 for_each_possible_cpu(cpu) { 745 c = per_cpu_ptr(ma->cache, cpu); 746 WRITE_ONCE(c->draining, true); 747 irq_work_sync(&c->refill_work); 748 drain_mem_cache(c); 749 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); 750 rcu_in_progress += atomic_read(&c->call_rcu_in_progress); 751 } 752 obj_cgroup_put(ma->objcg); 753 destroy_mem_alloc(ma, rcu_in_progress); 754 } 755 if (ma->caches) { 756 rcu_in_progress = 0; 757 for_each_possible_cpu(cpu) { 758 cc = per_cpu_ptr(ma->caches, cpu); 759 for (i = 0; i < NUM_CACHES; i++) { 760 c = &cc->cache[i]; 761 WRITE_ONCE(c->draining, true); 762 irq_work_sync(&c->refill_work); 763 drain_mem_cache(c); 764 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); 765 rcu_in_progress += atomic_read(&c->call_rcu_in_progress); 766 } 767 } 768 obj_cgroup_put(ma->objcg); 769 destroy_mem_alloc(ma, rcu_in_progress); 770 } 771 } 772 773 /* notrace is necessary here and in other functions to make sure 774 * bpf programs cannot attach to them and cause llist corruptions. 775 */ 776 static void notrace *unit_alloc(struct bpf_mem_cache *c) 777 { 778 struct llist_node *llnode = NULL; 779 unsigned long flags; 780 int cnt = 0; 781 782 /* Disable irqs to prevent the following race for majority of prog types: 783 * prog_A 784 * bpf_mem_alloc 785 * preemption or irq -> prog_B 786 * bpf_mem_alloc 787 * 788 * but prog_B could be a perf_event NMI prog. 789 * Use per-cpu 'active' counter to order free_list access between 790 * unit_alloc/unit_free/bpf_mem_refill. 791 */ 792 local_irq_save(flags); 793 if (local_inc_return(&c->active) == 1) { 794 llnode = __llist_del_first(&c->free_llist); 795 if (llnode) { 796 cnt = --c->free_cnt; 797 *(struct bpf_mem_cache **)llnode = c; 798 } 799 } 800 local_dec(&c->active); 801 802 WARN_ON(cnt < 0); 803 804 if (cnt < c->low_watermark) 805 irq_work_raise(c); 806 /* Enable IRQ after the enqueue of irq work completes, so irq work 807 * will run after IRQ is enabled and free_llist may be refilled by 808 * irq work before other task preempts current task. 809 */ 810 local_irq_restore(flags); 811 812 return llnode; 813 } 814 815 /* Though 'ptr' object could have been allocated on a different cpu 816 * add it to the free_llist of the current cpu. 817 * Let kfree() logic deal with it when it's later called from irq_work. 818 */ 819 static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) 820 { 821 struct llist_node *llnode = ptr - LLIST_NODE_SZ; 822 unsigned long flags; 823 int cnt = 0; 824 825 BUILD_BUG_ON(LLIST_NODE_SZ > 8); 826 827 /* 828 * Remember bpf_mem_cache that allocated this object. 829 * The hint is not accurate. 830 */ 831 c->tgt = *(struct bpf_mem_cache **)llnode; 832 833 local_irq_save(flags); 834 if (local_inc_return(&c->active) == 1) { 835 __llist_add(llnode, &c->free_llist); 836 cnt = ++c->free_cnt; 837 } else { 838 /* unit_free() cannot fail. Therefore add an object to atomic 839 * llist. free_bulk() will drain it. Though free_llist_extra is 840 * a per-cpu list we have to use atomic llist_add here, since 841 * it also can be interrupted by bpf nmi prog that does another 842 * unit_free() into the same free_llist_extra. 843 */ 844 llist_add(llnode, &c->free_llist_extra); 845 } 846 local_dec(&c->active); 847 848 if (cnt > c->high_watermark) 849 /* free few objects from current cpu into global kmalloc pool */ 850 irq_work_raise(c); 851 /* Enable IRQ after irq_work_raise() completes, otherwise when current 852 * task is preempted by task which does unit_alloc(), unit_alloc() may 853 * return NULL unexpectedly because irq work is already pending but can 854 * not been triggered and free_llist can not be refilled timely. 855 */ 856 local_irq_restore(flags); 857 } 858 859 static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) 860 { 861 struct llist_node *llnode = ptr - LLIST_NODE_SZ; 862 unsigned long flags; 863 864 c->tgt = *(struct bpf_mem_cache **)llnode; 865 866 local_irq_save(flags); 867 if (local_inc_return(&c->active) == 1) { 868 if (__llist_add(llnode, &c->free_by_rcu)) 869 c->free_by_rcu_tail = llnode; 870 } else { 871 llist_add(llnode, &c->free_llist_extra_rcu); 872 } 873 local_dec(&c->active); 874 875 if (!atomic_read(&c->call_rcu_in_progress)) 876 irq_work_raise(c); 877 local_irq_restore(flags); 878 } 879 880 /* Called from BPF program or from sys_bpf syscall. 881 * In both cases migration is disabled. 882 */ 883 void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) 884 { 885 int idx; 886 void *ret; 887 888 if (!size) 889 return NULL; 890 891 if (!ma->percpu) 892 size += LLIST_NODE_SZ; 893 idx = bpf_mem_cache_idx(size); 894 if (idx < 0) 895 return NULL; 896 897 ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); 898 return !ret ? NULL : ret + LLIST_NODE_SZ; 899 } 900 901 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) 902 { 903 struct bpf_mem_cache *c; 904 int idx; 905 906 if (!ptr) 907 return; 908 909 c = *(void **)(ptr - LLIST_NODE_SZ); 910 idx = bpf_mem_cache_idx(c->unit_size); 911 if (WARN_ON_ONCE(idx < 0)) 912 return; 913 914 unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); 915 } 916 917 void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) 918 { 919 struct bpf_mem_cache *c; 920 int idx; 921 922 if (!ptr) 923 return; 924 925 c = *(void **)(ptr - LLIST_NODE_SZ); 926 idx = bpf_mem_cache_idx(c->unit_size); 927 if (WARN_ON_ONCE(idx < 0)) 928 return; 929 930 unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); 931 } 932 933 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) 934 { 935 void *ret; 936 937 ret = unit_alloc(this_cpu_ptr(ma->cache)); 938 return !ret ? NULL : ret + LLIST_NODE_SZ; 939 } 940 941 void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) 942 { 943 if (!ptr) 944 return; 945 946 unit_free(this_cpu_ptr(ma->cache), ptr); 947 } 948 949 void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) 950 { 951 if (!ptr) 952 return; 953 954 unit_free_rcu(this_cpu_ptr(ma->cache), ptr); 955 } 956 957 /* Directly does a kfree() without putting 'ptr' back to the free_llist 958 * for reuse and without waiting for a rcu_tasks_trace gp. 959 * The caller must first go through the rcu_tasks_trace gp for 'ptr' 960 * before calling bpf_mem_cache_raw_free(). 961 * It could be used when the rcu_tasks_trace callback does not have 962 * a hold on the original bpf_mem_alloc object that allocated the 963 * 'ptr'. This should only be used in the uncommon code path. 964 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled 965 * and may affect performance. 966 */ 967 void bpf_mem_cache_raw_free(void *ptr) 968 { 969 if (!ptr) 970 return; 971 972 kfree(ptr - LLIST_NODE_SZ); 973 } 974 975 /* When flags == GFP_KERNEL, it signals that the caller will not cause 976 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use 977 * kmalloc if the free_llist is empty. 978 */ 979 void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) 980 { 981 struct bpf_mem_cache *c; 982 void *ret; 983 984 c = this_cpu_ptr(ma->cache); 985 986 ret = unit_alloc(c); 987 if (!ret && flags == GFP_KERNEL) { 988 struct mem_cgroup *memcg, *old_memcg; 989 990 memcg = get_memcg(c); 991 old_memcg = set_active_memcg(memcg); 992 ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); 993 if (ret) 994 *(struct bpf_mem_cache **)ret = c; 995 set_active_memcg(old_memcg); 996 mem_cgroup_put(memcg); 997 } 998 999 return !ret ? NULL : ret + LLIST_NODE_SZ; 1000 } 1001 1002 int bpf_mem_alloc_check_size(bool percpu, size_t size) 1003 { 1004 /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ 1005 if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || 1006 (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) 1007 return -E2BIG; 1008 1009 return 0; 1010 } 1011 1012 void bpf_mem_alloc_set_dtor(struct bpf_mem_alloc *ma, void (*dtor)(void *obj, void *ctx), 1013 void (*dtor_ctx_free)(void *ctx), void *ctx) 1014 { 1015 struct bpf_mem_caches *cc; 1016 struct bpf_mem_cache *c; 1017 int cpu, i; 1018 1019 ma->dtor_ctx_free = dtor_ctx_free; 1020 ma->dtor_ctx = ctx; 1021 1022 if (ma->cache) { 1023 for_each_possible_cpu(cpu) { 1024 c = per_cpu_ptr(ma->cache, cpu); 1025 c->dtor = dtor; 1026 c->dtor_ctx = ctx; 1027 } 1028 } 1029 if (ma->caches) { 1030 for_each_possible_cpu(cpu) { 1031 cc = per_cpu_ptr(ma->caches, cpu); 1032 for (i = 0; i < NUM_CACHES; i++) { 1033 c = &cc->cache[i]; 1034 c->dtor = dtor; 1035 c->dtor_ctx = ctx; 1036 } 1037 } 1038 } 1039 } 1040