// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 *   CPU_0 buckets
 *   16 32 64 96 128 196 256 512 1024 2048 4096
 *   ...
 *   CPU_N buckets
 *   16 32 64 96 128 196 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from the cache of the current cpu with irqs disabled.
 * Freeing is always done into the bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so the global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain
 * struct llist_node.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)
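
/* Illustrative sketch, not part of the original source: how a hypothetical
 * map-like user of this allocator would typically drive the API. The names
 * my_elem and err below are made up and error handling is elided:
 *
 *	struct my_elem { u64 key; u64 value; };
 *	struct bpf_mem_alloc ma;
 *	struct my_elem *e;
 *	int err;
 *
 *	err = bpf_mem_alloc_init(&ma, sizeof(struct my_elem), false);
 *
 *	e = bpf_mem_cache_alloc(&ma);	(any context, including NMI; may
 *					 return NULL until the irq_work
 *					 refill has run)
 *	...
 *	bpf_mem_cache_free(&ma, e);
 *
 *	bpf_mem_alloc_destroy(&ma);	(on map teardown)
 */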

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > 4096)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}

#define NUM_CACHES 11
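
/* Worked examples of the size -> bucket mapping above (illustrative only):
 * a 100-byte request uses size_index[(100 - 1) / 8] - 1 = size_index[12] - 1
 * = 6 - 1 = 5, i.e. the 128-byte bucket in the sizes[] array below, while a
 * 1000-byte request uses fls(999) - 2 = 10 - 2 = 8, i.e. the 1024-byte bucket.
 */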

struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_llist from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void **obj = kmalloc_node(c->percpu_size, flags, node);
		void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG_KMEM
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
#endif

#ifdef CONFIG_MEMCG
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}

/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu) {
		free_percpu(((void **)obj)[1]);
		kfree(obj);
		return;
	}

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}

static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}

static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
 * the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using below heuristic.
 * An initialized, but unused bpf allocator (not a bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * Typical case will be between 11K and 116K, closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potential unnecessary large
 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}
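
/* Worked examples of the watermark heuristic above (illustrative only):
 * for a 512-byte unit_size the values come out as low = 32*256/512 = 16,
 * high = 96*256/512 = 48 and batch = (48-16)/4*3 = 24; for a 4096-byte
 * (order-0 page) cache they are low = 2, high = 6 and batch = 3, matching
 * the comment inside init_refill_work().
 */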

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}

/* When size != 0 allocate one bpf_mem_cache for each cpu.
 * This is the typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is the bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	struct bpf_mem_cache *c, __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG_KMEM
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc, __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}

static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of the map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_unbound_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		if (ma->objcg)
			obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_llist access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQs after the enqueue of the irq work completes, so the irq
	 * work will run after IRQs are enabled and free_llist may be refilled
	 * by the irq work before another task preempts the current task.
	 */
	local_irq_restore(flags);

	return llnode;
}
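
/* Illustrative sketch, not part of the original source, of the hidden 8-byte
 * header that unit_alloc()/unit_free() rely on. The struct name below is made
 * up; the real code simply reinterprets the first 8 bytes of the object:
 *
 *	struct hidden_hdr {
 *		union {
 *			struct llist_node node;		linkage while the object
 *							sits on a free list
 *			struct bpf_mem_cache *owner;	written by unit_alloc(),
 *							read into c->tgt by
 *							unit_free()
 *		};
 *	};
 *
 * bpf_mem_alloc() and friends return ptr = obj + LLIST_NODE_SZ, and
 * bpf_mem_free() recovers the header with ptr - LLIST_NODE_SZ.
 */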

/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember the bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQs after irq_work_raise() completes, otherwise when the
	 * current task is preempted by a task which does unit_alloc(),
	 * unit_alloc() may return NULL unexpectedly because the irq work is
	 * already pending but cannot be triggered and free_llist cannot be
	 * refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}

/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled,
 * which may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}
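
/* Illustrative sketch, not part of the original source: a caller running in
 * sleepable (e.g. syscall) context, where kmalloc() cannot deadlock, may pass
 * GFP_KERNEL so that an empty free_llist falls back to a direct allocation
 * instead of returning NULL. 'ma' below is assumed to be a bpf_mem_alloc
 * already set up with bpf_mem_alloc_init():
 *
 *	void *obj = bpf_mem_cache_alloc_flags(&ma, GFP_KERNEL);
 *
 * Callers in unknown (possibly NMI) context must instead use
 * bpf_mem_cache_alloc(&ma), which only ever takes from the per-cpu cache.
 */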