// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
#include <linux/mm.h>
#include <linux/llist.h>
#include <linux/bpf.h>
#include <linux/irq_work.h>
#include <linux/bpf_mem_alloc.h>
#include <linux/memcontrol.h>
#include <asm/local.h>

/* Any context (including NMI) BPF specific memory allocator.
 *
 * Tracing BPF programs can attach to kprobe and fentry. Hence they
 * run in unknown context where calling plain kmalloc() might not be safe.
 *
 * Front-end kmalloc() with per-cpu per-bucket cache of free elements.
 * Refill this cache asynchronously from irq_work.
 *
 *   CPU_0 buckets
 *   16 32 64 96 128 192 256 512 1024 2048 4096
 *   ...
 *   CPU_N buckets
 *   16 32 64 96 128 192 256 512 1024 2048 4096
 *
 * The buckets are prefilled at the start.
 * BPF programs always run with migration disabled.
 * It's safe to allocate from cache of the current cpu with irqs disabled.
 * Free-ing is always done into bucket of the current cpu as well.
 * irq_work trims extra free elements from buckets with kfree
 * and refills them with kmalloc, so global kmalloc logic takes care
 * of freeing objects allocated by one cpu and freed on another.
 *
 * Every allocated object is padded with extra 8 bytes that contain
 * struct llist_node.
 */
#define LLIST_NODE_SZ sizeof(struct llist_node)

#define BPF_MEM_ALLOC_SIZE_MAX 4096

/* similar to kmalloc, but sizeof == 8 bucket is gone */
static u8 size_index[24] __ro_after_init = {
	3,	/* 8 */
	3,	/* 16 */
	4,	/* 24 */
	4,	/* 32 */
	5,	/* 40 */
	5,	/* 48 */
	5,	/* 56 */
	5,	/* 64 */
	1,	/* 72 */
	1,	/* 80 */
	1,	/* 88 */
	1,	/* 96 */
	6,	/* 104 */
	6,	/* 112 */
	6,	/* 120 */
	6,	/* 128 */
	2,	/* 136 */
	2,	/* 144 */
	2,	/* 152 */
	2,	/* 160 */
	2,	/* 168 */
	2,	/* 176 */
	2,	/* 184 */
	2	/* 192 */
};

static int bpf_mem_cache_idx(size_t size)
{
	if (!size || size > BPF_MEM_ALLOC_SIZE_MAX)
		return -1;

	if (size <= 192)
		return size_index[(size - 1) / 8] - 1;

	return fls(size - 1) - 2;
}

#define NUM_CACHES 11
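/* A few worked examples of the size -> bucket mapping above (the requested
 * sizes are illustrative, the arithmetic follows bpf_mem_cache_idx()):
 *
 *	bpf_mem_cache_idx(16)   -> size_index[1]  - 1 = 2,  the 16-byte bucket
 *	bpf_mem_cache_idx(20)   -> size_index[2]  - 1 = 3,  the 32-byte bucket
 *	bpf_mem_cache_idx(96)   -> size_index[11] - 1 = 0,  the 96-byte bucket
 *	bpf_mem_cache_idx(200)  -> fls(199) - 2       = 6,  the 256-byte bucket
 *	bpf_mem_cache_idx(4096) -> fls(4095) - 2      = 10, the 4096-byte bucket
 *
 * A size of 0 or above BPF_MEM_ALLOC_SIZE_MAX returns -1.
 */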
struct bpf_mem_cache {
	/* per-cpu list of free objects of size 'unit_size'.
	 * All accesses are done with interrupts disabled and 'active' counter
	 * protection with __llist_add() and __llist_del_first().
	 */
	struct llist_head free_llist;
	local_t active;

	/* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill
	 * are sequenced by per-cpu 'active' counter. But unit_free() cannot
	 * fail. When 'active' is busy the unit_free() will add an object to
	 * free_llist_extra.
	 */
	struct llist_head free_llist_extra;

	struct irq_work refill_work;
	struct obj_cgroup *objcg;
	int unit_size;
	/* count of objects in free_llist */
	int free_cnt;
	int low_watermark, high_watermark, batch;
	int percpu_size;
	bool draining;
	struct bpf_mem_cache *tgt;

	/* list of objects to be freed after RCU GP */
	struct llist_head free_by_rcu;
	struct llist_node *free_by_rcu_tail;
	struct llist_head waiting_for_gp;
	struct llist_node *waiting_for_gp_tail;
	struct rcu_head rcu;
	atomic_t call_rcu_in_progress;
	struct llist_head free_llist_extra_rcu;

	/* list of objects to be freed after RCU tasks trace GP */
	struct llist_head free_by_rcu_ttrace;
	struct llist_head waiting_for_gp_ttrace;
	struct rcu_head rcu_ttrace;
	atomic_t call_rcu_ttrace_in_progress;
};

struct bpf_mem_caches {
	struct bpf_mem_cache cache[NUM_CACHES];
};

static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};

static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = head->first;
	if (!entry)
		return NULL;
	next = entry->next;
	head->first = next;
	return entry;
}

static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
	if (c->percpu_size) {
		void __percpu **obj = kmalloc_node(c->percpu_size, flags, node);
		void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);

		if (!obj || !pptr) {
			free_percpu(pptr);
			kfree(obj);
			return NULL;
		}
		obj[1] = pptr;
		return obj;
	}

	return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node);
}

static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
{
#ifdef CONFIG_MEMCG
	if (c->objcg)
		return get_mem_cgroup_from_objcg(c->objcg);
	return root_mem_cgroup;
#else
	return NULL;
#endif
}

static void inc_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		/* In RT irq_work runs in per-cpu kthread, so disable
		 * interrupts to avoid preemption and interrupts and
		 * reduce the chance of bpf prog executing on this cpu
		 * when active counter is busy.
		 */
		local_irq_save(*flags);
	/* alloc_bulk runs from irq_work which will not preempt a bpf
	 * program that does unit_alloc/unit_free since IRQs are
	 * disabled there. There is no race to increment 'active'
	 * counter. It protects free_llist from corruption in case NMI
	 * bpf prog preempted this loop.
	 */
	WARN_ON_ONCE(local_inc_return(&c->active) != 1);
}

static void dec_active(struct bpf_mem_cache *c, unsigned long *flags)
{
	local_dec(&c->active);
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		local_irq_restore(*flags);
}

static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj)
{
	unsigned long flags;

	inc_active(c, &flags);
	__llist_add(obj, &c->free_llist);
	c->free_cnt++;
	dec_active(c, &flags);
}
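/* A rough sketch of the percpu element produced by __alloc() above: it is a
 * small kmalloc-ed header of c->percpu_size bytes. Slot [0] is reused for
 * the struct llist_node while the element sits on a free list, and slot [1]
 * holds the pointer returned by __alloc_percpu_gfp(). free_one() below
 * undoes both allocations.
 *
 *	+------------------------+------------------------+
 *	| llist_node / cache ptr | void __percpu *pptr    |
 *	+------------------------+------------------------+
 *	  obj[0]                   obj[1]
 */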
/* Mostly runs from irq_work except __init phase. */
static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
{
	struct mem_cgroup *memcg = NULL, *old_memcg;
	gfp_t gfp;
	void *obj;
	int i;

	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;

	for (i = 0; i < cnt; i++) {
		/*
		 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is
		 * done only by one CPU == current CPU. Other CPUs might
		 * llist_add() and llist_del_all() in parallel.
		 */
		obj = llist_del_first(&c->free_by_rcu_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	for (; i < cnt; i++) {
		obj = llist_del_first(&c->waiting_for_gp_ttrace);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	if (i >= cnt)
		return;

	memcg = get_memcg(c);
	old_memcg = set_active_memcg(memcg);
	for (; i < cnt; i++) {
		/* Allocate, but don't deplete atomic reserves that typical
		 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
		 * will allocate from the current numa node which is what we
		 * want here.
		 */
		obj = __alloc(c, node, gfp);
		if (!obj)
			break;
		add_obj_to_free_list(c, obj);
	}
	set_active_memcg(old_memcg);
	mem_cgroup_put(memcg);
}

static void free_one(void *obj, bool percpu)
{
	if (percpu)
		free_percpu(((void __percpu **)obj)[1]);

	kfree(obj);
}

static int free_all(struct llist_node *llnode, bool percpu)
{
	struct llist_node *pos, *t;
	int cnt = 0;

	llist_for_each_safe(pos, t, llnode) {
		free_one(pos, percpu);
		cnt++;
	}
	return cnt;
}

static void __free_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace);

	free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size);
	atomic_set(&c->call_rcu_ttrace_in_progress, 0);
}

static void __free_rcu_tasks_trace(struct rcu_head *head)
{
	/* If RCU Tasks Trace grace period implies RCU grace period,
	 * there is no need to invoke call_rcu().
	 */
	if (rcu_trace_implies_rcu_gp())
		__free_rcu(head);
	else
		call_rcu(head, __free_rcu);
}

static void enque_to_free(struct bpf_mem_cache *c, void *obj)
{
	struct llist_node *llnode = obj;

	/* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work.
	 * Nothing races to add to free_by_rcu_ttrace list.
	 */
	llist_add(llnode, &c->free_by_rcu_ttrace);
}

static void do_call_rcu_ttrace(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;

	if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) {
		if (unlikely(READ_ONCE(c->draining))) {
			llnode = llist_del_all(&c->free_by_rcu_ttrace);
			free_all(llnode, !!c->percpu_size);
		}
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace))
		llist_add(llnode, &c->waiting_for_gp_ttrace);

	if (unlikely(READ_ONCE(c->draining))) {
		__free_rcu(&c->rcu_ttrace);
		return;
	}

	/* Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * If RCU Tasks Trace grace period implies RCU grace period, free
	 * these elements directly, else use call_rcu() to wait for normal
	 * progs to finish and finally do free_one() on each element.
	 */
	call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace);
}
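/* The tasks-trace freeing pipeline above, in order (a summary of the helpers
 * above, not additional machinery):
 *
 *	enque_to_free()            llist_add() onto tgt->free_by_rcu_ttrace
 *	do_call_rcu_ttrace()       move free_by_rcu_ttrace -> waiting_for_gp_ttrace,
 *	                           then call_rcu_tasks_trace()
 *	__free_rcu_tasks_trace()   runs after the tasks-trace GP; chains call_rcu()
 *	                           unless rcu_trace_implies_rcu_gp()
 *	__free_rcu()               free_all() on waiting_for_gp_ttrace
 *
 * Objects can also be pulled back from either list by alloc_bulk() to
 * satisfy a refill before the grace periods complete.
 */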
static void free_bulk(struct bpf_mem_cache *c)
{
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode, *t;
	unsigned long flags;
	int cnt;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	do {
		inc_active(c, &flags);
		llnode = __llist_del_first(&c->free_llist);
		if (llnode)
			cnt = --c->free_cnt;
		else
			cnt = 0;
		dec_active(c, &flags);
		if (llnode)
			enque_to_free(tgt, llnode);
	} while (cnt > (c->high_watermark + c->low_watermark) / 2);

	/* and drain free_llist_extra */
	llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra))
		enque_to_free(tgt, llnode);
	do_call_rcu_ttrace(tgt);
}

static void __free_by_rcu(struct rcu_head *head)
{
	struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu);
	struct bpf_mem_cache *tgt = c->tgt;
	struct llist_node *llnode;

	WARN_ON_ONCE(tgt->unit_size != c->unit_size);
	WARN_ON_ONCE(tgt->percpu_size != c->percpu_size);

	llnode = llist_del_all(&c->waiting_for_gp);
	if (!llnode)
		goto out;

	llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace);

	/* Objects went through regular RCU GP. Send them to RCU tasks trace */
	do_call_rcu_ttrace(tgt);
out:
	atomic_set(&c->call_rcu_in_progress, 0);
}

static void check_free_by_rcu(struct bpf_mem_cache *c)
{
	struct llist_node *llnode, *t;
	unsigned long flags;

	/* drain free_llist_extra_rcu */
	if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) {
		inc_active(c, &flags);
		llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu))
			if (__llist_add(llnode, &c->free_by_rcu))
				c->free_by_rcu_tail = llnode;
		dec_active(c, &flags);
	}

	if (llist_empty(&c->free_by_rcu))
		return;

	if (atomic_xchg(&c->call_rcu_in_progress, 1)) {
		/*
		 * Instead of kmalloc-ing new rcu_head and triggering 10k
		 * call_rcu() to hit rcutree.qhimark and force RCU to notice
		 * the overload just ask RCU to hurry up. There could be many
		 * objects in free_by_rcu list.
		 * This hint reduces memory consumption for an artificial
		 * benchmark from 2 Gbyte to 150 Mbyte.
		 */
		rcu_request_urgent_qs_task(current);
		return;
	}

	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));

	inc_active(c, &flags);
	WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu));
	c->waiting_for_gp_tail = c->free_by_rcu_tail;
	dec_active(c, &flags);

	if (unlikely(READ_ONCE(c->draining))) {
		free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size);
		atomic_set(&c->call_rcu_in_progress, 0);
	} else {
		call_rcu_hurry(&c->rcu, __free_by_rcu);
	}
}

static void bpf_mem_refill(struct irq_work *work)
{
	struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work);
	int cnt;

	/* Racy access to free_cnt. It doesn't need to be 100% accurate */
	cnt = c->free_cnt;
	if (cnt < c->low_watermark)
		/* irq_work runs on this cpu and kmalloc will allocate
		 * from the current numa node which is what we want here.
		 */
		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
	else if (cnt > c->high_watermark)
		free_bulk(c);

	check_free_by_rcu(c);
}
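/* How the watermarks play out for the common non-percpu case with
 * unit_size <= 256 (the numbers follow from init_refill_work() below):
 * low_watermark = 32, high_watermark = 96, batch = 48. A refill triggered
 * below 32 free objects allocates up to 48 more; a flush triggered above 96
 * drains the free list down to (96 + 32) / 2 = 64 objects.
 */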
static void notrace irq_work_raise(struct bpf_mem_cache *c)
{
	irq_work_queue(&c->refill_work);
}

/* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket
 * the freelist cache will be elem_size * 64 (or less) on each cpu.
 *
 * For bpf programs that don't have statically known allocation sizes and
 * assuming (low_mark + high_mark) / 2 as an average number of elements per
 * bucket and all buckets are used the total amount of memory in freelists
 * on each cpu will be:
 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*192 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096
 * == ~ 116 Kbyte using below heuristic.
 * Initialized, but unused bpf allocator (not bpf map specific one) will
 * consume ~ 11 Kbyte per cpu.
 * Typical case will be between 11K and 116K closer to 11K.
 * bpf progs can and should share bpf_mem_cache when possible.
 *
 * Percpu allocation is typically rare. To avoid potential unnecessary large
 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
 */
static void init_refill_work(struct bpf_mem_cache *c)
{
	init_irq_work(&c->refill_work, bpf_mem_refill);
	if (c->percpu_size) {
		c->low_watermark = 1;
		c->high_watermark = 3;
	} else if (c->unit_size <= 256) {
		c->low_watermark = 32;
		c->high_watermark = 96;
	} else {
		/* When page_size == 4k, order-0 cache will have low_mark == 2
		 * and high_mark == 6 with batch alloc of 3 individual pages at
		 * a time.
		 * 8k allocs and above low == 1, high == 3, batch == 1.
		 */
		c->low_watermark = max(32 * 256 / c->unit_size, 1);
		c->high_watermark = max(96 * 256 / c->unit_size, 3);
	}
	c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1);
}

static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
	int cnt = 1;

	/* To avoid consuming memory, for non-percpu allocation, assume that
	 * 1st run of bpf prog won't be doing more than 4 map_update_elem from
	 * irq disabled region if unit size is less than or equal to 256.
	 * For all other cases, let us just do one allocation.
	 */
	if (!c->percpu_size && c->unit_size <= 256)
		cnt = 4;
	alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}
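/* A rough usage sketch of the two modes set up below (the element struct and
 * error handling are illustrative only, not taken from a real caller):
 *
 *	struct bpf_mem_alloc ma = {};
 *	struct my_elem *e;
 *	void *p;
 *
 *	// fixed-size mode: one bucket per cpu, e.g. hash map elements
 *	if (bpf_mem_alloc_init(&ma, sizeof(struct my_elem), false))
 *		return -ENOMEM;
 *	e = bpf_mem_cache_alloc(&ma);
 *	...
 *	bpf_mem_cache_free(&ma, e);
 *	bpf_mem_alloc_destroy(&ma);
 *
 *	// size-based mode: init with size == 0, 11 buckets per cpu
 *	if (bpf_mem_alloc_init(&ma, 0, false))
 *		return -ENOMEM;
 *	p = bpf_mem_alloc(&ma, 64);
 *	...
 *	bpf_mem_free(&ma, p);
 *	bpf_mem_alloc_destroy(&ma);
 */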
/* When size != 0 allocate a single bpf_mem_cache for each cpu.
 * This is typical bpf hash map use case when all elements have equal size.
 *
 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on
 * kmalloc/kfree. Max allocation size is 4096 in this case.
 * This is bpf_dynptr and bpf_kptr use case.
 */
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
	struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
	struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc;
	struct obj_cgroup *objcg = NULL;
	int cpu, i, unit_size, percpu_size = 0;

	if (percpu && size == 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	if (percpu)
		percpu_size = LLIST_NODE_SZ + sizeof(void *);
	ma->percpu = percpu;

	if (size) {
		pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL);
		if (!pc)
			return -ENOMEM;

		if (!percpu)
			size += LLIST_NODE_SZ; /* room for llist_node */
		unit_size = size;

#ifdef CONFIG_MEMCG
		if (memcg_bpf_enabled())
			objcg = get_obj_cgroup_from_current();
#endif
		ma->objcg = objcg;

		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(pc, cpu);
			c->unit_size = unit_size;
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;
			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
		ma->cache = pc;
		return 0;
	}

	pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;
#ifdef CONFIG_MEMCG
	objcg = get_obj_cgroup_from_current();
#endif
	ma->objcg = objcg;
	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		for (i = 0; i < NUM_CACHES; i++) {
			c = &cc->cache[i];
			c->unit_size = sizes[i];
			c->objcg = objcg;
			c->percpu_size = percpu_size;
			c->tgt = c;

			init_refill_work(c);
			prefill_mem_cache(c, cpu);
		}
	}

	ma->caches = pcc;
	return 0;
}

int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
	struct bpf_mem_caches __percpu *pcc;

	pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
	if (!pcc)
		return -ENOMEM;

	ma->caches = pcc;
	ma->objcg = objcg;
	ma->percpu = true;
	return 0;
}

int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
	struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc;
	int cpu, i, unit_size, percpu_size;
	struct obj_cgroup *objcg;
	struct bpf_mem_cache *c;

	i = bpf_mem_cache_idx(size);
	if (i < 0)
		return -EINVAL;

	/* room for llist_node and per-cpu pointer */
	percpu_size = LLIST_NODE_SZ + sizeof(void *);

	unit_size = sizes[i];
	objcg = ma->objcg;
	pcc = ma->caches;

	for_each_possible_cpu(cpu) {
		cc = per_cpu_ptr(pcc, cpu);
		c = &cc->cache[i];
		if (c->unit_size)
			break;

		c->unit_size = unit_size;
		c->objcg = objcg;
		c->percpu_size = percpu_size;
		c->tgt = c;

		init_refill_work(c);
		prefill_mem_cache(c, cpu);
	}

	return 0;
}
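/* A rough sketch of the percpu two-step setup above (the sizes are
 * illustrative only): the caller creates the empty set of caches once and
 * then initializes only the bucket sizes it will actually need, each of
 * which gets prefilled lazily:
 *
 *	err = bpf_mem_alloc_percpu_init(&ma, objcg);
 *	...
 *	err = bpf_mem_alloc_percpu_unit_init(&ma, 32);
 *	err = bpf_mem_alloc_percpu_unit_init(&ma, 128);
 *	...
 *	p = bpf_mem_alloc(&ma, 32);
 */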
static void drain_mem_cache(struct bpf_mem_cache *c)
{
	bool percpu = !!c->percpu_size;

	/* No progs are using this bpf_mem_cache, but htab_map_free() called
	 * bpf_mem_cache_free() for all remaining elements and they can be in
	 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now.
	 *
	 * Except for waiting_for_gp_ttrace list, there are no concurrent operations
	 * on these lists, so it is safe to use __llist_del_all().
	 */
	free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu);
	free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu);
	free_all(__llist_del_all(&c->free_llist), percpu);
	free_all(__llist_del_all(&c->free_llist_extra), percpu);
	free_all(__llist_del_all(&c->free_by_rcu), percpu);
	free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu);
	free_all(llist_del_all(&c->waiting_for_gp), percpu);
}

static void check_mem_cache(struct bpf_mem_cache *c)
{
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace));
	WARN_ON_ONCE(!llist_empty(&c->free_llist));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra));
	WARN_ON_ONCE(!llist_empty(&c->free_by_rcu));
	WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu));
	WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp));
}

static void check_leaked_objs(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i;

	if (ma->cache) {
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			check_mem_cache(c);
		}
	}
	if (ma->caches) {
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				check_mem_cache(c);
			}
		}
	}
}

static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma)
{
	check_leaked_objs(ma);
	free_percpu(ma->cache);
	free_percpu(ma->caches);
	ma->cache = NULL;
	ma->caches = NULL;
}

static void free_mem_alloc(struct bpf_mem_alloc *ma)
{
	/* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks
	 * might still execute. Wait for them.
	 *
	 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(),
	 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used
	 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(),
	 * so if call_rcu(head, __free_rcu) is skipped due to
	 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by
	 * using rcu_trace_implies_rcu_gp() as well.
	 */
	rcu_barrier(); /* wait for __free_by_rcu */
	rcu_barrier_tasks_trace(); /* wait for __free_rcu */
	if (!rcu_trace_implies_rcu_gp())
		rcu_barrier();
	free_mem_alloc_no_barrier(ma);
}

static void free_mem_alloc_deferred(struct work_struct *work)
{
	struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work);

	free_mem_alloc(ma);
	kfree(ma);
}

static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress)
{
	struct bpf_mem_alloc *copy;

	if (!rcu_in_progress) {
		/* Fast path. No callbacks are pending, hence no need to do
		 * rcu_barrier-s.
		 */
		free_mem_alloc_no_barrier(ma);
		return;
	}

	copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL);
	if (!copy) {
		/* Slow path with inline barrier-s */
		free_mem_alloc(ma);
		return;
	}

	/* Defer barriers into worker to let the rest of map memory be freed */
	memset(ma, 0, sizeof(*ma));
	INIT_WORK(&copy->work, free_mem_alloc_deferred);
	queue_work(system_unbound_wq, &copy->work);
}

void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
{
	struct bpf_mem_caches *cc;
	struct bpf_mem_cache *c;
	int cpu, i, rcu_in_progress;

	if (ma->cache) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			c = per_cpu_ptr(ma->cache, cpu);
			WRITE_ONCE(c->draining, true);
			irq_work_sync(&c->refill_work);
			drain_mem_cache(c);
			rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
			rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
	if (ma->caches) {
		rcu_in_progress = 0;
		for_each_possible_cpu(cpu) {
			cc = per_cpu_ptr(ma->caches, cpu);
			for (i = 0; i < NUM_CACHES; i++) {
				c = &cc->cache[i];
				WRITE_ONCE(c->draining, true);
				irq_work_sync(&c->refill_work);
				drain_mem_cache(c);
				rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
				rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
			}
		}
		obj_cgroup_put(ma->objcg);
		destroy_mem_alloc(ma, rcu_in_progress);
	}
}

/* notrace is necessary here and in other functions to make sure
 * bpf programs cannot attach to them and cause llist corruptions.
 */
static void notrace *unit_alloc(struct bpf_mem_cache *c)
{
	struct llist_node *llnode = NULL;
	unsigned long flags;
	int cnt = 0;

	/* Disable irqs to prevent the following race for majority of prog types:
	 * prog_A
	 *   bpf_mem_alloc
	 *      preemption or irq -> prog_B
	 *        bpf_mem_alloc
	 *
	 * but prog_B could be a perf_event NMI prog.
	 * Use per-cpu 'active' counter to order free_list access between
	 * unit_alloc/unit_free/bpf_mem_refill.
	 */
	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		llnode = __llist_del_first(&c->free_llist);
		if (llnode) {
			cnt = --c->free_cnt;
			*(struct bpf_mem_cache **)llnode = c;
		}
	}
	local_dec(&c->active);

	WARN_ON(cnt < 0);

	if (cnt < c->low_watermark)
		irq_work_raise(c);
	/* Enable IRQ after the enqueue of irq work completes, so irq work
	 * will run after IRQ is enabled and free_llist may be refilled by
	 * irq work before other task preempts current task.
	 */
	local_irq_restore(flags);

	return llnode;
}
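/* Layout note for unit_alloc() above and unit_free() below: every element
 * carries a hidden LLIST_NODE_SZ header in front of the pointer handed to
 * the caller. While the element sits on a free list those 8 bytes are the
 * struct llist_node; once unit_alloc() hands it out they instead hold the
 * owning bpf_mem_cache pointer, which unit_free() reads back to set c->tgt
 * and bpf_mem_free() uses to find the right bucket. Roughly:
 *
 *	returned to caller:        [ cache ptr  ][ user data ... ]
 *	parked on a free list:     [ llist_node ][ stale data ... ]
 *	                            ^
 *	                            ptr - LLIST_NODE_SZ
 */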
/* Though 'ptr' object could have been allocated on a different cpu
 * add it to the free_llist of the current cpu.
 * Let kfree() logic deal with it when it's later called from irq_work.
 */
static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;
	int cnt = 0;

	BUILD_BUG_ON(LLIST_NODE_SZ > 8);

	/*
	 * Remember bpf_mem_cache that allocated this object.
	 * The hint is not accurate.
	 */
	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		__llist_add(llnode, &c->free_llist);
		cnt = ++c->free_cnt;
	} else {
		/* unit_free() cannot fail. Therefore add an object to atomic
		 * llist. free_bulk() will drain it. Though free_llist_extra is
		 * a per-cpu list we have to use atomic llist_add here, since
		 * it also can be interrupted by bpf nmi prog that does another
		 * unit_free() into the same free_llist_extra.
		 */
		llist_add(llnode, &c->free_llist_extra);
	}
	local_dec(&c->active);

	if (cnt > c->high_watermark)
		/* free few objects from current cpu into global kmalloc pool */
		irq_work_raise(c);
	/* Enable IRQ after irq_work_raise() completes, otherwise when current
	 * task is preempted by task which does unit_alloc(), unit_alloc() may
	 * return NULL unexpectedly because irq work is already pending but
	 * cannot be triggered and free_llist cannot be refilled in time.
	 */
	local_irq_restore(flags);
}

static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
{
	struct llist_node *llnode = ptr - LLIST_NODE_SZ;
	unsigned long flags;

	c->tgt = *(struct bpf_mem_cache **)llnode;

	local_irq_save(flags);
	if (local_inc_return(&c->active) == 1) {
		if (__llist_add(llnode, &c->free_by_rcu))
			c->free_by_rcu_tail = llnode;
	} else {
		llist_add(llnode, &c->free_llist_extra_rcu);
	}
	local_dec(&c->active);

	if (!atomic_read(&c->call_rcu_in_progress))
		irq_work_raise(c);
	local_irq_restore(flags);
}

/* Called from BPF program or from sys_bpf syscall.
 * In both cases migration is disabled.
 */
void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
{
	int idx;
	void *ret;

	if (!size)
		return NULL;

	if (!ma->percpu)
		size += LLIST_NODE_SZ;
	idx = bpf_mem_cache_idx(size);
	if (idx < 0)
		return NULL;

	ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx);
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	struct bpf_mem_cache *c;
	int idx;

	if (!ptr)
		return;

	c = *(void **)(ptr - LLIST_NODE_SZ);
	idx = bpf_mem_cache_idx(c->unit_size);
	if (WARN_ON_ONCE(idx < 0))
		return;

	unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr);
}

void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma)
{
	void *ret;

	ret = unit_alloc(this_cpu_ptr(ma->cache));
	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free(this_cpu_ptr(ma->cache), ptr);
}

void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr)
{
	if (!ptr)
		return;

	unit_free_rcu(this_cpu_ptr(ma->cache), ptr);
}
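/* Reuse timing for the *_free_rcu() variants above, as implied by the list
 * handling in check_free_by_rcu() and alloc_bulk(): an element freed this
 * way is not handed out again until a regular RCU grace period has passed
 * (free_by_rcu -> waiting_for_gp), and is not returned to the kernel's
 * kmalloc pool until a tasks-trace RCU grace period has passed as well.
 */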
/* Directly does a kfree() without putting 'ptr' back to the free_llist
 * for reuse and without waiting for a rcu_tasks_trace gp.
 * The caller must first go through the rcu_tasks_trace gp for 'ptr'
 * before calling bpf_mem_cache_raw_free().
 * It could be used when the rcu_tasks_trace callback does not have
 * a hold on the original bpf_mem_alloc object that allocated the
 * 'ptr'. This should only be used in the uncommon code path.
 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
 * and may affect performance.
 */
void bpf_mem_cache_raw_free(void *ptr)
{
	if (!ptr)
		return;

	kfree(ptr - LLIST_NODE_SZ);
}

/* When flags == GFP_KERNEL, it signals that the caller will not cause
 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
 * kmalloc if the free_llist is empty.
 */
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
	struct bpf_mem_cache *c;
	void *ret;

	c = this_cpu_ptr(ma->cache);

	ret = unit_alloc(c);
	if (!ret && flags == GFP_KERNEL) {
		struct mem_cgroup *memcg, *old_memcg;

		memcg = get_memcg(c);
		old_memcg = set_active_memcg(memcg);
		ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
		if (ret)
			*(struct bpf_mem_cache **)ret = c;
		set_active_memcg(old_memcg);
		mem_cgroup_put(memcg);
	}

	return !ret ? NULL : ret + LLIST_NODE_SZ;
}

int bpf_mem_alloc_check_size(bool percpu, size_t size)
{
	/* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */
	if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) ||
	    (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ))
		return -E2BIG;

	return 0;
}