1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ 3 #include <linux/mm.h> 4 #include <linux/llist.h> 5 #include <linux/bpf.h> 6 #include <linux/irq_work.h> 7 #include <linux/bpf_mem_alloc.h> 8 #include <linux/memcontrol.h> 9 #include <asm/local.h> 10 11 /* Any context (including NMI) BPF specific memory allocator. 12 * 13 * Tracing BPF programs can attach to kprobe and fentry. Hence they 14 * run in unknown context where calling plain kmalloc() might not be safe. 15 * 16 * Front-end kmalloc() with per-cpu per-bucket cache of free elements. 17 * Refill this cache asynchronously from irq_work. 18 * 19 * CPU_0 buckets 20 * 16 32 64 96 128 196 256 512 1024 2048 4096 21 * ... 22 * CPU_N buckets 23 * 16 32 64 96 128 196 256 512 1024 2048 4096 24 * 25 * The buckets are prefilled at the start. 26 * BPF programs always run with migration disabled. 27 * It's safe to allocate from cache of the current cpu with irqs disabled. 28 * Free-ing is always done into bucket of the current cpu as well. 29 * irq_work trims extra free elements from buckets with kfree 30 * and refills them with kmalloc, so global kmalloc logic takes care 31 * of freeing objects allocated by one cpu and freed on another. 32 * 33 * Every allocated objected is padded with extra 8 bytes that contains 34 * struct llist_node. 35 */ 36 #define LLIST_NODE_SZ sizeof(struct llist_node) 37 38 /* similar to kmalloc, but sizeof == 8 bucket is gone */ 39 static u8 size_index[24] __ro_after_init = { 40 3, /* 8 */ 41 3, /* 16 */ 42 4, /* 24 */ 43 4, /* 32 */ 44 5, /* 40 */ 45 5, /* 48 */ 46 5, /* 56 */ 47 5, /* 64 */ 48 1, /* 72 */ 49 1, /* 80 */ 50 1, /* 88 */ 51 1, /* 96 */ 52 6, /* 104 */ 53 6, /* 112 */ 54 6, /* 120 */ 55 6, /* 128 */ 56 2, /* 136 */ 57 2, /* 144 */ 58 2, /* 152 */ 59 2, /* 160 */ 60 2, /* 168 */ 61 2, /* 176 */ 62 2, /* 184 */ 63 2 /* 192 */ 64 }; 65 66 static int bpf_mem_cache_idx(size_t size) 67 { 68 if (!size || size > 4096) 69 return -1; 70 71 if (size <= 192) 72 return size_index[(size - 1) / 8] - 1; 73 74 return fls(size - 1) - 2; 75 } 76 77 #define NUM_CACHES 11 78 79 struct bpf_mem_cache { 80 /* per-cpu list of free objects of size 'unit_size'. 81 * All accesses are done with interrupts disabled and 'active' counter 82 * protection with __llist_add() and __llist_del_first(). 83 */ 84 struct llist_head free_llist; 85 local_t active; 86 87 /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill 88 * are sequenced by per-cpu 'active' counter. But unit_free() cannot 89 * fail. When 'active' is busy the unit_free() will add an object to 90 * free_llist_extra. 91 */ 92 struct llist_head free_llist_extra; 93 94 struct irq_work refill_work; 95 struct obj_cgroup *objcg; 96 int unit_size; 97 /* count of objects in free_llist */ 98 int free_cnt; 99 int low_watermark, high_watermark, batch; 100 int percpu_size; 101 bool draining; 102 struct bpf_mem_cache *tgt; 103 104 /* list of objects to be freed after RCU GP */ 105 struct llist_head free_by_rcu; 106 struct llist_node *free_by_rcu_tail; 107 struct llist_head waiting_for_gp; 108 struct llist_node *waiting_for_gp_tail; 109 struct rcu_head rcu; 110 atomic_t call_rcu_in_progress; 111 struct llist_head free_llist_extra_rcu; 112 113 /* list of objects to be freed after RCU tasks trace GP */ 114 struct llist_head free_by_rcu_ttrace; 115 struct llist_head waiting_for_gp_ttrace; 116 struct rcu_head rcu_ttrace; 117 atomic_t call_rcu_ttrace_in_progress; 118 }; 119 120 struct bpf_mem_caches { 121 struct bpf_mem_cache cache[NUM_CACHES]; 122 }; 123 124 static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; 125 126 static struct llist_node notrace *__llist_del_first(struct llist_head *head) 127 { 128 struct llist_node *entry, *next; 129 130 entry = head->first; 131 if (!entry) 132 return NULL; 133 next = entry->next; 134 head->first = next; 135 return entry; 136 } 137 138 static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) 139 { 140 if (c->percpu_size) { 141 void **obj = kmalloc_node(c->percpu_size, flags, node); 142 void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); 143 144 if (!obj || !pptr) { 145 free_percpu(pptr); 146 kfree(obj); 147 return NULL; 148 } 149 obj[1] = pptr; 150 return obj; 151 } 152 153 return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node); 154 } 155 156 static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) 157 { 158 #ifdef CONFIG_MEMCG 159 if (c->objcg) 160 return get_mem_cgroup_from_objcg(c->objcg); 161 return root_mem_cgroup; 162 #else 163 return NULL; 164 #endif 165 } 166 167 static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) 168 { 169 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 170 /* In RT irq_work runs in per-cpu kthread, so disable 171 * interrupts to avoid preemption and interrupts and 172 * reduce the chance of bpf prog executing on this cpu 173 * when active counter is busy. 174 */ 175 local_irq_save(*flags); 176 /* alloc_bulk runs from irq_work which will not preempt a bpf 177 * program that does unit_alloc/unit_free since IRQs are 178 * disabled there. There is no race to increment 'active' 179 * counter. It protects free_llist from corruption in case NMI 180 * bpf prog preempted this loop. 181 */ 182 WARN_ON_ONCE(local_inc_return(&c->active) != 1); 183 } 184 185 static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) 186 { 187 local_dec(&c->active); 188 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 189 local_irq_restore(*flags); 190 } 191 192 static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) 193 { 194 unsigned long flags; 195 196 inc_active(c, &flags); 197 __llist_add(obj, &c->free_llist); 198 c->free_cnt++; 199 dec_active(c, &flags); 200 } 201 202 /* Mostly runs from irq_work except __init phase. */ 203 static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) 204 { 205 struct mem_cgroup *memcg = NULL, *old_memcg; 206 gfp_t gfp; 207 void *obj; 208 int i; 209 210 gfp = __GFP_NOWARN | __GFP_ACCOUNT; 211 gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL; 212 213 for (i = 0; i < cnt; i++) { 214 /* 215 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is 216 * done only by one CPU == current CPU. Other CPUs might 217 * llist_add() and llist_del_all() in parallel. 218 */ 219 obj = llist_del_first(&c->free_by_rcu_ttrace); 220 if (!obj) 221 break; 222 add_obj_to_free_list(c, obj); 223 } 224 if (i >= cnt) 225 return; 226 227 for (; i < cnt; i++) { 228 obj = llist_del_first(&c->waiting_for_gp_ttrace); 229 if (!obj) 230 break; 231 add_obj_to_free_list(c, obj); 232 } 233 if (i >= cnt) 234 return; 235 236 memcg = get_memcg(c); 237 old_memcg = set_active_memcg(memcg); 238 for (; i < cnt; i++) { 239 /* Allocate, but don't deplete atomic reserves that typical 240 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc 241 * will allocate from the current numa node which is what we 242 * want here. 243 */ 244 obj = __alloc(c, node, gfp); 245 if (!obj) 246 break; 247 add_obj_to_free_list(c, obj); 248 } 249 set_active_memcg(old_memcg); 250 mem_cgroup_put(memcg); 251 } 252 253 static void free_one(void *obj, bool percpu) 254 { 255 if (percpu) { 256 free_percpu(((void **)obj)[1]); 257 kfree(obj); 258 return; 259 } 260 261 kfree(obj); 262 } 263 264 static int free_all(struct llist_node *llnode, bool percpu) 265 { 266 struct llist_node *pos, *t; 267 int cnt = 0; 268 269 llist_for_each_safe(pos, t, llnode) { 270 free_one(pos, percpu); 271 cnt++; 272 } 273 return cnt; 274 } 275 276 static void __free_rcu(struct rcu_head *head) 277 { 278 struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); 279 280 free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); 281 atomic_set(&c->call_rcu_ttrace_in_progress, 0); 282 } 283 284 static void __free_rcu_tasks_trace(struct rcu_head *head) 285 { 286 /* If RCU Tasks Trace grace period implies RCU grace period, 287 * there is no need to invoke call_rcu(). 288 */ 289 if (rcu_trace_implies_rcu_gp()) 290 __free_rcu(head); 291 else 292 call_rcu(head, __free_rcu); 293 } 294 295 static void enque_to_free(struct bpf_mem_cache *c, void *obj) 296 { 297 struct llist_node *llnode = obj; 298 299 /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. 300 * Nothing races to add to free_by_rcu_ttrace list. 301 */ 302 llist_add(llnode, &c->free_by_rcu_ttrace); 303 } 304 305 static void do_call_rcu_ttrace(struct bpf_mem_cache *c) 306 { 307 struct llist_node *llnode, *t; 308 309 if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { 310 if (unlikely(READ_ONCE(c->draining))) { 311 llnode = llist_del_all(&c->free_by_rcu_ttrace); 312 free_all(llnode, !!c->percpu_size); 313 } 314 return; 315 } 316 317 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); 318 llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace)) 319 llist_add(llnode, &c->waiting_for_gp_ttrace); 320 321 if (unlikely(READ_ONCE(c->draining))) { 322 __free_rcu(&c->rcu_ttrace); 323 return; 324 } 325 326 /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. 327 * If RCU Tasks Trace grace period implies RCU grace period, free 328 * these elements directly, else use call_rcu() to wait for normal 329 * progs to finish and finally do free_one() on each element. 330 */ 331 call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace); 332 } 333 334 static void free_bulk(struct bpf_mem_cache *c) 335 { 336 struct bpf_mem_cache *tgt = c->tgt; 337 struct llist_node *llnode, *t; 338 unsigned long flags; 339 int cnt; 340 341 WARN_ON_ONCE(tgt->unit_size != c->unit_size); 342 WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); 343 344 do { 345 inc_active(c, &flags); 346 llnode = __llist_del_first(&c->free_llist); 347 if (llnode) 348 cnt = --c->free_cnt; 349 else 350 cnt = 0; 351 dec_active(c, &flags); 352 if (llnode) 353 enque_to_free(tgt, llnode); 354 } while (cnt > (c->high_watermark + c->low_watermark) / 2); 355 356 /* and drain free_llist_extra */ 357 llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) 358 enque_to_free(tgt, llnode); 359 do_call_rcu_ttrace(tgt); 360 } 361 362 static void __free_by_rcu(struct rcu_head *head) 363 { 364 struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); 365 struct bpf_mem_cache *tgt = c->tgt; 366 struct llist_node *llnode; 367 368 WARN_ON_ONCE(tgt->unit_size != c->unit_size); 369 WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); 370 371 llnode = llist_del_all(&c->waiting_for_gp); 372 if (!llnode) 373 goto out; 374 375 llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace); 376 377 /* Objects went through regular RCU GP. Send them to RCU tasks trace */ 378 do_call_rcu_ttrace(tgt); 379 out: 380 atomic_set(&c->call_rcu_in_progress, 0); 381 } 382 383 static void check_free_by_rcu(struct bpf_mem_cache *c) 384 { 385 struct llist_node *llnode, *t; 386 unsigned long flags; 387 388 /* drain free_llist_extra_rcu */ 389 if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) { 390 inc_active(c, &flags); 391 llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) 392 if (__llist_add(llnode, &c->free_by_rcu)) 393 c->free_by_rcu_tail = llnode; 394 dec_active(c, &flags); 395 } 396 397 if (llist_empty(&c->free_by_rcu)) 398 return; 399 400 if (atomic_xchg(&c->call_rcu_in_progress, 1)) { 401 /* 402 * Instead of kmalloc-ing new rcu_head and triggering 10k 403 * call_rcu() to hit rcutree.qhimark and force RCU to notice 404 * the overload just ask RCU to hurry up. There could be many 405 * objects in free_by_rcu list. 406 * This hint reduces memory consumption for an artificial 407 * benchmark from 2 Gbyte to 150 Mbyte. 408 */ 409 rcu_request_urgent_qs_task(current); 410 return; 411 } 412 413 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); 414 415 inc_active(c, &flags); 416 WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); 417 c->waiting_for_gp_tail = c->free_by_rcu_tail; 418 dec_active(c, &flags); 419 420 if (unlikely(READ_ONCE(c->draining))) { 421 free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); 422 atomic_set(&c->call_rcu_in_progress, 0); 423 } else { 424 call_rcu_hurry(&c->rcu, __free_by_rcu); 425 } 426 } 427 428 static void bpf_mem_refill(struct irq_work *work) 429 { 430 struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); 431 int cnt; 432 433 /* Racy access to free_cnt. It doesn't need to be 100% accurate */ 434 cnt = c->free_cnt; 435 if (cnt < c->low_watermark) 436 /* irq_work runs on this cpu and kmalloc will allocate 437 * from the current numa node which is what we want here. 438 */ 439 alloc_bulk(c, c->batch, NUMA_NO_NODE, true); 440 else if (cnt > c->high_watermark) 441 free_bulk(c); 442 443 check_free_by_rcu(c); 444 } 445 446 static void notrace irq_work_raise(struct bpf_mem_cache *c) 447 { 448 irq_work_queue(&c->refill_work); 449 } 450 451 /* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket 452 * the freelist cache will be elem_size * 64 (or less) on each cpu. 453 * 454 * For bpf programs that don't have statically known allocation sizes and 455 * assuming (low_mark + high_mark) / 2 as an average number of elements per 456 * bucket and all buckets are used the total amount of memory in freelists 457 * on each cpu will be: 458 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 459 * == ~ 116 Kbyte using below heuristic. 460 * Initialized, but unused bpf allocator (not bpf map specific one) will 461 * consume ~ 11 Kbyte per cpu. 462 * Typical case will be between 11K and 116K closer to 11K. 463 * bpf progs can and should share bpf_mem_cache when possible. 464 * 465 * Percpu allocation is typically rare. To avoid potential unnecessary large 466 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. 467 */ 468 static void init_refill_work(struct bpf_mem_cache *c) 469 { 470 init_irq_work(&c->refill_work, bpf_mem_refill); 471 if (c->percpu_size) { 472 c->low_watermark = 1; 473 c->high_watermark = 3; 474 } else if (c->unit_size <= 256) { 475 c->low_watermark = 32; 476 c->high_watermark = 96; 477 } else { 478 /* When page_size == 4k, order-0 cache will have low_mark == 2 479 * and high_mark == 6 with batch alloc of 3 individual pages at 480 * a time. 481 * 8k allocs and above low == 1, high == 3, batch == 1. 482 */ 483 c->low_watermark = max(32 * 256 / c->unit_size, 1); 484 c->high_watermark = max(96 * 256 / c->unit_size, 3); 485 } 486 c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); 487 } 488 489 static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) 490 { 491 int cnt = 1; 492 493 /* To avoid consuming memory, for non-percpu allocation, assume that 494 * 1st run of bpf prog won't be doing more than 4 map_update_elem from 495 * irq disabled region if unit size is less than or equal to 256. 496 * For all other cases, let us just do one allocation. 497 */ 498 if (!c->percpu_size && c->unit_size <= 256) 499 cnt = 4; 500 alloc_bulk(c, cnt, cpu_to_node(cpu), false); 501 } 502 503 /* When size != 0 bpf_mem_cache for each cpu. 504 * This is typical bpf hash map use case when all elements have equal size. 505 * 506 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on 507 * kmalloc/kfree. Max allocation size is 4096 in this case. 508 * This is bpf_dynptr and bpf_kptr use case. 509 */ 510 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) 511 { 512 struct bpf_mem_caches *cc, __percpu *pcc; 513 struct bpf_mem_cache *c, __percpu *pc; 514 struct obj_cgroup *objcg = NULL; 515 int cpu, i, unit_size, percpu_size = 0; 516 517 if (percpu && size == 0) 518 return -EINVAL; 519 520 /* room for llist_node and per-cpu pointer */ 521 if (percpu) 522 percpu_size = LLIST_NODE_SZ + sizeof(void *); 523 ma->percpu = percpu; 524 525 if (size) { 526 pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); 527 if (!pc) 528 return -ENOMEM; 529 530 if (!percpu) 531 size += LLIST_NODE_SZ; /* room for llist_node */ 532 unit_size = size; 533 534 #ifdef CONFIG_MEMCG 535 if (memcg_bpf_enabled()) 536 objcg = get_obj_cgroup_from_current(); 537 #endif 538 ma->objcg = objcg; 539 540 for_each_possible_cpu(cpu) { 541 c = per_cpu_ptr(pc, cpu); 542 c->unit_size = unit_size; 543 c->objcg = objcg; 544 c->percpu_size = percpu_size; 545 c->tgt = c; 546 init_refill_work(c); 547 prefill_mem_cache(c, cpu); 548 } 549 ma->cache = pc; 550 return 0; 551 } 552 553 pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); 554 if (!pcc) 555 return -ENOMEM; 556 #ifdef CONFIG_MEMCG 557 objcg = get_obj_cgroup_from_current(); 558 #endif 559 ma->objcg = objcg; 560 for_each_possible_cpu(cpu) { 561 cc = per_cpu_ptr(pcc, cpu); 562 for (i = 0; i < NUM_CACHES; i++) { 563 c = &cc->cache[i]; 564 c->unit_size = sizes[i]; 565 c->objcg = objcg; 566 c->percpu_size = percpu_size; 567 c->tgt = c; 568 569 init_refill_work(c); 570 prefill_mem_cache(c, cpu); 571 } 572 } 573 574 ma->caches = pcc; 575 return 0; 576 } 577 578 int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) 579 { 580 struct bpf_mem_caches __percpu *pcc; 581 582 pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); 583 if (!pcc) 584 return -ENOMEM; 585 586 ma->caches = pcc; 587 ma->objcg = objcg; 588 ma->percpu = true; 589 return 0; 590 } 591 592 int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) 593 { 594 struct bpf_mem_caches *cc, __percpu *pcc; 595 int cpu, i, unit_size, percpu_size; 596 struct obj_cgroup *objcg; 597 struct bpf_mem_cache *c; 598 599 i = bpf_mem_cache_idx(size); 600 if (i < 0) 601 return -EINVAL; 602 603 /* room for llist_node and per-cpu pointer */ 604 percpu_size = LLIST_NODE_SZ + sizeof(void *); 605 606 unit_size = sizes[i]; 607 objcg = ma->objcg; 608 pcc = ma->caches; 609 610 for_each_possible_cpu(cpu) { 611 cc = per_cpu_ptr(pcc, cpu); 612 c = &cc->cache[i]; 613 if (c->unit_size) 614 break; 615 616 c->unit_size = unit_size; 617 c->objcg = objcg; 618 c->percpu_size = percpu_size; 619 c->tgt = c; 620 621 init_refill_work(c); 622 prefill_mem_cache(c, cpu); 623 } 624 625 return 0; 626 } 627 628 static void drain_mem_cache(struct bpf_mem_cache *c) 629 { 630 bool percpu = !!c->percpu_size; 631 632 /* No progs are using this bpf_mem_cache, but htab_map_free() called 633 * bpf_mem_cache_free() for all remaining elements and they can be in 634 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now. 635 * 636 * Except for waiting_for_gp_ttrace list, there are no concurrent operations 637 * on these lists, so it is safe to use __llist_del_all(). 638 */ 639 free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); 640 free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); 641 free_all(__llist_del_all(&c->free_llist), percpu); 642 free_all(__llist_del_all(&c->free_llist_extra), percpu); 643 free_all(__llist_del_all(&c->free_by_rcu), percpu); 644 free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); 645 free_all(llist_del_all(&c->waiting_for_gp), percpu); 646 } 647 648 static void check_mem_cache(struct bpf_mem_cache *c) 649 { 650 WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace)); 651 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); 652 WARN_ON_ONCE(!llist_empty(&c->free_llist)); 653 WARN_ON_ONCE(!llist_empty(&c->free_llist_extra)); 654 WARN_ON_ONCE(!llist_empty(&c->free_by_rcu)); 655 WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu)); 656 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); 657 } 658 659 static void check_leaked_objs(struct bpf_mem_alloc *ma) 660 { 661 struct bpf_mem_caches *cc; 662 struct bpf_mem_cache *c; 663 int cpu, i; 664 665 if (ma->cache) { 666 for_each_possible_cpu(cpu) { 667 c = per_cpu_ptr(ma->cache, cpu); 668 check_mem_cache(c); 669 } 670 } 671 if (ma->caches) { 672 for_each_possible_cpu(cpu) { 673 cc = per_cpu_ptr(ma->caches, cpu); 674 for (i = 0; i < NUM_CACHES; i++) { 675 c = &cc->cache[i]; 676 check_mem_cache(c); 677 } 678 } 679 } 680 } 681 682 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) 683 { 684 check_leaked_objs(ma); 685 free_percpu(ma->cache); 686 free_percpu(ma->caches); 687 ma->cache = NULL; 688 ma->caches = NULL; 689 } 690 691 static void free_mem_alloc(struct bpf_mem_alloc *ma) 692 { 693 /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks 694 * might still execute. Wait for them. 695 * 696 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), 697 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used 698 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), 699 * so if call_rcu(head, __free_rcu) is skipped due to 700 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by 701 * using rcu_trace_implies_rcu_gp() as well. 702 */ 703 rcu_barrier(); /* wait for __free_by_rcu */ 704 rcu_barrier_tasks_trace(); /* wait for __free_rcu */ 705 if (!rcu_trace_implies_rcu_gp()) 706 rcu_barrier(); 707 free_mem_alloc_no_barrier(ma); 708 } 709 710 static void free_mem_alloc_deferred(struct work_struct *work) 711 { 712 struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); 713 714 free_mem_alloc(ma); 715 kfree(ma); 716 } 717 718 static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) 719 { 720 struct bpf_mem_alloc *copy; 721 722 if (!rcu_in_progress) { 723 /* Fast path. No callbacks are pending, hence no need to do 724 * rcu_barrier-s. 725 */ 726 free_mem_alloc_no_barrier(ma); 727 return; 728 } 729 730 copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL); 731 if (!copy) { 732 /* Slow path with inline barrier-s */ 733 free_mem_alloc(ma); 734 return; 735 } 736 737 /* Defer barriers into worker to let the rest of map memory to be freed */ 738 memset(ma, 0, sizeof(*ma)); 739 INIT_WORK(©->work, free_mem_alloc_deferred); 740 queue_work(system_unbound_wq, ©->work); 741 } 742 743 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) 744 { 745 struct bpf_mem_caches *cc; 746 struct bpf_mem_cache *c; 747 int cpu, i, rcu_in_progress; 748 749 if (ma->cache) { 750 rcu_in_progress = 0; 751 for_each_possible_cpu(cpu) { 752 c = per_cpu_ptr(ma->cache, cpu); 753 WRITE_ONCE(c->draining, true); 754 irq_work_sync(&c->refill_work); 755 drain_mem_cache(c); 756 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); 757 rcu_in_progress += atomic_read(&c->call_rcu_in_progress); 758 } 759 obj_cgroup_put(ma->objcg); 760 destroy_mem_alloc(ma, rcu_in_progress); 761 } 762 if (ma->caches) { 763 rcu_in_progress = 0; 764 for_each_possible_cpu(cpu) { 765 cc = per_cpu_ptr(ma->caches, cpu); 766 for (i = 0; i < NUM_CACHES; i++) { 767 c = &cc->cache[i]; 768 WRITE_ONCE(c->draining, true); 769 irq_work_sync(&c->refill_work); 770 drain_mem_cache(c); 771 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); 772 rcu_in_progress += atomic_read(&c->call_rcu_in_progress); 773 } 774 } 775 obj_cgroup_put(ma->objcg); 776 destroy_mem_alloc(ma, rcu_in_progress); 777 } 778 } 779 780 /* notrace is necessary here and in other functions to make sure 781 * bpf programs cannot attach to them and cause llist corruptions. 782 */ 783 static void notrace *unit_alloc(struct bpf_mem_cache *c) 784 { 785 struct llist_node *llnode = NULL; 786 unsigned long flags; 787 int cnt = 0; 788 789 /* Disable irqs to prevent the following race for majority of prog types: 790 * prog_A 791 * bpf_mem_alloc 792 * preemption or irq -> prog_B 793 * bpf_mem_alloc 794 * 795 * but prog_B could be a perf_event NMI prog. 796 * Use per-cpu 'active' counter to order free_list access between 797 * unit_alloc/unit_free/bpf_mem_refill. 798 */ 799 local_irq_save(flags); 800 if (local_inc_return(&c->active) == 1) { 801 llnode = __llist_del_first(&c->free_llist); 802 if (llnode) { 803 cnt = --c->free_cnt; 804 *(struct bpf_mem_cache **)llnode = c; 805 } 806 } 807 local_dec(&c->active); 808 809 WARN_ON(cnt < 0); 810 811 if (cnt < c->low_watermark) 812 irq_work_raise(c); 813 /* Enable IRQ after the enqueue of irq work completes, so irq work 814 * will run after IRQ is enabled and free_llist may be refilled by 815 * irq work before other task preempts current task. 816 */ 817 local_irq_restore(flags); 818 819 return llnode; 820 } 821 822 /* Though 'ptr' object could have been allocated on a different cpu 823 * add it to the free_llist of the current cpu. 824 * Let kfree() logic deal with it when it's later called from irq_work. 825 */ 826 static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) 827 { 828 struct llist_node *llnode = ptr - LLIST_NODE_SZ; 829 unsigned long flags; 830 int cnt = 0; 831 832 BUILD_BUG_ON(LLIST_NODE_SZ > 8); 833 834 /* 835 * Remember bpf_mem_cache that allocated this object. 836 * The hint is not accurate. 837 */ 838 c->tgt = *(struct bpf_mem_cache **)llnode; 839 840 local_irq_save(flags); 841 if (local_inc_return(&c->active) == 1) { 842 __llist_add(llnode, &c->free_llist); 843 cnt = ++c->free_cnt; 844 } else { 845 /* unit_free() cannot fail. Therefore add an object to atomic 846 * llist. free_bulk() will drain it. Though free_llist_extra is 847 * a per-cpu list we have to use atomic llist_add here, since 848 * it also can be interrupted by bpf nmi prog that does another 849 * unit_free() into the same free_llist_extra. 850 */ 851 llist_add(llnode, &c->free_llist_extra); 852 } 853 local_dec(&c->active); 854 855 if (cnt > c->high_watermark) 856 /* free few objects from current cpu into global kmalloc pool */ 857 irq_work_raise(c); 858 /* Enable IRQ after irq_work_raise() completes, otherwise when current 859 * task is preempted by task which does unit_alloc(), unit_alloc() may 860 * return NULL unexpectedly because irq work is already pending but can 861 * not been triggered and free_llist can not be refilled timely. 862 */ 863 local_irq_restore(flags); 864 } 865 866 static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) 867 { 868 struct llist_node *llnode = ptr - LLIST_NODE_SZ; 869 unsigned long flags; 870 871 c->tgt = *(struct bpf_mem_cache **)llnode; 872 873 local_irq_save(flags); 874 if (local_inc_return(&c->active) == 1) { 875 if (__llist_add(llnode, &c->free_by_rcu)) 876 c->free_by_rcu_tail = llnode; 877 } else { 878 llist_add(llnode, &c->free_llist_extra_rcu); 879 } 880 local_dec(&c->active); 881 882 if (!atomic_read(&c->call_rcu_in_progress)) 883 irq_work_raise(c); 884 local_irq_restore(flags); 885 } 886 887 /* Called from BPF program or from sys_bpf syscall. 888 * In both cases migration is disabled. 889 */ 890 void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) 891 { 892 int idx; 893 void *ret; 894 895 if (!size) 896 return NULL; 897 898 if (!ma->percpu) 899 size += LLIST_NODE_SZ; 900 idx = bpf_mem_cache_idx(size); 901 if (idx < 0) 902 return NULL; 903 904 ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); 905 return !ret ? NULL : ret + LLIST_NODE_SZ; 906 } 907 908 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) 909 { 910 struct bpf_mem_cache *c; 911 int idx; 912 913 if (!ptr) 914 return; 915 916 c = *(void **)(ptr - LLIST_NODE_SZ); 917 idx = bpf_mem_cache_idx(c->unit_size); 918 if (WARN_ON_ONCE(idx < 0)) 919 return; 920 921 unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); 922 } 923 924 void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) 925 { 926 struct bpf_mem_cache *c; 927 int idx; 928 929 if (!ptr) 930 return; 931 932 c = *(void **)(ptr - LLIST_NODE_SZ); 933 idx = bpf_mem_cache_idx(c->unit_size); 934 if (WARN_ON_ONCE(idx < 0)) 935 return; 936 937 unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); 938 } 939 940 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) 941 { 942 void *ret; 943 944 ret = unit_alloc(this_cpu_ptr(ma->cache)); 945 return !ret ? NULL : ret + LLIST_NODE_SZ; 946 } 947 948 void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) 949 { 950 if (!ptr) 951 return; 952 953 unit_free(this_cpu_ptr(ma->cache), ptr); 954 } 955 956 void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) 957 { 958 if (!ptr) 959 return; 960 961 unit_free_rcu(this_cpu_ptr(ma->cache), ptr); 962 } 963 964 /* Directly does a kfree() without putting 'ptr' back to the free_llist 965 * for reuse and without waiting for a rcu_tasks_trace gp. 966 * The caller must first go through the rcu_tasks_trace gp for 'ptr' 967 * before calling bpf_mem_cache_raw_free(). 968 * It could be used when the rcu_tasks_trace callback does not have 969 * a hold on the original bpf_mem_alloc object that allocated the 970 * 'ptr'. This should only be used in the uncommon code path. 971 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled 972 * and may affect performance. 973 */ 974 void bpf_mem_cache_raw_free(void *ptr) 975 { 976 if (!ptr) 977 return; 978 979 kfree(ptr - LLIST_NODE_SZ); 980 } 981 982 /* When flags == GFP_KERNEL, it signals that the caller will not cause 983 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use 984 * kmalloc if the free_llist is empty. 985 */ 986 void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) 987 { 988 struct bpf_mem_cache *c; 989 void *ret; 990 991 c = this_cpu_ptr(ma->cache); 992 993 ret = unit_alloc(c); 994 if (!ret && flags == GFP_KERNEL) { 995 struct mem_cgroup *memcg, *old_memcg; 996 997 memcg = get_memcg(c); 998 old_memcg = set_active_memcg(memcg); 999 ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); 1000 if (ret) 1001 *(struct bpf_mem_cache **)ret = c; 1002 set_active_memcg(old_memcg); 1003 mem_cgroup_put(memcg); 1004 } 1005 1006 return !ret ? NULL : ret + LLIST_NODE_SZ; 1007 } 1008