1 // SPDX-License-Identifier: GPL-2.0-only 2 /* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ 3 #include <linux/mm.h> 4 #include <linux/llist.h> 5 #include <linux/bpf.h> 6 #include <linux/irq_work.h> 7 #include <linux/bpf_mem_alloc.h> 8 #include <linux/memcontrol.h> 9 #include <asm/local.h> 10 11 /* Any context (including NMI) BPF specific memory allocator. 12 * 13 * Tracing BPF programs can attach to kprobe and fentry. Hence they 14 * run in unknown context where calling plain kmalloc() might not be safe. 15 * 16 * Front-end kmalloc() with per-cpu per-bucket cache of free elements. 17 * Refill this cache asynchronously from irq_work. 18 * 19 * CPU_0 buckets 20 * 16 32 64 96 128 196 256 512 1024 2048 4096 21 * ... 22 * CPU_N buckets 23 * 16 32 64 96 128 196 256 512 1024 2048 4096 24 * 25 * The buckets are prefilled at the start. 26 * BPF programs always run with migration disabled. 27 * It's safe to allocate from cache of the current cpu with irqs disabled. 28 * Free-ing is always done into bucket of the current cpu as well. 29 * irq_work trims extra free elements from buckets with kfree 30 * and refills them with kmalloc, so global kmalloc logic takes care 31 * of freeing objects allocated by one cpu and freed on another. 32 * 33 * Every allocated objected is padded with extra 8 bytes that contains 34 * struct llist_node. 35 */ 36 #define LLIST_NODE_SZ sizeof(struct llist_node) 37 38 /* similar to kmalloc, but sizeof == 8 bucket is gone */ 39 static u8 size_index[24] __ro_after_init = { 40 3, /* 8 */ 41 3, /* 16 */ 42 4, /* 24 */ 43 4, /* 32 */ 44 5, /* 40 */ 45 5, /* 48 */ 46 5, /* 56 */ 47 5, /* 64 */ 48 1, /* 72 */ 49 1, /* 80 */ 50 1, /* 88 */ 51 1, /* 96 */ 52 6, /* 104 */ 53 6, /* 112 */ 54 6, /* 120 */ 55 6, /* 128 */ 56 2, /* 136 */ 57 2, /* 144 */ 58 2, /* 152 */ 59 2, /* 160 */ 60 2, /* 168 */ 61 2, /* 176 */ 62 2, /* 184 */ 63 2 /* 192 */ 64 }; 65 66 static int bpf_mem_cache_idx(size_t size) 67 { 68 if (!size || size > 4096) 69 return -1; 70 71 if (size <= 192) 72 return size_index[(size - 1) / 8] - 1; 73 74 return fls(size - 1) - 2; 75 } 76 77 #define NUM_CACHES 11 78 79 struct bpf_mem_cache { 80 /* per-cpu list of free objects of size 'unit_size'. 81 * All accesses are done with interrupts disabled and 'active' counter 82 * protection with __llist_add() and __llist_del_first(). 83 */ 84 struct llist_head free_llist; 85 local_t active; 86 87 /* Operations on the free_list from unit_alloc/unit_free/bpf_mem_refill 88 * are sequenced by per-cpu 'active' counter. But unit_free() cannot 89 * fail. When 'active' is busy the unit_free() will add an object to 90 * free_llist_extra. 91 */ 92 struct llist_head free_llist_extra; 93 94 struct irq_work refill_work; 95 struct obj_cgroup *objcg; 96 int unit_size; 97 /* count of objects in free_llist */ 98 int free_cnt; 99 int low_watermark, high_watermark, batch; 100 int percpu_size; 101 bool draining; 102 struct bpf_mem_cache *tgt; 103 104 /* list of objects to be freed after RCU GP */ 105 struct llist_head free_by_rcu; 106 struct llist_node *free_by_rcu_tail; 107 struct llist_head waiting_for_gp; 108 struct llist_node *waiting_for_gp_tail; 109 struct rcu_head rcu; 110 atomic_t call_rcu_in_progress; 111 struct llist_head free_llist_extra_rcu; 112 113 /* list of objects to be freed after RCU tasks trace GP */ 114 struct llist_head free_by_rcu_ttrace; 115 struct llist_head waiting_for_gp_ttrace; 116 struct rcu_head rcu_ttrace; 117 atomic_t call_rcu_ttrace_in_progress; 118 }; 119 120 struct bpf_mem_caches { 121 struct bpf_mem_cache cache[NUM_CACHES]; 122 }; 123 124 static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096}; 125 126 static struct llist_node notrace *__llist_del_first(struct llist_head *head) 127 { 128 struct llist_node *entry, *next; 129 130 entry = head->first; 131 if (!entry) 132 return NULL; 133 next = entry->next; 134 head->first = next; 135 return entry; 136 } 137 138 static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags) 139 { 140 if (c->percpu_size) { 141 void __percpu **obj = kmalloc_node(c->percpu_size, flags, node); 142 void __percpu *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); 143 144 if (!obj || !pptr) { 145 free_percpu(pptr); 146 kfree(obj); 147 return NULL; 148 } 149 obj[1] = pptr; 150 return obj; 151 } 152 153 return kmalloc_node(c->unit_size, flags | __GFP_ZERO, node); 154 } 155 156 static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c) 157 { 158 #ifdef CONFIG_MEMCG 159 if (c->objcg) 160 return get_mem_cgroup_from_objcg(c->objcg); 161 return root_mem_cgroup; 162 #else 163 return NULL; 164 #endif 165 } 166 167 static void inc_active(struct bpf_mem_cache *c, unsigned long *flags) 168 { 169 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 170 /* In RT irq_work runs in per-cpu kthread, so disable 171 * interrupts to avoid preemption and interrupts and 172 * reduce the chance of bpf prog executing on this cpu 173 * when active counter is busy. 174 */ 175 local_irq_save(*flags); 176 /* alloc_bulk runs from irq_work which will not preempt a bpf 177 * program that does unit_alloc/unit_free since IRQs are 178 * disabled there. There is no race to increment 'active' 179 * counter. It protects free_llist from corruption in case NMI 180 * bpf prog preempted this loop. 181 */ 182 WARN_ON_ONCE(local_inc_return(&c->active) != 1); 183 } 184 185 static void dec_active(struct bpf_mem_cache *c, unsigned long *flags) 186 { 187 local_dec(&c->active); 188 if (IS_ENABLED(CONFIG_PREEMPT_RT)) 189 local_irq_restore(*flags); 190 } 191 192 static void add_obj_to_free_list(struct bpf_mem_cache *c, void *obj) 193 { 194 unsigned long flags; 195 196 inc_active(c, &flags); 197 __llist_add(obj, &c->free_llist); 198 c->free_cnt++; 199 dec_active(c, &flags); 200 } 201 202 /* Mostly runs from irq_work except __init phase. */ 203 static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic) 204 { 205 struct mem_cgroup *memcg = NULL, *old_memcg; 206 gfp_t gfp; 207 void *obj; 208 int i; 209 210 gfp = __GFP_NOWARN | __GFP_ACCOUNT; 211 gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL; 212 213 for (i = 0; i < cnt; i++) { 214 /* 215 * For every 'c' llist_del_first(&c->free_by_rcu_ttrace); is 216 * done only by one CPU == current CPU. Other CPUs might 217 * llist_add() and llist_del_all() in parallel. 218 */ 219 obj = llist_del_first(&c->free_by_rcu_ttrace); 220 if (!obj) 221 break; 222 add_obj_to_free_list(c, obj); 223 } 224 if (i >= cnt) 225 return; 226 227 for (; i < cnt; i++) { 228 obj = llist_del_first(&c->waiting_for_gp_ttrace); 229 if (!obj) 230 break; 231 add_obj_to_free_list(c, obj); 232 } 233 if (i >= cnt) 234 return; 235 236 memcg = get_memcg(c); 237 old_memcg = set_active_memcg(memcg); 238 for (; i < cnt; i++) { 239 /* Allocate, but don't deplete atomic reserves that typical 240 * GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc 241 * will allocate from the current numa node which is what we 242 * want here. 243 */ 244 obj = __alloc(c, node, gfp); 245 if (!obj) 246 break; 247 add_obj_to_free_list(c, obj); 248 } 249 set_active_memcg(old_memcg); 250 mem_cgroup_put(memcg); 251 } 252 253 static void free_one(void *obj, bool percpu) 254 { 255 if (percpu) 256 free_percpu(((void __percpu **)obj)[1]); 257 258 kfree(obj); 259 } 260 261 static int free_all(struct llist_node *llnode, bool percpu) 262 { 263 struct llist_node *pos, *t; 264 int cnt = 0; 265 266 llist_for_each_safe(pos, t, llnode) { 267 free_one(pos, percpu); 268 cnt++; 269 } 270 return cnt; 271 } 272 273 static void __free_rcu(struct rcu_head *head) 274 { 275 struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu_ttrace); 276 277 free_all(llist_del_all(&c->waiting_for_gp_ttrace), !!c->percpu_size); 278 atomic_set(&c->call_rcu_ttrace_in_progress, 0); 279 } 280 281 static void __free_rcu_tasks_trace(struct rcu_head *head) 282 { 283 /* If RCU Tasks Trace grace period implies RCU grace period, 284 * there is no need to invoke call_rcu(). 285 */ 286 if (rcu_trace_implies_rcu_gp()) 287 __free_rcu(head); 288 else 289 call_rcu(head, __free_rcu); 290 } 291 292 static void enque_to_free(struct bpf_mem_cache *c, void *obj) 293 { 294 struct llist_node *llnode = obj; 295 296 /* bpf_mem_cache is a per-cpu object. Freeing happens in irq_work. 297 * Nothing races to add to free_by_rcu_ttrace list. 298 */ 299 llist_add(llnode, &c->free_by_rcu_ttrace); 300 } 301 302 static void do_call_rcu_ttrace(struct bpf_mem_cache *c) 303 { 304 struct llist_node *llnode, *t; 305 306 if (atomic_xchg(&c->call_rcu_ttrace_in_progress, 1)) { 307 if (unlikely(READ_ONCE(c->draining))) { 308 llnode = llist_del_all(&c->free_by_rcu_ttrace); 309 free_all(llnode, !!c->percpu_size); 310 } 311 return; 312 } 313 314 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); 315 llist_for_each_safe(llnode, t, llist_del_all(&c->free_by_rcu_ttrace)) 316 llist_add(llnode, &c->waiting_for_gp_ttrace); 317 318 if (unlikely(READ_ONCE(c->draining))) { 319 __free_rcu(&c->rcu_ttrace); 320 return; 321 } 322 323 /* Use call_rcu_tasks_trace() to wait for sleepable progs to finish. 324 * If RCU Tasks Trace grace period implies RCU grace period, free 325 * these elements directly, else use call_rcu() to wait for normal 326 * progs to finish and finally do free_one() on each element. 327 */ 328 call_rcu_tasks_trace(&c->rcu_ttrace, __free_rcu_tasks_trace); 329 } 330 331 static void free_bulk(struct bpf_mem_cache *c) 332 { 333 struct bpf_mem_cache *tgt = c->tgt; 334 struct llist_node *llnode, *t; 335 unsigned long flags; 336 int cnt; 337 338 WARN_ON_ONCE(tgt->unit_size != c->unit_size); 339 WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); 340 341 do { 342 inc_active(c, &flags); 343 llnode = __llist_del_first(&c->free_llist); 344 if (llnode) 345 cnt = --c->free_cnt; 346 else 347 cnt = 0; 348 dec_active(c, &flags); 349 if (llnode) 350 enque_to_free(tgt, llnode); 351 } while (cnt > (c->high_watermark + c->low_watermark) / 2); 352 353 /* and drain free_llist_extra */ 354 llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra)) 355 enque_to_free(tgt, llnode); 356 do_call_rcu_ttrace(tgt); 357 } 358 359 static void __free_by_rcu(struct rcu_head *head) 360 { 361 struct bpf_mem_cache *c = container_of(head, struct bpf_mem_cache, rcu); 362 struct bpf_mem_cache *tgt = c->tgt; 363 struct llist_node *llnode; 364 365 WARN_ON_ONCE(tgt->unit_size != c->unit_size); 366 WARN_ON_ONCE(tgt->percpu_size != c->percpu_size); 367 368 llnode = llist_del_all(&c->waiting_for_gp); 369 if (!llnode) 370 goto out; 371 372 llist_add_batch(llnode, c->waiting_for_gp_tail, &tgt->free_by_rcu_ttrace); 373 374 /* Objects went through regular RCU GP. Send them to RCU tasks trace */ 375 do_call_rcu_ttrace(tgt); 376 out: 377 atomic_set(&c->call_rcu_in_progress, 0); 378 } 379 380 static void check_free_by_rcu(struct bpf_mem_cache *c) 381 { 382 struct llist_node *llnode, *t; 383 unsigned long flags; 384 385 /* drain free_llist_extra_rcu */ 386 if (unlikely(!llist_empty(&c->free_llist_extra_rcu))) { 387 inc_active(c, &flags); 388 llist_for_each_safe(llnode, t, llist_del_all(&c->free_llist_extra_rcu)) 389 if (__llist_add(llnode, &c->free_by_rcu)) 390 c->free_by_rcu_tail = llnode; 391 dec_active(c, &flags); 392 } 393 394 if (llist_empty(&c->free_by_rcu)) 395 return; 396 397 if (atomic_xchg(&c->call_rcu_in_progress, 1)) { 398 /* 399 * Instead of kmalloc-ing new rcu_head and triggering 10k 400 * call_rcu() to hit rcutree.qhimark and force RCU to notice 401 * the overload just ask RCU to hurry up. There could be many 402 * objects in free_by_rcu list. 403 * This hint reduces memory consumption for an artificial 404 * benchmark from 2 Gbyte to 150 Mbyte. 405 */ 406 rcu_request_urgent_qs_task(current); 407 return; 408 } 409 410 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); 411 412 inc_active(c, &flags); 413 WRITE_ONCE(c->waiting_for_gp.first, __llist_del_all(&c->free_by_rcu)); 414 c->waiting_for_gp_tail = c->free_by_rcu_tail; 415 dec_active(c, &flags); 416 417 if (unlikely(READ_ONCE(c->draining))) { 418 free_all(llist_del_all(&c->waiting_for_gp), !!c->percpu_size); 419 atomic_set(&c->call_rcu_in_progress, 0); 420 } else { 421 call_rcu_hurry(&c->rcu, __free_by_rcu); 422 } 423 } 424 425 static void bpf_mem_refill(struct irq_work *work) 426 { 427 struct bpf_mem_cache *c = container_of(work, struct bpf_mem_cache, refill_work); 428 int cnt; 429 430 /* Racy access to free_cnt. It doesn't need to be 100% accurate */ 431 cnt = c->free_cnt; 432 if (cnt < c->low_watermark) 433 /* irq_work runs on this cpu and kmalloc will allocate 434 * from the current numa node which is what we want here. 435 */ 436 alloc_bulk(c, c->batch, NUMA_NO_NODE, true); 437 else if (cnt > c->high_watermark) 438 free_bulk(c); 439 440 check_free_by_rcu(c); 441 } 442 443 static void notrace irq_work_raise(struct bpf_mem_cache *c) 444 { 445 irq_work_queue(&c->refill_work); 446 } 447 448 /* For typical bpf map case that uses bpf_mem_cache_alloc and single bucket 449 * the freelist cache will be elem_size * 64 (or less) on each cpu. 450 * 451 * For bpf programs that don't have statically known allocation sizes and 452 * assuming (low_mark + high_mark) / 2 as an average number of elements per 453 * bucket and all buckets are used the total amount of memory in freelists 454 * on each cpu will be: 455 * 64*16 + 64*32 + 64*64 + 64*96 + 64*128 + 64*196 + 64*256 + 32*512 + 16*1024 + 8*2048 + 4*4096 456 * == ~ 116 Kbyte using below heuristic. 457 * Initialized, but unused bpf allocator (not bpf map specific one) will 458 * consume ~ 11 Kbyte per cpu. 459 * Typical case will be between 11K and 116K closer to 11K. 460 * bpf progs can and should share bpf_mem_cache when possible. 461 * 462 * Percpu allocation is typically rare. To avoid potential unnecessary large 463 * memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1. 464 */ 465 static void init_refill_work(struct bpf_mem_cache *c) 466 { 467 init_irq_work(&c->refill_work, bpf_mem_refill); 468 if (c->percpu_size) { 469 c->low_watermark = 1; 470 c->high_watermark = 3; 471 } else if (c->unit_size <= 256) { 472 c->low_watermark = 32; 473 c->high_watermark = 96; 474 } else { 475 /* When page_size == 4k, order-0 cache will have low_mark == 2 476 * and high_mark == 6 with batch alloc of 3 individual pages at 477 * a time. 478 * 8k allocs and above low == 1, high == 3, batch == 1. 479 */ 480 c->low_watermark = max(32 * 256 / c->unit_size, 1); 481 c->high_watermark = max(96 * 256 / c->unit_size, 3); 482 } 483 c->batch = max((c->high_watermark - c->low_watermark) / 4 * 3, 1); 484 } 485 486 static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu) 487 { 488 int cnt = 1; 489 490 /* To avoid consuming memory, for non-percpu allocation, assume that 491 * 1st run of bpf prog won't be doing more than 4 map_update_elem from 492 * irq disabled region if unit size is less than or equal to 256. 493 * For all other cases, let us just do one allocation. 494 */ 495 if (!c->percpu_size && c->unit_size <= 256) 496 cnt = 4; 497 alloc_bulk(c, cnt, cpu_to_node(cpu), false); 498 } 499 500 /* When size != 0 bpf_mem_cache for each cpu. 501 * This is typical bpf hash map use case when all elements have equal size. 502 * 503 * When size == 0 allocate 11 bpf_mem_cache-s for each cpu, then rely on 504 * kmalloc/kfree. Max allocation size is 4096 in this case. 505 * This is bpf_dynptr and bpf_kptr use case. 506 */ 507 int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu) 508 { 509 struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; 510 struct bpf_mem_cache *c; struct bpf_mem_cache __percpu *pc; 511 struct obj_cgroup *objcg = NULL; 512 int cpu, i, unit_size, percpu_size = 0; 513 514 if (percpu && size == 0) 515 return -EINVAL; 516 517 /* room for llist_node and per-cpu pointer */ 518 if (percpu) 519 percpu_size = LLIST_NODE_SZ + sizeof(void *); 520 ma->percpu = percpu; 521 522 if (size) { 523 pc = __alloc_percpu_gfp(sizeof(*pc), 8, GFP_KERNEL); 524 if (!pc) 525 return -ENOMEM; 526 527 if (!percpu) 528 size += LLIST_NODE_SZ; /* room for llist_node */ 529 unit_size = size; 530 531 #ifdef CONFIG_MEMCG 532 if (memcg_bpf_enabled()) 533 objcg = get_obj_cgroup_from_current(); 534 #endif 535 ma->objcg = objcg; 536 537 for_each_possible_cpu(cpu) { 538 c = per_cpu_ptr(pc, cpu); 539 c->unit_size = unit_size; 540 c->objcg = objcg; 541 c->percpu_size = percpu_size; 542 c->tgt = c; 543 init_refill_work(c); 544 prefill_mem_cache(c, cpu); 545 } 546 ma->cache = pc; 547 return 0; 548 } 549 550 pcc = __alloc_percpu_gfp(sizeof(*cc), 8, GFP_KERNEL); 551 if (!pcc) 552 return -ENOMEM; 553 #ifdef CONFIG_MEMCG 554 objcg = get_obj_cgroup_from_current(); 555 #endif 556 ma->objcg = objcg; 557 for_each_possible_cpu(cpu) { 558 cc = per_cpu_ptr(pcc, cpu); 559 for (i = 0; i < NUM_CACHES; i++) { 560 c = &cc->cache[i]; 561 c->unit_size = sizes[i]; 562 c->objcg = objcg; 563 c->percpu_size = percpu_size; 564 c->tgt = c; 565 566 init_refill_work(c); 567 prefill_mem_cache(c, cpu); 568 } 569 } 570 571 ma->caches = pcc; 572 return 0; 573 } 574 575 int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg) 576 { 577 struct bpf_mem_caches __percpu *pcc; 578 579 pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL); 580 if (!pcc) 581 return -ENOMEM; 582 583 ma->caches = pcc; 584 ma->objcg = objcg; 585 ma->percpu = true; 586 return 0; 587 } 588 589 int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size) 590 { 591 struct bpf_mem_caches *cc; struct bpf_mem_caches __percpu *pcc; 592 int cpu, i, unit_size, percpu_size; 593 struct obj_cgroup *objcg; 594 struct bpf_mem_cache *c; 595 596 i = bpf_mem_cache_idx(size); 597 if (i < 0) 598 return -EINVAL; 599 600 /* room for llist_node and per-cpu pointer */ 601 percpu_size = LLIST_NODE_SZ + sizeof(void *); 602 603 unit_size = sizes[i]; 604 objcg = ma->objcg; 605 pcc = ma->caches; 606 607 for_each_possible_cpu(cpu) { 608 cc = per_cpu_ptr(pcc, cpu); 609 c = &cc->cache[i]; 610 if (c->unit_size) 611 break; 612 613 c->unit_size = unit_size; 614 c->objcg = objcg; 615 c->percpu_size = percpu_size; 616 c->tgt = c; 617 618 init_refill_work(c); 619 prefill_mem_cache(c, cpu); 620 } 621 622 return 0; 623 } 624 625 static void drain_mem_cache(struct bpf_mem_cache *c) 626 { 627 bool percpu = !!c->percpu_size; 628 629 /* No progs are using this bpf_mem_cache, but htab_map_free() called 630 * bpf_mem_cache_free() for all remaining elements and they can be in 631 * free_by_rcu_ttrace or in waiting_for_gp_ttrace lists, so drain those lists now. 632 * 633 * Except for waiting_for_gp_ttrace list, there are no concurrent operations 634 * on these lists, so it is safe to use __llist_del_all(). 635 */ 636 free_all(llist_del_all(&c->free_by_rcu_ttrace), percpu); 637 free_all(llist_del_all(&c->waiting_for_gp_ttrace), percpu); 638 free_all(__llist_del_all(&c->free_llist), percpu); 639 free_all(__llist_del_all(&c->free_llist_extra), percpu); 640 free_all(__llist_del_all(&c->free_by_rcu), percpu); 641 free_all(__llist_del_all(&c->free_llist_extra_rcu), percpu); 642 free_all(llist_del_all(&c->waiting_for_gp), percpu); 643 } 644 645 static void check_mem_cache(struct bpf_mem_cache *c) 646 { 647 WARN_ON_ONCE(!llist_empty(&c->free_by_rcu_ttrace)); 648 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp_ttrace)); 649 WARN_ON_ONCE(!llist_empty(&c->free_llist)); 650 WARN_ON_ONCE(!llist_empty(&c->free_llist_extra)); 651 WARN_ON_ONCE(!llist_empty(&c->free_by_rcu)); 652 WARN_ON_ONCE(!llist_empty(&c->free_llist_extra_rcu)); 653 WARN_ON_ONCE(!llist_empty(&c->waiting_for_gp)); 654 } 655 656 static void check_leaked_objs(struct bpf_mem_alloc *ma) 657 { 658 struct bpf_mem_caches *cc; 659 struct bpf_mem_cache *c; 660 int cpu, i; 661 662 if (ma->cache) { 663 for_each_possible_cpu(cpu) { 664 c = per_cpu_ptr(ma->cache, cpu); 665 check_mem_cache(c); 666 } 667 } 668 if (ma->caches) { 669 for_each_possible_cpu(cpu) { 670 cc = per_cpu_ptr(ma->caches, cpu); 671 for (i = 0; i < NUM_CACHES; i++) { 672 c = &cc->cache[i]; 673 check_mem_cache(c); 674 } 675 } 676 } 677 } 678 679 static void free_mem_alloc_no_barrier(struct bpf_mem_alloc *ma) 680 { 681 check_leaked_objs(ma); 682 free_percpu(ma->cache); 683 free_percpu(ma->caches); 684 ma->cache = NULL; 685 ma->caches = NULL; 686 } 687 688 static void free_mem_alloc(struct bpf_mem_alloc *ma) 689 { 690 /* waiting_for_gp[_ttrace] lists were drained, but RCU callbacks 691 * might still execute. Wait for them. 692 * 693 * rcu_barrier_tasks_trace() doesn't imply synchronize_rcu_tasks_trace(), 694 * but rcu_barrier_tasks_trace() and rcu_barrier() below are only used 695 * to wait for the pending __free_rcu_tasks_trace() and __free_rcu(), 696 * so if call_rcu(head, __free_rcu) is skipped due to 697 * rcu_trace_implies_rcu_gp(), it will be OK to skip rcu_barrier() by 698 * using rcu_trace_implies_rcu_gp() as well. 699 */ 700 rcu_barrier(); /* wait for __free_by_rcu */ 701 rcu_barrier_tasks_trace(); /* wait for __free_rcu */ 702 if (!rcu_trace_implies_rcu_gp()) 703 rcu_barrier(); 704 free_mem_alloc_no_barrier(ma); 705 } 706 707 static void free_mem_alloc_deferred(struct work_struct *work) 708 { 709 struct bpf_mem_alloc *ma = container_of(work, struct bpf_mem_alloc, work); 710 711 free_mem_alloc(ma); 712 kfree(ma); 713 } 714 715 static void destroy_mem_alloc(struct bpf_mem_alloc *ma, int rcu_in_progress) 716 { 717 struct bpf_mem_alloc *copy; 718 719 if (!rcu_in_progress) { 720 /* Fast path. No callbacks are pending, hence no need to do 721 * rcu_barrier-s. 722 */ 723 free_mem_alloc_no_barrier(ma); 724 return; 725 } 726 727 copy = kmemdup(ma, sizeof(*ma), GFP_KERNEL); 728 if (!copy) { 729 /* Slow path with inline barrier-s */ 730 free_mem_alloc(ma); 731 return; 732 } 733 734 /* Defer barriers into worker to let the rest of map memory to be freed */ 735 memset(ma, 0, sizeof(*ma)); 736 INIT_WORK(©->work, free_mem_alloc_deferred); 737 queue_work(system_unbound_wq, ©->work); 738 } 739 740 void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma) 741 { 742 struct bpf_mem_caches *cc; 743 struct bpf_mem_cache *c; 744 int cpu, i, rcu_in_progress; 745 746 if (ma->cache) { 747 rcu_in_progress = 0; 748 for_each_possible_cpu(cpu) { 749 c = per_cpu_ptr(ma->cache, cpu); 750 WRITE_ONCE(c->draining, true); 751 irq_work_sync(&c->refill_work); 752 drain_mem_cache(c); 753 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); 754 rcu_in_progress += atomic_read(&c->call_rcu_in_progress); 755 } 756 obj_cgroup_put(ma->objcg); 757 destroy_mem_alloc(ma, rcu_in_progress); 758 } 759 if (ma->caches) { 760 rcu_in_progress = 0; 761 for_each_possible_cpu(cpu) { 762 cc = per_cpu_ptr(ma->caches, cpu); 763 for (i = 0; i < NUM_CACHES; i++) { 764 c = &cc->cache[i]; 765 WRITE_ONCE(c->draining, true); 766 irq_work_sync(&c->refill_work); 767 drain_mem_cache(c); 768 rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress); 769 rcu_in_progress += atomic_read(&c->call_rcu_in_progress); 770 } 771 } 772 obj_cgroup_put(ma->objcg); 773 destroy_mem_alloc(ma, rcu_in_progress); 774 } 775 } 776 777 /* notrace is necessary here and in other functions to make sure 778 * bpf programs cannot attach to them and cause llist corruptions. 779 */ 780 static void notrace *unit_alloc(struct bpf_mem_cache *c) 781 { 782 struct llist_node *llnode = NULL; 783 unsigned long flags; 784 int cnt = 0; 785 786 /* Disable irqs to prevent the following race for majority of prog types: 787 * prog_A 788 * bpf_mem_alloc 789 * preemption or irq -> prog_B 790 * bpf_mem_alloc 791 * 792 * but prog_B could be a perf_event NMI prog. 793 * Use per-cpu 'active' counter to order free_list access between 794 * unit_alloc/unit_free/bpf_mem_refill. 795 */ 796 local_irq_save(flags); 797 if (local_inc_return(&c->active) == 1) { 798 llnode = __llist_del_first(&c->free_llist); 799 if (llnode) { 800 cnt = --c->free_cnt; 801 *(struct bpf_mem_cache **)llnode = c; 802 } 803 } 804 local_dec(&c->active); 805 806 WARN_ON(cnt < 0); 807 808 if (cnt < c->low_watermark) 809 irq_work_raise(c); 810 /* Enable IRQ after the enqueue of irq work completes, so irq work 811 * will run after IRQ is enabled and free_llist may be refilled by 812 * irq work before other task preempts current task. 813 */ 814 local_irq_restore(flags); 815 816 return llnode; 817 } 818 819 /* Though 'ptr' object could have been allocated on a different cpu 820 * add it to the free_llist of the current cpu. 821 * Let kfree() logic deal with it when it's later called from irq_work. 822 */ 823 static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) 824 { 825 struct llist_node *llnode = ptr - LLIST_NODE_SZ; 826 unsigned long flags; 827 int cnt = 0; 828 829 BUILD_BUG_ON(LLIST_NODE_SZ > 8); 830 831 /* 832 * Remember bpf_mem_cache that allocated this object. 833 * The hint is not accurate. 834 */ 835 c->tgt = *(struct bpf_mem_cache **)llnode; 836 837 local_irq_save(flags); 838 if (local_inc_return(&c->active) == 1) { 839 __llist_add(llnode, &c->free_llist); 840 cnt = ++c->free_cnt; 841 } else { 842 /* unit_free() cannot fail. Therefore add an object to atomic 843 * llist. free_bulk() will drain it. Though free_llist_extra is 844 * a per-cpu list we have to use atomic llist_add here, since 845 * it also can be interrupted by bpf nmi prog that does another 846 * unit_free() into the same free_llist_extra. 847 */ 848 llist_add(llnode, &c->free_llist_extra); 849 } 850 local_dec(&c->active); 851 852 if (cnt > c->high_watermark) 853 /* free few objects from current cpu into global kmalloc pool */ 854 irq_work_raise(c); 855 /* Enable IRQ after irq_work_raise() completes, otherwise when current 856 * task is preempted by task which does unit_alloc(), unit_alloc() may 857 * return NULL unexpectedly because irq work is already pending but can 858 * not been triggered and free_llist can not be refilled timely. 859 */ 860 local_irq_restore(flags); 861 } 862 863 static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) 864 { 865 struct llist_node *llnode = ptr - LLIST_NODE_SZ; 866 unsigned long flags; 867 868 c->tgt = *(struct bpf_mem_cache **)llnode; 869 870 local_irq_save(flags); 871 if (local_inc_return(&c->active) == 1) { 872 if (__llist_add(llnode, &c->free_by_rcu)) 873 c->free_by_rcu_tail = llnode; 874 } else { 875 llist_add(llnode, &c->free_llist_extra_rcu); 876 } 877 local_dec(&c->active); 878 879 if (!atomic_read(&c->call_rcu_in_progress)) 880 irq_work_raise(c); 881 local_irq_restore(flags); 882 } 883 884 /* Called from BPF program or from sys_bpf syscall. 885 * In both cases migration is disabled. 886 */ 887 void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size) 888 { 889 int idx; 890 void *ret; 891 892 if (!size) 893 return NULL; 894 895 if (!ma->percpu) 896 size += LLIST_NODE_SZ; 897 idx = bpf_mem_cache_idx(size); 898 if (idx < 0) 899 return NULL; 900 901 ret = unit_alloc(this_cpu_ptr(ma->caches)->cache + idx); 902 return !ret ? NULL : ret + LLIST_NODE_SZ; 903 } 904 905 void notrace bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr) 906 { 907 struct bpf_mem_cache *c; 908 int idx; 909 910 if (!ptr) 911 return; 912 913 c = *(void **)(ptr - LLIST_NODE_SZ); 914 idx = bpf_mem_cache_idx(c->unit_size); 915 if (WARN_ON_ONCE(idx < 0)) 916 return; 917 918 unit_free(this_cpu_ptr(ma->caches)->cache + idx, ptr); 919 } 920 921 void notrace bpf_mem_free_rcu(struct bpf_mem_alloc *ma, void *ptr) 922 { 923 struct bpf_mem_cache *c; 924 int idx; 925 926 if (!ptr) 927 return; 928 929 c = *(void **)(ptr - LLIST_NODE_SZ); 930 idx = bpf_mem_cache_idx(c->unit_size); 931 if (WARN_ON_ONCE(idx < 0)) 932 return; 933 934 unit_free_rcu(this_cpu_ptr(ma->caches)->cache + idx, ptr); 935 } 936 937 void notrace *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma) 938 { 939 void *ret; 940 941 ret = unit_alloc(this_cpu_ptr(ma->cache)); 942 return !ret ? NULL : ret + LLIST_NODE_SZ; 943 } 944 945 void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) 946 { 947 if (!ptr) 948 return; 949 950 unit_free(this_cpu_ptr(ma->cache), ptr); 951 } 952 953 void notrace bpf_mem_cache_free_rcu(struct bpf_mem_alloc *ma, void *ptr) 954 { 955 if (!ptr) 956 return; 957 958 unit_free_rcu(this_cpu_ptr(ma->cache), ptr); 959 } 960 961 /* Directly does a kfree() without putting 'ptr' back to the free_llist 962 * for reuse and without waiting for a rcu_tasks_trace gp. 963 * The caller must first go through the rcu_tasks_trace gp for 'ptr' 964 * before calling bpf_mem_cache_raw_free(). 965 * It could be used when the rcu_tasks_trace callback does not have 966 * a hold on the original bpf_mem_alloc object that allocated the 967 * 'ptr'. This should only be used in the uncommon code path. 968 * Otherwise, the bpf_mem_alloc's free_llist cannot be refilled 969 * and may affect performance. 970 */ 971 void bpf_mem_cache_raw_free(void *ptr) 972 { 973 if (!ptr) 974 return; 975 976 kfree(ptr - LLIST_NODE_SZ); 977 } 978 979 /* When flags == GFP_KERNEL, it signals that the caller will not cause 980 * deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use 981 * kmalloc if the free_llist is empty. 982 */ 983 void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) 984 { 985 struct bpf_mem_cache *c; 986 void *ret; 987 988 c = this_cpu_ptr(ma->cache); 989 990 ret = unit_alloc(c); 991 if (!ret && flags == GFP_KERNEL) { 992 struct mem_cgroup *memcg, *old_memcg; 993 994 memcg = get_memcg(c); 995 old_memcg = set_active_memcg(memcg); 996 ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT); 997 if (ret) 998 *(struct bpf_mem_cache **)ret = c; 999 set_active_memcg(old_memcg); 1000 mem_cgroup_put(memcg); 1001 } 1002 1003 return !ret ? NULL : ret + LLIST_NODE_SZ; 1004 } 1005