// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
#include <linux/module.h>
#include <linux/static_call.h>

/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
};
const struct bpf_prog_ops bpf_extension_prog_ops = {
};

/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)

static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];

/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);

bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
	enum bpf_attach_type eatype = prog->expected_attach_type;

	return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
	       eatype == BPF_MODIFY_RETURN;
}

void *bpf_jit_alloc_exec_page(void)
{
	void *image;

	image = bpf_jit_alloc_exec(PAGE_SIZE);
	if (!image)
		return NULL;

	set_vm_flush_reset_perms(image);
	/* Keep image as writeable. The alternative is to keep flipping ro/rw
	 * every time a new program is attached or detached.
	 */
	set_memory_x((long)image, 1);
	return image;
}

void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
	ksym->start = (unsigned long) data;
	ksym->end = ksym->start + PAGE_SIZE;
	bpf_ksym_add(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, false, ksym->name);
}

void bpf_image_ksym_del(struct bpf_ksym *ksym)
{
	bpf_ksym_del(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, true, ksym->name);
}

static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
	struct bpf_trampoline *tr;
	struct hlist_head *head;
	int i;

	mutex_lock(&trampoline_mutex);
	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
	hlist_for_each_entry(tr, head, hlist) {
		if (tr->key == key) {
			refcount_inc(&tr->refcnt);
			goto out;
		}
	}
	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
	if (!tr)
		goto out;

	tr->key = key;
	INIT_HLIST_NODE(&tr->hlist);
	hlist_add_head(&tr->hlist, head);
	refcount_set(&tr->refcnt, 1);
	mutex_init(&tr->mutex);
	for (i = 0; i < BPF_TRAMP_MAX; i++)
		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
out:
	mutex_unlock(&trampoline_mutex);
	return tr;
}

static int bpf_trampoline_module_get(struct bpf_trampoline *tr)
{
	struct module *mod;
	int err = 0;

	preempt_disable();
	mod = __module_text_address((unsigned long) tr->func.addr);
	if (mod && !try_module_get(mod))
		err = -ENOENT;
	preempt_enable();
	tr->mod = mod;
	return err;
}

static void bpf_trampoline_module_put(struct bpf_trampoline *tr)
{
	module_put(tr->mod);
	tr->mod = NULL;
}

static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);

	if (!ret)
		bpf_trampoline_module_put(tr);
	return ret;
}

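/* Switch the patched call at the target function's entry from the old
 * trampoline image to the new one, via the ftrace direct API when the
 * site is ftrace-managed, otherwise by text poking.
 */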
static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
	return ret;
}

/* first time registering */
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
	void *ip = tr->func.addr;
	unsigned long faddr;
	int ret;

	faddr = ftrace_location((unsigned long)ip);
	if (faddr)
		tr->func.ftrace_managed = true;

	if (bpf_trampoline_module_get(tr))
		return -ENOENT;

	if (tr->func.ftrace_managed)
		ret = register_ftrace_direct((long)ip, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);

	if (ret)
		bpf_trampoline_module_put(tr);
	return ret;
}

static struct bpf_tramp_progs *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg)
{
	const struct bpf_prog_aux *aux;
	struct bpf_tramp_progs *tprogs;
	struct bpf_prog **progs;
	int kind;

	*total = 0;
	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
	if (!tprogs)
		return ERR_PTR(-ENOMEM);

	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
		tprogs[kind].nr_progs = tr->progs_cnt[kind];
		*total += tr->progs_cnt[kind];
		progs = tprogs[kind].progs;

		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist) {
			*ip_arg |= aux->prog->call_get_func_ip;
			*progs++ = aux->prog;
		}
	}
	return tprogs;
}

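/* Freeing a trampoline image goes through a chain of deferred callbacks
 * (RCU-tasks / RCU-tasks-trace grace periods, an optional percpu_ref kill
 * and finally a workqueue item), so the page is released only after every
 * task has left both the trampoline asm and the progs it calls.
 * bpf_tramp_image_put() below documents the exact ordering.
 */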
static void __bpf_tramp_image_put_deferred(struct work_struct *work)
{
	struct bpf_tramp_image *im;

	im = container_of(work, struct bpf_tramp_image, work);
	bpf_image_ksym_del(&im->ksym);
	bpf_jit_free_exec(im->image);
	bpf_jit_uncharge_modmem(PAGE_SIZE);
	percpu_ref_exit(&im->pcref);
	kfree_rcu(im, rcu);
}

/* callback, fexit step 3 or fentry step 2 */
static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
{
	struct bpf_tramp_image *im;

	im = container_of(rcu, struct bpf_tramp_image, rcu);
	INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
	schedule_work(&im->work);
}

/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
static void __bpf_tramp_image_release(struct percpu_ref *pcref)
{
	struct bpf_tramp_image *im;

	im = container_of(pcref, struct bpf_tramp_image, pcref);
	call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

/* callback, fexit or fentry step 1 */
static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
{
	struct bpf_tramp_image *im;

	im = container_of(rcu, struct bpf_tramp_image, rcu);
	if (im->ip_after_call)
		/* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
		percpu_ref_kill(&im->pcref);
	else
		/* the case of fentry trampoline */
		call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

static void bpf_tramp_image_put(struct bpf_tramp_image *im)
{
	/* The trampoline image that calls the original function is using:
	 * rcu_read_lock_trace to protect sleepable bpf progs
	 * rcu_read_lock to protect normal bpf progs
	 * percpu_ref to protect trampoline itself
	 * rcu tasks to protect trampoline asm not covered by percpu_ref
	 * (which are the few asm insns before __bpf_tramp_enter and
	 * after __bpf_tramp_exit)
	 *
	 * The trampoline is unreachable before bpf_tramp_image_put().
	 *
	 * First, patch the trampoline to avoid calling into fexit progs.
	 * The progs will be freed even if the original function is still
	 * executing or sleeping.
	 * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait for the
	 * first few asm instructions to execute and call into
	 * __bpf_tramp_enter->percpu_ref_get.
	 * Then use percpu_ref_kill to wait for the trampoline and the original
	 * function to finish.
	 * Then use call_rcu_tasks() to make sure the few asm insns in
	 * the trampoline epilogue are done as well.
	 *
	 * In the !PREEMPT case a task that got interrupted in the first asm
	 * insns won't go through an RCU quiescent state which the
	 * percpu_ref_kill will be waiting for. Hence the first
	 * call_rcu_tasks() is not necessary.
	 */
	if (im->ip_after_call) {
		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
					     NULL, im->ip_epilogue);
		WARN_ON(err);
		if (IS_ENABLED(CONFIG_PREEMPTION))
			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
		else
			percpu_ref_kill(&im->pcref);
		return;
	}

	/* The trampoline without fexit and fmod_ret progs doesn't call the
	 * original function and doesn't use percpu_ref.
	 * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * Then use call_rcu_tasks() to wait for the rest of trampoline asm
	 * and normal progs.
	 */
	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}

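/* Allocate and publish a new trampoline image: one executable page,
 * charged against the JIT memory limit, protected by a percpu_ref and
 * registered as a BPF ksym so it shows up in perf and stack traces.
 */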
286 */ 287 call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); 288 } 289 290 static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) 291 { 292 struct bpf_tramp_image *im; 293 struct bpf_ksym *ksym; 294 void *image; 295 int err = -ENOMEM; 296 297 im = kzalloc(sizeof(*im), GFP_KERNEL); 298 if (!im) 299 goto out; 300 301 err = bpf_jit_charge_modmem(PAGE_SIZE); 302 if (err) 303 goto out_free_im; 304 305 err = -ENOMEM; 306 im->image = image = bpf_jit_alloc_exec_page(); 307 if (!image) 308 goto out_uncharge; 309 310 err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); 311 if (err) 312 goto out_free_image; 313 314 ksym = &im->ksym; 315 INIT_LIST_HEAD_RCU(&ksym->lnode); 316 snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx); 317 bpf_image_ksym_add(image, ksym); 318 return im; 319 320 out_free_image: 321 bpf_jit_free_exec(im->image); 322 out_uncharge: 323 bpf_jit_uncharge_modmem(PAGE_SIZE); 324 out_free_im: 325 kfree(im); 326 out: 327 return ERR_PTR(err); 328 } 329 330 static int bpf_trampoline_update(struct bpf_trampoline *tr) 331 { 332 struct bpf_tramp_image *im; 333 struct bpf_tramp_progs *tprogs; 334 u32 flags = BPF_TRAMP_F_RESTORE_REGS; 335 bool ip_arg = false; 336 int err, total; 337 338 tprogs = bpf_trampoline_get_progs(tr, &total, &ip_arg); 339 if (IS_ERR(tprogs)) 340 return PTR_ERR(tprogs); 341 342 if (total == 0) { 343 err = unregister_fentry(tr, tr->cur_image->image); 344 bpf_tramp_image_put(tr->cur_image); 345 tr->cur_image = NULL; 346 tr->selector = 0; 347 goto out; 348 } 349 350 im = bpf_tramp_image_alloc(tr->key, tr->selector); 351 if (IS_ERR(im)) { 352 err = PTR_ERR(im); 353 goto out; 354 } 355 356 if (tprogs[BPF_TRAMP_FEXIT].nr_progs || 357 tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) 358 flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; 359 360 if (ip_arg) 361 flags |= BPF_TRAMP_F_IP_ARG; 362 363 err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, 364 &tr->func.model, flags, tprogs, 365 tr->func.addr); 366 if (err < 0) 367 goto out; 368 369 WARN_ON(tr->cur_image && tr->selector == 0); 370 WARN_ON(!tr->cur_image && tr->selector); 371 if (tr->cur_image) 372 /* progs already running at this address */ 373 err = modify_fentry(tr, tr->cur_image->image, im->image); 374 else 375 /* first time registering */ 376 err = register_fentry(tr, im->image); 377 if (err) 378 goto out; 379 if (tr->cur_image) 380 bpf_tramp_image_put(tr->cur_image); 381 tr->cur_image = im; 382 tr->selector++; 383 out: 384 kfree(tprogs); 385 return err; 386 } 387 388 static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) 389 { 390 switch (prog->expected_attach_type) { 391 case BPF_TRACE_FENTRY: 392 return BPF_TRAMP_FENTRY; 393 case BPF_MODIFY_RETURN: 394 return BPF_TRAMP_MODIFY_RETURN; 395 case BPF_TRACE_FEXIT: 396 return BPF_TRAMP_FEXIT; 397 case BPF_LSM_MAC: 398 if (!prog->aux->attach_func_proto->type) 399 /* The function returns void, we cannot modify its 400 * return value. 401 */ 402 return BPF_TRAMP_FEXIT; 403 else 404 return BPF_TRAMP_MODIFY_RETURN; 405 default: 406 return BPF_TRAMP_REPLACE; 407 } 408 } 409 410 int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) 411 { 412 enum bpf_tramp_prog_type kind; 413 int err = 0; 414 int cnt; 415 416 kind = bpf_attach_type_to_tramp(prog); 417 mutex_lock(&tr->mutex); 418 if (tr->extension_prog) { 419 /* cannot attach fentry/fexit if extension prog is attached. 420 * cannot overwrite extension prog either. 
421 */ 422 err = -EBUSY; 423 goto out; 424 } 425 cnt = tr->progs_cnt[BPF_TRAMP_FENTRY] + tr->progs_cnt[BPF_TRAMP_FEXIT]; 426 if (kind == BPF_TRAMP_REPLACE) { 427 /* Cannot attach extension if fentry/fexit are in use. */ 428 if (cnt) { 429 err = -EBUSY; 430 goto out; 431 } 432 tr->extension_prog = prog; 433 err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL, 434 prog->bpf_func); 435 goto out; 436 } 437 if (cnt >= BPF_MAX_TRAMP_PROGS) { 438 err = -E2BIG; 439 goto out; 440 } 441 if (!hlist_unhashed(&prog->aux->tramp_hlist)) { 442 /* prog already linked */ 443 err = -EBUSY; 444 goto out; 445 } 446 hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]); 447 tr->progs_cnt[kind]++; 448 err = bpf_trampoline_update(tr); 449 if (err) { 450 hlist_del_init(&prog->aux->tramp_hlist); 451 tr->progs_cnt[kind]--; 452 } 453 out: 454 mutex_unlock(&tr->mutex); 455 return err; 456 } 457 458 /* bpf_trampoline_unlink_prog() should never fail. */ 459 int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) 460 { 461 enum bpf_tramp_prog_type kind; 462 int err; 463 464 kind = bpf_attach_type_to_tramp(prog); 465 mutex_lock(&tr->mutex); 466 if (kind == BPF_TRAMP_REPLACE) { 467 WARN_ON_ONCE(!tr->extension_prog); 468 err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, 469 tr->extension_prog->bpf_func, NULL); 470 tr->extension_prog = NULL; 471 goto out; 472 } 473 hlist_del_init(&prog->aux->tramp_hlist); 474 tr->progs_cnt[kind]--; 475 err = bpf_trampoline_update(tr); 476 out: 477 mutex_unlock(&tr->mutex); 478 return err; 479 } 480 481 struct bpf_trampoline *bpf_trampoline_get(u64 key, 482 struct bpf_attach_target_info *tgt_info) 483 { 484 struct bpf_trampoline *tr; 485 486 tr = bpf_trampoline_lookup(key); 487 if (!tr) 488 return NULL; 489 490 mutex_lock(&tr->mutex); 491 if (tr->func.addr) 492 goto out; 493 494 memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); 495 tr->func.addr = (void *)tgt_info->tgt_addr; 496 out: 497 mutex_unlock(&tr->mutex); 498 return tr; 499 } 500 501 void bpf_trampoline_put(struct bpf_trampoline *tr) 502 { 503 if (!tr) 504 return; 505 mutex_lock(&trampoline_mutex); 506 if (!refcount_dec_and_test(&tr->refcnt)) 507 goto out; 508 WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); 509 if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FENTRY]))) 510 goto out; 511 if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) 512 goto out; 513 /* This code will be executed even when the last bpf_tramp_image 514 * is alive. All progs are detached from the trampoline and the 515 * trampoline image is patched with jmp into epilogue to skip 516 * fexit progs. The fentry-only trampoline will be freed via 517 * multiple rcu callbacks. 
518 */ 519 hlist_del(&tr->hlist); 520 kfree(tr); 521 out: 522 mutex_unlock(&trampoline_mutex); 523 } 524 525 #define NO_START_TIME 1 526 static __always_inline u64 notrace bpf_prog_start_time(void) 527 { 528 u64 start = NO_START_TIME; 529 530 if (static_branch_unlikely(&bpf_stats_enabled_key)) { 531 start = sched_clock(); 532 if (unlikely(!start)) 533 start = NO_START_TIME; 534 } 535 return start; 536 } 537 538 static void notrace inc_misses_counter(struct bpf_prog *prog) 539 { 540 struct bpf_prog_stats *stats; 541 unsigned int flags; 542 543 stats = this_cpu_ptr(prog->stats); 544 flags = u64_stats_update_begin_irqsave(&stats->syncp); 545 u64_stats_inc(&stats->misses); 546 u64_stats_update_end_irqrestore(&stats->syncp, flags); 547 } 548 549 /* The logic is similar to bpf_prog_run(), but with an explicit 550 * rcu_read_lock() and migrate_disable() which are required 551 * for the trampoline. The macro is split into 552 * call __bpf_prog_enter 553 * call prog->bpf_func 554 * call __bpf_prog_exit 555 * 556 * __bpf_prog_enter returns: 557 * 0 - skip execution of the bpf prog 558 * 1 - execute bpf prog 559 * [2..MAX_U64] - execute bpf prog and record execution time. 560 * This is start time. 561 */ 562 u64 notrace __bpf_prog_enter(struct bpf_prog *prog) 563 __acquires(RCU) 564 { 565 rcu_read_lock(); 566 migrate_disable(); 567 if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { 568 inc_misses_counter(prog); 569 return 0; 570 } 571 return bpf_prog_start_time(); 572 } 573 574 static void notrace update_prog_stats(struct bpf_prog *prog, 575 u64 start) 576 { 577 struct bpf_prog_stats *stats; 578 579 if (static_branch_unlikely(&bpf_stats_enabled_key) && 580 /* static_key could be enabled in __bpf_prog_enter* 581 * and disabled in __bpf_prog_exit*. 582 * And vice versa. 583 * Hence check that 'start' is valid. 
584 */ 585 start > NO_START_TIME) { 586 unsigned long flags; 587 588 stats = this_cpu_ptr(prog->stats); 589 flags = u64_stats_update_begin_irqsave(&stats->syncp); 590 u64_stats_inc(&stats->cnt); 591 u64_stats_add(&stats->nsecs, sched_clock() - start); 592 u64_stats_update_end_irqrestore(&stats->syncp, flags); 593 } 594 } 595 596 void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) 597 __releases(RCU) 598 { 599 update_prog_stats(prog, start); 600 __this_cpu_dec(*(prog->active)); 601 migrate_enable(); 602 rcu_read_unlock(); 603 } 604 605 u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) 606 { 607 rcu_read_lock_trace(); 608 migrate_disable(); 609 might_fault(); 610 if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { 611 inc_misses_counter(prog); 612 return 0; 613 } 614 return bpf_prog_start_time(); 615 } 616 617 void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) 618 { 619 update_prog_stats(prog, start); 620 __this_cpu_dec(*(prog->active)); 621 migrate_enable(); 622 rcu_read_unlock_trace(); 623 } 624 625 void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) 626 { 627 percpu_ref_get(&tr->pcref); 628 } 629 630 void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) 631 { 632 percpu_ref_put(&tr->pcref); 633 } 634 635 int __weak 636 arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, 637 const struct btf_func_model *m, u32 flags, 638 struct bpf_tramp_progs *tprogs, 639 void *orig_call) 640 { 641 return -ENOTSUPP; 642 } 643 644 static int __init init_trampolines(void) 645 { 646 int i; 647 648 for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) 649 INIT_HLIST_HEAD(&trampoline_table[i]); 650 return 0; 651 } 652 late_initcall(init_trampolines); 653