1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Functions to manage eBPF programs attached to cgroups 4 * 5 * Copyright (c) 2016 Daniel Mack 6 */ 7 8 #include <linux/kernel.h> 9 #include <linux/atomic.h> 10 #include <linux/cgroup.h> 11 #include <linux/filter.h> 12 #include <linux/slab.h> 13 #include <linux/sysctl.h> 14 #include <linux/string.h> 15 #include <linux/bpf.h> 16 #include <linux/bpf-cgroup.h> 17 #include <linux/bpf_lsm.h> 18 #include <linux/bpf_verifier.h> 19 #include <net/sock.h> 20 #include <net/bpf_sk_storage.h> 21 22 #include "../cgroup/cgroup-internal.h" 23 24 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); 25 EXPORT_SYMBOL(cgroup_bpf_enabled_key); 26 27 /* 28 * cgroup bpf destruction makes heavy use of work items and there can be a lot 29 * of concurrent destructions. Use a separate workqueue so that cgroup bpf 30 * destruction work items don't end up filling up max_active of system_percpu_wq 31 * which may lead to deadlock. 32 */ 33 static struct workqueue_struct *cgroup_bpf_destroy_wq; 34 35 static int __init cgroup_bpf_wq_init(void) 36 { 37 cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 38 WQ_PERCPU, 1); 39 if (!cgroup_bpf_destroy_wq) 40 panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); 41 return 0; 42 } 43 core_initcall(cgroup_bpf_wq_init); 44 45 static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, 46 unsigned long action, void *data); 47 48 static struct notifier_block cgroup_bpf_lifetime_nb = { 49 .notifier_call = cgroup_bpf_lifetime_notify, 50 }; 51 52 void __init cgroup_bpf_lifetime_notifier_init(void) 53 { 54 BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier, 55 &cgroup_bpf_lifetime_nb)); 56 } 57 58 #ifdef CONFIG_BPF_LSM 59 struct cgroup_lsm_atype { 60 u32 attach_btf_id; 61 int refcnt; 62 bool returns_errno; 63 }; 64 65 static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; 66 67 static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) 68 { 69 if (atype >= CGROUP_LSM_START && atype <= CGROUP_LSM_END) 70 return READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno); 71 return true; 72 } 73 #else 74 static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) 75 { 76 return true; 77 } 78 #endif 79 80 /* __always_inline is necessary to prevent indirect call through run_prog 81 * function pointer. 82 */ 83 static __always_inline int 84 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, 85 enum cgroup_bpf_attach_type atype, 86 const void *ctx, bpf_prog_run_fn run_prog, 87 int retval, u32 *ret_flags) 88 { 89 const struct bpf_prog_array_item *item; 90 const struct bpf_prog *prog; 91 const struct bpf_prog_array *array; 92 struct bpf_run_ctx *old_run_ctx; 93 struct bpf_cg_run_ctx run_ctx; 94 u32 func_ret; 95 96 run_ctx.retval = retval; 97 rcu_read_lock_dont_migrate(); 98 array = rcu_dereference(cgrp->effective[atype]); 99 item = &array->items[0]; 100 old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); 101 while ((prog = READ_ONCE(item->prog))) { 102 run_ctx.prog_item = item; 103 func_ret = run_prog(prog, ctx); 104 if (ret_flags) { 105 *(ret_flags) |= (func_ret >> 1); 106 func_ret &= 1; 107 } 108 if (!func_ret && cgroup_bpf_hook_returns_errno(atype) && 109 !IS_ERR_VALUE((long)run_ctx.retval)) 110 run_ctx.retval = -EPERM; 111 item++; 112 } 113 bpf_reset_run_ctx(old_run_ctx); 114 rcu_read_unlock_migrate(); 115 return run_ctx.retval; 116 } 117 118 unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, 119 const struct bpf_insn *insn) 120 { 121 const struct bpf_prog *shim_prog; 122 struct sock *sk; 123 struct cgroup *cgrp; 124 int ret = 0; 125 u64 *args; 126 127 args = (u64 *)ctx; 128 sk = (void *)(unsigned long)args[0]; 129 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ 130 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); 131 132 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 133 if (likely(cgrp)) 134 ret = bpf_prog_run_array_cg(&cgrp->bpf, 135 shim_prog->aux->cgroup_atype, 136 ctx, bpf_prog_run, 0, NULL); 137 return ret; 138 } 139 140 unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, 141 const struct bpf_insn *insn) 142 { 143 const struct bpf_prog *shim_prog; 144 struct socket *sock; 145 struct cgroup *cgrp; 146 int ret = 0; 147 u64 *args; 148 149 args = (u64 *)ctx; 150 sock = (void *)(unsigned long)args[0]; 151 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ 152 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); 153 154 cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); 155 if (likely(cgrp)) 156 ret = bpf_prog_run_array_cg(&cgrp->bpf, 157 shim_prog->aux->cgroup_atype, 158 ctx, bpf_prog_run, 0, NULL); 159 return ret; 160 } 161 162 unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, 163 const struct bpf_insn *insn) 164 { 165 const struct bpf_prog *shim_prog; 166 struct cgroup *cgrp; 167 int ret = 0; 168 169 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ 170 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); 171 172 /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ 173 cgrp = task_dfl_cgroup(current); 174 if (likely(cgrp)) 175 ret = bpf_prog_run_array_cg(&cgrp->bpf, 176 shim_prog->aux->cgroup_atype, 177 ctx, bpf_prog_run, 0, NULL); 178 return ret; 179 } 180 181 #ifdef CONFIG_BPF_LSM 182 static enum cgroup_bpf_attach_type 183 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) 184 { 185 int i; 186 187 lockdep_assert_held(&cgroup_mutex); 188 189 if (attach_type != BPF_LSM_CGROUP) 190 return to_cgroup_bpf_attach_type(attach_type); 191 192 for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) 193 if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) 194 return CGROUP_LSM_START + i; 195 196 for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) 197 if (cgroup_lsm_atype[i].attach_btf_id == 0) 198 return CGROUP_LSM_START + i; 199 200 return -E2BIG; 201 202 } 203 204 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) 205 { 206 int i = cgroup_atype - CGROUP_LSM_START; 207 208 lockdep_assert_held(&cgroup_mutex); 209 210 if (!cgroup_lsm_atype[i].attach_btf_id) { 211 cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; 212 WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, 213 bpf_lsm_hook_returns_errno(attach_btf_id)); 214 } else { 215 WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); 216 } 217 cgroup_lsm_atype[i].refcnt++; 218 } 219 220 void bpf_cgroup_atype_put(int cgroup_atype) 221 { 222 int i = cgroup_atype - CGROUP_LSM_START; 223 224 cgroup_lock(); 225 if (--cgroup_lsm_atype[i].refcnt <= 0) { 226 WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true); 227 cgroup_lsm_atype[i].attach_btf_id = 0; 228 } 229 WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); 230 cgroup_unlock(); 231 } 232 #else 233 static enum cgroup_bpf_attach_type 234 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) 235 { 236 if (attach_type != BPF_LSM_CGROUP) 237 return to_cgroup_bpf_attach_type(attach_type); 238 return -EOPNOTSUPP; 239 } 240 #endif /* CONFIG_BPF_LSM */ 241 242 static void cgroup_bpf_offline(struct cgroup *cgrp) 243 { 244 cgroup_get(cgrp); 245 percpu_ref_kill(&cgrp->bpf.refcnt); 246 } 247 248 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) 249 { 250 enum bpf_cgroup_storage_type stype; 251 252 for_each_cgroup_storage_type(stype) 253 bpf_cgroup_storage_free(storages[stype]); 254 } 255 256 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], 257 struct bpf_cgroup_storage *new_storages[], 258 enum bpf_attach_type type, 259 struct bpf_prog *prog, 260 struct cgroup *cgrp) 261 { 262 enum bpf_cgroup_storage_type stype; 263 struct bpf_cgroup_storage_key key; 264 struct bpf_map *map; 265 266 key.cgroup_inode_id = cgroup_id(cgrp); 267 key.attach_type = type; 268 269 for_each_cgroup_storage_type(stype) { 270 map = prog->aux->cgroup_storage[stype]; 271 if (!map) 272 continue; 273 274 storages[stype] = cgroup_storage_lookup((void *)map, &key, false); 275 if (storages[stype]) 276 continue; 277 278 storages[stype] = bpf_cgroup_storage_alloc(prog, stype); 279 if (IS_ERR(storages[stype])) { 280 bpf_cgroup_storages_free(new_storages); 281 return -ENOMEM; 282 } 283 284 new_storages[stype] = storages[stype]; 285 } 286 287 return 0; 288 } 289 290 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], 291 struct bpf_cgroup_storage *src[]) 292 { 293 enum bpf_cgroup_storage_type stype; 294 295 for_each_cgroup_storage_type(stype) 296 dst[stype] = src[stype]; 297 } 298 299 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], 300 struct cgroup *cgrp, 301 enum bpf_attach_type attach_type) 302 { 303 enum bpf_cgroup_storage_type stype; 304 305 for_each_cgroup_storage_type(stype) 306 bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); 307 } 308 309 /* Called when bpf_cgroup_link is auto-detached from dying cgroup. 310 * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It 311 * doesn't free link memory, which will eventually be done by bpf_link's 312 * release() callback, when its last FD is closed. 313 */ 314 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) 315 { 316 cgroup_put(link->cgroup); 317 link->cgroup = NULL; 318 } 319 320 /** 321 * cgroup_bpf_release() - put references of all bpf programs and 322 * release all cgroup bpf data 323 * @work: work structure embedded into the cgroup to modify 324 */ 325 static void cgroup_bpf_release(struct work_struct *work) 326 { 327 struct cgroup *p, *cgrp = container_of(work, struct cgroup, 328 bpf.release_work); 329 struct bpf_prog_array *old_array; 330 struct list_head *storages = &cgrp->bpf.storages; 331 struct bpf_cgroup_storage *storage, *stmp; 332 333 unsigned int atype; 334 335 cgroup_lock(); 336 337 for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { 338 struct hlist_head *progs = &cgrp->bpf.progs[atype]; 339 struct bpf_prog_list *pl; 340 struct hlist_node *pltmp; 341 342 hlist_for_each_entry_safe(pl, pltmp, progs, node) { 343 hlist_del(&pl->node); 344 if (pl->prog) { 345 if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) 346 bpf_trampoline_unlink_cgroup_shim(pl->prog); 347 bpf_prog_put(pl->prog); 348 } 349 if (pl->link) { 350 if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) 351 bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); 352 bpf_cgroup_link_auto_detach(pl->link); 353 } 354 kfree(pl); 355 static_branch_dec(&cgroup_bpf_enabled_key[atype]); 356 } 357 old_array = rcu_dereference_protected( 358 cgrp->bpf.effective[atype], 359 lockdep_is_held(&cgroup_mutex)); 360 bpf_prog_array_free(old_array); 361 } 362 363 list_for_each_entry_safe(storage, stmp, storages, list_cg) { 364 bpf_cgroup_storage_unlink(storage); 365 bpf_cgroup_storage_free(storage); 366 } 367 368 cgroup_unlock(); 369 370 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 371 cgroup_bpf_put(p); 372 373 percpu_ref_exit(&cgrp->bpf.refcnt); 374 cgroup_put(cgrp); 375 } 376 377 /** 378 * cgroup_bpf_release_fn() - callback used to schedule releasing 379 * of bpf cgroup data 380 * @ref: percpu ref counter structure 381 */ 382 static void cgroup_bpf_release_fn(struct percpu_ref *ref) 383 { 384 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); 385 386 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); 387 queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); 388 } 389 390 /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through 391 * link or direct prog. 392 */ 393 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) 394 { 395 if (pl->prog) 396 return pl->prog; 397 if (pl->link) 398 return pl->link->link.prog; 399 return NULL; 400 } 401 402 /* count number of elements in the list. 403 * it's slow but the list cannot be long 404 */ 405 static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) 406 { 407 struct bpf_prog_list *pl; 408 u32 cnt = 0; 409 410 hlist_for_each_entry(pl, head, node) { 411 if (!prog_list_prog(pl)) 412 continue; 413 if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) 414 (*preorder_cnt)++; 415 cnt++; 416 } 417 return cnt; 418 } 419 420 /* if parent has non-overridable prog attached, 421 * disallow attaching new programs to the descendent cgroup. 422 * if parent has overridable or multi-prog, allow attaching 423 */ 424 static bool hierarchy_allows_attach(struct cgroup *cgrp, 425 enum cgroup_bpf_attach_type atype) 426 { 427 struct cgroup *p; 428 429 p = cgroup_parent(cgrp); 430 if (!p) 431 return true; 432 do { 433 u32 flags = p->bpf.flags[atype]; 434 u32 cnt; 435 436 if (flags & BPF_F_ALLOW_MULTI) 437 return true; 438 cnt = prog_list_length(&p->bpf.progs[atype], NULL); 439 WARN_ON_ONCE(cnt > 1); 440 if (cnt == 1) 441 return !!(flags & BPF_F_ALLOW_OVERRIDE); 442 p = cgroup_parent(p); 443 } while (p); 444 return true; 445 } 446 447 /* compute a chain of effective programs for a given cgroup: 448 * start from the list of programs in this cgroup and add 449 * all parent programs. 450 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding 451 * to programs in this cgroup 452 */ 453 static int compute_effective_progs(struct cgroup *cgrp, 454 enum cgroup_bpf_attach_type atype, 455 struct bpf_prog_array **array) 456 { 457 struct bpf_prog_array_item *item; 458 struct bpf_prog_array *progs; 459 struct bpf_prog_list *pl; 460 struct cgroup *p = cgrp; 461 int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; 462 463 /* count number of effective programs by walking parents */ 464 do { 465 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 466 cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); 467 p = cgroup_parent(p); 468 } while (p); 469 470 progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); 471 if (!progs) 472 return -ENOMEM; 473 474 /* populate the array with effective progs */ 475 cnt = 0; 476 p = cgrp; 477 fstart = preorder_cnt; 478 bstart = preorder_cnt - 1; 479 do { 480 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 481 continue; 482 483 init_bstart = bstart; 484 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { 485 if (!prog_list_prog(pl)) 486 continue; 487 488 if (pl->flags & BPF_F_PREORDER) { 489 item = &progs->items[bstart]; 490 bstart--; 491 } else { 492 item = &progs->items[fstart]; 493 fstart++; 494 } 495 item->prog = prog_list_prog(pl); 496 bpf_cgroup_storages_assign(item->cgroup_storage, 497 pl->storage); 498 cnt++; 499 } 500 501 /* reverse pre-ordering progs at this cgroup level */ 502 for (i = bstart + 1, j = init_bstart; i < j; i++, j--) 503 swap(progs->items[i], progs->items[j]); 504 505 } while ((p = cgroup_parent(p))); 506 507 *array = progs; 508 return 0; 509 } 510 511 static void activate_effective_progs(struct cgroup *cgrp, 512 enum cgroup_bpf_attach_type atype, 513 struct bpf_prog_array *old_array) 514 { 515 old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, 516 lockdep_is_held(&cgroup_mutex)); 517 /* free prog array after grace period, since __cgroup_bpf_run_*() 518 * might be still walking the array 519 */ 520 bpf_prog_array_free(old_array); 521 } 522 523 /** 524 * cgroup_bpf_inherit() - inherit effective programs from parent 525 * @cgrp: the cgroup to modify 526 */ 527 static int cgroup_bpf_inherit(struct cgroup *cgrp) 528 { 529 /* has to use marco instead of const int, since compiler thinks 530 * that array below is variable length 531 */ 532 #define NR ARRAY_SIZE(cgrp->bpf.effective) 533 struct bpf_prog_array *arrays[NR] = {}; 534 struct cgroup *p; 535 int ret, i; 536 537 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, 538 GFP_KERNEL); 539 if (ret) 540 return ret; 541 542 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 543 cgroup_bpf_get(p); 544 545 for (i = 0; i < NR; i++) 546 INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); 547 548 INIT_LIST_HEAD(&cgrp->bpf.storages); 549 550 for (i = 0; i < NR; i++) 551 if (compute_effective_progs(cgrp, i, &arrays[i])) 552 goto cleanup; 553 554 for (i = 0; i < NR; i++) 555 activate_effective_progs(cgrp, i, arrays[i]); 556 557 return 0; 558 cleanup: 559 for (i = 0; i < NR; i++) 560 bpf_prog_array_free(arrays[i]); 561 562 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 563 cgroup_bpf_put(p); 564 565 percpu_ref_exit(&cgrp->bpf.refcnt); 566 567 return -ENOMEM; 568 } 569 570 static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, 571 unsigned long action, void *data) 572 { 573 struct cgroup *cgrp = data; 574 int ret = 0; 575 576 if (cgrp->root != &cgrp_dfl_root) 577 return NOTIFY_OK; 578 579 switch (action) { 580 case CGROUP_LIFETIME_ONLINE: 581 ret = cgroup_bpf_inherit(cgrp); 582 break; 583 case CGROUP_LIFETIME_OFFLINE: 584 cgroup_bpf_offline(cgrp); 585 break; 586 } 587 588 return notifier_from_errno(ret); 589 } 590 591 static int update_effective_progs(struct cgroup *cgrp, 592 enum cgroup_bpf_attach_type atype) 593 { 594 struct cgroup_subsys_state *css; 595 int err; 596 597 /* allocate and recompute effective prog arrays */ 598 css_for_each_descendant_pre(css, &cgrp->self) { 599 struct cgroup *desc = container_of(css, struct cgroup, self); 600 601 if (percpu_ref_is_zero(&desc->bpf.refcnt)) 602 continue; 603 604 err = compute_effective_progs(desc, atype, &desc->bpf.inactive); 605 if (err) 606 goto cleanup; 607 } 608 609 /* all allocations were successful. Activate all prog arrays */ 610 css_for_each_descendant_pre(css, &cgrp->self) { 611 struct cgroup *desc = container_of(css, struct cgroup, self); 612 613 if (percpu_ref_is_zero(&desc->bpf.refcnt)) { 614 if (unlikely(desc->bpf.inactive)) { 615 bpf_prog_array_free(desc->bpf.inactive); 616 desc->bpf.inactive = NULL; 617 } 618 continue; 619 } 620 621 activate_effective_progs(desc, atype, desc->bpf.inactive); 622 desc->bpf.inactive = NULL; 623 } 624 625 return 0; 626 627 cleanup: 628 /* oom while computing effective. Free all computed effective arrays 629 * since they were not activated 630 */ 631 css_for_each_descendant_pre(css, &cgrp->self) { 632 struct cgroup *desc = container_of(css, struct cgroup, self); 633 634 bpf_prog_array_free(desc->bpf.inactive); 635 desc->bpf.inactive = NULL; 636 } 637 638 return err; 639 } 640 641 #define BPF_CGROUP_MAX_PROGS 64 642 643 static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, 644 struct bpf_prog *prog, 645 struct bpf_cgroup_link *link, 646 struct bpf_prog *replace_prog, 647 bool allow_multi) 648 { 649 struct bpf_prog_list *pl; 650 651 /* single-attach case */ 652 if (!allow_multi) { 653 if (hlist_empty(progs)) 654 return NULL; 655 return hlist_entry(progs->first, typeof(*pl), node); 656 } 657 658 hlist_for_each_entry(pl, progs, node) { 659 if (prog && pl->prog == prog && prog != replace_prog) 660 /* disallow attaching the same prog twice */ 661 return ERR_PTR(-EINVAL); 662 if (link && pl->link == link) 663 /* disallow attaching the same link twice */ 664 return ERR_PTR(-EINVAL); 665 } 666 667 /* direct prog multi-attach w/ replacement case */ 668 if (replace_prog) { 669 hlist_for_each_entry(pl, progs, node) { 670 if (pl->prog == replace_prog) 671 /* a match found */ 672 return pl; 673 } 674 /* prog to replace not found for cgroup */ 675 return ERR_PTR(-ENOENT); 676 } 677 678 return NULL; 679 } 680 681 static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) 682 { 683 struct bpf_link *link = ERR_PTR(-EINVAL); 684 685 if (flags & BPF_F_ID) 686 link = bpf_link_by_id(id_or_fd); 687 else if (id_or_fd) 688 link = bpf_link_get_from_fd(id_or_fd); 689 return link; 690 } 691 692 static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd) 693 { 694 struct bpf_prog *prog = ERR_PTR(-EINVAL); 695 696 if (flags & BPF_F_ID) 697 prog = bpf_prog_by_id(id_or_fd); 698 else if (id_or_fd) 699 prog = bpf_prog_get(id_or_fd); 700 return prog; 701 } 702 703 static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog, 704 struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd) 705 { 706 bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID; 707 struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL); 708 bool preorder = flags & BPF_F_PREORDER; 709 struct bpf_link *anchor_link = NULL; 710 struct bpf_prog *anchor_prog = NULL; 711 bool is_before, is_after; 712 713 is_before = flags & BPF_F_BEFORE; 714 is_after = flags & BPF_F_AFTER; 715 if (is_link || is_id || id_or_fd) { 716 /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */ 717 if (is_before == is_after) 718 return ERR_PTR(-EINVAL); 719 if ((is_link && !link) || (!is_link && !prog)) 720 return ERR_PTR(-EINVAL); 721 } else if (!hlist_empty(progs)) { 722 /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */ 723 if (is_before && is_after) 724 return ERR_PTR(-EINVAL); 725 } 726 727 if (is_link) { 728 anchor_link = bpf_get_anchor_link(flags, id_or_fd); 729 if (IS_ERR(anchor_link)) 730 return ERR_CAST(anchor_link); 731 } else if (is_id || id_or_fd) { 732 anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); 733 if (IS_ERR(anchor_prog)) 734 return ERR_CAST(anchor_prog); 735 } 736 737 if (!anchor_prog && !anchor_link) { 738 /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER 739 * doesn't matter since either prepend or append to a combined 740 * list of progs will end up with correct result. 741 */ 742 hlist_for_each_entry(pltmp, progs, node) { 743 if (is_before) 744 return pltmp; 745 if (pltmp->node.next) 746 continue; 747 return pltmp; 748 } 749 return NULL; 750 } 751 752 hlist_for_each_entry(pltmp, progs, node) { 753 if ((anchor_prog && anchor_prog == pltmp->prog) || 754 (anchor_link && anchor_link == &pltmp->link->link)) { 755 if (!!(pltmp->flags & BPF_F_PREORDER) != preorder) 756 goto out; 757 pl = pltmp; 758 goto out; 759 } 760 } 761 762 pl = ERR_PTR(-ENOENT); 763 out: 764 if (anchor_link) 765 bpf_link_put(anchor_link); 766 else 767 bpf_prog_put(anchor_prog); 768 return pl; 769 } 770 771 static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs, 772 struct bpf_prog *prog, struct bpf_cgroup_link *link, 773 u32 flags, u32 id_or_fd) 774 { 775 struct bpf_prog_list *pltmp; 776 777 pltmp = get_prog_list(progs, prog, link, flags, id_or_fd); 778 if (IS_ERR(pltmp)) 779 return PTR_ERR(pltmp); 780 781 if (!pltmp) 782 hlist_add_head(&pl->node, progs); 783 else if (flags & BPF_F_BEFORE) 784 hlist_add_before(&pl->node, &pltmp->node); 785 else 786 hlist_add_behind(&pl->node, &pltmp->node); 787 788 return 0; 789 } 790 791 /** 792 * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and 793 * propagate the change to descendants 794 * @cgrp: The cgroup which descendants to traverse 795 * @prog: A program to attach 796 * @link: A link to attach 797 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set 798 * @type: Type of attach operation 799 * @flags: Option flags 800 * @id_or_fd: Relative prog id or fd 801 * @revision: bpf_prog_list revision 802 * 803 * Exactly one of @prog or @link can be non-null. 804 * Must be called with cgroup_mutex held. 805 */ 806 static int __cgroup_bpf_attach(struct cgroup *cgrp, 807 struct bpf_prog *prog, struct bpf_prog *replace_prog, 808 struct bpf_cgroup_link *link, 809 enum bpf_attach_type type, u32 flags, u32 id_or_fd, 810 u64 revision) 811 { 812 u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); 813 struct bpf_prog *old_prog = NULL; 814 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; 815 struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; 816 struct bpf_prog *new_prog = prog ? : link->link.prog; 817 enum cgroup_bpf_attach_type atype; 818 struct bpf_prog_list *pl; 819 struct hlist_head *progs; 820 int err; 821 822 if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || 823 ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) 824 /* invalid combination */ 825 return -EINVAL; 826 if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER))) 827 /* only either replace or insertion with before/after */ 828 return -EINVAL; 829 if (link && (prog || replace_prog)) 830 /* only either link or prog/replace_prog can be specified */ 831 return -EINVAL; 832 if (!!replace_prog != !!(flags & BPF_F_REPLACE)) 833 /* replace_prog implies BPF_F_REPLACE, and vice versa */ 834 return -EINVAL; 835 836 atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); 837 if (atype < 0) 838 return -EINVAL; 839 if (revision && revision != cgrp->bpf.revisions[atype]) 840 return -ESTALE; 841 842 progs = &cgrp->bpf.progs[atype]; 843 844 if (!hierarchy_allows_attach(cgrp, atype)) 845 return -EPERM; 846 847 if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) 848 /* Disallow attaching non-overridable on top 849 * of existing overridable in this cgroup. 850 * Disallow attaching multi-prog if overridable or none 851 */ 852 return -EPERM; 853 854 if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) 855 return -E2BIG; 856 857 pl = find_attach_entry(progs, prog, link, replace_prog, 858 flags & BPF_F_ALLOW_MULTI); 859 if (IS_ERR(pl)) 860 return PTR_ERR(pl); 861 862 if (bpf_cgroup_storages_alloc(storage, new_storage, type, 863 prog ? : link->link.prog, cgrp)) 864 return -ENOMEM; 865 866 if (pl) { 867 old_prog = pl->prog; 868 } else { 869 pl = kmalloc_obj(*pl); 870 if (!pl) { 871 bpf_cgroup_storages_free(new_storage); 872 return -ENOMEM; 873 } 874 875 err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd); 876 if (err) { 877 kfree(pl); 878 bpf_cgroup_storages_free(new_storage); 879 return err; 880 } 881 } 882 883 pl->prog = prog; 884 pl->link = link; 885 pl->flags = flags; 886 bpf_cgroup_storages_assign(pl->storage, storage); 887 cgrp->bpf.flags[atype] = saved_flags; 888 889 if (type == BPF_LSM_CGROUP) { 890 err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type); 891 if (err) 892 goto cleanup; 893 } 894 895 err = update_effective_progs(cgrp, atype); 896 if (err) 897 goto cleanup_trampoline; 898 899 cgrp->bpf.revisions[atype] += 1; 900 if (old_prog) { 901 if (type == BPF_LSM_CGROUP) 902 bpf_trampoline_unlink_cgroup_shim(old_prog); 903 bpf_prog_put(old_prog); 904 } else { 905 static_branch_inc(&cgroup_bpf_enabled_key[atype]); 906 } 907 bpf_cgroup_storages_link(new_storage, cgrp, type); 908 return 0; 909 910 cleanup_trampoline: 911 if (type == BPF_LSM_CGROUP) 912 bpf_trampoline_unlink_cgroup_shim(new_prog); 913 914 cleanup: 915 if (old_prog) { 916 pl->prog = old_prog; 917 pl->link = NULL; 918 } 919 bpf_cgroup_storages_free(new_storage); 920 if (!old_prog) { 921 hlist_del(&pl->node); 922 kfree(pl); 923 } 924 return err; 925 } 926 927 static int cgroup_bpf_attach(struct cgroup *cgrp, 928 struct bpf_prog *prog, struct bpf_prog *replace_prog, 929 struct bpf_cgroup_link *link, 930 enum bpf_attach_type type, 931 u32 flags, u32 id_or_fd, u64 revision) 932 { 933 int ret; 934 935 cgroup_lock(); 936 ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags, 937 id_or_fd, revision); 938 cgroup_unlock(); 939 return ret; 940 } 941 942 static int effective_prog_pos(struct cgroup *cgrp, 943 enum cgroup_bpf_attach_type atype, 944 struct bpf_prog_list *target_pl) 945 { 946 int cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart, pos = -1; 947 struct bpf_prog_list *pl; 948 struct cgroup *p = cgrp; 949 950 /* count effective programs to find where the preorder region ends */ 951 do { 952 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 953 cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); 954 p = cgroup_parent(p); 955 } while (p); 956 957 /* replay compute_effective_progs() placement and record target's slot */ 958 cnt = 0; 959 p = cgrp; 960 fstart = preorder_cnt; 961 bstart = preorder_cnt - 1; 962 do { 963 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 964 continue; 965 966 init_bstart = bstart; 967 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { 968 if (!prog_list_prog(pl)) 969 continue; 970 971 if (pl->flags & BPF_F_PREORDER) { 972 if (pl == target_pl) 973 pos = bstart; 974 bstart--; 975 } else { 976 if (pl == target_pl) 977 pos = fstart; 978 fstart++; 979 } 980 cnt++; 981 } 982 983 /* reverse pre-ordering progs at this cgroup level */ 984 if (pos >= bstart + 1 && pos <= init_bstart) 985 pos = bstart + 1 + init_bstart - pos; 986 } while ((p = cgroup_parent(p))); 987 988 return pos; 989 } 990 991 /* Swap updated BPF program for given link in effective program arrays across 992 * all descendant cgroups. This function is guaranteed to succeed. 993 */ 994 static void replace_effective_prog(struct cgroup *cgrp, 995 enum cgroup_bpf_attach_type atype, 996 struct bpf_prog_list *pl) 997 { 998 struct bpf_prog_array_item *item; 999 struct cgroup_subsys_state *css; 1000 struct bpf_prog_array *progs; 1001 int pos; 1002 1003 css_for_each_descendant_pre(css, &cgrp->self) { 1004 struct cgroup *desc = container_of(css, struct cgroup, self); 1005 1006 if (percpu_ref_is_zero(&desc->bpf.refcnt)) 1007 continue; 1008 1009 pos = effective_prog_pos(desc, atype, pl); 1010 if (WARN_ON_ONCE(pos < 0)) 1011 continue; 1012 1013 progs = rcu_dereference_protected( 1014 desc->bpf.effective[atype], 1015 lockdep_is_held(&cgroup_mutex)); 1016 item = &progs->items[pos]; 1017 WRITE_ONCE(item->prog, pl->link->link.prog); 1018 } 1019 } 1020 1021 /** 1022 * __cgroup_bpf_replace() - Replace link's program and propagate the change 1023 * to descendants 1024 * @cgrp: The cgroup which descendants to traverse 1025 * @link: A link for which to replace BPF program 1026 * @new_prog: &struct bpf_prog for the target BPF program with its refcnt 1027 * incremented 1028 * 1029 * Must be called with cgroup_mutex held. 1030 */ 1031 static int __cgroup_bpf_replace(struct cgroup *cgrp, 1032 struct bpf_cgroup_link *link, 1033 struct bpf_prog *new_prog) 1034 { 1035 enum cgroup_bpf_attach_type atype; 1036 struct bpf_prog *old_prog; 1037 struct bpf_prog_list *pl; 1038 struct hlist_head *progs; 1039 bool found = false; 1040 1041 atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id); 1042 if (atype < 0) 1043 return -EINVAL; 1044 1045 progs = &cgrp->bpf.progs[atype]; 1046 1047 if (link->link.prog->type != new_prog->type) 1048 return -EINVAL; 1049 1050 hlist_for_each_entry(pl, progs, node) { 1051 if (pl->link == link) { 1052 found = true; 1053 break; 1054 } 1055 } 1056 if (!found) 1057 return -ENOENT; 1058 1059 cgrp->bpf.revisions[atype] += 1; 1060 old_prog = xchg(&link->link.prog, new_prog); 1061 replace_effective_prog(cgrp, atype, pl); 1062 bpf_prog_put(old_prog); 1063 return 0; 1064 } 1065 1066 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, 1067 struct bpf_prog *old_prog) 1068 { 1069 struct bpf_cgroup_link *cg_link; 1070 int ret; 1071 1072 cg_link = container_of(link, struct bpf_cgroup_link, link); 1073 1074 cgroup_lock(); 1075 /* link might have been auto-released by dying cgroup, so fail */ 1076 if (!cg_link->cgroup) { 1077 ret = -ENOLINK; 1078 goto out_unlock; 1079 } 1080 if (old_prog && link->prog != old_prog) { 1081 ret = -EPERM; 1082 goto out_unlock; 1083 } 1084 ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); 1085 out_unlock: 1086 cgroup_unlock(); 1087 return ret; 1088 } 1089 1090 static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, 1091 struct bpf_prog *prog, 1092 struct bpf_cgroup_link *link, 1093 bool allow_multi) 1094 { 1095 struct bpf_prog_list *pl; 1096 1097 if (!allow_multi) { 1098 if (hlist_empty(progs)) 1099 /* report error when trying to detach and nothing is attached */ 1100 return ERR_PTR(-ENOENT); 1101 1102 /* to maintain backward compatibility NONE and OVERRIDE cgroups 1103 * allow detaching with invalid FD (prog==NULL) in legacy mode 1104 */ 1105 return hlist_entry(progs->first, typeof(*pl), node); 1106 } 1107 1108 if (!prog && !link) 1109 /* to detach MULTI prog the user has to specify valid FD 1110 * of the program or link to be detached 1111 */ 1112 return ERR_PTR(-EINVAL); 1113 1114 /* find the prog or link and detach it */ 1115 hlist_for_each_entry(pl, progs, node) { 1116 if (pl->prog == prog && pl->link == link) 1117 return pl; 1118 } 1119 return ERR_PTR(-ENOENT); 1120 } 1121 1122 /** 1123 * purge_effective_progs() - After compute_effective_progs fails to alloc new 1124 * cgrp->bpf.inactive table we can recover by 1125 * recomputing the array in place. 1126 * 1127 * @cgrp: The cgroup which descendants to travers 1128 * @pl: The prog_list entry being detached 1129 * @atype: Type of detach operation 1130 */ 1131 static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog_list *pl, 1132 enum cgroup_bpf_attach_type atype) 1133 { 1134 struct cgroup_subsys_state *css; 1135 struct bpf_prog_array *progs; 1136 int pos; 1137 1138 /* recompute effective prog array in place */ 1139 css_for_each_descendant_pre(css, &cgrp->self) { 1140 struct cgroup *desc = container_of(css, struct cgroup, self); 1141 1142 if (percpu_ref_is_zero(&desc->bpf.refcnt)) 1143 continue; 1144 1145 pos = effective_prog_pos(desc, atype, pl); 1146 /* no link or prog match, skip the cgroup of this layer */ 1147 if (pos < 0) 1148 continue; 1149 1150 progs = rcu_dereference_protected( 1151 desc->bpf.effective[atype], 1152 lockdep_is_held(&cgroup_mutex)); 1153 1154 /* Remove the program from the array */ 1155 WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), 1156 "Failed to purge a prog from array at index %d", pos); 1157 } 1158 } 1159 1160 /** 1161 * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and 1162 * propagate the change to descendants 1163 * @cgrp: The cgroup which descendants to traverse 1164 * @prog: A program to detach or NULL 1165 * @link: A link to detach or NULL 1166 * @type: Type of detach operation 1167 * @revision: bpf_prog_list revision 1168 * 1169 * At most one of @prog or @link can be non-NULL. 1170 * Must be called with cgroup_mutex held. 1171 */ 1172 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 1173 struct bpf_cgroup_link *link, enum bpf_attach_type type, 1174 u64 revision) 1175 { 1176 enum cgroup_bpf_attach_type atype; 1177 struct bpf_prog *old_prog; 1178 struct bpf_prog_list *pl; 1179 struct hlist_head *progs; 1180 u32 attach_btf_id = 0; 1181 u32 flags; 1182 1183 if (prog) 1184 attach_btf_id = prog->aux->attach_btf_id; 1185 if (link) 1186 attach_btf_id = link->link.prog->aux->attach_btf_id; 1187 1188 atype = bpf_cgroup_atype_find(type, attach_btf_id); 1189 if (atype < 0) 1190 return -EINVAL; 1191 1192 if (revision && revision != cgrp->bpf.revisions[atype]) 1193 return -ESTALE; 1194 1195 progs = &cgrp->bpf.progs[atype]; 1196 flags = cgrp->bpf.flags[atype]; 1197 1198 if (prog && link) 1199 /* only one of prog or link can be specified */ 1200 return -EINVAL; 1201 1202 pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); 1203 if (IS_ERR(pl)) 1204 return PTR_ERR(pl); 1205 1206 /* mark it deleted, so it's ignored while recomputing effective */ 1207 old_prog = pl->prog; 1208 pl->prog = NULL; 1209 pl->link = NULL; 1210 1211 if (update_effective_progs(cgrp, atype)) { 1212 /* if update effective array failed replace the prog with a dummy prog*/ 1213 pl->prog = old_prog; 1214 pl->link = link; 1215 purge_effective_progs(cgrp, pl, atype); 1216 } 1217 1218 /* now can actually delete it from this cgroup list */ 1219 hlist_del(&pl->node); 1220 cgrp->bpf.revisions[atype] += 1; 1221 1222 kfree(pl); 1223 if (hlist_empty(progs)) 1224 /* last program was detached, reset flags to zero */ 1225 cgrp->bpf.flags[atype] = 0; 1226 if (old_prog) { 1227 if (type == BPF_LSM_CGROUP) 1228 bpf_trampoline_unlink_cgroup_shim(old_prog); 1229 bpf_prog_put(old_prog); 1230 } 1231 static_branch_dec(&cgroup_bpf_enabled_key[atype]); 1232 return 0; 1233 } 1234 1235 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 1236 enum bpf_attach_type type, u64 revision) 1237 { 1238 int ret; 1239 1240 cgroup_lock(); 1241 ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision); 1242 cgroup_unlock(); 1243 return ret; 1244 } 1245 1246 /* Must be called with cgroup_mutex held to avoid races. */ 1247 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, 1248 union bpf_attr __user *uattr, u32 uattr_size) 1249 { 1250 __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); 1251 bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; 1252 __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 1253 enum bpf_attach_type type = attr->query.attach_type; 1254 enum cgroup_bpf_attach_type from_atype, to_atype; 1255 enum cgroup_bpf_attach_type atype; 1256 struct bpf_prog_array *effective; 1257 int cnt, ret = 0, i; 1258 int total_cnt = 0; 1259 u64 revision = 0; 1260 u32 flags; 1261 1262 if (effective_query && prog_attach_flags) 1263 return -EINVAL; 1264 1265 if (type == BPF_LSM_CGROUP) { 1266 if (!effective_query && attr->query.prog_cnt && 1267 prog_ids && !prog_attach_flags) 1268 return -EINVAL; 1269 1270 from_atype = CGROUP_LSM_START; 1271 to_atype = CGROUP_LSM_END; 1272 flags = 0; 1273 } else { 1274 from_atype = to_cgroup_bpf_attach_type(type); 1275 if (from_atype < 0) 1276 return -EINVAL; 1277 to_atype = from_atype; 1278 flags = cgrp->bpf.flags[from_atype]; 1279 } 1280 1281 for (atype = from_atype; atype <= to_atype; atype++) { 1282 if (effective_query) { 1283 effective = rcu_dereference_protected(cgrp->bpf.effective[atype], 1284 lockdep_is_held(&cgroup_mutex)); 1285 total_cnt += bpf_prog_array_length(effective); 1286 } else { 1287 total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); 1288 } 1289 } 1290 1291 /* always output uattr->query.attach_flags as 0 during effective query */ 1292 flags = effective_query ? 0 : flags; 1293 if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) 1294 return -EFAULT; 1295 if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) 1296 return -EFAULT; 1297 if (!effective_query && from_atype == to_atype) 1298 revision = cgrp->bpf.revisions[from_atype]; 1299 if (uattr_size >= offsetofend(union bpf_attr, query.revision) && 1300 copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) 1301 return -EFAULT; 1302 if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) 1303 /* return early if user requested only program count + flags */ 1304 return 0; 1305 1306 if (attr->query.prog_cnt < total_cnt) { 1307 total_cnt = attr->query.prog_cnt; 1308 ret = -ENOSPC; 1309 } 1310 1311 for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { 1312 if (effective_query) { 1313 effective = rcu_dereference_protected(cgrp->bpf.effective[atype], 1314 lockdep_is_held(&cgroup_mutex)); 1315 cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); 1316 ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); 1317 } else { 1318 struct hlist_head *progs; 1319 struct bpf_prog_list *pl; 1320 struct bpf_prog *prog; 1321 u32 id; 1322 1323 progs = &cgrp->bpf.progs[atype]; 1324 cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); 1325 i = 0; 1326 hlist_for_each_entry(pl, progs, node) { 1327 prog = prog_list_prog(pl); 1328 id = prog->aux->id; 1329 if (copy_to_user(prog_ids + i, &id, sizeof(id))) 1330 return -EFAULT; 1331 if (++i == cnt) 1332 break; 1333 } 1334 1335 if (prog_attach_flags) { 1336 flags = cgrp->bpf.flags[atype]; 1337 1338 for (i = 0; i < cnt; i++) 1339 if (copy_to_user(prog_attach_flags + i, 1340 &flags, sizeof(flags))) 1341 return -EFAULT; 1342 prog_attach_flags += cnt; 1343 } 1344 } 1345 1346 prog_ids += cnt; 1347 total_cnt -= cnt; 1348 } 1349 return ret; 1350 } 1351 1352 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, 1353 union bpf_attr __user *uattr, u32 uattr_size) 1354 { 1355 int ret; 1356 1357 cgroup_lock(); 1358 ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size); 1359 cgroup_unlock(); 1360 return ret; 1361 } 1362 1363 int cgroup_bpf_prog_attach(const union bpf_attr *attr, 1364 enum bpf_prog_type ptype, struct bpf_prog *prog) 1365 { 1366 struct bpf_prog *replace_prog = NULL; 1367 struct cgroup *cgrp; 1368 int ret; 1369 1370 cgrp = cgroup_get_from_fd(attr->target_fd); 1371 if (IS_ERR(cgrp)) 1372 return PTR_ERR(cgrp); 1373 1374 if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && 1375 (attr->attach_flags & BPF_F_REPLACE)) { 1376 replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); 1377 if (IS_ERR(replace_prog)) { 1378 cgroup_put(cgrp); 1379 return PTR_ERR(replace_prog); 1380 } 1381 } 1382 1383 ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, 1384 attr->attach_type, attr->attach_flags, 1385 attr->relative_fd, attr->expected_revision); 1386 1387 if (replace_prog) 1388 bpf_prog_put(replace_prog); 1389 cgroup_put(cgrp); 1390 return ret; 1391 } 1392 1393 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) 1394 { 1395 struct bpf_prog *prog; 1396 struct cgroup *cgrp; 1397 int ret; 1398 1399 cgrp = cgroup_get_from_fd(attr->target_fd); 1400 if (IS_ERR(cgrp)) 1401 return PTR_ERR(cgrp); 1402 1403 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 1404 if (IS_ERR(prog)) 1405 prog = NULL; 1406 1407 ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision); 1408 if (prog) 1409 bpf_prog_put(prog); 1410 1411 cgroup_put(cgrp); 1412 return ret; 1413 } 1414 1415 static void bpf_cgroup_link_release(struct bpf_link *link) 1416 { 1417 struct bpf_cgroup_link *cg_link = 1418 container_of(link, struct bpf_cgroup_link, link); 1419 struct cgroup *cg; 1420 1421 /* link might have been auto-detached by dying cgroup already, 1422 * in that case our work is done here 1423 */ 1424 if (!cg_link->cgroup) 1425 return; 1426 1427 cgroup_lock(); 1428 1429 /* re-check cgroup under lock again */ 1430 if (!cg_link->cgroup) { 1431 cgroup_unlock(); 1432 return; 1433 } 1434 1435 WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, 1436 link->attach_type, 0)); 1437 if (link->attach_type == BPF_LSM_CGROUP) 1438 bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); 1439 1440 cg = cg_link->cgroup; 1441 cg_link->cgroup = NULL; 1442 1443 cgroup_unlock(); 1444 1445 cgroup_put(cg); 1446 } 1447 1448 static void bpf_cgroup_link_dealloc(struct bpf_link *link) 1449 { 1450 struct bpf_cgroup_link *cg_link = 1451 container_of(link, struct bpf_cgroup_link, link); 1452 1453 kfree(cg_link); 1454 } 1455 1456 static int bpf_cgroup_link_detach(struct bpf_link *link) 1457 { 1458 bpf_cgroup_link_release(link); 1459 1460 return 0; 1461 } 1462 1463 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, 1464 struct seq_file *seq) 1465 { 1466 struct bpf_cgroup_link *cg_link = 1467 container_of(link, struct bpf_cgroup_link, link); 1468 u64 cg_id = 0; 1469 1470 cgroup_lock(); 1471 if (cg_link->cgroup) 1472 cg_id = cgroup_id(cg_link->cgroup); 1473 cgroup_unlock(); 1474 1475 seq_printf(seq, 1476 "cgroup_id:\t%llu\n" 1477 "attach_type:\t%d\n", 1478 cg_id, 1479 link->attach_type); 1480 } 1481 1482 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, 1483 struct bpf_link_info *info) 1484 { 1485 struct bpf_cgroup_link *cg_link = 1486 container_of(link, struct bpf_cgroup_link, link); 1487 u64 cg_id = 0; 1488 1489 cgroup_lock(); 1490 if (cg_link->cgroup) 1491 cg_id = cgroup_id(cg_link->cgroup); 1492 cgroup_unlock(); 1493 1494 info->cgroup.cgroup_id = cg_id; 1495 info->cgroup.attach_type = link->attach_type; 1496 return 0; 1497 } 1498 1499 static const struct bpf_link_ops bpf_cgroup_link_lops = { 1500 .release = bpf_cgroup_link_release, 1501 .dealloc = bpf_cgroup_link_dealloc, 1502 .detach = bpf_cgroup_link_detach, 1503 .update_prog = cgroup_bpf_replace, 1504 .show_fdinfo = bpf_cgroup_link_show_fdinfo, 1505 .fill_link_info = bpf_cgroup_link_fill_link_info, 1506 }; 1507 1508 #define BPF_F_LINK_ATTACH_MASK \ 1509 (BPF_F_ID | \ 1510 BPF_F_BEFORE | \ 1511 BPF_F_AFTER | \ 1512 BPF_F_PREORDER | \ 1513 BPF_F_LINK) 1514 1515 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 1516 { 1517 struct bpf_link_primer link_primer; 1518 struct bpf_cgroup_link *link; 1519 struct cgroup *cgrp; 1520 int err; 1521 1522 if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK)) 1523 return -EINVAL; 1524 1525 cgrp = cgroup_get_from_fd(attr->link_create.target_fd); 1526 if (IS_ERR(cgrp)) 1527 return PTR_ERR(cgrp); 1528 1529 link = kzalloc_obj(*link, GFP_USER); 1530 if (!link) { 1531 err = -ENOMEM; 1532 goto out_put_cgroup; 1533 } 1534 bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, 1535 prog, attr->link_create.attach_type); 1536 link->cgroup = cgrp; 1537 1538 err = bpf_link_prime(&link->link, &link_primer); 1539 if (err) { 1540 kfree(link); 1541 goto out_put_cgroup; 1542 } 1543 1544 err = cgroup_bpf_attach(cgrp, NULL, NULL, link, 1545 link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags, 1546 attr->link_create.cgroup.relative_fd, 1547 attr->link_create.cgroup.expected_revision); 1548 if (err) { 1549 bpf_link_cleanup(&link_primer); 1550 goto out_put_cgroup; 1551 } 1552 1553 return bpf_link_settle(&link_primer); 1554 1555 out_put_cgroup: 1556 cgroup_put(cgrp); 1557 return err; 1558 } 1559 1560 int cgroup_bpf_prog_query(const union bpf_attr *attr, 1561 union bpf_attr __user *uattr, u32 uattr_size) 1562 { 1563 struct cgroup *cgrp; 1564 int ret; 1565 1566 cgrp = cgroup_get_from_fd(attr->query.target_fd); 1567 if (IS_ERR(cgrp)) 1568 return PTR_ERR(cgrp); 1569 1570 ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size); 1571 1572 cgroup_put(cgrp); 1573 return ret; 1574 } 1575 1576 /** 1577 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 1578 * @sk: The socket sending or receiving traffic 1579 * @skb: The skb that is being sent or received 1580 * @atype: The type of program to be executed 1581 * 1582 * If no socket is passed, or the socket is not of type INET or INET6, 1583 * this function does nothing and returns 0. 1584 * 1585 * The program type passed in via @type must be suitable for network 1586 * filtering. No further check is performed to assert that. 1587 * 1588 * For egress packets, this function can return: 1589 * NET_XMIT_SUCCESS (0) - continue with packet output 1590 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr 1591 * NET_XMIT_CN (2) - continue with packet output and notify TCP 1592 * to call cwr 1593 * -err - drop packet 1594 * 1595 * For ingress packets, this function will return -EPERM if any 1596 * attached program was found and if it returned != 1 during execution. 1597 * Otherwise 0 is returned. 1598 */ 1599 int __cgroup_bpf_run_filter_skb(struct sock *sk, 1600 struct sk_buff *skb, 1601 enum cgroup_bpf_attach_type atype) 1602 { 1603 unsigned int offset = -skb_network_offset(skb); 1604 struct sock *save_sk; 1605 void *saved_data_end; 1606 struct cgroup *cgrp; 1607 int ret; 1608 1609 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 1610 return 0; 1611 1612 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1613 save_sk = skb->sk; 1614 skb->sk = sk; 1615 __skb_push(skb, offset); 1616 1617 /* compute pointers for the bpf prog */ 1618 bpf_compute_and_save_data_end(skb, &saved_data_end); 1619 1620 if (atype == CGROUP_INET_EGRESS) { 1621 u32 flags = 0; 1622 bool cn; 1623 1624 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, 1625 __bpf_prog_run_save_cb, 0, &flags); 1626 1627 /* Return values of CGROUP EGRESS BPF programs are: 1628 * 0: drop packet 1629 * 1: keep packet 1630 * 2: drop packet and cn 1631 * 3: keep packet and cn 1632 * 1633 * The returned value is then converted to one of the NET_XMIT 1634 * or an error code that is then interpreted as drop packet 1635 * (and no cn): 1636 * 0: NET_XMIT_SUCCESS skb should be transmitted 1637 * 1: NET_XMIT_DROP skb should be dropped and cn 1638 * 2: NET_XMIT_CN skb should be transmitted and cn 1639 * 3: -err skb should be dropped 1640 */ 1641 1642 cn = flags & BPF_RET_SET_CN; 1643 if (ret && !IS_ERR_VALUE((long)ret)) 1644 ret = -EFAULT; 1645 if (!ret) 1646 ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); 1647 else 1648 ret = (cn ? NET_XMIT_DROP : ret); 1649 } else { 1650 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, 1651 skb, __bpf_prog_run_save_cb, 0, 1652 NULL); 1653 if (ret && !IS_ERR_VALUE((long)ret)) 1654 ret = -EFAULT; 1655 } 1656 bpf_restore_data_end(skb, saved_data_end); 1657 __skb_pull(skb, offset); 1658 skb->sk = save_sk; 1659 1660 return ret; 1661 } 1662 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 1663 1664 /** 1665 * __cgroup_bpf_run_filter_sk() - Run a program on a sock 1666 * @sk: sock structure to manipulate 1667 * @atype: The type of program to be executed 1668 * 1669 * socket is passed is expected to be of type INET or INET6. 1670 * 1671 * The program type passed in via @type must be suitable for sock 1672 * filtering. No further check is performed to assert that. 1673 * 1674 * This function will return %-EPERM if any if an attached program was found 1675 * and if it returned != 1 during execution. In all other cases, 0 is returned. 1676 */ 1677 int __cgroup_bpf_run_filter_sk(struct sock *sk, 1678 enum cgroup_bpf_attach_type atype) 1679 { 1680 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1681 1682 return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0, 1683 NULL); 1684 } 1685 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 1686 1687 /** 1688 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and 1689 * provided by user sockaddr 1690 * @sk: sock struct that will use sockaddr 1691 * @uaddr: sockaddr struct provided by user 1692 * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is 1693 * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX 1694 * uaddr. 1695 * @atype: The type of program to be executed 1696 * @t_ctx: Pointer to attach type specific context 1697 * @flags: Pointer to u32 which contains higher bits of BPF program 1698 * return value (OR'ed together). 1699 * 1700 * socket is expected to be of type INET, INET6 or UNIX. 1701 * 1702 * This function will return %-EPERM if an attached program is found and 1703 * returned value != 1 during execution. In all other cases, 0 is returned. 1704 */ 1705 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, 1706 struct sockaddr_unsized *uaddr, 1707 int *uaddrlen, 1708 enum cgroup_bpf_attach_type atype, 1709 void *t_ctx, 1710 u32 *flags) 1711 { 1712 struct bpf_sock_addr_kern ctx = { 1713 .sk = sk, 1714 .uaddr = uaddr, 1715 .t_ctx = t_ctx, 1716 }; 1717 struct sockaddr_storage storage; 1718 struct cgroup *cgrp; 1719 int ret; 1720 1721 if (!sk_is_inet(sk) && !sk_is_unix(sk)) 1722 return 0; 1723 1724 if (!ctx.uaddr) { 1725 memset(&storage, 0, sizeof(storage)); 1726 ctx.uaddr = (struct sockaddr_unsized *)&storage; 1727 ctx.uaddrlen = 0; 1728 } else { 1729 ctx.uaddrlen = *uaddrlen; 1730 } 1731 1732 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1733 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 1734 0, flags); 1735 1736 if (!ret && uaddr) 1737 *uaddrlen = ctx.uaddrlen; 1738 1739 return ret; 1740 } 1741 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); 1742 1743 /** 1744 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 1745 * @sk: socket to get cgroup from 1746 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 1747 * sk with connection information (IP addresses, etc.) May not contain 1748 * cgroup info if it is a req sock. 1749 * @atype: The type of program to be executed 1750 * 1751 * socket passed is expected to be of type INET or INET6. 1752 * 1753 * The program type passed in via @type must be suitable for sock_ops 1754 * filtering. No further check is performed to assert that. 1755 * 1756 * This function will return %-EPERM if any if an attached program was found 1757 * and if it returned != 1 during execution. In all other cases, 0 is returned. 1758 */ 1759 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, 1760 struct bpf_sock_ops_kern *sock_ops, 1761 enum cgroup_bpf_attach_type atype) 1762 { 1763 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1764 1765 return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, 1766 0, NULL); 1767 } 1768 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); 1769 1770 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, 1771 short access, enum cgroup_bpf_attach_type atype) 1772 { 1773 struct cgroup *cgrp; 1774 struct bpf_cgroup_dev_ctx ctx = { 1775 .access_type = (access << 16) | dev_type, 1776 .major = major, 1777 .minor = minor, 1778 }; 1779 int ret; 1780 1781 rcu_read_lock(); 1782 cgrp = task_dfl_cgroup(current); 1783 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, 1784 NULL); 1785 rcu_read_unlock(); 1786 1787 return ret; 1788 } 1789 1790 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) 1791 { 1792 /* flags argument is not used now, 1793 * but provides an ability to extend the API. 1794 * verifier checks that its value is correct. 1795 */ 1796 enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); 1797 struct bpf_cgroup_storage *storage; 1798 struct bpf_cg_run_ctx *ctx; 1799 void *ptr; 1800 1801 /* get current cgroup storage from BPF run context */ 1802 ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); 1803 storage = ctx->prog_item->cgroup_storage[stype]; 1804 1805 if (stype == BPF_CGROUP_STORAGE_SHARED) 1806 ptr = &READ_ONCE(storage->buf)->data[0]; 1807 else 1808 ptr = this_cpu_ptr(storage->percpu_buf); 1809 1810 return (unsigned long)ptr; 1811 } 1812 1813 const struct bpf_func_proto bpf_get_local_storage_proto = { 1814 .func = bpf_get_local_storage, 1815 .gpl_only = false, 1816 .ret_type = RET_PTR_TO_MAP_VALUE, 1817 .arg1_type = ARG_CONST_MAP_PTR, 1818 .arg2_type = ARG_ANYTHING, 1819 }; 1820 1821 BPF_CALL_0(bpf_get_retval) 1822 { 1823 struct bpf_cg_run_ctx *ctx = 1824 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); 1825 1826 return ctx->retval; 1827 } 1828 1829 const struct bpf_func_proto bpf_get_retval_proto = { 1830 .func = bpf_get_retval, 1831 .gpl_only = false, 1832 .ret_type = RET_INTEGER, 1833 }; 1834 1835 BPF_CALL_1(bpf_set_retval, int, retval) 1836 { 1837 struct bpf_cg_run_ctx *ctx = 1838 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); 1839 1840 ctx->retval = retval; 1841 return 0; 1842 } 1843 1844 const struct bpf_func_proto bpf_set_retval_proto = { 1845 .func = bpf_set_retval, 1846 .gpl_only = false, 1847 .ret_type = RET_INTEGER, 1848 .arg1_type = ARG_ANYTHING, 1849 }; 1850 1851 static const struct bpf_func_proto * 1852 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 1853 { 1854 const struct bpf_func_proto *func_proto; 1855 1856 func_proto = cgroup_common_func_proto(func_id, prog); 1857 if (func_proto) 1858 return func_proto; 1859 1860 switch (func_id) { 1861 case BPF_FUNC_perf_event_output: 1862 return &bpf_event_output_data_proto; 1863 default: 1864 return bpf_base_func_proto(func_id, prog); 1865 } 1866 } 1867 1868 static bool cgroup_dev_is_valid_access(int off, int size, 1869 enum bpf_access_type type, 1870 const struct bpf_prog *prog, 1871 struct bpf_insn_access_aux *info) 1872 { 1873 const int size_default = sizeof(__u32); 1874 1875 if (type == BPF_WRITE) 1876 return false; 1877 1878 if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) 1879 return false; 1880 /* The verifier guarantees that size > 0. */ 1881 if (off % size != 0) 1882 return false; 1883 1884 switch (off) { 1885 case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): 1886 bpf_ctx_record_field_size(info, size_default); 1887 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 1888 return false; 1889 break; 1890 default: 1891 if (size != size_default) 1892 return false; 1893 } 1894 1895 return true; 1896 } 1897 1898 const struct bpf_prog_ops cg_dev_prog_ops = { 1899 }; 1900 1901 const struct bpf_verifier_ops cg_dev_verifier_ops = { 1902 .get_func_proto = cgroup_dev_func_proto, 1903 .is_valid_access = cgroup_dev_is_valid_access, 1904 }; 1905 1906 /** 1907 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl 1908 * 1909 * @head: sysctl table header 1910 * @table: sysctl table 1911 * @write: sysctl is being read (= 0) or written (= 1) 1912 * @buf: pointer to buffer (in and out) 1913 * @pcount: value-result argument: value is size of buffer pointed to by @buf, 1914 * result is size of @new_buf if program set new value, initial value 1915 * otherwise 1916 * @ppos: value-result argument: value is position at which read from or write 1917 * to sysctl is happening, result is new position if program overrode it, 1918 * initial value otherwise 1919 * @atype: type of program to be executed 1920 * 1921 * Program is run when sysctl is being accessed, either read or written, and 1922 * can allow or deny such access. 1923 * 1924 * This function will return %-EPERM if an attached program is found and 1925 * returned value != 1 during execution. In all other cases 0 is returned. 1926 */ 1927 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, 1928 const struct ctl_table *table, int write, 1929 char **buf, size_t *pcount, loff_t *ppos, 1930 enum cgroup_bpf_attach_type atype) 1931 { 1932 struct bpf_sysctl_kern ctx = { 1933 .head = head, 1934 .table = table, 1935 .write = write, 1936 .ppos = ppos, 1937 .cur_val = NULL, 1938 .cur_len = PAGE_SIZE, 1939 .new_val = NULL, 1940 .new_len = 0, 1941 .new_updated = 0, 1942 }; 1943 struct cgroup *cgrp; 1944 loff_t pos = 0; 1945 int ret; 1946 1947 ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); 1948 if (!ctx.cur_val || 1949 table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { 1950 /* Let BPF program decide how to proceed. */ 1951 ctx.cur_len = 0; 1952 } 1953 1954 if (write && *buf && *pcount) { 1955 /* BPF program should be able to override new value with a 1956 * buffer bigger than provided by user. 1957 */ 1958 ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); 1959 ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); 1960 if (ctx.new_val) { 1961 memcpy(ctx.new_val, *buf, ctx.new_len); 1962 } else { 1963 /* Let BPF program decide how to proceed. */ 1964 ctx.new_len = 0; 1965 } 1966 } 1967 1968 rcu_read_lock(); 1969 cgrp = task_dfl_cgroup(current); 1970 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, 1971 NULL); 1972 rcu_read_unlock(); 1973 1974 kfree(ctx.cur_val); 1975 1976 if (!ret && ctx.new_updated) { 1977 kvfree(*buf); 1978 *buf = ctx.new_val; 1979 *pcount = ctx.new_len; 1980 } else { 1981 kfree(ctx.new_val); 1982 } 1983 1984 return ret; 1985 } 1986 1987 #ifdef CONFIG_NET 1988 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, 1989 struct bpf_sockopt_buf *buf) 1990 { 1991 if (unlikely(max_optlen < 0)) 1992 return -EINVAL; 1993 1994 if (unlikely(max_optlen > PAGE_SIZE)) { 1995 /* We don't expose optvals that are greater than PAGE_SIZE 1996 * to the BPF program. 1997 */ 1998 max_optlen = PAGE_SIZE; 1999 } 2000 2001 if (max_optlen <= sizeof(buf->data)) { 2002 /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE 2003 * bytes avoid the cost of kzalloc. 2004 */ 2005 ctx->optval = buf->data; 2006 ctx->optval_end = ctx->optval + max_optlen; 2007 return max_optlen; 2008 } 2009 2010 ctx->optval = kzalloc(max_optlen, GFP_USER); 2011 if (!ctx->optval) 2012 return -ENOMEM; 2013 2014 ctx->optval_end = ctx->optval + max_optlen; 2015 2016 return max_optlen; 2017 } 2018 2019 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, 2020 struct bpf_sockopt_buf *buf) 2021 { 2022 if (ctx->optval == buf->data) 2023 return; 2024 kfree(ctx->optval); 2025 } 2026 2027 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, 2028 struct bpf_sockopt_buf *buf) 2029 { 2030 return ctx->optval != buf->data; 2031 } 2032 2033 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, 2034 int *optname, sockptr_t optval, 2035 int *optlen, char **kernel_optval) 2036 { 2037 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 2038 struct bpf_sockopt_buf buf = {}; 2039 struct bpf_sockopt_kern ctx = { 2040 .sk = sk, 2041 .level = *level, 2042 .optname = *optname, 2043 }; 2044 int ret, max_optlen; 2045 2046 /* Allocate a bit more than the initial user buffer for 2047 * BPF program. The canonical use case is overriding 2048 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). 2049 */ 2050 max_optlen = max_t(int, 16, *optlen); 2051 max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); 2052 if (max_optlen < 0) 2053 return max_optlen; 2054 2055 ctx.optlen = *optlen; 2056 2057 if (copy_from_sockptr(ctx.optval, optval, 2058 min(*optlen, max_optlen))) { 2059 ret = -EFAULT; 2060 goto out; 2061 } 2062 2063 lock_sock(sk); 2064 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, 2065 &ctx, bpf_prog_run, 0, NULL); 2066 release_sock(sk); 2067 2068 if (ret) 2069 goto out; 2070 2071 if (ctx.optlen == -1) { 2072 /* optlen set to -1, bypass kernel */ 2073 ret = 1; 2074 } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { 2075 /* optlen is out of bounds */ 2076 if (*optlen > PAGE_SIZE && ctx.optlen >= 0) { 2077 pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", 2078 ctx.optlen, max_optlen); 2079 ret = 0; 2080 goto out; 2081 } 2082 ret = -EFAULT; 2083 } else { 2084 /* optlen within bounds, run kernel handler */ 2085 ret = 0; 2086 2087 /* export any potential modifications */ 2088 *level = ctx.level; 2089 *optname = ctx.optname; 2090 2091 /* optlen == 0 from BPF indicates that we should 2092 * use original userspace data. 2093 */ 2094 if (ctx.optlen != 0) { 2095 *optlen = ctx.optlen; 2096 /* We've used bpf_sockopt_kern->buf as an intermediary 2097 * storage, but the BPF program indicates that we need 2098 * to pass this data to the kernel setsockopt handler. 2099 * No way to export on-stack buf, have to allocate a 2100 * new buffer. 2101 */ 2102 if (!sockopt_buf_allocated(&ctx, &buf)) { 2103 void *p = kmalloc(ctx.optlen, GFP_USER); 2104 2105 if (!p) { 2106 ret = -ENOMEM; 2107 goto out; 2108 } 2109 memcpy(p, ctx.optval, ctx.optlen); 2110 *kernel_optval = p; 2111 } else { 2112 *kernel_optval = ctx.optval; 2113 } 2114 /* export and don't free sockopt buf */ 2115 return 0; 2116 } 2117 } 2118 2119 out: 2120 sockopt_free_buf(&ctx, &buf); 2121 return ret; 2122 } 2123 2124 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, 2125 int optname, sockptr_t optval, 2126 sockptr_t optlen, int max_optlen, 2127 int retval) 2128 { 2129 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 2130 struct bpf_sockopt_buf buf = {}; 2131 struct bpf_sockopt_kern ctx = { 2132 .sk = sk, 2133 .level = level, 2134 .optname = optname, 2135 .current_task = current, 2136 }; 2137 int orig_optlen; 2138 int ret; 2139 2140 orig_optlen = max_optlen; 2141 ctx.optlen = max_optlen; 2142 max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); 2143 if (max_optlen < 0) 2144 return max_optlen; 2145 2146 if (!retval) { 2147 /* If kernel getsockopt finished successfully, 2148 * copy whatever was returned to the user back 2149 * into our temporary buffer. Set optlen to the 2150 * one that kernel returned as well to let 2151 * BPF programs inspect the value. 2152 */ 2153 if (copy_from_sockptr(&ctx.optlen, optlen, 2154 sizeof(ctx.optlen))) { 2155 ret = -EFAULT; 2156 goto out; 2157 } 2158 2159 if (ctx.optlen < 0) { 2160 ret = -EFAULT; 2161 goto out; 2162 } 2163 orig_optlen = ctx.optlen; 2164 2165 if (copy_from_sockptr(ctx.optval, optval, 2166 min(ctx.optlen, max_optlen))) { 2167 ret = -EFAULT; 2168 goto out; 2169 } 2170 } 2171 2172 lock_sock(sk); 2173 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, 2174 &ctx, bpf_prog_run, retval, NULL); 2175 release_sock(sk); 2176 2177 if (ret < 0) 2178 goto out; 2179 2180 if (!sockptr_is_null(optval) && 2181 (ctx.optlen > max_optlen || ctx.optlen < 0)) { 2182 if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) { 2183 pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", 2184 ctx.optlen, max_optlen); 2185 ret = retval; 2186 goto out; 2187 } 2188 ret = -EFAULT; 2189 goto out; 2190 } 2191 2192 if (ctx.optlen != 0) { 2193 if (!sockptr_is_null(optval) && 2194 copy_to_sockptr(optval, ctx.optval, ctx.optlen)) { 2195 ret = -EFAULT; 2196 goto out; 2197 } 2198 if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) { 2199 ret = -EFAULT; 2200 goto out; 2201 } 2202 } 2203 2204 out: 2205 sockopt_free_buf(&ctx, &buf); 2206 return ret; 2207 } 2208 2209 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, 2210 int optname, void *optval, 2211 int *optlen, int retval) 2212 { 2213 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 2214 struct bpf_sockopt_kern ctx = { 2215 .sk = sk, 2216 .level = level, 2217 .optname = optname, 2218 .optlen = *optlen, 2219 .optval = optval, 2220 .optval_end = optval + *optlen, 2221 .current_task = current, 2222 }; 2223 int ret; 2224 2225 /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy 2226 * user data back into BPF buffer when reval != 0. This is 2227 * done as an optimization to avoid extra copy, assuming 2228 * kernel won't populate the data in case of an error. 2229 * Here we always pass the data and memset() should 2230 * be called if that data shouldn't be "exported". 2231 */ 2232 2233 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, 2234 &ctx, bpf_prog_run, retval, NULL); 2235 if (ret < 0) 2236 return ret; 2237 2238 if (ctx.optlen > *optlen) 2239 return -EFAULT; 2240 2241 /* BPF programs can shrink the buffer, export the modifications. 2242 */ 2243 if (ctx.optlen != 0) 2244 *optlen = ctx.optlen; 2245 2246 return ret; 2247 } 2248 #endif 2249 2250 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, 2251 size_t *lenp) 2252 { 2253 ssize_t tmp_ret = 0, ret; 2254 2255 if (dir->header.parent) { 2256 tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); 2257 if (tmp_ret < 0) 2258 return tmp_ret; 2259 } 2260 2261 ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); 2262 if (ret < 0) 2263 return ret; 2264 *bufp += ret; 2265 *lenp -= ret; 2266 ret += tmp_ret; 2267 2268 /* Avoid leading slash. */ 2269 if (!ret) 2270 return ret; 2271 2272 tmp_ret = strscpy(*bufp, "/", *lenp); 2273 if (tmp_ret < 0) 2274 return tmp_ret; 2275 *bufp += tmp_ret; 2276 *lenp -= tmp_ret; 2277 2278 return ret + tmp_ret; 2279 } 2280 2281 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, 2282 size_t, buf_len, u64, flags) 2283 { 2284 ssize_t tmp_ret = 0, ret; 2285 2286 if (!buf) 2287 return -EINVAL; 2288 2289 if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { 2290 if (!ctx->head) 2291 return -EINVAL; 2292 tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); 2293 if (tmp_ret < 0) 2294 return tmp_ret; 2295 } 2296 2297 ret = strscpy(buf, ctx->table->procname, buf_len); 2298 2299 return ret < 0 ? ret : tmp_ret + ret; 2300 } 2301 2302 static const struct bpf_func_proto bpf_sysctl_get_name_proto = { 2303 .func = bpf_sysctl_get_name, 2304 .gpl_only = false, 2305 .ret_type = RET_INTEGER, 2306 .arg1_type = ARG_PTR_TO_CTX, 2307 .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, 2308 .arg3_type = ARG_CONST_SIZE, 2309 .arg4_type = ARG_ANYTHING, 2310 }; 2311 2312 static int copy_sysctl_value(char *dst, size_t dst_len, char *src, 2313 size_t src_len) 2314 { 2315 if (!dst) 2316 return -EINVAL; 2317 2318 if (!dst_len) 2319 return -E2BIG; 2320 2321 if (!src || !src_len) { 2322 memset(dst, 0, dst_len); 2323 return -EINVAL; 2324 } 2325 2326 memcpy(dst, src, min(dst_len, src_len)); 2327 2328 if (dst_len > src_len) { 2329 memset(dst + src_len, '\0', dst_len - src_len); 2330 return src_len; 2331 } 2332 2333 dst[dst_len - 1] = '\0'; 2334 2335 return -E2BIG; 2336 } 2337 2338 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, 2339 char *, buf, size_t, buf_len) 2340 { 2341 return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); 2342 } 2343 2344 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { 2345 .func = bpf_sysctl_get_current_value, 2346 .gpl_only = false, 2347 .ret_type = RET_INTEGER, 2348 .arg1_type = ARG_PTR_TO_CTX, 2349 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 2350 .arg3_type = ARG_CONST_SIZE, 2351 }; 2352 2353 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, 2354 size_t, buf_len) 2355 { 2356 if (!ctx->write) { 2357 if (buf && buf_len) 2358 memset(buf, '\0', buf_len); 2359 return -EINVAL; 2360 } 2361 return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); 2362 } 2363 2364 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { 2365 .func = bpf_sysctl_get_new_value, 2366 .gpl_only = false, 2367 .ret_type = RET_INTEGER, 2368 .arg1_type = ARG_PTR_TO_CTX, 2369 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 2370 .arg3_type = ARG_CONST_SIZE, 2371 }; 2372 2373 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, 2374 const char *, buf, size_t, buf_len) 2375 { 2376 if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) 2377 return -EINVAL; 2378 2379 if (buf_len > PAGE_SIZE - 1) 2380 return -E2BIG; 2381 2382 memcpy(ctx->new_val, buf, buf_len); 2383 ((char *)ctx->new_val)[buf_len] = '\0'; 2384 ctx->new_len = buf_len; 2385 ctx->new_updated = 1; 2386 2387 return 0; 2388 } 2389 2390 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { 2391 .func = bpf_sysctl_set_new_value, 2392 .gpl_only = false, 2393 .ret_type = RET_INTEGER, 2394 .arg1_type = ARG_PTR_TO_CTX, 2395 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 2396 .arg3_type = ARG_CONST_SIZE, 2397 }; 2398 2399 static const struct bpf_func_proto * 2400 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 2401 { 2402 const struct bpf_func_proto *func_proto; 2403 2404 func_proto = cgroup_common_func_proto(func_id, prog); 2405 if (func_proto) 2406 return func_proto; 2407 2408 switch (func_id) { 2409 case BPF_FUNC_sysctl_get_name: 2410 return &bpf_sysctl_get_name_proto; 2411 case BPF_FUNC_sysctl_get_current_value: 2412 return &bpf_sysctl_get_current_value_proto; 2413 case BPF_FUNC_sysctl_get_new_value: 2414 return &bpf_sysctl_get_new_value_proto; 2415 case BPF_FUNC_sysctl_set_new_value: 2416 return &bpf_sysctl_set_new_value_proto; 2417 case BPF_FUNC_ktime_get_coarse_ns: 2418 return &bpf_ktime_get_coarse_ns_proto; 2419 case BPF_FUNC_perf_event_output: 2420 return &bpf_event_output_data_proto; 2421 default: 2422 return bpf_base_func_proto(func_id, prog); 2423 } 2424 } 2425 2426 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, 2427 const struct bpf_prog *prog, 2428 struct bpf_insn_access_aux *info) 2429 { 2430 const int size_default = sizeof(__u32); 2431 2432 if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) 2433 return false; 2434 2435 switch (off) { 2436 case bpf_ctx_range(struct bpf_sysctl, write): 2437 if (type != BPF_READ) 2438 return false; 2439 bpf_ctx_record_field_size(info, size_default); 2440 return bpf_ctx_narrow_access_ok(off, size, size_default); 2441 case bpf_ctx_range(struct bpf_sysctl, file_pos): 2442 if (type == BPF_READ) { 2443 bpf_ctx_record_field_size(info, size_default); 2444 return bpf_ctx_narrow_access_ok(off, size, size_default); 2445 } else { 2446 return size == size_default; 2447 } 2448 default: 2449 return false; 2450 } 2451 } 2452 2453 static u32 sysctl_convert_ctx_access(enum bpf_access_type type, 2454 const struct bpf_insn *si, 2455 struct bpf_insn *insn_buf, 2456 struct bpf_prog *prog, u32 *target_size) 2457 { 2458 struct bpf_insn *insn = insn_buf; 2459 u32 read_size; 2460 2461 switch (si->off) { 2462 case offsetof(struct bpf_sysctl, write): 2463 *insn++ = BPF_LDX_MEM( 2464 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 2465 bpf_target_off(struct bpf_sysctl_kern, write, 2466 sizeof_field(struct bpf_sysctl_kern, 2467 write), 2468 target_size)); 2469 break; 2470 case offsetof(struct bpf_sysctl, file_pos): 2471 /* ppos is a pointer so it should be accessed via indirect 2472 * loads and stores. Also for stores additional temporary 2473 * register is used since neither src_reg nor dst_reg can be 2474 * overridden. 2475 */ 2476 if (type == BPF_WRITE) { 2477 int treg = BPF_REG_9; 2478 2479 if (si->src_reg == treg || si->dst_reg == treg) 2480 --treg; 2481 if (si->src_reg == treg || si->dst_reg == treg) 2482 --treg; 2483 *insn++ = BPF_STX_MEM( 2484 BPF_DW, si->dst_reg, treg, 2485 offsetof(struct bpf_sysctl_kern, tmp_reg)); 2486 *insn++ = BPF_LDX_MEM( 2487 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 2488 treg, si->dst_reg, 2489 offsetof(struct bpf_sysctl_kern, ppos)); 2490 *insn++ = BPF_RAW_INSN( 2491 BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32), 2492 treg, si->src_reg, 2493 bpf_ctx_narrow_access_offset( 2494 0, sizeof(u32), sizeof(loff_t)), 2495 si->imm); 2496 *insn++ = BPF_LDX_MEM( 2497 BPF_DW, treg, si->dst_reg, 2498 offsetof(struct bpf_sysctl_kern, tmp_reg)); 2499 } else { 2500 *insn++ = BPF_LDX_MEM( 2501 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 2502 si->dst_reg, si->src_reg, 2503 offsetof(struct bpf_sysctl_kern, ppos)); 2504 read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); 2505 *insn++ = BPF_LDX_MEM( 2506 BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 2507 bpf_ctx_narrow_access_offset( 2508 0, read_size, sizeof(loff_t))); 2509 } 2510 *target_size = sizeof(u32); 2511 break; 2512 } 2513 2514 return insn - insn_buf; 2515 } 2516 2517 const struct bpf_verifier_ops cg_sysctl_verifier_ops = { 2518 .get_func_proto = sysctl_func_proto, 2519 .is_valid_access = sysctl_is_valid_access, 2520 .convert_ctx_access = sysctl_convert_ctx_access, 2521 }; 2522 2523 const struct bpf_prog_ops cg_sysctl_prog_ops = { 2524 }; 2525 2526 #ifdef CONFIG_NET 2527 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx) 2528 { 2529 const struct net *net = ctx ? sock_net(ctx->sk) : &init_net; 2530 2531 return net->net_cookie; 2532 } 2533 2534 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { 2535 .func = bpf_get_netns_cookie_sockopt, 2536 .gpl_only = false, 2537 .ret_type = RET_INTEGER, 2538 .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 2539 }; 2540 #endif 2541 2542 static const struct bpf_func_proto * 2543 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 2544 { 2545 const struct bpf_func_proto *func_proto; 2546 2547 func_proto = cgroup_common_func_proto(func_id, prog); 2548 if (func_proto) 2549 return func_proto; 2550 2551 switch (func_id) { 2552 #ifdef CONFIG_NET 2553 case BPF_FUNC_get_netns_cookie: 2554 return &bpf_get_netns_cookie_sockopt_proto; 2555 case BPF_FUNC_sk_storage_get: 2556 return &bpf_sk_storage_get_proto; 2557 case BPF_FUNC_sk_storage_delete: 2558 return &bpf_sk_storage_delete_proto; 2559 case BPF_FUNC_setsockopt: 2560 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) 2561 return &bpf_sk_setsockopt_proto; 2562 return NULL; 2563 case BPF_FUNC_getsockopt: 2564 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) 2565 return &bpf_sk_getsockopt_proto; 2566 return NULL; 2567 #endif 2568 #ifdef CONFIG_INET 2569 case BPF_FUNC_tcp_sock: 2570 return &bpf_tcp_sock_proto; 2571 #endif 2572 case BPF_FUNC_perf_event_output: 2573 return &bpf_event_output_data_proto; 2574 default: 2575 return bpf_base_func_proto(func_id, prog); 2576 } 2577 } 2578 2579 static bool cg_sockopt_is_valid_access(int off, int size, 2580 enum bpf_access_type type, 2581 const struct bpf_prog *prog, 2582 struct bpf_insn_access_aux *info) 2583 { 2584 const int size_default = sizeof(__u32); 2585 2586 if (off < 0 || off >= sizeof(struct bpf_sockopt)) 2587 return false; 2588 2589 if (off % size != 0) 2590 return false; 2591 2592 if (type == BPF_WRITE) { 2593 switch (off) { 2594 case offsetof(struct bpf_sockopt, retval): 2595 if (size != size_default) 2596 return false; 2597 return prog->expected_attach_type == 2598 BPF_CGROUP_GETSOCKOPT; 2599 case offsetof(struct bpf_sockopt, optname): 2600 fallthrough; 2601 case offsetof(struct bpf_sockopt, level): 2602 if (size != size_default) 2603 return false; 2604 return prog->expected_attach_type == 2605 BPF_CGROUP_SETSOCKOPT; 2606 case offsetof(struct bpf_sockopt, optlen): 2607 return size == size_default; 2608 default: 2609 return false; 2610 } 2611 } 2612 2613 switch (off) { 2614 case bpf_ctx_range_ptr(struct bpf_sockopt, sk): 2615 if (size != sizeof(__u64)) 2616 return false; 2617 info->reg_type = PTR_TO_SOCKET; 2618 break; 2619 case bpf_ctx_range_ptr(struct bpf_sockopt, optval): 2620 if (size != sizeof(__u64)) 2621 return false; 2622 info->reg_type = PTR_TO_PACKET; 2623 break; 2624 case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end): 2625 if (size != sizeof(__u64)) 2626 return false; 2627 info->reg_type = PTR_TO_PACKET_END; 2628 break; 2629 case bpf_ctx_range(struct bpf_sockopt, retval): 2630 if (size != size_default) 2631 return false; 2632 return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; 2633 default: 2634 if (size != size_default) 2635 return false; 2636 break; 2637 } 2638 return true; 2639 } 2640 2641 #define CG_SOCKOPT_READ_FIELD(F) \ 2642 BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ 2643 si->dst_reg, si->src_reg, \ 2644 offsetof(struct bpf_sockopt_kern, F)) 2645 2646 #define CG_SOCKOPT_WRITE_FIELD(F) \ 2647 BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \ 2648 BPF_MEM | BPF_CLASS(si->code)), \ 2649 si->dst_reg, si->src_reg, \ 2650 offsetof(struct bpf_sockopt_kern, F), \ 2651 si->imm) 2652 2653 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, 2654 const struct bpf_insn *si, 2655 struct bpf_insn *insn_buf, 2656 struct bpf_prog *prog, 2657 u32 *target_size) 2658 { 2659 struct bpf_insn *insn = insn_buf; 2660 2661 switch (si->off) { 2662 case offsetof(struct bpf_sockopt, sk): 2663 *insn++ = CG_SOCKOPT_READ_FIELD(sk); 2664 break; 2665 case offsetof(struct bpf_sockopt, level): 2666 if (type == BPF_WRITE) 2667 *insn++ = CG_SOCKOPT_WRITE_FIELD(level); 2668 else 2669 *insn++ = CG_SOCKOPT_READ_FIELD(level); 2670 break; 2671 case offsetof(struct bpf_sockopt, optname): 2672 if (type == BPF_WRITE) 2673 *insn++ = CG_SOCKOPT_WRITE_FIELD(optname); 2674 else 2675 *insn++ = CG_SOCKOPT_READ_FIELD(optname); 2676 break; 2677 case offsetof(struct bpf_sockopt, optlen): 2678 if (type == BPF_WRITE) 2679 *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen); 2680 else 2681 *insn++ = CG_SOCKOPT_READ_FIELD(optlen); 2682 break; 2683 case offsetof(struct bpf_sockopt, retval): 2684 BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); 2685 2686 if (type == BPF_WRITE) { 2687 int treg = BPF_REG_9; 2688 2689 if (si->src_reg == treg || si->dst_reg == treg) 2690 --treg; 2691 if (si->src_reg == treg || si->dst_reg == treg) 2692 --treg; 2693 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, 2694 offsetof(struct bpf_sockopt_kern, tmp_reg)); 2695 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), 2696 treg, si->dst_reg, 2697 offsetof(struct bpf_sockopt_kern, current_task)); 2698 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), 2699 treg, treg, 2700 offsetof(struct task_struct, bpf_ctx)); 2701 *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM | 2702 BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), 2703 treg, si->src_reg, 2704 offsetof(struct bpf_cg_run_ctx, retval), 2705 si->imm); 2706 *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, 2707 offsetof(struct bpf_sockopt_kern, tmp_reg)); 2708 } else { 2709 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), 2710 si->dst_reg, si->src_reg, 2711 offsetof(struct bpf_sockopt_kern, current_task)); 2712 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), 2713 si->dst_reg, si->dst_reg, 2714 offsetof(struct task_struct, bpf_ctx)); 2715 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), 2716 si->dst_reg, si->dst_reg, 2717 offsetof(struct bpf_cg_run_ctx, retval)); 2718 } 2719 break; 2720 case offsetof(struct bpf_sockopt, optval): 2721 *insn++ = CG_SOCKOPT_READ_FIELD(optval); 2722 break; 2723 case offsetof(struct bpf_sockopt, optval_end): 2724 *insn++ = CG_SOCKOPT_READ_FIELD(optval_end); 2725 break; 2726 } 2727 2728 return insn - insn_buf; 2729 } 2730 2731 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, 2732 bool direct_write, 2733 const struct bpf_prog *prog) 2734 { 2735 /* Nothing to do for sockopt argument. The data is kzalloc'ated. 2736 */ 2737 return 0; 2738 } 2739 2740 const struct bpf_verifier_ops cg_sockopt_verifier_ops = { 2741 .get_func_proto = cg_sockopt_func_proto, 2742 .is_valid_access = cg_sockopt_is_valid_access, 2743 .convert_ctx_access = cg_sockopt_convert_ctx_access, 2744 .gen_prologue = cg_sockopt_get_prologue, 2745 }; 2746 2747 const struct bpf_prog_ops cg_sockopt_prog_ops = { 2748 }; 2749 2750 /* Common helpers for cgroup hooks. */ 2751 const struct bpf_func_proto * 2752 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 2753 { 2754 switch (func_id) { 2755 case BPF_FUNC_get_local_storage: 2756 return &bpf_get_local_storage_proto; 2757 case BPF_FUNC_get_retval: 2758 switch (prog->expected_attach_type) { 2759 case BPF_CGROUP_INET_INGRESS: 2760 case BPF_CGROUP_INET_EGRESS: 2761 case BPF_CGROUP_SOCK_OPS: 2762 case BPF_CGROUP_UDP4_RECVMSG: 2763 case BPF_CGROUP_UDP6_RECVMSG: 2764 case BPF_CGROUP_UNIX_RECVMSG: 2765 case BPF_CGROUP_INET4_GETPEERNAME: 2766 case BPF_CGROUP_INET6_GETPEERNAME: 2767 case BPF_CGROUP_UNIX_GETPEERNAME: 2768 case BPF_CGROUP_INET4_GETSOCKNAME: 2769 case BPF_CGROUP_INET6_GETSOCKNAME: 2770 case BPF_CGROUP_UNIX_GETSOCKNAME: 2771 return NULL; 2772 default: 2773 return &bpf_get_retval_proto; 2774 } 2775 case BPF_FUNC_set_retval: 2776 switch (prog->expected_attach_type) { 2777 case BPF_CGROUP_INET_INGRESS: 2778 case BPF_CGROUP_INET_EGRESS: 2779 case BPF_CGROUP_SOCK_OPS: 2780 case BPF_CGROUP_UDP4_RECVMSG: 2781 case BPF_CGROUP_UDP6_RECVMSG: 2782 case BPF_CGROUP_UNIX_RECVMSG: 2783 case BPF_CGROUP_INET4_GETPEERNAME: 2784 case BPF_CGROUP_INET6_GETPEERNAME: 2785 case BPF_CGROUP_UNIX_GETPEERNAME: 2786 case BPF_CGROUP_INET4_GETSOCKNAME: 2787 case BPF_CGROUP_INET6_GETSOCKNAME: 2788 case BPF_CGROUP_UNIX_GETSOCKNAME: 2789 return NULL; 2790 default: 2791 return &bpf_set_retval_proto; 2792 } 2793 default: 2794 return NULL; 2795 } 2796 } 2797