1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Functions to manage eBPF programs attached to cgroups 4 * 5 * Copyright (c) 2016 Daniel Mack 6 */ 7 8 #include <linux/kernel.h> 9 #include <linux/atomic.h> 10 #include <linux/cgroup.h> 11 #include <linux/filter.h> 12 #include <linux/slab.h> 13 #include <linux/sysctl.h> 14 #include <linux/string.h> 15 #include <linux/bpf.h> 16 #include <linux/bpf-cgroup.h> 17 #include <linux/bpf_lsm.h> 18 #include <linux/bpf_verifier.h> 19 #include <net/sock.h> 20 #include <net/bpf_sk_storage.h> 21 22 #include "../cgroup/cgroup-internal.h" 23 24 DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); 25 EXPORT_SYMBOL(cgroup_bpf_enabled_key); 26 27 /* 28 * cgroup bpf destruction makes heavy use of work items and there can be a lot 29 * of concurrent destructions. Use a separate workqueue so that cgroup bpf 30 * destruction work items don't end up filling up max_active of system_percpu_wq 31 * which may lead to deadlock. 32 */ 33 static struct workqueue_struct *cgroup_bpf_destroy_wq; 34 35 static int __init cgroup_bpf_wq_init(void) 36 { 37 cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 38 WQ_PERCPU, 1); 39 if (!cgroup_bpf_destroy_wq) 40 panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); 41 return 0; 42 } 43 core_initcall(cgroup_bpf_wq_init); 44 45 static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, 46 unsigned long action, void *data); 47 48 static struct notifier_block cgroup_bpf_lifetime_nb = { 49 .notifier_call = cgroup_bpf_lifetime_notify, 50 }; 51 52 void __init cgroup_bpf_lifetime_notifier_init(void) 53 { 54 BUG_ON(blocking_notifier_chain_register(&cgroup_lifetime_notifier, 55 &cgroup_bpf_lifetime_nb)); 56 } 57 58 #ifdef CONFIG_BPF_LSM 59 struct cgroup_lsm_atype { 60 u32 attach_btf_id; 61 int refcnt; 62 bool returns_errno; 63 }; 64 65 static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM]; 66 67 static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) 68 { 69 if (atype >= CGROUP_LSM_START && atype <= CGROUP_LSM_END) 70 return READ_ONCE(cgroup_lsm_atype[atype - CGROUP_LSM_START].returns_errno); 71 return true; 72 } 73 #else 74 static bool cgroup_bpf_hook_returns_errno(enum cgroup_bpf_attach_type atype) 75 { 76 return true; 77 } 78 #endif 79 80 /* __always_inline is necessary to prevent indirect call through run_prog 81 * function pointer. 82 */ 83 static __always_inline int 84 bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp, 85 enum cgroup_bpf_attach_type atype, 86 const void *ctx, bpf_prog_run_fn run_prog, 87 int retval, u32 *ret_flags) 88 { 89 const struct bpf_prog_array_item *item; 90 const struct bpf_prog *prog; 91 const struct bpf_prog_array *array; 92 struct bpf_run_ctx *old_run_ctx; 93 struct bpf_cg_run_ctx run_ctx; 94 u32 func_ret; 95 96 run_ctx.retval = retval; 97 rcu_read_lock_dont_migrate(); 98 array = rcu_dereference(cgrp->effective[atype]); 99 item = &array->items[0]; 100 old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); 101 while ((prog = READ_ONCE(item->prog))) { 102 run_ctx.prog_item = item; 103 func_ret = run_prog(prog, ctx); 104 if (ret_flags) { 105 *(ret_flags) |= (func_ret >> 1); 106 func_ret &= 1; 107 } 108 if (!func_ret && cgroup_bpf_hook_returns_errno(atype) && 109 !IS_ERR_VALUE((long)run_ctx.retval)) 110 run_ctx.retval = -EPERM; 111 item++; 112 } 113 bpf_reset_run_ctx(old_run_ctx); 114 rcu_read_unlock_migrate(); 115 return run_ctx.retval; 116 } 117 118 unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx, 119 const struct bpf_insn *insn) 120 { 121 const struct bpf_prog *shim_prog; 122 struct sock *sk; 123 struct cgroup *cgrp; 124 int ret = 0; 125 u64 *args; 126 127 args = (u64 *)ctx; 128 sk = (void *)(unsigned long)args[0]; 129 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ 130 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); 131 132 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 133 if (likely(cgrp)) 134 ret = bpf_prog_run_array_cg(&cgrp->bpf, 135 shim_prog->aux->cgroup_atype, 136 ctx, bpf_prog_run, 0, NULL); 137 return ret; 138 } 139 140 unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx, 141 const struct bpf_insn *insn) 142 { 143 const struct bpf_prog *shim_prog; 144 struct socket *sock; 145 struct cgroup *cgrp; 146 int ret = 0; 147 u64 *args; 148 149 args = (u64 *)ctx; 150 sock = (void *)(unsigned long)args[0]; 151 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ 152 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); 153 154 cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data); 155 if (likely(cgrp)) 156 ret = bpf_prog_run_array_cg(&cgrp->bpf, 157 shim_prog->aux->cgroup_atype, 158 ctx, bpf_prog_run, 0, NULL); 159 return ret; 160 } 161 162 unsigned int __cgroup_bpf_run_lsm_current(const void *ctx, 163 const struct bpf_insn *insn) 164 { 165 const struct bpf_prog *shim_prog; 166 struct cgroup *cgrp; 167 int ret = 0; 168 169 /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/ 170 shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi)); 171 172 /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */ 173 cgrp = task_dfl_cgroup(current); 174 if (likely(cgrp)) 175 ret = bpf_prog_run_array_cg(&cgrp->bpf, 176 shim_prog->aux->cgroup_atype, 177 ctx, bpf_prog_run, 0, NULL); 178 return ret; 179 } 180 181 #ifdef CONFIG_BPF_LSM 182 static enum cgroup_bpf_attach_type 183 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) 184 { 185 int i; 186 187 lockdep_assert_held(&cgroup_mutex); 188 189 if (attach_type != BPF_LSM_CGROUP) 190 return to_cgroup_bpf_attach_type(attach_type); 191 192 for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) 193 if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id) 194 return CGROUP_LSM_START + i; 195 196 for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++) 197 if (cgroup_lsm_atype[i].attach_btf_id == 0) 198 return CGROUP_LSM_START + i; 199 200 return -E2BIG; 201 202 } 203 204 void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype) 205 { 206 int i = cgroup_atype - CGROUP_LSM_START; 207 208 lockdep_assert_held(&cgroup_mutex); 209 210 if (!cgroup_lsm_atype[i].attach_btf_id) { 211 cgroup_lsm_atype[i].attach_btf_id = attach_btf_id; 212 WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, 213 bpf_lsm_hook_returns_errno(attach_btf_id)); 214 } else { 215 WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id != attach_btf_id); 216 } 217 cgroup_lsm_atype[i].refcnt++; 218 } 219 220 void bpf_cgroup_atype_put(int cgroup_atype) 221 { 222 int i = cgroup_atype - CGROUP_LSM_START; 223 224 cgroup_lock(); 225 if (--cgroup_lsm_atype[i].refcnt <= 0) { 226 WRITE_ONCE(cgroup_lsm_atype[i].returns_errno, true); 227 cgroup_lsm_atype[i].attach_btf_id = 0; 228 } 229 WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0); 230 cgroup_unlock(); 231 } 232 #else 233 static enum cgroup_bpf_attach_type 234 bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id) 235 { 236 if (attach_type != BPF_LSM_CGROUP) 237 return to_cgroup_bpf_attach_type(attach_type); 238 return -EOPNOTSUPP; 239 } 240 #endif /* CONFIG_BPF_LSM */ 241 242 static void cgroup_bpf_offline(struct cgroup *cgrp) 243 { 244 cgroup_get(cgrp); 245 percpu_ref_kill(&cgrp->bpf.refcnt); 246 } 247 248 static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) 249 { 250 enum bpf_cgroup_storage_type stype; 251 252 for_each_cgroup_storage_type(stype) 253 bpf_cgroup_storage_free(storages[stype]); 254 } 255 256 static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], 257 struct bpf_cgroup_storage *new_storages[], 258 enum bpf_attach_type type, 259 struct bpf_prog *prog, 260 struct cgroup *cgrp) 261 { 262 enum bpf_cgroup_storage_type stype; 263 struct bpf_cgroup_storage_key key; 264 struct bpf_map *map; 265 266 key.cgroup_inode_id = cgroup_id(cgrp); 267 key.attach_type = type; 268 269 for_each_cgroup_storage_type(stype) { 270 map = prog->aux->cgroup_storage[stype]; 271 if (!map) 272 continue; 273 274 storages[stype] = cgroup_storage_lookup((void *)map, &key, false); 275 if (storages[stype]) 276 continue; 277 278 storages[stype] = bpf_cgroup_storage_alloc(prog, stype); 279 if (IS_ERR(storages[stype])) { 280 bpf_cgroup_storages_free(new_storages); 281 return -ENOMEM; 282 } 283 284 new_storages[stype] = storages[stype]; 285 } 286 287 return 0; 288 } 289 290 static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], 291 struct bpf_cgroup_storage *src[]) 292 { 293 enum bpf_cgroup_storage_type stype; 294 295 for_each_cgroup_storage_type(stype) 296 dst[stype] = src[stype]; 297 } 298 299 static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], 300 struct cgroup *cgrp, 301 enum bpf_attach_type attach_type) 302 { 303 enum bpf_cgroup_storage_type stype; 304 305 for_each_cgroup_storage_type(stype) 306 bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); 307 } 308 309 /* Called when bpf_cgroup_link is auto-detached from dying cgroup. 310 * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It 311 * doesn't free link memory, which will eventually be done by bpf_link's 312 * release() callback, when its last FD is closed. 313 */ 314 static void bpf_cgroup_link_auto_detach(struct bpf_cgroup_link *link) 315 { 316 cgroup_put(link->cgroup); 317 link->cgroup = NULL; 318 } 319 320 /** 321 * cgroup_bpf_release() - put references of all bpf programs and 322 * release all cgroup bpf data 323 * @work: work structure embedded into the cgroup to modify 324 */ 325 static void cgroup_bpf_release(struct work_struct *work) 326 { 327 struct cgroup *p, *cgrp = container_of(work, struct cgroup, 328 bpf.release_work); 329 struct bpf_prog_array *old_array; 330 struct list_head *storages = &cgrp->bpf.storages; 331 struct bpf_cgroup_storage *storage, *stmp; 332 333 unsigned int atype; 334 335 cgroup_lock(); 336 337 for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) { 338 struct hlist_head *progs = &cgrp->bpf.progs[atype]; 339 struct bpf_prog_list *pl; 340 struct hlist_node *pltmp; 341 342 hlist_for_each_entry_safe(pl, pltmp, progs, node) { 343 hlist_del(&pl->node); 344 if (pl->prog) { 345 if (pl->prog->expected_attach_type == BPF_LSM_CGROUP) 346 bpf_trampoline_unlink_cgroup_shim(pl->prog); 347 bpf_prog_put(pl->prog); 348 } 349 if (pl->link) { 350 if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP) 351 bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog); 352 bpf_cgroup_link_auto_detach(pl->link); 353 } 354 kfree(pl); 355 static_branch_dec(&cgroup_bpf_enabled_key[atype]); 356 } 357 old_array = rcu_dereference_protected( 358 cgrp->bpf.effective[atype], 359 lockdep_is_held(&cgroup_mutex)); 360 bpf_prog_array_free(old_array); 361 } 362 363 list_for_each_entry_safe(storage, stmp, storages, list_cg) { 364 bpf_cgroup_storage_unlink(storage); 365 bpf_cgroup_storage_free(storage); 366 } 367 368 cgroup_unlock(); 369 370 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 371 cgroup_bpf_put(p); 372 373 percpu_ref_exit(&cgrp->bpf.refcnt); 374 cgroup_put(cgrp); 375 } 376 377 /** 378 * cgroup_bpf_release_fn() - callback used to schedule releasing 379 * of bpf cgroup data 380 * @ref: percpu ref counter structure 381 */ 382 static void cgroup_bpf_release_fn(struct percpu_ref *ref) 383 { 384 struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); 385 386 INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); 387 queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); 388 } 389 390 /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through 391 * link or direct prog. 392 */ 393 static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl) 394 { 395 if (pl->prog) 396 return pl->prog; 397 if (pl->link) 398 return pl->link->link.prog; 399 return NULL; 400 } 401 402 /* count number of elements in the list. 403 * it's slow but the list cannot be long 404 */ 405 static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt) 406 { 407 struct bpf_prog_list *pl; 408 u32 cnt = 0; 409 410 hlist_for_each_entry(pl, head, node) { 411 if (!prog_list_prog(pl)) 412 continue; 413 if (preorder_cnt && (pl->flags & BPF_F_PREORDER)) 414 (*preorder_cnt)++; 415 cnt++; 416 } 417 return cnt; 418 } 419 420 /* if parent has non-overridable prog attached, 421 * disallow attaching new programs to the descendent cgroup. 422 * if parent has overridable or multi-prog, allow attaching 423 */ 424 static bool hierarchy_allows_attach(struct cgroup *cgrp, 425 enum cgroup_bpf_attach_type atype) 426 { 427 struct cgroup *p; 428 429 p = cgroup_parent(cgrp); 430 if (!p) 431 return true; 432 do { 433 u32 flags = p->bpf.flags[atype]; 434 u32 cnt; 435 436 if (flags & BPF_F_ALLOW_MULTI) 437 return true; 438 cnt = prog_list_length(&p->bpf.progs[atype], NULL); 439 WARN_ON_ONCE(cnt > 1); 440 if (cnt == 1) 441 return !!(flags & BPF_F_ALLOW_OVERRIDE); 442 p = cgroup_parent(p); 443 } while (p); 444 return true; 445 } 446 447 /* compute a chain of effective programs for a given cgroup: 448 * start from the list of programs in this cgroup and add 449 * all parent programs. 450 * Note that parent's F_ALLOW_OVERRIDE-type program is yielding 451 * to programs in this cgroup 452 */ 453 static int compute_effective_progs(struct cgroup *cgrp, 454 enum cgroup_bpf_attach_type atype, 455 struct bpf_prog_array **array) 456 { 457 struct bpf_prog_array_item *item; 458 struct bpf_prog_array *progs; 459 struct bpf_prog_list *pl; 460 struct cgroup *p = cgrp; 461 int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart; 462 463 /* count number of effective programs by walking parents */ 464 do { 465 if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 466 cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt); 467 p = cgroup_parent(p); 468 } while (p); 469 470 progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); 471 if (!progs) 472 return -ENOMEM; 473 474 /* populate the array with effective progs */ 475 cnt = 0; 476 p = cgrp; 477 fstart = preorder_cnt; 478 bstart = preorder_cnt - 1; 479 do { 480 if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 481 continue; 482 483 init_bstart = bstart; 484 hlist_for_each_entry(pl, &p->bpf.progs[atype], node) { 485 if (!prog_list_prog(pl)) 486 continue; 487 488 if (pl->flags & BPF_F_PREORDER) { 489 item = &progs->items[bstart]; 490 bstart--; 491 } else { 492 item = &progs->items[fstart]; 493 fstart++; 494 } 495 item->prog = prog_list_prog(pl); 496 bpf_cgroup_storages_assign(item->cgroup_storage, 497 pl->storage); 498 cnt++; 499 } 500 501 /* reverse pre-ordering progs at this cgroup level */ 502 for (i = bstart + 1, j = init_bstart; i < j; i++, j--) 503 swap(progs->items[i], progs->items[j]); 504 505 } while ((p = cgroup_parent(p))); 506 507 *array = progs; 508 return 0; 509 } 510 511 static void activate_effective_progs(struct cgroup *cgrp, 512 enum cgroup_bpf_attach_type atype, 513 struct bpf_prog_array *old_array) 514 { 515 old_array = rcu_replace_pointer(cgrp->bpf.effective[atype], old_array, 516 lockdep_is_held(&cgroup_mutex)); 517 /* free prog array after grace period, since __cgroup_bpf_run_*() 518 * might be still walking the array 519 */ 520 bpf_prog_array_free(old_array); 521 } 522 523 /** 524 * cgroup_bpf_inherit() - inherit effective programs from parent 525 * @cgrp: the cgroup to modify 526 */ 527 static int cgroup_bpf_inherit(struct cgroup *cgrp) 528 { 529 /* has to use marco instead of const int, since compiler thinks 530 * that array below is variable length 531 */ 532 #define NR ARRAY_SIZE(cgrp->bpf.effective) 533 struct bpf_prog_array *arrays[NR] = {}; 534 struct cgroup *p; 535 int ret, i; 536 537 ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, 538 GFP_KERNEL); 539 if (ret) 540 return ret; 541 542 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 543 cgroup_bpf_get(p); 544 545 for (i = 0; i < NR; i++) 546 INIT_HLIST_HEAD(&cgrp->bpf.progs[i]); 547 548 INIT_LIST_HEAD(&cgrp->bpf.storages); 549 550 for (i = 0; i < NR; i++) 551 if (compute_effective_progs(cgrp, i, &arrays[i])) 552 goto cleanup; 553 554 for (i = 0; i < NR; i++) 555 activate_effective_progs(cgrp, i, arrays[i]); 556 557 return 0; 558 cleanup: 559 for (i = 0; i < NR; i++) 560 bpf_prog_array_free(arrays[i]); 561 562 for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) 563 cgroup_bpf_put(p); 564 565 percpu_ref_exit(&cgrp->bpf.refcnt); 566 567 return -ENOMEM; 568 } 569 570 static int cgroup_bpf_lifetime_notify(struct notifier_block *nb, 571 unsigned long action, void *data) 572 { 573 struct cgroup *cgrp = data; 574 int ret = 0; 575 576 if (cgrp->root != &cgrp_dfl_root) 577 return NOTIFY_OK; 578 579 switch (action) { 580 case CGROUP_LIFETIME_ONLINE: 581 ret = cgroup_bpf_inherit(cgrp); 582 break; 583 case CGROUP_LIFETIME_OFFLINE: 584 cgroup_bpf_offline(cgrp); 585 break; 586 } 587 588 return notifier_from_errno(ret); 589 } 590 591 static int update_effective_progs(struct cgroup *cgrp, 592 enum cgroup_bpf_attach_type atype) 593 { 594 struct cgroup_subsys_state *css; 595 int err; 596 597 /* allocate and recompute effective prog arrays */ 598 css_for_each_descendant_pre(css, &cgrp->self) { 599 struct cgroup *desc = container_of(css, struct cgroup, self); 600 601 if (percpu_ref_is_zero(&desc->bpf.refcnt)) 602 continue; 603 604 err = compute_effective_progs(desc, atype, &desc->bpf.inactive); 605 if (err) 606 goto cleanup; 607 } 608 609 /* all allocations were successful. Activate all prog arrays */ 610 css_for_each_descendant_pre(css, &cgrp->self) { 611 struct cgroup *desc = container_of(css, struct cgroup, self); 612 613 if (percpu_ref_is_zero(&desc->bpf.refcnt)) { 614 if (unlikely(desc->bpf.inactive)) { 615 bpf_prog_array_free(desc->bpf.inactive); 616 desc->bpf.inactive = NULL; 617 } 618 continue; 619 } 620 621 activate_effective_progs(desc, atype, desc->bpf.inactive); 622 desc->bpf.inactive = NULL; 623 } 624 625 return 0; 626 627 cleanup: 628 /* oom while computing effective. Free all computed effective arrays 629 * since they were not activated 630 */ 631 css_for_each_descendant_pre(css, &cgrp->self) { 632 struct cgroup *desc = container_of(css, struct cgroup, self); 633 634 bpf_prog_array_free(desc->bpf.inactive); 635 desc->bpf.inactive = NULL; 636 } 637 638 return err; 639 } 640 641 #define BPF_CGROUP_MAX_PROGS 64 642 643 static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs, 644 struct bpf_prog *prog, 645 struct bpf_cgroup_link *link, 646 struct bpf_prog *replace_prog, 647 bool allow_multi) 648 { 649 struct bpf_prog_list *pl; 650 651 /* single-attach case */ 652 if (!allow_multi) { 653 if (hlist_empty(progs)) 654 return NULL; 655 return hlist_entry(progs->first, typeof(*pl), node); 656 } 657 658 hlist_for_each_entry(pl, progs, node) { 659 if (prog && pl->prog == prog && prog != replace_prog) 660 /* disallow attaching the same prog twice */ 661 return ERR_PTR(-EINVAL); 662 if (link && pl->link == link) 663 /* disallow attaching the same link twice */ 664 return ERR_PTR(-EINVAL); 665 } 666 667 /* direct prog multi-attach w/ replacement case */ 668 if (replace_prog) { 669 hlist_for_each_entry(pl, progs, node) { 670 if (pl->prog == replace_prog) 671 /* a match found */ 672 return pl; 673 } 674 /* prog to replace not found for cgroup */ 675 return ERR_PTR(-ENOENT); 676 } 677 678 return NULL; 679 } 680 681 static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd) 682 { 683 struct bpf_link *link = ERR_PTR(-EINVAL); 684 685 if (flags & BPF_F_ID) 686 link = bpf_link_by_id(id_or_fd); 687 else if (id_or_fd) 688 link = bpf_link_get_from_fd(id_or_fd); 689 return link; 690 } 691 692 static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd) 693 { 694 struct bpf_prog *prog = ERR_PTR(-EINVAL); 695 696 if (flags & BPF_F_ID) 697 prog = bpf_prog_by_id(id_or_fd); 698 else if (id_or_fd) 699 prog = bpf_prog_get(id_or_fd); 700 return prog; 701 } 702 703 static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog, 704 struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd) 705 { 706 bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID; 707 struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL); 708 bool preorder = flags & BPF_F_PREORDER; 709 struct bpf_link *anchor_link = NULL; 710 struct bpf_prog *anchor_prog = NULL; 711 bool is_before, is_after; 712 713 is_before = flags & BPF_F_BEFORE; 714 is_after = flags & BPF_F_AFTER; 715 if (is_link || is_id || id_or_fd) { 716 /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */ 717 if (is_before == is_after) 718 return ERR_PTR(-EINVAL); 719 if ((is_link && !link) || (!is_link && !prog)) 720 return ERR_PTR(-EINVAL); 721 } else if (!hlist_empty(progs)) { 722 /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */ 723 if (is_before && is_after) 724 return ERR_PTR(-EINVAL); 725 } 726 727 if (is_link) { 728 anchor_link = bpf_get_anchor_link(flags, id_or_fd); 729 if (IS_ERR(anchor_link)) 730 return ERR_CAST(anchor_link); 731 } else if (is_id || id_or_fd) { 732 anchor_prog = bpf_get_anchor_prog(flags, id_or_fd); 733 if (IS_ERR(anchor_prog)) 734 return ERR_CAST(anchor_prog); 735 } 736 737 if (!anchor_prog && !anchor_link) { 738 /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER 739 * doesn't matter since either prepend or append to a combined 740 * list of progs will end up with correct result. 741 */ 742 hlist_for_each_entry(pltmp, progs, node) { 743 if (is_before) 744 return pltmp; 745 if (pltmp->node.next) 746 continue; 747 return pltmp; 748 } 749 return NULL; 750 } 751 752 hlist_for_each_entry(pltmp, progs, node) { 753 if ((anchor_prog && anchor_prog == pltmp->prog) || 754 (anchor_link && anchor_link == &pltmp->link->link)) { 755 if (!!(pltmp->flags & BPF_F_PREORDER) != preorder) 756 goto out; 757 pl = pltmp; 758 goto out; 759 } 760 } 761 762 pl = ERR_PTR(-ENOENT); 763 out: 764 if (anchor_link) 765 bpf_link_put(anchor_link); 766 else 767 bpf_prog_put(anchor_prog); 768 return pl; 769 } 770 771 static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs, 772 struct bpf_prog *prog, struct bpf_cgroup_link *link, 773 u32 flags, u32 id_or_fd) 774 { 775 struct bpf_prog_list *pltmp; 776 777 pltmp = get_prog_list(progs, prog, link, flags, id_or_fd); 778 if (IS_ERR(pltmp)) 779 return PTR_ERR(pltmp); 780 781 if (!pltmp) 782 hlist_add_head(&pl->node, progs); 783 else if (flags & BPF_F_BEFORE) 784 hlist_add_before(&pl->node, &pltmp->node); 785 else 786 hlist_add_behind(&pl->node, &pltmp->node); 787 788 return 0; 789 } 790 791 /** 792 * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and 793 * propagate the change to descendants 794 * @cgrp: The cgroup which descendants to traverse 795 * @prog: A program to attach 796 * @link: A link to attach 797 * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set 798 * @type: Type of attach operation 799 * @flags: Option flags 800 * @id_or_fd: Relative prog id or fd 801 * @revision: bpf_prog_list revision 802 * 803 * Exactly one of @prog or @link can be non-null. 804 * Must be called with cgroup_mutex held. 805 */ 806 static int __cgroup_bpf_attach(struct cgroup *cgrp, 807 struct bpf_prog *prog, struct bpf_prog *replace_prog, 808 struct bpf_cgroup_link *link, 809 enum bpf_attach_type type, u32 flags, u32 id_or_fd, 810 u64 revision) 811 { 812 u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)); 813 struct bpf_prog *old_prog = NULL; 814 struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; 815 struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; 816 struct bpf_prog *new_prog = prog ? : link->link.prog; 817 enum cgroup_bpf_attach_type atype; 818 struct bpf_prog_list *pl; 819 struct hlist_head *progs; 820 int err; 821 822 if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) || 823 ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI))) 824 /* invalid combination */ 825 return -EINVAL; 826 if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER))) 827 /* only either replace or insertion with before/after */ 828 return -EINVAL; 829 if (link && (prog || replace_prog)) 830 /* only either link or prog/replace_prog can be specified */ 831 return -EINVAL; 832 if (!!replace_prog != !!(flags & BPF_F_REPLACE)) 833 /* replace_prog implies BPF_F_REPLACE, and vice versa */ 834 return -EINVAL; 835 836 atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id); 837 if (atype < 0) 838 return -EINVAL; 839 if (revision && revision != cgrp->bpf.revisions[atype]) 840 return -ESTALE; 841 842 progs = &cgrp->bpf.progs[atype]; 843 844 if (!hierarchy_allows_attach(cgrp, atype)) 845 return -EPERM; 846 847 if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags) 848 /* Disallow attaching non-overridable on top 849 * of existing overridable in this cgroup. 850 * Disallow attaching multi-prog if overridable or none 851 */ 852 return -EPERM; 853 854 if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS) 855 return -E2BIG; 856 857 pl = find_attach_entry(progs, prog, link, replace_prog, 858 flags & BPF_F_ALLOW_MULTI); 859 if (IS_ERR(pl)) 860 return PTR_ERR(pl); 861 862 if (bpf_cgroup_storages_alloc(storage, new_storage, type, 863 prog ? : link->link.prog, cgrp)) 864 return -ENOMEM; 865 866 if (pl) { 867 old_prog = pl->prog; 868 } else { 869 pl = kmalloc_obj(*pl); 870 if (!pl) { 871 bpf_cgroup_storages_free(new_storage); 872 return -ENOMEM; 873 } 874 875 err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd); 876 if (err) { 877 kfree(pl); 878 bpf_cgroup_storages_free(new_storage); 879 return err; 880 } 881 } 882 883 pl->prog = prog; 884 pl->link = link; 885 pl->flags = flags; 886 bpf_cgroup_storages_assign(pl->storage, storage); 887 cgrp->bpf.flags[atype] = saved_flags; 888 889 if (type == BPF_LSM_CGROUP) { 890 err = bpf_trampoline_link_cgroup_shim(new_prog, atype, type); 891 if (err) 892 goto cleanup; 893 } 894 895 err = update_effective_progs(cgrp, atype); 896 if (err) 897 goto cleanup_trampoline; 898 899 cgrp->bpf.revisions[atype] += 1; 900 if (old_prog) { 901 if (type == BPF_LSM_CGROUP) 902 bpf_trampoline_unlink_cgroup_shim(old_prog); 903 bpf_prog_put(old_prog); 904 } else { 905 static_branch_inc(&cgroup_bpf_enabled_key[atype]); 906 } 907 bpf_cgroup_storages_link(new_storage, cgrp, type); 908 return 0; 909 910 cleanup_trampoline: 911 if (type == BPF_LSM_CGROUP) 912 bpf_trampoline_unlink_cgroup_shim(new_prog); 913 914 cleanup: 915 if (old_prog) { 916 pl->prog = old_prog; 917 pl->link = NULL; 918 } 919 bpf_cgroup_storages_free(new_storage); 920 if (!old_prog) { 921 hlist_del(&pl->node); 922 kfree(pl); 923 } 924 return err; 925 } 926 927 static int cgroup_bpf_attach(struct cgroup *cgrp, 928 struct bpf_prog *prog, struct bpf_prog *replace_prog, 929 struct bpf_cgroup_link *link, 930 enum bpf_attach_type type, 931 u32 flags, u32 id_or_fd, u64 revision) 932 { 933 int ret; 934 935 cgroup_lock(); 936 ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags, 937 id_or_fd, revision); 938 cgroup_unlock(); 939 return ret; 940 } 941 942 /* Swap updated BPF program for given link in effective program arrays across 943 * all descendant cgroups. This function is guaranteed to succeed. 944 */ 945 static void replace_effective_prog(struct cgroup *cgrp, 946 enum cgroup_bpf_attach_type atype, 947 struct bpf_cgroup_link *link) 948 { 949 struct bpf_prog_array_item *item; 950 struct cgroup_subsys_state *css; 951 struct bpf_prog_array *progs; 952 struct bpf_prog_list *pl; 953 struct hlist_head *head; 954 struct cgroup *cg; 955 int pos; 956 957 css_for_each_descendant_pre(css, &cgrp->self) { 958 struct cgroup *desc = container_of(css, struct cgroup, self); 959 960 if (percpu_ref_is_zero(&desc->bpf.refcnt)) 961 continue; 962 963 /* find position of link in effective progs array */ 964 for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { 965 if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 966 continue; 967 968 head = &cg->bpf.progs[atype]; 969 hlist_for_each_entry(pl, head, node) { 970 if (!prog_list_prog(pl)) 971 continue; 972 if (pl->link == link) 973 goto found; 974 pos++; 975 } 976 } 977 found: 978 BUG_ON(!cg); 979 progs = rcu_dereference_protected( 980 desc->bpf.effective[atype], 981 lockdep_is_held(&cgroup_mutex)); 982 item = &progs->items[pos]; 983 WRITE_ONCE(item->prog, link->link.prog); 984 } 985 } 986 987 /** 988 * __cgroup_bpf_replace() - Replace link's program and propagate the change 989 * to descendants 990 * @cgrp: The cgroup which descendants to traverse 991 * @link: A link for which to replace BPF program 992 * @new_prog: &struct bpf_prog for the target BPF program with its refcnt 993 * incremented 994 * 995 * Must be called with cgroup_mutex held. 996 */ 997 static int __cgroup_bpf_replace(struct cgroup *cgrp, 998 struct bpf_cgroup_link *link, 999 struct bpf_prog *new_prog) 1000 { 1001 enum cgroup_bpf_attach_type atype; 1002 struct bpf_prog *old_prog; 1003 struct bpf_prog_list *pl; 1004 struct hlist_head *progs; 1005 bool found = false; 1006 1007 atype = bpf_cgroup_atype_find(link->link.attach_type, new_prog->aux->attach_btf_id); 1008 if (atype < 0) 1009 return -EINVAL; 1010 1011 progs = &cgrp->bpf.progs[atype]; 1012 1013 if (link->link.prog->type != new_prog->type) 1014 return -EINVAL; 1015 1016 hlist_for_each_entry(pl, progs, node) { 1017 if (pl->link == link) { 1018 found = true; 1019 break; 1020 } 1021 } 1022 if (!found) 1023 return -ENOENT; 1024 1025 cgrp->bpf.revisions[atype] += 1; 1026 old_prog = xchg(&link->link.prog, new_prog); 1027 replace_effective_prog(cgrp, atype, link); 1028 bpf_prog_put(old_prog); 1029 return 0; 1030 } 1031 1032 static int cgroup_bpf_replace(struct bpf_link *link, struct bpf_prog *new_prog, 1033 struct bpf_prog *old_prog) 1034 { 1035 struct bpf_cgroup_link *cg_link; 1036 int ret; 1037 1038 cg_link = container_of(link, struct bpf_cgroup_link, link); 1039 1040 cgroup_lock(); 1041 /* link might have been auto-released by dying cgroup, so fail */ 1042 if (!cg_link->cgroup) { 1043 ret = -ENOLINK; 1044 goto out_unlock; 1045 } 1046 if (old_prog && link->prog != old_prog) { 1047 ret = -EPERM; 1048 goto out_unlock; 1049 } 1050 ret = __cgroup_bpf_replace(cg_link->cgroup, cg_link, new_prog); 1051 out_unlock: 1052 cgroup_unlock(); 1053 return ret; 1054 } 1055 1056 static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs, 1057 struct bpf_prog *prog, 1058 struct bpf_cgroup_link *link, 1059 bool allow_multi) 1060 { 1061 struct bpf_prog_list *pl; 1062 1063 if (!allow_multi) { 1064 if (hlist_empty(progs)) 1065 /* report error when trying to detach and nothing is attached */ 1066 return ERR_PTR(-ENOENT); 1067 1068 /* to maintain backward compatibility NONE and OVERRIDE cgroups 1069 * allow detaching with invalid FD (prog==NULL) in legacy mode 1070 */ 1071 return hlist_entry(progs->first, typeof(*pl), node); 1072 } 1073 1074 if (!prog && !link) 1075 /* to detach MULTI prog the user has to specify valid FD 1076 * of the program or link to be detached 1077 */ 1078 return ERR_PTR(-EINVAL); 1079 1080 /* find the prog or link and detach it */ 1081 hlist_for_each_entry(pl, progs, node) { 1082 if (pl->prog == prog && pl->link == link) 1083 return pl; 1084 } 1085 return ERR_PTR(-ENOENT); 1086 } 1087 1088 /** 1089 * purge_effective_progs() - After compute_effective_progs fails to alloc new 1090 * cgrp->bpf.inactive table we can recover by 1091 * recomputing the array in place. 1092 * 1093 * @cgrp: The cgroup which descendants to travers 1094 * @prog: A program to detach or NULL 1095 * @link: A link to detach or NULL 1096 * @atype: Type of detach operation 1097 */ 1098 static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, 1099 struct bpf_cgroup_link *link, 1100 enum cgroup_bpf_attach_type atype) 1101 { 1102 struct cgroup_subsys_state *css; 1103 struct bpf_prog_array *progs; 1104 struct bpf_prog_list *pl; 1105 struct hlist_head *head; 1106 struct cgroup *cg; 1107 int pos; 1108 1109 /* recompute effective prog array in place */ 1110 css_for_each_descendant_pre(css, &cgrp->self) { 1111 struct cgroup *desc = container_of(css, struct cgroup, self); 1112 1113 if (percpu_ref_is_zero(&desc->bpf.refcnt)) 1114 continue; 1115 1116 /* find position of link or prog in effective progs array */ 1117 for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { 1118 if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) 1119 continue; 1120 1121 head = &cg->bpf.progs[atype]; 1122 hlist_for_each_entry(pl, head, node) { 1123 if (!prog_list_prog(pl)) 1124 continue; 1125 if (pl->prog == prog && pl->link == link) 1126 goto found; 1127 pos++; 1128 } 1129 } 1130 1131 /* no link or prog match, skip the cgroup of this layer */ 1132 continue; 1133 found: 1134 progs = rcu_dereference_protected( 1135 desc->bpf.effective[atype], 1136 lockdep_is_held(&cgroup_mutex)); 1137 1138 /* Remove the program from the array */ 1139 WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), 1140 "Failed to purge a prog from array at index %d", pos); 1141 } 1142 } 1143 1144 /** 1145 * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and 1146 * propagate the change to descendants 1147 * @cgrp: The cgroup which descendants to traverse 1148 * @prog: A program to detach or NULL 1149 * @link: A link to detach or NULL 1150 * @type: Type of detach operation 1151 * @revision: bpf_prog_list revision 1152 * 1153 * At most one of @prog or @link can be non-NULL. 1154 * Must be called with cgroup_mutex held. 1155 */ 1156 static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 1157 struct bpf_cgroup_link *link, enum bpf_attach_type type, 1158 u64 revision) 1159 { 1160 enum cgroup_bpf_attach_type atype; 1161 struct bpf_prog *old_prog; 1162 struct bpf_prog_list *pl; 1163 struct hlist_head *progs; 1164 u32 attach_btf_id = 0; 1165 u32 flags; 1166 1167 if (prog) 1168 attach_btf_id = prog->aux->attach_btf_id; 1169 if (link) 1170 attach_btf_id = link->link.prog->aux->attach_btf_id; 1171 1172 atype = bpf_cgroup_atype_find(type, attach_btf_id); 1173 if (atype < 0) 1174 return -EINVAL; 1175 1176 if (revision && revision != cgrp->bpf.revisions[atype]) 1177 return -ESTALE; 1178 1179 progs = &cgrp->bpf.progs[atype]; 1180 flags = cgrp->bpf.flags[atype]; 1181 1182 if (prog && link) 1183 /* only one of prog or link can be specified */ 1184 return -EINVAL; 1185 1186 pl = find_detach_entry(progs, prog, link, flags & BPF_F_ALLOW_MULTI); 1187 if (IS_ERR(pl)) 1188 return PTR_ERR(pl); 1189 1190 /* mark it deleted, so it's ignored while recomputing effective */ 1191 old_prog = pl->prog; 1192 pl->prog = NULL; 1193 pl->link = NULL; 1194 1195 if (update_effective_progs(cgrp, atype)) { 1196 /* if update effective array failed replace the prog with a dummy prog*/ 1197 pl->prog = old_prog; 1198 pl->link = link; 1199 purge_effective_progs(cgrp, old_prog, link, atype); 1200 } 1201 1202 /* now can actually delete it from this cgroup list */ 1203 hlist_del(&pl->node); 1204 cgrp->bpf.revisions[atype] += 1; 1205 1206 kfree(pl); 1207 if (hlist_empty(progs)) 1208 /* last program was detached, reset flags to zero */ 1209 cgrp->bpf.flags[atype] = 0; 1210 if (old_prog) { 1211 if (type == BPF_LSM_CGROUP) 1212 bpf_trampoline_unlink_cgroup_shim(old_prog); 1213 bpf_prog_put(old_prog); 1214 } 1215 static_branch_dec(&cgroup_bpf_enabled_key[atype]); 1216 return 0; 1217 } 1218 1219 static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, 1220 enum bpf_attach_type type, u64 revision) 1221 { 1222 int ret; 1223 1224 cgroup_lock(); 1225 ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision); 1226 cgroup_unlock(); 1227 return ret; 1228 } 1229 1230 /* Must be called with cgroup_mutex held to avoid races. */ 1231 static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, 1232 union bpf_attr __user *uattr, u32 uattr_size) 1233 { 1234 __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags); 1235 bool effective_query = attr->query.query_flags & BPF_F_QUERY_EFFECTIVE; 1236 __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); 1237 enum bpf_attach_type type = attr->query.attach_type; 1238 enum cgroup_bpf_attach_type from_atype, to_atype; 1239 enum cgroup_bpf_attach_type atype; 1240 struct bpf_prog_array *effective; 1241 int cnt, ret = 0, i; 1242 int total_cnt = 0; 1243 u64 revision = 0; 1244 u32 flags; 1245 1246 if (effective_query && prog_attach_flags) 1247 return -EINVAL; 1248 1249 if (type == BPF_LSM_CGROUP) { 1250 if (!effective_query && attr->query.prog_cnt && 1251 prog_ids && !prog_attach_flags) 1252 return -EINVAL; 1253 1254 from_atype = CGROUP_LSM_START; 1255 to_atype = CGROUP_LSM_END; 1256 flags = 0; 1257 } else { 1258 from_atype = to_cgroup_bpf_attach_type(type); 1259 if (from_atype < 0) 1260 return -EINVAL; 1261 to_atype = from_atype; 1262 flags = cgrp->bpf.flags[from_atype]; 1263 } 1264 1265 for (atype = from_atype; atype <= to_atype; atype++) { 1266 if (effective_query) { 1267 effective = rcu_dereference_protected(cgrp->bpf.effective[atype], 1268 lockdep_is_held(&cgroup_mutex)); 1269 total_cnt += bpf_prog_array_length(effective); 1270 } else { 1271 total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL); 1272 } 1273 } 1274 1275 /* always output uattr->query.attach_flags as 0 during effective query */ 1276 flags = effective_query ? 0 : flags; 1277 if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) 1278 return -EFAULT; 1279 if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt))) 1280 return -EFAULT; 1281 if (!effective_query && from_atype == to_atype) 1282 revision = cgrp->bpf.revisions[from_atype]; 1283 if (uattr_size >= offsetofend(union bpf_attr, query.revision) && 1284 copy_to_user(&uattr->query.revision, &revision, sizeof(revision))) 1285 return -EFAULT; 1286 if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt) 1287 /* return early if user requested only program count + flags */ 1288 return 0; 1289 1290 if (attr->query.prog_cnt < total_cnt) { 1291 total_cnt = attr->query.prog_cnt; 1292 ret = -ENOSPC; 1293 } 1294 1295 for (atype = from_atype; atype <= to_atype && total_cnt; atype++) { 1296 if (effective_query) { 1297 effective = rcu_dereference_protected(cgrp->bpf.effective[atype], 1298 lockdep_is_held(&cgroup_mutex)); 1299 cnt = min_t(int, bpf_prog_array_length(effective), total_cnt); 1300 ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt); 1301 } else { 1302 struct hlist_head *progs; 1303 struct bpf_prog_list *pl; 1304 struct bpf_prog *prog; 1305 u32 id; 1306 1307 progs = &cgrp->bpf.progs[atype]; 1308 cnt = min_t(int, prog_list_length(progs, NULL), total_cnt); 1309 i = 0; 1310 hlist_for_each_entry(pl, progs, node) { 1311 prog = prog_list_prog(pl); 1312 id = prog->aux->id; 1313 if (copy_to_user(prog_ids + i, &id, sizeof(id))) 1314 return -EFAULT; 1315 if (++i == cnt) 1316 break; 1317 } 1318 1319 if (prog_attach_flags) { 1320 flags = cgrp->bpf.flags[atype]; 1321 1322 for (i = 0; i < cnt; i++) 1323 if (copy_to_user(prog_attach_flags + i, 1324 &flags, sizeof(flags))) 1325 return -EFAULT; 1326 prog_attach_flags += cnt; 1327 } 1328 } 1329 1330 prog_ids += cnt; 1331 total_cnt -= cnt; 1332 } 1333 return ret; 1334 } 1335 1336 static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, 1337 union bpf_attr __user *uattr, u32 uattr_size) 1338 { 1339 int ret; 1340 1341 cgroup_lock(); 1342 ret = __cgroup_bpf_query(cgrp, attr, uattr, uattr_size); 1343 cgroup_unlock(); 1344 return ret; 1345 } 1346 1347 int cgroup_bpf_prog_attach(const union bpf_attr *attr, 1348 enum bpf_prog_type ptype, struct bpf_prog *prog) 1349 { 1350 struct bpf_prog *replace_prog = NULL; 1351 struct cgroup *cgrp; 1352 int ret; 1353 1354 cgrp = cgroup_get_from_fd(attr->target_fd); 1355 if (IS_ERR(cgrp)) 1356 return PTR_ERR(cgrp); 1357 1358 if ((attr->attach_flags & BPF_F_ALLOW_MULTI) && 1359 (attr->attach_flags & BPF_F_REPLACE)) { 1360 replace_prog = bpf_prog_get_type(attr->replace_bpf_fd, ptype); 1361 if (IS_ERR(replace_prog)) { 1362 cgroup_put(cgrp); 1363 return PTR_ERR(replace_prog); 1364 } 1365 } 1366 1367 ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL, 1368 attr->attach_type, attr->attach_flags, 1369 attr->relative_fd, attr->expected_revision); 1370 1371 if (replace_prog) 1372 bpf_prog_put(replace_prog); 1373 cgroup_put(cgrp); 1374 return ret; 1375 } 1376 1377 int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) 1378 { 1379 struct bpf_prog *prog; 1380 struct cgroup *cgrp; 1381 int ret; 1382 1383 cgrp = cgroup_get_from_fd(attr->target_fd); 1384 if (IS_ERR(cgrp)) 1385 return PTR_ERR(cgrp); 1386 1387 prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); 1388 if (IS_ERR(prog)) 1389 prog = NULL; 1390 1391 ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision); 1392 if (prog) 1393 bpf_prog_put(prog); 1394 1395 cgroup_put(cgrp); 1396 return ret; 1397 } 1398 1399 static void bpf_cgroup_link_release(struct bpf_link *link) 1400 { 1401 struct bpf_cgroup_link *cg_link = 1402 container_of(link, struct bpf_cgroup_link, link); 1403 struct cgroup *cg; 1404 1405 /* link might have been auto-detached by dying cgroup already, 1406 * in that case our work is done here 1407 */ 1408 if (!cg_link->cgroup) 1409 return; 1410 1411 cgroup_lock(); 1412 1413 /* re-check cgroup under lock again */ 1414 if (!cg_link->cgroup) { 1415 cgroup_unlock(); 1416 return; 1417 } 1418 1419 WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, 1420 link->attach_type, 0)); 1421 if (link->attach_type == BPF_LSM_CGROUP) 1422 bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog); 1423 1424 cg = cg_link->cgroup; 1425 cg_link->cgroup = NULL; 1426 1427 cgroup_unlock(); 1428 1429 cgroup_put(cg); 1430 } 1431 1432 static void bpf_cgroup_link_dealloc(struct bpf_link *link) 1433 { 1434 struct bpf_cgroup_link *cg_link = 1435 container_of(link, struct bpf_cgroup_link, link); 1436 1437 kfree(cg_link); 1438 } 1439 1440 static int bpf_cgroup_link_detach(struct bpf_link *link) 1441 { 1442 bpf_cgroup_link_release(link); 1443 1444 return 0; 1445 } 1446 1447 static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, 1448 struct seq_file *seq) 1449 { 1450 struct bpf_cgroup_link *cg_link = 1451 container_of(link, struct bpf_cgroup_link, link); 1452 u64 cg_id = 0; 1453 1454 cgroup_lock(); 1455 if (cg_link->cgroup) 1456 cg_id = cgroup_id(cg_link->cgroup); 1457 cgroup_unlock(); 1458 1459 seq_printf(seq, 1460 "cgroup_id:\t%llu\n" 1461 "attach_type:\t%d\n", 1462 cg_id, 1463 link->attach_type); 1464 } 1465 1466 static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, 1467 struct bpf_link_info *info) 1468 { 1469 struct bpf_cgroup_link *cg_link = 1470 container_of(link, struct bpf_cgroup_link, link); 1471 u64 cg_id = 0; 1472 1473 cgroup_lock(); 1474 if (cg_link->cgroup) 1475 cg_id = cgroup_id(cg_link->cgroup); 1476 cgroup_unlock(); 1477 1478 info->cgroup.cgroup_id = cg_id; 1479 info->cgroup.attach_type = link->attach_type; 1480 return 0; 1481 } 1482 1483 static const struct bpf_link_ops bpf_cgroup_link_lops = { 1484 .release = bpf_cgroup_link_release, 1485 .dealloc = bpf_cgroup_link_dealloc, 1486 .detach = bpf_cgroup_link_detach, 1487 .update_prog = cgroup_bpf_replace, 1488 .show_fdinfo = bpf_cgroup_link_show_fdinfo, 1489 .fill_link_info = bpf_cgroup_link_fill_link_info, 1490 }; 1491 1492 #define BPF_F_LINK_ATTACH_MASK \ 1493 (BPF_F_ID | \ 1494 BPF_F_BEFORE | \ 1495 BPF_F_AFTER | \ 1496 BPF_F_PREORDER | \ 1497 BPF_F_LINK) 1498 1499 int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) 1500 { 1501 struct bpf_link_primer link_primer; 1502 struct bpf_cgroup_link *link; 1503 struct cgroup *cgrp; 1504 int err; 1505 1506 if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK)) 1507 return -EINVAL; 1508 1509 cgrp = cgroup_get_from_fd(attr->link_create.target_fd); 1510 if (IS_ERR(cgrp)) 1511 return PTR_ERR(cgrp); 1512 1513 link = kzalloc_obj(*link, GFP_USER); 1514 if (!link) { 1515 err = -ENOMEM; 1516 goto out_put_cgroup; 1517 } 1518 bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP, &bpf_cgroup_link_lops, 1519 prog, attr->link_create.attach_type); 1520 link->cgroup = cgrp; 1521 1522 err = bpf_link_prime(&link->link, &link_primer); 1523 if (err) { 1524 kfree(link); 1525 goto out_put_cgroup; 1526 } 1527 1528 err = cgroup_bpf_attach(cgrp, NULL, NULL, link, 1529 link->link.attach_type, BPF_F_ALLOW_MULTI | attr->link_create.flags, 1530 attr->link_create.cgroup.relative_fd, 1531 attr->link_create.cgroup.expected_revision); 1532 if (err) { 1533 bpf_link_cleanup(&link_primer); 1534 goto out_put_cgroup; 1535 } 1536 1537 return bpf_link_settle(&link_primer); 1538 1539 out_put_cgroup: 1540 cgroup_put(cgrp); 1541 return err; 1542 } 1543 1544 int cgroup_bpf_prog_query(const union bpf_attr *attr, 1545 union bpf_attr __user *uattr, u32 uattr_size) 1546 { 1547 struct cgroup *cgrp; 1548 int ret; 1549 1550 cgrp = cgroup_get_from_fd(attr->query.target_fd); 1551 if (IS_ERR(cgrp)) 1552 return PTR_ERR(cgrp); 1553 1554 ret = cgroup_bpf_query(cgrp, attr, uattr, uattr_size); 1555 1556 cgroup_put(cgrp); 1557 return ret; 1558 } 1559 1560 /** 1561 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering 1562 * @sk: The socket sending or receiving traffic 1563 * @skb: The skb that is being sent or received 1564 * @atype: The type of program to be executed 1565 * 1566 * If no socket is passed, or the socket is not of type INET or INET6, 1567 * this function does nothing and returns 0. 1568 * 1569 * The program type passed in via @type must be suitable for network 1570 * filtering. No further check is performed to assert that. 1571 * 1572 * For egress packets, this function can return: 1573 * NET_XMIT_SUCCESS (0) - continue with packet output 1574 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr 1575 * NET_XMIT_CN (2) - continue with packet output and notify TCP 1576 * to call cwr 1577 * -err - drop packet 1578 * 1579 * For ingress packets, this function will return -EPERM if any 1580 * attached program was found and if it returned != 1 during execution. 1581 * Otherwise 0 is returned. 1582 */ 1583 int __cgroup_bpf_run_filter_skb(struct sock *sk, 1584 struct sk_buff *skb, 1585 enum cgroup_bpf_attach_type atype) 1586 { 1587 unsigned int offset = -skb_network_offset(skb); 1588 struct sock *save_sk; 1589 void *saved_data_end; 1590 struct cgroup *cgrp; 1591 int ret; 1592 1593 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 1594 return 0; 1595 1596 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1597 save_sk = skb->sk; 1598 skb->sk = sk; 1599 __skb_push(skb, offset); 1600 1601 /* compute pointers for the bpf prog */ 1602 bpf_compute_and_save_data_end(skb, &saved_data_end); 1603 1604 if (atype == CGROUP_INET_EGRESS) { 1605 u32 flags = 0; 1606 bool cn; 1607 1608 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, skb, 1609 __bpf_prog_run_save_cb, 0, &flags); 1610 1611 /* Return values of CGROUP EGRESS BPF programs are: 1612 * 0: drop packet 1613 * 1: keep packet 1614 * 2: drop packet and cn 1615 * 3: keep packet and cn 1616 * 1617 * The returned value is then converted to one of the NET_XMIT 1618 * or an error code that is then interpreted as drop packet 1619 * (and no cn): 1620 * 0: NET_XMIT_SUCCESS skb should be transmitted 1621 * 1: NET_XMIT_DROP skb should be dropped and cn 1622 * 2: NET_XMIT_CN skb should be transmitted and cn 1623 * 3: -err skb should be dropped 1624 */ 1625 1626 cn = flags & BPF_RET_SET_CN; 1627 if (ret && !IS_ERR_VALUE((long)ret)) 1628 ret = -EFAULT; 1629 if (!ret) 1630 ret = (cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); 1631 else 1632 ret = (cn ? NET_XMIT_DROP : ret); 1633 } else { 1634 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, 1635 skb, __bpf_prog_run_save_cb, 0, 1636 NULL); 1637 if (ret && !IS_ERR_VALUE((long)ret)) 1638 ret = -EFAULT; 1639 } 1640 bpf_restore_data_end(skb, saved_data_end); 1641 __skb_pull(skb, offset); 1642 skb->sk = save_sk; 1643 1644 return ret; 1645 } 1646 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 1647 1648 /** 1649 * __cgroup_bpf_run_filter_sk() - Run a program on a sock 1650 * @sk: sock structure to manipulate 1651 * @atype: The type of program to be executed 1652 * 1653 * socket is passed is expected to be of type INET or INET6. 1654 * 1655 * The program type passed in via @type must be suitable for sock 1656 * filtering. No further check is performed to assert that. 1657 * 1658 * This function will return %-EPERM if any if an attached program was found 1659 * and if it returned != 1 during execution. In all other cases, 0 is returned. 1660 */ 1661 int __cgroup_bpf_run_filter_sk(struct sock *sk, 1662 enum cgroup_bpf_attach_type atype) 1663 { 1664 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1665 1666 return bpf_prog_run_array_cg(&cgrp->bpf, atype, sk, bpf_prog_run, 0, 1667 NULL); 1668 } 1669 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 1670 1671 /** 1672 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and 1673 * provided by user sockaddr 1674 * @sk: sock struct that will use sockaddr 1675 * @uaddr: sockaddr struct provided by user 1676 * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is 1677 * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX 1678 * uaddr. 1679 * @atype: The type of program to be executed 1680 * @t_ctx: Pointer to attach type specific context 1681 * @flags: Pointer to u32 which contains higher bits of BPF program 1682 * return value (OR'ed together). 1683 * 1684 * socket is expected to be of type INET, INET6 or UNIX. 1685 * 1686 * This function will return %-EPERM if an attached program is found and 1687 * returned value != 1 during execution. In all other cases, 0 is returned. 1688 */ 1689 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, 1690 struct sockaddr_unsized *uaddr, 1691 int *uaddrlen, 1692 enum cgroup_bpf_attach_type atype, 1693 void *t_ctx, 1694 u32 *flags) 1695 { 1696 struct bpf_sock_addr_kern ctx = { 1697 .sk = sk, 1698 .uaddr = uaddr, 1699 .t_ctx = t_ctx, 1700 }; 1701 struct sockaddr_storage storage; 1702 struct cgroup *cgrp; 1703 int ret; 1704 1705 if (!sk_is_inet(sk) && !sk_is_unix(sk)) 1706 return 0; 1707 1708 if (!ctx.uaddr) { 1709 memset(&storage, 0, sizeof(storage)); 1710 ctx.uaddr = (struct sockaddr_unsized *)&storage; 1711 ctx.uaddrlen = 0; 1712 } else { 1713 ctx.uaddrlen = *uaddrlen; 1714 } 1715 1716 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1717 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 1718 0, flags); 1719 1720 if (!ret && uaddr) 1721 *uaddrlen = ctx.uaddrlen; 1722 1723 return ret; 1724 } 1725 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); 1726 1727 /** 1728 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 1729 * @sk: socket to get cgroup from 1730 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 1731 * sk with connection information (IP addresses, etc.) May not contain 1732 * cgroup info if it is a req sock. 1733 * @atype: The type of program to be executed 1734 * 1735 * socket passed is expected to be of type INET or INET6. 1736 * 1737 * The program type passed in via @type must be suitable for sock_ops 1738 * filtering. No further check is performed to assert that. 1739 * 1740 * This function will return %-EPERM if any if an attached program was found 1741 * and if it returned != 1 during execution. In all other cases, 0 is returned. 1742 */ 1743 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, 1744 struct bpf_sock_ops_kern *sock_ops, 1745 enum cgroup_bpf_attach_type atype) 1746 { 1747 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 1748 1749 return bpf_prog_run_array_cg(&cgrp->bpf, atype, sock_ops, bpf_prog_run, 1750 0, NULL); 1751 } 1752 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); 1753 1754 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, 1755 short access, enum cgroup_bpf_attach_type atype) 1756 { 1757 struct cgroup *cgrp; 1758 struct bpf_cgroup_dev_ctx ctx = { 1759 .access_type = (access << 16) | dev_type, 1760 .major = major, 1761 .minor = minor, 1762 }; 1763 int ret; 1764 1765 rcu_read_lock(); 1766 cgrp = task_dfl_cgroup(current); 1767 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, 1768 NULL); 1769 rcu_read_unlock(); 1770 1771 return ret; 1772 } 1773 1774 BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) 1775 { 1776 /* flags argument is not used now, 1777 * but provides an ability to extend the API. 1778 * verifier checks that its value is correct. 1779 */ 1780 enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); 1781 struct bpf_cgroup_storage *storage; 1782 struct bpf_cg_run_ctx *ctx; 1783 void *ptr; 1784 1785 /* get current cgroup storage from BPF run context */ 1786 ctx = container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); 1787 storage = ctx->prog_item->cgroup_storage[stype]; 1788 1789 if (stype == BPF_CGROUP_STORAGE_SHARED) 1790 ptr = &READ_ONCE(storage->buf)->data[0]; 1791 else 1792 ptr = this_cpu_ptr(storage->percpu_buf); 1793 1794 return (unsigned long)ptr; 1795 } 1796 1797 const struct bpf_func_proto bpf_get_local_storage_proto = { 1798 .func = bpf_get_local_storage, 1799 .gpl_only = false, 1800 .ret_type = RET_PTR_TO_MAP_VALUE, 1801 .arg1_type = ARG_CONST_MAP_PTR, 1802 .arg2_type = ARG_ANYTHING, 1803 }; 1804 1805 BPF_CALL_0(bpf_get_retval) 1806 { 1807 struct bpf_cg_run_ctx *ctx = 1808 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); 1809 1810 return ctx->retval; 1811 } 1812 1813 const struct bpf_func_proto bpf_get_retval_proto = { 1814 .func = bpf_get_retval, 1815 .gpl_only = false, 1816 .ret_type = RET_INTEGER, 1817 }; 1818 1819 BPF_CALL_1(bpf_set_retval, int, retval) 1820 { 1821 struct bpf_cg_run_ctx *ctx = 1822 container_of(current->bpf_ctx, struct bpf_cg_run_ctx, run_ctx); 1823 1824 ctx->retval = retval; 1825 return 0; 1826 } 1827 1828 const struct bpf_func_proto bpf_set_retval_proto = { 1829 .func = bpf_set_retval, 1830 .gpl_only = false, 1831 .ret_type = RET_INTEGER, 1832 .arg1_type = ARG_ANYTHING, 1833 }; 1834 1835 static const struct bpf_func_proto * 1836 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 1837 { 1838 const struct bpf_func_proto *func_proto; 1839 1840 func_proto = cgroup_common_func_proto(func_id, prog); 1841 if (func_proto) 1842 return func_proto; 1843 1844 switch (func_id) { 1845 case BPF_FUNC_perf_event_output: 1846 return &bpf_event_output_data_proto; 1847 default: 1848 return bpf_base_func_proto(func_id, prog); 1849 } 1850 } 1851 1852 static bool cgroup_dev_is_valid_access(int off, int size, 1853 enum bpf_access_type type, 1854 const struct bpf_prog *prog, 1855 struct bpf_insn_access_aux *info) 1856 { 1857 const int size_default = sizeof(__u32); 1858 1859 if (type == BPF_WRITE) 1860 return false; 1861 1862 if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) 1863 return false; 1864 /* The verifier guarantees that size > 0. */ 1865 if (off % size != 0) 1866 return false; 1867 1868 switch (off) { 1869 case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): 1870 bpf_ctx_record_field_size(info, size_default); 1871 if (!bpf_ctx_narrow_access_ok(off, size, size_default)) 1872 return false; 1873 break; 1874 default: 1875 if (size != size_default) 1876 return false; 1877 } 1878 1879 return true; 1880 } 1881 1882 const struct bpf_prog_ops cg_dev_prog_ops = { 1883 }; 1884 1885 const struct bpf_verifier_ops cg_dev_verifier_ops = { 1886 .get_func_proto = cgroup_dev_func_proto, 1887 .is_valid_access = cgroup_dev_is_valid_access, 1888 }; 1889 1890 /** 1891 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl 1892 * 1893 * @head: sysctl table header 1894 * @table: sysctl table 1895 * @write: sysctl is being read (= 0) or written (= 1) 1896 * @buf: pointer to buffer (in and out) 1897 * @pcount: value-result argument: value is size of buffer pointed to by @buf, 1898 * result is size of @new_buf if program set new value, initial value 1899 * otherwise 1900 * @ppos: value-result argument: value is position at which read from or write 1901 * to sysctl is happening, result is new position if program overrode it, 1902 * initial value otherwise 1903 * @atype: type of program to be executed 1904 * 1905 * Program is run when sysctl is being accessed, either read or written, and 1906 * can allow or deny such access. 1907 * 1908 * This function will return %-EPERM if an attached program is found and 1909 * returned value != 1 during execution. In all other cases 0 is returned. 1910 */ 1911 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, 1912 const struct ctl_table *table, int write, 1913 char **buf, size_t *pcount, loff_t *ppos, 1914 enum cgroup_bpf_attach_type atype) 1915 { 1916 struct bpf_sysctl_kern ctx = { 1917 .head = head, 1918 .table = table, 1919 .write = write, 1920 .ppos = ppos, 1921 .cur_val = NULL, 1922 .cur_len = PAGE_SIZE, 1923 .new_val = NULL, 1924 .new_len = 0, 1925 .new_updated = 0, 1926 }; 1927 struct cgroup *cgrp; 1928 loff_t pos = 0; 1929 int ret; 1930 1931 ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); 1932 if (!ctx.cur_val || 1933 table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { 1934 /* Let BPF program decide how to proceed. */ 1935 ctx.cur_len = 0; 1936 } 1937 1938 if (write && *buf && *pcount) { 1939 /* BPF program should be able to override new value with a 1940 * buffer bigger than provided by user. 1941 */ 1942 ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); 1943 ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); 1944 if (ctx.new_val) { 1945 memcpy(ctx.new_val, *buf, ctx.new_len); 1946 } else { 1947 /* Let BPF program decide how to proceed. */ 1948 ctx.new_len = 0; 1949 } 1950 } 1951 1952 rcu_read_lock(); 1953 cgrp = task_dfl_cgroup(current); 1954 ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, 0, 1955 NULL); 1956 rcu_read_unlock(); 1957 1958 kfree(ctx.cur_val); 1959 1960 if (!ret && ctx.new_updated) { 1961 kvfree(*buf); 1962 *buf = ctx.new_val; 1963 *pcount = ctx.new_len; 1964 } else { 1965 kfree(ctx.new_val); 1966 } 1967 1968 return ret; 1969 } 1970 1971 #ifdef CONFIG_NET 1972 static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, 1973 struct bpf_sockopt_buf *buf) 1974 { 1975 if (unlikely(max_optlen < 0)) 1976 return -EINVAL; 1977 1978 if (unlikely(max_optlen > PAGE_SIZE)) { 1979 /* We don't expose optvals that are greater than PAGE_SIZE 1980 * to the BPF program. 1981 */ 1982 max_optlen = PAGE_SIZE; 1983 } 1984 1985 if (max_optlen <= sizeof(buf->data)) { 1986 /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE 1987 * bytes avoid the cost of kzalloc. 1988 */ 1989 ctx->optval = buf->data; 1990 ctx->optval_end = ctx->optval + max_optlen; 1991 return max_optlen; 1992 } 1993 1994 ctx->optval = kzalloc(max_optlen, GFP_USER); 1995 if (!ctx->optval) 1996 return -ENOMEM; 1997 1998 ctx->optval_end = ctx->optval + max_optlen; 1999 2000 return max_optlen; 2001 } 2002 2003 static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, 2004 struct bpf_sockopt_buf *buf) 2005 { 2006 if (ctx->optval == buf->data) 2007 return; 2008 kfree(ctx->optval); 2009 } 2010 2011 static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, 2012 struct bpf_sockopt_buf *buf) 2013 { 2014 return ctx->optval != buf->data; 2015 } 2016 2017 int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, 2018 int *optname, sockptr_t optval, 2019 int *optlen, char **kernel_optval) 2020 { 2021 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 2022 struct bpf_sockopt_buf buf = {}; 2023 struct bpf_sockopt_kern ctx = { 2024 .sk = sk, 2025 .level = *level, 2026 .optname = *optname, 2027 }; 2028 int ret, max_optlen; 2029 2030 /* Allocate a bit more than the initial user buffer for 2031 * BPF program. The canonical use case is overriding 2032 * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). 2033 */ 2034 max_optlen = max_t(int, 16, *optlen); 2035 max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); 2036 if (max_optlen < 0) 2037 return max_optlen; 2038 2039 ctx.optlen = *optlen; 2040 2041 if (copy_from_sockptr(ctx.optval, optval, 2042 min(*optlen, max_optlen))) { 2043 ret = -EFAULT; 2044 goto out; 2045 } 2046 2047 lock_sock(sk); 2048 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_SETSOCKOPT, 2049 &ctx, bpf_prog_run, 0, NULL); 2050 release_sock(sk); 2051 2052 if (ret) 2053 goto out; 2054 2055 if (ctx.optlen == -1) { 2056 /* optlen set to -1, bypass kernel */ 2057 ret = 1; 2058 } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { 2059 /* optlen is out of bounds */ 2060 if (*optlen > PAGE_SIZE && ctx.optlen >= 0) { 2061 pr_info_once("bpf setsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", 2062 ctx.optlen, max_optlen); 2063 ret = 0; 2064 goto out; 2065 } 2066 ret = -EFAULT; 2067 } else { 2068 /* optlen within bounds, run kernel handler */ 2069 ret = 0; 2070 2071 /* export any potential modifications */ 2072 *level = ctx.level; 2073 *optname = ctx.optname; 2074 2075 /* optlen == 0 from BPF indicates that we should 2076 * use original userspace data. 2077 */ 2078 if (ctx.optlen != 0) { 2079 *optlen = ctx.optlen; 2080 /* We've used bpf_sockopt_kern->buf as an intermediary 2081 * storage, but the BPF program indicates that we need 2082 * to pass this data to the kernel setsockopt handler. 2083 * No way to export on-stack buf, have to allocate a 2084 * new buffer. 2085 */ 2086 if (!sockopt_buf_allocated(&ctx, &buf)) { 2087 void *p = kmalloc(ctx.optlen, GFP_USER); 2088 2089 if (!p) { 2090 ret = -ENOMEM; 2091 goto out; 2092 } 2093 memcpy(p, ctx.optval, ctx.optlen); 2094 *kernel_optval = p; 2095 } else { 2096 *kernel_optval = ctx.optval; 2097 } 2098 /* export and don't free sockopt buf */ 2099 return 0; 2100 } 2101 } 2102 2103 out: 2104 sockopt_free_buf(&ctx, &buf); 2105 return ret; 2106 } 2107 2108 int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, 2109 int optname, sockptr_t optval, 2110 sockptr_t optlen, int max_optlen, 2111 int retval) 2112 { 2113 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 2114 struct bpf_sockopt_buf buf = {}; 2115 struct bpf_sockopt_kern ctx = { 2116 .sk = sk, 2117 .level = level, 2118 .optname = optname, 2119 .current_task = current, 2120 }; 2121 int orig_optlen; 2122 int ret; 2123 2124 orig_optlen = max_optlen; 2125 ctx.optlen = max_optlen; 2126 max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); 2127 if (max_optlen < 0) 2128 return max_optlen; 2129 2130 if (!retval) { 2131 /* If kernel getsockopt finished successfully, 2132 * copy whatever was returned to the user back 2133 * into our temporary buffer. Set optlen to the 2134 * one that kernel returned as well to let 2135 * BPF programs inspect the value. 2136 */ 2137 if (copy_from_sockptr(&ctx.optlen, optlen, 2138 sizeof(ctx.optlen))) { 2139 ret = -EFAULT; 2140 goto out; 2141 } 2142 2143 if (ctx.optlen < 0) { 2144 ret = -EFAULT; 2145 goto out; 2146 } 2147 orig_optlen = ctx.optlen; 2148 2149 if (copy_from_sockptr(ctx.optval, optval, 2150 min(ctx.optlen, max_optlen))) { 2151 ret = -EFAULT; 2152 goto out; 2153 } 2154 } 2155 2156 lock_sock(sk); 2157 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, 2158 &ctx, bpf_prog_run, retval, NULL); 2159 release_sock(sk); 2160 2161 if (ret < 0) 2162 goto out; 2163 2164 if (!sockptr_is_null(optval) && 2165 (ctx.optlen > max_optlen || ctx.optlen < 0)) { 2166 if (orig_optlen > PAGE_SIZE && ctx.optlen >= 0) { 2167 pr_info_once("bpf getsockopt: ignoring program buffer with optlen=%d (max_optlen=%d)\n", 2168 ctx.optlen, max_optlen); 2169 ret = retval; 2170 goto out; 2171 } 2172 ret = -EFAULT; 2173 goto out; 2174 } 2175 2176 if (ctx.optlen != 0) { 2177 if (!sockptr_is_null(optval) && 2178 copy_to_sockptr(optval, ctx.optval, ctx.optlen)) { 2179 ret = -EFAULT; 2180 goto out; 2181 } 2182 if (copy_to_sockptr(optlen, &ctx.optlen, sizeof(ctx.optlen))) { 2183 ret = -EFAULT; 2184 goto out; 2185 } 2186 } 2187 2188 out: 2189 sockopt_free_buf(&ctx, &buf); 2190 return ret; 2191 } 2192 2193 int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, 2194 int optname, void *optval, 2195 int *optlen, int retval) 2196 { 2197 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 2198 struct bpf_sockopt_kern ctx = { 2199 .sk = sk, 2200 .level = level, 2201 .optname = optname, 2202 .optlen = *optlen, 2203 .optval = optval, 2204 .optval_end = optval + *optlen, 2205 .current_task = current, 2206 }; 2207 int ret; 2208 2209 /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy 2210 * user data back into BPF buffer when reval != 0. This is 2211 * done as an optimization to avoid extra copy, assuming 2212 * kernel won't populate the data in case of an error. 2213 * Here we always pass the data and memset() should 2214 * be called if that data shouldn't be "exported". 2215 */ 2216 2217 ret = bpf_prog_run_array_cg(&cgrp->bpf, CGROUP_GETSOCKOPT, 2218 &ctx, bpf_prog_run, retval, NULL); 2219 if (ret < 0) 2220 return ret; 2221 2222 if (ctx.optlen > *optlen) 2223 return -EFAULT; 2224 2225 /* BPF programs can shrink the buffer, export the modifications. 2226 */ 2227 if (ctx.optlen != 0) 2228 *optlen = ctx.optlen; 2229 2230 return ret; 2231 } 2232 #endif 2233 2234 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, 2235 size_t *lenp) 2236 { 2237 ssize_t tmp_ret = 0, ret; 2238 2239 if (dir->header.parent) { 2240 tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); 2241 if (tmp_ret < 0) 2242 return tmp_ret; 2243 } 2244 2245 ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); 2246 if (ret < 0) 2247 return ret; 2248 *bufp += ret; 2249 *lenp -= ret; 2250 ret += tmp_ret; 2251 2252 /* Avoid leading slash. */ 2253 if (!ret) 2254 return ret; 2255 2256 tmp_ret = strscpy(*bufp, "/", *lenp); 2257 if (tmp_ret < 0) 2258 return tmp_ret; 2259 *bufp += tmp_ret; 2260 *lenp -= tmp_ret; 2261 2262 return ret + tmp_ret; 2263 } 2264 2265 BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, 2266 size_t, buf_len, u64, flags) 2267 { 2268 ssize_t tmp_ret = 0, ret; 2269 2270 if (!buf) 2271 return -EINVAL; 2272 2273 if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { 2274 if (!ctx->head) 2275 return -EINVAL; 2276 tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); 2277 if (tmp_ret < 0) 2278 return tmp_ret; 2279 } 2280 2281 ret = strscpy(buf, ctx->table->procname, buf_len); 2282 2283 return ret < 0 ? ret : tmp_ret + ret; 2284 } 2285 2286 static const struct bpf_func_proto bpf_sysctl_get_name_proto = { 2287 .func = bpf_sysctl_get_name, 2288 .gpl_only = false, 2289 .ret_type = RET_INTEGER, 2290 .arg1_type = ARG_PTR_TO_CTX, 2291 .arg2_type = ARG_PTR_TO_MEM | MEM_WRITE, 2292 .arg3_type = ARG_CONST_SIZE, 2293 .arg4_type = ARG_ANYTHING, 2294 }; 2295 2296 static int copy_sysctl_value(char *dst, size_t dst_len, char *src, 2297 size_t src_len) 2298 { 2299 if (!dst) 2300 return -EINVAL; 2301 2302 if (!dst_len) 2303 return -E2BIG; 2304 2305 if (!src || !src_len) { 2306 memset(dst, 0, dst_len); 2307 return -EINVAL; 2308 } 2309 2310 memcpy(dst, src, min(dst_len, src_len)); 2311 2312 if (dst_len > src_len) { 2313 memset(dst + src_len, '\0', dst_len - src_len); 2314 return src_len; 2315 } 2316 2317 dst[dst_len - 1] = '\0'; 2318 2319 return -E2BIG; 2320 } 2321 2322 BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, 2323 char *, buf, size_t, buf_len) 2324 { 2325 return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); 2326 } 2327 2328 static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { 2329 .func = bpf_sysctl_get_current_value, 2330 .gpl_only = false, 2331 .ret_type = RET_INTEGER, 2332 .arg1_type = ARG_PTR_TO_CTX, 2333 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 2334 .arg3_type = ARG_CONST_SIZE, 2335 }; 2336 2337 BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, 2338 size_t, buf_len) 2339 { 2340 if (!ctx->write) { 2341 if (buf && buf_len) 2342 memset(buf, '\0', buf_len); 2343 return -EINVAL; 2344 } 2345 return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); 2346 } 2347 2348 static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { 2349 .func = bpf_sysctl_get_new_value, 2350 .gpl_only = false, 2351 .ret_type = RET_INTEGER, 2352 .arg1_type = ARG_PTR_TO_CTX, 2353 .arg2_type = ARG_PTR_TO_UNINIT_MEM, 2354 .arg3_type = ARG_CONST_SIZE, 2355 }; 2356 2357 BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, 2358 const char *, buf, size_t, buf_len) 2359 { 2360 if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) 2361 return -EINVAL; 2362 2363 if (buf_len > PAGE_SIZE - 1) 2364 return -E2BIG; 2365 2366 memcpy(ctx->new_val, buf, buf_len); 2367 ((char *)ctx->new_val)[buf_len] = '\0'; 2368 ctx->new_len = buf_len; 2369 ctx->new_updated = 1; 2370 2371 return 0; 2372 } 2373 2374 static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { 2375 .func = bpf_sysctl_set_new_value, 2376 .gpl_only = false, 2377 .ret_type = RET_INTEGER, 2378 .arg1_type = ARG_PTR_TO_CTX, 2379 .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, 2380 .arg3_type = ARG_CONST_SIZE, 2381 }; 2382 2383 static const struct bpf_func_proto * 2384 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 2385 { 2386 const struct bpf_func_proto *func_proto; 2387 2388 func_proto = cgroup_common_func_proto(func_id, prog); 2389 if (func_proto) 2390 return func_proto; 2391 2392 switch (func_id) { 2393 case BPF_FUNC_sysctl_get_name: 2394 return &bpf_sysctl_get_name_proto; 2395 case BPF_FUNC_sysctl_get_current_value: 2396 return &bpf_sysctl_get_current_value_proto; 2397 case BPF_FUNC_sysctl_get_new_value: 2398 return &bpf_sysctl_get_new_value_proto; 2399 case BPF_FUNC_sysctl_set_new_value: 2400 return &bpf_sysctl_set_new_value_proto; 2401 case BPF_FUNC_ktime_get_coarse_ns: 2402 return &bpf_ktime_get_coarse_ns_proto; 2403 case BPF_FUNC_perf_event_output: 2404 return &bpf_event_output_data_proto; 2405 default: 2406 return bpf_base_func_proto(func_id, prog); 2407 } 2408 } 2409 2410 static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, 2411 const struct bpf_prog *prog, 2412 struct bpf_insn_access_aux *info) 2413 { 2414 const int size_default = sizeof(__u32); 2415 2416 if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) 2417 return false; 2418 2419 switch (off) { 2420 case bpf_ctx_range(struct bpf_sysctl, write): 2421 if (type != BPF_READ) 2422 return false; 2423 bpf_ctx_record_field_size(info, size_default); 2424 return bpf_ctx_narrow_access_ok(off, size, size_default); 2425 case bpf_ctx_range(struct bpf_sysctl, file_pos): 2426 if (type == BPF_READ) { 2427 bpf_ctx_record_field_size(info, size_default); 2428 return bpf_ctx_narrow_access_ok(off, size, size_default); 2429 } else { 2430 return size == size_default; 2431 } 2432 default: 2433 return false; 2434 } 2435 } 2436 2437 static u32 sysctl_convert_ctx_access(enum bpf_access_type type, 2438 const struct bpf_insn *si, 2439 struct bpf_insn *insn_buf, 2440 struct bpf_prog *prog, u32 *target_size) 2441 { 2442 struct bpf_insn *insn = insn_buf; 2443 u32 read_size; 2444 2445 switch (si->off) { 2446 case offsetof(struct bpf_sysctl, write): 2447 *insn++ = BPF_LDX_MEM( 2448 BPF_SIZE(si->code), si->dst_reg, si->src_reg, 2449 bpf_target_off(struct bpf_sysctl_kern, write, 2450 sizeof_field(struct bpf_sysctl_kern, 2451 write), 2452 target_size)); 2453 break; 2454 case offsetof(struct bpf_sysctl, file_pos): 2455 /* ppos is a pointer so it should be accessed via indirect 2456 * loads and stores. Also for stores additional temporary 2457 * register is used since neither src_reg nor dst_reg can be 2458 * overridden. 2459 */ 2460 if (type == BPF_WRITE) { 2461 int treg = BPF_REG_9; 2462 2463 if (si->src_reg == treg || si->dst_reg == treg) 2464 --treg; 2465 if (si->src_reg == treg || si->dst_reg == treg) 2466 --treg; 2467 *insn++ = BPF_STX_MEM( 2468 BPF_DW, si->dst_reg, treg, 2469 offsetof(struct bpf_sysctl_kern, tmp_reg)); 2470 *insn++ = BPF_LDX_MEM( 2471 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 2472 treg, si->dst_reg, 2473 offsetof(struct bpf_sysctl_kern, ppos)); 2474 *insn++ = BPF_RAW_INSN( 2475 BPF_CLASS(si->code) | BPF_MEM | BPF_SIZEOF(u32), 2476 treg, si->src_reg, 2477 bpf_ctx_narrow_access_offset( 2478 0, sizeof(u32), sizeof(loff_t)), 2479 si->imm); 2480 *insn++ = BPF_LDX_MEM( 2481 BPF_DW, treg, si->dst_reg, 2482 offsetof(struct bpf_sysctl_kern, tmp_reg)); 2483 } else { 2484 *insn++ = BPF_LDX_MEM( 2485 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 2486 si->dst_reg, si->src_reg, 2487 offsetof(struct bpf_sysctl_kern, ppos)); 2488 read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); 2489 *insn++ = BPF_LDX_MEM( 2490 BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 2491 bpf_ctx_narrow_access_offset( 2492 0, read_size, sizeof(loff_t))); 2493 } 2494 *target_size = sizeof(u32); 2495 break; 2496 } 2497 2498 return insn - insn_buf; 2499 } 2500 2501 const struct bpf_verifier_ops cg_sysctl_verifier_ops = { 2502 .get_func_proto = sysctl_func_proto, 2503 .is_valid_access = sysctl_is_valid_access, 2504 .convert_ctx_access = sysctl_convert_ctx_access, 2505 }; 2506 2507 const struct bpf_prog_ops cg_sysctl_prog_ops = { 2508 }; 2509 2510 #ifdef CONFIG_NET 2511 BPF_CALL_1(bpf_get_netns_cookie_sockopt, struct bpf_sockopt_kern *, ctx) 2512 { 2513 const struct net *net = ctx ? sock_net(ctx->sk) : &init_net; 2514 2515 return net->net_cookie; 2516 } 2517 2518 static const struct bpf_func_proto bpf_get_netns_cookie_sockopt_proto = { 2519 .func = bpf_get_netns_cookie_sockopt, 2520 .gpl_only = false, 2521 .ret_type = RET_INTEGER, 2522 .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 2523 }; 2524 #endif 2525 2526 static const struct bpf_func_proto * 2527 cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 2528 { 2529 const struct bpf_func_proto *func_proto; 2530 2531 func_proto = cgroup_common_func_proto(func_id, prog); 2532 if (func_proto) 2533 return func_proto; 2534 2535 switch (func_id) { 2536 #ifdef CONFIG_NET 2537 case BPF_FUNC_get_netns_cookie: 2538 return &bpf_get_netns_cookie_sockopt_proto; 2539 case BPF_FUNC_sk_storage_get: 2540 return &bpf_sk_storage_get_proto; 2541 case BPF_FUNC_sk_storage_delete: 2542 return &bpf_sk_storage_delete_proto; 2543 case BPF_FUNC_setsockopt: 2544 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) 2545 return &bpf_sk_setsockopt_proto; 2546 return NULL; 2547 case BPF_FUNC_getsockopt: 2548 if (prog->expected_attach_type == BPF_CGROUP_SETSOCKOPT) 2549 return &bpf_sk_getsockopt_proto; 2550 return NULL; 2551 #endif 2552 #ifdef CONFIG_INET 2553 case BPF_FUNC_tcp_sock: 2554 return &bpf_tcp_sock_proto; 2555 #endif 2556 case BPF_FUNC_perf_event_output: 2557 return &bpf_event_output_data_proto; 2558 default: 2559 return bpf_base_func_proto(func_id, prog); 2560 } 2561 } 2562 2563 static bool cg_sockopt_is_valid_access(int off, int size, 2564 enum bpf_access_type type, 2565 const struct bpf_prog *prog, 2566 struct bpf_insn_access_aux *info) 2567 { 2568 const int size_default = sizeof(__u32); 2569 2570 if (off < 0 || off >= sizeof(struct bpf_sockopt)) 2571 return false; 2572 2573 if (off % size != 0) 2574 return false; 2575 2576 if (type == BPF_WRITE) { 2577 switch (off) { 2578 case offsetof(struct bpf_sockopt, retval): 2579 if (size != size_default) 2580 return false; 2581 return prog->expected_attach_type == 2582 BPF_CGROUP_GETSOCKOPT; 2583 case offsetof(struct bpf_sockopt, optname): 2584 fallthrough; 2585 case offsetof(struct bpf_sockopt, level): 2586 if (size != size_default) 2587 return false; 2588 return prog->expected_attach_type == 2589 BPF_CGROUP_SETSOCKOPT; 2590 case offsetof(struct bpf_sockopt, optlen): 2591 return size == size_default; 2592 default: 2593 return false; 2594 } 2595 } 2596 2597 switch (off) { 2598 case bpf_ctx_range_ptr(struct bpf_sockopt, sk): 2599 if (size != sizeof(__u64)) 2600 return false; 2601 info->reg_type = PTR_TO_SOCKET; 2602 break; 2603 case bpf_ctx_range_ptr(struct bpf_sockopt, optval): 2604 if (size != sizeof(__u64)) 2605 return false; 2606 info->reg_type = PTR_TO_PACKET; 2607 break; 2608 case bpf_ctx_range_ptr(struct bpf_sockopt, optval_end): 2609 if (size != sizeof(__u64)) 2610 return false; 2611 info->reg_type = PTR_TO_PACKET_END; 2612 break; 2613 case bpf_ctx_range(struct bpf_sockopt, retval): 2614 if (size != size_default) 2615 return false; 2616 return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; 2617 default: 2618 if (size != size_default) 2619 return false; 2620 break; 2621 } 2622 return true; 2623 } 2624 2625 #define CG_SOCKOPT_READ_FIELD(F) \ 2626 BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ 2627 si->dst_reg, si->src_reg, \ 2628 offsetof(struct bpf_sockopt_kern, F)) 2629 2630 #define CG_SOCKOPT_WRITE_FIELD(F) \ 2631 BPF_RAW_INSN((BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F) | \ 2632 BPF_MEM | BPF_CLASS(si->code)), \ 2633 si->dst_reg, si->src_reg, \ 2634 offsetof(struct bpf_sockopt_kern, F), \ 2635 si->imm) 2636 2637 static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, 2638 const struct bpf_insn *si, 2639 struct bpf_insn *insn_buf, 2640 struct bpf_prog *prog, 2641 u32 *target_size) 2642 { 2643 struct bpf_insn *insn = insn_buf; 2644 2645 switch (si->off) { 2646 case offsetof(struct bpf_sockopt, sk): 2647 *insn++ = CG_SOCKOPT_READ_FIELD(sk); 2648 break; 2649 case offsetof(struct bpf_sockopt, level): 2650 if (type == BPF_WRITE) 2651 *insn++ = CG_SOCKOPT_WRITE_FIELD(level); 2652 else 2653 *insn++ = CG_SOCKOPT_READ_FIELD(level); 2654 break; 2655 case offsetof(struct bpf_sockopt, optname): 2656 if (type == BPF_WRITE) 2657 *insn++ = CG_SOCKOPT_WRITE_FIELD(optname); 2658 else 2659 *insn++ = CG_SOCKOPT_READ_FIELD(optname); 2660 break; 2661 case offsetof(struct bpf_sockopt, optlen): 2662 if (type == BPF_WRITE) 2663 *insn++ = CG_SOCKOPT_WRITE_FIELD(optlen); 2664 else 2665 *insn++ = CG_SOCKOPT_READ_FIELD(optlen); 2666 break; 2667 case offsetof(struct bpf_sockopt, retval): 2668 BUILD_BUG_ON(offsetof(struct bpf_cg_run_ctx, run_ctx) != 0); 2669 2670 if (type == BPF_WRITE) { 2671 int treg = BPF_REG_9; 2672 2673 if (si->src_reg == treg || si->dst_reg == treg) 2674 --treg; 2675 if (si->src_reg == treg || si->dst_reg == treg) 2676 --treg; 2677 *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, treg, 2678 offsetof(struct bpf_sockopt_kern, tmp_reg)); 2679 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), 2680 treg, si->dst_reg, 2681 offsetof(struct bpf_sockopt_kern, current_task)); 2682 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), 2683 treg, treg, 2684 offsetof(struct task_struct, bpf_ctx)); 2685 *insn++ = BPF_RAW_INSN(BPF_CLASS(si->code) | BPF_MEM | 2686 BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), 2687 treg, si->src_reg, 2688 offsetof(struct bpf_cg_run_ctx, retval), 2689 si->imm); 2690 *insn++ = BPF_LDX_MEM(BPF_DW, treg, si->dst_reg, 2691 offsetof(struct bpf_sockopt_kern, tmp_reg)); 2692 } else { 2693 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, current_task), 2694 si->dst_reg, si->src_reg, 2695 offsetof(struct bpf_sockopt_kern, current_task)); 2696 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct task_struct, bpf_ctx), 2697 si->dst_reg, si->dst_reg, 2698 offsetof(struct task_struct, bpf_ctx)); 2699 *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_cg_run_ctx, retval), 2700 si->dst_reg, si->dst_reg, 2701 offsetof(struct bpf_cg_run_ctx, retval)); 2702 } 2703 break; 2704 case offsetof(struct bpf_sockopt, optval): 2705 *insn++ = CG_SOCKOPT_READ_FIELD(optval); 2706 break; 2707 case offsetof(struct bpf_sockopt, optval_end): 2708 *insn++ = CG_SOCKOPT_READ_FIELD(optval_end); 2709 break; 2710 } 2711 2712 return insn - insn_buf; 2713 } 2714 2715 static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, 2716 bool direct_write, 2717 const struct bpf_prog *prog) 2718 { 2719 /* Nothing to do for sockopt argument. The data is kzalloc'ated. 2720 */ 2721 return 0; 2722 } 2723 2724 const struct bpf_verifier_ops cg_sockopt_verifier_ops = { 2725 .get_func_proto = cg_sockopt_func_proto, 2726 .is_valid_access = cg_sockopt_is_valid_access, 2727 .convert_ctx_access = cg_sockopt_convert_ctx_access, 2728 .gen_prologue = cg_sockopt_get_prologue, 2729 }; 2730 2731 const struct bpf_prog_ops cg_sockopt_prog_ops = { 2732 }; 2733 2734 /* Common helpers for cgroup hooks. */ 2735 const struct bpf_func_proto * 2736 cgroup_common_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 2737 { 2738 switch (func_id) { 2739 case BPF_FUNC_get_local_storage: 2740 return &bpf_get_local_storage_proto; 2741 case BPF_FUNC_get_retval: 2742 switch (prog->expected_attach_type) { 2743 case BPF_CGROUP_INET_INGRESS: 2744 case BPF_CGROUP_INET_EGRESS: 2745 case BPF_CGROUP_SOCK_OPS: 2746 case BPF_CGROUP_UDP4_RECVMSG: 2747 case BPF_CGROUP_UDP6_RECVMSG: 2748 case BPF_CGROUP_UNIX_RECVMSG: 2749 case BPF_CGROUP_INET4_GETPEERNAME: 2750 case BPF_CGROUP_INET6_GETPEERNAME: 2751 case BPF_CGROUP_UNIX_GETPEERNAME: 2752 case BPF_CGROUP_INET4_GETSOCKNAME: 2753 case BPF_CGROUP_INET6_GETSOCKNAME: 2754 case BPF_CGROUP_UNIX_GETSOCKNAME: 2755 return NULL; 2756 default: 2757 return &bpf_get_retval_proto; 2758 } 2759 case BPF_FUNC_set_retval: 2760 switch (prog->expected_attach_type) { 2761 case BPF_CGROUP_INET_INGRESS: 2762 case BPF_CGROUP_INET_EGRESS: 2763 case BPF_CGROUP_SOCK_OPS: 2764 case BPF_CGROUP_UDP4_RECVMSG: 2765 case BPF_CGROUP_UDP6_RECVMSG: 2766 case BPF_CGROUP_UNIX_RECVMSG: 2767 case BPF_CGROUP_INET4_GETPEERNAME: 2768 case BPF_CGROUP_INET6_GETPEERNAME: 2769 case BPF_CGROUP_UNIX_GETPEERNAME: 2770 case BPF_CGROUP_INET4_GETSOCKNAME: 2771 case BPF_CGROUP_INET6_GETSOCKNAME: 2772 case BPF_CGROUP_UNIX_GETSOCKNAME: 2773 return NULL; 2774 default: 2775 return &bpf_set_retval_proto; 2776 } 2777 default: 2778 return NULL; 2779 } 2780 } 2781