/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License. See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/filter.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>

DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

void cgroup_bpf_offline(struct cgroup *cgrp)
{
	cgroup_get(cgrp);
	percpu_ref_kill(&cgrp->bpf.refcnt);
}

/**
 * cgroup_bpf_release() - put references of all bpf programs and
 *                        release all cgroup bpf data
 * @work: work structure embedded into the cgroup to modify
 */
static void cgroup_bpf_release(struct work_struct *work)
{
	struct cgroup *cgrp = container_of(work, struct cgroup,
					   bpf.release_work);
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_array *old_array;
	unsigned int type;

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
		struct list_head *progs = &cgrp->bpf.progs[type];
		struct bpf_prog_list *pl, *tmp;

		list_for_each_entry_safe(pl, tmp, progs, node) {
			list_del(&pl->node);
			bpf_prog_put(pl->prog);
			for_each_cgroup_storage_type(stype) {
				bpf_cgroup_storage_unlink(pl->storage[stype]);
				bpf_cgroup_storage_free(pl->storage[stype]);
			}
			kfree(pl);
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
		old_array = rcu_dereference_protected(
				cgrp->bpf.effective[type],
				percpu_ref_is_dying(&cgrp->bpf.refcnt));
		bpf_prog_array_free(old_array);
	}

	percpu_ref_exit(&cgrp->bpf.refcnt);
	cgroup_put(cgrp);
}

/**
 * cgroup_bpf_release_fn() - callback used to schedule releasing
 *                           of bpf cgroup data
 * @ref: percpu ref counter structure
 */
static void cgroup_bpf_release_fn(struct percpu_ref *ref)
{
	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);

	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
	queue_work(system_wq, &cgrp->bpf.release_work);
}

/* count number of elements in the list.
 * it's slow but the list cannot be long
 */
static u32 prog_list_length(struct list_head *head)
{
	struct bpf_prog_list *pl;
	u32 cnt = 0;

	list_for_each_entry(pl, head, node) {
		if (!pl->prog)
			continue;
		cnt++;
	}
	return cnt;
}

/* if parent has non-overridable prog attached,
 * disallow attaching new programs to the descendant cgroup.
 * if parent has overridable or multi-prog, allow attaching
 */
static bool hierarchy_allows_attach(struct cgroup *cgrp,
				    enum bpf_attach_type type,
				    u32 new_flags)
{
	struct cgroup *p;

	p = cgroup_parent(cgrp);
	if (!p)
		return true;
	do {
		u32 flags = p->bpf.flags[type];
		u32 cnt;

		if (flags & BPF_F_ALLOW_MULTI)
			return true;
		cnt = prog_list_length(&p->bpf.progs[type]);
		WARN_ON_ONCE(cnt > 1);
		if (cnt == 1)
			return !!(flags & BPF_F_ALLOW_OVERRIDE);
		p = cgroup_parent(p);
	} while (p);
	return true;
}
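/* Illustrative example: how the flags checked above combine across a
 * hierarchy. Assume cgroup /A with child /A/B and a single attach type:
 *
 *  - /A has progA attached with BPF_F_ALLOW_OVERRIDE: progB may be attached
 *    to /A/B, and /A/B's effective array then contains only progB (the
 *    parent's overridable program yields).
 *  - /A has progA attached with BPF_F_ALLOW_MULTI: progB may be attached to
 *    /A/B, and /A/B's effective array contains progB followed by progA, so
 *    the run-array call sites in this file execute progB before progA.
 *  - /A has progA attached with no flags: hierarchy_allows_attach() rejects
 *    any attach in /A/B and the attach fails with -EPERM.
 */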
/* compute a chain of effective programs for a given cgroup:
 * start from the list of programs in this cgroup and add
 * all parent programs.
 * Note that a parent's F_ALLOW_OVERRIDE-type program yields
 * to programs in this cgroup
 */
static int compute_effective_progs(struct cgroup *cgrp,
				   enum bpf_attach_type type,
				   struct bpf_prog_array **array)
{
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_array *progs;
	struct bpf_prog_list *pl;
	struct cgroup *p = cgrp;
	int cnt = 0;

	/* count number of effective programs by walking parents */
	do {
		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			cnt += prog_list_length(&p->bpf.progs[type]);
		p = cgroup_parent(p);
	} while (p);

	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
	if (!progs)
		return -ENOMEM;

	/* populate the array with effective progs */
	cnt = 0;
	p = cgrp;
	do {
		if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
			continue;

		list_for_each_entry(pl, &p->bpf.progs[type], node) {
			if (!pl->prog)
				continue;

			progs->items[cnt].prog = pl->prog;
			for_each_cgroup_storage_type(stype)
				progs->items[cnt].cgroup_storage[stype] =
					pl->storage[stype];
			cnt++;
		}
	} while ((p = cgroup_parent(p)));

	*array = progs;
	return 0;
}

static void activate_effective_progs(struct cgroup *cgrp,
				     enum bpf_attach_type type,
				     struct bpf_prog_array *old_array)
{
	rcu_swap_protected(cgrp->bpf.effective[type], old_array,
			   lockdep_is_held(&cgroup_mutex));
	/* free prog array after grace period, since __cgroup_bpf_run_*()
	 * might be still walking the array
	 */
	bpf_prog_array_free(old_array);
}

/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 */
int cgroup_bpf_inherit(struct cgroup *cgrp)
{
/* has to use macro instead of const int, since compiler thinks
 * that array below is variable length
 */
#define NR ARRAY_SIZE(cgrp->bpf.effective)
	struct bpf_prog_array *arrays[NR] = {};
	int ret, i;

	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
			      GFP_KERNEL);
	if (ret)
		return ret;

	for (i = 0; i < NR; i++)
		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);

	for (i = 0; i < NR; i++)
		if (compute_effective_progs(cgrp, i, &arrays[i]))
			goto cleanup;

	for (i = 0; i < NR; i++)
		activate_effective_progs(cgrp, i, arrays[i]);

	return 0;
cleanup:
	for (i = 0; i < NR; i++)
		bpf_prog_array_free(arrays[i]);

	percpu_ref_exit(&cgrp->bpf.refcnt);

	return -ENOMEM;
}

static int update_effective_progs(struct cgroup *cgrp,
				  enum bpf_attach_type type)
{
	struct cgroup_subsys_state *css;
	int err;

	/* allocate and recompute effective prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
		if (err)
			goto cleanup;
	}

	/* all allocations were successful. Activate all prog arrays */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		activate_effective_progs(desc, type, desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return 0;

cleanup:
	/* oom while computing effective. Free all computed effective arrays
	 * since they were not activated
	 */
	css_for_each_descendant_pre(css, &cgrp->self) {
		struct cgroup *desc = container_of(css, struct cgroup, self);

		bpf_prog_array_free(desc->bpf.inactive);
		desc->bpf.inactive = NULL;
	}

	return err;
}

#define BPF_CGROUP_MAX_PROGS 64
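/* Illustrative sketch: user space reaches __cgroup_bpf_attach() below via
 * the bpf(2) BPF_PROG_ATTACH command, e.g. through libbpf (cg_fd, prog_fd
 * and the cgroup path are placeholders):
 *
 *	int cg_fd = open("/sys/fs/cgroup/unified/app", O_RDONLY);
 *
 *	bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_INET_EGRESS,
 *			BPF_F_ALLOW_MULTI);
 *
 * With BPF_F_ALLOW_MULTI up to BPF_CGROUP_MAX_PROGS programs can coexist on
 * one cgroup and attach type; without it a second attach with matching
 * flags replaces the single existing program.
 */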
/**
 * __cgroup_bpf_attach() - Attach the program to a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to attach
 * @type: Type of attach operation
 * @flags: Option flags
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type, u32 flags)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE],
		*old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {NULL};
	enum bpf_cgroup_storage_type stype;
	struct bpf_prog_list *pl;
	bool pl_was_allocated;
	int err;

	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
		/* invalid combination */
		return -EINVAL;

	if (!hierarchy_allows_attach(cgrp, type, flags))
		return -EPERM;

	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
		/* Disallow attaching non-overridable on top
		 * of existing overridable in this cgroup.
		 * Disallow attaching multi-prog if overridable or none
		 */
		return -EPERM;

	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
		return -E2BIG;

	for_each_cgroup_storage_type(stype) {
		storage[stype] = bpf_cgroup_storage_alloc(prog, stype);
		if (IS_ERR(storage[stype])) {
			storage[stype] = NULL;
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
		}
	}

	if (flags & BPF_F_ALLOW_MULTI) {
		list_for_each_entry(pl, progs, node) {
			if (pl->prog == prog) {
				/* disallow attaching the same prog twice */
				for_each_cgroup_storage_type(stype)
					bpf_cgroup_storage_free(storage[stype]);
				return -EINVAL;
			}
		}

		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
		if (!pl) {
			for_each_cgroup_storage_type(stype)
				bpf_cgroup_storage_free(storage[stype]);
			return -ENOMEM;
		}

		pl_was_allocated = true;
		pl->prog = prog;
		for_each_cgroup_storage_type(stype)
			pl->storage[stype] = storage[stype];
		list_add_tail(&pl->node, progs);
	} else {
		if (list_empty(progs)) {
			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
			if (!pl) {
				for_each_cgroup_storage_type(stype)
					bpf_cgroup_storage_free(storage[stype]);
				return -ENOMEM;
			}
			pl_was_allocated = true;
			list_add_tail(&pl->node, progs);
		} else {
			pl = list_first_entry(progs, typeof(*pl), node);
			old_prog = pl->prog;
			for_each_cgroup_storage_type(stype) {
				old_storage[stype] = pl->storage[stype];
				bpf_cgroup_storage_unlink(old_storage[stype]);
			}
			pl_was_allocated = false;
		}
		pl->prog = prog;
		for_each_cgroup_storage_type(stype)
			pl->storage[stype] = storage[stype];
	}

	cgrp->bpf.flags[type] = flags;

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	static_branch_inc(&cgroup_bpf_enabled_key);
	for_each_cgroup_storage_type(stype) {
		if (!old_storage[stype])
			continue;
		bpf_cgroup_storage_free(old_storage[stype]);
	}
	if (old_prog) {
		bpf_prog_put(old_prog);
		static_branch_dec(&cgroup_bpf_enabled_key);
	}
	for_each_cgroup_storage_type(stype)
		bpf_cgroup_storage_link(storage[stype], cgrp, type);
	return 0;

cleanup:
	/* and cleanup the prog list */
	pl->prog = old_prog;
	for_each_cgroup_storage_type(stype) {
		bpf_cgroup_storage_free(pl->storage[stype]);
		pl->storage[stype] = old_storage[stype];
		bpf_cgroup_storage_link(old_storage[stype], cgrp, type);
	}
	if (pl_was_allocated) {
		list_del(&pl->node);
		kfree(pl);
	}
	return err;
}
/**
 * __cgroup_bpf_detach() - Detach the program from a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup whose descendants to traverse
 * @prog: A program to detach or NULL
 * @type: Type of detach operation
 *
 * Must be called with cgroup_mutex held.
 */
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
			enum bpf_attach_type type)
{
	struct list_head *progs = &cgrp->bpf.progs[type];
	enum bpf_cgroup_storage_type stype;
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog *old_prog = NULL;
	struct bpf_prog_list *pl;
	int err;

	if (flags & BPF_F_ALLOW_MULTI) {
		if (!prog)
			/* to detach MULTI prog the user has to specify valid FD
			 * of the program to be detached
			 */
			return -EINVAL;
	} else {
		if (list_empty(progs))
			/* report error when trying to detach and nothing is attached */
			return -ENOENT;
	}

	if (flags & BPF_F_ALLOW_MULTI) {
		/* find the prog and detach it */
		list_for_each_entry(pl, progs, node) {
			if (pl->prog != prog)
				continue;
			old_prog = prog;
			/* mark it deleted, so it's ignored while
			 * recomputing effective
			 */
			pl->prog = NULL;
			break;
		}
		if (!old_prog)
			return -ENOENT;
	} else {
		/* to maintain backward compatibility NONE and OVERRIDE cgroups
		 * allow detaching with invalid FD (prog==NULL)
		 */
		pl = list_first_entry(progs, typeof(*pl), node);
		old_prog = pl->prog;
		pl->prog = NULL;
	}

	err = update_effective_progs(cgrp, type);
	if (err)
		goto cleanup;

	/* now can actually delete it from this cgroup list */
	list_del(&pl->node);
	for_each_cgroup_storage_type(stype) {
		bpf_cgroup_storage_unlink(pl->storage[stype]);
		bpf_cgroup_storage_free(pl->storage[stype]);
	}
	kfree(pl);
	if (list_empty(progs))
		/* last program was detached, reset flags to zero */
		cgrp->bpf.flags[type] = 0;

	bpf_prog_put(old_prog);
	static_branch_dec(&cgroup_bpf_enabled_key);
	return 0;

cleanup:
	/* and restore back old_prog */
	pl->prog = old_prog;
	return err;
}
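/* Illustrative sketch: detaching and querying from user space goes through
 * BPF_PROG_DETACH and BPF_PROG_QUERY, e.g. with libbpf (cg_fd and prog_fd
 * are placeholder descriptors):
 *
 *	__u32 ids[64], cnt = 64, attach_flags = 0;
 *
 *	bpf_prog_detach2(prog_fd, cg_fd, BPF_CGROUP_INET_EGRESS);
 *	bpf_prog_query(cg_fd, BPF_CGROUP_INET_EGRESS, BPF_F_QUERY_EFFECTIVE,
 *		       &attach_flags, ids, &cnt);
 *
 * BPF_F_QUERY_EFFECTIVE returns the effective array (including programs
 * inherited from ancestors); without it only programs attached directly to
 * the cgroup are reported.
 */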
/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
		       union bpf_attr __user *uattr)
{
	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
	enum bpf_attach_type type = attr->query.attach_type;
	struct list_head *progs = &cgrp->bpf.progs[type];
	u32 flags = cgrp->bpf.flags[type];
	struct bpf_prog_array *effective;
	int cnt, ret = 0, i;

	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
					      lockdep_is_held(&cgroup_mutex));

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
		cnt = bpf_prog_array_length(effective);
	else
		cnt = prog_list_length(progs);

	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
		return -EFAULT;
	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
		return -EFAULT;
	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
		/* return early if user requested only program count + flags */
		return 0;
	if (attr->query.prog_cnt < cnt) {
		cnt = attr->query.prog_cnt;
		ret = -ENOSPC;
	}

	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
	} else {
		struct bpf_prog_list *pl;
		u32 id;

		i = 0;
		list_for_each_entry(pl, progs, node) {
			id = pl->prog->aux->id;
			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
				return -EFAULT;
			if (++i == cnt)
				break;
		}
	}
	return ret;
}

int cgroup_bpf_prog_attach(const union bpf_attr *attr,
			   enum bpf_prog_type ptype, struct bpf_prog *prog)
{
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
				attr->attach_flags);
	cgroup_put(cgrp);
	return ret;
}

int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
{
	struct bpf_prog *prog;
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
	if (IS_ERR(prog))
		prog = NULL;

	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
	if (prog)
		bpf_prog_put(prog);

	cgroup_put(cgrp);
	return ret;
}

int cgroup_bpf_prog_query(const union bpf_attr *attr,
			  union bpf_attr __user *uattr)
{
	struct cgroup *cgrp;
	int ret;

	cgrp = cgroup_get_from_fd(attr->query.target_fd);
	if (IS_ERR(cgrp))
		return PTR_ERR(cgrp);

	ret = cgroup_bpf_query(cgrp, attr, uattr);

	cgroup_put(cgrp);
	return ret;
}

/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
589 * 590 * For egress packets, this function can return: 591 * NET_XMIT_SUCCESS (0) - continue with packet output 592 * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr 593 * NET_XMIT_CN (2) - continue with packet output and notify TCP 594 * to call cwr 595 * -EPERM - drop packet 596 * 597 * For ingress packets, this function will return -EPERM if any 598 * attached program was found and if it returned != 1 during execution. 599 * Otherwise 0 is returned. 600 */ 601 int __cgroup_bpf_run_filter_skb(struct sock *sk, 602 struct sk_buff *skb, 603 enum bpf_attach_type type) 604 { 605 unsigned int offset = skb->data - skb_network_header(skb); 606 struct sock *save_sk; 607 void *saved_data_end; 608 struct cgroup *cgrp; 609 int ret; 610 611 if (!sk || !sk_fullsock(sk)) 612 return 0; 613 614 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 615 return 0; 616 617 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 618 save_sk = skb->sk; 619 skb->sk = sk; 620 __skb_push(skb, offset); 621 622 /* compute pointers for the bpf prog */ 623 bpf_compute_and_save_data_end(skb, &saved_data_end); 624 625 if (type == BPF_CGROUP_INET_EGRESS) { 626 ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( 627 cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); 628 } else { 629 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, 630 __bpf_prog_run_save_cb); 631 ret = (ret == 1 ? 0 : -EPERM); 632 } 633 bpf_restore_data_end(skb, saved_data_end); 634 __skb_pull(skb, offset); 635 skb->sk = save_sk; 636 637 return ret; 638 } 639 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); 640 641 /** 642 * __cgroup_bpf_run_filter_sk() - Run a program on a sock 643 * @sk: sock structure to manipulate 644 * @type: The type of program to be exectuted 645 * 646 * socket is passed is expected to be of type INET or INET6. 647 * 648 * The program type passed in via @type must be suitable for sock 649 * filtering. No further check is performed to assert that. 650 * 651 * This function will return %-EPERM if any if an attached program was found 652 * and if it returned != 1 during execution. In all other cases, 0 is returned. 653 */ 654 int __cgroup_bpf_run_filter_sk(struct sock *sk, 655 enum bpf_attach_type type) 656 { 657 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 658 int ret; 659 660 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); 661 return ret == 1 ? 0 : -EPERM; 662 } 663 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); 664 665 /** 666 * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and 667 * provided by user sockaddr 668 * @sk: sock struct that will use sockaddr 669 * @uaddr: sockaddr struct provided by user 670 * @type: The type of program to be exectuted 671 * @t_ctx: Pointer to attach type specific context 672 * 673 * socket is expected to be of type INET or INET6. 674 * 675 * This function will return %-EPERM if an attached program is found and 676 * returned value != 1 during execution. In all other cases, 0 is returned. 677 */ 678 int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, 679 struct sockaddr *uaddr, 680 enum bpf_attach_type type, 681 void *t_ctx) 682 { 683 struct bpf_sock_addr_kern ctx = { 684 .sk = sk, 685 .uaddr = uaddr, 686 .t_ctx = t_ctx, 687 }; 688 struct sockaddr_storage unspec; 689 struct cgroup *cgrp; 690 int ret; 691 692 /* Check socket family since not all sockets represent network 693 * endpoint (e.g. AF_UNIX). 
694 */ 695 if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) 696 return 0; 697 698 if (!ctx.uaddr) { 699 memset(&unspec, 0, sizeof(unspec)); 700 ctx.uaddr = (struct sockaddr *)&unspec; 701 } 702 703 cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 704 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); 705 706 return ret == 1 ? 0 : -EPERM; 707 } 708 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); 709 710 /** 711 * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock 712 * @sk: socket to get cgroup from 713 * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains 714 * sk with connection information (IP addresses, etc.) May not contain 715 * cgroup info if it is a req sock. 716 * @type: The type of program to be exectuted 717 * 718 * socket passed is expected to be of type INET or INET6. 719 * 720 * The program type passed in via @type must be suitable for sock_ops 721 * filtering. No further check is performed to assert that. 722 * 723 * This function will return %-EPERM if any if an attached program was found 724 * and if it returned != 1 during execution. In all other cases, 0 is returned. 725 */ 726 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, 727 struct bpf_sock_ops_kern *sock_ops, 728 enum bpf_attach_type type) 729 { 730 struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); 731 int ret; 732 733 ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, 734 BPF_PROG_RUN); 735 return ret == 1 ? 0 : -EPERM; 736 } 737 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); 738 739 int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, 740 short access, enum bpf_attach_type type) 741 { 742 struct cgroup *cgrp; 743 struct bpf_cgroup_dev_ctx ctx = { 744 .access_type = (access << 16) | dev_type, 745 .major = major, 746 .minor = minor, 747 }; 748 int allow = 1; 749 750 rcu_read_lock(); 751 cgrp = task_dfl_cgroup(current); 752 allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, 753 BPF_PROG_RUN); 754 rcu_read_unlock(); 755 756 return !allow; 757 } 758 EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); 759 760 static const struct bpf_func_proto * 761 cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 762 { 763 switch (func_id) { 764 case BPF_FUNC_map_lookup_elem: 765 return &bpf_map_lookup_elem_proto; 766 case BPF_FUNC_map_update_elem: 767 return &bpf_map_update_elem_proto; 768 case BPF_FUNC_map_delete_elem: 769 return &bpf_map_delete_elem_proto; 770 case BPF_FUNC_map_push_elem: 771 return &bpf_map_push_elem_proto; 772 case BPF_FUNC_map_pop_elem: 773 return &bpf_map_pop_elem_proto; 774 case BPF_FUNC_map_peek_elem: 775 return &bpf_map_peek_elem_proto; 776 case BPF_FUNC_get_current_uid_gid: 777 return &bpf_get_current_uid_gid_proto; 778 case BPF_FUNC_get_local_storage: 779 return &bpf_get_local_storage_proto; 780 case BPF_FUNC_get_current_cgroup_id: 781 return &bpf_get_current_cgroup_id_proto; 782 case BPF_FUNC_trace_printk: 783 if (capable(CAP_SYS_ADMIN)) 784 return bpf_get_trace_printk_proto(); 785 /* fall through */ 786 default: 787 return NULL; 788 } 789 } 790 791 static const struct bpf_func_proto * 792 cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) 793 { 794 return cgroup_base_func_proto(func_id, prog); 795 } 796 797 static bool cgroup_dev_is_valid_access(int off, int size, 798 enum bpf_access_type type, 799 const struct bpf_prog *prog, 800 struct bpf_insn_access_aux *info) 801 { 802 const int size_default = sizeof(__u32); 803 804 if (type == BPF_WRITE) 
static const struct bpf_func_proto *
cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
	case BPF_FUNC_map_push_elem:
		return &bpf_map_push_elem_proto;
	case BPF_FUNC_map_pop_elem:
		return &bpf_map_pop_elem_proto;
	case BPF_FUNC_map_peek_elem:
		return &bpf_map_peek_elem_proto;
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_local_storage:
		return &bpf_get_local_storage_proto;
	case BPF_FUNC_get_current_cgroup_id:
		return &bpf_get_current_cgroup_id_proto;
	case BPF_FUNC_trace_printk:
		if (capable(CAP_SYS_ADMIN))
			return bpf_get_trace_printk_proto();
		/* fall through */
	default:
		return NULL;
	}
}

static const struct bpf_func_proto *
cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	return cgroup_base_func_proto(func_id, prog);
}

static bool cgroup_dev_is_valid_access(int off, int size,
				       enum bpf_access_type type,
				       const struct bpf_prog *prog,
				       struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (type == BPF_WRITE)
		return false;

	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
		return false;
	/* The verifier guarantees that size > 0. */
	if (off % size != 0)
		return false;

	switch (off) {
	case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type):
		bpf_ctx_record_field_size(info, size_default);
		if (!bpf_ctx_narrow_access_ok(off, size, size_default))
			return false;
		break;
	default:
		if (size != size_default)
			return false;
	}

	return true;
}

const struct bpf_prog_ops cg_dev_prog_ops = {
};

const struct bpf_verifier_ops cg_dev_verifier_ops = {
	.get_func_proto		= cgroup_dev_func_proto,
	.is_valid_access	= cgroup_dev_is_valid_access,
};

/**
 * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl
 *
 * @head: sysctl table header
 * @table: sysctl table
 * @write: sysctl is being read (= 0) or written (= 1)
 * @buf: pointer to buffer passed by user space
 * @pcount: value-result argument: value is size of buffer pointed to by @buf,
 *	result is size of @new_buf if program set new value, initial value
 *	otherwise
 * @ppos: value-result argument: value is position at which read from or write
 *	to sysctl is happening, result is new position if program overrode it,
 *	initial value otherwise
 * @new_buf: pointer to pointer to new buffer that will be allocated if program
 *	overrides new value provided by user space on sysctl write
 *	NOTE: it's the caller's responsibility to free *new_buf if it was set
 * @type: type of program to be executed
 *
 * Program is run when sysctl is being accessed, either read or written, and
 * can allow or deny such access.
 *
 * This function will return %-EPERM if an attached program is found and
 * returned value != 1 during execution. In all other cases 0 is returned.
 */
int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
				   struct ctl_table *table, int write,
				   void __user *buf, size_t *pcount,
				   loff_t *ppos, void **new_buf,
				   enum bpf_attach_type type)
{
	struct bpf_sysctl_kern ctx = {
		.head = head,
		.table = table,
		.write = write,
		.ppos = ppos,
		.cur_val = NULL,
		.cur_len = PAGE_SIZE,
		.new_val = NULL,
		.new_len = 0,
		.new_updated = 0,
	};
	struct cgroup *cgrp;
	int ret;

	ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL);
	if (ctx.cur_val) {
		mm_segment_t old_fs;
		loff_t pos = 0;

		old_fs = get_fs();
		set_fs(KERNEL_DS);
		if (table->proc_handler(table, 0, (void __user *)ctx.cur_val,
					&ctx.cur_len, &pos)) {
			/* Let BPF program decide how to proceed. */
			ctx.cur_len = 0;
		}
		set_fs(old_fs);
	} else {
		/* Let BPF program decide how to proceed. */
		ctx.cur_len = 0;
	}

	if (write && buf && *pcount) {
		/* BPF program should be able to override new value with a
		 * buffer bigger than provided by user.
		 */
		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
		ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount);
		if (!ctx.new_val ||
		    copy_from_user(ctx.new_val, buf, ctx.new_len))
			/* Let BPF program decide how to proceed. */
			ctx.new_len = 0;
	}

	rcu_read_lock();
	cgrp = task_dfl_cgroup(current);
	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
	rcu_read_unlock();

	kfree(ctx.cur_val);

	if (ret == 1 && ctx.new_updated) {
		*new_buf = ctx.new_val;
		*pcount = ctx.new_len;
	} else {
		kfree(ctx.new_val);
	}

	return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);

static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
			      size_t *lenp)
{
	ssize_t tmp_ret = 0, ret;

	if (dir->header.parent) {
		tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp);
	if (ret < 0)
		return ret;
	*bufp += ret;
	*lenp -= ret;
	ret += tmp_ret;

	/* Avoid leading slash. */
	if (!ret)
		return ret;

	tmp_ret = strscpy(*bufp, "/", *lenp);
	if (tmp_ret < 0)
		return tmp_ret;
	*bufp += tmp_ret;
	*lenp -= tmp_ret;

	return ret + tmp_ret;
}

BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len, u64, flags)
{
	ssize_t tmp_ret = 0, ret;

	if (!buf)
		return -EINVAL;

	if (!(flags & BPF_F_SYSCTL_BASE_NAME)) {
		if (!ctx->head)
			return -EINVAL;
		tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len);
		if (tmp_ret < 0)
			return tmp_ret;
	}

	ret = strscpy(buf, ctx->table->procname, buf_len);

	return ret < 0 ? ret : tmp_ret + ret;
}

static const struct bpf_func_proto bpf_sysctl_get_name_proto = {
	.func		= bpf_sysctl_get_name,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

static int copy_sysctl_value(char *dst, size_t dst_len, char *src,
			     size_t src_len)
{
	if (!dst)
		return -EINVAL;

	if (!dst_len)
		return -E2BIG;

	if (!src || !src_len) {
		memset(dst, 0, dst_len);
		return -EINVAL;
	}

	memcpy(dst, src, min(dst_len, src_len));

	if (dst_len > src_len) {
		memset(dst + src_len, '\0', dst_len - src_len);
		return src_len;
	}

	dst[dst_len - 1] = '\0';

	return -E2BIG;
}

BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx,
	   char *, buf, size_t, buf_len)
{
	return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len);
}

static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
	.func		= bpf_sysctl_get_current_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};

BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
	   size_t, buf_len)
{
	if (!ctx->write) {
		if (buf && buf_len)
			memset(buf, '\0', buf_len);
		return -EINVAL;
	}
	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
}

static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
	.func		= bpf_sysctl_get_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};
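/* bpf_sysctl_set_new_value() below stages a replacement value for a sysctl
 * write: it copies the program-supplied buffer into ctx->new_val, which was
 * preallocated by __cgroup_bpf_run_filter_sysctl(), and sets
 * ctx->new_updated so the runner hands the buffer back to the caller via
 * *new_buf.
 */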
BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
	   const char *, buf, size_t, buf_len)
{
	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
		return -EINVAL;

	if (buf_len > PAGE_SIZE - 1)
		return -E2BIG;

	memcpy(ctx->new_val, buf, buf_len);
	ctx->new_len = buf_len;
	ctx->new_updated = 1;

	return 0;
}

static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
	.func		= bpf_sysctl_set_new_value,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
};

static const struct bpf_func_proto *
sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_strtol:
		return &bpf_strtol_proto;
	case BPF_FUNC_strtoul:
		return &bpf_strtoul_proto;
	case BPF_FUNC_sysctl_get_name:
		return &bpf_sysctl_get_name_proto;
	case BPF_FUNC_sysctl_get_current_value:
		return &bpf_sysctl_get_current_value_proto;
	case BPF_FUNC_sysctl_get_new_value:
		return &bpf_sysctl_get_new_value_proto;
	case BPF_FUNC_sysctl_set_new_value:
		return &bpf_sysctl_set_new_value_proto;
	default:
		return cgroup_base_func_proto(func_id, prog);
	}
}

static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type,
				   const struct bpf_prog *prog,
				   struct bpf_insn_access_aux *info)
{
	const int size_default = sizeof(__u32);

	if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size)
		return false;

	switch (off) {
	case offsetof(struct bpf_sysctl, write):
		if (type != BPF_READ)
			return false;
		bpf_ctx_record_field_size(info, size_default);
		return bpf_ctx_narrow_access_ok(off, size, size_default);
	case offsetof(struct bpf_sysctl, file_pos):
		if (type == BPF_READ) {
			bpf_ctx_record_field_size(info, size_default);
			return bpf_ctx_narrow_access_ok(off, size, size_default);
		} else {
			return size == size_default;
		}
	default:
		return false;
	}
}

static u32 sysctl_convert_ctx_access(enum bpf_access_type type,
				     const struct bpf_insn *si,
				     struct bpf_insn *insn_buf,
				     struct bpf_prog *prog, u32 *target_size)
{
	struct bpf_insn *insn = insn_buf;

	switch (si->off) {
	case offsetof(struct bpf_sysctl, write):
		*insn++ = BPF_LDX_MEM(
			BPF_SIZE(si->code), si->dst_reg, si->src_reg,
			bpf_target_off(struct bpf_sysctl_kern, write,
				       FIELD_SIZEOF(struct bpf_sysctl_kern,
						    write),
				       target_size));
		break;
	case offsetof(struct bpf_sysctl, file_pos):
		/* ppos is a pointer so it should be accessed via indirect
		 * loads and stores. Also for stores an additional temporary
		 * register is used since neither src_reg nor dst_reg can be
		 * overridden.
1144 */ 1145 if (type == BPF_WRITE) { 1146 int treg = BPF_REG_9; 1147 1148 if (si->src_reg == treg || si->dst_reg == treg) 1149 --treg; 1150 if (si->src_reg == treg || si->dst_reg == treg) 1151 --treg; 1152 *insn++ = BPF_STX_MEM( 1153 BPF_DW, si->dst_reg, treg, 1154 offsetof(struct bpf_sysctl_kern, tmp_reg)); 1155 *insn++ = BPF_LDX_MEM( 1156 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 1157 treg, si->dst_reg, 1158 offsetof(struct bpf_sysctl_kern, ppos)); 1159 *insn++ = BPF_STX_MEM( 1160 BPF_SIZEOF(u32), treg, si->src_reg, 0); 1161 *insn++ = BPF_LDX_MEM( 1162 BPF_DW, treg, si->dst_reg, 1163 offsetof(struct bpf_sysctl_kern, tmp_reg)); 1164 } else { 1165 *insn++ = BPF_LDX_MEM( 1166 BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), 1167 si->dst_reg, si->src_reg, 1168 offsetof(struct bpf_sysctl_kern, ppos)); 1169 *insn++ = BPF_LDX_MEM( 1170 BPF_SIZE(si->code), si->dst_reg, si->dst_reg, 0); 1171 } 1172 *target_size = sizeof(u32); 1173 break; 1174 } 1175 1176 return insn - insn_buf; 1177 } 1178 1179 const struct bpf_verifier_ops cg_sysctl_verifier_ops = { 1180 .get_func_proto = sysctl_func_proto, 1181 .is_valid_access = sysctl_is_valid_access, 1182 .convert_ctx_access = sysctl_convert_ctx_access, 1183 }; 1184 1185 const struct bpf_prog_ops cg_sysctl_prog_ops = { 1186 }; 1187