1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Minimal file system backend for holding eBPF maps and programs, 4 * used by bpf(2) object pinning. 5 * 6 * Authors: 7 * 8 * Daniel Borkmann <daniel@iogearbox.net> 9 */ 10 11 #include <linux/init.h> 12 #include <linux/magic.h> 13 #include <linux/major.h> 14 #include <linux/mount.h> 15 #include <linux/namei.h> 16 #include <linux/fs.h> 17 #include <linux/fs_context.h> 18 #include <linux/fs_parser.h> 19 #include <linux/kdev_t.h> 20 #include <linux/filter.h> 21 #include <linux/bpf.h> 22 #include <linux/bpf_trace.h> 23 #include <linux/kstrtox.h> 24 #include <linux/xattr.h> 25 #include <linux/security.h> 26 27 #include "preload/bpf_preload.h" 28 29 enum bpf_type { 30 BPF_TYPE_UNSPEC = 0, 31 BPF_TYPE_PROG, 32 BPF_TYPE_MAP, 33 BPF_TYPE_LINK, 34 }; 35 36 struct bpf_fs_inode { 37 struct list_head xattrs; 38 struct simple_xattr_limits xlimits; 39 struct inode vfs_inode; 40 }; 41 42 static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode) 43 { 44 return container_of(inode, struct bpf_fs_inode, vfs_inode); 45 } 46 47 static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init; 48 49 static int bpf_fs_initxattrs(struct inode *inode, 50 const struct xattr *xattr_array, void *fs_info); 51 static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size); 52 53 static void *bpf_any_get(void *raw, enum bpf_type type) 54 { 55 switch (type) { 56 case BPF_TYPE_PROG: 57 bpf_prog_inc(raw); 58 break; 59 case BPF_TYPE_MAP: 60 bpf_map_inc_with_uref(raw); 61 break; 62 case BPF_TYPE_LINK: 63 bpf_link_inc(raw); 64 break; 65 default: 66 WARN_ON_ONCE(1); 67 break; 68 } 69 70 return raw; 71 } 72 73 static void bpf_any_put(void *raw, enum bpf_type type) 74 { 75 switch (type) { 76 case BPF_TYPE_PROG: 77 bpf_prog_put(raw); 78 break; 79 case BPF_TYPE_MAP: 80 bpf_map_put_with_uref(raw); 81 break; 82 case BPF_TYPE_LINK: 83 bpf_link_put(raw); 84 break; 85 default: 86 WARN_ON_ONCE(1); 87 break; 88 } 89 } 90 91 static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) 92 { 93 void *raw; 94 95 raw = bpf_map_get_with_uref(ufd); 96 if (!IS_ERR(raw)) { 97 *type = BPF_TYPE_MAP; 98 return raw; 99 } 100 101 raw = bpf_prog_get(ufd); 102 if (!IS_ERR(raw)) { 103 *type = BPF_TYPE_PROG; 104 return raw; 105 } 106 107 raw = bpf_link_get_from_fd(ufd); 108 if (!IS_ERR(raw)) { 109 *type = BPF_TYPE_LINK; 110 return raw; 111 } 112 113 return ERR_PTR(-EINVAL); 114 } 115 116 static const struct inode_operations bpf_dir_iops; 117 static const struct inode_operations bpf_symlink_iops; 118 119 static const struct inode_operations bpf_prog_iops = { 120 .listxattr = bpf_fs_listxattr, 121 }; 122 static const struct inode_operations bpf_map_iops = { 123 .listxattr = bpf_fs_listxattr, 124 }; 125 static const struct inode_operations bpf_link_iops = { 126 .listxattr = bpf_fs_listxattr, 127 }; 128 129 struct inode *bpf_get_inode(struct super_block *sb, 130 const struct inode *dir, 131 umode_t mode) 132 { 133 struct inode *inode; 134 135 switch (mode & S_IFMT) { 136 case S_IFDIR: 137 case S_IFREG: 138 case S_IFLNK: 139 break; 140 default: 141 return ERR_PTR(-EINVAL); 142 } 143 144 inode = new_inode(sb); 145 if (!inode) 146 return ERR_PTR(-ENOSPC); 147 148 inode->i_ino = get_next_ino(); 149 simple_inode_init_ts(inode); 150 151 inode_init_owner(&nop_mnt_idmap, inode, dir, mode); 152 153 return inode; 154 } 155 156 static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) 157 { 158 *type = BPF_TYPE_UNSPEC; 159 if (inode->i_op == &bpf_prog_iops) 160 *type = BPF_TYPE_PROG; 161 else if (inode->i_op == &bpf_map_iops) 162 *type = BPF_TYPE_MAP; 163 else if (inode->i_op == &bpf_link_iops) 164 *type = BPF_TYPE_LINK; 165 else 166 return -EACCES; 167 168 return 0; 169 } 170 171 static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, 172 struct inode *dir) 173 { 174 d_make_persistent(dentry, inode); 175 176 inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); 177 } 178 179 static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, 180 struct dentry *dentry, umode_t mode) 181 { 182 struct inode *inode; 183 int ret; 184 185 inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); 186 if (IS_ERR(inode)) 187 return ERR_CAST(inode); 188 189 ret = security_inode_init_security(inode, dir, &dentry->d_name, 190 bpf_fs_initxattrs, NULL); 191 if (ret && ret != -EOPNOTSUPP) { 192 iput(inode); 193 return ERR_PTR(ret); 194 } 195 196 inode->i_op = &bpf_dir_iops; 197 inode->i_fop = &simple_dir_operations; 198 199 inc_nlink(inode); 200 inc_nlink(dir); 201 202 bpf_dentry_finalize(dentry, inode, dir); 203 return NULL; 204 } 205 206 struct map_iter { 207 void *key; 208 bool done; 209 }; 210 211 static struct map_iter *map_iter(struct seq_file *m) 212 { 213 return m->private; 214 } 215 216 static struct bpf_map *seq_file_to_map(struct seq_file *m) 217 { 218 return file_inode(m->file)->i_private; 219 } 220 221 static void map_iter_free(struct map_iter *iter) 222 { 223 if (iter) { 224 kfree(iter->key); 225 kfree(iter); 226 } 227 } 228 229 static struct map_iter *map_iter_alloc(struct bpf_map *map) 230 { 231 struct map_iter *iter; 232 233 iter = kzalloc_obj(*iter, GFP_KERNEL | __GFP_NOWARN); 234 if (!iter) 235 goto error; 236 237 iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); 238 if (!iter->key) 239 goto error; 240 241 return iter; 242 243 error: 244 map_iter_free(iter); 245 return NULL; 246 } 247 248 static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) 249 { 250 struct bpf_map *map = seq_file_to_map(m); 251 void *key = map_iter(m)->key; 252 void *prev_key; 253 254 (*pos)++; 255 if (map_iter(m)->done) 256 return NULL; 257 258 if (unlikely(v == SEQ_START_TOKEN)) 259 prev_key = NULL; 260 else 261 prev_key = key; 262 263 rcu_read_lock(); 264 if (map->ops->map_get_next_key(map, prev_key, key)) { 265 map_iter(m)->done = true; 266 key = NULL; 267 } 268 rcu_read_unlock(); 269 return key; 270 } 271 272 static void *map_seq_start(struct seq_file *m, loff_t *pos) 273 { 274 if (map_iter(m)->done) 275 return NULL; 276 277 return *pos ? map_iter(m)->key : SEQ_START_TOKEN; 278 } 279 280 static void map_seq_stop(struct seq_file *m, void *v) 281 { 282 } 283 284 static int map_seq_show(struct seq_file *m, void *v) 285 { 286 struct bpf_map *map = seq_file_to_map(m); 287 void *key = map_iter(m)->key; 288 289 if (unlikely(v == SEQ_START_TOKEN)) { 290 seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); 291 seq_puts(m, "# WARNING!! The output format will change\n"); 292 } else { 293 map->ops->map_seq_show_elem(map, key, m); 294 } 295 296 return 0; 297 } 298 299 static const struct seq_operations bpffs_map_seq_ops = { 300 .start = map_seq_start, 301 .next = map_seq_next, 302 .show = map_seq_show, 303 .stop = map_seq_stop, 304 }; 305 306 static int bpffs_map_open(struct inode *inode, struct file *file) 307 { 308 struct bpf_map *map = inode->i_private; 309 struct map_iter *iter; 310 struct seq_file *m; 311 int err; 312 313 iter = map_iter_alloc(map); 314 if (!iter) 315 return -ENOMEM; 316 317 err = seq_open(file, &bpffs_map_seq_ops); 318 if (err) { 319 map_iter_free(iter); 320 return err; 321 } 322 323 m = file->private_data; 324 m->private = iter; 325 326 return 0; 327 } 328 329 static int bpffs_map_release(struct inode *inode, struct file *file) 330 { 331 struct seq_file *m = file->private_data; 332 333 map_iter_free(map_iter(m)); 334 335 return seq_release(inode, file); 336 } 337 338 /* bpffs_map_fops should only implement the basic 339 * read operation for a BPF map. The purpose is to 340 * provide a simple user intuitive way to do 341 * "cat bpffs/pathto/a-pinned-map". 342 * 343 * Other operations (e.g. write, lookup...) should be realized by 344 * the userspace tools (e.g. bpftool) through the 345 * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update 346 * interface. 347 */ 348 static const struct file_operations bpffs_map_fops = { 349 .open = bpffs_map_open, 350 .read = seq_read, 351 .release = bpffs_map_release, 352 }; 353 354 static int bpffs_obj_open(struct inode *inode, struct file *file) 355 { 356 return -EIO; 357 } 358 359 static const struct file_operations bpffs_obj_fops = { 360 .open = bpffs_obj_open, 361 }; 362 363 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, 364 const struct inode_operations *iops, 365 const struct file_operations *fops) 366 { 367 struct inode *dir = dentry->d_parent->d_inode; 368 struct inode *inode; 369 int ret; 370 371 inode = bpf_get_inode(dir->i_sb, dir, mode); 372 if (IS_ERR(inode)) 373 return PTR_ERR(inode); 374 375 ret = security_inode_init_security(inode, dir, &dentry->d_name, 376 bpf_fs_initxattrs, NULL); 377 if (ret && ret != -EOPNOTSUPP) { 378 iput(inode); 379 return ret; 380 } 381 382 inode->i_op = iops; 383 inode->i_fop = fops; 384 inode->i_private = raw; 385 386 bpf_dentry_finalize(dentry, inode, dir); 387 return 0; 388 } 389 390 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) 391 { 392 return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, 393 &bpffs_obj_fops); 394 } 395 396 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) 397 { 398 struct bpf_map *map = arg; 399 400 return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, 401 bpf_map_support_seq_show(map) ? 402 &bpffs_map_fops : &bpffs_obj_fops); 403 } 404 405 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) 406 { 407 struct bpf_link *link = arg; 408 409 return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, 410 bpf_link_is_iter(link) ? 411 &bpf_iter_fops : &bpffs_obj_fops); 412 } 413 414 static struct dentry * 415 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) 416 { 417 /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future 418 * extensions. That allows popoulate_bpffs() create special files. 419 */ 420 if ((dir->i_mode & S_IALLUGO) && 421 strchr(dentry->d_name.name, '.')) 422 return ERR_PTR(-EPERM); 423 424 return simple_lookup(dir, dentry, flags); 425 } 426 427 static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, 428 struct dentry *dentry, const char *target) 429 { 430 struct inode *inode; 431 char *link; 432 int ret; 433 434 link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 435 if (!link) 436 return -ENOMEM; 437 438 inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); 439 if (IS_ERR(inode)) { 440 kfree(link); 441 return PTR_ERR(inode); 442 } 443 444 inode->i_op = &bpf_symlink_iops; 445 inode->i_link = link; 446 447 ret = security_inode_init_security(inode, dir, &dentry->d_name, 448 bpf_fs_initxattrs, NULL); 449 if (ret && ret != -EOPNOTSUPP) { 450 iput(inode); 451 return ret; 452 } 453 454 bpf_dentry_finalize(dentry, inode, dir); 455 return 0; 456 } 457 458 static const struct inode_operations bpf_symlink_iops = { 459 .get_link = simple_get_link, 460 .listxattr = bpf_fs_listxattr, 461 }; 462 463 static const struct inode_operations bpf_dir_iops = { 464 .lookup = bpf_lookup, 465 .mkdir = bpf_mkdir, 466 .symlink = bpf_symlink, 467 .rmdir = simple_rmdir, 468 .rename = simple_rename, 469 .link = simple_link, 470 .unlink = simple_unlink, 471 .listxattr = bpf_fs_listxattr, 472 }; 473 474 /* pin iterator link into bpffs */ 475 static int bpf_iter_link_pin_kernel(struct dentry *parent, 476 const char *name, struct bpf_link *link) 477 { 478 umode_t mode = S_IFREG | S_IRUSR; 479 struct dentry *dentry; 480 int ret; 481 482 dentry = simple_start_creating(parent, name); 483 if (IS_ERR(dentry)) 484 return PTR_ERR(dentry); 485 ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, 486 &bpf_iter_fops); 487 simple_done_creating(dentry); 488 return ret; 489 } 490 491 static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, 492 enum bpf_type type) 493 { 494 struct dentry *dentry; 495 struct inode *dir; 496 struct path path; 497 umode_t mode; 498 int ret; 499 500 dentry = start_creating_user_path(path_fd, pathname, &path, 0); 501 if (IS_ERR(dentry)) 502 return PTR_ERR(dentry); 503 504 dir = d_inode(path.dentry); 505 if (dir->i_op != &bpf_dir_iops) { 506 ret = -EPERM; 507 goto out; 508 } 509 510 mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); 511 ret = security_path_mknod(&path, dentry, mode, 0); 512 if (ret) 513 goto out; 514 515 switch (type) { 516 case BPF_TYPE_PROG: 517 ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); 518 break; 519 case BPF_TYPE_MAP: 520 ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); 521 break; 522 case BPF_TYPE_LINK: 523 ret = vfs_mkobj(dentry, mode, bpf_mklink, raw); 524 break; 525 default: 526 ret = -EPERM; 527 } 528 out: 529 end_creating_path(&path, dentry); 530 return ret; 531 } 532 533 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname) 534 { 535 enum bpf_type type; 536 void *raw; 537 int ret; 538 539 raw = bpf_fd_probe_obj(ufd, &type); 540 if (IS_ERR(raw)) 541 return PTR_ERR(raw); 542 543 ret = bpf_obj_do_pin(path_fd, pathname, raw, type); 544 if (ret != 0) 545 bpf_any_put(raw, type); 546 547 return ret; 548 } 549 550 static void *bpf_obj_do_get(int path_fd, const char __user *pathname, 551 enum bpf_type *type, int flags) 552 { 553 struct inode *inode; 554 struct path path; 555 void *raw; 556 int ret; 557 558 ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path); 559 if (ret) 560 return ERR_PTR(ret); 561 562 inode = d_backing_inode(path.dentry); 563 ret = path_permission(&path, ACC_MODE(flags)); 564 if (ret) 565 goto out; 566 567 ret = bpf_inode_type(inode, type); 568 if (ret) 569 goto out; 570 571 raw = bpf_any_get(inode->i_private, *type); 572 if (!IS_ERR(raw)) 573 touch_atime(&path); 574 575 path_put(&path); 576 return raw; 577 out: 578 path_put(&path); 579 return ERR_PTR(ret); 580 } 581 582 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags) 583 { 584 enum bpf_type type = BPF_TYPE_UNSPEC; 585 int f_flags; 586 void *raw; 587 int ret; 588 589 f_flags = bpf_get_file_flag(flags); 590 if (f_flags < 0) 591 return f_flags; 592 593 raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags); 594 if (IS_ERR(raw)) 595 return PTR_ERR(raw); 596 597 if (type == BPF_TYPE_PROG) 598 ret = bpf_prog_new_fd(raw); 599 else if (type == BPF_TYPE_MAP) 600 ret = bpf_map_new_fd(raw, f_flags); 601 else if (type == BPF_TYPE_LINK) 602 ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw); 603 else 604 return -ENOENT; 605 606 if (ret < 0) 607 bpf_any_put(raw, type); 608 return ret; 609 } 610 611 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) 612 { 613 struct bpf_prog *prog; 614 int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); 615 if (ret) 616 return ERR_PTR(ret); 617 618 if (inode->i_op == &bpf_map_iops) 619 return ERR_PTR(-EINVAL); 620 if (inode->i_op == &bpf_link_iops) 621 return ERR_PTR(-EINVAL); 622 if (inode->i_op != &bpf_prog_iops) 623 return ERR_PTR(-EACCES); 624 625 prog = inode->i_private; 626 627 ret = security_bpf_prog(prog); 628 if (ret < 0) 629 return ERR_PTR(ret); 630 631 if (!bpf_prog_get_ok(prog, &type, false)) 632 return ERR_PTR(-EINVAL); 633 634 bpf_prog_inc(prog); 635 return prog; 636 } 637 638 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) 639 { 640 struct bpf_prog *prog; 641 struct path path; 642 int ret = kern_path(name, LOOKUP_FOLLOW, &path); 643 if (ret) 644 return ERR_PTR(ret); 645 prog = __get_prog_inode(d_backing_inode(path.dentry), type); 646 if (!IS_ERR(prog)) 647 touch_atime(&path); 648 path_put(&path); 649 return prog; 650 } 651 EXPORT_SYMBOL(bpf_prog_get_type_path); 652 653 struct bpffs_btf_enums { 654 const struct btf *btf; 655 const struct btf_type *cmd_t; 656 const struct btf_type *map_t; 657 const struct btf_type *prog_t; 658 const struct btf_type *attach_t; 659 }; 660 661 static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) 662 { 663 struct { 664 const struct btf_type **type; 665 const char *name; 666 } btf_enums[] = { 667 {&info->cmd_t, "bpf_cmd"}, 668 {&info->map_t, "bpf_map_type"}, 669 {&info->prog_t, "bpf_prog_type"}, 670 {&info->attach_t, "bpf_attach_type"}, 671 }; 672 const struct btf *btf; 673 int i, id; 674 675 memset(info, 0, sizeof(*info)); 676 677 btf = bpf_get_btf_vmlinux(); 678 if (IS_ERR(btf)) 679 return PTR_ERR(btf); 680 if (!btf) 681 return -ENOENT; 682 683 info->btf = btf; 684 685 for (i = 0; i < ARRAY_SIZE(btf_enums); i++) { 686 id = btf_find_by_name_kind(btf, btf_enums[i].name, 687 BTF_KIND_ENUM); 688 if (id < 0) 689 return -ESRCH; 690 691 *btf_enums[i].type = btf_type_by_id(btf, id); 692 } 693 694 return 0; 695 } 696 697 static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t, 698 const char *prefix, const char *str, int *value) 699 { 700 const struct btf_enum *e; 701 const char *name; 702 int i, n, pfx_len = strlen(prefix); 703 704 *value = 0; 705 706 if (!btf || !enum_t) 707 return false; 708 709 for (i = 0, n = btf_vlen(enum_t); i < n; i++) { 710 e = &btf_enum(enum_t)[i]; 711 712 name = btf_name_by_offset(btf, e->name_off); 713 if (!name || strncasecmp(name, prefix, pfx_len) != 0) 714 continue; 715 716 /* match symbolic name case insensitive and ignoring prefix */ 717 if (strcasecmp(name + pfx_len, str) == 0) { 718 *value = e->val; 719 return true; 720 } 721 } 722 723 return false; 724 } 725 726 static void seq_print_delegate_opts(struct seq_file *m, 727 const char *opt_name, 728 const struct btf *btf, 729 const struct btf_type *enum_t, 730 const char *prefix, 731 u64 delegate_msk, u64 any_msk) 732 { 733 const struct btf_enum *e; 734 bool first = true; 735 const char *name; 736 u64 msk; 737 int i, n, pfx_len = strlen(prefix); 738 739 delegate_msk &= any_msk; /* clear unknown bits */ 740 741 if (delegate_msk == 0) 742 return; 743 744 seq_printf(m, ",%s", opt_name); 745 if (delegate_msk == any_msk) { 746 seq_printf(m, "=any"); 747 return; 748 } 749 750 if (btf && enum_t) { 751 for (i = 0, n = btf_vlen(enum_t); i < n; i++) { 752 e = &btf_enum(enum_t)[i]; 753 name = btf_name_by_offset(btf, e->name_off); 754 if (!name || strncasecmp(name, prefix, pfx_len) != 0) 755 continue; 756 msk = 1ULL << e->val; 757 if (delegate_msk & msk) { 758 /* emit lower-case name without prefix */ 759 seq_putc(m, first ? '=' : ':'); 760 name += pfx_len; 761 while (*name) { 762 seq_putc(m, tolower(*name)); 763 name++; 764 } 765 766 delegate_msk &= ~msk; 767 first = false; 768 } 769 } 770 } 771 if (delegate_msk) 772 seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk); 773 } 774 775 /* 776 * Display the mount options in /proc/mounts. 777 */ 778 static int bpf_show_options(struct seq_file *m, struct dentry *root) 779 { 780 struct inode *inode = d_inode(root); 781 umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; 782 struct bpf_mount_opts *opts = root->d_sb->s_fs_info; 783 u64 mask; 784 785 if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) 786 seq_printf(m, ",uid=%u", 787 from_kuid_munged(&init_user_ns, inode->i_uid)); 788 if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) 789 seq_printf(m, ",gid=%u", 790 from_kgid_munged(&init_user_ns, inode->i_gid)); 791 if (mode != S_IRWXUGO) 792 seq_printf(m, ",mode=%o", mode); 793 794 if (opts->delegate_cmds || opts->delegate_maps || 795 opts->delegate_progs || opts->delegate_attachs) { 796 struct bpffs_btf_enums info; 797 798 /* ignore errors, fallback to hex */ 799 (void)find_bpffs_btf_enums(&info); 800 801 mask = (1ULL << __MAX_BPF_CMD) - 1; 802 seq_print_delegate_opts(m, "delegate_cmds", 803 info.btf, info.cmd_t, "BPF_", 804 opts->delegate_cmds, mask); 805 806 mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; 807 seq_print_delegate_opts(m, "delegate_maps", 808 info.btf, info.map_t, "BPF_MAP_TYPE_", 809 opts->delegate_maps, mask); 810 811 mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; 812 seq_print_delegate_opts(m, "delegate_progs", 813 info.btf, info.prog_t, "BPF_PROG_TYPE_", 814 opts->delegate_progs, mask); 815 816 mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; 817 seq_print_delegate_opts(m, "delegate_attachs", 818 info.btf, info.attach_t, "BPF_", 819 opts->delegate_attachs, mask); 820 } 821 822 return 0; 823 } 824 825 static struct inode *bpf_fs_alloc_inode(struct super_block *sb) 826 { 827 struct bpf_fs_inode *bi; 828 829 bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL); 830 if (!bi) 831 return NULL; 832 INIT_LIST_HEAD_RCU(&bi->xattrs); 833 simple_xattr_limits_init(&bi->xlimits); 834 return &bi->vfs_inode; 835 } 836 837 static void bpf_destroy_inode(struct inode *inode) 838 { 839 struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; 840 struct bpf_fs_inode *bi = BPF_FS_I(inode); 841 enum bpf_type type; 842 843 if (!bpf_inode_type(inode, &type)) 844 bpf_any_put(inode->i_private, type); 845 simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL); 846 } 847 848 static void bpf_free_inode(struct inode *inode) 849 { 850 if (S_ISLNK(inode->i_mode)) 851 kfree(inode->i_link); 852 kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode)); 853 } 854 855 static int bpf_fs_xattr_get(const struct xattr_handler *handler, 856 struct dentry *unused, struct inode *inode, 857 const char *name, void *value, size_t size) 858 { 859 struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; 860 struct bpf_fs_inode *bi = BPF_FS_I(inode); 861 862 name = xattr_full_name(handler, name); 863 return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size); 864 } 865 866 enum { 867 BPF_FS_XATTR_UNSPEC, 868 BPF_FS_XATTR_SECURITY, 869 BPF_FS_XATTR_TRUSTED, 870 }; 871 872 static int bpf_fs_xattr_set(const struct xattr_handler *handler, 873 struct mnt_idmap *idmap, struct dentry *unused, 874 struct inode *inode, const char *name, 875 const void *value, size_t size, int flags) 876 { 877 struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; 878 struct bpf_fs_inode *bi = BPF_FS_I(inode); 879 struct simple_xattr *old; 880 int err = -EINVAL; 881 882 name = xattr_full_name(handler, name); 883 switch (handler->flags) { 884 case BPF_FS_XATTR_SECURITY: 885 err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs, 886 &bi->xlimits, name, value, size, 887 flags); 888 break; 889 case BPF_FS_XATTR_TRUSTED: 890 old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name, 891 value, size, flags); 892 err = IS_ERR(old) ? PTR_ERR(old) : 0; 893 if (!err) 894 simple_xattr_free_rcu(old); 895 break; 896 } 897 if (err) 898 return err; 899 inode_set_ctime_current(inode); 900 return 0; 901 } 902 903 static const struct xattr_handler bpf_fs_trusted_xattr_handler = { 904 .prefix = XATTR_TRUSTED_PREFIX, 905 .flags = BPF_FS_XATTR_TRUSTED, 906 .get = bpf_fs_xattr_get, 907 .set = bpf_fs_xattr_set, 908 }; 909 910 static const struct xattr_handler bpf_fs_security_xattr_handler = { 911 .prefix = XATTR_SECURITY_PREFIX, 912 .flags = BPF_FS_XATTR_SECURITY, 913 .get = bpf_fs_xattr_get, 914 .set = bpf_fs_xattr_set, 915 }; 916 917 static const struct xattr_handler * const bpf_fs_xattr_handlers[] = { 918 &bpf_fs_trusted_xattr_handler, 919 &bpf_fs_security_xattr_handler, 920 NULL, 921 }; 922 923 static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size) 924 { 925 struct inode *inode = d_inode(dentry); 926 927 return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size); 928 } 929 930 static int bpf_fs_initxattrs(struct inode *inode, 931 const struct xattr *xattr_array, void *fs_info) 932 { 933 struct bpf_mount_opts *opts = inode->i_sb->s_fs_info; 934 struct bpf_fs_inode *bi = BPF_FS_I(inode); 935 const struct xattr *xattr; 936 int err; 937 938 for (xattr = xattr_array; xattr->name != NULL; xattr++) { 939 CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len); 940 if (IS_ERR(new_xattr)) 941 return PTR_ERR(new_xattr); 942 943 new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT, 944 XATTR_SECURITY_PREFIX "%s", 945 xattr->name); 946 if (!new_xattr->name) 947 return -ENOMEM; 948 949 err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs, 950 &bi->xlimits, new_xattr); 951 if (err) 952 return err; 953 954 retain_and_null_ptr(new_xattr); 955 } 956 return 0; 957 } 958 959 const struct super_operations bpf_super_ops = { 960 .statfs = simple_statfs, 961 .drop_inode = inode_just_drop, 962 .show_options = bpf_show_options, 963 .alloc_inode = bpf_fs_alloc_inode, 964 .destroy_inode = bpf_destroy_inode, 965 .free_inode = bpf_free_inode, 966 }; 967 968 enum { 969 OPT_UID, 970 OPT_GID, 971 OPT_MODE, 972 OPT_DELEGATE_CMDS, 973 OPT_DELEGATE_MAPS, 974 OPT_DELEGATE_PROGS, 975 OPT_DELEGATE_ATTACHS, 976 }; 977 978 static const struct fs_parameter_spec bpf_fs_parameters[] = { 979 fsparam_u32 ("uid", OPT_UID), 980 fsparam_u32 ("gid", OPT_GID), 981 fsparam_u32oct ("mode", OPT_MODE), 982 fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), 983 fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), 984 fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS), 985 fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS), 986 {} 987 }; 988 989 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) 990 { 991 struct bpf_mount_opts *opts = fc->s_fs_info; 992 struct fs_parse_result result; 993 kuid_t uid; 994 kgid_t gid; 995 int opt, err; 996 997 opt = fs_parse(fc, bpf_fs_parameters, param, &result); 998 if (opt < 0) { 999 /* We might like to report bad mount options here, but 1000 * traditionally we've ignored all mount options, so we'd 1001 * better continue to ignore non-existing options for bpf. 1002 */ 1003 if (opt == -ENOPARAM) { 1004 opt = vfs_parse_fs_param_source(fc, param); 1005 if (opt != -ENOPARAM) 1006 return opt; 1007 1008 return 0; 1009 } 1010 1011 if (opt < 0) 1012 return opt; 1013 } 1014 1015 switch (opt) { 1016 case OPT_UID: 1017 uid = make_kuid(current_user_ns(), result.uint_32); 1018 if (!uid_valid(uid)) 1019 goto bad_value; 1020 1021 /* 1022 * The requested uid must be representable in the 1023 * filesystem's idmapping. 1024 */ 1025 if (!kuid_has_mapping(fc->user_ns, uid)) 1026 goto bad_value; 1027 1028 opts->uid = uid; 1029 break; 1030 case OPT_GID: 1031 gid = make_kgid(current_user_ns(), result.uint_32); 1032 if (!gid_valid(gid)) 1033 goto bad_value; 1034 1035 /* 1036 * The requested gid must be representable in the 1037 * filesystem's idmapping. 1038 */ 1039 if (!kgid_has_mapping(fc->user_ns, gid)) 1040 goto bad_value; 1041 1042 opts->gid = gid; 1043 break; 1044 case OPT_MODE: 1045 opts->mode = result.uint_32 & S_IALLUGO; 1046 break; 1047 case OPT_DELEGATE_CMDS: 1048 case OPT_DELEGATE_MAPS: 1049 case OPT_DELEGATE_PROGS: 1050 case OPT_DELEGATE_ATTACHS: { 1051 struct bpffs_btf_enums info; 1052 const struct btf_type *enum_t; 1053 const char *enum_pfx; 1054 u64 *delegate_msk, msk = 0; 1055 char *p, *str; 1056 int val; 1057 1058 /* ignore errors, fallback to hex */ 1059 (void)find_bpffs_btf_enums(&info); 1060 1061 switch (opt) { 1062 case OPT_DELEGATE_CMDS: 1063 delegate_msk = &opts->delegate_cmds; 1064 enum_t = info.cmd_t; 1065 enum_pfx = "BPF_"; 1066 break; 1067 case OPT_DELEGATE_MAPS: 1068 delegate_msk = &opts->delegate_maps; 1069 enum_t = info.map_t; 1070 enum_pfx = "BPF_MAP_TYPE_"; 1071 break; 1072 case OPT_DELEGATE_PROGS: 1073 delegate_msk = &opts->delegate_progs; 1074 enum_t = info.prog_t; 1075 enum_pfx = "BPF_PROG_TYPE_"; 1076 break; 1077 case OPT_DELEGATE_ATTACHS: 1078 delegate_msk = &opts->delegate_attachs; 1079 enum_t = info.attach_t; 1080 enum_pfx = "BPF_"; 1081 break; 1082 default: 1083 return -EINVAL; 1084 } 1085 1086 str = param->string; 1087 while ((p = strsep(&str, ":"))) { 1088 if (strcmp(p, "any") == 0) { 1089 msk |= ~0ULL; 1090 } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { 1091 msk |= 1ULL << val; 1092 } else { 1093 err = kstrtou64(p, 0, &msk); 1094 if (err) 1095 return err; 1096 } 1097 } 1098 1099 /* Setting delegation mount options requires privileges */ 1100 if (msk && !capable(CAP_SYS_ADMIN)) 1101 return -EPERM; 1102 1103 *delegate_msk |= msk; 1104 break; 1105 } 1106 default: 1107 /* ignore unknown mount options */ 1108 break; 1109 } 1110 1111 return 0; 1112 bad_value: 1113 return invalfc(fc, "Bad value for '%s'", param->key); 1114 } 1115 1116 struct bpf_preload_ops *bpf_preload_ops; 1117 EXPORT_SYMBOL_GPL(bpf_preload_ops); 1118 1119 static bool bpf_preload_mod_get(void) 1120 { 1121 /* If bpf_preload.ko wasn't loaded earlier then load it now. 1122 * When bpf_preload is built into vmlinux the module's __init 1123 * function will populate it. 1124 */ 1125 if (!bpf_preload_ops) { 1126 request_module("bpf_preload"); 1127 if (!bpf_preload_ops) 1128 return false; 1129 } 1130 /* And grab the reference, so the module doesn't disappear while the 1131 * kernel is interacting with the kernel module and its UMD. 1132 */ 1133 if (!try_module_get(bpf_preload_ops->owner)) { 1134 pr_err("bpf_preload module get failed.\n"); 1135 return false; 1136 } 1137 return true; 1138 } 1139 1140 static void bpf_preload_mod_put(void) 1141 { 1142 if (bpf_preload_ops) 1143 /* now user can "rmmod bpf_preload" if necessary */ 1144 module_put(bpf_preload_ops->owner); 1145 } 1146 1147 static DEFINE_MUTEX(bpf_preload_lock); 1148 1149 static int populate_bpffs(struct dentry *parent) 1150 { 1151 struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; 1152 int err = 0, i; 1153 1154 /* grab the mutex to make sure the kernel interactions with bpf_preload 1155 * are serialized 1156 */ 1157 mutex_lock(&bpf_preload_lock); 1158 1159 /* if bpf_preload.ko wasn't built into vmlinux then load it */ 1160 if (!bpf_preload_mod_get()) 1161 goto out; 1162 1163 err = bpf_preload_ops->preload(objs); 1164 if (err) 1165 goto out_put; 1166 for (i = 0; i < BPF_PRELOAD_LINKS; i++) { 1167 bpf_link_inc(objs[i].link); 1168 err = bpf_iter_link_pin_kernel(parent, 1169 objs[i].link_name, objs[i].link); 1170 if (err) { 1171 bpf_link_put(objs[i].link); 1172 goto out_put; 1173 } 1174 } 1175 out_put: 1176 bpf_preload_mod_put(); 1177 out: 1178 mutex_unlock(&bpf_preload_lock); 1179 return err; 1180 } 1181 1182 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) 1183 { 1184 struct bpf_mount_opts *opts = sb->s_fs_info; 1185 struct inode *inode; 1186 1187 /* Mounting an instance of BPF FS requires privileges */ 1188 if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) 1189 return -EPERM; 1190 1191 sb->s_blocksize = PAGE_SIZE; 1192 sb->s_blocksize_bits = PAGE_SHIFT; 1193 sb->s_magic = BPF_FS_MAGIC; 1194 sb->s_op = &bpf_super_ops; 1195 sb->s_xattr = bpf_fs_xattr_handlers; 1196 sb->s_iflags |= SB_I_NOEXEC; 1197 sb->s_iflags |= SB_I_NODEV; 1198 sb->s_time_gran = 1; 1199 1200 inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777); 1201 if (IS_ERR(inode)) 1202 return PTR_ERR(inode); 1203 1204 inode->i_ino = 1; 1205 inode->i_op = &bpf_dir_iops; 1206 inode->i_fop = &simple_dir_operations; 1207 set_nlink(inode, 2); 1208 1209 sb->s_root = d_make_root(inode); 1210 if (!sb->s_root) 1211 return -ENOMEM; 1212 1213 inode = d_inode(sb->s_root); 1214 inode->i_uid = opts->uid; 1215 inode->i_gid = opts->gid; 1216 inode->i_mode &= ~S_IALLUGO; 1217 populate_bpffs(sb->s_root); 1218 inode->i_mode |= S_ISVTX | opts->mode; 1219 return 0; 1220 } 1221 1222 static int bpf_get_tree(struct fs_context *fc) 1223 { 1224 return get_tree_nodev(fc, bpf_fill_super); 1225 } 1226 1227 static void bpf_free_fc(struct fs_context *fc) 1228 { 1229 kfree(fc->s_fs_info); 1230 } 1231 1232 static const struct fs_context_operations bpf_context_ops = { 1233 .free = bpf_free_fc, 1234 .parse_param = bpf_parse_param, 1235 .get_tree = bpf_get_tree, 1236 }; 1237 1238 /* 1239 * Set up the filesystem mount context. 1240 */ 1241 static int bpf_init_fs_context(struct fs_context *fc) 1242 { 1243 struct bpf_mount_opts *opts; 1244 1245 opts = kzalloc_obj(struct bpf_mount_opts); 1246 if (!opts) 1247 return -ENOMEM; 1248 1249 opts->mode = S_IRWXUGO; 1250 opts->uid = current_fsuid(); 1251 opts->gid = current_fsgid(); 1252 1253 /* start out with no BPF token delegation enabled */ 1254 opts->delegate_cmds = 0; 1255 opts->delegate_maps = 0; 1256 opts->delegate_progs = 0; 1257 opts->delegate_attachs = 0; 1258 1259 fc->s_fs_info = opts; 1260 fc->ops = &bpf_context_ops; 1261 return 0; 1262 } 1263 1264 static void bpf_kill_super(struct super_block *sb) 1265 { 1266 struct bpf_mount_opts *opts = sb->s_fs_info; 1267 1268 kill_anon_super(sb); 1269 simple_xattr_cache_cleanup(&opts->xa_cache); 1270 kfree(opts); 1271 } 1272 1273 static struct file_system_type bpf_fs_type = { 1274 .owner = THIS_MODULE, 1275 .name = "bpf", 1276 .init_fs_context = bpf_init_fs_context, 1277 .parameters = bpf_fs_parameters, 1278 .kill_sb = bpf_kill_super, 1279 .fs_flags = FS_USERNS_MOUNT, 1280 }; 1281 1282 static void bpf_fs_inode_init_once(void *foo) 1283 { 1284 struct bpf_fs_inode *bi = foo; 1285 1286 inode_init_once(&bi->vfs_inode); 1287 } 1288 1289 static int __init bpf_init(void) 1290 { 1291 int ret; 1292 1293 bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache", 1294 sizeof(struct bpf_fs_inode), 1295 0, SLAB_ACCOUNT, 1296 bpf_fs_inode_init_once); 1297 if (!bpf_fs_inode_cachep) 1298 return -ENOMEM; 1299 1300 ret = sysfs_create_mount_point(fs_kobj, "bpf"); 1301 if (ret) 1302 goto out_cache; 1303 1304 ret = register_filesystem(&bpf_fs_type); 1305 if (ret) { 1306 sysfs_remove_mount_point(fs_kobj, "bpf"); 1307 goto out_cache; 1308 } 1309 1310 return 0; 1311 out_cache: 1312 kmem_cache_destroy(bpf_fs_inode_cachep); 1313 return ret; 1314 } 1315 fs_initcall(bpf_init); 1316