xref: /linux/kernel/bpf/inode.c (revision 9722955b54307e9070994f2382ec06af3d7405e0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Minimal file system backend for holding eBPF maps and programs,
4  * used by bpf(2) object pinning.
5  *
6  * Authors:
7  *
8  *	Daniel Borkmann <daniel@iogearbox.net>
9  */
10 
11 #include <linux/init.h>
12 #include <linux/magic.h>
13 #include <linux/major.h>
14 #include <linux/mount.h>
15 #include <linux/namei.h>
16 #include <linux/fs.h>
17 #include <linux/fs_context.h>
18 #include <linux/fs_parser.h>
19 #include <linux/kdev_t.h>
20 #include <linux/filter.h>
21 #include <linux/bpf.h>
22 #include <linux/bpf_trace.h>
23 #include <linux/kstrtox.h>
24 #include <linux/xattr.h>
25 #include <linux/security.h>
26 
27 #include "preload/bpf_preload.h"
28 
/* Kind of BPF object that a bpffs regular-file inode pins. */
enum bpf_type {
	BPF_TYPE_UNSPEC	= 0,	/* inode does not carry a pinned object */
	BPF_TYPE_PROG,		/* i_private is a struct bpf_prog */
	BPF_TYPE_MAP,		/* i_private is a struct bpf_map */
	BPF_TYPE_LINK,		/* i_private is a struct bpf_link */
};
35 
/* Per-inode bpffs state: the VFS inode plus its in-memory xattrs. */
struct bpf_fs_inode {
	struct list_head		xattrs;		/* xattrs attached to this inode */
	struct simple_xattr_limits	xlimits;	/* limits applied to security.* xattrs */
	struct inode			vfs_inode;	/* embedded VFS inode, see BPF_FS_I() */
};
41 
42 static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode)
43 {
44 	return container_of(inode, struct bpf_fs_inode, vfs_inode);
45 }
46 
/* Slab cache backing struct bpf_fs_inode; created once in bpf_init(). */
static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init;

/* Forward declarations: both are referenced before their definitions. */
static int bpf_fs_initxattrs(struct inode *inode,
			     const struct xattr *xattr_array, void *fs_info);
static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size);
52 
53 static void *bpf_any_get(void *raw, enum bpf_type type)
54 {
55 	switch (type) {
56 	case BPF_TYPE_PROG:
57 		bpf_prog_inc(raw);
58 		break;
59 	case BPF_TYPE_MAP:
60 		bpf_map_inc_with_uref(raw);
61 		break;
62 	case BPF_TYPE_LINK:
63 		bpf_link_inc(raw);
64 		break;
65 	default:
66 		WARN_ON_ONCE(1);
67 		break;
68 	}
69 
70 	return raw;
71 }
72 
73 static void bpf_any_put(void *raw, enum bpf_type type)
74 {
75 	switch (type) {
76 	case BPF_TYPE_PROG:
77 		bpf_prog_put(raw);
78 		break;
79 	case BPF_TYPE_MAP:
80 		bpf_map_put_with_uref(raw);
81 		break;
82 	case BPF_TYPE_LINK:
83 		bpf_link_put(raw);
84 		break;
85 	default:
86 		WARN_ON_ONCE(1);
87 		break;
88 	}
89 }
90 
91 static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
92 {
93 	void *raw;
94 
95 	raw = bpf_map_get_with_uref(ufd);
96 	if (!IS_ERR(raw)) {
97 		*type = BPF_TYPE_MAP;
98 		return raw;
99 	}
100 
101 	raw = bpf_prog_get(ufd);
102 	if (!IS_ERR(raw)) {
103 		*type = BPF_TYPE_PROG;
104 		return raw;
105 	}
106 
107 	raw = bpf_link_get_from_fd(ufd);
108 	if (!IS_ERR(raw)) {
109 		*type = BPF_TYPE_LINK;
110 		return raw;
111 	}
112 
113 	return ERR_PTR(-EINVAL);
114 }
115 
/* Defined further down; declared here because they are used earlier. */
static const struct inode_operations bpf_dir_iops;
static const struct inode_operations bpf_symlink_iops;

/* The three tables below are identical in content; their addresses are
 * what matters: bpf_inode_type() compares inode->i_op against them to
 * tell which kind of object an inode pins.
 */
static const struct inode_operations bpf_prog_iops = {
	.listxattr	= bpf_fs_listxattr,
};
static const struct inode_operations bpf_map_iops  = {
	.listxattr	= bpf_fs_listxattr,
};
static const struct inode_operations bpf_link_iops  = {
	.listxattr	= bpf_fs_listxattr,
};
128 
129 struct inode *bpf_get_inode(struct super_block *sb,
130 			    const struct inode *dir,
131 			    umode_t mode)
132 {
133 	struct inode *inode;
134 
135 	switch (mode & S_IFMT) {
136 	case S_IFDIR:
137 	case S_IFREG:
138 	case S_IFLNK:
139 		break;
140 	default:
141 		return ERR_PTR(-EINVAL);
142 	}
143 
144 	inode = new_inode(sb);
145 	if (!inode)
146 		return ERR_PTR(-ENOSPC);
147 
148 	inode->i_ino = get_next_ino();
149 	simple_inode_init_ts(inode);
150 
151 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
152 
153 	return inode;
154 }
155 
156 static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
157 {
158 	*type = BPF_TYPE_UNSPEC;
159 	if (inode->i_op == &bpf_prog_iops)
160 		*type = BPF_TYPE_PROG;
161 	else if (inode->i_op == &bpf_map_iops)
162 		*type = BPF_TYPE_MAP;
163 	else if (inode->i_op == &bpf_link_iops)
164 		*type = BPF_TYPE_LINK;
165 	else
166 		return -EACCES;
167 
168 	return 0;
169 }
170 
171 static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
172 				struct inode *dir)
173 {
174 	d_make_persistent(dentry, inode);
175 
176 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
177 }
178 
179 static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
180 				struct dentry *dentry, umode_t mode)
181 {
182 	struct inode *inode;
183 	int ret;
184 
185 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
186 	if (IS_ERR(inode))
187 		return ERR_CAST(inode);
188 
189 	ret = security_inode_init_security(inode, dir, &dentry->d_name,
190 					   bpf_fs_initxattrs, NULL);
191 	if (ret && ret != -EOPNOTSUPP) {
192 		iput(inode);
193 		return ERR_PTR(ret);
194 	}
195 
196 	inode->i_op = &bpf_dir_iops;
197 	inode->i_fop = &simple_dir_operations;
198 
199 	inc_nlink(inode);
200 	inc_nlink(dir);
201 
202 	bpf_dentry_finalize(dentry, inode, dir);
203 	return NULL;
204 }
205 
/* Per-open iteration state for reading a pinned map through seq_file. */
struct map_iter {
	void *key;	/* buffer of map->key_size bytes holding the current key */
	bool done;	/* set once map_get_next_key() reports no further key */
};
210 
211 static struct map_iter *map_iter(struct seq_file *m)
212 {
213 	return m->private;
214 }
215 
216 static struct bpf_map *seq_file_to_map(struct seq_file *m)
217 {
218 	return file_inode(m->file)->i_private;
219 }
220 
221 static void map_iter_free(struct map_iter *iter)
222 {
223 	if (iter) {
224 		kfree(iter->key);
225 		kfree(iter);
226 	}
227 }
228 
229 static struct map_iter *map_iter_alloc(struct bpf_map *map)
230 {
231 	struct map_iter *iter;
232 
233 	iter = kzalloc_obj(*iter, GFP_KERNEL | __GFP_NOWARN);
234 	if (!iter)
235 		goto error;
236 
237 	iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
238 	if (!iter->key)
239 		goto error;
240 
241 	return iter;
242 
243 error:
244 	map_iter_free(iter);
245 	return NULL;
246 }
247 
248 static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
249 {
250 	struct bpf_map *map = seq_file_to_map(m);
251 	void *key = map_iter(m)->key;
252 	void *prev_key;
253 
254 	(*pos)++;
255 	if (map_iter(m)->done)
256 		return NULL;
257 
258 	if (unlikely(v == SEQ_START_TOKEN))
259 		prev_key = NULL;
260 	else
261 		prev_key = key;
262 
263 	rcu_read_lock();
264 	if (map->ops->map_get_next_key(map, prev_key, key)) {
265 		map_iter(m)->done = true;
266 		key = NULL;
267 	}
268 	rcu_read_unlock();
269 	return key;
270 }
271 
272 static void *map_seq_start(struct seq_file *m, loff_t *pos)
273 {
274 	if (map_iter(m)->done)
275 		return NULL;
276 
277 	return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
278 }
279 
/* seq_file ->stop: intentionally empty, there is nothing to release here. */
static void map_seq_stop(struct seq_file *m, void *v)
{
}
283 
284 static int map_seq_show(struct seq_file *m, void *v)
285 {
286 	struct bpf_map *map = seq_file_to_map(m);
287 	void *key = map_iter(m)->key;
288 
289 	if (unlikely(v == SEQ_START_TOKEN)) {
290 		seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
291 		seq_puts(m, "# WARNING!! The output format will change\n");
292 	} else {
293 		map->ops->map_seq_show_elem(map, key, m);
294 	}
295 
296 	return 0;
297 }
298 
/* seq_file callbacks used when a pinned map is opened for reading. */
static const struct seq_operations bpffs_map_seq_ops = {
	.start	= map_seq_start,
	.next	= map_seq_next,
	.show	= map_seq_show,
	.stop	= map_seq_stop,
};
305 
306 static int bpffs_map_open(struct inode *inode, struct file *file)
307 {
308 	struct bpf_map *map = inode->i_private;
309 	struct map_iter *iter;
310 	struct seq_file *m;
311 	int err;
312 
313 	iter = map_iter_alloc(map);
314 	if (!iter)
315 		return -ENOMEM;
316 
317 	err = seq_open(file, &bpffs_map_seq_ops);
318 	if (err) {
319 		map_iter_free(iter);
320 		return err;
321 	}
322 
323 	m = file->private_data;
324 	m->private = iter;
325 
326 	return 0;
327 }
328 
329 static int bpffs_map_release(struct inode *inode, struct file *file)
330 {
331 	struct seq_file *m = file->private_data;
332 
333 	map_iter_free(map_iter(m));
334 
335 	return seq_release(inode, file);
336 }
337 
/* bpffs_map_fops should only implement the basic read operation for a
 * BPF map.  The purpose is to provide a simple, intuitive way for users
 * to do "cat bpffs/pathto/a-pinned-map".
 *
 * Other operations (e.g. write, lookup...) should be implemented by
 * userspace tools (e.g. bpftool) through BPF_OBJ_GET_INFO_BY_FD and
 * the map's lookup/update interface.
 */
static const struct file_operations bpffs_map_fops = {
	.open		= bpffs_map_open,
	.read		= seq_read,
	.release	= bpffs_map_release,
};
353 
/* ->open for pinned objects without a readable representation:
 * always fails with -EIO.
 */
static int bpffs_obj_open(struct inode *inode, struct file *file)
{
	return -EIO;
}
358 
/* File operations for pinned objects that cannot be opened as files. */
static const struct file_operations bpffs_obj_fops = {
	.open		= bpffs_obj_open,
};
362 
363 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
364 			 const struct inode_operations *iops,
365 			 const struct file_operations *fops)
366 {
367 	struct inode *dir = dentry->d_parent->d_inode;
368 	struct inode *inode;
369 	int ret;
370 
371 	inode = bpf_get_inode(dir->i_sb, dir, mode);
372 	if (IS_ERR(inode))
373 		return PTR_ERR(inode);
374 
375 	ret = security_inode_init_security(inode, dir, &dentry->d_name,
376 					   bpf_fs_initxattrs, NULL);
377 	if (ret && ret != -EOPNOTSUPP) {
378 		iput(inode);
379 		return ret;
380 	}
381 
382 	inode->i_op = iops;
383 	inode->i_fop = fops;
384 	inode->i_private = raw;
385 
386 	bpf_dentry_finalize(dentry, inode, dir);
387 	return 0;
388 }
389 
390 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
391 {
392 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops,
393 			     &bpffs_obj_fops);
394 }
395 
396 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
397 {
398 	struct bpf_map *map = arg;
399 
400 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
401 			     bpf_map_support_seq_show(map) ?
402 			     &bpffs_map_fops : &bpffs_obj_fops);
403 }
404 
405 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
406 {
407 	struct bpf_link *link = arg;
408 
409 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
410 			     bpf_link_is_iter(link) ?
411 			     &bpf_iter_fops : &bpffs_obj_fops);
412 }
413 
414 static struct dentry *
415 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
416 {
417 	/* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
418 	 * extensions. That allows popoulate_bpffs() create special files.
419 	 */
420 	if ((dir->i_mode & S_IALLUGO) &&
421 	    strchr(dentry->d_name.name, '.'))
422 		return ERR_PTR(-EPERM);
423 
424 	return simple_lookup(dir, dentry, flags);
425 }
426 
427 static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
428 		       struct dentry *dentry, const char *target)
429 {
430 	struct inode *inode;
431 	char *link;
432 	int ret;
433 
434 	link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
435 	if (!link)
436 		return -ENOMEM;
437 
438 	inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK);
439 	if (IS_ERR(inode)) {
440 		kfree(link);
441 		return PTR_ERR(inode);
442 	}
443 
444 	inode->i_op = &bpf_symlink_iops;
445 	inode->i_link = link;
446 
447 	ret = security_inode_init_security(inode, dir, &dentry->d_name,
448 					   bpf_fs_initxattrs, NULL);
449 	if (ret && ret != -EOPNOTSUPP) {
450 		iput(inode);
451 		return ret;
452 	}
453 
454 	bpf_dentry_finalize(dentry, inode, dir);
455 	return 0;
456 }
457 
/* Inode operations for bpffs symlinks; the target lives in i_link. */
static const struct inode_operations bpf_symlink_iops = {
	.get_link	= simple_get_link,
	.listxattr	= bpf_fs_listxattr,
};
462 
/* Inode operations for bpffs directories. The address of this table is
 * also used by bpf_obj_do_pin() to verify that a pin target directory
 * belongs to bpffs.
 */
static const struct inode_operations bpf_dir_iops = {
	.lookup		= bpf_lookup,
	.mkdir		= bpf_mkdir,
	.symlink	= bpf_symlink,
	.rmdir		= simple_rmdir,
	.rename		= simple_rename,
	.link		= simple_link,
	.unlink		= simple_unlink,
	.listxattr	= bpf_fs_listxattr,
};
473 
474 /* pin iterator link into bpffs */
475 static int bpf_iter_link_pin_kernel(struct dentry *parent,
476 				    const char *name, struct bpf_link *link)
477 {
478 	umode_t mode = S_IFREG | S_IRUSR;
479 	struct dentry *dentry;
480 	int ret;
481 
482 	dentry = simple_start_creating(parent, name);
483 	if (IS_ERR(dentry))
484 		return PTR_ERR(dentry);
485 	ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
486 			    &bpf_iter_fops);
487 	simple_done_creating(dentry);
488 	return ret;
489 }
490 
491 static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
492 			  enum bpf_type type)
493 {
494 	struct dentry *dentry;
495 	struct inode *dir;
496 	struct path path;
497 	umode_t mode;
498 	int ret;
499 
500 	dentry = start_creating_user_path(path_fd, pathname, &path, 0);
501 	if (IS_ERR(dentry))
502 		return PTR_ERR(dentry);
503 
504 	dir = d_inode(path.dentry);
505 	if (dir->i_op != &bpf_dir_iops) {
506 		ret = -EPERM;
507 		goto out;
508 	}
509 
510 	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
511 	ret = security_path_mknod(&path, dentry, mode, 0);
512 	if (ret)
513 		goto out;
514 
515 	switch (type) {
516 	case BPF_TYPE_PROG:
517 		ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
518 		break;
519 	case BPF_TYPE_MAP:
520 		ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
521 		break;
522 	case BPF_TYPE_LINK:
523 		ret = vfs_mkobj(dentry, mode, bpf_mklink, raw);
524 		break;
525 	default:
526 		ret = -EPERM;
527 	}
528 out:
529 	end_creating_path(&path, dentry);
530 	return ret;
531 }
532 
533 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname)
534 {
535 	enum bpf_type type;
536 	void *raw;
537 	int ret;
538 
539 	raw = bpf_fd_probe_obj(ufd, &type);
540 	if (IS_ERR(raw))
541 		return PTR_ERR(raw);
542 
543 	ret = bpf_obj_do_pin(path_fd, pathname, raw, type);
544 	if (ret != 0)
545 		bpf_any_put(raw, type);
546 
547 	return ret;
548 }
549 
550 static void *bpf_obj_do_get(int path_fd, const char __user *pathname,
551 			    enum bpf_type *type, int flags)
552 {
553 	struct inode *inode;
554 	struct path path;
555 	void *raw;
556 	int ret;
557 
558 	ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path);
559 	if (ret)
560 		return ERR_PTR(ret);
561 
562 	inode = d_backing_inode(path.dentry);
563 	ret = path_permission(&path, ACC_MODE(flags));
564 	if (ret)
565 		goto out;
566 
567 	ret = bpf_inode_type(inode, type);
568 	if (ret)
569 		goto out;
570 
571 	raw = bpf_any_get(inode->i_private, *type);
572 	if (!IS_ERR(raw))
573 		touch_atime(&path);
574 
575 	path_put(&path);
576 	return raw;
577 out:
578 	path_put(&path);
579 	return ERR_PTR(ret);
580 }
581 
582 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags)
583 {
584 	enum bpf_type type = BPF_TYPE_UNSPEC;
585 	int f_flags;
586 	void *raw;
587 	int ret;
588 
589 	f_flags = bpf_get_file_flag(flags);
590 	if (f_flags < 0)
591 		return f_flags;
592 
593 	raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags);
594 	if (IS_ERR(raw))
595 		return PTR_ERR(raw);
596 
597 	if (type == BPF_TYPE_PROG)
598 		ret = bpf_prog_new_fd(raw);
599 	else if (type == BPF_TYPE_MAP)
600 		ret = bpf_map_new_fd(raw, f_flags);
601 	else if (type == BPF_TYPE_LINK)
602 		ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
603 	else
604 		return -ENOENT;
605 
606 	if (ret < 0)
607 		bpf_any_put(raw, type);
608 	return ret;
609 }
610 
611 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
612 {
613 	struct bpf_prog *prog;
614 	int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
615 	if (ret)
616 		return ERR_PTR(ret);
617 
618 	if (inode->i_op == &bpf_map_iops)
619 		return ERR_PTR(-EINVAL);
620 	if (inode->i_op == &bpf_link_iops)
621 		return ERR_PTR(-EINVAL);
622 	if (inode->i_op != &bpf_prog_iops)
623 		return ERR_PTR(-EACCES);
624 
625 	prog = inode->i_private;
626 
627 	ret = security_bpf_prog(prog);
628 	if (ret < 0)
629 		return ERR_PTR(ret);
630 
631 	if (!bpf_prog_get_ok(prog, &type, false))
632 		return ERR_PTR(-EINVAL);
633 
634 	bpf_prog_inc(prog);
635 	return prog;
636 }
637 
638 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
639 {
640 	struct bpf_prog *prog;
641 	struct path path;
642 	int ret = kern_path(name, LOOKUP_FOLLOW, &path);
643 	if (ret)
644 		return ERR_PTR(ret);
645 	prog = __get_prog_inode(d_backing_inode(path.dentry), type);
646 	if (!IS_ERR(prog))
647 		touch_atime(&path);
648 	path_put(&path);
649 	return prog;
650 }
651 EXPORT_SYMBOL(bpf_prog_get_type_path);
652 
/* vmlinux BTF plus the enum types used to translate delegate_* mount
 * option values between symbolic names and bit positions. Filled by
 * find_bpffs_btf_enums(); members are NULL when lookup did not get
 * that far.
 */
struct bpffs_btf_enums {
	const struct btf *btf;
	const struct btf_type *cmd_t;		/* enum bpf_cmd */
	const struct btf_type *map_t;		/* enum bpf_map_type */
	const struct btf_type *prog_t;		/* enum bpf_prog_type */
	const struct btf_type *attach_t;	/* enum bpf_attach_type */
};
660 
661 static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
662 {
663 	struct {
664 		const struct btf_type **type;
665 		const char *name;
666 	} btf_enums[] = {
667 		{&info->cmd_t,		"bpf_cmd"},
668 		{&info->map_t,		"bpf_map_type"},
669 		{&info->prog_t,		"bpf_prog_type"},
670 		{&info->attach_t,	"bpf_attach_type"},
671 	};
672 	const struct btf *btf;
673 	int i, id;
674 
675 	memset(info, 0, sizeof(*info));
676 
677 	btf = bpf_get_btf_vmlinux();
678 	if (IS_ERR(btf))
679 		return PTR_ERR(btf);
680 	if (!btf)
681 		return -ENOENT;
682 
683 	info->btf = btf;
684 
685 	for (i = 0; i < ARRAY_SIZE(btf_enums); i++) {
686 		id = btf_find_by_name_kind(btf, btf_enums[i].name,
687 					   BTF_KIND_ENUM);
688 		if (id < 0)
689 			return -ESRCH;
690 
691 		*btf_enums[i].type = btf_type_by_id(btf, id);
692 	}
693 
694 	return 0;
695 }
696 
697 static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
698 				const char *prefix, const char *str, int *value)
699 {
700 	const struct btf_enum *e;
701 	const char *name;
702 	int i, n, pfx_len = strlen(prefix);
703 
704 	*value = 0;
705 
706 	if (!btf || !enum_t)
707 		return false;
708 
709 	for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
710 		e = &btf_enum(enum_t)[i];
711 
712 		name = btf_name_by_offset(btf, e->name_off);
713 		if (!name || strncasecmp(name, prefix, pfx_len) != 0)
714 			continue;
715 
716 		/* match symbolic name case insensitive and ignoring prefix */
717 		if (strcasecmp(name + pfx_len, str) == 0) {
718 			*value = e->val;
719 			return true;
720 		}
721 	}
722 
723 	return false;
724 }
725 
726 static void seq_print_delegate_opts(struct seq_file *m,
727 				    const char *opt_name,
728 				    const struct btf *btf,
729 				    const struct btf_type *enum_t,
730 				    const char *prefix,
731 				    u64 delegate_msk, u64 any_msk)
732 {
733 	const struct btf_enum *e;
734 	bool first = true;
735 	const char *name;
736 	u64 msk;
737 	int i, n, pfx_len = strlen(prefix);
738 
739 	delegate_msk &= any_msk; /* clear unknown bits */
740 
741 	if (delegate_msk == 0)
742 		return;
743 
744 	seq_printf(m, ",%s", opt_name);
745 	if (delegate_msk == any_msk) {
746 		seq_printf(m, "=any");
747 		return;
748 	}
749 
750 	if (btf && enum_t) {
751 		for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
752 			e = &btf_enum(enum_t)[i];
753 			name = btf_name_by_offset(btf, e->name_off);
754 			if (!name || strncasecmp(name, prefix, pfx_len) != 0)
755 				continue;
756 			msk = 1ULL << e->val;
757 			if (delegate_msk & msk) {
758 				/* emit lower-case name without prefix */
759 				seq_putc(m, first ? '=' : ':');
760 				name += pfx_len;
761 				while (*name) {
762 					seq_putc(m, tolower(*name));
763 					name++;
764 				}
765 
766 				delegate_msk &= ~msk;
767 				first = false;
768 			}
769 		}
770 	}
771 	if (delegate_msk)
772 		seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
773 }
774 
775 /*
776  * Display the mount options in /proc/mounts.
777  */
778 static int bpf_show_options(struct seq_file *m, struct dentry *root)
779 {
780 	struct inode *inode = d_inode(root);
781 	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
782 	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
783 	u64 mask;
784 
785 	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
786 		seq_printf(m, ",uid=%u",
787 			   from_kuid_munged(&init_user_ns, inode->i_uid));
788 	if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
789 		seq_printf(m, ",gid=%u",
790 			   from_kgid_munged(&init_user_ns, inode->i_gid));
791 	if (mode != S_IRWXUGO)
792 		seq_printf(m, ",mode=%o", mode);
793 
794 	if (opts->delegate_cmds || opts->delegate_maps ||
795 	    opts->delegate_progs || opts->delegate_attachs) {
796 		struct bpffs_btf_enums info;
797 
798 		/* ignore errors, fallback to hex */
799 		(void)find_bpffs_btf_enums(&info);
800 
801 		mask = (1ULL << __MAX_BPF_CMD) - 1;
802 		seq_print_delegate_opts(m, "delegate_cmds",
803 					info.btf, info.cmd_t, "BPF_",
804 					opts->delegate_cmds, mask);
805 
806 		mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
807 		seq_print_delegate_opts(m, "delegate_maps",
808 					info.btf, info.map_t, "BPF_MAP_TYPE_",
809 					opts->delegate_maps, mask);
810 
811 		mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
812 		seq_print_delegate_opts(m, "delegate_progs",
813 					info.btf, info.prog_t, "BPF_PROG_TYPE_",
814 					opts->delegate_progs, mask);
815 
816 		mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
817 		seq_print_delegate_opts(m, "delegate_attachs",
818 					info.btf, info.attach_t, "BPF_",
819 					opts->delegate_attachs, mask);
820 	}
821 
822 	return 0;
823 }
824 
825 static struct inode *bpf_fs_alloc_inode(struct super_block *sb)
826 {
827 	struct bpf_fs_inode *bi;
828 
829 	bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL);
830 	if (!bi)
831 		return NULL;
832 	INIT_LIST_HEAD_RCU(&bi->xattrs);
833 	simple_xattr_limits_init(&bi->xlimits);
834 	return &bi->vfs_inode;
835 }
836 
837 static void bpf_destroy_inode(struct inode *inode)
838 {
839 	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
840 	struct bpf_fs_inode *bi = BPF_FS_I(inode);
841 	enum bpf_type type;
842 
843 	if (!bpf_inode_type(inode, &type))
844 		bpf_any_put(inode->i_private, type);
845 	simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL);
846 }
847 
848 static void bpf_free_inode(struct inode *inode)
849 {
850 	if (S_ISLNK(inode->i_mode))
851 		kfree(inode->i_link);
852 	kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode));
853 }
854 
855 static int bpf_fs_xattr_get(const struct xattr_handler *handler,
856 			    struct dentry *unused, struct inode *inode,
857 			    const char *name, void *value, size_t size)
858 {
859 	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
860 	struct bpf_fs_inode *bi = BPF_FS_I(inode);
861 
862 	name = xattr_full_name(handler, name);
863 	return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size);
864 }
865 
/* Values stored in xattr_handler.flags so that bpf_fs_xattr_set() can
 * tell which namespace it is serving.
 */
enum {
	BPF_FS_XATTR_UNSPEC,
	BPF_FS_XATTR_SECURITY,
	BPF_FS_XATTR_TRUSTED,
};
871 
872 static int bpf_fs_xattr_set(const struct xattr_handler *handler,
873 			    struct mnt_idmap *idmap, struct dentry *unused,
874 			    struct inode *inode, const char *name,
875 			    const void *value, size_t size, int flags)
876 {
877 	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
878 	struct bpf_fs_inode *bi = BPF_FS_I(inode);
879 	struct simple_xattr *old;
880 	int err = -EINVAL;
881 
882 	name = xattr_full_name(handler, name);
883 	switch (handler->flags) {
884 	case BPF_FS_XATTR_SECURITY:
885 		err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs,
886 					       &bi->xlimits, name, value, size,
887 					       flags);
888 		break;
889 	case BPF_FS_XATTR_TRUSTED:
890 		old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name,
891 				       value, size, flags);
892 		err = IS_ERR(old) ? PTR_ERR(old) : 0;
893 		if (!err)
894 			simple_xattr_free_rcu(old);
895 		break;
896 	}
897 	if (err)
898 		return err;
899 	inode_set_ctime_current(inode);
900 	return 0;
901 }
902 
/* Handler for the trusted.* xattr namespace. */
static const struct xattr_handler bpf_fs_trusted_xattr_handler = {
	.prefix	= XATTR_TRUSTED_PREFIX,
	.flags	= BPF_FS_XATTR_TRUSTED,
	.get	= bpf_fs_xattr_get,
	.set	= bpf_fs_xattr_set,
};

/* Handler for the security.* xattr namespace. */
static const struct xattr_handler bpf_fs_security_xattr_handler = {
	.prefix	= XATTR_SECURITY_PREFIX,
	.flags	= BPF_FS_XATTR_SECURITY,
	.get	= bpf_fs_xattr_get,
	.set	= bpf_fs_xattr_set,
};

/* NULL-terminated handler list installed as sb->s_xattr in bpf_fill_super(). */
static const struct xattr_handler * const bpf_fs_xattr_handlers[] = {
	&bpf_fs_trusted_xattr_handler,
	&bpf_fs_security_xattr_handler,
	NULL,
};
922 
923 static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size)
924 {
925 	struct inode *inode = d_inode(dentry);
926 
927 	return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size);
928 }
929 
/* Callback passed to security_inode_init_security(): store each xattr
 * from @xattr_array (terminated by an entry with a NULL name) on @inode
 * under the security.* prefix, subject to the inode's xattr limits.
 *
 * Returns 0 on success or a negative errno; -ENOMEM if the prefixed
 * name cannot be allocated. @fs_info is unused.
 */
static int bpf_fs_initxattrs(struct inode *inode,
			     const struct xattr *xattr_array, void *fs_info)
{
	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
	struct bpf_fs_inode *bi = BPF_FS_I(inode);
	const struct xattr *xattr;
	int err;

	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
		/* NOTE(review): CLASS() presumably frees new_xattr automatically
		 * on the early returns below - confirm against its definition.
		 */
		CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len);
		if (IS_ERR(new_xattr))
			return PTR_ERR(new_xattr);

		new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT,
					    XATTR_SECURITY_PREFIX "%s",
					    xattr->name);
		if (!new_xattr->name)
			return -ENOMEM;

		err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs,
					       &bi->xlimits, new_xattr);
		if (err)
			return err;

		/* presumably: the list now owns the entry, so disarm the
		 * scope-based cleanup - TODO confirm
		 */
		retain_and_null_ptr(new_xattr);
	}
	return 0;
}
958 
/* Superblock operations for bpffs; installed in bpf_fill_super(). */
const struct super_operations bpf_super_ops = {
	.statfs		= simple_statfs,
	.drop_inode	= inode_just_drop,
	.show_options	= bpf_show_options,
	.alloc_inode	= bpf_fs_alloc_inode,
	.destroy_inode	= bpf_destroy_inode,
	.free_inode	= bpf_free_inode,
};
967 
/* Mount option identifiers; matched to names in bpf_fs_parameters[] and
 * dispatched on in bpf_parse_param().
 */
enum {
	OPT_UID,
	OPT_GID,
	OPT_MODE,
	OPT_DELEGATE_CMDS,
	OPT_DELEGATE_MAPS,
	OPT_DELEGATE_PROGS,
	OPT_DELEGATE_ATTACHS,
};
977 
/* Mount parameter table: uid/gid are u32, mode is octal, and the
 * delegate_* options are strings parsed further by bpf_parse_param().
 */
static const struct fs_parameter_spec bpf_fs_parameters[] = {
	fsparam_u32	("uid",				OPT_UID),
	fsparam_u32	("gid",				OPT_GID),
	fsparam_u32oct	("mode",			OPT_MODE),
	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
	fsparam_string	("delegate_progs",		OPT_DELEGATE_PROGS),
	fsparam_string	("delegate_attachs",		OPT_DELEGATE_ATTACHS),
	{}
};
988 
989 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
990 {
991 	struct bpf_mount_opts *opts = fc->s_fs_info;
992 	struct fs_parse_result result;
993 	kuid_t uid;
994 	kgid_t gid;
995 	int opt, err;
996 
997 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
998 	if (opt < 0) {
999 		/* We might like to report bad mount options here, but
1000 		 * traditionally we've ignored all mount options, so we'd
1001 		 * better continue to ignore non-existing options for bpf.
1002 		 */
1003 		if (opt == -ENOPARAM) {
1004 			opt = vfs_parse_fs_param_source(fc, param);
1005 			if (opt != -ENOPARAM)
1006 				return opt;
1007 
1008 			return 0;
1009 		}
1010 
1011 		if (opt < 0)
1012 			return opt;
1013 	}
1014 
1015 	switch (opt) {
1016 	case OPT_UID:
1017 		uid = make_kuid(current_user_ns(), result.uint_32);
1018 		if (!uid_valid(uid))
1019 			goto bad_value;
1020 
1021 		/*
1022 		 * The requested uid must be representable in the
1023 		 * filesystem's idmapping.
1024 		 */
1025 		if (!kuid_has_mapping(fc->user_ns, uid))
1026 			goto bad_value;
1027 
1028 		opts->uid = uid;
1029 		break;
1030 	case OPT_GID:
1031 		gid = make_kgid(current_user_ns(), result.uint_32);
1032 		if (!gid_valid(gid))
1033 			goto bad_value;
1034 
1035 		/*
1036 		 * The requested gid must be representable in the
1037 		 * filesystem's idmapping.
1038 		 */
1039 		if (!kgid_has_mapping(fc->user_ns, gid))
1040 			goto bad_value;
1041 
1042 		opts->gid = gid;
1043 		break;
1044 	case OPT_MODE:
1045 		opts->mode = result.uint_32 & S_IALLUGO;
1046 		break;
1047 	case OPT_DELEGATE_CMDS:
1048 	case OPT_DELEGATE_MAPS:
1049 	case OPT_DELEGATE_PROGS:
1050 	case OPT_DELEGATE_ATTACHS: {
1051 		struct bpffs_btf_enums info;
1052 		const struct btf_type *enum_t;
1053 		const char *enum_pfx;
1054 		u64 *delegate_msk, msk = 0;
1055 		char *p, *str;
1056 		int val;
1057 
1058 		/* ignore errors, fallback to hex */
1059 		(void)find_bpffs_btf_enums(&info);
1060 
1061 		switch (opt) {
1062 		case OPT_DELEGATE_CMDS:
1063 			delegate_msk = &opts->delegate_cmds;
1064 			enum_t = info.cmd_t;
1065 			enum_pfx = "BPF_";
1066 			break;
1067 		case OPT_DELEGATE_MAPS:
1068 			delegate_msk = &opts->delegate_maps;
1069 			enum_t = info.map_t;
1070 			enum_pfx = "BPF_MAP_TYPE_";
1071 			break;
1072 		case OPT_DELEGATE_PROGS:
1073 			delegate_msk = &opts->delegate_progs;
1074 			enum_t = info.prog_t;
1075 			enum_pfx = "BPF_PROG_TYPE_";
1076 			break;
1077 		case OPT_DELEGATE_ATTACHS:
1078 			delegate_msk = &opts->delegate_attachs;
1079 			enum_t = info.attach_t;
1080 			enum_pfx = "BPF_";
1081 			break;
1082 		default:
1083 			return -EINVAL;
1084 		}
1085 
1086 		str = param->string;
1087 		while ((p = strsep(&str, ":"))) {
1088 			if (strcmp(p, "any") == 0) {
1089 				msk |= ~0ULL;
1090 			} else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
1091 				msk |= 1ULL << val;
1092 			} else {
1093 				err = kstrtou64(p, 0, &msk);
1094 				if (err)
1095 					return err;
1096 			}
1097 		}
1098 
1099 		/* Setting delegation mount options requires privileges */
1100 		if (msk && !capable(CAP_SYS_ADMIN))
1101 			return -EPERM;
1102 
1103 		*delegate_msk |= msk;
1104 		break;
1105 	}
1106 	default:
1107 		/* ignore unknown mount options */
1108 		break;
1109 	}
1110 
1111 	return 0;
1112 bad_value:
1113 	return invalfc(fc, "Bad value for '%s'", param->key);
1114 }
1115 
/* Operations provided by bpf_preload; NULL until that code has
 * registered itself (see bpf_preload_mod_get()).
 */
struct bpf_preload_ops *bpf_preload_ops;
EXPORT_SYMBOL_GPL(bpf_preload_ops);
1118 
1119 static bool bpf_preload_mod_get(void)
1120 {
1121 	/* If bpf_preload.ko wasn't loaded earlier then load it now.
1122 	 * When bpf_preload is built into vmlinux the module's __init
1123 	 * function will populate it.
1124 	 */
1125 	if (!bpf_preload_ops) {
1126 		request_module("bpf_preload");
1127 		if (!bpf_preload_ops)
1128 			return false;
1129 	}
1130 	/* And grab the reference, so the module doesn't disappear while the
1131 	 * kernel is interacting with the kernel module and its UMD.
1132 	 */
1133 	if (!try_module_get(bpf_preload_ops->owner)) {
1134 		pr_err("bpf_preload module get failed.\n");
1135 		return false;
1136 	}
1137 	return true;
1138 }
1139 
1140 static void bpf_preload_mod_put(void)
1141 {
1142 	if (bpf_preload_ops)
1143 		/* now user can "rmmod bpf_preload" if necessary */
1144 		module_put(bpf_preload_ops->owner);
1145 }
1146 
/* Serializes all interaction with bpf_preload. */
static DEFINE_MUTEX(bpf_preload_lock);

/* Pin the links provided by bpf_preload under @parent.
 *
 * Returns 0 when everything was pinned and also when bpf_preload is not
 * available (err is still 0 on that path). Otherwise returns the error
 * from ->preload() or from pinning; links pinned before the failure are
 * left in place.
 */
static int populate_bpffs(struct dentry *parent)
{
	struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
	int err = 0, i;

	/* grab the mutex to make sure the kernel interactions with bpf_preload
	 * are serialized
	 */
	mutex_lock(&bpf_preload_lock);

	/* if bpf_preload.ko wasn't built into vmlinux then load it */
	if (!bpf_preload_mod_get())
		goto out;

	err = bpf_preload_ops->preload(objs);
	if (err)
		goto out_put;
	for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
		/* this reference is handed to the pinned inode on success */
		bpf_link_inc(objs[i].link);
		err = bpf_iter_link_pin_kernel(parent,
					       objs[i].link_name, objs[i].link);
		if (err) {
			bpf_link_put(objs[i].link);
			goto out_put;
		}
	}
out_put:
	bpf_preload_mod_put();
out:
	mutex_unlock(&bpf_preload_lock);
	return err;
}
1181 
/* Fill in a fresh bpffs superblock and create its root directory.
 *
 * Returns -EPERM for a mount from a non-initial user namespace without
 * CAP_SYS_ADMIN, the bpf_get_inode() error, or -ENOMEM if the root
 * dentry cannot be created. Failure to populate the preloaded links is
 * not treated as a mount failure.
 */
static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct bpf_mount_opts *opts = sb->s_fs_info;
	struct inode *inode;

	/* Mounting an instance of BPF FS requires privileges */
	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_magic = BPF_FS_MAGIC;
	sb->s_op = &bpf_super_ops;
	sb->s_xattr = bpf_fs_xattr_handlers;
	sb->s_iflags |= SB_I_NOEXEC;
	sb->s_iflags |= SB_I_NODEV;
	sb->s_time_gran = 1;

	inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	inode->i_ino = 1;
	inode->i_op = &bpf_dir_iops;
	inode->i_fop = &simple_dir_operations;
	set_nlink(inode, 2);

	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		return -ENOMEM;

	inode = d_inode(sb->s_root);
	inode->i_uid = opts->uid;
	inode->i_gid = opts->gid;
	/* Clear the permission bits while populating: bpf_lookup() only
	 * rejects dotted names when the directory has permission bits set.
	 */
	inode->i_mode &= ~S_IALLUGO;
	populate_bpffs(sb->s_root);
	/* now apply the requested mode, always with the sticky bit */
	inode->i_mode |= S_ISVTX | opts->mode;
	return 0;
}
1221 
1222 static int bpf_get_tree(struct fs_context *fc)
1223 {
1224 	return get_tree_nodev(fc, bpf_fill_super);
1225 }
1226 
1227 static void bpf_free_fc(struct fs_context *fc)
1228 {
1229 	kfree(fc->s_fs_info);
1230 }
1231 
/* Mount context operations; installed by bpf_init_fs_context(). */
static const struct fs_context_operations bpf_context_ops = {
	.free		= bpf_free_fc,
	.parse_param	= bpf_parse_param,
	.get_tree	= bpf_get_tree,
};
1237 
1238 /*
1239  * Set up the filesystem mount context.
1240  */
1241 static int bpf_init_fs_context(struct fs_context *fc)
1242 {
1243 	struct bpf_mount_opts *opts;
1244 
1245 	opts = kzalloc_obj(struct bpf_mount_opts);
1246 	if (!opts)
1247 		return -ENOMEM;
1248 
1249 	opts->mode = S_IRWXUGO;
1250 	opts->uid = current_fsuid();
1251 	opts->gid = current_fsgid();
1252 
1253 	/* start out with no BPF token delegation enabled */
1254 	opts->delegate_cmds = 0;
1255 	opts->delegate_maps = 0;
1256 	opts->delegate_progs = 0;
1257 	opts->delegate_attachs = 0;
1258 
1259 	fc->s_fs_info = opts;
1260 	fc->ops = &bpf_context_ops;
1261 	return 0;
1262 }
1263 
/* ->kill_sb: tear down the superblock, then release the xattr cache and
 * the mount options that were attached to it. The options pointer is
 * read up front because it is still needed after kill_anon_super().
 */
static void bpf_kill_super(struct super_block *sb)
{
	struct bpf_mount_opts *opts = sb->s_fs_info;

	kill_anon_super(sb);
	simple_xattr_cache_cleanup(&opts->xa_cache);
	kfree(opts);
}
1272 
/* The "bpf" filesystem type; registered in bpf_init(). FS_USERNS_MOUNT
 * is set, and bpf_fill_super() applies its own privilege check.
 */
static struct file_system_type bpf_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "bpf",
	.init_fs_context = bpf_init_fs_context,
	.parameters	= bpf_fs_parameters,
	.kill_sb	= bpf_kill_super,
	.fs_flags	= FS_USERNS_MOUNT,
};
1281 
1282 static void bpf_fs_inode_init_once(void *foo)
1283 {
1284 	struct bpf_fs_inode *bi = foo;
1285 
1286 	inode_init_once(&bi->vfs_inode);
1287 }
1288 
1289 static int __init bpf_init(void)
1290 {
1291 	int ret;
1292 
1293 	bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache",
1294 						sizeof(struct bpf_fs_inode),
1295 						0, SLAB_ACCOUNT,
1296 						bpf_fs_inode_init_once);
1297 	if (!bpf_fs_inode_cachep)
1298 		return -ENOMEM;
1299 
1300 	ret = sysfs_create_mount_point(fs_kobj, "bpf");
1301 	if (ret)
1302 		goto out_cache;
1303 
1304 	ret = register_filesystem(&bpf_fs_type);
1305 	if (ret) {
1306 		sysfs_remove_mount_point(fs_kobj, "bpf");
1307 		goto out_cache;
1308 	}
1309 
1310 	return 0;
1311 out_cache:
1312 	kmem_cache_destroy(bpf_fs_inode_cachep);
1313 	return ret;
1314 }
1315 fs_initcall(bpf_init);
1316