xref: /linux/kernel/bpf/inode.c (revision 9c87e61e3c5797277407ba5eae4eac8a52be3fa3)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Minimal file system backend for holding eBPF maps and programs,
4  * used by bpf(2) object pinning.
5  *
6  * Authors:
7  *
8  *	Daniel Borkmann <daniel@iogearbox.net>
9  */
10 
11 #include <linux/init.h>
12 #include <linux/magic.h>
13 #include <linux/major.h>
14 #include <linux/mount.h>
15 #include <linux/namei.h>
16 #include <linux/fs.h>
17 #include <linux/fs_context.h>
18 #include <linux/fs_parser.h>
19 #include <linux/kdev_t.h>
20 #include <linux/filter.h>
21 #include <linux/bpf.h>
22 #include <linux/bpf_trace.h>
23 #include <linux/kstrtox.h>
24 #include <linux/xattr.h>
25 #include <linux/security.h>
26 
27 #include "preload/bpf_preload.h"
28 
/*
 * Kind of BPF object behind a bpffs inode.  BPF_TYPE_UNSPEC is what
 * bpf_inode_type() reports for inodes that are none of the object kinds.
 */
enum bpf_type {
	BPF_TYPE_UNSPEC = 0,
	BPF_TYPE_PROG   = 1,
	BPF_TYPE_MAP    = 2,
	BPF_TYPE_LINK   = 3,
};
35 
36 struct bpf_fs_inode {
37 	struct list_head		xattrs;
38 	struct simple_xattr_limits	xlimits;
39 	struct inode			vfs_inode;
40 };
41 
42 static inline struct bpf_fs_inode *BPF_FS_I(struct inode *inode)
43 {
44 	return container_of(inode, struct bpf_fs_inode, vfs_inode);
45 }
46 
47 static struct kmem_cache *bpf_fs_inode_cachep __ro_after_init;
48 
49 static int bpf_fs_initxattrs(struct inode *inode,
50 			     const struct xattr *xattr_array, void *fs_info);
51 static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size);
52 
53 static void *bpf_any_get(void *raw, enum bpf_type type)
54 {
55 	switch (type) {
56 	case BPF_TYPE_PROG:
57 		bpf_prog_inc(raw);
58 		break;
59 	case BPF_TYPE_MAP:
60 		bpf_map_inc_with_uref(raw);
61 		break;
62 	case BPF_TYPE_LINK:
63 		bpf_link_inc(raw);
64 		break;
65 	default:
66 		WARN_ON_ONCE(1);
67 		break;
68 	}
69 
70 	return raw;
71 }
72 
73 static void bpf_any_put(void *raw, enum bpf_type type)
74 {
75 	switch (type) {
76 	case BPF_TYPE_PROG:
77 		bpf_prog_put(raw);
78 		break;
79 	case BPF_TYPE_MAP:
80 		bpf_map_put_with_uref(raw);
81 		break;
82 	case BPF_TYPE_LINK:
83 		bpf_link_put(raw);
84 		break;
85 	default:
86 		WARN_ON_ONCE(1);
87 		break;
88 	}
89 }
90 
91 static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
92 {
93 	void *raw;
94 
95 	raw = bpf_map_get_with_uref(ufd);
96 	if (!IS_ERR(raw)) {
97 		*type = BPF_TYPE_MAP;
98 		return raw;
99 	}
100 
101 	raw = bpf_prog_get(ufd);
102 	if (!IS_ERR(raw)) {
103 		*type = BPF_TYPE_PROG;
104 		return raw;
105 	}
106 
107 	raw = bpf_link_get_from_fd(ufd);
108 	if (!IS_ERR(raw)) {
109 		*type = BPF_TYPE_LINK;
110 		return raw;
111 	}
112 
113 	return ERR_PTR(-EINVAL);
114 }
115 
116 static const struct inode_operations bpf_dir_iops;
117 static const struct inode_operations bpf_symlink_iops;
118 
119 static const struct inode_operations bpf_prog_iops = {
120 	.listxattr	= bpf_fs_listxattr,
121 };
122 static const struct inode_operations bpf_map_iops  = {
123 	.listxattr	= bpf_fs_listxattr,
124 };
125 static const struct inode_operations bpf_link_iops  = {
126 	.listxattr	= bpf_fs_listxattr,
127 };
128 
129 struct inode *bpf_get_inode(struct super_block *sb,
130 			    const struct inode *dir,
131 			    umode_t mode)
132 {
133 	struct inode *inode;
134 
135 	switch (mode & S_IFMT) {
136 	case S_IFDIR:
137 	case S_IFREG:
138 	case S_IFLNK:
139 		break;
140 	default:
141 		return ERR_PTR(-EINVAL);
142 	}
143 
144 	inode = new_inode(sb);
145 	if (!inode)
146 		return ERR_PTR(-ENOSPC);
147 
148 	inode->i_ino = get_next_ino();
149 	simple_inode_init_ts(inode);
150 
151 	inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
152 
153 	return inode;
154 }
155 
156 static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
157 {
158 	*type = BPF_TYPE_UNSPEC;
159 	if (inode->i_op == &bpf_prog_iops)
160 		*type = BPF_TYPE_PROG;
161 	else if (inode->i_op == &bpf_map_iops)
162 		*type = BPF_TYPE_MAP;
163 	else if (inode->i_op == &bpf_link_iops)
164 		*type = BPF_TYPE_LINK;
165 	else
166 		return -EACCES;
167 
168 	return 0;
169 }
170 
171 static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode,
172 				struct inode *dir)
173 {
174 	d_make_persistent(dentry, inode);
175 
176 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
177 }
178 
179 static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir,
180 				struct dentry *dentry, umode_t mode)
181 {
182 	struct inode *inode;
183 	int ret;
184 
185 	inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
186 	if (IS_ERR(inode))
187 		return ERR_CAST(inode);
188 
189 	ret = security_inode_init_security(inode, dir, &dentry->d_name,
190 					   bpf_fs_initxattrs, NULL);
191 	if (ret && ret != -EOPNOTSUPP) {
192 		iput(inode);
193 		return ERR_PTR(ret);
194 	}
195 
196 	inode->i_op = &bpf_dir_iops;
197 	inode->i_fop = &simple_dir_operations;
198 
199 	inc_nlink(inode);
200 	inc_nlink(dir);
201 
202 	bpf_dentry_finalize(dentry, inode, dir);
203 	return NULL;
204 }
205 
206 struct map_iter {
207 	void *key;
208 	bool done;
209 };
210 
211 static struct map_iter *map_iter(struct seq_file *m)
212 {
213 	return m->private;
214 }
215 
216 static struct bpf_map *seq_file_to_map(struct seq_file *m)
217 {
218 	return file_inode(m->file)->i_private;
219 }
220 
221 static void map_iter_free(struct map_iter *iter)
222 {
223 	if (iter) {
224 		kfree(iter->key);
225 		kfree(iter);
226 	}
227 }
228 
229 static struct map_iter *map_iter_alloc(struct bpf_map *map)
230 {
231 	struct map_iter *iter;
232 
233 	iter = kzalloc_obj(*iter, GFP_KERNEL | __GFP_NOWARN);
234 	if (!iter)
235 		goto error;
236 
237 	iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN);
238 	if (!iter->key)
239 		goto error;
240 
241 	return iter;
242 
243 error:
244 	map_iter_free(iter);
245 	return NULL;
246 }
247 
248 static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
249 {
250 	struct bpf_map *map = seq_file_to_map(m);
251 	void *key = map_iter(m)->key;
252 	void *prev_key;
253 
254 	(*pos)++;
255 	if (map_iter(m)->done)
256 		return NULL;
257 
258 	if (unlikely(v == SEQ_START_TOKEN))
259 		prev_key = NULL;
260 	else
261 		prev_key = key;
262 
263 	rcu_read_lock();
264 	if (map->ops->map_get_next_key(map, prev_key, key)) {
265 		map_iter(m)->done = true;
266 		key = NULL;
267 	}
268 	rcu_read_unlock();
269 	return key;
270 }
271 
272 static void *map_seq_start(struct seq_file *m, loff_t *pos)
273 {
274 	if (map_iter(m)->done)
275 		return NULL;
276 
277 	return *pos ? map_iter(m)->key : SEQ_START_TOKEN;
278 }
279 
/* seq_file ->stop(): intentionally empty, there is nothing to undo. */
static void map_seq_stop(struct seq_file *m, void *v)
{
}
283 
284 static int map_seq_show(struct seq_file *m, void *v)
285 {
286 	struct bpf_map *map = seq_file_to_map(m);
287 	void *key = map_iter(m)->key;
288 
289 	if (unlikely(v == SEQ_START_TOKEN)) {
290 		seq_puts(m, "# WARNING!! The output is for debug purpose only\n");
291 		seq_puts(m, "# WARNING!! The output format will change\n");
292 	} else {
293 		map->ops->map_seq_show_elem(map, key, m);
294 	}
295 
296 	return 0;
297 }
298 
299 static const struct seq_operations bpffs_map_seq_ops = {
300 	.start	= map_seq_start,
301 	.next	= map_seq_next,
302 	.show	= map_seq_show,
303 	.stop	= map_seq_stop,
304 };
305 
306 static int bpffs_map_open(struct inode *inode, struct file *file)
307 {
308 	struct bpf_map *map = inode->i_private;
309 	struct map_iter *iter;
310 	struct seq_file *m;
311 	int err;
312 
313 	iter = map_iter_alloc(map);
314 	if (!iter)
315 		return -ENOMEM;
316 
317 	err = seq_open(file, &bpffs_map_seq_ops);
318 	if (err) {
319 		map_iter_free(iter);
320 		return err;
321 	}
322 
323 	m = file->private_data;
324 	m->private = iter;
325 
326 	return 0;
327 }
328 
329 static int bpffs_map_release(struct inode *inode, struct file *file)
330 {
331 	struct seq_file *m = file->private_data;
332 
333 	map_iter_free(map_iter(m));
334 
335 	return seq_release(inode, file);
336 }
337 
338 /* bpffs_map_fops should only implement the basic
339  * read operation for a BPF map.  The purpose is to
340  * provide a simple user intuitive way to do
341  * "cat bpffs/pathto/a-pinned-map".
342  *
343  * Other operations (e.g. write, lookup...) should be realized by
344  * the userspace tools (e.g. bpftool) through the
345  * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update
346  * interface.
347  */
348 static const struct file_operations bpffs_map_fops = {
349 	.open		= bpffs_map_open,
350 	.read		= seq_read,
351 	.release	= bpffs_map_release,
352 };
353 
354 static int bpffs_obj_open(struct inode *inode, struct file *file)
355 {
356 	return -EIO;
357 }
358 
359 static const struct file_operations bpffs_obj_fops = {
360 	.open		= bpffs_obj_open,
361 };
362 
363 static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw,
364 			 const struct inode_operations *iops,
365 			 const struct file_operations *fops)
366 {
367 	struct inode *dir = dentry->d_parent->d_inode;
368 	struct inode *inode;
369 	int ret;
370 
371 	inode = bpf_get_inode(dir->i_sb, dir, mode);
372 	if (IS_ERR(inode))
373 		return PTR_ERR(inode);
374 
375 	ret = security_inode_init_security(inode, dir, &dentry->d_name,
376 					   bpf_fs_initxattrs, NULL);
377 	if (ret && ret != -EOPNOTSUPP) {
378 		iput(inode);
379 		return ret;
380 	}
381 
382 	inode->i_op = iops;
383 	inode->i_fop = fops;
384 	inode->i_private = raw;
385 
386 	bpf_dentry_finalize(dentry, inode, dir);
387 	return 0;
388 }
389 
390 static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg)
391 {
392 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops,
393 			     &bpffs_obj_fops);
394 }
395 
396 static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg)
397 {
398 	struct bpf_map *map = arg;
399 
400 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops,
401 			     bpf_map_support_seq_show(map) ?
402 			     &bpffs_map_fops : &bpffs_obj_fops);
403 }
404 
405 static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg)
406 {
407 	struct bpf_link *link = arg;
408 
409 	return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops,
410 			     bpf_link_is_iter(link) ?
411 			     &bpf_iter_fops : &bpffs_obj_fops);
412 }
413 
414 static struct dentry *
415 bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags)
416 {
417 	/* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future
418 	 * extensions. That allows popoulate_bpffs() create special files.
419 	 */
420 	if ((dir->i_mode & S_IALLUGO) &&
421 	    strchr(dentry->d_name.name, '.'))
422 		return ERR_PTR(-EPERM);
423 
424 	return simple_lookup(dir, dentry, flags);
425 }
426 
427 static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir,
428 		       struct dentry *dentry, const char *target)
429 {
430 	struct inode *inode;
431 	char *link;
432 	int ret;
433 
434 	link = kstrdup(target, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
435 	if (!link)
436 		return -ENOMEM;
437 
438 	inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK);
439 	if (IS_ERR(inode)) {
440 		kfree(link);
441 		return PTR_ERR(inode);
442 	}
443 
444 	inode->i_op = &bpf_symlink_iops;
445 	inode->i_link = link;
446 
447 	ret = security_inode_init_security(inode, dir, &dentry->d_name,
448 					   bpf_fs_initxattrs, NULL);
449 	if (ret && ret != -EOPNOTSUPP) {
450 		iput(inode);
451 		return ret;
452 	}
453 
454 	bpf_dentry_finalize(dentry, inode, dir);
455 	return 0;
456 }
457 
458 static const struct inode_operations bpf_symlink_iops = {
459 	.get_link	= simple_get_link,
460 	.listxattr	= bpf_fs_listxattr,
461 };
462 
463 static const struct inode_operations bpf_dir_iops = {
464 	.lookup		= bpf_lookup,
465 	.mkdir		= bpf_mkdir,
466 	.symlink	= bpf_symlink,
467 	.rmdir		= simple_rmdir,
468 	.rename		= simple_rename,
469 	.link		= simple_link,
470 	.unlink		= simple_unlink,
471 	.listxattr	= bpf_fs_listxattr,
472 };
473 
474 /* pin iterator link into bpffs */
475 static int bpf_iter_link_pin_kernel(struct dentry *parent,
476 				    const char *name, struct bpf_link *link)
477 {
478 	umode_t mode = S_IFREG | S_IRUSR;
479 	struct dentry *dentry;
480 	int ret;
481 
482 	dentry = simple_start_creating(parent, name);
483 	if (IS_ERR(dentry))
484 		return PTR_ERR(dentry);
485 	ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops,
486 			    &bpf_iter_fops);
487 	simple_done_creating(dentry);
488 	return ret;
489 }
490 
491 static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw,
492 			  enum bpf_type type)
493 {
494 	struct dentry *dentry;
495 	struct inode *dir;
496 	struct path path;
497 	umode_t mode;
498 	int ret;
499 
500 	dentry = start_creating_user_path(path_fd, pathname, &path, 0);
501 	if (IS_ERR(dentry))
502 		return PTR_ERR(dentry);
503 
504 	dir = d_inode(path.dentry);
505 	if (dir->i_op != &bpf_dir_iops) {
506 		ret = -EPERM;
507 		goto out;
508 	}
509 
510 	mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
511 	ret = security_path_mknod(&path, dentry, mode, 0);
512 	if (ret)
513 		goto out;
514 
515 	switch (type) {
516 	case BPF_TYPE_PROG:
517 		ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw);
518 		break;
519 	case BPF_TYPE_MAP:
520 		ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw);
521 		break;
522 	case BPF_TYPE_LINK:
523 		ret = vfs_mkobj(dentry, mode, bpf_mklink, raw);
524 		break;
525 	default:
526 		ret = -EPERM;
527 	}
528 out:
529 	end_creating_path(&path, dentry);
530 	return ret;
531 }
532 
533 int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname)
534 {
535 	enum bpf_type type;
536 	void *raw;
537 	int ret;
538 
539 	raw = bpf_fd_probe_obj(ufd, &type);
540 	if (IS_ERR(raw))
541 		return PTR_ERR(raw);
542 
543 	ret = bpf_obj_do_pin(path_fd, pathname, raw, type);
544 	if (ret != 0)
545 		bpf_any_put(raw, type);
546 
547 	return ret;
548 }
549 
550 static void *bpf_obj_do_get(int path_fd, const char __user *pathname,
551 			    enum bpf_type *type, int flags)
552 {
553 	struct inode *inode;
554 	struct path path;
555 	void *raw;
556 	int ret;
557 
558 	ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path);
559 	if (ret)
560 		return ERR_PTR(ret);
561 
562 	inode = d_backing_inode(path.dentry);
563 	ret = path_permission(&path, ACC_MODE(flags));
564 	if (ret)
565 		goto out;
566 
567 	ret = bpf_inode_type(inode, type);
568 	if (ret)
569 		goto out;
570 
571 	raw = bpf_any_get(inode->i_private, *type);
572 	if (!IS_ERR(raw))
573 		touch_atime(&path);
574 
575 	path_put(&path);
576 	return raw;
577 out:
578 	path_put(&path);
579 	return ERR_PTR(ret);
580 }
581 
582 int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags)
583 {
584 	enum bpf_type type = BPF_TYPE_UNSPEC;
585 	int f_flags;
586 	void *raw;
587 	int ret;
588 
589 	f_flags = bpf_get_file_flag(flags);
590 	if (f_flags < 0)
591 		return f_flags;
592 
593 	raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags);
594 	if (IS_ERR(raw))
595 		return PTR_ERR(raw);
596 
597 	if (type == BPF_TYPE_PROG)
598 		ret = bpf_prog_new_fd(raw);
599 	else if (type == BPF_TYPE_MAP)
600 		ret = bpf_map_new_fd(raw, f_flags);
601 	else if (type == BPF_TYPE_LINK)
602 		ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw);
603 	else
604 		return -ENOENT;
605 
606 	if (ret < 0)
607 		bpf_any_put(raw, type);
608 	return ret;
609 }
610 
611 static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type)
612 {
613 	struct bpf_prog *prog;
614 	int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ);
615 	if (ret)
616 		return ERR_PTR(ret);
617 
618 	if (inode->i_op == &bpf_map_iops)
619 		return ERR_PTR(-EINVAL);
620 	if (inode->i_op == &bpf_link_iops)
621 		return ERR_PTR(-EINVAL);
622 	if (inode->i_op != &bpf_prog_iops)
623 		return ERR_PTR(-EACCES);
624 
625 	prog = inode->i_private;
626 
627 	ret = security_bpf_prog(prog);
628 	if (ret < 0)
629 		return ERR_PTR(ret);
630 
631 	if (!bpf_prog_get_ok(prog, &type, false))
632 		return ERR_PTR(-EINVAL);
633 
634 	bpf_prog_inc(prog);
635 	return prog;
636 }
637 
638 struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type)
639 {
640 	struct bpf_prog *prog;
641 	struct path path;
642 	int ret = kern_path(name, LOOKUP_FOLLOW, &path);
643 	if (ret)
644 		return ERR_PTR(ret);
645 	prog = __get_prog_inode(d_backing_inode(path.dentry), type);
646 	if (!IS_ERR(prog))
647 		touch_atime(&path);
648 	path_put(&path);
649 	return prog;
650 }
651 EXPORT_SYMBOL(bpf_prog_get_type_path);
652 
/*
 * vmlinux BTF and the enum types used to translate delegation mount
 * options between symbolic names and bit positions.  Filled in by
 * find_bpffs_btf_enums().
 */
struct bpffs_btf_enums {
	const struct btf *btf;			/* vmlinux BTF */
	const struct btf_type *cmd_t;		/* enum bpf_cmd */
	const struct btf_type *map_t;		/* enum bpf_map_type */
	const struct btf_type *prog_t;		/* enum bpf_prog_type */
	const struct btf_type *attach_t;	/* enum bpf_attach_type */
};
660 
661 static int find_bpffs_btf_enums(struct bpffs_btf_enums *info)
662 {
663 	struct {
664 		const struct btf_type **type;
665 		const char *name;
666 	} btf_enums[] = {
667 		{&info->cmd_t,		"bpf_cmd"},
668 		{&info->map_t,		"bpf_map_type"},
669 		{&info->prog_t,		"bpf_prog_type"},
670 		{&info->attach_t,	"bpf_attach_type"},
671 	};
672 	const struct btf *btf;
673 	int i, id;
674 
675 	memset(info, 0, sizeof(*info));
676 
677 	btf = bpf_get_btf_vmlinux();
678 	if (IS_ERR(btf))
679 		return PTR_ERR(btf);
680 	if (!btf)
681 		return -ENOENT;
682 
683 	info->btf = btf;
684 
685 	for (i = 0; i < ARRAY_SIZE(btf_enums); i++) {
686 		id = btf_find_by_name_kind(btf, btf_enums[i].name,
687 					   BTF_KIND_ENUM);
688 		if (id < 0)
689 			return -ESRCH;
690 
691 		*btf_enums[i].type = btf_type_by_id(btf, id);
692 	}
693 
694 	return 0;
695 }
696 
697 static bool find_btf_enum_const(const struct btf *btf, const struct btf_type *enum_t,
698 				const char *prefix, const char *str, int *value)
699 {
700 	const struct btf_enum *e;
701 	const char *name;
702 	int i, n, pfx_len = strlen(prefix);
703 
704 	*value = 0;
705 
706 	if (!btf || !enum_t)
707 		return false;
708 
709 	for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
710 		e = &btf_enum(enum_t)[i];
711 
712 		name = btf_name_by_offset(btf, e->name_off);
713 		if (!name || strncasecmp(name, prefix, pfx_len) != 0)
714 			continue;
715 
716 		/* match symbolic name case insensitive and ignoring prefix */
717 		if (strcasecmp(name + pfx_len, str) == 0) {
718 			*value = e->val;
719 			return true;
720 		}
721 	}
722 
723 	return false;
724 }
725 
726 static void seq_print_delegate_opts(struct seq_file *m,
727 				    const char *opt_name,
728 				    const struct btf *btf,
729 				    const struct btf_type *enum_t,
730 				    const char *prefix,
731 				    u64 delegate_msk, u64 any_msk)
732 {
733 	const struct btf_enum *e;
734 	bool first = true;
735 	const char *name;
736 	u64 msk;
737 	int i, n, pfx_len = strlen(prefix);
738 
739 	delegate_msk &= any_msk; /* clear unknown bits */
740 
741 	if (delegate_msk == 0)
742 		return;
743 
744 	seq_printf(m, ",%s", opt_name);
745 	if (delegate_msk == any_msk) {
746 		seq_printf(m, "=any");
747 		return;
748 	}
749 
750 	if (btf && enum_t) {
751 		for (i = 0, n = btf_vlen(enum_t); i < n; i++) {
752 			e = &btf_enum(enum_t)[i];
753 			name = btf_name_by_offset(btf, e->name_off);
754 			if (!name || strncasecmp(name, prefix, pfx_len) != 0)
755 				continue;
756 			msk = 1ULL << e->val;
757 			if (delegate_msk & msk) {
758 				/* emit lower-case name without prefix */
759 				seq_putc(m, first ? '=' : ':');
760 				name += pfx_len;
761 				while (*name) {
762 					seq_putc(m, tolower(*name));
763 					name++;
764 				}
765 
766 				delegate_msk &= ~msk;
767 				first = false;
768 			}
769 		}
770 	}
771 	if (delegate_msk)
772 		seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk);
773 }
774 
775 /*
776  * Display the mount options in /proc/mounts.
777  */
778 static int bpf_show_options(struct seq_file *m, struct dentry *root)
779 {
780 	struct inode *inode = d_inode(root);
781 	umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX;
782 	struct bpf_mount_opts *opts = root->d_sb->s_fs_info;
783 	u64 mask;
784 
785 	if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID))
786 		seq_printf(m, ",uid=%u",
787 			   from_kuid_munged(&init_user_ns, inode->i_uid));
788 	if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID))
789 		seq_printf(m, ",gid=%u",
790 			   from_kgid_munged(&init_user_ns, inode->i_gid));
791 	if (mode != S_IRWXUGO)
792 		seq_printf(m, ",mode=%o", mode);
793 
794 	if (opts->delegate_cmds || opts->delegate_maps ||
795 	    opts->delegate_progs || opts->delegate_attachs) {
796 		struct bpffs_btf_enums info;
797 
798 		/* ignore errors, fallback to hex */
799 		(void)find_bpffs_btf_enums(&info);
800 
801 		mask = (1ULL << __MAX_BPF_CMD) - 1;
802 		seq_print_delegate_opts(m, "delegate_cmds",
803 					info.btf, info.cmd_t, "BPF_",
804 					opts->delegate_cmds, mask);
805 
806 		mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1;
807 		seq_print_delegate_opts(m, "delegate_maps",
808 					info.btf, info.map_t, "BPF_MAP_TYPE_",
809 					opts->delegate_maps, mask);
810 
811 		mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1;
812 		seq_print_delegate_opts(m, "delegate_progs",
813 					info.btf, info.prog_t, "BPF_PROG_TYPE_",
814 					opts->delegate_progs, mask);
815 
816 		mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1;
817 		seq_print_delegate_opts(m, "delegate_attachs",
818 					info.btf, info.attach_t, "BPF_",
819 					opts->delegate_attachs, mask);
820 	}
821 
822 	return 0;
823 }
824 
825 static struct inode *bpf_fs_alloc_inode(struct super_block *sb)
826 {
827 	struct bpf_fs_inode *bi;
828 
829 	bi = alloc_inode_sb(sb, bpf_fs_inode_cachep, GFP_KERNEL);
830 	if (!bi)
831 		return NULL;
832 	INIT_LIST_HEAD_RCU(&bi->xattrs);
833 	simple_xattr_limits_init(&bi->xlimits);
834 	return &bi->vfs_inode;
835 }
836 
837 static void bpf_destroy_inode(struct inode *inode)
838 {
839 	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
840 	struct bpf_fs_inode *bi = BPF_FS_I(inode);
841 	enum bpf_type type;
842 
843 	if (!bpf_inode_type(inode, &type))
844 		bpf_any_put(inode->i_private, type);
845 	simple_xattrs_free(&opts->xa_cache, &bi->xattrs, NULL);
846 }
847 
848 /*
849  * Called after RCU grace period - safe to free inode and anything
850  *  that might be accessed by RCU pathwalk (inode fields, i_link).
851  */
852 static void bpf_free_inode(struct inode *inode)
853 {
854 	if (S_ISLNK(inode->i_mode))
855 		kfree(inode->i_link);
856 	kmem_cache_free(bpf_fs_inode_cachep, BPF_FS_I(inode));
857 }
858 
859 static int bpf_fs_xattr_get(const struct xattr_handler *handler,
860 			    struct dentry *unused, struct inode *inode,
861 			    const char *name, void *value, size_t size)
862 {
863 	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
864 	struct bpf_fs_inode *bi = BPF_FS_I(inode);
865 
866 	name = xattr_full_name(handler, name);
867 	return simple_xattr_get(&opts->xa_cache, &bi->xattrs, name, value, size);
868 }
869 
/* Values for xattr_handler.flags, used by bpf_fs_xattr_set() to dispatch. */
enum {
	BPF_FS_XATTR_UNSPEC   = 0,
	BPF_FS_XATTR_SECURITY = 1,
	BPF_FS_XATTR_TRUSTED  = 2,
};
875 
876 static int bpf_fs_xattr_set(const struct xattr_handler *handler,
877 			    struct mnt_idmap *idmap, struct dentry *unused,
878 			    struct inode *inode, const char *name,
879 			    const void *value, size_t size, int flags)
880 {
881 	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
882 	struct bpf_fs_inode *bi = BPF_FS_I(inode);
883 	struct simple_xattr *old;
884 	int err = -EINVAL;
885 
886 	name = xattr_full_name(handler, name);
887 	switch (handler->flags) {
888 	case BPF_FS_XATTR_SECURITY:
889 		err = simple_xattr_set_limited(&opts->xa_cache, &bi->xattrs,
890 					       &bi->xlimits, name, value, size,
891 					       flags);
892 		break;
893 	case BPF_FS_XATTR_TRUSTED:
894 		old = simple_xattr_set(&opts->xa_cache, &bi->xattrs, name,
895 				       value, size, flags);
896 		err = IS_ERR(old) ? PTR_ERR(old) : 0;
897 		if (!err)
898 			simple_xattr_free_rcu(old);
899 		break;
900 	}
901 	if (err)
902 		return err;
903 	inode_set_ctime_current(inode);
904 	return 0;
905 }
906 
907 static const struct xattr_handler bpf_fs_trusted_xattr_handler = {
908 	.prefix	= XATTR_TRUSTED_PREFIX,
909 	.flags	= BPF_FS_XATTR_TRUSTED,
910 	.get	= bpf_fs_xattr_get,
911 	.set	= bpf_fs_xattr_set,
912 };
913 
914 static const struct xattr_handler bpf_fs_security_xattr_handler = {
915 	.prefix	= XATTR_SECURITY_PREFIX,
916 	.flags	= BPF_FS_XATTR_SECURITY,
917 	.get	= bpf_fs_xattr_get,
918 	.set	= bpf_fs_xattr_set,
919 };
920 
921 static const struct xattr_handler * const bpf_fs_xattr_handlers[] = {
922 	&bpf_fs_trusted_xattr_handler,
923 	&bpf_fs_security_xattr_handler,
924 	NULL,
925 };
926 
927 static ssize_t bpf_fs_listxattr(struct dentry *dentry, char *buf, size_t size)
928 {
929 	struct inode *inode = d_inode(dentry);
930 
931 	return simple_xattr_list(inode, &BPF_FS_I(inode)->xattrs, buf, size);
932 }
933 
934 static int bpf_fs_initxattrs(struct inode *inode,
935 			     const struct xattr *xattr_array, void *fs_info)
936 {
937 	struct bpf_mount_opts *opts = inode->i_sb->s_fs_info;
938 	struct bpf_fs_inode *bi = BPF_FS_I(inode);
939 	const struct xattr *xattr;
940 	int err;
941 
942 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
943 		CLASS(simple_xattr, new_xattr)(xattr->value, xattr->value_len);
944 		if (IS_ERR(new_xattr))
945 			return PTR_ERR(new_xattr);
946 
947 		new_xattr->name = kasprintf(GFP_KERNEL_ACCOUNT,
948 					    XATTR_SECURITY_PREFIX "%s",
949 					    xattr->name);
950 		if (!new_xattr->name)
951 			return -ENOMEM;
952 
953 		err = simple_xattr_add_limited(&opts->xa_cache, &bi->xattrs,
954 					       &bi->xlimits, new_xattr);
955 		if (err)
956 			return err;
957 
958 		retain_and_null_ptr(new_xattr);
959 	}
960 	return 0;
961 }
962 
963 const struct super_operations bpf_super_ops = {
964 	.statfs		= simple_statfs,
965 	.drop_inode	= inode_just_drop,
966 	.show_options	= bpf_show_options,
967 	.alloc_inode	= bpf_fs_alloc_inode,
968 	.destroy_inode	= bpf_destroy_inode,
969 	.free_inode	= bpf_free_inode,
970 };
971 
/* Mount option identifiers; they pair up with bpf_fs_parameters[]. */
enum {
	OPT_UID              = 0,
	OPT_GID              = 1,
	OPT_MODE             = 2,
	OPT_DELEGATE_CMDS    = 3,
	OPT_DELEGATE_MAPS    = 4,
	OPT_DELEGATE_PROGS   = 5,
	OPT_DELEGATE_ATTACHS = 6,
};
981 
982 static const struct fs_parameter_spec bpf_fs_parameters[] = {
983 	fsparam_u32	("uid",				OPT_UID),
984 	fsparam_u32	("gid",				OPT_GID),
985 	fsparam_u32oct	("mode",			OPT_MODE),
986 	fsparam_string	("delegate_cmds",		OPT_DELEGATE_CMDS),
987 	fsparam_string	("delegate_maps",		OPT_DELEGATE_MAPS),
988 	fsparam_string	("delegate_progs",		OPT_DELEGATE_PROGS),
989 	fsparam_string	("delegate_attachs",		OPT_DELEGATE_ATTACHS),
990 	{}
991 };
992 
993 static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param)
994 {
995 	struct bpf_mount_opts *opts = fc->s_fs_info;
996 	struct fs_parse_result result;
997 	kuid_t uid;
998 	kgid_t gid;
999 	int opt, err;
1000 
1001 	opt = fs_parse(fc, bpf_fs_parameters, param, &result);
1002 	if (opt < 0) {
1003 		/* We might like to report bad mount options here, but
1004 		 * traditionally we've ignored all mount options, so we'd
1005 		 * better continue to ignore non-existing options for bpf.
1006 		 */
1007 		if (opt == -ENOPARAM) {
1008 			opt = vfs_parse_fs_param_source(fc, param);
1009 			if (opt != -ENOPARAM)
1010 				return opt;
1011 
1012 			return 0;
1013 		}
1014 
1015 		if (opt < 0)
1016 			return opt;
1017 	}
1018 
1019 	switch (opt) {
1020 	case OPT_UID:
1021 		uid = make_kuid(current_user_ns(), result.uint_32);
1022 		if (!uid_valid(uid))
1023 			goto bad_value;
1024 
1025 		/*
1026 		 * The requested uid must be representable in the
1027 		 * filesystem's idmapping.
1028 		 */
1029 		if (!kuid_has_mapping(fc->user_ns, uid))
1030 			goto bad_value;
1031 
1032 		opts->uid = uid;
1033 		break;
1034 	case OPT_GID:
1035 		gid = make_kgid(current_user_ns(), result.uint_32);
1036 		if (!gid_valid(gid))
1037 			goto bad_value;
1038 
1039 		/*
1040 		 * The requested gid must be representable in the
1041 		 * filesystem's idmapping.
1042 		 */
1043 		if (!kgid_has_mapping(fc->user_ns, gid))
1044 			goto bad_value;
1045 
1046 		opts->gid = gid;
1047 		break;
1048 	case OPT_MODE:
1049 		opts->mode = result.uint_32 & S_IALLUGO;
1050 		break;
1051 	case OPT_DELEGATE_CMDS:
1052 	case OPT_DELEGATE_MAPS:
1053 	case OPT_DELEGATE_PROGS:
1054 	case OPT_DELEGATE_ATTACHS: {
1055 		struct bpffs_btf_enums info;
1056 		const struct btf_type *enum_t;
1057 		const char *enum_pfx;
1058 		u64 *delegate_msk, msk = 0;
1059 		char *p, *str;
1060 		int val;
1061 
1062 		/* ignore errors, fallback to hex */
1063 		(void)find_bpffs_btf_enums(&info);
1064 
1065 		switch (opt) {
1066 		case OPT_DELEGATE_CMDS:
1067 			delegate_msk = &opts->delegate_cmds;
1068 			enum_t = info.cmd_t;
1069 			enum_pfx = "BPF_";
1070 			break;
1071 		case OPT_DELEGATE_MAPS:
1072 			delegate_msk = &opts->delegate_maps;
1073 			enum_t = info.map_t;
1074 			enum_pfx = "BPF_MAP_TYPE_";
1075 			break;
1076 		case OPT_DELEGATE_PROGS:
1077 			delegate_msk = &opts->delegate_progs;
1078 			enum_t = info.prog_t;
1079 			enum_pfx = "BPF_PROG_TYPE_";
1080 			break;
1081 		case OPT_DELEGATE_ATTACHS:
1082 			delegate_msk = &opts->delegate_attachs;
1083 			enum_t = info.attach_t;
1084 			enum_pfx = "BPF_";
1085 			break;
1086 		default:
1087 			return -EINVAL;
1088 		}
1089 
1090 		str = param->string;
1091 		while ((p = strsep(&str, ":"))) {
1092 			if (strcmp(p, "any") == 0) {
1093 				msk |= ~0ULL;
1094 			} else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) {
1095 				msk |= 1ULL << val;
1096 			} else {
1097 				err = kstrtou64(p, 0, &msk);
1098 				if (err)
1099 					return err;
1100 			}
1101 		}
1102 
1103 		/* Setting delegation mount options requires privileges */
1104 		if (msk && !capable(CAP_SYS_ADMIN))
1105 			return -EPERM;
1106 
1107 		*delegate_msk |= msk;
1108 		break;
1109 	}
1110 	default:
1111 		/* ignore unknown mount options */
1112 		break;
1113 	}
1114 
1115 	return 0;
1116 bad_value:
1117 	return invalfc(fc, "Bad value for '%s'", param->key);
1118 }
1119 
/*
 * Operations supplied by the bpf_preload code; NULL until that code has
 * populated it (see bpf_preload_mod_get()).
 */
struct bpf_preload_ops *bpf_preload_ops;
EXPORT_SYMBOL_GPL(bpf_preload_ops);
1122 
1123 static bool bpf_preload_mod_get(void)
1124 {
1125 	/* If bpf_preload.ko wasn't loaded earlier then load it now.
1126 	 * When bpf_preload is built into vmlinux the module's __init
1127 	 * function will populate it.
1128 	 */
1129 	if (!bpf_preload_ops) {
1130 		request_module("bpf_preload");
1131 		if (!bpf_preload_ops)
1132 			return false;
1133 	}
1134 	/* And grab the reference, so the module doesn't disappear while the
1135 	 * kernel is interacting with the kernel module and its UMD.
1136 	 */
1137 	if (!try_module_get(bpf_preload_ops->owner)) {
1138 		pr_err("bpf_preload module get failed.\n");
1139 		return false;
1140 	}
1141 	return true;
1142 }
1143 
1144 static void bpf_preload_mod_put(void)
1145 {
1146 	if (bpf_preload_ops)
1147 		/* now user can "rmmod bpf_preload" if necessary */
1148 		module_put(bpf_preload_ops->owner);
1149 }
1150 
1151 static DEFINE_MUTEX(bpf_preload_lock);
1152 
1153 static int populate_bpffs(struct dentry *parent)
1154 {
1155 	struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {};
1156 	int err = 0, i;
1157 
1158 	/* grab the mutex to make sure the kernel interactions with bpf_preload
1159 	 * are serialized
1160 	 */
1161 	mutex_lock(&bpf_preload_lock);
1162 
1163 	/* if bpf_preload.ko wasn't built into vmlinux then load it */
1164 	if (!bpf_preload_mod_get())
1165 		goto out;
1166 
1167 	err = bpf_preload_ops->preload(objs);
1168 	if (err)
1169 		goto out_put;
1170 	for (i = 0; i < BPF_PRELOAD_LINKS; i++) {
1171 		bpf_link_inc(objs[i].link);
1172 		err = bpf_iter_link_pin_kernel(parent,
1173 					       objs[i].link_name, objs[i].link);
1174 		if (err) {
1175 			bpf_link_put(objs[i].link);
1176 			goto out_put;
1177 		}
1178 	}
1179 out_put:
1180 	bpf_preload_mod_put();
1181 out:
1182 	mutex_unlock(&bpf_preload_lock);
1183 	return err;
1184 }
1185 
1186 static int bpf_fill_super(struct super_block *sb, struct fs_context *fc)
1187 {
1188 	struct bpf_mount_opts *opts = sb->s_fs_info;
1189 	struct inode *inode;
1190 
1191 	/* Mounting an instance of BPF FS requires privileges */
1192 	if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN))
1193 		return -EPERM;
1194 
1195 	sb->s_blocksize = PAGE_SIZE;
1196 	sb->s_blocksize_bits = PAGE_SHIFT;
1197 	sb->s_magic = BPF_FS_MAGIC;
1198 	sb->s_op = &bpf_super_ops;
1199 	sb->s_xattr = bpf_fs_xattr_handlers;
1200 	sb->s_iflags |= SB_I_NOEXEC;
1201 	sb->s_iflags |= SB_I_NODEV;
1202 	sb->s_time_gran = 1;
1203 
1204 	inode = bpf_get_inode(sb, NULL, S_IFDIR | 0777);
1205 	if (IS_ERR(inode))
1206 		return PTR_ERR(inode);
1207 
1208 	inode->i_ino = 1;
1209 	inode->i_op = &bpf_dir_iops;
1210 	inode->i_fop = &simple_dir_operations;
1211 	set_nlink(inode, 2);
1212 
1213 	sb->s_root = d_make_root(inode);
1214 	if (!sb->s_root)
1215 		return -ENOMEM;
1216 
1217 	inode = d_inode(sb->s_root);
1218 	inode->i_uid = opts->uid;
1219 	inode->i_gid = opts->gid;
1220 	inode->i_mode &= ~S_IALLUGO;
1221 	populate_bpffs(sb->s_root);
1222 	inode->i_mode |= S_ISVTX | opts->mode;
1223 	return 0;
1224 }
1225 
1226 static int bpf_get_tree(struct fs_context *fc)
1227 {
1228 	return get_tree_nodev(fc, bpf_fill_super);
1229 }
1230 
1231 static void bpf_free_fc(struct fs_context *fc)
1232 {
1233 	kfree(fc->s_fs_info);
1234 }
1235 
1236 static const struct fs_context_operations bpf_context_ops = {
1237 	.free		= bpf_free_fc,
1238 	.parse_param	= bpf_parse_param,
1239 	.get_tree	= bpf_get_tree,
1240 };
1241 
1242 /*
1243  * Set up the filesystem mount context.
1244  */
1245 static int bpf_init_fs_context(struct fs_context *fc)
1246 {
1247 	struct bpf_mount_opts *opts;
1248 
1249 	opts = kzalloc_obj(struct bpf_mount_opts);
1250 	if (!opts)
1251 		return -ENOMEM;
1252 
1253 	opts->mode = S_IRWXUGO;
1254 	opts->uid = current_fsuid();
1255 	opts->gid = current_fsgid();
1256 
1257 	/* start out with no BPF token delegation enabled */
1258 	opts->delegate_cmds = 0;
1259 	opts->delegate_maps = 0;
1260 	opts->delegate_progs = 0;
1261 	opts->delegate_attachs = 0;
1262 
1263 	fc->s_fs_info = opts;
1264 	fc->ops = &bpf_context_ops;
1265 	return 0;
1266 }
1267 
1268 static void bpf_kill_super(struct super_block *sb)
1269 {
1270 	struct bpf_mount_opts *opts = sb->s_fs_info;
1271 
1272 	kill_anon_super(sb);
1273 	simple_xattr_cache_cleanup(&opts->xa_cache);
1274 	kfree(opts);
1275 }
1276 
1277 static struct file_system_type bpf_fs_type = {
1278 	.owner		= THIS_MODULE,
1279 	.name		= "bpf",
1280 	.init_fs_context = bpf_init_fs_context,
1281 	.parameters	= bpf_fs_parameters,
1282 	.kill_sb	= bpf_kill_super,
1283 	.fs_flags	= FS_USERNS_MOUNT,
1284 };
1285 
1286 static void bpf_fs_inode_init_once(void *foo)
1287 {
1288 	struct bpf_fs_inode *bi = foo;
1289 
1290 	inode_init_once(&bi->vfs_inode);
1291 }
1292 
1293 static int __init bpf_init(void)
1294 {
1295 	int ret;
1296 
1297 	bpf_fs_inode_cachep = kmem_cache_create("bpf_fs_inode_cache",
1298 						sizeof(struct bpf_fs_inode),
1299 						0, SLAB_ACCOUNT,
1300 						bpf_fs_inode_init_once);
1301 	if (!bpf_fs_inode_cachep)
1302 		return -ENOMEM;
1303 
1304 	ret = sysfs_create_mount_point(fs_kobj, "bpf");
1305 	if (ret)
1306 		goto out_cache;
1307 
1308 	ret = register_filesystem(&bpf_fs_type);
1309 	if (ret) {
1310 		sysfs_remove_mount_point(fs_kobj, "bpf");
1311 		goto out_cache;
1312 	}
1313 
1314 	return 0;
1315 out_cache:
1316 	kmem_cache_destroy(bpf_fs_inode_cachep);
1317 	return ret;
1318 }
1319 fs_initcall(bpf_init);
1320