xref: /linux/fs/binfmt_misc.c (revision 24c776355f4097316a763005434ffff716aa21a8)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * binfmt_misc.c
4  *
5  * Copyright (C) 1997 Richard Günther
6  *
7  * binfmt_misc detects binaries via a magic or filename extension and invokes
8  * a specified wrapper. See Documentation/admin-guide/binfmt-misc.rst for more details.
9  */
10 
11 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12 
13 #include <linux/kernel.h>
14 #include <linux/module.h>
15 #include <linux/hex.h>
16 #include <linux/init.h>
17 #include <linux/sched/mm.h>
18 #include <linux/magic.h>
19 #include <linux/binfmts.h>
20 #include <linux/slab.h>
21 #include <linux/ctype.h>
22 #include <linux/string_helpers.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/namei.h>
26 #include <linux/mount.h>
27 #include <linux/fs_context.h>
28 #include <linux/syscalls.h>
29 #include <linux/fs.h>
30 #include <linux/uaccess.h>
31 
32 #include "internal.h"
33 
/*
 * USE_DEBUG turns the DEBUG preprocessor switch into a 0/1 constant so that
 * debug-only code (the hex dumps in create_entry()) can be written as an
 * ordinary "if (USE_DEBUG)" instead of an #ifdef block.
 */
#ifdef DEBUG
# define USE_DEBUG 1
#else
# define USE_DEBUG 0
#endif
39 
/*
 * Controls how much entry_status() prints: when non-zero the interpreter,
 * flags and magic/extension details are included, when zero only the
 * "enabled"/"disabled" line is emitted.
 */
enum {
	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
};
43 
/*
 * Bit numbers in Node.flags, used with test_bit()/set_bit()/clear_bit():
 * Enabled - the handler takes part in matching,
 * Magic   - the handler matches on magic bytes rather than on the extension.
 */
enum {Enabled, Magic};
/*
 * Bit masks in Node.flags for the special registration flags; they are set
 * by check_special_flags() for the letters 'P', 'O', 'C' and 'F'.
 */
#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
#define MISC_FMT_OPEN_BINARY (1UL << 30)
#define MISC_FMT_CREDENTIALS (1UL << 29)
#define MISC_FMT_OPEN_FILE (1UL << 28)
49 
/*
 * One registered binary type handler.
 *
 * create_entry() allocates the Node together with a copy of the registration
 * string; the string members below point into that copy.
 */
typedef struct {
	struct list_head list;		/* link in the instance's entries list */
	unsigned long flags;		/* type, status, etc. */
	int offset;			/* offset of magic */
	int size;			/* size of magic/mask */
	char *magic;			/* magic or filename extension */
	char *mask;			/* mask, NULL for exact match */
	const char *interpreter;	/* filename of interpreter */
	char *name;			/* handler name, also the name of its file */
	struct dentry *dentry;		/* dentry of the handler's file, set by add_entry() */
	struct file *interp_file;	/* interpreter opened at registration ('F' flag) */
	refcount_t users;		/* sync removal with load_misc_binary() */
} Node;
63 
/* Forward declaration; the definition is near the end of this file. */
static struct file_system_type bm_fs_type;
65 
/*
 * Max length of the register string.  Determined by:
 *  - 7 delimiters
 *  - name:   ~50 bytes
 *  - type:   1 byte
 *  - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE)
 *  - magic:  128 bytes (512 in escaped form)
 *  - mask:   128 bytes (512 in escaped form)
 *  - interp: ~50 bytes
 *  - flags:  5 bytes
 * Round that up a bit, and then back off to hold the internal data
 * (like struct Node).
 *
 * create_entry() rejects any write longer than this with -EINVAL.
 */
#define MAX_REGISTER_LENGTH 1920
80 
81 /**
82  * search_binfmt_handler - search for a binary handler for @bprm
83  * @misc: handle to binfmt_misc instance
84  * @bprm: binary for which we are looking for a handler
85  *
86  * Search for a binary type handler for @bprm in the list of registered binary
87  * type handlers.
88  *
89  * Return: binary type list entry on success, NULL on failure
90  */
91 static Node *search_binfmt_handler(struct binfmt_misc *misc,
92 				   struct linux_binprm *bprm)
93 {
94 	char *p = strrchr(bprm->interp, '.');
95 	Node *e;
96 
97 	/* Walk all the registered handlers. */
98 	list_for_each_entry(e, &misc->entries, list) {
99 		char *s;
100 		int j;
101 
102 		/* Make sure this one is currently enabled. */
103 		if (!test_bit(Enabled, &e->flags))
104 			continue;
105 
106 		/* Do matching based on extension if applicable. */
107 		if (!test_bit(Magic, &e->flags)) {
108 			if (p && !strcmp(e->magic, p + 1))
109 				return e;
110 			continue;
111 		}
112 
113 		/* Do matching based on magic & mask. */
114 		s = bprm->buf + e->offset;
115 		if (e->mask) {
116 			for (j = 0; j < e->size; j++)
117 				if ((*s++ ^ e->magic[j]) & e->mask[j])
118 					break;
119 		} else {
120 			for (j = 0; j < e->size; j++)
121 				if ((*s++ ^ e->magic[j]))
122 					break;
123 		}
124 		if (j == e->size)
125 			return e;
126 	}
127 
128 	return NULL;
129 }
130 
131 /**
132  * get_binfmt_handler - try to find a binary type handler
133  * @misc: handle to binfmt_misc instance
134  * @bprm: binary for which we are looking for a handler
135  *
136  * Try to find a binfmt handler for the binary type. If one is found take a
137  * reference to protect against removal via bm_{entry,status}_write().
138  *
139  * Return: binary type list entry on success, NULL on failure
140  */
141 static Node *get_binfmt_handler(struct binfmt_misc *misc,
142 				struct linux_binprm *bprm)
143 {
144 	Node *e;
145 
146 	read_lock(&misc->entries_lock);
147 	e = search_binfmt_handler(misc, bprm);
148 	if (e)
149 		refcount_inc(&e->users);
150 	read_unlock(&misc->entries_lock);
151 	return e;
152 }
153 
154 /**
155  * put_binfmt_handler - put binary handler node
156  * @e: node to put
157  *
158  * Free node syncing with load_misc_binary() and defer final free to
159  * load_misc_binary() in case it is using the binary type handler we were
160  * requested to remove.
161  */
162 static void put_binfmt_handler(Node *e)
163 {
164 	if (refcount_dec_and_test(&e->users)) {
165 		if (e->flags & MISC_FMT_OPEN_FILE)
166 			filp_close(e->interp_file, NULL);
167 		kfree(e);
168 	}
169 }
170 
171 /**
172  * load_binfmt_misc - load the binfmt_misc of the caller's user namespace
173  *
174  * To be called in load_misc_binary() to load the relevant struct binfmt_misc.
175  * If a user namespace doesn't have its own binfmt_misc mount it can make use
176  * of its ancestor's binfmt_misc handlers. This mimicks the behavior of
177  * pre-namespaced binfmt_misc where all registered binfmt_misc handlers where
178  * available to all user and user namespaces on the system.
179  *
180  * Return: the binfmt_misc instance of the caller's user namespace
181  */
182 static struct binfmt_misc *load_binfmt_misc(void)
183 {
184 	const struct user_namespace *user_ns;
185 	struct binfmt_misc *misc;
186 
187 	user_ns = current_user_ns();
188 	while (user_ns) {
189 		/* Pairs with smp_store_release() in bm_fill_super(). */
190 		misc = smp_load_acquire(&user_ns->binfmt_misc);
191 		if (misc)
192 			return misc;
193 
194 		user_ns = user_ns->parent;
195 	}
196 
197 	return &init_binfmt_misc;
198 }
199 
200 /*
201  * the loader itself
202  */
203 static int load_misc_binary(struct linux_binprm *bprm)
204 {
205 	Node *fmt;
206 	struct file *interp_file = NULL;
207 	int retval = -ENOEXEC;
208 	struct binfmt_misc *misc;
209 
210 	misc = load_binfmt_misc();
211 	if (!misc->enabled)
212 		return retval;
213 
214 	fmt = get_binfmt_handler(misc, bprm);
215 	if (!fmt)
216 		return retval;
217 
218 	/* Need to be able to load the file after exec */
219 	retval = -ENOENT;
220 	if (bprm->interp_flags & BINPRM_FLAGS_PATH_INACCESSIBLE)
221 		goto ret;
222 
223 	if (fmt->flags & MISC_FMT_PRESERVE_ARGV0) {
224 		bprm->interp_flags |= BINPRM_FLAGS_PRESERVE_ARGV0;
225 	} else {
226 		retval = remove_arg_zero(bprm);
227 		if (retval)
228 			goto ret;
229 	}
230 
231 	if (fmt->flags & MISC_FMT_OPEN_BINARY)
232 		bprm->have_execfd = 1;
233 
234 	/* make argv[1] be the path to the binary */
235 	retval = copy_string_kernel(bprm->interp, bprm);
236 	if (retval < 0)
237 		goto ret;
238 	bprm->argc++;
239 
240 	/* add the interp as argv[0] */
241 	retval = copy_string_kernel(fmt->interpreter, bprm);
242 	if (retval < 0)
243 		goto ret;
244 	bprm->argc++;
245 
246 	/* Update interp in case binfmt_script needs it. */
247 	retval = bprm_change_interp(fmt->interpreter, bprm);
248 	if (retval < 0)
249 		goto ret;
250 
251 	if (fmt->flags & MISC_FMT_OPEN_FILE) {
252 		interp_file = file_clone_open(fmt->interp_file);
253 		if (!IS_ERR(interp_file))
254 			deny_write_access(interp_file);
255 	} else {
256 		interp_file = open_exec(fmt->interpreter);
257 	}
258 	retval = PTR_ERR(interp_file);
259 	if (IS_ERR(interp_file))
260 		goto ret;
261 
262 	bprm->interpreter = interp_file;
263 	if (fmt->flags & MISC_FMT_CREDENTIALS)
264 		bprm->execfd_creds = 1;
265 
266 	retval = 0;
267 ret:
268 
269 	/*
270 	 * If we actually put the node here all concurrent calls to
271 	 * load_misc_binary() will have finished. We also know
272 	 * that for the refcount to be zero someone must have concurently
273 	 * removed the binary type handler from the list and it's our job to
274 	 * free it.
275 	 */
276 	put_binfmt_handler(fmt);
277 
278 	return retval;
279 }
280 
281 /* Command parsers */
282 
/*
 * Scan one argument that starts at @s and is terminated by the delimiter
 * @del, validating "\x" escapes on the way: each "\x" must be followed by
 * two hexadecimal digits.  Nothing is decoded or copied here; the delimiter
 * is overwritten with a NUL so the argument becomes a C string in place.
 *
 * Returns a pointer to the character following the (former) delimiter, or
 * NULL if a "\x" escape is malformed.  The caller must guarantee that @del
 * occurs in the buffer.
 */
static char *scanarg(char *s, char del)
{
	for (;;) {
		char c = *s++;

		if (c == del)
			break;
		if (c != '\\' || *s != 'x')
			continue;

		/* Skip the 'x' and insist on exactly two hex digits. */
		s++;
		if (!isxdigit(*s++))
			return NULL;
		if (!isxdigit(*s++))
			return NULL;
	}

	/* Terminate the argument where the delimiter was. */
	s[-1] = '\0';
	return s;
}
305 
306 static char *check_special_flags(char *sfs, Node *e)
307 {
308 	char *p = sfs;
309 	int cont = 1;
310 
311 	/* special flags */
312 	while (cont) {
313 		switch (*p) {
314 		case 'P':
315 			pr_debug("register: flag: P (preserve argv0)\n");
316 			p++;
317 			e->flags |= MISC_FMT_PRESERVE_ARGV0;
318 			break;
319 		case 'O':
320 			pr_debug("register: flag: O (open binary)\n");
321 			p++;
322 			e->flags |= MISC_FMT_OPEN_BINARY;
323 			break;
324 		case 'C':
325 			pr_debug("register: flag: C (preserve creds)\n");
326 			p++;
327 			/* this flags also implies the
328 			   open-binary flag */
329 			e->flags |= (MISC_FMT_CREDENTIALS |
330 					MISC_FMT_OPEN_BINARY);
331 			break;
332 		case 'F':
333 			pr_debug("register: flag: F: open interpreter file now\n");
334 			p++;
335 			e->flags |= MISC_FMT_OPEN_FILE;
336 			break;
337 		default:
338 			cont = 0;
339 		}
340 	}
341 
342 	return p;
343 }
344 
345 /*
346  * This registers a new binary format, it recognises the syntax
347  * ':name:type:offset:magic:mask:interpreter:flags'
348  * where the ':' is the IFS, that can be chosen with the first char
349  */
350 static Node *create_entry(const char __user *buffer, size_t count)
351 {
352 	Node *e;
353 	int memsize, err;
354 	char *buf, *p;
355 	char del;
356 
357 	pr_debug("register: received %zu bytes\n", count);
358 
359 	/* some sanity checks */
360 	err = -EINVAL;
361 	if ((count < 11) || (count > MAX_REGISTER_LENGTH))
362 		goto out;
363 
364 	err = -ENOMEM;
365 	memsize = sizeof(Node) + count + 8;
366 	e = kmalloc(memsize, GFP_KERNEL_ACCOUNT);
367 	if (!e)
368 		goto out;
369 
370 	p = buf = (char *)e + sizeof(Node);
371 
372 	memset(e, 0, sizeof(Node));
373 	if (copy_from_user(buf, buffer, count))
374 		goto efault;
375 
376 	del = *p++;	/* delimeter */
377 
378 	pr_debug("register: delim: %#x {%c}\n", del, del);
379 
380 	/* Pad the buffer with the delim to simplify parsing below. */
381 	memset(buf + count, del, 8);
382 
383 	/* Parse the 'name' field. */
384 	e->name = p;
385 	p = strchr(p, del);
386 	if (!p)
387 		goto einval;
388 	*p++ = '\0';
389 	if (!e->name[0] ||
390 	    !strcmp(e->name, ".") ||
391 	    !strcmp(e->name, "..") ||
392 	    strchr(e->name, '/'))
393 		goto einval;
394 
395 	pr_debug("register: name: {%s}\n", e->name);
396 
397 	/* Parse the 'type' field. */
398 	switch (*p++) {
399 	case 'E':
400 		pr_debug("register: type: E (extension)\n");
401 		e->flags = 1 << Enabled;
402 		break;
403 	case 'M':
404 		pr_debug("register: type: M (magic)\n");
405 		e->flags = (1 << Enabled) | (1 << Magic);
406 		break;
407 	default:
408 		goto einval;
409 	}
410 	if (*p++ != del)
411 		goto einval;
412 
413 	if (test_bit(Magic, &e->flags)) {
414 		/* Handle the 'M' (magic) format. */
415 		char *s;
416 
417 		/* Parse the 'offset' field. */
418 		s = strchr(p, del);
419 		if (!s)
420 			goto einval;
421 		*s = '\0';
422 		if (p != s) {
423 			int r = kstrtoint(p, 10, &e->offset);
424 			if (r != 0 || e->offset < 0)
425 				goto einval;
426 		}
427 		p = s;
428 		if (*p++)
429 			goto einval;
430 		pr_debug("register: offset: %#x\n", e->offset);
431 
432 		/* Parse the 'magic' field. */
433 		e->magic = p;
434 		p = scanarg(p, del);
435 		if (!p)
436 			goto einval;
437 		if (!e->magic[0])
438 			goto einval;
439 		if (USE_DEBUG)
440 			print_hex_dump_bytes(
441 				KBUILD_MODNAME ": register: magic[raw]: ",
442 				DUMP_PREFIX_NONE, e->magic, p - e->magic);
443 
444 		/* Parse the 'mask' field. */
445 		e->mask = p;
446 		p = scanarg(p, del);
447 		if (!p)
448 			goto einval;
449 		if (!e->mask[0]) {
450 			e->mask = NULL;
451 			pr_debug("register:  mask[raw]: none\n");
452 		} else if (USE_DEBUG)
453 			print_hex_dump_bytes(
454 				KBUILD_MODNAME ": register:  mask[raw]: ",
455 				DUMP_PREFIX_NONE, e->mask, p - e->mask);
456 
457 		/*
458 		 * Decode the magic & mask fields.
459 		 * Note: while we might have accepted embedded NUL bytes from
460 		 * above, the unescape helpers here will stop at the first one
461 		 * it encounters.
462 		 */
463 		e->size = string_unescape_inplace(e->magic, UNESCAPE_HEX);
464 		if (e->mask &&
465 		    string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size)
466 			goto einval;
467 		if (e->size > BINPRM_BUF_SIZE ||
468 		    BINPRM_BUF_SIZE - e->size < e->offset)
469 			goto einval;
470 		pr_debug("register: magic/mask length: %i\n", e->size);
471 		if (USE_DEBUG) {
472 			print_hex_dump_bytes(
473 				KBUILD_MODNAME ": register: magic[decoded]: ",
474 				DUMP_PREFIX_NONE, e->magic, e->size);
475 
476 			if (e->mask) {
477 				int i;
478 				char *masked = kmalloc(e->size, GFP_KERNEL_ACCOUNT);
479 
480 				print_hex_dump_bytes(
481 					KBUILD_MODNAME ": register:  mask[decoded]: ",
482 					DUMP_PREFIX_NONE, e->mask, e->size);
483 
484 				if (masked) {
485 					for (i = 0; i < e->size; ++i)
486 						masked[i] = e->magic[i] & e->mask[i];
487 					print_hex_dump_bytes(
488 						KBUILD_MODNAME ": register:  magic[masked]: ",
489 						DUMP_PREFIX_NONE, masked, e->size);
490 
491 					kfree(masked);
492 				}
493 			}
494 		}
495 	} else {
496 		/* Handle the 'E' (extension) format. */
497 
498 		/* Skip the 'offset' field. */
499 		p = strchr(p, del);
500 		if (!p)
501 			goto einval;
502 		*p++ = '\0';
503 
504 		/* Parse the 'magic' field. */
505 		e->magic = p;
506 		p = strchr(p, del);
507 		if (!p)
508 			goto einval;
509 		*p++ = '\0';
510 		if (!e->magic[0] || strchr(e->magic, '/'))
511 			goto einval;
512 		pr_debug("register: extension: {%s}\n", e->magic);
513 
514 		/* Skip the 'mask' field. */
515 		p = strchr(p, del);
516 		if (!p)
517 			goto einval;
518 		*p++ = '\0';
519 	}
520 
521 	/* Parse the 'interpreter' field. */
522 	e->interpreter = p;
523 	p = strchr(p, del);
524 	if (!p)
525 		goto einval;
526 	*p++ = '\0';
527 	if (!e->interpreter[0])
528 		goto einval;
529 	pr_debug("register: interpreter: {%s}\n", e->interpreter);
530 
531 	/* Parse the 'flags' field. */
532 	p = check_special_flags(p, e);
533 	if (*p == '\n')
534 		p++;
535 	if (p != buf + count)
536 		goto einval;
537 
538 	return e;
539 
540 out:
541 	return ERR_PTR(err);
542 
543 efault:
544 	kfree(e);
545 	return ERR_PTR(-EFAULT);
546 einval:
547 	kfree(e);
548 	return ERR_PTR(-EINVAL);
549 }
550 
551 /*
552  * Set status of entry/binfmt_misc:
553  * '1' enables, '0' disables and '-1' clears entry/binfmt_misc
554  */
555 static int parse_command(const char __user *buffer, size_t count)
556 {
557 	char s[4];
558 
559 	if (count > 3)
560 		return -EINVAL;
561 	if (copy_from_user(s, buffer, count))
562 		return -EFAULT;
563 	if (!count)
564 		return 0;
565 	if (s[count - 1] == '\n')
566 		count--;
567 	if (count == 1 && s[0] == '0')
568 		return 1;
569 	if (count == 1 && s[0] == '1')
570 		return 2;
571 	if (count == 2 && s[0] == '-' && s[1] == '1')
572 		return 3;
573 	return -EINVAL;
574 }
575 
576 /* generic stuff */
577 
578 static void entry_status(Node *e, char *page)
579 {
580 	char *dp = page;
581 	const char *status = "disabled";
582 
583 	if (test_bit(Enabled, &e->flags))
584 		status = "enabled";
585 
586 	if (!VERBOSE_STATUS) {
587 		sprintf(page, "%s\n", status);
588 		return;
589 	}
590 
591 	dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter);
592 
593 	/* print the special flags */
594 	dp += sprintf(dp, "flags: ");
595 	if (e->flags & MISC_FMT_PRESERVE_ARGV0)
596 		*dp++ = 'P';
597 	if (e->flags & MISC_FMT_OPEN_BINARY)
598 		*dp++ = 'O';
599 	if (e->flags & MISC_FMT_CREDENTIALS)
600 		*dp++ = 'C';
601 	if (e->flags & MISC_FMT_OPEN_FILE)
602 		*dp++ = 'F';
603 	*dp++ = '\n';
604 
605 	if (!test_bit(Magic, &e->flags)) {
606 		sprintf(dp, "extension .%s\n", e->magic);
607 	} else {
608 		dp += sprintf(dp, "offset %i\nmagic ", e->offset);
609 		dp = bin2hex(dp, e->magic, e->size);
610 		if (e->mask) {
611 			dp += sprintf(dp, "\nmask ");
612 			dp = bin2hex(dp, e->mask, e->size);
613 		}
614 		*dp++ = '\n';
615 		*dp = '\0';
616 	}
617 }
618 
619 static struct inode *bm_get_inode(struct super_block *sb, int mode)
620 {
621 	struct inode *inode = new_inode(sb);
622 
623 	if (inode) {
624 		inode->i_ino = get_next_ino();
625 		inode->i_mode = mode;
626 		simple_inode_init_ts(inode);
627 	}
628 	return inode;
629 }
630 
631 /**
632  * i_binfmt_misc - retrieve struct binfmt_misc from a binfmt_misc inode
633  * @inode: inode of the relevant binfmt_misc instance
634  *
635  * This helper retrieves struct binfmt_misc from a binfmt_misc inode. This can
636  * be done without any memory barriers because we are guaranteed that
637  * user_ns->binfmt_misc is fully initialized. It was fully initialized when the
638  * binfmt_misc mount was first created.
639  *
640  * Return: struct binfmt_misc of the relevant binfmt_misc instance
641  */
642 static struct binfmt_misc *i_binfmt_misc(struct inode *inode)
643 {
644 	return inode->i_sb->s_user_ns->binfmt_misc;
645 }
646 
647 /**
648  * bm_evict_inode - cleanup data associated with @inode
649  * @inode: inode to which the data is attached
650  *
651  * Cleanup the binary type handler data associated with @inode if a binary type
652  * entry is removed or the filesystem is unmounted and the super block is
653  * shutdown.
654  *
655  * If the ->evict call was not caused by a super block shutdown but by a write
656  * to remove the entry or all entries via bm_{entry,status}_write() the entry
657  * will have already been removed from the list. We keep the list_empty() check
658  * to make that explicit.
659 */
660 static void bm_evict_inode(struct inode *inode)
661 {
662 	Node *e = inode->i_private;
663 
664 	clear_inode(inode);
665 
666 	if (e) {
667 		struct binfmt_misc *misc;
668 
669 		misc = i_binfmt_misc(inode);
670 		write_lock(&misc->entries_lock);
671 		if (!list_empty(&e->list))
672 			list_del_init(&e->list);
673 		write_unlock(&misc->entries_lock);
674 		put_binfmt_handler(e);
675 	}
676 }
677 
678 /**
679  * remove_binfmt_handler - remove a binary type handler
680  * @misc: handle to binfmt_misc instance
681  * @e: binary type handler to remove
682  *
683  * Remove a binary type handler from the list of binary type handlers and
684  * remove its associated dentry. This is called from
685  * binfmt_{entry,status}_write(). In the future, we might want to think about
686  * adding a proper ->unlink() method to binfmt_misc instead of forcing caller's
687  * to use writes to files in order to delete binary type handlers. But it has
688  * worked for so long that it's not a pressing issue.
689  */
690 static void remove_binfmt_handler(struct binfmt_misc *misc, Node *e)
691 {
692 	write_lock(&misc->entries_lock);
693 	list_del_init(&e->list);
694 	write_unlock(&misc->entries_lock);
695 	locked_recursive_removal(e->dentry, NULL);
696 }
697 
698 /* /<entry> */
699 
700 static ssize_t
701 bm_entry_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
702 {
703 	Node *e = file_inode(file)->i_private;
704 	ssize_t res;
705 	char *page;
706 
707 	page = (char *) __get_free_page(GFP_KERNEL);
708 	if (!page)
709 		return -ENOMEM;
710 
711 	entry_status(e, page);
712 
713 	res = simple_read_from_buffer(buf, nbytes, ppos, page, strlen(page));
714 
715 	free_page((unsigned long) page);
716 	return res;
717 }
718 
719 static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
720 				size_t count, loff_t *ppos)
721 {
722 	struct inode *inode = file_inode(file);
723 	Node *e = inode->i_private;
724 	int res = parse_command(buffer, count);
725 
726 	switch (res) {
727 	case 1:
728 		/* Disable this handler. */
729 		clear_bit(Enabled, &e->flags);
730 		break;
731 	case 2:
732 		/* Enable this handler. */
733 		set_bit(Enabled, &e->flags);
734 		break;
735 	case 3:
736 		/* Delete this handler. */
737 		inode = d_inode(inode->i_sb->s_root);
738 		inode_lock_nested(inode, I_MUTEX_PARENT);
739 
740 		/*
741 		 * In order to add new element or remove elements from the list
742 		 * via bm_{entry,register,status}_write() inode_lock() on the
743 		 * root inode must be held.
744 		 * The lock is exclusive ensuring that the list can't be
745 		 * modified. Only load_misc_binary() can access but does so
746 		 * read-only. So we only need to take the write lock when we
747 		 * actually remove the entry from the list.
748 		 */
749 		if (!list_empty(&e->list))
750 			remove_binfmt_handler(i_binfmt_misc(inode), e);
751 
752 		inode_unlock(inode);
753 		break;
754 	default:
755 		return res;
756 	}
757 
758 	return count;
759 }
760 
761 static const struct file_operations bm_entry_operations = {
762 	.read		= bm_entry_read,
763 	.write		= bm_entry_write,
764 	.llseek		= default_llseek,
765 };
766 
767 /* /register */
768 
769 /* add to filesystem */
770 static int add_entry(Node *e, struct super_block *sb)
771 {
772 	struct dentry *dentry = simple_start_creating(sb->s_root, e->name);
773 	struct inode *inode;
774 	struct binfmt_misc *misc;
775 
776 	if (IS_ERR(dentry))
777 		return PTR_ERR(dentry);
778 
779 	inode = bm_get_inode(sb, S_IFREG | 0644);
780 	if (unlikely(!inode)) {
781 		simple_done_creating(dentry);
782 		return -ENOMEM;
783 	}
784 
785 	refcount_set(&e->users, 1);
786 	e->dentry = dentry;
787 	inode->i_private = e;
788 	inode->i_fop = &bm_entry_operations;
789 
790 	d_make_persistent(dentry, inode);
791 	misc = i_binfmt_misc(inode);
792 	write_lock(&misc->entries_lock);
793 	list_add(&e->list, &misc->entries);
794 	write_unlock(&misc->entries_lock);
795 	simple_done_creating(dentry);
796 	return 0;
797 }
798 
799 static ssize_t bm_register_write(struct file *file, const char __user *buffer,
800 			       size_t count, loff_t *ppos)
801 {
802 	Node *e;
803 	struct super_block *sb = file_inode(file)->i_sb;
804 	int err = 0;
805 	struct file *f = NULL;
806 
807 	e = create_entry(buffer, count);
808 
809 	if (IS_ERR(e))
810 		return PTR_ERR(e);
811 
812 	if (e->flags & MISC_FMT_OPEN_FILE) {
813 		/*
814 		 * Now that we support unprivileged binfmt_misc mounts make
815 		 * sure we use the credentials that the register @file was
816 		 * opened with to also open the interpreter. Before that this
817 		 * didn't matter much as only a privileged process could open
818 		 * the register file.
819 		 */
820 		scoped_with_creds(file->f_cred)
821 			f = open_exec(e->interpreter);
822 		if (IS_ERR(f)) {
823 			pr_notice("register: failed to install interpreter file %s\n",
824 				 e->interpreter);
825 			kfree(e);
826 			return PTR_ERR(f);
827 		}
828 		e->interp_file = f;
829 	}
830 
831 	err = add_entry(e, sb);
832 	if (err) {
833 		if (f) {
834 			exe_file_allow_write_access(f);
835 			filp_close(f, NULL);
836 		}
837 		kfree(e);
838 		return err;
839 	}
840 	return count;
841 }
842 
843 static const struct file_operations bm_register_operations = {
844 	.write		= bm_register_write,
845 	.llseek		= noop_llseek,
846 };
847 
848 /* /status */
849 
850 static ssize_t
851 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
852 {
853 	struct binfmt_misc *misc;
854 	char *s;
855 
856 	misc = i_binfmt_misc(file_inode(file));
857 	s = misc->enabled ? "enabled\n" : "disabled\n";
858 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
859 }
860 
861 static ssize_t bm_status_write(struct file *file, const char __user *buffer,
862 		size_t count, loff_t *ppos)
863 {
864 	struct binfmt_misc *misc;
865 	int res = parse_command(buffer, count);
866 	Node *e, *next;
867 	struct inode *inode;
868 
869 	misc = i_binfmt_misc(file_inode(file));
870 	switch (res) {
871 	case 1:
872 		/* Disable all handlers. */
873 		misc->enabled = false;
874 		break;
875 	case 2:
876 		/* Enable all handlers. */
877 		misc->enabled = true;
878 		break;
879 	case 3:
880 		/* Delete all handlers. */
881 		inode = d_inode(file_inode(file)->i_sb->s_root);
882 		inode_lock_nested(inode, I_MUTEX_PARENT);
883 
884 		/*
885 		 * In order to add new element or remove elements from the list
886 		 * via bm_{entry,register,status}_write() inode_lock() on the
887 		 * root inode must be held.
888 		 * The lock is exclusive ensuring that the list can't be
889 		 * modified. Only load_misc_binary() can access but does so
890 		 * read-only. So we only need to take the write lock when we
891 		 * actually remove the entry from the list.
892 		 */
893 		list_for_each_entry_safe(e, next, &misc->entries, list)
894 			remove_binfmt_handler(misc, e);
895 
896 		inode_unlock(inode);
897 		break;
898 	default:
899 		return res;
900 	}
901 
902 	return count;
903 }
904 
905 static const struct file_operations bm_status_operations = {
906 	.read		= bm_status_read,
907 	.write		= bm_status_write,
908 	.llseek		= default_llseek,
909 };
910 
911 /* Superblock handling */
912 
913 static void bm_put_super(struct super_block *sb)
914 {
915 	struct user_namespace *user_ns = sb->s_fs_info;
916 
917 	sb->s_fs_info = NULL;
918 	put_user_ns(user_ns);
919 }
920 
921 static const struct super_operations s_ops = {
922 	.statfs		= simple_statfs,
923 	.evict_inode	= bm_evict_inode,
924 	.put_super	= bm_put_super,
925 };
926 
927 static int bm_fill_super(struct super_block *sb, struct fs_context *fc)
928 {
929 	int err;
930 	struct user_namespace *user_ns = sb->s_user_ns;
931 	struct binfmt_misc *misc;
932 	static const struct tree_descr bm_files[] = {
933 		[2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO},
934 		[3] = {"register", &bm_register_operations, S_IWUSR},
935 		/* last one */ {""}
936 	};
937 
938 	if (WARN_ON(user_ns != current_user_ns()))
939 		return -EINVAL;
940 
941 	/*
942 	 * Lazily allocate a new binfmt_misc instance for this namespace, i.e.
943 	 * do it here during the first mount of binfmt_misc. We don't need to
944 	 * waste memory for every user namespace allocation. It's likely much
945 	 * more common to not mount a separate binfmt_misc instance than it is
946 	 * to mount one.
947 	 *
948 	 * While multiple superblocks can exist they are keyed by userns in
949 	 * s_fs_info for binfmt_misc. Hence, the vfs guarantees that
950 	 * bm_fill_super() is called exactly once whenever a binfmt_misc
951 	 * superblock for a userns is created. This in turn lets us conclude
952 	 * that when a binfmt_misc superblock is created for the first time for
953 	 * a userns there's no one racing us. Therefore we don't need any
954 	 * barriers when we dereference binfmt_misc.
955 	 */
956 	misc = user_ns->binfmt_misc;
957 	if (!misc) {
958 		/*
959 		 * If it turns out that most user namespaces actually want to
960 		 * register their own binary type handler and therefore all
961 		 * create their own separate binfmt_misc mounts we should
962 		 * consider turning this into a kmem cache.
963 		 */
964 		misc = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
965 		if (!misc)
966 			return -ENOMEM;
967 
968 		INIT_LIST_HEAD(&misc->entries);
969 		rwlock_init(&misc->entries_lock);
970 
971 		/* Pairs with smp_load_acquire() in load_binfmt_misc(). */
972 		smp_store_release(&user_ns->binfmt_misc, misc);
973 	}
974 
975 	/*
976 	 * When the binfmt_misc superblock for this userns is shutdown
977 	 * ->enabled might have been set to false and we don't reinitialize
978 	 * ->enabled again in put_super() as someone might already be mounting
979 	 * binfmt_misc again. It also would be pointless since by the time
980 	 * ->put_super() is called we know that the binary type list for this
981 	 * bintfmt_misc mount is empty making load_misc_binary() return
982 	 * -ENOEXEC independent of whether ->enabled is true. Instead, if
983 	 * someone mounts binfmt_misc for the first time or again we simply
984 	 * reset ->enabled to true.
985 	 */
986 	misc->enabled = true;
987 
988 	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
989 	if (!err)
990 		sb->s_op = &s_ops;
991 	return err;
992 }
993 
994 static void bm_free(struct fs_context *fc)
995 {
996 	if (fc->s_fs_info)
997 		put_user_ns(fc->s_fs_info);
998 }
999 
1000 static int bm_get_tree(struct fs_context *fc)
1001 {
1002 	return get_tree_keyed(fc, bm_fill_super, get_user_ns(fc->user_ns));
1003 }
1004 
1005 static const struct fs_context_operations bm_context_ops = {
1006 	.free		= bm_free,
1007 	.get_tree	= bm_get_tree,
1008 };
1009 
/*
 * ->init_fs_context() of the binfmt_misc filesystem type: install the
 * context operations.  Always returns 0.
 */
static int bm_init_fs_context(struct fs_context *fc)
{
	fc->ops = &bm_context_ops;
	return 0;
}
1015 
1016 static struct linux_binfmt misc_format = {
1017 	.module = THIS_MODULE,
1018 	.load_binary = load_misc_binary,
1019 };
1020 
1021 static struct file_system_type bm_fs_type = {
1022 	.owner		= THIS_MODULE,
1023 	.name		= "binfmt_misc",
1024 	.init_fs_context = bm_init_fs_context,
1025 	.fs_flags	= FS_USERNS_MOUNT,
1026 	.kill_sb	= kill_anon_super,
1027 };
1028 MODULE_ALIAS_FS("binfmt_misc");
1029 
1030 static int __init init_misc_binfmt(void)
1031 {
1032 	int err = register_filesystem(&bm_fs_type);
1033 	if (!err)
1034 		insert_binfmt(&misc_format);
1035 	return err;
1036 }
1037 
/*
 * Module exit: undo init_misc_binfmt() in reverse order, i.e. unregister the
 * binary format first and the filesystem type second.
 */
static void __exit exit_misc_binfmt(void)
{
	unregister_binfmt(&misc_format);
	unregister_filesystem(&bm_fs_type);
}
1043 
/* Hook the init/exit functions up and describe the module. */
core_initcall(init_misc_binfmt);
module_exit(exit_misc_binfmt);
MODULE_DESCRIPTION("Kernel support for miscellaneous binaries");
MODULE_LICENSE("GPL");
1048