xref: /linux/tools/testing/selftests/filesystems/empty_mntns/clone3_empty_mntns_test.c (revision 7c8a4671dc3247a26a702e5f5996e9f453d7070d)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Tests for empty mount namespace creation via clone3() CLONE_EMPTY_MNTNS
4  *
5  * These tests exercise the clone3() code path for creating empty mount
6  * namespaces, which is distinct from the unshare() path tested in
7  * empty_mntns_test.c.  With clone3(), CLONE_EMPTY_MNTNS (0x2000000000ULL)
8  * is a 64-bit flag that implies CLONE_NEWNS.  The implication happens in
9  * kernel_clone() before copy_process(), unlike unshare() where it goes
10  * through UNSHARE_EMPTY_MNTNS -> CLONE_EMPTY_MNTNS conversion in
11  * unshare_nsproxy_namespaces().
12  *
13  * Copyright (c) 2024 Christian Brauner <brauner@kernel.org>
14  */
15 
16 #define _GNU_SOURCE
17 #include <fcntl.h>
18 #include <linux/mount.h>
19 #include <linux/stat.h>
20 #include <stdio.h>
21 #include <string.h>
22 #include <sys/mount.h>
23 #include <sys/stat.h>
24 #include <sys/types.h>
25 #include <unistd.h>
26 
27 #include "../utils.h"
28 #include "../wrappers.h"
29 #include "clone3/clone3_selftests.h"
30 #include "empty_mntns.h"
31 #include "kselftest_harness.h"
32 
33 static pid_t clone3_empty_mntns(uint64_t extra_flags)
34 {
35 	struct __clone_args args = {
36 		.flags		= CLONE_EMPTY_MNTNS | extra_flags,
37 		.exit_signal	= SIGCHLD,
38 	};
39 
40 	return sys_clone3(&args, sizeof(args));
41 }
42 
/*
 * Probe whether clone3() with CLONE_EMPTY_MNTNS works on this kernel.
 *
 * The probe runs in a forked helper so that entering a user namespace
 * does not affect the caller.  Returns true only when the helper exited
 * normally with status 0, i.e. enter_userns() and the clone both
 * succeeded and the cloned child was reaped with status 0.
 */
static bool clone3_empty_mntns_supported(void)
{
	pid_t helper;
	int wstatus;

	helper = fork();
	if (helper < 0)
		return false;

	if (helper == 0) {
		pid_t probe;

		if (enter_userns())
			_exit(1);

		probe = clone3_empty_mntns(0);
		if (probe < 0)
			_exit(1);

		/* The cloned child has nothing to do but report success. */
		if (probe == 0)
			_exit(0);

		_exit(wait_for_pid(probe) ? 1 : 0);
	}

	if (waitpid(helper, &wstatus, 0) != helper)
		return false;

	return WIFEXITED(wstatus) && WEXITSTATUS(wstatus) == 0;
}
74 
75 FIXTURE(clone3_empty_mntns) {};
76 
77 FIXTURE_SETUP(clone3_empty_mntns)
78 {
79 	if (!clone3_empty_mntns_supported())
80 		SKIP(return, "CLONE_EMPTY_MNTNS via clone3 not supported");
81 }
82 
83 FIXTURE_TEARDOWN(clone3_empty_mntns) {}
84 
85 /*
86  * Basic clone3() with CLONE_EMPTY_MNTNS: child gets empty mount namespace
87  * with exactly 1 mount and root == cwd.
88  */
89 TEST_F(clone3_empty_mntns, basic)
90 {
91 	pid_t pid, inner;
92 
93 	pid = fork();
94 	ASSERT_GE(pid, 0);
95 
96 	if (pid == 0) {
97 		if (enter_userns())
98 			_exit(1);
99 
100 		inner = clone3_empty_mntns(0);
101 		if (inner < 0)
102 			_exit(2);
103 
104 		if (inner == 0) {
105 			uint64_t root_id, cwd_id;
106 
107 			if (count_mounts() != 1)
108 				_exit(3);
109 
110 			root_id = get_unique_mnt_id("/");
111 			cwd_id = get_unique_mnt_id(".");
112 			if (root_id == 0 || cwd_id == 0)
113 				_exit(4);
114 
115 			if (root_id != cwd_id)
116 				_exit(5);
117 
118 			_exit(0);
119 		}
120 
121 		_exit(wait_for_pid(inner));
122 	}
123 
124 	ASSERT_EQ(wait_for_pid(pid), 0);
125 }
126 
127 /*
128  * CLONE_EMPTY_MNTNS implies CLONE_NEWNS.  Verify that it works without
129  * explicitly setting CLONE_NEWNS (tests fork.c:2627-2630).
130  */
131 TEST_F(clone3_empty_mntns, implies_newns)
132 {
133 	pid_t pid, inner;
134 
135 	pid = fork();
136 	ASSERT_GE(pid, 0);
137 
138 	if (pid == 0) {
139 		ssize_t parent_mounts;
140 
141 		if (enter_userns())
142 			_exit(1);
143 
144 		/* Verify we have mounts in our current namespace. */
145 		parent_mounts = count_mounts();
146 		if (parent_mounts < 1)
147 			_exit(2);
148 
149 		/* Only CLONE_EMPTY_MNTNS, no explicit CLONE_NEWNS. */
150 		inner = clone3_empty_mntns(0);
151 		if (inner < 0)
152 			_exit(3);
153 
154 		if (inner == 0) {
155 			if (count_mounts() != 1)
156 				_exit(4);
157 
158 			_exit(0);
159 		}
160 
161 		/* Parent still has its mounts. */
162 		if (count_mounts() != parent_mounts)
163 			_exit(5);
164 
165 		_exit(wait_for_pid(inner));
166 	}
167 
168 	ASSERT_EQ(wait_for_pid(pid), 0);
169 }
170 
171 /*
172  * Helper macro: generate a test that clones with CLONE_EMPTY_MNTNS |
173  * @extra_flags and verifies the child has exactly one mount.
174  */
175 #define TEST_CLONE3_FLAGS(test_name, extra_flags)			\
176 TEST_F(clone3_empty_mntns, test_name)					\
177 {									\
178 	pid_t pid, inner;						\
179 									\
180 	pid = fork();							\
181 	ASSERT_GE(pid, 0);						\
182 									\
183 	if (pid == 0) {							\
184 		if (enter_userns())					\
185 			_exit(1);					\
186 									\
187 		inner = clone3_empty_mntns(extra_flags);		\
188 		if (inner < 0)						\
189 			_exit(2);					\
190 									\
191 		if (inner == 0) {					\
192 			if (count_mounts() != 1)			\
193 				_exit(3);				\
194 			_exit(0);					\
195 		}							\
196 									\
197 		_exit(wait_for_pid(inner));				\
198 	}								\
199 									\
200 	ASSERT_EQ(wait_for_pid(pid), 0);				\
201 }
202 
203 /* Redundant CLONE_NEWNS | CLONE_EMPTY_MNTNS should succeed. */
204 TEST_CLONE3_FLAGS(with_explicit_newns, CLONE_NEWNS)
205 
206 /* CLONE_EMPTY_MNTNS combined with CLONE_NEWUSER. */
207 TEST_CLONE3_FLAGS(with_newuser, CLONE_NEWUSER)
208 
209 /* CLONE_EMPTY_MNTNS combined with other namespace flags. */
210 TEST_CLONE3_FLAGS(with_other_ns_flags, CLONE_NEWUTS | CLONE_NEWIPC)
211 
212 /*
213  * CLONE_EMPTY_MNTNS combined with CLONE_NEWPID.
214  */
215 TEST_F(clone3_empty_mntns, with_newpid)
216 {
217 	pid_t pid, inner;
218 
219 	pid = fork();
220 	ASSERT_GE(pid, 0);
221 
222 	if (pid == 0) {
223 		if (enter_userns())
224 			_exit(1);
225 
226 		inner = clone3_empty_mntns(CLONE_NEWPID);
227 		if (inner < 0)
228 			_exit(2);
229 
230 		if (inner == 0) {
231 			if (count_mounts() != 1)
232 				_exit(3);
233 
234 			/* In a new PID namespace, getpid() returns 1. */
235 			if (getpid() != 1)
236 				_exit(4);
237 
238 			_exit(0);
239 		}
240 
241 		_exit(wait_for_pid(inner));
242 	}
243 
244 	ASSERT_EQ(wait_for_pid(pid), 0);
245 }
246 
247 /*
248  * CLONE_EMPTY_MNTNS | CLONE_FS must fail because the implied CLONE_NEWNS
249  * and CLONE_FS are mutually exclusive (fork.c:1981).
250  */
251 TEST_F(clone3_empty_mntns, with_clone_fs_fails)
252 {
253 	pid_t pid;
254 
255 	pid = fork();
256 	ASSERT_GE(pid, 0);
257 
258 	if (pid == 0) {
259 		struct __clone_args args = {
260 			.flags		= CLONE_EMPTY_MNTNS | CLONE_FS,
261 			.exit_signal	= SIGCHLD,
262 		};
263 		pid_t ret;
264 
265 		if (enter_userns())
266 			_exit(1);
267 
268 		ret = sys_clone3(&args, sizeof(args));
269 		if (ret >= 0) {
270 			if (ret == 0)
271 				_exit(0);
272 			wait_for_pid(ret);
273 			_exit(2);
274 		}
275 
276 		if (errno != EINVAL)
277 			_exit(3);
278 
279 		_exit(0);
280 	}
281 
282 	ASSERT_EQ(wait_for_pid(pid), 0);
283 }
284 
285 /*
286  * CLONE_EMPTY_MNTNS combined with CLONE_PIDFD returns a valid pidfd.
287  */
288 TEST_F(clone3_empty_mntns, with_pidfd)
289 {
290 	pid_t pid;
291 
292 	pid = fork();
293 	ASSERT_GE(pid, 0);
294 
295 	if (pid == 0) {
296 		struct __clone_args args = {
297 			.flags		= CLONE_EMPTY_MNTNS | CLONE_PIDFD,
298 			.exit_signal	= SIGCHLD,
299 		};
300 		int pidfd = -1;
301 		pid_t inner;
302 
303 		if (enter_userns())
304 			_exit(1);
305 
306 		args.pidfd = (uintptr_t)&pidfd;
307 
308 		inner = sys_clone3(&args, sizeof(args));
309 		if (inner < 0)
310 			_exit(2);
311 
312 		if (inner == 0) {
313 			if (count_mounts() != 1)
314 				_exit(3);
315 
316 			_exit(0);
317 		}
318 
319 		/* Verify we got a valid pidfd. */
320 		if (pidfd < 0)
321 			_exit(4);
322 
323 		close(pidfd);
324 		_exit(wait_for_pid(inner));
325 	}
326 
327 	ASSERT_EQ(wait_for_pid(pid), 0);
328 }
329 
330 /*
331  * clone3 without CAP_SYS_ADMIN must fail with EPERM.
332  */
333 TEST_F(clone3_empty_mntns, eperm_without_caps)
334 {
335 	pid_t pid;
336 
337 	pid = fork();
338 	ASSERT_GE(pid, 0);
339 
340 	if (pid == 0) {
341 		pid_t ret;
342 
343 		/* Skip if already root. */
344 		if (getuid() == 0)
345 			_exit(0);
346 
347 		ret = clone3_empty_mntns(0);
348 		if (ret >= 0) {
349 			if (ret == 0)
350 				_exit(0);
351 			wait_for_pid(ret);
352 			_exit(1);
353 		}
354 
355 		if (errno != EPERM)
356 			_exit(2);
357 
358 		_exit(0);
359 	}
360 
361 	ASSERT_EQ(wait_for_pid(pid), 0);
362 }
363 
364 /*
365  * Parent's mount namespace is unaffected after clone3 with CLONE_EMPTY_MNTNS.
366  */
367 TEST_F(clone3_empty_mntns, parent_unchanged)
368 {
369 	pid_t pid;
370 
371 	pid = fork();
372 	ASSERT_GE(pid, 0);
373 
374 	if (pid == 0) {
375 		ssize_t nr_before, nr_after;
376 		pid_t inner;
377 
378 		if (enter_userns())
379 			_exit(1);
380 
381 		nr_before = count_mounts();
382 		if (nr_before < 1)
383 			_exit(2);
384 
385 		inner = clone3_empty_mntns(0);
386 		if (inner < 0)
387 			_exit(3);
388 
389 		if (inner == 0)
390 			_exit(0);
391 
392 		if (wait_for_pid(inner) != 0)
393 			_exit(4);
394 
395 		nr_after = count_mounts();
396 		if (nr_after != nr_before)
397 			_exit(5);
398 
399 		_exit(0);
400 	}
401 
402 	ASSERT_EQ(wait_for_pid(pid), 0);
403 }
404 
405 /*
406  * Parent with many mounts: child still gets exactly 1 mount.
407  */
408 TEST_F(clone3_empty_mntns, many_parent_mounts)
409 {
410 	pid_t pid;
411 
412 	pid = fork();
413 	ASSERT_GE(pid, 0);
414 
415 	if (pid == 0) {
416 		char tmpdir[] = "/tmp/clone3_mntns_test.XXXXXX";
417 		pid_t inner;
418 		int i;
419 
420 		if (enter_userns())
421 			_exit(1);
422 
423 		if (unshare(CLONE_NEWNS))
424 			_exit(2);
425 
426 		if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL))
427 			_exit(3);
428 
429 		if (!mkdtemp(tmpdir))
430 			_exit(4);
431 
432 		if (mount("tmpfs", tmpdir, "tmpfs", 0, "size=1M"))
433 			_exit(5);
434 
435 		for (i = 0; i < 5; i++) {
436 			char subdir[256];
437 
438 			snprintf(subdir, sizeof(subdir), "%s/sub%d", tmpdir, i);
439 			if (mkdir(subdir, 0755) && errno != EEXIST)
440 				_exit(6);
441 			if (mount(subdir, subdir, NULL, MS_BIND, NULL))
442 				_exit(7);
443 		}
444 
445 		if (count_mounts() < 5)
446 			_exit(8);
447 
448 		inner = clone3_empty_mntns(0);
449 		if (inner < 0)
450 			_exit(9);
451 
452 		if (inner == 0) {
453 			if (count_mounts() != 1)
454 				_exit(10);
455 
456 			_exit(0);
457 		}
458 
459 		_exit(wait_for_pid(inner));
460 	}
461 
462 	ASSERT_EQ(wait_for_pid(pid), 0);
463 }
464 
465 /*
466  * Verify the child's root mount is nullfs with expected statmount properties.
467  */
468 TEST_F(clone3_empty_mntns, mount_properties)
469 {
470 	pid_t pid;
471 
472 	pid = fork();
473 	ASSERT_GE(pid, 0);
474 
475 	if (pid == 0) {
476 		pid_t inner;
477 
478 		if (enter_userns())
479 			_exit(1);
480 
481 		inner = clone3_empty_mntns(0);
482 		if (inner < 0)
483 			_exit(2);
484 
485 		if (inner == 0) {
486 			struct statmount *sm;
487 			uint64_t root_id;
488 
489 			root_id = get_unique_mnt_id("/");
490 			if (!root_id)
491 				_exit(3);
492 
493 			sm = statmount_alloc(root_id, 0,
494 					     STATMOUNT_MNT_BASIC |
495 					     STATMOUNT_MNT_POINT |
496 					     STATMOUNT_FS_TYPE, 0);
497 			if (!sm)
498 				_exit(4);
499 
500 			/* Root mount point is "/". */
501 			if (!(sm->mask & STATMOUNT_MNT_POINT))
502 				_exit(5);
503 			if (strcmp(sm->str + sm->mnt_point, "/") != 0)
504 				_exit(6);
505 
506 			/* Filesystem type is nullfs. */
507 			if (!(sm->mask & STATMOUNT_FS_TYPE))
508 				_exit(7);
509 			if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
510 				_exit(8);
511 
512 			/* Root mount is its own parent. */
513 			if (!(sm->mask & STATMOUNT_MNT_BASIC))
514 				_exit(9);
515 			if (sm->mnt_parent_id != sm->mnt_id)
516 				_exit(10);
517 
518 			free(sm);
519 			_exit(0);
520 		}
521 
522 		_exit(wait_for_pid(inner));
523 	}
524 
525 	ASSERT_EQ(wait_for_pid(pid), 0);
526 }
527 
528 /*
529  * Listmount returns only the root mount in the child's empty namespace.
530  */
531 TEST_F(clone3_empty_mntns, listmount_single_entry)
532 {
533 	pid_t pid;
534 
535 	pid = fork();
536 	ASSERT_GE(pid, 0);
537 
538 	if (pid == 0) {
539 		pid_t inner;
540 
541 		if (enter_userns())
542 			_exit(1);
543 
544 		inner = clone3_empty_mntns(0);
545 		if (inner < 0)
546 			_exit(2);
547 
548 		if (inner == 0) {
549 			uint64_t list[16];
550 			ssize_t nr_mounts;
551 			uint64_t root_id;
552 
553 			nr_mounts = listmount(LSMT_ROOT, 0, 0, list, 16, 0);
554 			if (nr_mounts != 1)
555 				_exit(3);
556 
557 			root_id = get_unique_mnt_id("/");
558 			if (!root_id)
559 				_exit(4);
560 
561 			if (list[0] != root_id)
562 				_exit(5);
563 
564 			_exit(0);
565 		}
566 
567 		_exit(wait_for_pid(inner));
568 	}
569 
570 	ASSERT_EQ(wait_for_pid(pid), 0);
571 }
572 
573 /*
574  * Child can mount tmpfs over nullfs root (the primary container use case).
575  *
576  * Uses the new mount API (fsopen/fsmount/move_mount) because resolving
577  * "/" returns the process root directly without following overmounts.
578  * The mount fd from fsmount lets us fchdir + chroot into the new tmpfs.
579  */
580 TEST_F(clone3_empty_mntns, child_overmount_tmpfs)
581 {
582 	pid_t pid;
583 
584 	pid = fork();
585 	ASSERT_GE(pid, 0);
586 
587 	if (pid == 0) {
588 		pid_t inner;
589 
590 		if (enter_userns())
591 			_exit(1);
592 
593 		inner = clone3_empty_mntns(0);
594 		if (inner < 0)
595 			_exit(2);
596 
597 		if (inner == 0) {
598 			struct statmount *sm;
599 			uint64_t root_id;
600 			int fd, fsfd, mntfd;
601 
602 			if (count_mounts() != 1)
603 				_exit(3);
604 
605 			/* Verify root is nullfs. */
606 			root_id = get_unique_mnt_id("/");
607 			if (!root_id)
608 				_exit(4);
609 
610 			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
611 			if (!sm)
612 				_exit(5);
613 			if (!(sm->mask & STATMOUNT_FS_TYPE))
614 				_exit(6);
615 			if (strcmp(sm->str + sm->fs_type, "nullfs") != 0)
616 				_exit(7);
617 			free(sm);
618 
619 			/* Create tmpfs via the new mount API. */
620 			fsfd = sys_fsopen("tmpfs", 0);
621 			if (fsfd < 0)
622 				_exit(8);
623 
624 			if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING,
625 					 "size", "1M", 0)) {
626 				close(fsfd);
627 				_exit(9);
628 			}
629 
630 			if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE,
631 					 NULL, NULL, 0)) {
632 				close(fsfd);
633 				_exit(10);
634 			}
635 
636 			mntfd = sys_fsmount(fsfd, 0, 0);
637 			close(fsfd);
638 			if (mntfd < 0)
639 				_exit(11);
640 
641 			/* Attach tmpfs to "/". */
642 			if (sys_move_mount(mntfd, "", AT_FDCWD, "/",
643 					   MOVE_MOUNT_F_EMPTY_PATH)) {
644 				close(mntfd);
645 				_exit(12);
646 			}
647 
648 			if (count_mounts() != 2) {
649 				close(mntfd);
650 				_exit(13);
651 			}
652 
653 			/* Enter the tmpfs. */
654 			if (fchdir(mntfd)) {
655 				close(mntfd);
656 				_exit(14);
657 			}
658 
659 			if (chroot(".")) {
660 				close(mntfd);
661 				_exit(15);
662 			}
663 
664 			close(mntfd);
665 
666 			/* Verify "/" is now tmpfs. */
667 			root_id = get_unique_mnt_id("/");
668 			if (!root_id)
669 				_exit(16);
670 
671 			sm = statmount_alloc(root_id, 0, STATMOUNT_FS_TYPE, 0);
672 			if (!sm)
673 				_exit(17);
674 			if (!(sm->mask & STATMOUNT_FS_TYPE))
675 				_exit(18);
676 			if (strcmp(sm->str + sm->fs_type, "tmpfs") != 0)
677 				_exit(19);
678 			free(sm);
679 
680 			/* Verify tmpfs is writable. */
681 			fd = open("/testfile", O_CREAT | O_RDWR, 0644);
682 			if (fd < 0)
683 				_exit(20);
684 
685 			if (write(fd, "test", 4) != 4) {
686 				close(fd);
687 				_exit(21);
688 			}
689 			close(fd);
690 
691 			if (access("/testfile", F_OK))
692 				_exit(22);
693 
694 			_exit(0);
695 		}
696 
697 		_exit(wait_for_pid(inner));
698 	}
699 
700 	ASSERT_EQ(wait_for_pid(pid), 0);
701 }
702 
703 /*
704  * Multiple clone3 calls with CLONE_EMPTY_MNTNS produce children with
705  * distinct mount namespace root mount IDs.
706  */
707 TEST_F(clone3_empty_mntns, repeated)
708 {
709 	pid_t pid;
710 
711 	pid = fork();
712 	ASSERT_GE(pid, 0);
713 
714 	if (pid == 0) {
715 		int pipe1[2], pipe2[2];
716 		uint64_t id1 = 0, id2 = 0;
717 		pid_t inner1, inner2;
718 
719 		if (enter_userns())
720 			_exit(1);
721 
722 		if (pipe(pipe1) || pipe(pipe2))
723 			_exit(2);
724 
725 		inner1 = clone3_empty_mntns(0);
726 		if (inner1 < 0)
727 			_exit(3);
728 
729 		if (inner1 == 0) {
730 			uint64_t root_id;
731 
732 			close(pipe1[0]);
733 			root_id = get_unique_mnt_id("/");
734 			if (write(pipe1[1], &root_id, sizeof(root_id)) != sizeof(root_id))
735 				_exit(1);
736 			close(pipe1[1]);
737 			_exit(0);
738 		}
739 
740 		inner2 = clone3_empty_mntns(0);
741 		if (inner2 < 0)
742 			_exit(4);
743 
744 		if (inner2 == 0) {
745 			uint64_t root_id;
746 
747 			close(pipe2[0]);
748 			root_id = get_unique_mnt_id("/");
749 			if (write(pipe2[1], &root_id, sizeof(root_id)) != sizeof(root_id))
750 				_exit(1);
751 			close(pipe2[1]);
752 			_exit(0);
753 		}
754 
755 		close(pipe1[1]);
756 		close(pipe2[1]);
757 
758 		if (read(pipe1[0], &id1, sizeof(id1)) != sizeof(id1))
759 			_exit(5);
760 		if (read(pipe2[0], &id2, sizeof(id2)) != sizeof(id2))
761 			_exit(6);
762 
763 		close(pipe1[0]);
764 		close(pipe2[0]);
765 
766 		if (wait_for_pid(inner1) || wait_for_pid(inner2))
767 			_exit(7);
768 
769 		/* Each child must have a distinct root mount ID. */
770 		if (id1 == 0 || id2 == 0)
771 			_exit(8);
772 		if (id1 == id2)
773 			_exit(9);
774 
775 		_exit(0);
776 	}
777 
778 	ASSERT_EQ(wait_for_pid(pid), 0);
779 }
780 
781 /*
782  * Verify setns() into a child's empty mount namespace works.
783  */
784 TEST_F(clone3_empty_mntns, setns_into_child_mntns)
785 {
786 	pid_t pid;
787 
788 	pid = fork();
789 	ASSERT_GE(pid, 0);
790 
791 	if (pid == 0) {
792 		int pipe_fd[2];
793 		pid_t inner;
794 		char c;
795 
796 		if (enter_userns())
797 			_exit(1);
798 
799 		if (pipe(pipe_fd))
800 			_exit(2);
801 
802 		inner = clone3_empty_mntns(0);
803 		if (inner < 0)
804 			_exit(3);
805 
806 		if (inner == 0) {
807 			/* Signal parent we're ready. */
808 			close(pipe_fd[0]);
809 			if (write(pipe_fd[1], "r", 1) != 1)
810 				_exit(1);
811 
812 			/*
813 			 * Wait for parent to finish.  Reading from our
814 			 * write end will block until the parent closes
815 			 * its read end, giving us an implicit barrier.
816 			 */
817 			if (read(pipe_fd[1], &c, 1) < 0)
818 				;
819 			close(pipe_fd[1]);
820 			_exit(0);
821 		}
822 
823 		close(pipe_fd[1]);
824 
825 		/* Wait for child to be ready. */
826 		if (read(pipe_fd[0], &c, 1) != 1)
827 			_exit(4);
828 
829 		/* Open child's mount namespace. */
830 		{
831 			char path[64];
832 			int mntns_fd;
833 
834 			snprintf(path, sizeof(path), "/proc/%d/ns/mnt", inner);
835 			mntns_fd = open(path, O_RDONLY);
836 			if (mntns_fd < 0)
837 				_exit(5);
838 
839 			if (setns(mntns_fd, CLONE_NEWNS))
840 				_exit(6);
841 
842 			close(mntns_fd);
843 		}
844 
845 		/* Now we should be in the child's empty mntns. */
846 		if (count_mounts() != 1)
847 			_exit(7);
848 
849 		close(pipe_fd[0]);
850 		_exit(wait_for_pid(inner));
851 	}
852 
853 	ASSERT_EQ(wait_for_pid(pid), 0);
854 }
855 
856 /*
857  * Tests below do not require CLONE_EMPTY_MNTNS support.
858  */
859 
860 /*
861  * Unknown 64-bit flags beyond the known set are rejected.
862  */
863 TEST(unknown_flags_rejected)
864 {
865 	pid_t pid;
866 
867 	pid = fork();
868 	ASSERT_GE(pid, 0);
869 
870 	if (pid == 0) {
871 		struct __clone_args args = {
872 			.flags		= 0x800000000ULL,
873 			.exit_signal	= SIGCHLD,
874 		};
875 		pid_t ret;
876 
877 		ret = sys_clone3(&args, sizeof(args));
878 		if (ret >= 0) {
879 			if (ret == 0)
880 				_exit(0);
881 			wait_for_pid(ret);
882 			_exit(1);
883 		}
884 
885 		if (errno != EINVAL)
886 			_exit(2);
887 
888 		_exit(0);
889 	}
890 
891 	ASSERT_EQ(wait_for_pid(pid), 0);
892 }
893 
894 /*
895  * Regular clone3 with CLONE_NEWNS (without CLONE_EMPTY_MNTNS) still
896  * copies the full mount tree.
897  */
898 TEST(clone3_newns_full_copy)
899 {
900 	pid_t pid;
901 
902 	pid = fork();
903 	ASSERT_GE(pid, 0);
904 
905 	if (pid == 0) {
906 		struct __clone_args args = {
907 			.flags		= CLONE_NEWNS,
908 			.exit_signal	= SIGCHLD,
909 		};
910 		ssize_t parent_mounts;
911 		pid_t inner;
912 
913 		if (enter_userns())
914 			_exit(1);
915 
916 		parent_mounts = count_mounts();
917 		if (parent_mounts < 1)
918 			_exit(2);
919 
920 		inner = sys_clone3(&args, sizeof(args));
921 		if (inner < 0)
922 			_exit(3);
923 
924 		if (inner == 0) {
925 			/* Full copy should have at least as many mounts. */
926 			if (count_mounts() < parent_mounts)
927 				_exit(1);
928 
929 			_exit(0);
930 		}
931 
932 		_exit(wait_for_pid(inner));
933 	}
934 
935 	ASSERT_EQ(wait_for_pid(pid), 0);
936 }
937 
938 TEST_HARNESS_MAIN
939