xref: /linux/tools/testing/selftests/namespaces/listns_efault_test.c (revision 2509bdc8a47c2f13471ac43ec989c778ed304d77)
1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <sched.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <linux/nsfs.h>
12 #include <sys/ioctl.h>
13 #include <sys/mman.h>
14 #include <sys/mount.h>
15 #include <sys/socket.h>
16 #include <sys/stat.h>
17 #include <sys/syscall.h>
18 #include <sys/types.h>
19 #include <sys/wait.h>
20 #include <unistd.h>
21 #include "../kselftest_harness.h"
22 #include "../pidfd/pidfd.h"
23 #include "wrappers.h"
24 
/*
 * Test listns() error handling with invalid buffer addresses.
 *
 * When the buffer pointer is invalid (e.g., the buffer crosses a page
 * boundary into unmapped memory), listns() is expected to fail with
 * EFAULT.
 *
 * This test also creates mount namespaces that get destroyed during
 * iteration, testing that namespace cleanup happens outside the RCU
 * read lock.
 */
35 TEST(listns_partial_fault_with_ns_cleanup)
36 {
37 	void *map;
38 	__u64 *ns_ids;
39 	ssize_t ret;
40 	long page_size;
41 	pid_t pid, iter_pid, ns_pids[5];
42 	int pidfds[5];
43 	int sv[5][2];
44 	int iter_pidfd;
45 	int i, status;
46 	char c;
47 
48 	page_size = sysconf(_SC_PAGESIZE);
49 	ASSERT_GT(page_size, 0);
50 
51 	/*
52 	 * Map two pages:
53 	 * - First page: readable and writable
54 	 * - Second page: will be unmapped to trigger EFAULT
55 	 */
56 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
57 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
58 	ASSERT_NE(map, MAP_FAILED);
59 
60 	/* Unmap the second page */
61 	ret = munmap((char *)map + page_size, page_size);
62 	ASSERT_EQ(ret, 0);
63 
64 	/*
65 	 * Position the buffer pointer so there's room for exactly one u64
66 	 * before the page boundary. The second u64 would fall into the
67 	 * unmapped page.
68 	 */
69 	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
70 
71 	/*
72 	 * Create a separate process to run listns() in a loop concurrently
73 	 * with namespace creation and destruction.
74 	 */
75 	iter_pid = create_child(&iter_pidfd, 0);
76 	ASSERT_NE(iter_pid, -1);
77 
78 	if (iter_pid == 0) {
79 		struct ns_id_req req = {
80 			.size = sizeof(req),
81 			.spare = 0,
82 			.ns_id = 0,
83 			.ns_type = 0,  /* All types */
84 			.spare2 = 0,
85 			.user_ns_id = 0,  /* Global listing */
86 		};
87 		int iter_ret;
88 
89 		/*
90 		 * Loop calling listns() until killed.
91 		 * The kernel should:
92 		 * 1. Successfully write the first namespace ID (within valid page)
93 		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
94 		 * 3. Handle concurrent namespace destruction without deadlock
95 		 */
96 		while (1) {
97 			iter_ret = sys_listns(&req, ns_ids, 2, 0);
98 
99 			if (iter_ret == -1 && errno == ENOSYS)
100 				_exit(PIDFD_SKIP);
101 		}
102 	}
103 
104 	/* Small delay to let iterator start looping */
105 	usleep(50000);
106 
107 	/*
108 	 * Create several child processes, each in its own mount namespace.
109 	 * These will be destroyed while the iterator is running listns().
110 	 */
111 	for (i = 0; i < 5; i++) {
112 		/* Create socketpair for synchronization */
113 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
114 
115 		pid = create_child(&pidfds[i], CLONE_NEWNS);
116 		ASSERT_NE(pid, -1);
117 		ns_pids[i] = pid;
118 
119 		if (pid == 0) {
120 			close(sv[i][0]); /* Close parent end */
121 
122 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
123 				_exit(1);
124 
125 			/* Child: create a couple of tmpfs mounts */
126 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
127 				_exit(1);
128 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
129 				_exit(1);
130 
131 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
132 				_exit(1);
133 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
134 				_exit(1);
135 
136 			/* Signal parent that setup is complete */
137 			if (write_nointr(sv[i][1], "R", 1) != 1)
138 				_exit(1);
139 
140 			/* Wait for parent to signal us to exit */
141 			if (read_nointr(sv[i][1], &c, 1) != 1)
142 				_exit(1);
143 
144 			close(sv[i][1]);
145 			_exit(0);
146 		}
147 
148 		close(sv[i][1]); /* Close child end */
149 	}
150 
151 	/* Wait for all children to finish setup */
152 	for (i = 0; i < 5; i++) {
153 		ret = read_nointr(sv[i][0], &c, 1);
154 		ASSERT_EQ(ret, 1);
155 		ASSERT_EQ(c, 'R');
156 	}
157 
158 	/*
159 	 * Signal children to exit. This will destroy their mount namespaces
160 	 * while listns() is iterating the namespace tree.
161 	 * This tests that cleanup happens outside the RCU read lock.
162 	 */
163 	for (i = 0; i < 5; i++)
164 		write_nointr(sv[i][0], "X", 1);
165 
166 	/* Wait for all mount namespace children to exit and cleanup */
167 	for (i = 0; i < 5; i++) {
168 		waitpid(ns_pids[i], NULL, 0);
169 		close(sv[i][0]);
170 		close(pidfds[i]);
171 	}
172 
173 	/* Kill iterator and wait for it */
174 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
175 	ret = waitpid(iter_pid, &status, 0);
176 	ASSERT_EQ(ret, iter_pid);
177 	close(iter_pidfd);
178 
179 	/* If listns() is not supported the iterator exits cleanly via ENOSYS */
180 	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) {
181 		munmap(map, page_size);
182 		SKIP(return, "listns() not supported");
183 	}
184 
185 	/* Should have been killed */
186 	ASSERT_TRUE(WIFSIGNALED(status));
187 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
188 
189 	/* Clean up */
190 	munmap(map, page_size);
191 }
192 
193 /*
194  * Test listns() error handling when the entire buffer is invalid.
195  * This is a sanity check that basic invalid pointer detection works.
196  */
197 TEST(listns_complete_fault)
198 {
199 	struct ns_id_req req = {
200 		.size = sizeof(req),
201 		.spare = 0,
202 		.ns_id = 0,
203 		.ns_type = 0,
204 		.spare2 = 0,
205 		.user_ns_id = 0,
206 	};
207 	__u64 *ns_ids;
208 	ssize_t ret;
209 
210 	/* Use a clearly invalid pointer */
211 	ns_ids = (__u64 *)0xdeadbeef;
212 
213 	ret = sys_listns(&req, ns_ids, 10, 0);
214 
215 	if (ret == -1 && errno == ENOSYS)
216 		SKIP(return, "listns() not supported");
217 
218 	/* Should fail with EFAULT */
219 	ASSERT_EQ(ret, -1);
220 	ASSERT_EQ(errno, EFAULT);
221 }
222 
223 /*
224  * Test listns() error handling when the buffer is NULL.
225  */
226 TEST(listns_null_buffer)
227 {
228 	struct ns_id_req req = {
229 		.size = sizeof(req),
230 		.spare = 0,
231 		.ns_id = 0,
232 		.ns_type = 0,
233 		.spare2 = 0,
234 		.user_ns_id = 0,
235 	};
236 	ssize_t ret;
237 
238 	/* NULL buffer with non-zero count should fail */
239 	ret = sys_listns(&req, NULL, 10, 0);
240 
241 	if (ret == -1 && errno == ENOSYS)
242 		SKIP(return, "listns() not supported");
243 
244 	/* Should fail with EFAULT */
245 	ASSERT_EQ(ret, -1);
246 	ASSERT_EQ(errno, EFAULT);
247 }
248 
249 /*
250  * Test listns() with a buffer that becomes invalid mid-iteration
251  * (after several successful writes), combined with mount namespace
252  * destruction to test RCU cleanup logic.
253  */
254 TEST(listns_late_fault_with_ns_cleanup)
255 {
256 	void *map;
257 	__u64 *ns_ids;
258 	ssize_t ret;
259 	long page_size;
260 	pid_t pid, iter_pid, ns_pids[10];
261 	int pidfds[10];
262 	int sv[10][2];
263 	int iter_pidfd;
264 	int i, status;
265 	char c;
266 
267 	page_size = sysconf(_SC_PAGESIZE);
268 	ASSERT_GT(page_size, 0);
269 
270 	/* Map two pages */
271 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
272 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
273 	ASSERT_NE(map, MAP_FAILED);
274 
275 	/* Unmap the second page */
276 	ret = munmap((char *)map + page_size, page_size);
277 	ASSERT_EQ(ret, 0);
278 
279 	/*
280 	 * Position buffer so we can write several u64s successfully
281 	 * before hitting the page boundary.
282 	 */
283 	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
284 
285 	/*
286 	 * Create a separate process to run listns() concurrently.
287 	 */
288 	iter_pid = create_child(&iter_pidfd, 0);
289 	ASSERT_NE(iter_pid, -1);
290 
291 	if (iter_pid == 0) {
292 		struct ns_id_req req = {
293 			.size = sizeof(req),
294 			.spare = 0,
295 			.ns_id = 0,
296 			.ns_type = 0,
297 			.spare2 = 0,
298 			.user_ns_id = 0,
299 		};
300 		int iter_ret;
301 
302 		/*
303 		 * Loop calling listns() until killed.
304 		 * Request 10 namespace IDs while namespaces are being destroyed.
305 		 * This tests:
306 		 * 1. EFAULT handling when buffer becomes invalid
307 		 * 2. Namespace cleanup outside RCU read lock during iteration
308 		 */
309 		while (1) {
310 			iter_ret = sys_listns(&req, ns_ids, 10, 0);
311 
312 			if (iter_ret == -1 && errno == ENOSYS)
313 				_exit(PIDFD_SKIP);
314 		}
315 	}
316 
317 	/* Small delay to let iterator start looping */
318 	usleep(50000);
319 
320 	/*
321 	 * Create more children with mount namespaces to increase the
322 	 * likelihood that namespace cleanup happens during iteration.
323 	 */
324 	for (i = 0; i < 10; i++) {
325 		/* Create socketpair for synchronization */
326 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
327 
328 		pid = create_child(&pidfds[i], CLONE_NEWNS);
329 		ASSERT_NE(pid, -1);
330 		ns_pids[i] = pid;
331 
332 		if (pid == 0) {
333 			close(sv[i][0]); /* Close parent end */
334 
335 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
336 				_exit(1);
337 
338 			/* Child: create tmpfs mounts */
339 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
340 				_exit(1);
341 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
342 				_exit(1);
343 
344 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
345 				_exit(1);
346 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
347 				_exit(1);
348 
349 			/* Signal parent that setup is complete */
350 			if (write_nointr(sv[i][1], "R", 1) != 1)
351 				_exit(1);
352 
353 			/* Wait for parent to signal us to exit */
354 			if (read_nointr(sv[i][1], &c, 1) != 1)
355 				_exit(1);
356 
357 			close(sv[i][1]);
358 			_exit(0);
359 		}
360 
361 		close(sv[i][1]); /* Close child end */
362 	}
363 
364 	/* Wait for all children to finish setup */
365 	for (i = 0; i < 10; i++) {
366 		ret = read_nointr(sv[i][0], &c, 1);
367 		ASSERT_EQ(ret, 1);
368 		ASSERT_EQ(c, 'R');
369 	}
370 
371 	/* Kill half the children */
372 	for (i = 0; i < 5; i++)
373 		write_nointr(sv[i][0], "X", 1);
374 
375 	/* Small delay to let some exit */
376 	usleep(10000);
377 
378 	/* Kill remaining children */
379 	for (i = 5; i < 10; i++)
380 		write_nointr(sv[i][0], "X", 1);
381 
382 	/* Wait for all children and cleanup */
383 	for (i = 0; i < 10; i++) {
384 		waitpid(ns_pids[i], NULL, 0);
385 		close(sv[i][0]);
386 		close(pidfds[i]);
387 	}
388 
389 	/* Kill iterator and wait for it */
390 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
391 	ret = waitpid(iter_pid, &status, 0);
392 	ASSERT_EQ(ret, iter_pid);
393 	close(iter_pidfd);
394 
395 	/* If listns() is not supported the iterator exits cleanly via ENOSYS */
396 	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) {
397 		munmap(map, page_size);
398 		SKIP(return, "listns() not supported");
399 	}
400 
401 	/* Should have been killed */
402 	ASSERT_TRUE(WIFSIGNALED(status));
403 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
404 
405 	/* Clean up */
406 	munmap(map, page_size);
407 }
408 
409 /*
410  * Test specifically focused on mount namespace cleanup during EFAULT.
411  * Filter for mount namespaces only.
412  */
413 TEST(listns_mnt_ns_cleanup_on_fault)
414 {
415 	void *map;
416 	__u64 *ns_ids;
417 	ssize_t ret;
418 	long page_size;
419 	pid_t pid, iter_pid, ns_pids[8];
420 	int pidfds[8];
421 	int sv[8][2];
422 	int iter_pidfd;
423 	int i, status;
424 	char c;
425 
426 	page_size = sysconf(_SC_PAGESIZE);
427 	ASSERT_GT(page_size, 0);
428 
429 	/* Set up partial fault buffer */
430 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
431 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
432 	ASSERT_NE(map, MAP_FAILED);
433 
434 	ret = munmap((char *)map + page_size, page_size);
435 	ASSERT_EQ(ret, 0);
436 
437 	/* Position for 3 successful writes, then fault */
438 	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
439 
440 	/*
441 	 * Create a separate process to run listns() concurrently.
442 	 */
443 	iter_pid = create_child(&iter_pidfd, 0);
444 	ASSERT_NE(iter_pid, -1);
445 
446 	if (iter_pid == 0) {
447 		struct ns_id_req req = {
448 			.size = sizeof(req),
449 			.spare = 0,
450 			.ns_id = 0,
451 			.ns_type = CLONE_NEWNS,  /* Only mount namespaces */
452 			.spare2 = 0,
453 			.user_ns_id = 0,
454 		};
455 		int iter_ret;
456 
457 		/*
458 		 * Loop calling listns() until killed.
459 		 * Call listns() to race with namespace destruction.
460 		 */
461 		while (1) {
462 			iter_ret = sys_listns(&req, ns_ids, 10, 0);
463 
464 			if (iter_ret == -1 && errno == ENOSYS)
465 				_exit(PIDFD_SKIP);
466 		}
467 	}
468 
469 	/* Small delay to let iterator start looping */
470 	usleep(50000);
471 
472 	/* Create children with mount namespaces */
473 	for (i = 0; i < 8; i++) {
474 		/* Create socketpair for synchronization */
475 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
476 
477 		pid = create_child(&pidfds[i], CLONE_NEWNS);
478 		ASSERT_NE(pid, -1);
479 		ns_pids[i] = pid;
480 
481 		if (pid == 0) {
482 			close(sv[i][0]); /* Close parent end */
483 
484 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
485 				_exit(1);
486 
487 			/* Do some mount operations to make cleanup more interesting */
488 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
489 				_exit(1);
490 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
491 				_exit(1);
492 
493 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
494 				_exit(1);
495 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
496 				_exit(1);
497 
498 			/* Signal parent that setup is complete */
499 			if (write_nointr(sv[i][1], "R", 1) != 1)
500 				_exit(1);
501 
502 			/* Wait for parent to signal us to exit */
503 			if (read_nointr(sv[i][1], &c, 1) != 1)
504 				_exit(1);
505 
506 			close(sv[i][1]);
507 			_exit(0);
508 		}
509 
510 		close(sv[i][1]); /* Close child end */
511 	}
512 
513 	/* Wait for all children to finish setup */
514 	for (i = 0; i < 8; i++) {
515 		ret = read_nointr(sv[i][0], &c, 1);
516 		ASSERT_EQ(ret, 1);
517 		ASSERT_EQ(c, 'R');
518 	}
519 
520 	/* Kill children to trigger namespace destruction during iteration */
521 	for (i = 0; i < 8; i++)
522 		write_nointr(sv[i][0], "X", 1);
523 
524 	/* Wait for children and cleanup */
525 	for (i = 0; i < 8; i++) {
526 		waitpid(ns_pids[i], NULL, 0);
527 		close(sv[i][0]);
528 		close(pidfds[i]);
529 	}
530 
531 	/* Kill iterator and wait for it */
532 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
533 	ret = waitpid(iter_pid, &status, 0);
534 	ASSERT_EQ(ret, iter_pid);
535 	close(iter_pidfd);
536 
537 	/* If listns() is not supported the iterator exits cleanly via ENOSYS */
538 	if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) {
539 		munmap(map, page_size);
540 		SKIP(return, "listns() not supported");
541 	}
542 
543 	/* Should have been killed */
544 	ASSERT_TRUE(WIFSIGNALED(status));
545 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
546 
547 	munmap(map, page_size);
548 }
549 
/*
 * Entry point macro from ../kselftest_harness.h; presumably it expands to
 * the main() that runs every TEST() defined above (confirm in the harness).
 */
TEST_HARNESS_MAIN
551