xref: /linux/tools/testing/selftests/namespaces/listns_efault_test.c (revision 07d7ad46dad48a81ffc796fb7875b1ec141c8b48)
1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <sched.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <linux/nsfs.h>
12 #include <sys/ioctl.h>
13 #include <sys/mman.h>
14 #include <sys/mount.h>
15 #include <sys/socket.h>
16 #include <sys/stat.h>
17 #include <sys/syscall.h>
18 #include <sys/types.h>
19 #include <sys/wait.h>
20 #include <unistd.h>
21 #include "../kselftest_harness.h"
22 #include "../filesystems/utils.h"
23 #include "../pidfd/pidfd.h"
24 #include "wrappers.h"
25 
26 /*
27  * Test listns() error handling with invalid buffer addresses.
28  *
29  * When the buffer pointer is invalid (e.g., crossing page boundaries
30  * into unmapped memory), listns() returns EFAULT.
31  *
32  * This test also creates mount namespaces that get destroyed during
33  * iteration, testing that namespace cleanup happens outside the RCU
34  * read lock.
35  */
36 TEST(listns_partial_fault_with_ns_cleanup)
37 {
38 	void *map;
39 	__u64 *ns_ids;
40 	ssize_t ret;
41 	long page_size;
42 	pid_t pid, iter_pid;
43 	int pidfds[5];
44 	int sv[5][2];
45 	int iter_pidfd;
46 	int i, status;
47 	char c;
48 
49 	page_size = sysconf(_SC_PAGESIZE);
50 	ASSERT_GT(page_size, 0);
51 
52 	/*
53 	 * Map two pages:
54 	 * - First page: readable and writable
55 	 * - Second page: will be unmapped to trigger EFAULT
56 	 */
57 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
58 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
59 	ASSERT_NE(map, MAP_FAILED);
60 
61 	/* Unmap the second page */
62 	ret = munmap((char *)map + page_size, page_size);
63 	ASSERT_EQ(ret, 0);
64 
65 	/*
66 	 * Position the buffer pointer so there's room for exactly one u64
67 	 * before the page boundary. The second u64 would fall into the
68 	 * unmapped page.
69 	 */
70 	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
71 
72 	/*
73 	 * Create a separate process to run listns() in a loop concurrently
74 	 * with namespace creation and destruction.
75 	 */
76 	iter_pid = create_child(&iter_pidfd, 0);
77 	ASSERT_NE(iter_pid, -1);
78 
79 	if (iter_pid == 0) {
80 		struct ns_id_req req = {
81 			.size = sizeof(req),
82 			.spare = 0,
83 			.ns_id = 0,
84 			.ns_type = 0,  /* All types */
85 			.spare2 = 0,
86 			.user_ns_id = 0,  /* Global listing */
87 		};
88 		int iter_ret;
89 
90 		/*
91 		 * Loop calling listns() until killed.
92 		 * The kernel should:
93 		 * 1. Successfully write the first namespace ID (within valid page)
94 		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
95 		 * 3. Handle concurrent namespace destruction without deadlock
96 		 */
97 		while (1) {
98 			iter_ret = sys_listns(&req, ns_ids, 2, 0);
99 
100 			if (iter_ret == -1 && errno == ENOSYS)
101 				_exit(PIDFD_SKIP);
102 		}
103 	}
104 
105 	/* Small delay to let iterator start looping */
106 	usleep(50000);
107 
108 	/*
109 	 * Create several child processes, each in its own mount namespace.
110 	 * These will be destroyed while the iterator is running listns().
111 	 */
112 	for (i = 0; i < 5; i++) {
113 		/* Create socketpair for synchronization */
114 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
115 
116 		pid = create_child(&pidfds[i], CLONE_NEWNS);
117 		ASSERT_NE(pid, -1);
118 
119 		if (pid == 0) {
120 			close(sv[i][0]); /* Close parent end */
121 
122 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
123 				_exit(1);
124 
125 			/* Child: create a couple of tmpfs mounts */
126 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
127 				_exit(1);
128 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
129 				_exit(1);
130 
131 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
132 				_exit(1);
133 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
134 				_exit(1);
135 
136 			/* Signal parent that setup is complete */
137 			if (write_nointr(sv[i][1], "R", 1) != 1)
138 				_exit(1);
139 
140 			/* Wait for parent to signal us to exit */
141 			if (read_nointr(sv[i][1], &c, 1) != 1)
142 				_exit(1);
143 
144 			close(sv[i][1]);
145 			_exit(0);
146 		}
147 
148 		close(sv[i][1]); /* Close child end */
149 	}
150 
151 	/* Wait for all children to finish setup */
152 	for (i = 0; i < 5; i++) {
153 		ret = read_nointr(sv[i][0], &c, 1);
154 		ASSERT_EQ(ret, 1);
155 		ASSERT_EQ(c, 'R');
156 	}
157 
158 	/*
159 	 * Signal children to exit. This will destroy their mount namespaces
160 	 * while listns() is iterating the namespace tree.
161 	 * This tests that cleanup happens outside the RCU read lock.
162 	 */
163 	for (i = 0; i < 5; i++)
164 		write_nointr(sv[i][0], "X", 1);
165 
166 	/* Wait for all mount namespace children to exit and cleanup */
167 	for (i = 0; i < 5; i++) {
168 		waitpid(-1, NULL, 0);
169 		close(sv[i][0]);
170 		close(pidfds[i]);
171 	}
172 
173 	/* Kill iterator and wait for it */
174 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
175 	ret = waitpid(iter_pid, &status, 0);
176 	ASSERT_EQ(ret, iter_pid);
177 	close(iter_pidfd);
178 
179 	/* Should have been killed */
180 	ASSERT_TRUE(WIFSIGNALED(status));
181 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
182 
183 	/* Clean up */
184 	munmap(map, page_size);
185 }
186 
187 /*
188  * Test listns() error handling when the entire buffer is invalid.
189  * This is a sanity check that basic invalid pointer detection works.
190  */
191 TEST(listns_complete_fault)
192 {
193 	struct ns_id_req req = {
194 		.size = sizeof(req),
195 		.spare = 0,
196 		.ns_id = 0,
197 		.ns_type = 0,
198 		.spare2 = 0,
199 		.user_ns_id = 0,
200 	};
201 	__u64 *ns_ids;
202 	ssize_t ret;
203 
204 	/* Use a clearly invalid pointer */
205 	ns_ids = (__u64 *)0xdeadbeef;
206 
207 	ret = sys_listns(&req, ns_ids, 10, 0);
208 
209 	if (ret == -1 && errno == ENOSYS)
210 		SKIP(return, "listns() not supported");
211 
212 	/* Should fail with EFAULT */
213 	ASSERT_EQ(ret, -1);
214 	ASSERT_EQ(errno, EFAULT);
215 }
216 
217 /*
218  * Test listns() error handling when the buffer is NULL.
219  */
220 TEST(listns_null_buffer)
221 {
222 	struct ns_id_req req = {
223 		.size = sizeof(req),
224 		.spare = 0,
225 		.ns_id = 0,
226 		.ns_type = 0,
227 		.spare2 = 0,
228 		.user_ns_id = 0,
229 	};
230 	ssize_t ret;
231 
232 	/* NULL buffer with non-zero count should fail */
233 	ret = sys_listns(&req, NULL, 10, 0);
234 
235 	if (ret == -1 && errno == ENOSYS)
236 		SKIP(return, "listns() not supported");
237 
238 	/* Should fail with EFAULT */
239 	ASSERT_EQ(ret, -1);
240 	ASSERT_EQ(errno, EFAULT);
241 }
242 
243 /*
244  * Test listns() with a buffer that becomes invalid mid-iteration
245  * (after several successful writes), combined with mount namespace
246  * destruction to test RCU cleanup logic.
247  */
248 TEST(listns_late_fault_with_ns_cleanup)
249 {
250 	void *map;
251 	__u64 *ns_ids;
252 	ssize_t ret;
253 	long page_size;
254 	pid_t pid, iter_pid;
255 	int pidfds[10];
256 	int sv[10][2];
257 	int iter_pidfd;
258 	int i, status;
259 	char c;
260 
261 	page_size = sysconf(_SC_PAGESIZE);
262 	ASSERT_GT(page_size, 0);
263 
264 	/* Map two pages */
265 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
266 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
267 	ASSERT_NE(map, MAP_FAILED);
268 
269 	/* Unmap the second page */
270 	ret = munmap((char *)map + page_size, page_size);
271 	ASSERT_EQ(ret, 0);
272 
273 	/*
274 	 * Position buffer so we can write several u64s successfully
275 	 * before hitting the page boundary.
276 	 */
277 	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
278 
279 	/*
280 	 * Create a separate process to run listns() concurrently.
281 	 */
282 	iter_pid = create_child(&iter_pidfd, 0);
283 	ASSERT_NE(iter_pid, -1);
284 
285 	if (iter_pid == 0) {
286 		struct ns_id_req req = {
287 			.size = sizeof(req),
288 			.spare = 0,
289 			.ns_id = 0,
290 			.ns_type = 0,
291 			.spare2 = 0,
292 			.user_ns_id = 0,
293 		};
294 		int iter_ret;
295 
296 		/*
297 		 * Loop calling listns() until killed.
298 		 * Request 10 namespace IDs while namespaces are being destroyed.
299 		 * This tests:
300 		 * 1. EFAULT handling when buffer becomes invalid
301 		 * 2. Namespace cleanup outside RCU read lock during iteration
302 		 */
303 		while (1) {
304 			iter_ret = sys_listns(&req, ns_ids, 10, 0);
305 
306 			if (iter_ret == -1 && errno == ENOSYS)
307 				_exit(PIDFD_SKIP);
308 		}
309 	}
310 
311 	/* Small delay to let iterator start looping */
312 	usleep(50000);
313 
314 	/*
315 	 * Create more children with mount namespaces to increase the
316 	 * likelihood that namespace cleanup happens during iteration.
317 	 */
318 	for (i = 0; i < 10; i++) {
319 		/* Create socketpair for synchronization */
320 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
321 
322 		pid = create_child(&pidfds[i], CLONE_NEWNS);
323 		ASSERT_NE(pid, -1);
324 
325 		if (pid == 0) {
326 			close(sv[i][0]); /* Close parent end */
327 
328 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
329 				_exit(1);
330 
331 			/* Child: create tmpfs mounts */
332 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
333 				_exit(1);
334 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
335 				_exit(1);
336 
337 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
338 				_exit(1);
339 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
340 				_exit(1);
341 
342 			/* Signal parent that setup is complete */
343 			if (write_nointr(sv[i][1], "R", 1) != 1)
344 				_exit(1);
345 
346 			/* Wait for parent to signal us to exit */
347 			if (read_nointr(sv[i][1], &c, 1) != 1)
348 				_exit(1);
349 
350 			close(sv[i][1]);
351 			_exit(0);
352 		}
353 
354 		close(sv[i][1]); /* Close child end */
355 	}
356 
357 	/* Wait for all children to finish setup */
358 	for (i = 0; i < 10; i++) {
359 		ret = read_nointr(sv[i][0], &c, 1);
360 		ASSERT_EQ(ret, 1);
361 		ASSERT_EQ(c, 'R');
362 	}
363 
364 	/* Kill half the children */
365 	for (i = 0; i < 5; i++)
366 		write_nointr(sv[i][0], "X", 1);
367 
368 	/* Small delay to let some exit */
369 	usleep(10000);
370 
371 	/* Kill remaining children */
372 	for (i = 5; i < 10; i++)
373 		write_nointr(sv[i][0], "X", 1);
374 
375 	/* Wait for all children and cleanup */
376 	for (i = 0; i < 10; i++) {
377 		waitpid(-1, NULL, 0);
378 		close(sv[i][0]);
379 		close(pidfds[i]);
380 	}
381 
382 	/* Kill iterator and wait for it */
383 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
384 	ret = waitpid(iter_pid, &status, 0);
385 	ASSERT_EQ(ret, iter_pid);
386 	close(iter_pidfd);
387 
388 	/* Should have been killed */
389 	ASSERT_TRUE(WIFSIGNALED(status));
390 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
391 
392 	/* Clean up */
393 	munmap(map, page_size);
394 }
395 
396 /*
397  * Test specifically focused on mount namespace cleanup during EFAULT.
398  * Filter for mount namespaces only.
399  */
400 TEST(listns_mnt_ns_cleanup_on_fault)
401 {
402 	void *map;
403 	__u64 *ns_ids;
404 	ssize_t ret;
405 	long page_size;
406 	pid_t pid, iter_pid;
407 	int pidfds[8];
408 	int sv[8][2];
409 	int iter_pidfd;
410 	int i, status;
411 	char c;
412 
413 	page_size = sysconf(_SC_PAGESIZE);
414 	ASSERT_GT(page_size, 0);
415 
416 	/* Set up partial fault buffer */
417 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
418 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
419 	ASSERT_NE(map, MAP_FAILED);
420 
421 	ret = munmap((char *)map + page_size, page_size);
422 	ASSERT_EQ(ret, 0);
423 
424 	/* Position for 3 successful writes, then fault */
425 	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
426 
427 	/*
428 	 * Create a separate process to run listns() concurrently.
429 	 */
430 	iter_pid = create_child(&iter_pidfd, 0);
431 	ASSERT_NE(iter_pid, -1);
432 
433 	if (iter_pid == 0) {
434 		struct ns_id_req req = {
435 			.size = sizeof(req),
436 			.spare = 0,
437 			.ns_id = 0,
438 			.ns_type = CLONE_NEWNS,  /* Only mount namespaces */
439 			.spare2 = 0,
440 			.user_ns_id = 0,
441 		};
442 		int iter_ret;
443 
444 		/*
445 		 * Loop calling listns() until killed.
446 		 * Call listns() to race with namespace destruction.
447 		 */
448 		while (1) {
449 			iter_ret = sys_listns(&req, ns_ids, 10, 0);
450 
451 			if (iter_ret == -1 && errno == ENOSYS)
452 				_exit(PIDFD_SKIP);
453 		}
454 	}
455 
456 	/* Small delay to let iterator start looping */
457 	usleep(50000);
458 
459 	/* Create children with mount namespaces */
460 	for (i = 0; i < 8; i++) {
461 		/* Create socketpair for synchronization */
462 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
463 
464 		pid = create_child(&pidfds[i], CLONE_NEWNS);
465 		ASSERT_NE(pid, -1);
466 
467 		if (pid == 0) {
468 			close(sv[i][0]); /* Close parent end */
469 
470 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
471 				_exit(1);
472 
473 			/* Do some mount operations to make cleanup more interesting */
474 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
475 				_exit(1);
476 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
477 				_exit(1);
478 
479 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
480 				_exit(1);
481 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
482 				_exit(1);
483 
484 			/* Signal parent that setup is complete */
485 			if (write_nointr(sv[i][1], "R", 1) != 1)
486 				_exit(1);
487 
488 			/* Wait for parent to signal us to exit */
489 			if (read_nointr(sv[i][1], &c, 1) != 1)
490 				_exit(1);
491 
492 			close(sv[i][1]);
493 			_exit(0);
494 		}
495 
496 		close(sv[i][1]); /* Close child end */
497 	}
498 
499 	/* Wait for all children to finish setup */
500 	for (i = 0; i < 8; i++) {
501 		ret = read_nointr(sv[i][0], &c, 1);
502 		ASSERT_EQ(ret, 1);
503 		ASSERT_EQ(c, 'R');
504 	}
505 
506 	/* Kill children to trigger namespace destruction during iteration */
507 	for (i = 0; i < 8; i++)
508 		write_nointr(sv[i][0], "X", 1);
509 
510 	/* Wait for children and cleanup */
511 	for (i = 0; i < 8; i++) {
512 		waitpid(-1, NULL, 0);
513 		close(sv[i][0]);
514 		close(pidfds[i]);
515 	}
516 
517 	/* Kill iterator and wait for it */
518 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
519 	ret = waitpid(iter_pid, &status, 0);
520 	ASSERT_EQ(ret, iter_pid);
521 	close(iter_pidfd);
522 
523 	/* Should have been killed */
524 	ASSERT_TRUE(WIFSIGNALED(status));
525 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
526 
527 	munmap(map, page_size);
528 }
529 
/* Program entry point comes from this harness macro; NOTE(review): presumably it runs every TEST() defined above — confirm in ../kselftest_harness.h. */
530 TEST_HARNESS_MAIN
531