xref: /linux/tools/testing/selftests/namespaces/listns_efault_test.c (revision e2683c8868d03382da7e1ce8453b543a043066d1)
1 // SPDX-License-Identifier: GPL-2.0
2 #define _GNU_SOURCE
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <limits.h>
6 #include <sched.h>
7 #include <signal.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include <linux/nsfs.h>
12 #include <sys/ioctl.h>
13 #include <sys/mman.h>
14 #include <sys/mount.h>
15 #include <sys/socket.h>
16 #include <sys/stat.h>
17 #include <sys/syscall.h>
18 #include <sys/types.h>
19 #include <sys/wait.h>
20 #include <unistd.h>
21 #include "../kselftest_harness.h"
22 #include "../pidfd/pidfd.h"
23 #include "wrappers.h"
24 
25 /*
26  * Test listns() error handling with invalid buffer addresses.
27  *
28  * When the buffer pointer is invalid (e.g., crossing page boundaries
29  * into unmapped memory), listns() returns EINVAL.
30  *
31  * This test also creates mount namespaces that get destroyed during
32  * iteration, testing that namespace cleanup happens outside the RCU
33  * read lock.
34  */
35 TEST(listns_partial_fault_with_ns_cleanup)
36 {
37 	void *map;
38 	__u64 *ns_ids;
39 	ssize_t ret;
40 	long page_size;
41 	pid_t pid, iter_pid;
42 	int pidfds[5];
43 	int sv[5][2];
44 	int iter_pidfd;
45 	int i, status;
46 	char c;
47 
48 	page_size = sysconf(_SC_PAGESIZE);
49 	ASSERT_GT(page_size, 0);
50 
51 	/*
52 	 * Map two pages:
53 	 * - First page: readable and writable
54 	 * - Second page: will be unmapped to trigger EFAULT
55 	 */
56 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
57 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
58 	ASSERT_NE(map, MAP_FAILED);
59 
60 	/* Unmap the second page */
61 	ret = munmap((char *)map + page_size, page_size);
62 	ASSERT_EQ(ret, 0);
63 
64 	/*
65 	 * Position the buffer pointer so there's room for exactly one u64
66 	 * before the page boundary. The second u64 would fall into the
67 	 * unmapped page.
68 	 */
69 	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
70 
71 	/*
72 	 * Create a separate process to run listns() in a loop concurrently
73 	 * with namespace creation and destruction.
74 	 */
75 	iter_pid = create_child(&iter_pidfd, 0);
76 	ASSERT_NE(iter_pid, -1);
77 
78 	if (iter_pid == 0) {
79 		struct ns_id_req req = {
80 			.size = sizeof(req),
81 			.spare = 0,
82 			.ns_id = 0,
83 			.ns_type = 0,  /* All types */
84 			.spare2 = 0,
85 			.user_ns_id = 0,  /* Global listing */
86 		};
87 		int iter_ret;
88 
89 		/*
90 		 * Loop calling listns() until killed.
91 		 * The kernel should:
92 		 * 1. Successfully write the first namespace ID (within valid page)
93 		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
94 		 * 3. Handle concurrent namespace destruction without deadlock
95 		 */
96 		while (1) {
97 			iter_ret = sys_listns(&req, ns_ids, 2, 0);
98 
99 			if (iter_ret == -1 && errno == ENOSYS)
100 				_exit(PIDFD_SKIP);
101 		}
102 	}
103 
104 	/* Small delay to let iterator start looping */
105 	usleep(50000);
106 
107 	/*
108 	 * Create several child processes, each in its own mount namespace.
109 	 * These will be destroyed while the iterator is running listns().
110 	 */
111 	for (i = 0; i < 5; i++) {
112 		/* Create socketpair for synchronization */
113 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
114 
115 		pid = create_child(&pidfds[i], CLONE_NEWNS);
116 		ASSERT_NE(pid, -1);
117 
118 		if (pid == 0) {
119 			close(sv[i][0]); /* Close parent end */
120 
121 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
122 				_exit(1);
123 
124 			/* Child: create a couple of tmpfs mounts */
125 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
126 				_exit(1);
127 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
128 				_exit(1);
129 
130 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
131 				_exit(1);
132 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
133 				_exit(1);
134 
135 			/* Signal parent that setup is complete */
136 			if (write_nointr(sv[i][1], "R", 1) != 1)
137 				_exit(1);
138 
139 			/* Wait for parent to signal us to exit */
140 			if (read_nointr(sv[i][1], &c, 1) != 1)
141 				_exit(1);
142 
143 			close(sv[i][1]);
144 			_exit(0);
145 		}
146 
147 		close(sv[i][1]); /* Close child end */
148 	}
149 
150 	/* Wait for all children to finish setup */
151 	for (i = 0; i < 5; i++) {
152 		ret = read_nointr(sv[i][0], &c, 1);
153 		ASSERT_EQ(ret, 1);
154 		ASSERT_EQ(c, 'R');
155 	}
156 
157 	/*
158 	 * Signal children to exit. This will destroy their mount namespaces
159 	 * while listns() is iterating the namespace tree.
160 	 * This tests that cleanup happens outside the RCU read lock.
161 	 */
162 	for (i = 0; i < 5; i++)
163 		write_nointr(sv[i][0], "X", 1);
164 
165 	/* Wait for all mount namespace children to exit and cleanup */
166 	for (i = 0; i < 5; i++) {
167 		waitpid(-1, NULL, 0);
168 		close(sv[i][0]);
169 		close(pidfds[i]);
170 	}
171 
172 	/* Kill iterator and wait for it */
173 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
174 	ret = waitpid(iter_pid, &status, 0);
175 	ASSERT_EQ(ret, iter_pid);
176 	close(iter_pidfd);
177 
178 	/* Should have been killed */
179 	ASSERT_TRUE(WIFSIGNALED(status));
180 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
181 
182 	/* Clean up */
183 	munmap(map, page_size);
184 }
185 
186 /*
187  * Test listns() error handling when the entire buffer is invalid.
188  * This is a sanity check that basic invalid pointer detection works.
189  */
190 TEST(listns_complete_fault)
191 {
192 	struct ns_id_req req = {
193 		.size = sizeof(req),
194 		.spare = 0,
195 		.ns_id = 0,
196 		.ns_type = 0,
197 		.spare2 = 0,
198 		.user_ns_id = 0,
199 	};
200 	__u64 *ns_ids;
201 	ssize_t ret;
202 
203 	/* Use a clearly invalid pointer */
204 	ns_ids = (__u64 *)0xdeadbeef;
205 
206 	ret = sys_listns(&req, ns_ids, 10, 0);
207 
208 	if (ret == -1 && errno == ENOSYS)
209 		SKIP(return, "listns() not supported");
210 
211 	/* Should fail with EFAULT */
212 	ASSERT_EQ(ret, -1);
213 	ASSERT_EQ(errno, EFAULT);
214 }
215 
216 /*
217  * Test listns() error handling when the buffer is NULL.
218  */
219 TEST(listns_null_buffer)
220 {
221 	struct ns_id_req req = {
222 		.size = sizeof(req),
223 		.spare = 0,
224 		.ns_id = 0,
225 		.ns_type = 0,
226 		.spare2 = 0,
227 		.user_ns_id = 0,
228 	};
229 	ssize_t ret;
230 
231 	/* NULL buffer with non-zero count should fail */
232 	ret = sys_listns(&req, NULL, 10, 0);
233 
234 	if (ret == -1 && errno == ENOSYS)
235 		SKIP(return, "listns() not supported");
236 
237 	/* Should fail with EFAULT */
238 	ASSERT_EQ(ret, -1);
239 	ASSERT_EQ(errno, EFAULT);
240 }
241 
242 /*
243  * Test listns() with a buffer that becomes invalid mid-iteration
244  * (after several successful writes), combined with mount namespace
245  * destruction to test RCU cleanup logic.
246  */
247 TEST(listns_late_fault_with_ns_cleanup)
248 {
249 	void *map;
250 	__u64 *ns_ids;
251 	ssize_t ret;
252 	long page_size;
253 	pid_t pid, iter_pid;
254 	int pidfds[10];
255 	int sv[10][2];
256 	int iter_pidfd;
257 	int i, status;
258 	char c;
259 
260 	page_size = sysconf(_SC_PAGESIZE);
261 	ASSERT_GT(page_size, 0);
262 
263 	/* Map two pages */
264 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
265 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
266 	ASSERT_NE(map, MAP_FAILED);
267 
268 	/* Unmap the second page */
269 	ret = munmap((char *)map + page_size, page_size);
270 	ASSERT_EQ(ret, 0);
271 
272 	/*
273 	 * Position buffer so we can write several u64s successfully
274 	 * before hitting the page boundary.
275 	 */
276 	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;
277 
278 	/*
279 	 * Create a separate process to run listns() concurrently.
280 	 */
281 	iter_pid = create_child(&iter_pidfd, 0);
282 	ASSERT_NE(iter_pid, -1);
283 
284 	if (iter_pid == 0) {
285 		struct ns_id_req req = {
286 			.size = sizeof(req),
287 			.spare = 0,
288 			.ns_id = 0,
289 			.ns_type = 0,
290 			.spare2 = 0,
291 			.user_ns_id = 0,
292 		};
293 		int iter_ret;
294 
295 		/*
296 		 * Loop calling listns() until killed.
297 		 * Request 10 namespace IDs while namespaces are being destroyed.
298 		 * This tests:
299 		 * 1. EFAULT handling when buffer becomes invalid
300 		 * 2. Namespace cleanup outside RCU read lock during iteration
301 		 */
302 		while (1) {
303 			iter_ret = sys_listns(&req, ns_ids, 10, 0);
304 
305 			if (iter_ret == -1 && errno == ENOSYS)
306 				_exit(PIDFD_SKIP);
307 		}
308 	}
309 
310 	/* Small delay to let iterator start looping */
311 	usleep(50000);
312 
313 	/*
314 	 * Create more children with mount namespaces to increase the
315 	 * likelihood that namespace cleanup happens during iteration.
316 	 */
317 	for (i = 0; i < 10; i++) {
318 		/* Create socketpair for synchronization */
319 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
320 
321 		pid = create_child(&pidfds[i], CLONE_NEWNS);
322 		ASSERT_NE(pid, -1);
323 
324 		if (pid == 0) {
325 			close(sv[i][0]); /* Close parent end */
326 
327 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
328 				_exit(1);
329 
330 			/* Child: create tmpfs mounts */
331 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
332 				_exit(1);
333 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
334 				_exit(1);
335 
336 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
337 				_exit(1);
338 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
339 				_exit(1);
340 
341 			/* Signal parent that setup is complete */
342 			if (write_nointr(sv[i][1], "R", 1) != 1)
343 				_exit(1);
344 
345 			/* Wait for parent to signal us to exit */
346 			if (read_nointr(sv[i][1], &c, 1) != 1)
347 				_exit(1);
348 
349 			close(sv[i][1]);
350 			_exit(0);
351 		}
352 
353 		close(sv[i][1]); /* Close child end */
354 	}
355 
356 	/* Wait for all children to finish setup */
357 	for (i = 0; i < 10; i++) {
358 		ret = read_nointr(sv[i][0], &c, 1);
359 		ASSERT_EQ(ret, 1);
360 		ASSERT_EQ(c, 'R');
361 	}
362 
363 	/* Kill half the children */
364 	for (i = 0; i < 5; i++)
365 		write_nointr(sv[i][0], "X", 1);
366 
367 	/* Small delay to let some exit */
368 	usleep(10000);
369 
370 	/* Kill remaining children */
371 	for (i = 5; i < 10; i++)
372 		write_nointr(sv[i][0], "X", 1);
373 
374 	/* Wait for all children and cleanup */
375 	for (i = 0; i < 10; i++) {
376 		waitpid(-1, NULL, 0);
377 		close(sv[i][0]);
378 		close(pidfds[i]);
379 	}
380 
381 	/* Kill iterator and wait for it */
382 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
383 	ret = waitpid(iter_pid, &status, 0);
384 	ASSERT_EQ(ret, iter_pid);
385 	close(iter_pidfd);
386 
387 	/* Should have been killed */
388 	ASSERT_TRUE(WIFSIGNALED(status));
389 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
390 
391 	/* Clean up */
392 	munmap(map, page_size);
393 }
394 
395 /*
396  * Test specifically focused on mount namespace cleanup during EFAULT.
397  * Filter for mount namespaces only.
398  */
399 TEST(listns_mnt_ns_cleanup_on_fault)
400 {
401 	void *map;
402 	__u64 *ns_ids;
403 	ssize_t ret;
404 	long page_size;
405 	pid_t pid, iter_pid;
406 	int pidfds[8];
407 	int sv[8][2];
408 	int iter_pidfd;
409 	int i, status;
410 	char c;
411 
412 	page_size = sysconf(_SC_PAGESIZE);
413 	ASSERT_GT(page_size, 0);
414 
415 	/* Set up partial fault buffer */
416 	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
417 		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
418 	ASSERT_NE(map, MAP_FAILED);
419 
420 	ret = munmap((char *)map + page_size, page_size);
421 	ASSERT_EQ(ret, 0);
422 
423 	/* Position for 3 successful writes, then fault */
424 	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;
425 
426 	/*
427 	 * Create a separate process to run listns() concurrently.
428 	 */
429 	iter_pid = create_child(&iter_pidfd, 0);
430 	ASSERT_NE(iter_pid, -1);
431 
432 	if (iter_pid == 0) {
433 		struct ns_id_req req = {
434 			.size = sizeof(req),
435 			.spare = 0,
436 			.ns_id = 0,
437 			.ns_type = CLONE_NEWNS,  /* Only mount namespaces */
438 			.spare2 = 0,
439 			.user_ns_id = 0,
440 		};
441 		int iter_ret;
442 
443 		/*
444 		 * Loop calling listns() until killed.
445 		 * Call listns() to race with namespace destruction.
446 		 */
447 		while (1) {
448 			iter_ret = sys_listns(&req, ns_ids, 10, 0);
449 
450 			if (iter_ret == -1 && errno == ENOSYS)
451 				_exit(PIDFD_SKIP);
452 		}
453 	}
454 
455 	/* Small delay to let iterator start looping */
456 	usleep(50000);
457 
458 	/* Create children with mount namespaces */
459 	for (i = 0; i < 8; i++) {
460 		/* Create socketpair for synchronization */
461 		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);
462 
463 		pid = create_child(&pidfds[i], CLONE_NEWNS);
464 		ASSERT_NE(pid, -1);
465 
466 		if (pid == 0) {
467 			close(sv[i][0]); /* Close parent end */
468 
469 			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
470 				_exit(1);
471 
472 			/* Do some mount operations to make cleanup more interesting */
473 			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
474 				_exit(1);
475 			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
476 				_exit(1);
477 
478 			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
479 				_exit(1);
480 			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
481 				_exit(1);
482 
483 			/* Signal parent that setup is complete */
484 			if (write_nointr(sv[i][1], "R", 1) != 1)
485 				_exit(1);
486 
487 			/* Wait for parent to signal us to exit */
488 			if (read_nointr(sv[i][1], &c, 1) != 1)
489 				_exit(1);
490 
491 			close(sv[i][1]);
492 			_exit(0);
493 		}
494 
495 		close(sv[i][1]); /* Close child end */
496 	}
497 
498 	/* Wait for all children to finish setup */
499 	for (i = 0; i < 8; i++) {
500 		ret = read_nointr(sv[i][0], &c, 1);
501 		ASSERT_EQ(ret, 1);
502 		ASSERT_EQ(c, 'R');
503 	}
504 
505 	/* Kill children to trigger namespace destruction during iteration */
506 	for (i = 0; i < 8; i++)
507 		write_nointr(sv[i][0], "X", 1);
508 
509 	/* Wait for children and cleanup */
510 	for (i = 0; i < 8; i++) {
511 		waitpid(-1, NULL, 0);
512 		close(sv[i][0]);
513 		close(pidfds[i]);
514 	}
515 
516 	/* Kill iterator and wait for it */
517 	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
518 	ret = waitpid(iter_pid, &status, 0);
519 	ASSERT_EQ(ret, iter_pid);
520 	close(iter_pidfd);
521 
522 	/* Should have been killed */
523 	ASSERT_TRUE(WIFSIGNALED(status));
524 	ASSERT_EQ(WTERMSIG(status), SIGKILL);
525 
526 	munmap(map, page_size);
527 }
528 
/*
 * Macro from ../kselftest_harness.h; presumably supplies the program entry
 * point that runs the TEST() cases declared in this file (confirm against
 * the harness header).
 */
529 TEST_HARNESS_MAIN
530