// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <linux/nsfs.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include "../kselftest_harness.h"
#include "../filesystems/utils.h"
#include "../pidfd/pidfd.h"
#include "wrappers.h"
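
/*
 * Helpers such as create_child(), read_nointr(), write_nointr(),
 * sys_pidfd_send_signal(), sys_listns(), and struct ns_id_req come from the
 * headers included above. A rough sketch of the pieces these tests depend on
 * (kept as a comment so the real declarations in wrappers.h and pidfd.h stay
 * authoritative; field types and the exact wrapper signature are inferred
 * from usage below, not copied from the uapi):
 *
 *	struct ns_id_req {
 *		__u32 size;		// sizeof(struct ns_id_req)
 *		__u32 spare;		// reserved, must be zero
 *		__u64 ns_id;		// resume after this namespace ID, 0 = start
 *		__u32 ns_type;		// CLONE_NEW* filter, 0 = all types
 *		__u32 spare2;		// reserved, must be zero
 *		__u64 user_ns_id;	// owning user namespace ID, 0 = global listing
 *	};
 *
 *	// Thin syscall wrapper: fills ns_ids with up to nr_ns_ids namespace
 *	// IDs, returning the number written or -1 with errno set.
 *	ssize_t sys_listns(const struct ns_id_req *req, __u64 *ns_ids,
 *			   size_t nr_ns_ids, unsigned int flags);
 */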

/*
 * Test listns() error handling with invalid buffer addresses.
 *
 * When the buffer pointer is invalid (e.g., crossing page boundaries
 * into unmapped memory), listns() returns EFAULT.
 *
 * This test also creates mount namespaces that get destroyed during
 * iteration, testing that namespace cleanup happens outside the RCU
 * read lock.
 */
TEST(listns_partial_fault_with_ns_cleanup)
{
	void *map;
	__u64 *ns_ids;
	ssize_t ret;
	long page_size;
	pid_t pid, iter_pid;
	int pidfds[5];
	int sv[5][2];
	int iter_pidfd;
	int i, status;
	char c;

	page_size = sysconf(_SC_PAGESIZE);
	ASSERT_GT(page_size, 0);

	/*
	 * Map two pages:
	 * - First page: readable and writable
	 * - Second page: will be unmapped to trigger EFAULT
	 */
	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	ASSERT_NE(map, MAP_FAILED);

	/* Unmap the second page */
	ret = munmap((char *)map + page_size, page_size);
	ASSERT_EQ(ret, 0);

	/*
	 * Position the buffer pointer so there's room for exactly one u64
	 * before the page boundary. The second u64 would fall into the
	 * unmapped page.
	 */
	ns_ids = ((__u64 *)((char *)map + page_size)) - 1;
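
	/*
	 * Resulting layout (assumed picture of the buffer placement; only
	 * ns_ids[0] is backed by mapped memory):
	 *
	 *   map                          map + page_size
	 *    |                                 |
	 *    v                                 v
	 *    [ ..... mapped page ... |ns_ids[0]][ unmapped page ]
	 *                             ^          ^
	 *                             ns_ids     ns_ids[1] would land here
	 */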

	/*
	 * Create a separate process to run listns() in a loop concurrently
	 * with namespace creation and destruction.
	 */
	iter_pid = create_child(&iter_pidfd, 0);
	ASSERT_NE(iter_pid, -1);

	if (iter_pid == 0) {
		struct ns_id_req req = {
			.size = sizeof(req),
			.spare = 0,
			.ns_id = 0,
			.ns_type = 0, /* All types */
			.spare2 = 0,
			.user_ns_id = 0, /* Global listing */
		};
		int iter_ret;

		/*
		 * Loop calling listns() until killed.
		 * The kernel should:
		 * 1. Successfully write the first namespace ID (within valid page)
		 * 2. Fail with EFAULT when trying to write the second ID (unmapped page)
		 * 3. Handle concurrent namespace destruction without deadlock
		 */
		while (1) {
			iter_ret = sys_listns(&req, ns_ids, 2, 0);

			if (iter_ret == -1 && errno == ENOSYS)
				_exit(PIDFD_SKIP);
		}
	}

	/* Small delay to let iterator start looping */
	usleep(50000);

	/*
	 * Create several child processes, each in its own mount namespace.
	 * These will be destroyed while the iterator is running listns().
	 */
	for (i = 0; i < 5; i++) {
		/* Create socketpair for synchronization */
		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);

		pid = create_child(&pidfds[i], CLONE_NEWNS);
		ASSERT_NE(pid, -1);

		if (pid == 0) {
			close(sv[i][0]); /* Close parent end */

			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
				_exit(1);

			/* Child: create a couple of tmpfs mounts */
			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
				_exit(1);
			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
				_exit(1);

			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
				_exit(1);
			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
				_exit(1);

			/* Signal parent that setup is complete */
			if (write_nointr(sv[i][1], "R", 1) != 1)
				_exit(1);

			/* Wait for parent to signal us to exit */
			if (read_nointr(sv[i][1], &c, 1) != 1)
				_exit(1);

			close(sv[i][1]);
			_exit(0);
		}

		close(sv[i][1]); /* Close child end */
	}

	/* Wait for all children to finish setup */
	for (i = 0; i < 5; i++) {
		ret = read_nointr(sv[i][0], &c, 1);
		ASSERT_EQ(ret, 1);
		ASSERT_EQ(c, 'R');
	}

	/*
	 * Signal children to exit. This will destroy their mount namespaces
	 * while listns() is iterating the namespace tree.
	 * This tests that cleanup happens outside the RCU read lock.
	 */
	for (i = 0; i < 5; i++)
		write_nointr(sv[i][0], "X", 1);

	/* Wait for all mount namespace children to exit and cleanup */
	for (i = 0; i < 5; i++) {
		waitpid(-1, NULL, 0);
		close(sv[i][0]);
		close(pidfds[i]);
	}

	/* Kill iterator and wait for it */
	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
	ret = waitpid(iter_pid, &status, 0);
	ASSERT_EQ(ret, iter_pid);
	close(iter_pidfd);

	/* Should have been killed */
	ASSERT_TRUE(WIFSIGNALED(status));
	ASSERT_EQ(WTERMSIG(status), SIGKILL);

	/* Clean up */
	munmap(map, page_size);
}

/*
 * Test listns() error handling when the entire buffer is invalid.
 * This is a sanity check that basic invalid pointer detection works.
 */
TEST(listns_complete_fault)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.spare = 0,
		.ns_id = 0,
		.ns_type = 0,
		.spare2 = 0,
		.user_ns_id = 0,
	};
	__u64 *ns_ids;
	ssize_t ret;

	/* Use a clearly invalid pointer */
	ns_ids = (__u64 *)0xdeadbeef;

	ret = sys_listns(&req, ns_ids, 10, 0);

	if (ret == -1 && errno == ENOSYS)
		SKIP(return, "listns() not supported");

	/* Should fail with EFAULT */
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EFAULT);
}

/*
 * Test listns() error handling when the buffer is NULL.
 */
TEST(listns_null_buffer)
{
	struct ns_id_req req = {
		.size = sizeof(req),
		.spare = 0,
		.ns_id = 0,
		.ns_type = 0,
		.spare2 = 0,
		.user_ns_id = 0,
	};
	ssize_t ret;

	/* NULL buffer with non-zero count should fail */
	ret = sys_listns(&req, NULL, 10, 0);

	if (ret == -1 && errno == ENOSYS)
		SKIP(return, "listns() not supported");

	/* Should fail with EFAULT */
	ASSERT_EQ(ret, -1);
	ASSERT_EQ(errno, EFAULT);
}

/*
 * Test listns() with a buffer that becomes invalid mid-iteration
 * (after several successful writes), combined with mount namespace
 * destruction to test RCU cleanup logic.
 */
TEST(listns_late_fault_with_ns_cleanup)
{
	void *map;
	__u64 *ns_ids;
	ssize_t ret;
	long page_size;
	pid_t pid, iter_pid;
	int pidfds[10];
	int sv[10][2];
	int iter_pidfd;
	int i, status;
	char c;

	page_size = sysconf(_SC_PAGESIZE);
	ASSERT_GT(page_size, 0);

	/* Map two pages */
	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	ASSERT_NE(map, MAP_FAILED);

	/* Unmap the second page */
	ret = munmap((char *)map + page_size, page_size);
	ASSERT_EQ(ret, 0);

	/*
	 * Position buffer so we can write several u64s successfully
	 * before hitting the page boundary.
	 */
	ns_ids = ((__u64 *)((char *)map + page_size)) - 5;

	/*
	 * Create a separate process to run listns() concurrently.
	 */
	iter_pid = create_child(&iter_pidfd, 0);
	ASSERT_NE(iter_pid, -1);

	if (iter_pid == 0) {
		struct ns_id_req req = {
			.size = sizeof(req),
			.spare = 0,
			.ns_id = 0,
			.ns_type = 0,
			.spare2 = 0,
			.user_ns_id = 0,
		};
		int iter_ret;

		/*
		 * Loop calling listns() until killed.
		 * Request 10 namespace IDs while namespaces are being destroyed.
		 * This tests:
		 * 1. EFAULT handling when buffer becomes invalid
		 * 2. Namespace cleanup outside RCU read lock during iteration
		 */
		while (1) {
			iter_ret = sys_listns(&req, ns_ids, 10, 0);

			if (iter_ret == -1 && errno == ENOSYS)
				_exit(PIDFD_SKIP);
		}
	}

	/* Small delay to let iterator start looping */
	usleep(50000);

	/*
	 * Create more children with mount namespaces to increase the
	 * likelihood that namespace cleanup happens during iteration.
	 */
	for (i = 0; i < 10; i++) {
		/* Create socketpair for synchronization */
		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);

		pid = create_child(&pidfds[i], CLONE_NEWNS);
		ASSERT_NE(pid, -1);

		if (pid == 0) {
			close(sv[i][0]); /* Close parent end */

			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
				_exit(1);

			/* Child: create tmpfs mounts */
			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
				_exit(1);
			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
				_exit(1);

			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
				_exit(1);
			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
				_exit(1);

			/* Signal parent that setup is complete */
			if (write_nointr(sv[i][1], "R", 1) != 1)
				_exit(1);

			/* Wait for parent to signal us to exit */
			if (read_nointr(sv[i][1], &c, 1) != 1)
				_exit(1);

			close(sv[i][1]);
			_exit(0);
		}

		close(sv[i][1]); /* Close child end */
	}

	/* Wait for all children to finish setup */
	for (i = 0; i < 10; i++) {
		ret = read_nointr(sv[i][0], &c, 1);
		ASSERT_EQ(ret, 1);
		ASSERT_EQ(c, 'R');
	}

	/* Signal half the children to exit */
	for (i = 0; i < 5; i++)
		write_nointr(sv[i][0], "X", 1);

	/* Small delay to let some exit */
	usleep(10000);

	/* Signal the remaining children to exit */
	for (i = 5; i < 10; i++)
		write_nointr(sv[i][0], "X", 1);

	/* Wait for all children and cleanup */
	for (i = 0; i < 10; i++) {
		waitpid(-1, NULL, 0);
		close(sv[i][0]);
		close(pidfds[i]);
	}

	/* Kill iterator and wait for it */
	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
	ret = waitpid(iter_pid, &status, 0);
	ASSERT_EQ(ret, iter_pid);
	close(iter_pidfd);

	/* Should have been killed */
	ASSERT_TRUE(WIFSIGNALED(status));
	ASSERT_EQ(WTERMSIG(status), SIGKILL);

	/* Clean up */
	munmap(map, page_size);
}

/*
 * Test specifically focused on mount namespace cleanup during EFAULT.
 * Filter for mount namespaces only.
 */
TEST(listns_mnt_ns_cleanup_on_fault)
{
	void *map;
	__u64 *ns_ids;
	ssize_t ret;
	long page_size;
	pid_t pid, iter_pid;
	int pidfds[8];
	int sv[8][2];
	int iter_pidfd;
	int i, status;
	char c;

	page_size = sysconf(_SC_PAGESIZE);
	ASSERT_GT(page_size, 0);

	/* Set up partial fault buffer */
	map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	ASSERT_NE(map, MAP_FAILED);

	ret = munmap((char *)map + page_size, page_size);
	ASSERT_EQ(ret, 0);

	/* Position for 3 successful writes, then fault */
	ns_ids = ((__u64 *)((char *)map + page_size)) - 3;

	/*
	 * Create a separate process to run listns() concurrently.
	 */
	iter_pid = create_child(&iter_pidfd, 0);
	ASSERT_NE(iter_pid, -1);

	if (iter_pid == 0) {
		struct ns_id_req req = {
			.size = sizeof(req),
			.spare = 0,
			.ns_id = 0,
			.ns_type = CLONE_NEWNS, /* Only mount namespaces */
			.spare2 = 0,
			.user_ns_id = 0,
		};
		int iter_ret;

		/*
		 * Loop calling listns() until killed, racing with
		 * concurrent mount namespace destruction.
		 */
		while (1) {
			iter_ret = sys_listns(&req, ns_ids, 10, 0);

			if (iter_ret == -1 && errno == ENOSYS)
				_exit(PIDFD_SKIP);
		}
	}

	/* Small delay to let iterator start looping */
	usleep(50000);

	/* Create children with mount namespaces */
	for (i = 0; i < 8; i++) {
		/* Create socketpair for synchronization */
		ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0);

		pid = create_child(&pidfds[i], CLONE_NEWNS);
		ASSERT_NE(pid, -1);

		if (pid == 0) {
			close(sv[i][0]); /* Close parent end */

			if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0))
				_exit(1);

			/* Do some mount operations to make cleanup more interesting */
			if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST)
				_exit(1);
			if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST)
				_exit(1);

			if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1)
				_exit(1);
			if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1)
				_exit(1);

			/* Signal parent that setup is complete */
			if (write_nointr(sv[i][1], "R", 1) != 1)
				_exit(1);

			/* Wait for parent to signal us to exit */
			if (read_nointr(sv[i][1], &c, 1) != 1)
				_exit(1);

			close(sv[i][1]);
			_exit(0);
		}

		close(sv[i][1]); /* Close child end */
	}

	/* Wait for all children to finish setup */
	for (i = 0; i < 8; i++) {
		ret = read_nointr(sv[i][0], &c, 1);
		ASSERT_EQ(ret, 1);
		ASSERT_EQ(c, 'R');
	}

	/* Signal children to exit, destroying their mount namespaces during iteration */
	for (i = 0; i < 8; i++)
		write_nointr(sv[i][0], "X", 1);

	/* Wait for children and cleanup */
	for (i = 0; i < 8; i++) {
		waitpid(-1, NULL, 0);
		close(sv[i][0]);
		close(pidfds[i]);
	}

	/* Kill iterator and wait for it */
	sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0);
	ret = waitpid(iter_pid, &status, 0);
	ASSERT_EQ(ret, iter_pid);
	close(iter_pidfd);

	/* Should have been killed */
	ASSERT_TRUE(WIFSIGNALED(status));
	ASSERT_EQ(WTERMSIG(status), SIGKILL);

	munmap(map, page_size);
}

TEST_HARNESS_MAIN