1 // SPDX-License-Identifier: GPL-2.0 2 #define _GNU_SOURCE 3 #include <errno.h> 4 #include <fcntl.h> 5 #include <limits.h> 6 #include <sched.h> 7 #include <signal.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 #include <linux/nsfs.h> 12 #include <sys/ioctl.h> 13 #include <sys/mman.h> 14 #include <sys/mount.h> 15 #include <sys/socket.h> 16 #include <sys/stat.h> 17 #include <sys/syscall.h> 18 #include <sys/types.h> 19 #include <sys/wait.h> 20 #include <unistd.h> 21 #include "../kselftest_harness.h" 22 #include "../pidfd/pidfd.h" 23 #include "wrappers.h" 24 25 /* 26 * Test listns() error handling with invalid buffer addresses. 27 * 28 * When the buffer pointer is invalid (e.g., crossing page boundaries 29 * into unmapped memory), listns() returns EINVAL. 30 * 31 * This test also creates mount namespaces that get destroyed during 32 * iteration, testing that namespace cleanup happens outside the RCU 33 * read lock. 34 */ 35 TEST(listns_partial_fault_with_ns_cleanup) 36 { 37 void *map; 38 __u64 *ns_ids; 39 ssize_t ret; 40 long page_size; 41 pid_t pid, iter_pid, ns_pids[5]; 42 int pidfds[5]; 43 int sv[5][2]; 44 int iter_pidfd; 45 int i, status; 46 char c; 47 48 page_size = sysconf(_SC_PAGESIZE); 49 ASSERT_GT(page_size, 0); 50 51 /* 52 * Map two pages: 53 * - First page: readable and writable 54 * - Second page: will be unmapped to trigger EFAULT 55 */ 56 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 57 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 58 ASSERT_NE(map, MAP_FAILED); 59 60 /* Unmap the second page */ 61 ret = munmap((char *)map + page_size, page_size); 62 ASSERT_EQ(ret, 0); 63 64 /* 65 * Position the buffer pointer so there's room for exactly one u64 66 * before the page boundary. The second u64 would fall into the 67 * unmapped page. 68 */ 69 ns_ids = ((__u64 *)((char *)map + page_size)) - 1; 70 71 /* 72 * Create a separate process to run listns() in a loop concurrently 73 * with namespace creation and destruction. 74 */ 75 iter_pid = create_child(&iter_pidfd, 0); 76 ASSERT_NE(iter_pid, -1); 77 78 if (iter_pid == 0) { 79 struct ns_id_req req = { 80 .size = sizeof(req), 81 .spare = 0, 82 .ns_id = 0, 83 .ns_type = 0, /* All types */ 84 .spare2 = 0, 85 .user_ns_id = 0, /* Global listing */ 86 }; 87 int iter_ret; 88 89 /* 90 * Loop calling listns() until killed. 91 * The kernel should: 92 * 1. Successfully write the first namespace ID (within valid page) 93 * 2. Fail with EFAULT when trying to write the second ID (unmapped page) 94 * 3. Handle concurrent namespace destruction without deadlock 95 */ 96 while (1) { 97 iter_ret = sys_listns(&req, ns_ids, 2, 0); 98 99 if (iter_ret == -1 && errno == ENOSYS) 100 _exit(PIDFD_SKIP); 101 } 102 } 103 104 /* Small delay to let iterator start looping */ 105 usleep(50000); 106 107 /* 108 * Create several child processes, each in its own mount namespace. 109 * These will be destroyed while the iterator is running listns(). 110 */ 111 for (i = 0; i < 5; i++) { 112 /* Create socketpair for synchronization */ 113 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 114 115 pid = create_child(&pidfds[i], CLONE_NEWNS); 116 ASSERT_NE(pid, -1); 117 ns_pids[i] = pid; 118 119 if (pid == 0) { 120 close(sv[i][0]); /* Close parent end */ 121 122 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 123 _exit(1); 124 125 /* Child: create a couple of tmpfs mounts */ 126 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 127 _exit(1); 128 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 129 _exit(1); 130 131 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 132 _exit(1); 133 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 134 _exit(1); 135 136 /* Signal parent that setup is complete */ 137 if (write_nointr(sv[i][1], "R", 1) != 1) 138 _exit(1); 139 140 /* Wait for parent to signal us to exit */ 141 if (read_nointr(sv[i][1], &c, 1) != 1) 142 _exit(1); 143 144 close(sv[i][1]); 145 _exit(0); 146 } 147 148 close(sv[i][1]); /* Close child end */ 149 } 150 151 /* Wait for all children to finish setup */ 152 for (i = 0; i < 5; i++) { 153 ret = read_nointr(sv[i][0], &c, 1); 154 ASSERT_EQ(ret, 1); 155 ASSERT_EQ(c, 'R'); 156 } 157 158 /* 159 * Signal children to exit. This will destroy their mount namespaces 160 * while listns() is iterating the namespace tree. 161 * This tests that cleanup happens outside the RCU read lock. 162 */ 163 for (i = 0; i < 5; i++) 164 write_nointr(sv[i][0], "X", 1); 165 166 /* Wait for all mount namespace children to exit and cleanup */ 167 for (i = 0; i < 5; i++) { 168 waitpid(ns_pids[i], NULL, 0); 169 close(sv[i][0]); 170 close(pidfds[i]); 171 } 172 173 /* Kill iterator and wait for it */ 174 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 175 ret = waitpid(iter_pid, &status, 0); 176 ASSERT_EQ(ret, iter_pid); 177 close(iter_pidfd); 178 179 /* If listns() is not supported the iterator exits cleanly via ENOSYS */ 180 if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) { 181 munmap(map, page_size); 182 SKIP(return, "listns() not supported"); 183 } 184 185 /* Should have been killed */ 186 ASSERT_TRUE(WIFSIGNALED(status)); 187 ASSERT_EQ(WTERMSIG(status), SIGKILL); 188 189 /* Clean up */ 190 munmap(map, page_size); 191 } 192 193 /* 194 * Test listns() error handling when the entire buffer is invalid. 195 * This is a sanity check that basic invalid pointer detection works. 196 */ 197 TEST(listns_complete_fault) 198 { 199 struct ns_id_req req = { 200 .size = sizeof(req), 201 .spare = 0, 202 .ns_id = 0, 203 .ns_type = 0, 204 .spare2 = 0, 205 .user_ns_id = 0, 206 }; 207 __u64 *ns_ids; 208 ssize_t ret; 209 210 /* Use a clearly invalid pointer */ 211 ns_ids = (__u64 *)0xdeadbeef; 212 213 ret = sys_listns(&req, ns_ids, 10, 0); 214 215 if (ret == -1 && errno == ENOSYS) 216 SKIP(return, "listns() not supported"); 217 218 /* Should fail with EFAULT */ 219 ASSERT_EQ(ret, -1); 220 ASSERT_EQ(errno, EFAULT); 221 } 222 223 /* 224 * Test listns() error handling when the buffer is NULL. 225 */ 226 TEST(listns_null_buffer) 227 { 228 struct ns_id_req req = { 229 .size = sizeof(req), 230 .spare = 0, 231 .ns_id = 0, 232 .ns_type = 0, 233 .spare2 = 0, 234 .user_ns_id = 0, 235 }; 236 ssize_t ret; 237 238 /* NULL buffer with non-zero count should fail */ 239 ret = sys_listns(&req, NULL, 10, 0); 240 241 if (ret == -1 && errno == ENOSYS) 242 SKIP(return, "listns() not supported"); 243 244 /* Should fail with EFAULT */ 245 ASSERT_EQ(ret, -1); 246 ASSERT_EQ(errno, EFAULT); 247 } 248 249 /* 250 * Test listns() with a buffer that becomes invalid mid-iteration 251 * (after several successful writes), combined with mount namespace 252 * destruction to test RCU cleanup logic. 253 */ 254 TEST(listns_late_fault_with_ns_cleanup) 255 { 256 void *map; 257 __u64 *ns_ids; 258 ssize_t ret; 259 long page_size; 260 pid_t pid, iter_pid, ns_pids[10]; 261 int pidfds[10]; 262 int sv[10][2]; 263 int iter_pidfd; 264 int i, status; 265 char c; 266 267 page_size = sysconf(_SC_PAGESIZE); 268 ASSERT_GT(page_size, 0); 269 270 /* Map two pages */ 271 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 272 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 273 ASSERT_NE(map, MAP_FAILED); 274 275 /* Unmap the second page */ 276 ret = munmap((char *)map + page_size, page_size); 277 ASSERT_EQ(ret, 0); 278 279 /* 280 * Position buffer so we can write several u64s successfully 281 * before hitting the page boundary. 282 */ 283 ns_ids = ((__u64 *)((char *)map + page_size)) - 5; 284 285 /* 286 * Create a separate process to run listns() concurrently. 287 */ 288 iter_pid = create_child(&iter_pidfd, 0); 289 ASSERT_NE(iter_pid, -1); 290 291 if (iter_pid == 0) { 292 struct ns_id_req req = { 293 .size = sizeof(req), 294 .spare = 0, 295 .ns_id = 0, 296 .ns_type = 0, 297 .spare2 = 0, 298 .user_ns_id = 0, 299 }; 300 int iter_ret; 301 302 /* 303 * Loop calling listns() until killed. 304 * Request 10 namespace IDs while namespaces are being destroyed. 305 * This tests: 306 * 1. EFAULT handling when buffer becomes invalid 307 * 2. Namespace cleanup outside RCU read lock during iteration 308 */ 309 while (1) { 310 iter_ret = sys_listns(&req, ns_ids, 10, 0); 311 312 if (iter_ret == -1 && errno == ENOSYS) 313 _exit(PIDFD_SKIP); 314 } 315 } 316 317 /* Small delay to let iterator start looping */ 318 usleep(50000); 319 320 /* 321 * Create more children with mount namespaces to increase the 322 * likelihood that namespace cleanup happens during iteration. 323 */ 324 for (i = 0; i < 10; i++) { 325 /* Create socketpair for synchronization */ 326 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 327 328 pid = create_child(&pidfds[i], CLONE_NEWNS); 329 ASSERT_NE(pid, -1); 330 ns_pids[i] = pid; 331 332 if (pid == 0) { 333 close(sv[i][0]); /* Close parent end */ 334 335 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 336 _exit(1); 337 338 /* Child: create tmpfs mounts */ 339 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 340 _exit(1); 341 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 342 _exit(1); 343 344 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 345 _exit(1); 346 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 347 _exit(1); 348 349 /* Signal parent that setup is complete */ 350 if (write_nointr(sv[i][1], "R", 1) != 1) 351 _exit(1); 352 353 /* Wait for parent to signal us to exit */ 354 if (read_nointr(sv[i][1], &c, 1) != 1) 355 _exit(1); 356 357 close(sv[i][1]); 358 _exit(0); 359 } 360 361 close(sv[i][1]); /* Close child end */ 362 } 363 364 /* Wait for all children to finish setup */ 365 for (i = 0; i < 10; i++) { 366 ret = read_nointr(sv[i][0], &c, 1); 367 ASSERT_EQ(ret, 1); 368 ASSERT_EQ(c, 'R'); 369 } 370 371 /* Kill half the children */ 372 for (i = 0; i < 5; i++) 373 write_nointr(sv[i][0], "X", 1); 374 375 /* Small delay to let some exit */ 376 usleep(10000); 377 378 /* Kill remaining children */ 379 for (i = 5; i < 10; i++) 380 write_nointr(sv[i][0], "X", 1); 381 382 /* Wait for all children and cleanup */ 383 for (i = 0; i < 10; i++) { 384 waitpid(ns_pids[i], NULL, 0); 385 close(sv[i][0]); 386 close(pidfds[i]); 387 } 388 389 /* Kill iterator and wait for it */ 390 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 391 ret = waitpid(iter_pid, &status, 0); 392 ASSERT_EQ(ret, iter_pid); 393 close(iter_pidfd); 394 395 /* If listns() is not supported the iterator exits cleanly via ENOSYS */ 396 if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) { 397 munmap(map, page_size); 398 SKIP(return, "listns() not supported"); 399 } 400 401 /* Should have been killed */ 402 ASSERT_TRUE(WIFSIGNALED(status)); 403 ASSERT_EQ(WTERMSIG(status), SIGKILL); 404 405 /* Clean up */ 406 munmap(map, page_size); 407 } 408 409 /* 410 * Test specifically focused on mount namespace cleanup during EFAULT. 411 * Filter for mount namespaces only. 412 */ 413 TEST(listns_mnt_ns_cleanup_on_fault) 414 { 415 void *map; 416 __u64 *ns_ids; 417 ssize_t ret; 418 long page_size; 419 pid_t pid, iter_pid, ns_pids[8]; 420 int pidfds[8]; 421 int sv[8][2]; 422 int iter_pidfd; 423 int i, status; 424 char c; 425 426 page_size = sysconf(_SC_PAGESIZE); 427 ASSERT_GT(page_size, 0); 428 429 /* Set up partial fault buffer */ 430 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 431 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 432 ASSERT_NE(map, MAP_FAILED); 433 434 ret = munmap((char *)map + page_size, page_size); 435 ASSERT_EQ(ret, 0); 436 437 /* Position for 3 successful writes, then fault */ 438 ns_ids = ((__u64 *)((char *)map + page_size)) - 3; 439 440 /* 441 * Create a separate process to run listns() concurrently. 442 */ 443 iter_pid = create_child(&iter_pidfd, 0); 444 ASSERT_NE(iter_pid, -1); 445 446 if (iter_pid == 0) { 447 struct ns_id_req req = { 448 .size = sizeof(req), 449 .spare = 0, 450 .ns_id = 0, 451 .ns_type = CLONE_NEWNS, /* Only mount namespaces */ 452 .spare2 = 0, 453 .user_ns_id = 0, 454 }; 455 int iter_ret; 456 457 /* 458 * Loop calling listns() until killed. 459 * Call listns() to race with namespace destruction. 460 */ 461 while (1) { 462 iter_ret = sys_listns(&req, ns_ids, 10, 0); 463 464 if (iter_ret == -1 && errno == ENOSYS) 465 _exit(PIDFD_SKIP); 466 } 467 } 468 469 /* Small delay to let iterator start looping */ 470 usleep(50000); 471 472 /* Create children with mount namespaces */ 473 for (i = 0; i < 8; i++) { 474 /* Create socketpair for synchronization */ 475 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 476 477 pid = create_child(&pidfds[i], CLONE_NEWNS); 478 ASSERT_NE(pid, -1); 479 ns_pids[i] = pid; 480 481 if (pid == 0) { 482 close(sv[i][0]); /* Close parent end */ 483 484 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 485 _exit(1); 486 487 /* Do some mount operations to make cleanup more interesting */ 488 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 489 _exit(1); 490 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 491 _exit(1); 492 493 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 494 _exit(1); 495 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 496 _exit(1); 497 498 /* Signal parent that setup is complete */ 499 if (write_nointr(sv[i][1], "R", 1) != 1) 500 _exit(1); 501 502 /* Wait for parent to signal us to exit */ 503 if (read_nointr(sv[i][1], &c, 1) != 1) 504 _exit(1); 505 506 close(sv[i][1]); 507 _exit(0); 508 } 509 510 close(sv[i][1]); /* Close child end */ 511 } 512 513 /* Wait for all children to finish setup */ 514 for (i = 0; i < 8; i++) { 515 ret = read_nointr(sv[i][0], &c, 1); 516 ASSERT_EQ(ret, 1); 517 ASSERT_EQ(c, 'R'); 518 } 519 520 /* Kill children to trigger namespace destruction during iteration */ 521 for (i = 0; i < 8; i++) 522 write_nointr(sv[i][0], "X", 1); 523 524 /* Wait for children and cleanup */ 525 for (i = 0; i < 8; i++) { 526 waitpid(ns_pids[i], NULL, 0); 527 close(sv[i][0]); 528 close(pidfds[i]); 529 } 530 531 /* Kill iterator and wait for it */ 532 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 533 ret = waitpid(iter_pid, &status, 0); 534 ASSERT_EQ(ret, iter_pid); 535 close(iter_pidfd); 536 537 /* If listns() is not supported the iterator exits cleanly via ENOSYS */ 538 if (WIFEXITED(status) && WEXITSTATUS(status) == PIDFD_SKIP) { 539 munmap(map, page_size); 540 SKIP(return, "listns() not supported"); 541 } 542 543 /* Should have been killed */ 544 ASSERT_TRUE(WIFSIGNALED(status)); 545 ASSERT_EQ(WTERMSIG(status), SIGKILL); 546 547 munmap(map, page_size); 548 } 549 550 TEST_HARNESS_MAIN 551