1 // SPDX-License-Identifier: GPL-2.0 2 #define _GNU_SOURCE 3 #include <errno.h> 4 #include <fcntl.h> 5 #include <limits.h> 6 #include <sched.h> 7 #include <signal.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 #include <linux/nsfs.h> 12 #include <sys/ioctl.h> 13 #include <sys/mman.h> 14 #include <sys/mount.h> 15 #include <sys/socket.h> 16 #include <sys/stat.h> 17 #include <sys/syscall.h> 18 #include <sys/types.h> 19 #include <sys/wait.h> 20 #include <unistd.h> 21 #include "../kselftest_harness.h" 22 #include "../filesystems/utils.h" 23 #include "../pidfd/pidfd.h" 24 #include "wrappers.h" 25 26 /* 27 * Test listns() error handling with invalid buffer addresses. 28 * 29 * When the buffer pointer is invalid (e.g., crossing page boundaries 30 * into unmapped memory), listns() returns EINVAL. 31 * 32 * This test also creates mount namespaces that get destroyed during 33 * iteration, testing that namespace cleanup happens outside the RCU 34 * read lock. 35 */ 36 TEST(listns_partial_fault_with_ns_cleanup) 37 { 38 void *map; 39 __u64 *ns_ids; 40 ssize_t ret; 41 long page_size; 42 pid_t pid, iter_pid; 43 int pidfds[5]; 44 int sv[5][2]; 45 int iter_pidfd; 46 int i, status; 47 char c; 48 49 page_size = sysconf(_SC_PAGESIZE); 50 ASSERT_GT(page_size, 0); 51 52 /* 53 * Map two pages: 54 * - First page: readable and writable 55 * - Second page: will be unmapped to trigger EFAULT 56 */ 57 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 58 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 59 ASSERT_NE(map, MAP_FAILED); 60 61 /* Unmap the second page */ 62 ret = munmap((char *)map + page_size, page_size); 63 ASSERT_EQ(ret, 0); 64 65 /* 66 * Position the buffer pointer so there's room for exactly one u64 67 * before the page boundary. The second u64 would fall into the 68 * unmapped page. 69 */ 70 ns_ids = ((__u64 *)((char *)map + page_size)) - 1; 71 72 /* 73 * Create a separate process to run listns() in a loop concurrently 74 * with namespace creation and destruction. 75 */ 76 iter_pid = create_child(&iter_pidfd, 0); 77 ASSERT_NE(iter_pid, -1); 78 79 if (iter_pid == 0) { 80 struct ns_id_req req = { 81 .size = sizeof(req), 82 .spare = 0, 83 .ns_id = 0, 84 .ns_type = 0, /* All types */ 85 .spare2 = 0, 86 .user_ns_id = 0, /* Global listing */ 87 }; 88 int iter_ret; 89 90 /* 91 * Loop calling listns() until killed. 92 * The kernel should: 93 * 1. Successfully write the first namespace ID (within valid page) 94 * 2. Fail with EFAULT when trying to write the second ID (unmapped page) 95 * 3. Handle concurrent namespace destruction without deadlock 96 */ 97 while (1) { 98 iter_ret = sys_listns(&req, ns_ids, 2, 0); 99 100 if (iter_ret == -1 && errno == ENOSYS) 101 _exit(PIDFD_SKIP); 102 } 103 } 104 105 /* Small delay to let iterator start looping */ 106 usleep(50000); 107 108 /* 109 * Create several child processes, each in its own mount namespace. 110 * These will be destroyed while the iterator is running listns(). 111 */ 112 for (i = 0; i < 5; i++) { 113 /* Create socketpair for synchronization */ 114 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 115 116 pid = create_child(&pidfds[i], CLONE_NEWNS); 117 ASSERT_NE(pid, -1); 118 119 if (pid == 0) { 120 close(sv[i][0]); /* Close parent end */ 121 122 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 123 _exit(1); 124 125 /* Child: create a couple of tmpfs mounts */ 126 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 127 _exit(1); 128 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 129 _exit(1); 130 131 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 132 _exit(1); 133 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 134 _exit(1); 135 136 /* Signal parent that setup is complete */ 137 if (write_nointr(sv[i][1], "R", 1) != 1) 138 _exit(1); 139 140 /* Wait for parent to signal us to exit */ 141 if (read_nointr(sv[i][1], &c, 1) != 1) 142 _exit(1); 143 144 close(sv[i][1]); 145 _exit(0); 146 } 147 148 close(sv[i][1]); /* Close child end */ 149 } 150 151 /* Wait for all children to finish setup */ 152 for (i = 0; i < 5; i++) { 153 ret = read_nointr(sv[i][0], &c, 1); 154 ASSERT_EQ(ret, 1); 155 ASSERT_EQ(c, 'R'); 156 } 157 158 /* 159 * Signal children to exit. This will destroy their mount namespaces 160 * while listns() is iterating the namespace tree. 161 * This tests that cleanup happens outside the RCU read lock. 162 */ 163 for (i = 0; i < 5; i++) 164 write_nointr(sv[i][0], "X", 1); 165 166 /* Wait for all mount namespace children to exit and cleanup */ 167 for (i = 0; i < 5; i++) { 168 waitpid(-1, NULL, 0); 169 close(sv[i][0]); 170 close(pidfds[i]); 171 } 172 173 /* Kill iterator and wait for it */ 174 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 175 ret = waitpid(iter_pid, &status, 0); 176 ASSERT_EQ(ret, iter_pid); 177 close(iter_pidfd); 178 179 /* Should have been killed */ 180 ASSERT_TRUE(WIFSIGNALED(status)); 181 ASSERT_EQ(WTERMSIG(status), SIGKILL); 182 183 /* Clean up */ 184 munmap(map, page_size); 185 } 186 187 /* 188 * Test listns() error handling when the entire buffer is invalid. 189 * This is a sanity check that basic invalid pointer detection works. 190 */ 191 TEST(listns_complete_fault) 192 { 193 struct ns_id_req req = { 194 .size = sizeof(req), 195 .spare = 0, 196 .ns_id = 0, 197 .ns_type = 0, 198 .spare2 = 0, 199 .user_ns_id = 0, 200 }; 201 __u64 *ns_ids; 202 ssize_t ret; 203 204 /* Use a clearly invalid pointer */ 205 ns_ids = (__u64 *)0xdeadbeef; 206 207 ret = sys_listns(&req, ns_ids, 10, 0); 208 209 if (ret == -1 && errno == ENOSYS) 210 SKIP(return, "listns() not supported"); 211 212 /* Should fail with EFAULT */ 213 ASSERT_EQ(ret, -1); 214 ASSERT_EQ(errno, EFAULT); 215 } 216 217 /* 218 * Test listns() error handling when the buffer is NULL. 219 */ 220 TEST(listns_null_buffer) 221 { 222 struct ns_id_req req = { 223 .size = sizeof(req), 224 .spare = 0, 225 .ns_id = 0, 226 .ns_type = 0, 227 .spare2 = 0, 228 .user_ns_id = 0, 229 }; 230 ssize_t ret; 231 232 /* NULL buffer with non-zero count should fail */ 233 ret = sys_listns(&req, NULL, 10, 0); 234 235 if (ret == -1 && errno == ENOSYS) 236 SKIP(return, "listns() not supported"); 237 238 /* Should fail with EFAULT */ 239 ASSERT_EQ(ret, -1); 240 ASSERT_EQ(errno, EFAULT); 241 } 242 243 /* 244 * Test listns() with a buffer that becomes invalid mid-iteration 245 * (after several successful writes), combined with mount namespace 246 * destruction to test RCU cleanup logic. 247 */ 248 TEST(listns_late_fault_with_ns_cleanup) 249 { 250 void *map; 251 __u64 *ns_ids; 252 ssize_t ret; 253 long page_size; 254 pid_t pid, iter_pid; 255 int pidfds[10]; 256 int sv[10][2]; 257 int iter_pidfd; 258 int i, status; 259 char c; 260 261 page_size = sysconf(_SC_PAGESIZE); 262 ASSERT_GT(page_size, 0); 263 264 /* Map two pages */ 265 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 266 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 267 ASSERT_NE(map, MAP_FAILED); 268 269 /* Unmap the second page */ 270 ret = munmap((char *)map + page_size, page_size); 271 ASSERT_EQ(ret, 0); 272 273 /* 274 * Position buffer so we can write several u64s successfully 275 * before hitting the page boundary. 276 */ 277 ns_ids = ((__u64 *)((char *)map + page_size)) - 5; 278 279 /* 280 * Create a separate process to run listns() concurrently. 281 */ 282 iter_pid = create_child(&iter_pidfd, 0); 283 ASSERT_NE(iter_pid, -1); 284 285 if (iter_pid == 0) { 286 struct ns_id_req req = { 287 .size = sizeof(req), 288 .spare = 0, 289 .ns_id = 0, 290 .ns_type = 0, 291 .spare2 = 0, 292 .user_ns_id = 0, 293 }; 294 int iter_ret; 295 296 /* 297 * Loop calling listns() until killed. 298 * Request 10 namespace IDs while namespaces are being destroyed. 299 * This tests: 300 * 1. EFAULT handling when buffer becomes invalid 301 * 2. Namespace cleanup outside RCU read lock during iteration 302 */ 303 while (1) { 304 iter_ret = sys_listns(&req, ns_ids, 10, 0); 305 306 if (iter_ret == -1 && errno == ENOSYS) 307 _exit(PIDFD_SKIP); 308 } 309 } 310 311 /* Small delay to let iterator start looping */ 312 usleep(50000); 313 314 /* 315 * Create more children with mount namespaces to increase the 316 * likelihood that namespace cleanup happens during iteration. 317 */ 318 for (i = 0; i < 10; i++) { 319 /* Create socketpair for synchronization */ 320 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 321 322 pid = create_child(&pidfds[i], CLONE_NEWNS); 323 ASSERT_NE(pid, -1); 324 325 if (pid == 0) { 326 close(sv[i][0]); /* Close parent end */ 327 328 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 329 _exit(1); 330 331 /* Child: create tmpfs mounts */ 332 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 333 _exit(1); 334 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 335 _exit(1); 336 337 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 338 _exit(1); 339 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 340 _exit(1); 341 342 /* Signal parent that setup is complete */ 343 if (write_nointr(sv[i][1], "R", 1) != 1) 344 _exit(1); 345 346 /* Wait for parent to signal us to exit */ 347 if (read_nointr(sv[i][1], &c, 1) != 1) 348 _exit(1); 349 350 close(sv[i][1]); 351 _exit(0); 352 } 353 354 close(sv[i][1]); /* Close child end */ 355 } 356 357 /* Wait for all children to finish setup */ 358 for (i = 0; i < 10; i++) { 359 ret = read_nointr(sv[i][0], &c, 1); 360 ASSERT_EQ(ret, 1); 361 ASSERT_EQ(c, 'R'); 362 } 363 364 /* Kill half the children */ 365 for (i = 0; i < 5; i++) 366 write_nointr(sv[i][0], "X", 1); 367 368 /* Small delay to let some exit */ 369 usleep(10000); 370 371 /* Kill remaining children */ 372 for (i = 5; i < 10; i++) 373 write_nointr(sv[i][0], "X", 1); 374 375 /* Wait for all children and cleanup */ 376 for (i = 0; i < 10; i++) { 377 waitpid(-1, NULL, 0); 378 close(sv[i][0]); 379 close(pidfds[i]); 380 } 381 382 /* Kill iterator and wait for it */ 383 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 384 ret = waitpid(iter_pid, &status, 0); 385 ASSERT_EQ(ret, iter_pid); 386 close(iter_pidfd); 387 388 /* Should have been killed */ 389 ASSERT_TRUE(WIFSIGNALED(status)); 390 ASSERT_EQ(WTERMSIG(status), SIGKILL); 391 392 /* Clean up */ 393 munmap(map, page_size); 394 } 395 396 /* 397 * Test specifically focused on mount namespace cleanup during EFAULT. 398 * Filter for mount namespaces only. 399 */ 400 TEST(listns_mnt_ns_cleanup_on_fault) 401 { 402 void *map; 403 __u64 *ns_ids; 404 ssize_t ret; 405 long page_size; 406 pid_t pid, iter_pid; 407 int pidfds[8]; 408 int sv[8][2]; 409 int iter_pidfd; 410 int i, status; 411 char c; 412 413 page_size = sysconf(_SC_PAGESIZE); 414 ASSERT_GT(page_size, 0); 415 416 /* Set up partial fault buffer */ 417 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 418 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 419 ASSERT_NE(map, MAP_FAILED); 420 421 ret = munmap((char *)map + page_size, page_size); 422 ASSERT_EQ(ret, 0); 423 424 /* Position for 3 successful writes, then fault */ 425 ns_ids = ((__u64 *)((char *)map + page_size)) - 3; 426 427 /* 428 * Create a separate process to run listns() concurrently. 429 */ 430 iter_pid = create_child(&iter_pidfd, 0); 431 ASSERT_NE(iter_pid, -1); 432 433 if (iter_pid == 0) { 434 struct ns_id_req req = { 435 .size = sizeof(req), 436 .spare = 0, 437 .ns_id = 0, 438 .ns_type = CLONE_NEWNS, /* Only mount namespaces */ 439 .spare2 = 0, 440 .user_ns_id = 0, 441 }; 442 int iter_ret; 443 444 /* 445 * Loop calling listns() until killed. 446 * Call listns() to race with namespace destruction. 447 */ 448 while (1) { 449 iter_ret = sys_listns(&req, ns_ids, 10, 0); 450 451 if (iter_ret == -1 && errno == ENOSYS) 452 _exit(PIDFD_SKIP); 453 } 454 } 455 456 /* Small delay to let iterator start looping */ 457 usleep(50000); 458 459 /* Create children with mount namespaces */ 460 for (i = 0; i < 8; i++) { 461 /* Create socketpair for synchronization */ 462 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 463 464 pid = create_child(&pidfds[i], CLONE_NEWNS); 465 ASSERT_NE(pid, -1); 466 467 if (pid == 0) { 468 close(sv[i][0]); /* Close parent end */ 469 470 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 471 _exit(1); 472 473 /* Do some mount operations to make cleanup more interesting */ 474 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 475 _exit(1); 476 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 477 _exit(1); 478 479 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 480 _exit(1); 481 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 482 _exit(1); 483 484 /* Signal parent that setup is complete */ 485 if (write_nointr(sv[i][1], "R", 1) != 1) 486 _exit(1); 487 488 /* Wait for parent to signal us to exit */ 489 if (read_nointr(sv[i][1], &c, 1) != 1) 490 _exit(1); 491 492 close(sv[i][1]); 493 _exit(0); 494 } 495 496 close(sv[i][1]); /* Close child end */ 497 } 498 499 /* Wait for all children to finish setup */ 500 for (i = 0; i < 8; i++) { 501 ret = read_nointr(sv[i][0], &c, 1); 502 ASSERT_EQ(ret, 1); 503 ASSERT_EQ(c, 'R'); 504 } 505 506 /* Kill children to trigger namespace destruction during iteration */ 507 for (i = 0; i < 8; i++) 508 write_nointr(sv[i][0], "X", 1); 509 510 /* Wait for children and cleanup */ 511 for (i = 0; i < 8; i++) { 512 waitpid(-1, NULL, 0); 513 close(sv[i][0]); 514 close(pidfds[i]); 515 } 516 517 /* Kill iterator and wait for it */ 518 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 519 ret = waitpid(iter_pid, &status, 0); 520 ASSERT_EQ(ret, iter_pid); 521 close(iter_pidfd); 522 523 /* Should have been killed */ 524 ASSERT_TRUE(WIFSIGNALED(status)); 525 ASSERT_EQ(WTERMSIG(status), SIGKILL); 526 527 munmap(map, page_size); 528 } 529 530 TEST_HARNESS_MAIN 531