1 // SPDX-License-Identifier: GPL-2.0 2 #define _GNU_SOURCE 3 #include <errno.h> 4 #include <fcntl.h> 5 #include <limits.h> 6 #include <sched.h> 7 #include <signal.h> 8 #include <stdio.h> 9 #include <stdlib.h> 10 #include <string.h> 11 #include <linux/nsfs.h> 12 #include <sys/ioctl.h> 13 #include <sys/mman.h> 14 #include <sys/mount.h> 15 #include <sys/socket.h> 16 #include <sys/stat.h> 17 #include <sys/syscall.h> 18 #include <sys/types.h> 19 #include <sys/wait.h> 20 #include <unistd.h> 21 #include "../kselftest_harness.h" 22 #include "../pidfd/pidfd.h" 23 #include "wrappers.h" 24 25 /* 26 * Test listns() error handling with invalid buffer addresses. 27 * 28 * When the buffer pointer is invalid (e.g., crossing page boundaries 29 * into unmapped memory), listns() returns EINVAL. 30 * 31 * This test also creates mount namespaces that get destroyed during 32 * iteration, testing that namespace cleanup happens outside the RCU 33 * read lock. 34 */ 35 TEST(listns_partial_fault_with_ns_cleanup) 36 { 37 void *map; 38 __u64 *ns_ids; 39 ssize_t ret; 40 long page_size; 41 pid_t pid, iter_pid; 42 int pidfds[5]; 43 int sv[5][2]; 44 int iter_pidfd; 45 int i, status; 46 char c; 47 48 page_size = sysconf(_SC_PAGESIZE); 49 ASSERT_GT(page_size, 0); 50 51 /* 52 * Map two pages: 53 * - First page: readable and writable 54 * - Second page: will be unmapped to trigger EFAULT 55 */ 56 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 57 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 58 ASSERT_NE(map, MAP_FAILED); 59 60 /* Unmap the second page */ 61 ret = munmap((char *)map + page_size, page_size); 62 ASSERT_EQ(ret, 0); 63 64 /* 65 * Position the buffer pointer so there's room for exactly one u64 66 * before the page boundary. The second u64 would fall into the 67 * unmapped page. 68 */ 69 ns_ids = ((__u64 *)((char *)map + page_size)) - 1; 70 71 /* 72 * Create a separate process to run listns() in a loop concurrently 73 * with namespace creation and destruction. 74 */ 75 iter_pid = create_child(&iter_pidfd, 0); 76 ASSERT_NE(iter_pid, -1); 77 78 if (iter_pid == 0) { 79 struct ns_id_req req = { 80 .size = sizeof(req), 81 .spare = 0, 82 .ns_id = 0, 83 .ns_type = 0, /* All types */ 84 .spare2 = 0, 85 .user_ns_id = 0, /* Global listing */ 86 }; 87 int iter_ret; 88 89 /* 90 * Loop calling listns() until killed. 91 * The kernel should: 92 * 1. Successfully write the first namespace ID (within valid page) 93 * 2. Fail with EFAULT when trying to write the second ID (unmapped page) 94 * 3. Handle concurrent namespace destruction without deadlock 95 */ 96 while (1) { 97 iter_ret = sys_listns(&req, ns_ids, 2, 0); 98 99 if (iter_ret == -1 && errno == ENOSYS) 100 _exit(PIDFD_SKIP); 101 } 102 } 103 104 /* Small delay to let iterator start looping */ 105 usleep(50000); 106 107 /* 108 * Create several child processes, each in its own mount namespace. 109 * These will be destroyed while the iterator is running listns(). 110 */ 111 for (i = 0; i < 5; i++) { 112 /* Create socketpair for synchronization */ 113 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 114 115 pid = create_child(&pidfds[i], CLONE_NEWNS); 116 ASSERT_NE(pid, -1); 117 118 if (pid == 0) { 119 close(sv[i][0]); /* Close parent end */ 120 121 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 122 _exit(1); 123 124 /* Child: create a couple of tmpfs mounts */ 125 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 126 _exit(1); 127 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 128 _exit(1); 129 130 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 131 _exit(1); 132 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 133 _exit(1); 134 135 /* Signal parent that setup is complete */ 136 if (write_nointr(sv[i][1], "R", 1) != 1) 137 _exit(1); 138 139 /* Wait for parent to signal us to exit */ 140 if (read_nointr(sv[i][1], &c, 1) != 1) 141 _exit(1); 142 143 close(sv[i][1]); 144 _exit(0); 145 } 146 147 close(sv[i][1]); /* Close child end */ 148 } 149 150 /* Wait for all children to finish setup */ 151 for (i = 0; i < 5; i++) { 152 ret = read_nointr(sv[i][0], &c, 1); 153 ASSERT_EQ(ret, 1); 154 ASSERT_EQ(c, 'R'); 155 } 156 157 /* 158 * Signal children to exit. This will destroy their mount namespaces 159 * while listns() is iterating the namespace tree. 160 * This tests that cleanup happens outside the RCU read lock. 161 */ 162 for (i = 0; i < 5; i++) 163 write_nointr(sv[i][0], "X", 1); 164 165 /* Wait for all mount namespace children to exit and cleanup */ 166 for (i = 0; i < 5; i++) { 167 waitpid(-1, NULL, 0); 168 close(sv[i][0]); 169 close(pidfds[i]); 170 } 171 172 /* Kill iterator and wait for it */ 173 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 174 ret = waitpid(iter_pid, &status, 0); 175 ASSERT_EQ(ret, iter_pid); 176 close(iter_pidfd); 177 178 /* Should have been killed */ 179 ASSERT_TRUE(WIFSIGNALED(status)); 180 ASSERT_EQ(WTERMSIG(status), SIGKILL); 181 182 /* Clean up */ 183 munmap(map, page_size); 184 } 185 186 /* 187 * Test listns() error handling when the entire buffer is invalid. 188 * This is a sanity check that basic invalid pointer detection works. 189 */ 190 TEST(listns_complete_fault) 191 { 192 struct ns_id_req req = { 193 .size = sizeof(req), 194 .spare = 0, 195 .ns_id = 0, 196 .ns_type = 0, 197 .spare2 = 0, 198 .user_ns_id = 0, 199 }; 200 __u64 *ns_ids; 201 ssize_t ret; 202 203 /* Use a clearly invalid pointer */ 204 ns_ids = (__u64 *)0xdeadbeef; 205 206 ret = sys_listns(&req, ns_ids, 10, 0); 207 208 if (ret == -1 && errno == ENOSYS) 209 SKIP(return, "listns() not supported"); 210 211 /* Should fail with EFAULT */ 212 ASSERT_EQ(ret, -1); 213 ASSERT_EQ(errno, EFAULT); 214 } 215 216 /* 217 * Test listns() error handling when the buffer is NULL. 218 */ 219 TEST(listns_null_buffer) 220 { 221 struct ns_id_req req = { 222 .size = sizeof(req), 223 .spare = 0, 224 .ns_id = 0, 225 .ns_type = 0, 226 .spare2 = 0, 227 .user_ns_id = 0, 228 }; 229 ssize_t ret; 230 231 /* NULL buffer with non-zero count should fail */ 232 ret = sys_listns(&req, NULL, 10, 0); 233 234 if (ret == -1 && errno == ENOSYS) 235 SKIP(return, "listns() not supported"); 236 237 /* Should fail with EFAULT */ 238 ASSERT_EQ(ret, -1); 239 ASSERT_EQ(errno, EFAULT); 240 } 241 242 /* 243 * Test listns() with a buffer that becomes invalid mid-iteration 244 * (after several successful writes), combined with mount namespace 245 * destruction to test RCU cleanup logic. 246 */ 247 TEST(listns_late_fault_with_ns_cleanup) 248 { 249 void *map; 250 __u64 *ns_ids; 251 ssize_t ret; 252 long page_size; 253 pid_t pid, iter_pid; 254 int pidfds[10]; 255 int sv[10][2]; 256 int iter_pidfd; 257 int i, status; 258 char c; 259 260 page_size = sysconf(_SC_PAGESIZE); 261 ASSERT_GT(page_size, 0); 262 263 /* Map two pages */ 264 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 265 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 266 ASSERT_NE(map, MAP_FAILED); 267 268 /* Unmap the second page */ 269 ret = munmap((char *)map + page_size, page_size); 270 ASSERT_EQ(ret, 0); 271 272 /* 273 * Position buffer so we can write several u64s successfully 274 * before hitting the page boundary. 275 */ 276 ns_ids = ((__u64 *)((char *)map + page_size)) - 5; 277 278 /* 279 * Create a separate process to run listns() concurrently. 280 */ 281 iter_pid = create_child(&iter_pidfd, 0); 282 ASSERT_NE(iter_pid, -1); 283 284 if (iter_pid == 0) { 285 struct ns_id_req req = { 286 .size = sizeof(req), 287 .spare = 0, 288 .ns_id = 0, 289 .ns_type = 0, 290 .spare2 = 0, 291 .user_ns_id = 0, 292 }; 293 int iter_ret; 294 295 /* 296 * Loop calling listns() until killed. 297 * Request 10 namespace IDs while namespaces are being destroyed. 298 * This tests: 299 * 1. EFAULT handling when buffer becomes invalid 300 * 2. Namespace cleanup outside RCU read lock during iteration 301 */ 302 while (1) { 303 iter_ret = sys_listns(&req, ns_ids, 10, 0); 304 305 if (iter_ret == -1 && errno == ENOSYS) 306 _exit(PIDFD_SKIP); 307 } 308 } 309 310 /* Small delay to let iterator start looping */ 311 usleep(50000); 312 313 /* 314 * Create more children with mount namespaces to increase the 315 * likelihood that namespace cleanup happens during iteration. 316 */ 317 for (i = 0; i < 10; i++) { 318 /* Create socketpair for synchronization */ 319 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 320 321 pid = create_child(&pidfds[i], CLONE_NEWNS); 322 ASSERT_NE(pid, -1); 323 324 if (pid == 0) { 325 close(sv[i][0]); /* Close parent end */ 326 327 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 328 _exit(1); 329 330 /* Child: create tmpfs mounts */ 331 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 332 _exit(1); 333 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 334 _exit(1); 335 336 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 337 _exit(1); 338 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 339 _exit(1); 340 341 /* Signal parent that setup is complete */ 342 if (write_nointr(sv[i][1], "R", 1) != 1) 343 _exit(1); 344 345 /* Wait for parent to signal us to exit */ 346 if (read_nointr(sv[i][1], &c, 1) != 1) 347 _exit(1); 348 349 close(sv[i][1]); 350 _exit(0); 351 } 352 353 close(sv[i][1]); /* Close child end */ 354 } 355 356 /* Wait for all children to finish setup */ 357 for (i = 0; i < 10; i++) { 358 ret = read_nointr(sv[i][0], &c, 1); 359 ASSERT_EQ(ret, 1); 360 ASSERT_EQ(c, 'R'); 361 } 362 363 /* Kill half the children */ 364 for (i = 0; i < 5; i++) 365 write_nointr(sv[i][0], "X", 1); 366 367 /* Small delay to let some exit */ 368 usleep(10000); 369 370 /* Kill remaining children */ 371 for (i = 5; i < 10; i++) 372 write_nointr(sv[i][0], "X", 1); 373 374 /* Wait for all children and cleanup */ 375 for (i = 0; i < 10; i++) { 376 waitpid(-1, NULL, 0); 377 close(sv[i][0]); 378 close(pidfds[i]); 379 } 380 381 /* Kill iterator and wait for it */ 382 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 383 ret = waitpid(iter_pid, &status, 0); 384 ASSERT_EQ(ret, iter_pid); 385 close(iter_pidfd); 386 387 /* Should have been killed */ 388 ASSERT_TRUE(WIFSIGNALED(status)); 389 ASSERT_EQ(WTERMSIG(status), SIGKILL); 390 391 /* Clean up */ 392 munmap(map, page_size); 393 } 394 395 /* 396 * Test specifically focused on mount namespace cleanup during EFAULT. 397 * Filter for mount namespaces only. 398 */ 399 TEST(listns_mnt_ns_cleanup_on_fault) 400 { 401 void *map; 402 __u64 *ns_ids; 403 ssize_t ret; 404 long page_size; 405 pid_t pid, iter_pid; 406 int pidfds[8]; 407 int sv[8][2]; 408 int iter_pidfd; 409 int i, status; 410 char c; 411 412 page_size = sysconf(_SC_PAGESIZE); 413 ASSERT_GT(page_size, 0); 414 415 /* Set up partial fault buffer */ 416 map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 417 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 418 ASSERT_NE(map, MAP_FAILED); 419 420 ret = munmap((char *)map + page_size, page_size); 421 ASSERT_EQ(ret, 0); 422 423 /* Position for 3 successful writes, then fault */ 424 ns_ids = ((__u64 *)((char *)map + page_size)) - 3; 425 426 /* 427 * Create a separate process to run listns() concurrently. 428 */ 429 iter_pid = create_child(&iter_pidfd, 0); 430 ASSERT_NE(iter_pid, -1); 431 432 if (iter_pid == 0) { 433 struct ns_id_req req = { 434 .size = sizeof(req), 435 .spare = 0, 436 .ns_id = 0, 437 .ns_type = CLONE_NEWNS, /* Only mount namespaces */ 438 .spare2 = 0, 439 .user_ns_id = 0, 440 }; 441 int iter_ret; 442 443 /* 444 * Loop calling listns() until killed. 445 * Call listns() to race with namespace destruction. 446 */ 447 while (1) { 448 iter_ret = sys_listns(&req, ns_ids, 10, 0); 449 450 if (iter_ret == -1 && errno == ENOSYS) 451 _exit(PIDFD_SKIP); 452 } 453 } 454 455 /* Small delay to let iterator start looping */ 456 usleep(50000); 457 458 /* Create children with mount namespaces */ 459 for (i = 0; i < 8; i++) { 460 /* Create socketpair for synchronization */ 461 ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 462 463 pid = create_child(&pidfds[i], CLONE_NEWNS); 464 ASSERT_NE(pid, -1); 465 466 if (pid == 0) { 467 close(sv[i][0]); /* Close parent end */ 468 469 if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 470 _exit(1); 471 472 /* Do some mount operations to make cleanup more interesting */ 473 if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 474 _exit(1); 475 if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 476 _exit(1); 477 478 if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 479 _exit(1); 480 if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 481 _exit(1); 482 483 /* Signal parent that setup is complete */ 484 if (write_nointr(sv[i][1], "R", 1) != 1) 485 _exit(1); 486 487 /* Wait for parent to signal us to exit */ 488 if (read_nointr(sv[i][1], &c, 1) != 1) 489 _exit(1); 490 491 close(sv[i][1]); 492 _exit(0); 493 } 494 495 close(sv[i][1]); /* Close child end */ 496 } 497 498 /* Wait for all children to finish setup */ 499 for (i = 0; i < 8; i++) { 500 ret = read_nointr(sv[i][0], &c, 1); 501 ASSERT_EQ(ret, 1); 502 ASSERT_EQ(c, 'R'); 503 } 504 505 /* Kill children to trigger namespace destruction during iteration */ 506 for (i = 0; i < 8; i++) 507 write_nointr(sv[i][0], "X", 1); 508 509 /* Wait for children and cleanup */ 510 for (i = 0; i < 8; i++) { 511 waitpid(-1, NULL, 0); 512 close(sv[i][0]); 513 close(pidfds[i]); 514 } 515 516 /* Kill iterator and wait for it */ 517 sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 518 ret = waitpid(iter_pid, &status, 0); 519 ASSERT_EQ(ret, iter_pid); 520 close(iter_pidfd); 521 522 /* Should have been killed */ 523 ASSERT_TRUE(WIFSIGNALED(status)); 524 ASSERT_EQ(WTERMSIG(status), SIGKILL); 525 526 munmap(map, page_size); 527 } 528 529 TEST_HARNESS_MAIN 530