// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/exportfs.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/cgroup.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/pseudo_fs.h>
#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <uapi/linux/pidfd.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>
#include <linux/coredump.h>
#include <linux/rhashtable.h>
#include <linux/xattr.h>
#include <linux/cookie.h>

#include "internal.h"
#include "mount.h"

/*
 * Marker stored in pid->attr by pidfs_exit() when the task exits without
 * a pidfs_attr ever having been allocated. pidfs_register_pid() refuses
 * to register a struct pid carrying this marker.
 */
#define PIDFS_PID_DEAD ERR_PTR(-ESRCH)

/*
 * Slab caches for struct pidfs_attr and struct simple_xattrs. Both are
 * created in pidfs_init().
 */
static struct kmem_cache *pidfs_attr_cachep __ro_after_init;
static struct kmem_cache *pidfs_xattr_cachep __ro_after_init;

/* Mount and root dentry of the internal pidfs mount; set in pidfs_init(). */
static struct path pidfs_root_path = {};

/**
 * pidfs_get_root - get a referenced copy of the pidfs root path
 * @path: receives a copy of pidfs_root_path; a reference is taken on it
 *        with path_get()
 */
void pidfs_get_root(struct path *path)
{
	*path = pidfs_root_path;
	path_get(path);
}

/* Bit numbers used in pidfs_attr::attr_mask. */
enum pidfs_attr_mask_bits {
	PIDFS_ATTR_BIT_EXIT	= 0,	/* set by pidfs_exit() after exit info is stored */
	PIDFS_ATTR_BIT_COREDUMP	= 1,	/* set by pidfs_coredump() after coredump info is stored */
};

/* pidfs state hanging off a struct pid via pid->attr. */
struct pidfs_attr {
	unsigned long attr_mask;	/* PIDFS_ATTR_BIT_* flags */
	struct simple_xattrs *xattrs;	/* NULL until the first pidfs_xattr_set() */
	struct /* exit info */ {
		__u64 cgroupid;		/* stored by pidfs_exit() (CONFIG_CGROUPS only) */
		__s32 exit_code;	/* stored by pidfs_exit() */
	};
	__u32 coredump_mask;		/* PIDFD_COREDUMP* flags stored by pidfs_coredump() */
	__u32 coredump_signal;		/* signal number stored by pidfs_coredump() */
};

/*
 * Hash table keyed by pid->ino. Entries are added in pidfs_add_pid(),
 * removed in pidfs_remove_pid() and looked up in pidfs_ino_get_pid().
 */
static struct rhashtable pidfs_ino_ht;

static const struct rhashtable_params pidfs_ino_ht_params = {
	.key_offset		= offsetof(struct pid, ino),
	.key_len		= sizeof(u64),
	.head_offset		= offsetof(struct pid, pidfs_hash),
	.automatic_shrinking	= true,
};

/*
 * inode number handling
 *
 * On 64 bit nothing special happens. The 64bit number assigned
 * to struct pid is the inode number.
74 * 75 * On 32 bit the 64 bit number assigned to struct pid is split 76 * into two 32 bit numbers. The lower 32 bits are used as the 77 * inode number and the upper 32 bits are used as the inode 78 * generation number. 79 * 80 * On 32 bit pidfs_ino() will return the lower 32 bit. When 81 * pidfs_ino() returns zero a wrap around happened. When a 82 * wraparound happens the 64 bit number will be incremented by 1 83 * so inode numbering starts at 1 again. 84 * 85 * On 64 bit comparing two pidfds is as simple as comparing 86 * inode numbers. 87 * 88 * When a wraparound happens on 32 bit multiple pidfds with the 89 * same inode number are likely to exist (This isn't a problem 90 * since before pidfs pidfds used the anonymous inode meaning 91 * all pidfds had the same inode number.). Userspace can 92 * reconstruct the 64 bit identifier by retrieving both the 93 * inode number and the inode generation number to compare or 94 * use file handles. 95 */ 96 97 #if BITS_PER_LONG == 32 98 99 DEFINE_SPINLOCK(pidfs_ino_lock); 100 static u64 pidfs_ino_nr = 1; 101 102 static inline unsigned long pidfs_ino(u64 ino) 103 { 104 return lower_32_bits(ino); 105 } 106 107 /* On 32 bit the generation number are the upper 32 bits. */ 108 static inline u32 pidfs_gen(u64 ino) 109 { 110 return upper_32_bits(ino); 111 } 112 113 static inline u64 pidfs_alloc_ino(void) 114 { 115 u64 ino; 116 117 spin_lock(&pidfs_ino_lock); 118 if (pidfs_ino(pidfs_ino_nr) == 0) 119 pidfs_ino_nr++; 120 ino = pidfs_ino_nr++; 121 spin_unlock(&pidfs_ino_lock); 122 return ino; 123 } 124 125 #else 126 127 /* On 64 bit simply return ino. */ 128 static inline unsigned long pidfs_ino(u64 ino) 129 { 130 return ino; 131 } 132 133 /* On 64 bit the generation number is 0. 
*/
static inline u32 pidfs_gen(u64 ino)
{
	return 0;
}

/* Source of the 64 bit inode numbers handed out by pidfs_alloc_ino(). */
DEFINE_COOKIE(pidfs_ino_cookie);

/*
 * Return the next value of pidfs_ino_cookie. The cookie is advanced with
 * preemption disabled; a value below 1 is not expected and triggers a
 * one-time warning.
 */
static u64 pidfs_alloc_ino(void)
{
	u64 ino;

	preempt_disable();
	ino = gen_cookie_next(&pidfs_ino_cookie);
	preempt_enable();

	VFS_WARN_ON_ONCE(ino < 1);
	return ino;
}

#endif

/* Reset the pidfs fields of @pid: no stashed dentry, no attr, inode number 0. */
void pidfs_prepare_pid(struct pid *pid)
{
	pid->stashed = NULL;
	pid->attr = NULL;
	pid->ino = 0;
}

/*
 * Assign @pid a freshly allocated inode number and insert it into
 * pidfs_ino_ht.
 *
 * Return: 0 on success, otherwise the error from rhashtable_insert_fast().
 * On failure pid->ino is reset to 0, which makes pidfs_remove_pid() skip
 * the hash table removal.
 */
int pidfs_add_pid(struct pid *pid)
{
	int ret;

	pid->ino = pidfs_alloc_ino();
	ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash,
				     pidfs_ino_ht_params);
	if (unlikely(ret))
		pid->ino = 0;
	return ret;
}

/* Remove @pid from pidfs_ino_ht; pids with a zero inode number are skipped. */
void pidfs_remove_pid(struct pid *pid)
{
	if (likely(pid->ino))
		rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash,
				       pidfs_ino_ht_params);
}

/*
 * Release the pidfs state attached to @pid: the pidfs_attr and, if any
 * were ever set, its extended attributes.
 */
void pidfs_free_pid(struct pid *pid)
{
	/*
	 * Both locals carry a __free(kfree) cleanup, so whatever they point
	 * to when the function returns is handed to that cleanup.
	 */
	struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr);
	struct simple_xattrs *xattrs __free(kfree) = NULL;

	/*
	 * Any dentry must've been wiped from the pid by now.
	 * Otherwise there's a reference count bug.
	 */
	VFS_WARN_ON_ONCE(pid->stashed);

	/*
	 * This is the case if an error occurred during e.g., task creation
	 * that causes us to never go through the exit path.
	 */
	if (unlikely(!attr))
		return;

	/* This never had a pidfd created (pid->attr was PIDFS_PID_DEAD). */
	if (IS_ERR(attr))
		return;

	/* Tear down the individual xattr entries before the container is freed. */
	xattrs = no_free_ptr(attr->xattrs);
	if (xattrs)
		simple_xattrs_free(xattrs, NULL);
}

#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
* If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid that a given pidfd refers to in all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e.
* where no ancestral relation is present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = pidfd_pid(f);
	struct pid_namespace *ns;
	pid_t nr = -1;

	/* nr stays -1 when no task is attached to the pid anymore. */
	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/* If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif

/*
 * Poll support for process exit notification.
 *
 * Return: EPOLLIN | EPOLLRDNORM | EPOLLHUP when no task is attached to
 * the pid anymore, EPOLLIN | EPOLLRDNORM when the task has an exit state
 * and delay_group_leader() is false for it, 0 otherwise.
 */
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
	struct pid *pid = pidfd_pid(file);
	struct task_struct *task;
	__poll_t poll_flags = 0;

	poll_wait(file, &pid->wait_pidfd, pts);
	/*
	 * Don't wake waiters if the thread-group leader exited
	 * prematurely. They either get notified when the last subthread
	 * exits or not at all if one of the remaining subthreads execs
	 * and assumes the struct pid of the old thread-group leader.
	 */
	guard(rcu)();
	task = pid_task(pid, PIDTYPE_PID);
	if (!task)
		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
	else if (task->exit_state && !delay_group_leader(task))
		poll_flags = EPOLLIN | EPOLLRDNORM;

	return poll_flags;
}

/*
 * Return true if the caller's active pid namespace appears in @pid's
 * numbers[] array at the caller's namespace level.
 */
static inline bool pid_in_current_pidns(const struct pid *pid)
{
	const struct pid_namespace *ns = task_active_pid_ns(current);

	if (ns->level <= pid->level)
		return pid->numbers[ns->level].ns == ns;

	return false;
}

/*
 * Map the dumpable setting encoded in @mm_flags to the matching
 * PIDFD_COREDUMP_* flag. An unknown setting triggers a one-time warning
 * and yields 0.
 */
static __u32 pidfs_coredump_mask(unsigned long mm_flags)
{
	switch (__get_dumpable(mm_flags)) {
	case SUID_DUMP_USER:
		return PIDFD_COREDUMP_USER;
	case SUID_DUMP_ROOT:
		return PIDFD_COREDUMP_ROOT;
	case SUID_DUMP_DISABLE:
		return PIDFD_COREDUMP_SKIP;
	default:
		WARN_ON_ONCE(true);
	}

	return 0;
}

/* This must be updated whenever a new flag is added */
#define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \
			      PIDFD_INFO_CREDS | \
			      PIDFD_INFO_CGROUPID | \
			      PIDFD_INFO_EXIT | \
			      PIDFD_INFO_COREDUMP | \
			      PIDFD_INFO_SUPPORTED_MASK | \
			      PIDFD_INFO_COREDUMP_SIGNAL)

/*
 * Handle PIDFD_GET_INFO: fill a struct pidfd_info for the pid behind
 * @file and copy it to the user buffer at @arg.
 *
 * Exit and coredump data are taken from pid->attr and are only reported
 * when requested in the user-supplied mask and already recorded. While
 * the task still exists, credentials, the cgroup id (CONFIG_CGROUPS) and
 * the pid numbers are filled in as well.
 *
 * Return: the result of copy_struct_to_user() on the normal path;
 * -EINVAL for a NULL @arg or a struct size below PIDFD_INFO_SIZE_VER0,
 * -EFAULT if the mask cannot be read from userspace, -EREMOTE if the pid
 * is not in the caller's pid namespace hierarchy, -ESRCH if the task is
 * gone and exit info was not requested, if no credentials could be
 * obtained, or if the pid or tgid translate to 0.
 */
static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
	struct task_struct *task __free(put_task) = NULL;
	struct pid *pid = pidfd_pid(file);
	size_t usize = _IOC_SIZE(cmd);
	struct pidfd_info kinfo = {};
	struct user_namespace *user_ns;
	struct pidfs_attr *attr;
	const struct cred *c;
	__u64 mask;

	BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2);

	if (!uinfo)
		return -EINVAL;
	if (usize < PIDFD_INFO_SIZE_VER0)
		return -EINVAL; /* First version, no smaller struct possible */

	if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
		return -EFAULT;

	/*
	 * Restrict information retrieval to tasks within the caller's pid
	 * namespace hierarchy.
	 */
	if (!pid_in_current_pidns(pid))
		return -EREMOTE;

	attr = READ_ONCE(pid->attr);
	if (mask & PIDFD_INFO_EXIT) {
		if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) {
			/* Pairs with the smp_wmb() in pidfs_exit(). */
			smp_rmb();
			kinfo.mask |= PIDFD_INFO_EXIT;
#ifdef CONFIG_CGROUPS
			kinfo.cgroupid = attr->cgroupid;
			kinfo.mask |= PIDFD_INFO_CGROUPID;
#endif
			kinfo.exit_code = attr->exit_code;
		}
	}

	if (mask & PIDFD_INFO_COREDUMP) {
		if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) {
			/* Pairs with the smp_wmb() in pidfs_coredump(). */
			smp_rmb();
			kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL;
			kinfo.coredump_mask = attr->coredump_mask;
			kinfo.coredump_signal = attr->coredump_signal;
		}
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		/*
		 * If the task has already been reaped, only exit
		 * information is available
		 */
		if (!(mask & PIDFD_INFO_EXIT))
			return -ESRCH;

		goto copy_out;
	}

	c = get_task_cred(task);
	if (!c)
		return -ESRCH;

	/* No recorded coredump: report the current dumpable setting instead. */
	if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) {
		guard(task_lock)(task);
		if (task->mm) {
			unsigned long flags = __mm_flags_get_dumpable(task->mm);

			kinfo.coredump_mask = pidfs_coredump_mask(flags);
			kinfo.mask |= PIDFD_INFO_COREDUMP;
			/* No coredump actually took place, so no coredump signal. */
		}
	}

	/* Unconditionally return identifiers and credentials, the rest only on request */

	user_ns = current_user_ns();
	kinfo.ruid = from_kuid_munged(user_ns, c->uid);
	kinfo.rgid = from_kgid_munged(user_ns, c->gid);
	kinfo.euid = from_kuid_munged(user_ns, c->euid);
	kinfo.egid = from_kgid_munged(user_ns, c->egid);
	kinfo.suid = from_kuid_munged(user_ns, c->suid);
	kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
	kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
	kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
	kinfo.mask |= PIDFD_INFO_CREDS;
	put_cred(c);

#ifdef CONFIG_CGROUPS
	/* Only look up the live cgroup if exit info did not already supply one. */
	if (!kinfo.cgroupid) {
		struct cgroup *cgrp;

		rcu_read_lock();
		cgrp = task_dfl_cgroup(task);
		kinfo.cgroupid = cgroup_id(cgrp);
		kinfo.mask |= PIDFD_INFO_CGROUPID;
		rcu_read_unlock();
	}
#endif

	/*
	 * Copy pid/tgid last, to reduce the chances the information might be
	 * stale. Note that it is not possible to ensure it will be valid as the
	 * task might return as soon as the copy_to_user finishes, but that's ok
	 * and userspace expects that might happen and can act accordingly, so
	 * this is just best-effort. What we can do however is checking that all
	 * the fields are set correctly, or return ESRCH to avoid providing
	 * incomplete information. */

	kinfo.ppid = task_ppid_vnr(task);
	kinfo.tgid = task_tgid_vnr(task);
	kinfo.pid = task_pid_vnr(task);
	kinfo.mask |= PIDFD_INFO_PID;

	if (kinfo.pid == 0 || kinfo.tgid == 0)
		return -ESRCH;

copy_out:
	if (mask & PIDFD_INFO_SUPPORTED_MASK) {
		kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK;
		kinfo.supported_mask = PIDFD_INFO_SUPPORTED;
	}

	/* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */
	WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask);
	/*
	 * If userspace and the kernel have the same struct size it can just
	 * be copied. If userspace provides an older struct, only the bits that
	 * userspace knows about will be copied. If userspace provides a new
	 * struct, only the bits that the kernel knows about will be copied.
	 */
	return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
}

/*
 * Return true if @cmd is one of the ioctl commands pidfds implement:
 * FS_IOC_GETVERSION, any PIDFD_GET_*_NAMESPACE command, or a
 * PIDFD_GET_INFO variant accepted by extensible_ioctl_valid().
 */
static bool pidfs_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
	case PIDFD_GET_CGROUP_NAMESPACE:
	case PIDFD_GET_IPC_NAMESPACE:
	case PIDFD_GET_MNT_NAMESPACE:
	case PIDFD_GET_NET_NAMESPACE:
	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
	case PIDFD_GET_TIME_NAMESPACE:
	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
	case PIDFD_GET_UTS_NAMESPACE:
	case PIDFD_GET_USER_NAMESPACE:
	case PIDFD_GET_PID_NAMESPACE:
		return true;
	}

	/* Extensible ioctls require some more careful checks. */
	switch (_IOC_NR(cmd)) {
	case _IOC_NR(PIDFD_GET_INFO):
		/*
		 * Try to prevent performing a pidfd ioctl when someone
		 * erroneously mistook the file descriptor for a pidfd.
		 * This is not perfect but will catch most cases.
		 */
		return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
	}

	return false;
}

/*
 * ioctl handler for pidfds.
 *
 * FS_IOC_GETVERSION stores the inode generation at @arg, PIDFD_GET_INFO
 * is forwarded to pidfd_info(), and the PIDFD_GET_*_NAMESPACE commands
 * take a reference on the requested namespace of the task and pass it to
 * open_namespace().
 *
 * Return: the command specific result; -ENOIOCTLCMD for commands rejected
 * by pidfs_ioctl_valid(), -EINVAL for a NULL @arg with FS_IOC_GETVERSION
 * or a non-zero @arg with a namespace command, -ESRCH if the task or its
 * nsproxy is gone, -EACCES if the ptrace access check fails and
 * -EOPNOTSUPP if no namespace reference was obtained.
 */
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct task_struct *task __free(put_task) = NULL;
	struct nsproxy *nsp __free(put_nsproxy) = NULL;
	struct ns_common *ns_common = NULL;

	if (!pidfs_ioctl_valid(cmd))
		return -ENOIOCTLCMD;

	if (cmd == FS_IOC_GETVERSION) {
		if (!arg)
			return -EINVAL;

		__u32 __user *argp = (__u32 __user *)arg;
		return put_user(file_inode(file)->i_generation, argp);
	}

	/* Extensible IOCTL that does not open namespace FDs, take a shortcut */
	if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
		return pidfd_info(file, cmd, arg);

	task = get_pid_task(pidfd_pid(file), PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	/* The namespace commands take no argument. */
	if (arg)
		return -EINVAL;

	/* Pin the task's nsproxy while holding the task lock. */
	scoped_guard(task_lock, task) {
		nsp = task->nsproxy;
		if (nsp)
			get_nsproxy(nsp);
	}
	if (!nsp)
		return -ESRCH; /* just pretend it didn't exist */

	/*
	 * We're trying to open a file descriptor to the namespace so perform a
	 * filesystem cred ptrace check. Also, we mirror nsfs behavior.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		return -EACCES;

	/*
	 * In every case below ns_common stays NULL when the namespace type is
	 * compiled out or ns_ref_get() fails; that is reported as -EOPNOTSUPP
	 * after the switch.
	 */
	switch (cmd) {
	/* Namespaces that hang off nsproxy. */
	case PIDFD_GET_CGROUP_NAMESPACE:
#ifdef CONFIG_CGROUPS
		if (!ns_ref_get(nsp->cgroup_ns))
			break;
		ns_common = to_ns_common(nsp->cgroup_ns);
#endif
		break;
	case PIDFD_GET_IPC_NAMESPACE:
#ifdef CONFIG_IPC_NS
		if (!ns_ref_get(nsp->ipc_ns))
			break;
		ns_common = to_ns_common(nsp->ipc_ns);
#endif
		break;
	case PIDFD_GET_MNT_NAMESPACE:
		if (!ns_ref_get(nsp->mnt_ns))
			break;
		ns_common = to_ns_common(nsp->mnt_ns);
		break;
	case PIDFD_GET_NET_NAMESPACE:
#ifdef CONFIG_NET_NS
		if (!ns_ref_get(nsp->net_ns))
			break;
		ns_common = to_ns_common(nsp->net_ns);
#endif
		break;
	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
#ifdef CONFIG_PID_NS
		if (!ns_ref_get(nsp->pid_ns_for_children))
			break;
		ns_common = to_ns_common(nsp->pid_ns_for_children);
#endif
		break;
	case PIDFD_GET_TIME_NAMESPACE:
#ifdef CONFIG_TIME_NS
		if (!ns_ref_get(nsp->time_ns))
			break;
		ns_common = to_ns_common(nsp->time_ns);
#endif
		break;
	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
#ifdef CONFIG_TIME_NS
		if (!ns_ref_get(nsp->time_ns_for_children))
			break;
		ns_common = to_ns_common(nsp->time_ns_for_children);
#endif
		break;
	case PIDFD_GET_UTS_NAMESPACE:
#ifdef CONFIG_UTS_NS
		if (!ns_ref_get(nsp->uts_ns))
			break;
		ns_common = to_ns_common(nsp->uts_ns);
#endif
		break;
	/* Namespaces that don't hang off nsproxy. */
	case PIDFD_GET_USER_NAMESPACE:
#ifdef CONFIG_USER_NS
		scoped_guard(rcu) {
			struct user_namespace *user_ns;

			user_ns = task_cred_xxx(task, user_ns);
			if (ns_ref_get(user_ns))
				ns_common = to_ns_common(user_ns);
		}
#endif
		break;
	case PIDFD_GET_PID_NAMESPACE:
#ifdef CONFIG_PID_NS
		scoped_guard(rcu) {
			struct pid_namespace *pid_ns;

			pid_ns = task_active_pid_ns(task);
			if (ns_ref_get(pid_ns))
				ns_common = to_ns_common(pid_ns);
		}
#endif
		break;
	default:
		return -ENOIOCTLCMD;
	}

	if (!ns_common)
		return -EOPNOTSUPP;

	/* open_namespace() unconditionally consumes the reference */
	return open_namespace(ns_common);
}

static const struct file_operations pidfs_file_operations = {
	.poll		= pidfd_poll,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= pidfd_show_fdinfo,
#endif
	.unlocked_ioctl	= pidfd_ioctl,
	.compat_ioctl	= compat_ptr_ioctl,
};

/*
 * Return the struct pid stored in the i_private of @file's inode, or
 * ERR_PTR(-EBADF) if @file does not use pidfs_file_operations, i.e. is
 * not a pidfd.
 */
struct pid *pidfd_pid(const struct file *file)
{
	if (file->f_op != &pidfs_file_operations)
		return ERR_PTR(-EBADF);
	return file_inode(file)->i_private;
}

/*
 * We're called from release_task(). We know there's at least one
 * reference to struct pid being held that won't be released until the
 * task has been reaped which cannot happen until we're out of
 * release_task().
 *
 * If this struct pid has at least once been referred to by a pidfd then
 * pid->attr will be allocated. If not we mark the struct pid as dead so
 * anyone who is trying to register it with pidfs will fail to do so.
 * Otherwise we would hand out pidfs for reaped tasks without having
 * exit information available.
 *
 * Worst case is that we've filled in the info and the pid gets freed
 * right away in free_pid() when no one holds a pidfd anymore.
* Since pidfs_exit() currently is placed after exit_task_work() we know
 * that it cannot be us aka the exiting task holding a pidfd to itself.
 */
void pidfs_exit(struct task_struct *tsk)
{
	struct pid *pid = task_pid(tsk);
	struct pidfs_attr *attr;
#ifdef CONFIG_CGROUPS
	struct cgroup *cgrp;
#endif

	might_sleep();

	/* Synchronize with pidfs_register_pid(). */
	scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) {
		attr = pid->attr;
		if (!attr) {
			/*
			 * No one ever held a pidfd for this struct pid.
			 * Mark it as dead so no one can add a pidfs
			 * entry anymore. We're about to be reaped and
			 * so no exit information would be available.
			 */
			pid->attr = PIDFS_PID_DEAD;
			return;
		}
	}

	/*
	 * If @pid->attr is set someone might still legitimately hold a
	 * pidfd to @pid or someone might concurrently still be getting
	 * a reference to an already stashed dentry from @pid->stashed.
	 * So defer cleaning @pid->attr until the last reference to @pid
	 * is put
	 */

#ifdef CONFIG_CGROUPS
	rcu_read_lock();
	cgrp = task_dfl_cgroup(tsk);
	attr->cgroupid = cgroup_id(cgrp);
	rcu_read_unlock();
#endif
	attr->exit_code = tsk->exit_code;

	/* Ensure that PIDFD_GET_INFO sees either all or nothing. */
	smp_wmb();
	set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask);
}

#ifdef CONFIG_COREDUMP
/*
 * Record in the pidfs_attr of cprm->pid that the task dumped core: the
 * coredump mask derived from cprm->mm_flags plus PIDFD_COREDUMPED, and
 * the number of the signal that caused the dump. The data is published
 * by setting PIDFS_ATTR_BIT_COREDUMP after a write barrier.
 */
void pidfs_coredump(const struct coredump_params *cprm)
{
	struct pid *pid = cprm->pid;
	struct pidfs_attr *attr;

	attr = READ_ONCE(pid->attr);

	/* A pidfs_attr is expected to exist at this point. */
	VFS_WARN_ON_ONCE(!attr);
	VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD);

	/* Note how we were coredumped and that we coredumped. */
	attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) |
			      PIDFD_COREDUMPED;
	/* If coredumping is set to skip we should never end up here. */
	VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP);
	/* Expose the signal number that caused the coredump. */
	attr->coredump_signal = cprm->siginfo->si_signo;
	/* Make the fields above visible before the bit; see pidfd_info(). */
	smp_wmb();
	set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask);
}
#endif

/* The internal pidfs mount created in pidfs_init(). */
static struct vfsmount *pidfs_mnt __ro_after_init;

/*
 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
 * implemented. Let's reject it completely until we have a clean
 * permission concept for pidfds.
 *
 * NOTE(review): the rejection itself is presumably implemented by
 * anon_inode_setattr(), to which this simply delegates -- confirm.
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	return anon_inode_setattr(idmap, dentry, attr);
}

/* getattr is delegated unchanged to anon_inode_getattr(). */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
			 struct kstat *stat, u32 request_mask,
			 unsigned int query_flags)
{
	return anon_inode_getattr(idmap, path, stat, request_mask, query_flags);
}

/*
 * List the extended attributes of a pidfd inode. Returns 0 when no
 * xattr has been set yet (attr->xattrs is still NULL), otherwise the
 * result of simple_xattr_list().
 */
static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size)
{
	struct inode *inode = d_inode(dentry);
	struct pid *pid = inode->i_private;
	struct pidfs_attr *attr = pid->attr;
	struct simple_xattrs *xattrs;

	xattrs = READ_ONCE(attr->xattrs);
	if (!xattrs)
		return 0;

	return simple_xattr_list(inode, xattrs, buf, size);
}

static const struct inode_operations pidfs_inode_operations = {
	.getattr	= pidfs_getattr,
	.setattr	= pidfs_setattr,
	.listxattr	= pidfs_listxattr,
};

/* Clear the inode and drop a reference on the struct pid stored in i_private. */
static void pidfs_evict_inode(struct inode *inode)
{
	struct pid *pid = inode->i_private;

	clear_inode(inode);
	put_pid(pid);
}

static const struct super_operations pidfs_sops = {
	.drop_inode	= inode_just_drop,
	.evict_inode	= pidfs_evict_inode,
	.statfs		= simple_statfs,
};

/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
797 */ 798 static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) 799 { 800 return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); 801 } 802 803 const struct dentry_operations pidfs_dentry_operations = { 804 .d_dname = pidfs_dname, 805 .d_prune = stashed_dentry_prune, 806 }; 807 808 static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, 809 struct inode *parent) 810 { 811 const struct pid *pid = inode->i_private; 812 813 if (*max_len < 2) { 814 *max_len = 2; 815 return FILEID_INVALID; 816 } 817 818 *max_len = 2; 819 *(u64 *)fh = pid->ino; 820 return FILEID_KERNFS; 821 } 822 823 /* Find a struct pid based on the inode number. */ 824 static struct pid *pidfs_ino_get_pid(u64 ino) 825 { 826 struct pid *pid; 827 struct pidfs_attr *attr; 828 829 guard(rcu)(); 830 pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params); 831 if (!pid) 832 return NULL; 833 attr = READ_ONCE(pid->attr); 834 if (IS_ERR_OR_NULL(attr)) 835 return NULL; 836 if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) 837 return NULL; 838 /* Within our pid namespace hierarchy? */ 839 if (pid_vnr(pid) == 0) 840 return NULL; 841 return get_pid(pid); 842 } 843 844 static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, 845 struct fid *fid, int fh_len, 846 int fh_type) 847 { 848 int ret; 849 u64 pid_ino; 850 struct path path; 851 struct pid *pid; 852 853 if (fh_len < 2) 854 return NULL; 855 856 switch (fh_type) { 857 case FILEID_KERNFS: 858 pid_ino = *(u64 *)fid; 859 break; 860 default: 861 return NULL; 862 } 863 864 pid = pidfs_ino_get_pid(pid_ino); 865 if (!pid) 866 return NULL; 867 868 ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); 869 if (ret < 0) 870 return ERR_PTR(ret); 871 872 VFS_WARN_ON_ONCE(!pid->attr); 873 874 mntput(path.mnt); 875 return path.dentry; 876 } 877 878 /* 879 * Make sure that we reject any nonsensical flags that users pass via 880 * open_by_handle_at(). 
Note that PIDFD_THREAD is defined as O_EXCL, and 881 * PIDFD_NONBLOCK as O_NONBLOCK. 882 */ 883 #define VALID_FILE_HANDLE_OPEN_FLAGS \ 884 (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) 885 886 static int pidfs_export_permission(struct handle_to_path_ctx *ctx, 887 unsigned int oflags) 888 { 889 if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) 890 return -EINVAL; 891 892 /* 893 * pidfd_ino_get_pid() will verify that the struct pid is part 894 * of the caller's pid namespace hierarchy. No further 895 * permission checks are needed. 896 */ 897 return 0; 898 } 899 900 static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) 901 { 902 /* 903 * Clear O_LARGEFILE as open_by_handle_at() forces it and raise 904 * O_RDWR as pidfds always are. 905 */ 906 oflags &= ~O_LARGEFILE; 907 return dentry_open(path, oflags | O_RDWR, current_cred()); 908 } 909 910 static const struct export_operations pidfs_export_operations = { 911 .encode_fh = pidfs_encode_fh, 912 .fh_to_dentry = pidfs_fh_to_dentry, 913 .open = pidfs_export_open, 914 .permission = pidfs_export_permission, 915 }; 916 917 static int pidfs_init_inode(struct inode *inode, void *data) 918 { 919 const struct pid *pid = data; 920 921 inode->i_private = data; 922 inode->i_flags |= S_PRIVATE | S_ANON_INODE; 923 /* We allow to set xattrs. */ 924 inode->i_flags &= ~S_IMMUTABLE; 925 inode->i_mode |= S_IRWXU; 926 inode->i_op = &pidfs_inode_operations; 927 inode->i_fop = &pidfs_file_operations; 928 inode->i_ino = pidfs_ino(pid->ino); 929 inode->i_generation = pidfs_gen(pid->ino); 930 return 0; 931 } 932 933 static void pidfs_put_data(void *data) 934 { 935 struct pid *pid = data; 936 put_pid(pid); 937 } 938 939 /** 940 * pidfs_register_pid - register a struct pid in pidfs 941 * @pid: pid to pin 942 * 943 * Register a struct pid in pidfs. 944 * 945 * Return: On success zero, on error a negative error code is returned. 
*/
int pidfs_register_pid(struct pid *pid)
{
	/* Freed by the __free(kfree) cleanup unless ownership moves to @pid. */
	struct pidfs_attr *new_attr __free(kfree) = NULL;
	struct pidfs_attr *attr;

	might_sleep();

	if (!pid)
		return 0;

	/* Unlocked check first: nothing to do if already registered or dead. */
	attr = READ_ONCE(pid->attr);
	if (unlikely(attr == PIDFS_PID_DEAD))
		return PTR_ERR(PIDFS_PID_DEAD);
	if (attr)
		return 0;

	/* Allocate with GFP_KERNEL before the spinlock is taken below. */
	new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL);
	if (!new_attr)
		return -ENOMEM;

	/* Synchronize with pidfs_exit(). */
	guard(spinlock_irq)(&pid->wait_pidfd.lock);

	/* Recheck under the lock; pid->attr may have changed in the meantime. */
	attr = pid->attr;
	if (unlikely(attr == PIDFS_PID_DEAD))
		return PTR_ERR(PIDFS_PID_DEAD);
	if (unlikely(attr))
		return 0;

	pid->attr = no_free_ptr(new_attr);
	return 0;
}

/*
 * Stash @dentry in @stashed, which must be the ->stashed member of the
 * struct pid behind the dentry's inode. The pid is registered with pidfs
 * first; if that fails the error is returned as an ERR_PTR() and nothing
 * is stashed.
 */
static struct dentry *pidfs_stash_dentry(struct dentry **stashed,
					 struct dentry *dentry)
{
	int ret;
	struct pid *pid = d_inode(dentry)->i_private;

	VFS_WARN_ON_ONCE(stashed != &pid->stashed);

	ret = pidfs_register_pid(pid);
	if (ret)
		return ERR_PTR(ret);

	return stash_dentry(stashed, dentry);
}

static const struct stashed_operations pidfs_stashed_ops = {
	.stash_dentry	= pidfs_stash_dentry,
	.init_inode	= pidfs_init_inode,
	.put_data	= pidfs_put_data,
};

/*
 * Read an extended attribute of a pidfd inode. Returns 0 when no xattr
 * has been set yet (attr->xattrs is still NULL), otherwise the result of
 * simple_xattr_get() for the full attribute name.
 */
static int pidfs_xattr_get(const struct xattr_handler *handler,
			   struct dentry *unused, struct inode *inode,
			   const char *suffix, void *value, size_t size)
{
	struct pid *pid = inode->i_private;
	struct pidfs_attr *attr = pid->attr;
	const char *name;
	struct simple_xattrs *xattrs;

	xattrs = READ_ONCE(attr->xattrs);
	if (!xattrs)
		return 0;

	name = xattr_full_name(handler, suffix);
	return simple_xattr_get(xattrs, name, value, size);
}

/*
 * Set an extended attribute on a pidfd inode. The simple_xattrs
 * container is allocated on first use.
 *
 * Return: 0 on success, -ENOMEM if the container cannot be allocated, or
 * the error reported by simple_xattr_set().
 */
static int pidfs_xattr_set(const struct xattr_handler *handler,
			   struct mnt_idmap *idmap, struct dentry *unused,
			   struct inode *inode, const char *suffix,
			   const void *value, size_t size, int flags)
{
	struct pid *pid = inode->i_private;
	struct pidfs_attr *attr = pid->attr;
	const char *name;
	struct simple_xattrs *xattrs;
	struct simple_xattr *old_xattr;

	/* Ensure we're the only one to set @attr->xattrs. */
	WARN_ON_ONCE(!inode_is_locked(inode));

	xattrs = READ_ONCE(attr->xattrs);
	if (!xattrs) {
		xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL);
		if (!xattrs)
			return -ENOMEM;

		/* Initialize fully before publishing the pointer to readers. */
		simple_xattrs_init(xattrs);
		smp_store_release(&pid->attr->xattrs, xattrs);
	}

	name = xattr_full_name(handler, suffix);
	old_xattr = simple_xattr_set(xattrs, name, value, size, flags);
	if (IS_ERR(old_xattr))
		return PTR_ERR(old_xattr);

	/* Free whatever simple_xattr_set() handed back as the old entry. */
	simple_xattr_free(old_xattr);
	return 0;
}

/* Handler for attributes under XATTR_TRUSTED_PREFIX. */
static const struct xattr_handler pidfs_trusted_xattr_handler = {
	.prefix	= XATTR_TRUSTED_PREFIX,
	.get	= pidfs_xattr_get,
	.set	= pidfs_xattr_set,
};

static const struct xattr_handler *const pidfs_xattr_handlers[] = {
	&pidfs_trusted_xattr_handler,
	NULL
};

/*
 * Set up the pseudo filesystem context for pidfs: superblock flags and
 * the super, export, dentry, xattr and stashed operations defined in
 * this file.
 */
static int pidfs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	ctx = init_pseudo(fc, PID_FS_MAGIC);
	if (!ctx)
		return -ENOMEM;

	fc->s_iflags |= SB_I_NOEXEC;
	fc->s_iflags |= SB_I_NODEV;
	ctx->s_d_flags |= DCACHE_DONTCACHE;
	ctx->ops = &pidfs_sops;
	ctx->eops = &pidfs_export_operations;
	ctx->dops = &pidfs_dentry_operations;
	ctx->xattr = pidfs_xattr_handlers;
	fc->s_fs_info = (void *)&pidfs_stashed_ops;
	return 0;
}

static struct file_system_type pidfs_type = {
	.name			= "pidfs",
	.init_fs_context	= pidfs_init_fs_context,
	.kill_sb		= kill_anon_super,
};

/*
 * Open a new pidfd file for @pid.
 *
 * PIDFD_STALE is stripped from @flags and O_RDWR is always added before
 * the file is opened with the caller's credentials.
 *
 * Return: the new file, or an ERR_PTR() if the path cannot be obtained
 * or the open fails.
 */
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
{
	struct file *pidfd_file;
	/* Released by the __free(path_put) cleanup on every return. */
	struct path path __free(path_put) = {};
	int ret;

	/*
	 * Ensure that PIDFD_STALE can be passed as a flag without
	 * overloading other uapi pidfd flags.
	 */
	BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
	BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);

	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
	if (ret < 0)
		return ERR_PTR(ret);

	VFS_WARN_ON_ONCE(!pid->attr);

	flags &= ~PIDFD_STALE;
	flags |= O_RDWR;
	pidfd_file = dentry_open(&path, flags, current_cred());
	/* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */
	if (!IS_ERR(pidfd_file))
		pidfd_file->f_flags |= (flags & PIDFD_THREAD);

	return pidfd_file;
}

/*
 * Boot-time setup of pidfs: the inode number hash table, the two slab
 * caches and the internal mount. Failure to set up the hash table or the
 * mount panics; the caches are created with SLAB_PANIC.
 */
void __init pidfs_init(void)
{
	if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params))
		panic("Failed to initialize pidfs hashtable");

	pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0,
					 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
					  SLAB_ACCOUNT | SLAB_PANIC), NULL);

	pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache",
						sizeof(struct simple_xattrs), 0,
						(SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT |
						 SLAB_ACCOUNT | SLAB_PANIC), NULL);

	pidfs_mnt = kern_mount(&pidfs_type);
	if (IS_ERR(pidfs_mnt))
		panic("Failed to mount pidfs pseudo filesystem");

	/* Remember the root of the mount for pidfs_get_root(). */
	pidfs_root_path.mnt = pidfs_mnt;
	pidfs_root_path.dentry = pidfs_mnt->mnt_root;
}