1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/anon_inodes.h> 3 #include <linux/exportfs.h> 4 #include <linux/file.h> 5 #include <linux/fs.h> 6 #include <linux/cgroup.h> 7 #include <linux/magic.h> 8 #include <linux/mount.h> 9 #include <linux/pid.h> 10 #include <linux/pidfs.h> 11 #include <linux/pid_namespace.h> 12 #include <linux/poll.h> 13 #include <linux/proc_fs.h> 14 #include <linux/proc_ns.h> 15 #include <linux/pseudo_fs.h> 16 #include <linux/ptrace.h> 17 #include <linux/seq_file.h> 18 #include <uapi/linux/pidfd.h> 19 #include <linux/ipc_namespace.h> 20 #include <linux/time_namespace.h> 21 #include <linux/utsname.h> 22 #include <net/net_namespace.h> 23 #include <linux/coredump.h> 24 #include <linux/rhashtable.h> 25 #include <linux/xattr.h> 26 #include <linux/cookie.h> 27 28 #include "internal.h" 29 #include "mount.h" 30 31 #define PIDFS_PID_DEAD ERR_PTR(-ESRCH) 32 33 static struct kmem_cache *pidfs_attr_cachep __ro_after_init; 34 static struct kmem_cache *pidfs_xattr_cachep __ro_after_init; 35 36 static struct path pidfs_root_path = {}; 37 38 void pidfs_get_root(struct path *path) 39 { 40 *path = pidfs_root_path; 41 path_get(path); 42 } 43 44 enum pidfs_attr_mask_bits { 45 PIDFS_ATTR_BIT_EXIT = 0, 46 PIDFS_ATTR_BIT_COREDUMP = 1, 47 }; 48 49 struct pidfs_attr { 50 unsigned long attr_mask; 51 struct simple_xattrs *xattrs; 52 struct /* exit info */ { 53 __u64 cgroupid; 54 __s32 exit_code; 55 }; 56 __u32 coredump_mask; 57 __u32 coredump_signal; 58 }; 59 60 static struct rhashtable pidfs_ino_ht; 61 62 static const struct rhashtable_params pidfs_ino_ht_params = { 63 .key_offset = offsetof(struct pid, ino), 64 .key_len = sizeof(u64), 65 .head_offset = offsetof(struct pid, pidfs_hash), 66 .automatic_shrinking = true, 67 }; 68 69 /* 70 * inode number handling 71 * 72 * On 64 bit nothing special happens. The 64bit number assigned 73 * to struct pid is the inode number. 74 * 75 * On 32 bit the 64 bit number assigned to struct pid is split 76 * into two 32 bit numbers. The lower 32 bits are used as the 77 * inode number and the upper 32 bits are used as the inode 78 * generation number. 79 * 80 * On 32 bit pidfs_ino() will return the lower 32 bit. When 81 * pidfs_ino() returns zero a wrap around happened. When a 82 * wraparound happens the 64 bit number will be incremented by 1 83 * so inode numbering starts at 1 again. 84 * 85 * On 64 bit comparing two pidfds is as simple as comparing 86 * inode numbers. 87 * 88 * When a wraparound happens on 32 bit multiple pidfds with the 89 * same inode number are likely to exist (This isn't a problem 90 * since before pidfs pidfds used the anonymous inode meaning 91 * all pidfds had the same inode number.). Userspace can 92 * reconstruct the 64 bit identifier by retrieving both the 93 * inode number and the inode generation number to compare or 94 * use file handles. 95 */ 96 97 #if BITS_PER_LONG == 32 98 99 DEFINE_SPINLOCK(pidfs_ino_lock); 100 static u64 pidfs_ino_nr = 1; 101 102 static inline unsigned long pidfs_ino(u64 ino) 103 { 104 return lower_32_bits(ino); 105 } 106 107 /* On 32 bit the generation number are the upper 32 bits. */ 108 static inline u32 pidfs_gen(u64 ino) 109 { 110 return upper_32_bits(ino); 111 } 112 113 static inline u64 pidfs_alloc_ino(void) 114 { 115 u64 ino; 116 117 spin_lock(&pidfs_ino_lock); 118 if (pidfs_ino(pidfs_ino_nr) == 0) 119 pidfs_ino_nr++; 120 ino = pidfs_ino_nr++; 121 spin_unlock(&pidfs_ino_lock); 122 return ino; 123 } 124 125 #else 126 127 /* On 64 bit simply return ino. */ 128 static inline unsigned long pidfs_ino(u64 ino) 129 { 130 return ino; 131 } 132 133 /* On 64 bit the generation number is 0. */ 134 static inline u32 pidfs_gen(u64 ino) 135 { 136 return 0; 137 } 138 139 DEFINE_COOKIE(pidfs_ino_cookie); 140 141 static u64 pidfs_alloc_ino(void) 142 { 143 u64 ino; 144 145 preempt_disable(); 146 ino = gen_cookie_next(&pidfs_ino_cookie); 147 preempt_enable(); 148 149 VFS_WARN_ON_ONCE(ino < 1); 150 return ino; 151 } 152 153 #endif 154 155 void pidfs_prepare_pid(struct pid *pid) 156 { 157 pid->stashed = NULL; 158 pid->attr = NULL; 159 pid->ino = 0; 160 } 161 162 int pidfs_add_pid(struct pid *pid) 163 { 164 int ret; 165 166 pid->ino = pidfs_alloc_ino(); 167 ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash, 168 pidfs_ino_ht_params); 169 if (unlikely(ret)) 170 pid->ino = 0; 171 return ret; 172 } 173 174 void pidfs_remove_pid(struct pid *pid) 175 { 176 if (likely(pid->ino)) 177 rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash, 178 pidfs_ino_ht_params); 179 } 180 181 void pidfs_free_pid(struct pid *pid) 182 { 183 struct pidfs_attr *attr __free(kfree) = no_free_ptr(pid->attr); 184 struct simple_xattrs *xattrs __free(kfree) = NULL; 185 186 /* 187 * Any dentry must've been wiped from the pid by now. 188 * Otherwise there's a reference count bug. 189 */ 190 VFS_WARN_ON_ONCE(pid->stashed); 191 192 /* 193 * This if an error occurred during e.g., task creation that 194 * causes us to never go through the exit path. 195 */ 196 if (unlikely(!attr)) 197 return; 198 199 /* This never had a pidfd created. */ 200 if (IS_ERR(attr)) 201 return; 202 203 xattrs = no_free_ptr(attr->xattrs); 204 if (xattrs) 205 simple_xattrs_free(xattrs, NULL); 206 } 207 208 #ifdef CONFIG_PROC_FS 209 /** 210 * pidfd_show_fdinfo - print information about a pidfd 211 * @m: proc fdinfo file 212 * @f: file referencing a pidfd 213 * 214 * Pid: 215 * This function will print the pid that a given pidfd refers to in the 216 * pid namespace of the procfs instance. 217 * If the pid namespace of the process is not a descendant of the pid 218 * namespace of the procfs instance 0 will be shown as its pid. This is 219 * similar to calling getppid() on a process whose parent is outside of 220 * its pid namespace. 221 * 222 * NSpid: 223 * If pid namespaces are supported then this function will also print 224 * the pid of a given pidfd refers to for all descendant pid namespaces 225 * starting from the current pid namespace of the instance, i.e. the 226 * Pid field and the first entry in the NSpid field will be identical. 227 * If the pid namespace of the process is not a descendant of the pid 228 * namespace of the procfs instance 0 will be shown as its first NSpid 229 * entry and no others will be shown. 230 * Note that this differs from the Pid and NSpid fields in 231 * /proc/<pid>/status where Pid and NSpid are always shown relative to 232 * the pid namespace of the procfs instance. The difference becomes 233 * obvious when sending around a pidfd between pid namespaces from a 234 * different branch of the tree, i.e. where no ancestral relation is 235 * present between the pid namespaces: 236 * - create two new pid namespaces ns1 and ns2 in the initial pid 237 * namespace (also take care to create new mount namespaces in the 238 * new pid namespace and mount procfs) 239 * - create a process with a pidfd in ns1 240 * - send pidfd from ns1 to ns2 241 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid 242 * have exactly one entry, which is 0 243 */ 244 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) 245 { 246 struct pid *pid = pidfd_pid(f); 247 struct pid_namespace *ns; 248 pid_t nr = -1; 249 250 if (likely(pid_has_task(pid, PIDTYPE_PID))) { 251 ns = proc_pid_ns(file_inode(m->file)->i_sb); 252 nr = pid_nr_ns(pid, ns); 253 } 254 255 seq_put_decimal_ll(m, "Pid:\t", nr); 256 257 #ifdef CONFIG_PID_NS 258 seq_put_decimal_ll(m, "\nNSpid:\t", nr); 259 if (nr > 0) { 260 int i; 261 262 /* If nr is non-zero it means that 'pid' is valid and that 263 * ns, i.e. the pid namespace associated with the procfs 264 * instance, is in the pid namespace hierarchy of pid. 265 * Start at one below the already printed level. 266 */ 267 for (i = ns->level + 1; i <= pid->level; i++) 268 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); 269 } 270 #endif 271 seq_putc(m, '\n'); 272 } 273 #endif 274 275 /* 276 * Poll support for process exit notification. 277 */ 278 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) 279 { 280 struct pid *pid = pidfd_pid(file); 281 struct task_struct *task; 282 __poll_t poll_flags = 0; 283 284 poll_wait(file, &pid->wait_pidfd, pts); 285 /* 286 * Don't wake waiters if the thread-group leader exited 287 * prematurely. They either get notified when the last subthread 288 * exits or not at all if one of the remaining subthreads execs 289 * and assumes the struct pid of the old thread-group leader. 290 */ 291 guard(rcu)(); 292 task = pid_task(pid, PIDTYPE_PID); 293 if (!task) 294 poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; 295 else if (task->exit_state && !delay_group_leader(task)) 296 poll_flags = EPOLLIN | EPOLLRDNORM; 297 298 return poll_flags; 299 } 300 301 static inline bool pid_in_current_pidns(const struct pid *pid) 302 { 303 const struct pid_namespace *ns = task_active_pid_ns(current); 304 305 if (ns->level <= pid->level) 306 return pid->numbers[ns->level].ns == ns; 307 308 return false; 309 } 310 311 static __u32 pidfs_coredump_mask(unsigned long mm_flags) 312 { 313 switch (__get_dumpable(mm_flags)) { 314 case SUID_DUMP_USER: 315 return PIDFD_COREDUMP_USER; 316 case SUID_DUMP_ROOT: 317 return PIDFD_COREDUMP_ROOT; 318 case SUID_DUMP_DISABLE: 319 return PIDFD_COREDUMP_SKIP; 320 default: 321 WARN_ON_ONCE(true); 322 } 323 324 return 0; 325 } 326 327 /* This must be updated whenever a new flag is added */ 328 #define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \ 329 PIDFD_INFO_CREDS | \ 330 PIDFD_INFO_CGROUPID | \ 331 PIDFD_INFO_EXIT | \ 332 PIDFD_INFO_COREDUMP | \ 333 PIDFD_INFO_SUPPORTED_MASK | \ 334 PIDFD_INFO_COREDUMP_SIGNAL) 335 336 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) 337 { 338 struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; 339 struct task_struct *task __free(put_task) = NULL; 340 struct pid *pid = pidfd_pid(file); 341 size_t usize = _IOC_SIZE(cmd); 342 struct pidfd_info kinfo = {}; 343 struct user_namespace *user_ns; 344 struct pidfs_attr *attr; 345 const struct cred *c; 346 __u64 mask; 347 348 BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER2); 349 350 if (!uinfo) 351 return -EINVAL; 352 if (usize < PIDFD_INFO_SIZE_VER0) 353 return -EINVAL; /* First version, no smaller struct possible */ 354 355 if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) 356 return -EFAULT; 357 358 /* 359 * Restrict information retrieval to tasks within the caller's pid 360 * namespace hierarchy. 361 */ 362 if (!pid_in_current_pidns(pid)) 363 return -EREMOTE; 364 365 attr = READ_ONCE(pid->attr); 366 if (mask & PIDFD_INFO_EXIT) { 367 if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) { 368 smp_rmb(); 369 kinfo.mask |= PIDFD_INFO_EXIT; 370 #ifdef CONFIG_CGROUPS 371 kinfo.cgroupid = attr->cgroupid; 372 kinfo.mask |= PIDFD_INFO_CGROUPID; 373 #endif 374 kinfo.exit_code = attr->exit_code; 375 } 376 } 377 378 if (mask & PIDFD_INFO_COREDUMP) { 379 if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) { 380 smp_rmb(); 381 kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL; 382 kinfo.coredump_mask = attr->coredump_mask; 383 kinfo.coredump_signal = attr->coredump_signal; 384 } 385 } 386 387 task = get_pid_task(pid, PIDTYPE_PID); 388 if (!task) { 389 /* 390 * If the task has already been reaped, only exit 391 * information is available 392 */ 393 if (!(mask & PIDFD_INFO_EXIT)) 394 return -ESRCH; 395 396 goto copy_out; 397 } 398 399 c = get_task_cred(task); 400 if (!c) 401 return -ESRCH; 402 403 if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) { 404 guard(task_lock)(task); 405 if (task->mm) { 406 unsigned long flags = __mm_flags_get_dumpable(task->mm); 407 408 kinfo.coredump_mask = pidfs_coredump_mask(flags); 409 kinfo.mask |= PIDFD_INFO_COREDUMP; 410 /* No coredump actually took place, so no coredump signal. */ 411 } 412 } 413 414 /* Unconditionally return identifiers and credentials, the rest only on request */ 415 416 user_ns = current_user_ns(); 417 kinfo.ruid = from_kuid_munged(user_ns, c->uid); 418 kinfo.rgid = from_kgid_munged(user_ns, c->gid); 419 kinfo.euid = from_kuid_munged(user_ns, c->euid); 420 kinfo.egid = from_kgid_munged(user_ns, c->egid); 421 kinfo.suid = from_kuid_munged(user_ns, c->suid); 422 kinfo.sgid = from_kgid_munged(user_ns, c->sgid); 423 kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); 424 kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); 425 kinfo.mask |= PIDFD_INFO_CREDS; 426 put_cred(c); 427 428 #ifdef CONFIG_CGROUPS 429 if (!kinfo.cgroupid) { 430 struct cgroup *cgrp; 431 432 rcu_read_lock(); 433 cgrp = task_dfl_cgroup(task); 434 kinfo.cgroupid = cgroup_id(cgrp); 435 kinfo.mask |= PIDFD_INFO_CGROUPID; 436 rcu_read_unlock(); 437 } 438 #endif 439 440 /* 441 * Copy pid/tgid last, to reduce the chances the information might be 442 * stale. Note that it is not possible to ensure it will be valid as the 443 * task might return as soon as the copy_to_user finishes, but that's ok 444 * and userspace expects that might happen and can act accordingly, so 445 * this is just best-effort. What we can do however is checking that all 446 * the fields are set correctly, or return ESRCH to avoid providing 447 * incomplete information. */ 448 449 kinfo.ppid = task_ppid_vnr(task); 450 kinfo.tgid = task_tgid_vnr(task); 451 kinfo.pid = task_pid_vnr(task); 452 kinfo.mask |= PIDFD_INFO_PID; 453 454 if (kinfo.pid == 0 || kinfo.tgid == 0) 455 return -ESRCH; 456 457 copy_out: 458 if (mask & PIDFD_INFO_SUPPORTED_MASK) { 459 kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK; 460 kinfo.supported_mask = PIDFD_INFO_SUPPORTED; 461 } 462 463 /* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */ 464 WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask); 465 /* 466 * If userspace and the kernel have the same struct size it can just 467 * be copied. If userspace provides an older struct, only the bits that 468 * userspace knows about will be copied. If userspace provides a new 469 * struct, only the bits that the kernel knows about will be copied. 470 */ 471 return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL); 472 } 473 474 static bool pidfs_ioctl_valid(unsigned int cmd) 475 { 476 switch (cmd) { 477 case FS_IOC_GETVERSION: 478 case PIDFD_GET_CGROUP_NAMESPACE: 479 case PIDFD_GET_IPC_NAMESPACE: 480 case PIDFD_GET_MNT_NAMESPACE: 481 case PIDFD_GET_NET_NAMESPACE: 482 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 483 case PIDFD_GET_TIME_NAMESPACE: 484 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 485 case PIDFD_GET_UTS_NAMESPACE: 486 case PIDFD_GET_USER_NAMESPACE: 487 case PIDFD_GET_PID_NAMESPACE: 488 return true; 489 } 490 491 /* Extensible ioctls require some more careful checks. */ 492 switch (_IOC_NR(cmd)) { 493 case _IOC_NR(PIDFD_GET_INFO): 494 /* 495 * Try to prevent performing a pidfd ioctl when someone 496 * erronously mistook the file descriptor for a pidfd. 497 * This is not perfect but will catch most cases. 498 */ 499 return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); 500 } 501 502 return false; 503 } 504 505 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 506 { 507 struct task_struct *task __free(put_task) = NULL; 508 struct nsproxy *nsp __free(put_nsproxy) = NULL; 509 struct ns_common *ns_common = NULL; 510 511 if (!pidfs_ioctl_valid(cmd)) 512 return -ENOIOCTLCMD; 513 514 if (cmd == FS_IOC_GETVERSION) { 515 if (!arg) 516 return -EINVAL; 517 518 __u32 __user *argp = (__u32 __user *)arg; 519 return put_user(file_inode(file)->i_generation, argp); 520 } 521 522 /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ 523 if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) 524 return pidfd_info(file, cmd, arg); 525 526 task = get_pid_task(pidfd_pid(file), PIDTYPE_PID); 527 if (!task) 528 return -ESRCH; 529 530 if (arg) 531 return -EINVAL; 532 533 scoped_guard(task_lock, task) { 534 nsp = task->nsproxy; 535 if (nsp) 536 get_nsproxy(nsp); 537 } 538 if (!nsp) 539 return -ESRCH; /* just pretend it didn't exist */ 540 541 /* 542 * We're trying to open a file descriptor to the namespace so perform a 543 * filesystem cred ptrace check. Also, we mirror nsfs behavior. 544 */ 545 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) 546 return -EACCES; 547 548 switch (cmd) { 549 /* Namespaces that hang of nsproxy. */ 550 case PIDFD_GET_CGROUP_NAMESPACE: 551 #ifdef CONFIG_CGROUPS 552 if (!ns_ref_get(nsp->cgroup_ns)) 553 break; 554 ns_common = to_ns_common(nsp->cgroup_ns); 555 #endif 556 break; 557 case PIDFD_GET_IPC_NAMESPACE: 558 #ifdef CONFIG_IPC_NS 559 if (!ns_ref_get(nsp->ipc_ns)) 560 break; 561 ns_common = to_ns_common(nsp->ipc_ns); 562 #endif 563 break; 564 case PIDFD_GET_MNT_NAMESPACE: 565 if (!ns_ref_get(nsp->mnt_ns)) 566 break; 567 ns_common = to_ns_common(nsp->mnt_ns); 568 break; 569 case PIDFD_GET_NET_NAMESPACE: 570 #ifdef CONFIG_NET_NS 571 if (!ns_ref_get(nsp->net_ns)) 572 break; 573 ns_common = to_ns_common(nsp->net_ns); 574 #endif 575 break; 576 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 577 #ifdef CONFIG_PID_NS 578 if (!ns_ref_get(nsp->pid_ns_for_children)) 579 break; 580 ns_common = to_ns_common(nsp->pid_ns_for_children); 581 #endif 582 break; 583 case PIDFD_GET_TIME_NAMESPACE: 584 #ifdef CONFIG_TIME_NS 585 if (!ns_ref_get(nsp->time_ns)) 586 break; 587 ns_common = to_ns_common(nsp->time_ns); 588 #endif 589 break; 590 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 591 #ifdef CONFIG_TIME_NS 592 if (!ns_ref_get(nsp->time_ns_for_children)) 593 break; 594 ns_common = to_ns_common(nsp->time_ns_for_children); 595 #endif 596 break; 597 case PIDFD_GET_UTS_NAMESPACE: 598 #ifdef CONFIG_UTS_NS 599 if (!ns_ref_get(nsp->uts_ns)) 600 break; 601 ns_common = to_ns_common(nsp->uts_ns); 602 #endif 603 break; 604 /* Namespaces that don't hang of nsproxy. */ 605 case PIDFD_GET_USER_NAMESPACE: 606 #ifdef CONFIG_USER_NS 607 scoped_guard(rcu) { 608 struct user_namespace *user_ns; 609 610 user_ns = task_cred_xxx(task, user_ns); 611 if (!ns_ref_get(user_ns)) 612 break; 613 ns_common = to_ns_common(user_ns); 614 } 615 #endif 616 break; 617 case PIDFD_GET_PID_NAMESPACE: 618 #ifdef CONFIG_PID_NS 619 scoped_guard(rcu) { 620 struct pid_namespace *pid_ns; 621 622 pid_ns = task_active_pid_ns(task); 623 if (!ns_ref_get(pid_ns)) 624 break; 625 ns_common = to_ns_common(pid_ns); 626 } 627 #endif 628 break; 629 default: 630 return -ENOIOCTLCMD; 631 } 632 633 if (!ns_common) 634 return -EOPNOTSUPP; 635 636 /* open_namespace() unconditionally consumes the reference */ 637 return open_namespace(ns_common); 638 } 639 640 static const struct file_operations pidfs_file_operations = { 641 .poll = pidfd_poll, 642 #ifdef CONFIG_PROC_FS 643 .show_fdinfo = pidfd_show_fdinfo, 644 #endif 645 .unlocked_ioctl = pidfd_ioctl, 646 .compat_ioctl = compat_ptr_ioctl, 647 }; 648 649 struct pid *pidfd_pid(const struct file *file) 650 { 651 if (file->f_op != &pidfs_file_operations) 652 return ERR_PTR(-EBADF); 653 return file_inode(file)->i_private; 654 } 655 656 /* 657 * We're called from release_task(). We know there's at least one 658 * reference to struct pid being held that won't be released until the 659 * task has been reaped which cannot happen until we're out of 660 * release_task(). 661 * 662 * If this struct pid has at least once been referred to by a pidfd then 663 * pid->attr will be allocated. If not we mark the struct pid as dead so 664 * anyone who is trying to register it with pidfs will fail to do so. 665 * Otherwise we would hand out pidfs for reaped tasks without having 666 * exit information available. 667 * 668 * Worst case is that we've filled in the info and the pid gets freed 669 * right away in free_pid() when no one holds a pidfd anymore. Since 670 * pidfs_exit() currently is placed after exit_task_work() we know that 671 * it cannot be us aka the exiting task holding a pidfd to itself. 672 */ 673 void pidfs_exit(struct task_struct *tsk) 674 { 675 struct pid *pid = task_pid(tsk); 676 struct pidfs_attr *attr; 677 #ifdef CONFIG_CGROUPS 678 struct cgroup *cgrp; 679 #endif 680 681 might_sleep(); 682 683 /* Synchronize with pidfs_register_pid(). */ 684 scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { 685 attr = pid->attr; 686 if (!attr) { 687 /* 688 * No one ever held a pidfd for this struct pid. 689 * Mark it as dead so no one can add a pidfs 690 * entry anymore. We're about to be reaped and 691 * so no exit information would be available. 692 */ 693 pid->attr = PIDFS_PID_DEAD; 694 return; 695 } 696 } 697 698 /* 699 * If @pid->attr is set someone might still legitimately hold a 700 * pidfd to @pid or someone might concurrently still be getting 701 * a reference to an already stashed dentry from @pid->stashed. 702 * So defer cleaning @pid->attr until the last reference to @pid 703 * is put 704 */ 705 706 #ifdef CONFIG_CGROUPS 707 rcu_read_lock(); 708 cgrp = task_dfl_cgroup(tsk); 709 attr->cgroupid = cgroup_id(cgrp); 710 rcu_read_unlock(); 711 #endif 712 attr->exit_code = tsk->exit_code; 713 714 /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ 715 smp_wmb(); 716 set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask); 717 } 718 719 #ifdef CONFIG_COREDUMP 720 void pidfs_coredump(const struct coredump_params *cprm) 721 { 722 struct pid *pid = cprm->pid; 723 struct pidfs_attr *attr; 724 725 attr = READ_ONCE(pid->attr); 726 727 VFS_WARN_ON_ONCE(!attr); 728 VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); 729 730 /* Note how we were coredumped and that we coredumped. */ 731 attr->coredump_mask = pidfs_coredump_mask(cprm->mm_flags) | 732 PIDFD_COREDUMPED; 733 /* If coredumping is set to skip we should never end up here. */ 734 VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP); 735 /* Expose the signal number that caused the coredump. */ 736 attr->coredump_signal = cprm->siginfo->si_signo; 737 smp_wmb(); 738 set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask); 739 } 740 #endif 741 742 static struct vfsmount *pidfs_mnt __ro_after_init; 743 744 /* 745 * The vfs falls back to simple_setattr() if i_op->setattr() isn't 746 * implemented. Let's reject it completely until we have a clean 747 * permission concept for pidfds. 748 */ 749 static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 750 struct iattr *attr) 751 { 752 return anon_inode_setattr(idmap, dentry, attr); 753 } 754 755 static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, 756 struct kstat *stat, u32 request_mask, 757 unsigned int query_flags) 758 { 759 return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); 760 } 761 762 static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) 763 { 764 struct inode *inode = d_inode(dentry); 765 struct pid *pid = inode->i_private; 766 struct pidfs_attr *attr = pid->attr; 767 struct simple_xattrs *xattrs; 768 769 xattrs = READ_ONCE(attr->xattrs); 770 if (!xattrs) 771 return 0; 772 773 return simple_xattr_list(inode, xattrs, buf, size); 774 } 775 776 static const struct inode_operations pidfs_inode_operations = { 777 .getattr = pidfs_getattr, 778 .setattr = pidfs_setattr, 779 .listxattr = pidfs_listxattr, 780 }; 781 782 static void pidfs_evict_inode(struct inode *inode) 783 { 784 struct pid *pid = inode->i_private; 785 786 clear_inode(inode); 787 put_pid(pid); 788 } 789 790 static const struct super_operations pidfs_sops = { 791 .drop_inode = inode_just_drop, 792 .evict_inode = pidfs_evict_inode, 793 .statfs = simple_statfs, 794 }; 795 796 /* 797 * 'lsof' has knowledge of out historical anon_inode use, and expects 798 * the pidfs dentry name to start with 'anon_inode'. 799 */ 800 static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) 801 { 802 return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); 803 } 804 805 const struct dentry_operations pidfs_dentry_operations = { 806 .d_dname = pidfs_dname, 807 .d_prune = stashed_dentry_prune, 808 }; 809 810 static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, 811 struct inode *parent) 812 { 813 const struct pid *pid = inode->i_private; 814 815 if (*max_len < 2) { 816 *max_len = 2; 817 return FILEID_INVALID; 818 } 819 820 *max_len = 2; 821 *(u64 *)fh = pid->ino; 822 return FILEID_KERNFS; 823 } 824 825 /* Find a struct pid based on the inode number. */ 826 static struct pid *pidfs_ino_get_pid(u64 ino) 827 { 828 struct pid *pid; 829 struct pidfs_attr *attr; 830 831 guard(rcu)(); 832 pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params); 833 if (!pid) 834 return NULL; 835 attr = READ_ONCE(pid->attr); 836 if (IS_ERR_OR_NULL(attr)) 837 return NULL; 838 if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) 839 return NULL; 840 /* Within our pid namespace hierarchy? */ 841 if (pid_vnr(pid) == 0) 842 return NULL; 843 return get_pid(pid); 844 } 845 846 static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, 847 struct fid *fid, int fh_len, 848 int fh_type) 849 { 850 int ret; 851 u64 pid_ino; 852 struct path path; 853 struct pid *pid; 854 855 if (fh_len < 2) 856 return NULL; 857 858 switch (fh_type) { 859 case FILEID_KERNFS: 860 pid_ino = *(u64 *)fid; 861 break; 862 default: 863 return NULL; 864 } 865 866 pid = pidfs_ino_get_pid(pid_ino); 867 if (!pid) 868 return NULL; 869 870 ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); 871 if (ret < 0) 872 return ERR_PTR(ret); 873 874 VFS_WARN_ON_ONCE(!pid->attr); 875 876 mntput(path.mnt); 877 return path.dentry; 878 } 879 880 /* 881 * Make sure that we reject any nonsensical flags that users pass via 882 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and 883 * PIDFD_NONBLOCK as O_NONBLOCK. 884 */ 885 #define VALID_FILE_HANDLE_OPEN_FLAGS \ 886 (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) 887 888 static int pidfs_export_permission(struct handle_to_path_ctx *ctx, 889 unsigned int oflags) 890 { 891 if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) 892 return -EINVAL; 893 894 /* 895 * pidfd_ino_get_pid() will verify that the struct pid is part 896 * of the caller's pid namespace hierarchy. No further 897 * permission checks are needed. 898 */ 899 return 0; 900 } 901 902 static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) 903 { 904 /* 905 * Clear O_LARGEFILE as open_by_handle_at() forces it and raise 906 * O_RDWR as pidfds always are. 907 */ 908 oflags &= ~O_LARGEFILE; 909 return dentry_open(path, oflags | O_RDWR, current_cred()); 910 } 911 912 static const struct export_operations pidfs_export_operations = { 913 .encode_fh = pidfs_encode_fh, 914 .fh_to_dentry = pidfs_fh_to_dentry, 915 .open = pidfs_export_open, 916 .permission = pidfs_export_permission, 917 }; 918 919 static int pidfs_init_inode(struct inode *inode, void *data) 920 { 921 const struct pid *pid = data; 922 923 inode->i_private = data; 924 inode->i_flags |= S_PRIVATE | S_ANON_INODE; 925 /* We allow to set xattrs. */ 926 inode->i_flags &= ~S_IMMUTABLE; 927 inode->i_mode |= S_IRWXU; 928 inode->i_op = &pidfs_inode_operations; 929 inode->i_fop = &pidfs_file_operations; 930 inode->i_ino = pidfs_ino(pid->ino); 931 inode->i_generation = pidfs_gen(pid->ino); 932 return 0; 933 } 934 935 static void pidfs_put_data(void *data) 936 { 937 struct pid *pid = data; 938 put_pid(pid); 939 } 940 941 /** 942 * pidfs_register_pid - register a struct pid in pidfs 943 * @pid: pid to pin 944 * 945 * Register a struct pid in pidfs. 946 * 947 * Return: On success zero, on error a negative error code is returned. 948 */ 949 int pidfs_register_pid(struct pid *pid) 950 { 951 struct pidfs_attr *new_attr __free(kfree) = NULL; 952 struct pidfs_attr *attr; 953 954 might_sleep(); 955 956 if (!pid) 957 return 0; 958 959 attr = READ_ONCE(pid->attr); 960 if (unlikely(attr == PIDFS_PID_DEAD)) 961 return PTR_ERR(PIDFS_PID_DEAD); 962 if (attr) 963 return 0; 964 965 new_attr = kmem_cache_zalloc(pidfs_attr_cachep, GFP_KERNEL); 966 if (!new_attr) 967 return -ENOMEM; 968 969 /* Synchronize with pidfs_exit(). */ 970 guard(spinlock_irq)(&pid->wait_pidfd.lock); 971 972 attr = pid->attr; 973 if (unlikely(attr == PIDFS_PID_DEAD)) 974 return PTR_ERR(PIDFS_PID_DEAD); 975 if (unlikely(attr)) 976 return 0; 977 978 pid->attr = no_free_ptr(new_attr); 979 return 0; 980 } 981 982 static struct dentry *pidfs_stash_dentry(struct dentry **stashed, 983 struct dentry *dentry) 984 { 985 int ret; 986 struct pid *pid = d_inode(dentry)->i_private; 987 988 VFS_WARN_ON_ONCE(stashed != &pid->stashed); 989 990 ret = pidfs_register_pid(pid); 991 if (ret) 992 return ERR_PTR(ret); 993 994 return stash_dentry(stashed, dentry); 995 } 996 997 static const struct stashed_operations pidfs_stashed_ops = { 998 .stash_dentry = pidfs_stash_dentry, 999 .init_inode = pidfs_init_inode, 1000 .put_data = pidfs_put_data, 1001 }; 1002 1003 static int pidfs_xattr_get(const struct xattr_handler *handler, 1004 struct dentry *unused, struct inode *inode, 1005 const char *suffix, void *value, size_t size) 1006 { 1007 struct pid *pid = inode->i_private; 1008 struct pidfs_attr *attr = pid->attr; 1009 const char *name; 1010 struct simple_xattrs *xattrs; 1011 1012 xattrs = READ_ONCE(attr->xattrs); 1013 if (!xattrs) 1014 return 0; 1015 1016 name = xattr_full_name(handler, suffix); 1017 return simple_xattr_get(xattrs, name, value, size); 1018 } 1019 1020 static int pidfs_xattr_set(const struct xattr_handler *handler, 1021 struct mnt_idmap *idmap, struct dentry *unused, 1022 struct inode *inode, const char *suffix, 1023 const void *value, size_t size, int flags) 1024 { 1025 struct pid *pid = inode->i_private; 1026 struct pidfs_attr *attr = pid->attr; 1027 const char *name; 1028 struct simple_xattrs *xattrs; 1029 struct simple_xattr *old_xattr; 1030 1031 /* Ensure we're the only one to set @attr->xattrs. */ 1032 WARN_ON_ONCE(!inode_is_locked(inode)); 1033 1034 xattrs = READ_ONCE(attr->xattrs); 1035 if (!xattrs) { 1036 xattrs = kmem_cache_zalloc(pidfs_xattr_cachep, GFP_KERNEL); 1037 if (!xattrs) 1038 return -ENOMEM; 1039 1040 simple_xattrs_init(xattrs); 1041 smp_store_release(&pid->attr->xattrs, xattrs); 1042 } 1043 1044 name = xattr_full_name(handler, suffix); 1045 old_xattr = simple_xattr_set(xattrs, name, value, size, flags); 1046 if (IS_ERR(old_xattr)) 1047 return PTR_ERR(old_xattr); 1048 1049 simple_xattr_free(old_xattr); 1050 return 0; 1051 } 1052 1053 static const struct xattr_handler pidfs_trusted_xattr_handler = { 1054 .prefix = XATTR_TRUSTED_PREFIX, 1055 .get = pidfs_xattr_get, 1056 .set = pidfs_xattr_set, 1057 }; 1058 1059 static const struct xattr_handler *const pidfs_xattr_handlers[] = { 1060 &pidfs_trusted_xattr_handler, 1061 NULL 1062 }; 1063 1064 static int pidfs_init_fs_context(struct fs_context *fc) 1065 { 1066 struct pseudo_fs_context *ctx; 1067 1068 ctx = init_pseudo(fc, PID_FS_MAGIC); 1069 if (!ctx) 1070 return -ENOMEM; 1071 1072 fc->s_iflags |= SB_I_NOEXEC; 1073 fc->s_iflags |= SB_I_NODEV; 1074 ctx->s_d_flags |= DCACHE_DONTCACHE; 1075 ctx->ops = &pidfs_sops; 1076 ctx->eops = &pidfs_export_operations; 1077 ctx->dops = &pidfs_dentry_operations; 1078 ctx->xattr = pidfs_xattr_handlers; 1079 fc->s_fs_info = (void *)&pidfs_stashed_ops; 1080 return 0; 1081 } 1082 1083 static struct file_system_type pidfs_type = { 1084 .name = "pidfs", 1085 .init_fs_context = pidfs_init_fs_context, 1086 .kill_sb = kill_anon_super, 1087 }; 1088 1089 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) 1090 { 1091 struct file *pidfd_file; 1092 struct path path __free(path_put) = {}; 1093 int ret; 1094 1095 /* 1096 * Ensure that PIDFD_STALE can be passed as a flag without 1097 * overloading other uapi pidfd flags. 1098 */ 1099 BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD); 1100 BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK); 1101 1102 ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); 1103 if (ret < 0) 1104 return ERR_PTR(ret); 1105 1106 VFS_WARN_ON_ONCE(!pid->attr); 1107 1108 flags &= ~PIDFD_STALE; 1109 flags |= O_RDWR; 1110 pidfd_file = dentry_open(&path, flags, current_cred()); 1111 /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */ 1112 if (!IS_ERR(pidfd_file)) 1113 pidfd_file->f_flags |= (flags & PIDFD_THREAD); 1114 1115 return pidfd_file; 1116 } 1117 1118 void __init pidfs_init(void) 1119 { 1120 if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params)) 1121 panic("Failed to initialize pidfs hashtable"); 1122 1123 pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, 1124 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | 1125 SLAB_ACCOUNT | SLAB_PANIC), NULL); 1126 1127 pidfs_xattr_cachep = kmem_cache_create("pidfs_xattr_cache", 1128 sizeof(struct simple_xattrs), 0, 1129 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | 1130 SLAB_ACCOUNT | SLAB_PANIC), NULL); 1131 1132 pidfs_mnt = kern_mount(&pidfs_type); 1133 if (IS_ERR(pidfs_mnt)) 1134 panic("Failed to mount pidfs pseudo filesystem"); 1135 1136 pidfs_root_path.mnt = pidfs_mnt; 1137 pidfs_root_path.dentry = pidfs_mnt->mnt_root; 1138 } 1139