1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/anon_inodes.h> 3 #include <linux/exportfs.h> 4 #include <linux/file.h> 5 #include <linux/fs.h> 6 #include <linux/cgroup.h> 7 #include <linux/magic.h> 8 #include <linux/mount.h> 9 #include <linux/pid.h> 10 #include <linux/pidfs.h> 11 #include <linux/sched/signal.h> 12 #include <linux/signal.h> 13 #include <linux/pid_namespace.h> 14 #include <linux/poll.h> 15 #include <linux/proc_fs.h> 16 #include <linux/proc_ns.h> 17 #include <linux/pseudo_fs.h> 18 #include <linux/ptrace.h> 19 #include <linux/seq_file.h> 20 #include <uapi/linux/pidfd.h> 21 #include <linux/ipc_namespace.h> 22 #include <linux/time_namespace.h> 23 #include <linux/utsname.h> 24 #include <net/net_namespace.h> 25 #include <linux/coredump.h> 26 #include <linux/rhashtable.h> 27 #include <linux/llist.h> 28 #include <linux/xattr.h> 29 #include <linux/cookie.h> 30 31 #include "internal.h" 32 #include "mount.h" 33 34 #define PIDFS_PID_DEAD ERR_PTR(-ESRCH) 35 36 static struct kmem_cache *pidfs_attr_cachep __ro_after_init; 37 38 static struct path pidfs_root_path = {}; 39 40 static struct simple_xattr_cache pidfs_xa_cache; 41 42 void pidfs_get_root(struct path *path) 43 { 44 *path = pidfs_root_path; 45 path_get(path); 46 } 47 48 enum pidfs_attr_mask_bits { 49 PIDFS_ATTR_BIT_EXIT = 0, 50 PIDFS_ATTR_BIT_COREDUMP = 1, 51 }; 52 53 struct pidfs_anon_attr { 54 unsigned long attr_mask; 55 struct /* exit info */ { 56 __u64 cgroupid; 57 __s32 exit_code; 58 }; 59 __u32 coredump_mask; 60 __u32 coredump_signal; 61 __u32 coredump_code; 62 }; 63 64 static struct rhashtable pidfs_ino_ht; 65 66 static const struct rhashtable_params pidfs_ino_ht_params = { 67 .key_offset = offsetof(struct pid, ino), 68 .key_len = sizeof(u64), 69 .head_offset = offsetof(struct pid, pidfs_hash), 70 .automatic_shrinking = true, 71 }; 72 73 /* 74 * inode number handling 75 * 76 * On 64 bit nothing special happens. The 64bit number assigned 77 * to struct pid is the inode number. 78 * 79 * On 32 bit the 64 bit number assigned to struct pid is split 80 * into two 32 bit numbers. The lower 32 bits are used as the 81 * inode number and the upper 32 bits are used as the inode 82 * generation number. 83 * 84 * On 32 bit pidfs_ino() will return the lower 32 bit. When 85 * pidfs_ino() returns zero a wrap around happened. When a 86 * wraparound happens the 64 bit number will be incremented by 1 87 * so inode numbering starts at 1 again. 88 * 89 * On 64 bit comparing two pidfds is as simple as comparing 90 * inode numbers. 91 * 92 * When a wraparound happens on 32 bit multiple pidfds with the 93 * same inode number are likely to exist (This isn't a problem 94 * since before pidfs pidfds used the anonymous inode meaning 95 * all pidfds had the same inode number.). Userspace can 96 * reconstruct the 64 bit identifier by retrieving both the 97 * inode number and the inode generation number to compare or 98 * use file handles. 99 */ 100 struct pidfs_attr { 101 struct list_head xattrs; 102 union { 103 struct pidfs_anon_attr; 104 struct llist_node pidfs_llist; 105 }; 106 }; 107 108 #if BITS_PER_LONG == 32 109 110 DEFINE_SPINLOCK(pidfs_ino_lock); 111 static u64 pidfs_ino_nr = 1; 112 113 static inline unsigned long pidfs_ino(u64 ino) 114 { 115 return lower_32_bits(ino); 116 } 117 118 /* On 32 bit the generation number are the upper 32 bits. */ 119 static inline u32 pidfs_gen(u64 ino) 120 { 121 return upper_32_bits(ino); 122 } 123 124 static inline u64 pidfs_alloc_ino(void) 125 { 126 u64 ino; 127 128 spin_lock(&pidfs_ino_lock); 129 if (pidfs_ino(pidfs_ino_nr) == 0) 130 pidfs_ino_nr++; 131 ino = pidfs_ino_nr++; 132 spin_unlock(&pidfs_ino_lock); 133 return ino; 134 } 135 136 #else 137 138 /* On 64 bit simply return ino. */ 139 static inline unsigned long pidfs_ino(u64 ino) 140 { 141 return ino; 142 } 143 144 /* On 64 bit the generation number is 0. */ 145 static inline u32 pidfs_gen(u64 ino) 146 { 147 return 0; 148 } 149 150 DEFINE_COOKIE(pidfs_ino_cookie); 151 152 static u64 pidfs_alloc_ino(void) 153 { 154 u64 ino; 155 156 preempt_disable(); 157 ino = gen_cookie_next(&pidfs_ino_cookie); 158 preempt_enable(); 159 160 VFS_WARN_ON_ONCE(ino < 1); 161 return ino; 162 } 163 164 #endif 165 166 void pidfs_prepare_pid(struct pid *pid) 167 { 168 pid->stashed = NULL; 169 pid->attr = NULL; 170 pid->ino = 0; 171 } 172 173 int pidfs_add_pid(struct pid *pid) 174 { 175 int ret; 176 177 pid->ino = pidfs_alloc_ino(); 178 ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash, 179 pidfs_ino_ht_params); 180 if (unlikely(ret)) 181 pid->ino = 0; 182 return ret; 183 } 184 185 void pidfs_remove_pid(struct pid *pid) 186 { 187 if (likely(pid->ino)) 188 rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash, 189 pidfs_ino_ht_params); 190 } 191 192 static LLIST_HEAD(pidfs_free_list); 193 194 static void pidfs_free_attr_work(struct work_struct *work) 195 { 196 struct pidfs_attr *attr, *next; 197 struct llist_node *head; 198 199 head = llist_del_all(&pidfs_free_list); 200 llist_for_each_entry_safe(attr, next, head, pidfs_llist) { 201 simple_xattrs_free(&pidfs_xa_cache, &attr->xattrs, NULL); 202 kfree(attr); 203 } 204 } 205 206 static DECLARE_WORK(pidfs_free_work, pidfs_free_attr_work); 207 208 void pidfs_free_pid(struct pid *pid) 209 { 210 struct pidfs_attr *attr = pid->attr; 211 212 /* 213 * Any dentry must've been wiped from the pid by now. 214 * Otherwise there's a reference count bug. 215 */ 216 VFS_WARN_ON_ONCE(pid->stashed); 217 218 /* 219 * This if an error occurred during e.g., task creation that 220 * causes us to never go through the exit path. 221 */ 222 if (unlikely(!attr)) 223 return; 224 225 /* This never had a pidfd created. */ 226 if (IS_ERR(attr)) 227 return; 228 229 if (likely(list_empty(&attr->xattrs))) 230 kfree(attr); 231 else if (llist_add(&attr->pidfs_llist, &pidfs_free_list)) 232 schedule_work(&pidfs_free_work); 233 } 234 235 #ifdef CONFIG_PROC_FS 236 /** 237 * pidfd_show_fdinfo - print information about a pidfd 238 * @m: proc fdinfo file 239 * @f: file referencing a pidfd 240 * 241 * Pid: 242 * This function will print the pid that a given pidfd refers to in the 243 * pid namespace of the procfs instance. 244 * If the pid namespace of the process is not a descendant of the pid 245 * namespace of the procfs instance 0 will be shown as its pid. This is 246 * similar to calling getppid() on a process whose parent is outside of 247 * its pid namespace. 248 * 249 * NSpid: 250 * If pid namespaces are supported then this function will also print 251 * the pid of a given pidfd refers to for all descendant pid namespaces 252 * starting from the current pid namespace of the instance, i.e. the 253 * Pid field and the first entry in the NSpid field will be identical. 254 * If the pid namespace of the process is not a descendant of the pid 255 * namespace of the procfs instance 0 will be shown as its first NSpid 256 * entry and no others will be shown. 257 * Note that this differs from the Pid and NSpid fields in 258 * /proc/<pid>/status where Pid and NSpid are always shown relative to 259 * the pid namespace of the procfs instance. The difference becomes 260 * obvious when sending around a pidfd between pid namespaces from a 261 * different branch of the tree, i.e. where no ancestral relation is 262 * present between the pid namespaces: 263 * - create two new pid namespaces ns1 and ns2 in the initial pid 264 * namespace (also take care to create new mount namespaces in the 265 * new pid namespace and mount procfs) 266 * - create a process with a pidfd in ns1 267 * - send pidfd from ns1 to ns2 268 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid 269 * have exactly one entry, which is 0 270 */ 271 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) 272 { 273 struct pid *pid = pidfd_pid(f); 274 struct pid_namespace *ns; 275 pid_t nr = -1; 276 277 if (likely(pid_has_task(pid, PIDTYPE_PID))) { 278 ns = proc_pid_ns(file_inode(m->file)->i_sb); 279 nr = pid_nr_ns(pid, ns); 280 } 281 282 seq_put_decimal_ll(m, "Pid:\t", nr); 283 284 #ifdef CONFIG_PID_NS 285 seq_put_decimal_ll(m, "\nNSpid:\t", nr); 286 if (nr > 0) { 287 int i; 288 289 /* If nr is non-zero it means that 'pid' is valid and that 290 * ns, i.e. the pid namespace associated with the procfs 291 * instance, is in the pid namespace hierarchy of pid. 292 * Start at one below the already printed level. 293 */ 294 for (i = ns->level + 1; i <= pid->level; i++) 295 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); 296 } 297 #endif 298 seq_putc(m, '\n'); 299 } 300 #endif 301 302 /* 303 * Poll support for process exit notification. 304 */ 305 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) 306 { 307 struct pid *pid = pidfd_pid(file); 308 struct task_struct *task; 309 __poll_t poll_flags = 0; 310 311 poll_wait(file, &pid->wait_pidfd, pts); 312 /* 313 * Don't wake waiters if the thread-group leader exited 314 * prematurely. They either get notified when the last subthread 315 * exits or not at all if one of the remaining subthreads execs 316 * and assumes the struct pid of the old thread-group leader. 317 */ 318 guard(rcu)(); 319 task = pid_task(pid, PIDTYPE_PID); 320 if (!task) 321 poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; 322 else if (task->exit_state && !delay_group_leader(task)) 323 poll_flags = EPOLLIN | EPOLLRDNORM; 324 325 return poll_flags; 326 } 327 328 static inline bool pid_in_current_pidns(const struct pid *pid) 329 { 330 const struct pid_namespace *ns = task_active_pid_ns(current); 331 332 if (ns->level <= pid->level) 333 return pid->numbers[ns->level].ns == ns; 334 335 return false; 336 } 337 338 static __u32 pidfs_coredump_mask(enum task_dumpable dumpable) 339 { 340 switch (dumpable) { 341 case TASK_DUMPABLE_OWNER: 342 return PIDFD_COREDUMP_USER; 343 case TASK_DUMPABLE_ROOT: 344 return PIDFD_COREDUMP_ROOT; 345 case TASK_DUMPABLE_OFF: 346 return PIDFD_COREDUMP_SKIP; 347 default: 348 WARN_ON_ONCE(true); 349 } 350 351 return 0; 352 } 353 354 /* This must be updated whenever a new flag is added */ 355 #define PIDFD_INFO_SUPPORTED (PIDFD_INFO_PID | \ 356 PIDFD_INFO_CREDS | \ 357 PIDFD_INFO_CGROUPID | \ 358 PIDFD_INFO_EXIT | \ 359 PIDFD_INFO_COREDUMP | \ 360 PIDFD_INFO_SUPPORTED_MASK | \ 361 PIDFD_INFO_COREDUMP_SIGNAL | \ 362 PIDFD_INFO_COREDUMP_CODE) 363 364 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) 365 { 366 struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; 367 struct task_struct *task __free(put_task) = NULL; 368 struct pid *pid = pidfd_pid(file); 369 size_t usize = _IOC_SIZE(cmd); 370 struct pidfd_info kinfo = {}; 371 struct user_namespace *user_ns; 372 struct pidfs_attr *attr; 373 const struct cred *c; 374 __u64 mask; 375 376 BUILD_BUG_ON(sizeof(struct pidfd_info) != PIDFD_INFO_SIZE_VER3); 377 378 if (!uinfo) 379 return -EINVAL; 380 if (usize < PIDFD_INFO_SIZE_VER0) 381 return -EINVAL; /* First version, no smaller struct possible */ 382 383 if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) 384 return -EFAULT; 385 386 /* 387 * Restrict information retrieval to tasks within the caller's pid 388 * namespace hierarchy. 389 */ 390 if (!pid_in_current_pidns(pid)) 391 return -EREMOTE; 392 393 attr = READ_ONCE(pid->attr); 394 if (mask & PIDFD_INFO_EXIT) { 395 if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) { 396 smp_rmb(); 397 kinfo.mask |= PIDFD_INFO_EXIT; 398 #ifdef CONFIG_CGROUPS 399 kinfo.cgroupid = attr->cgroupid; 400 kinfo.mask |= PIDFD_INFO_CGROUPID; 401 #endif 402 kinfo.exit_code = attr->exit_code; 403 } 404 } 405 406 if (mask & PIDFD_INFO_COREDUMP) { 407 if (test_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask)) { 408 smp_rmb(); 409 kinfo.mask |= PIDFD_INFO_COREDUMP | PIDFD_INFO_COREDUMP_SIGNAL | PIDFD_INFO_COREDUMP_CODE; 410 kinfo.coredump_mask = attr->coredump_mask; 411 kinfo.coredump_signal = attr->coredump_signal; 412 kinfo.coredump_code = attr->coredump_code; 413 } 414 } 415 416 task = get_pid_task(pid, PIDTYPE_PID); 417 if (!task) { 418 /* 419 * If the task has already been reaped, only exit 420 * information is available 421 */ 422 if (!(mask & PIDFD_INFO_EXIT)) 423 return -ESRCH; 424 425 goto copy_out; 426 } 427 428 c = get_task_cred(task); 429 if (!c) 430 return -ESRCH; 431 432 if ((mask & PIDFD_INFO_COREDUMP) && !kinfo.coredump_mask) { 433 kinfo.coredump_mask = pidfs_coredump_mask(task_exec_state_get_dumpable(task)); 434 kinfo.mask |= PIDFD_INFO_COREDUMP; 435 /* No coredump actually took place, so no coredump signal. */ 436 } 437 438 /* Unconditionally return identifiers and credentials, the rest only on request */ 439 440 user_ns = current_user_ns(); 441 kinfo.ruid = from_kuid_munged(user_ns, c->uid); 442 kinfo.rgid = from_kgid_munged(user_ns, c->gid); 443 kinfo.euid = from_kuid_munged(user_ns, c->euid); 444 kinfo.egid = from_kgid_munged(user_ns, c->egid); 445 kinfo.suid = from_kuid_munged(user_ns, c->suid); 446 kinfo.sgid = from_kgid_munged(user_ns, c->sgid); 447 kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); 448 kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); 449 kinfo.mask |= PIDFD_INFO_CREDS; 450 put_cred(c); 451 452 #ifdef CONFIG_CGROUPS 453 if (!kinfo.cgroupid) { 454 struct cgroup *cgrp; 455 456 rcu_read_lock(); 457 cgrp = task_dfl_cgroup(task); 458 kinfo.cgroupid = cgroup_id(cgrp); 459 kinfo.mask |= PIDFD_INFO_CGROUPID; 460 rcu_read_unlock(); 461 } 462 #endif 463 464 /* 465 * Copy pid/tgid last, to reduce the chances the information might be 466 * stale. Note that it is not possible to ensure it will be valid as the 467 * task might return as soon as the copy_to_user finishes, but that's ok 468 * and userspace expects that might happen and can act accordingly, so 469 * this is just best-effort. What we can do however is checking that all 470 * the fields are set correctly, or return ESRCH to avoid providing 471 * incomplete information. */ 472 473 kinfo.ppid = task_ppid_vnr(task); 474 kinfo.tgid = task_tgid_vnr(task); 475 kinfo.pid = task_pid_vnr(task); 476 kinfo.mask |= PIDFD_INFO_PID; 477 478 if (kinfo.pid == 0 || kinfo.tgid == 0) 479 return -ESRCH; 480 481 copy_out: 482 if (mask & PIDFD_INFO_SUPPORTED_MASK) { 483 kinfo.mask |= PIDFD_INFO_SUPPORTED_MASK; 484 kinfo.supported_mask = PIDFD_INFO_SUPPORTED; 485 } 486 487 /* Are there bits in the return mask not present in PIDFD_INFO_SUPPORTED? */ 488 WARN_ON_ONCE(~PIDFD_INFO_SUPPORTED & kinfo.mask); 489 /* 490 * If userspace and the kernel have the same struct size it can just 491 * be copied. If userspace provides an older struct, only the bits that 492 * userspace knows about will be copied. If userspace provides a new 493 * struct, only the bits that the kernel knows about will be copied. 494 */ 495 return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL); 496 } 497 498 static bool pidfs_ioctl_valid(unsigned int cmd) 499 { 500 switch (cmd) { 501 case FS_IOC_GETVERSION: 502 case PIDFD_GET_CGROUP_NAMESPACE: 503 case PIDFD_GET_IPC_NAMESPACE: 504 case PIDFD_GET_MNT_NAMESPACE: 505 case PIDFD_GET_NET_NAMESPACE: 506 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 507 case PIDFD_GET_TIME_NAMESPACE: 508 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 509 case PIDFD_GET_UTS_NAMESPACE: 510 case PIDFD_GET_USER_NAMESPACE: 511 case PIDFD_GET_PID_NAMESPACE: 512 return true; 513 } 514 515 /* Extensible ioctls require some more careful checks. */ 516 switch (_IOC_NR(cmd)) { 517 case _IOC_NR(PIDFD_GET_INFO): 518 /* 519 * Try to prevent performing a pidfd ioctl when someone 520 * erronously mistook the file descriptor for a pidfd. 521 * This is not perfect but will catch most cases. 522 */ 523 return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); 524 } 525 526 return false; 527 } 528 529 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 530 { 531 struct task_struct *task __free(put_task) = NULL; 532 struct nsproxy *nsp __free(put_nsproxy) = NULL; 533 struct ns_common *ns_common = NULL; 534 535 if (!pidfs_ioctl_valid(cmd)) 536 return -ENOIOCTLCMD; 537 538 if (cmd == FS_IOC_GETVERSION) { 539 if (!arg) 540 return -EINVAL; 541 542 __u32 __user *argp = (__u32 __user *)arg; 543 return put_user(file_inode(file)->i_generation, argp); 544 } 545 546 /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ 547 if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) 548 return pidfd_info(file, cmd, arg); 549 550 task = get_pid_task(pidfd_pid(file), PIDTYPE_PID); 551 if (!task) 552 return -ESRCH; 553 554 if (arg) 555 return -EINVAL; 556 557 scoped_guard(task_lock, task) { 558 nsp = task->nsproxy; 559 if (nsp) 560 get_nsproxy(nsp); 561 } 562 if (!nsp) 563 return -ESRCH; /* just pretend it didn't exist */ 564 565 /* 566 * We're trying to open a file descriptor to the namespace so perform a 567 * filesystem cred ptrace check. Also, we mirror nsfs behavior. 568 */ 569 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) 570 return -EACCES; 571 572 switch (cmd) { 573 /* Namespaces that hang of nsproxy. */ 574 case PIDFD_GET_CGROUP_NAMESPACE: 575 #ifdef CONFIG_CGROUPS 576 if (!ns_ref_get(nsp->cgroup_ns)) 577 break; 578 ns_common = to_ns_common(nsp->cgroup_ns); 579 #endif 580 break; 581 case PIDFD_GET_IPC_NAMESPACE: 582 #ifdef CONFIG_IPC_NS 583 if (!ns_ref_get(nsp->ipc_ns)) 584 break; 585 ns_common = to_ns_common(nsp->ipc_ns); 586 #endif 587 break; 588 case PIDFD_GET_MNT_NAMESPACE: 589 if (!ns_ref_get(nsp->mnt_ns)) 590 break; 591 ns_common = to_ns_common(nsp->mnt_ns); 592 break; 593 case PIDFD_GET_NET_NAMESPACE: 594 #ifdef CONFIG_NET_NS 595 if (!ns_ref_get(nsp->net_ns)) 596 break; 597 ns_common = to_ns_common(nsp->net_ns); 598 #endif 599 break; 600 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 601 #ifdef CONFIG_PID_NS 602 if (!ns_ref_get(nsp->pid_ns_for_children)) 603 break; 604 ns_common = to_ns_common(nsp->pid_ns_for_children); 605 #endif 606 break; 607 case PIDFD_GET_TIME_NAMESPACE: 608 #ifdef CONFIG_TIME_NS 609 if (!ns_ref_get(nsp->time_ns)) 610 break; 611 ns_common = to_ns_common(nsp->time_ns); 612 #endif 613 break; 614 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 615 #ifdef CONFIG_TIME_NS 616 if (!ns_ref_get(nsp->time_ns_for_children)) 617 break; 618 ns_common = to_ns_common(nsp->time_ns_for_children); 619 #endif 620 break; 621 case PIDFD_GET_UTS_NAMESPACE: 622 #ifdef CONFIG_UTS_NS 623 if (!ns_ref_get(nsp->uts_ns)) 624 break; 625 ns_common = to_ns_common(nsp->uts_ns); 626 #endif 627 break; 628 /* Namespaces that don't hang of nsproxy. */ 629 case PIDFD_GET_USER_NAMESPACE: 630 #ifdef CONFIG_USER_NS 631 scoped_guard(rcu) { 632 struct user_namespace *user_ns; 633 634 user_ns = task_cred_xxx(task, user_ns); 635 if (ns_ref_get(user_ns)) 636 ns_common = to_ns_common(user_ns); 637 } 638 #endif 639 break; 640 case PIDFD_GET_PID_NAMESPACE: 641 #ifdef CONFIG_PID_NS 642 scoped_guard(rcu) { 643 struct pid_namespace *pid_ns; 644 645 pid_ns = task_active_pid_ns(task); 646 if (ns_ref_get(pid_ns)) 647 ns_common = to_ns_common(pid_ns); 648 } 649 #endif 650 break; 651 default: 652 return -ENOIOCTLCMD; 653 } 654 655 if (!ns_common) 656 return -EOPNOTSUPP; 657 658 /* open_namespace() unconditionally consumes the reference */ 659 return open_namespace(ns_common); 660 } 661 662 static int pidfs_file_release(struct inode *inode, struct file *file) 663 { 664 struct pid *pid = inode->i_private; 665 struct task_struct *task; 666 667 if (!(file->f_flags & PIDFD_AUTOKILL)) 668 return 0; 669 670 guard(rcu)(); 671 task = pid_task(pid, PIDTYPE_TGID); 672 if (!task) 673 return 0; 674 675 /* Not available for kthreads or user workers for now. */ 676 if (WARN_ON_ONCE(task->flags & (PF_KTHREAD | PF_USER_WORKER))) 677 return 0; 678 do_send_sig_info(SIGKILL, SEND_SIG_PRIV, task, PIDTYPE_TGID); 679 return 0; 680 } 681 682 static const struct file_operations pidfs_file_operations = { 683 .release = pidfs_file_release, 684 .poll = pidfd_poll, 685 #ifdef CONFIG_PROC_FS 686 .show_fdinfo = pidfd_show_fdinfo, 687 #endif 688 .unlocked_ioctl = pidfd_ioctl, 689 .compat_ioctl = compat_ptr_ioctl, 690 }; 691 692 struct pid *pidfd_pid(const struct file *file) 693 { 694 if (file->f_op != &pidfs_file_operations) 695 return ERR_PTR(-EBADF); 696 return file_inode(file)->i_private; 697 } 698 699 /* 700 * We're called from release_task(). We know there's at least one 701 * reference to struct pid being held that won't be released until the 702 * task has been reaped which cannot happen until we're out of 703 * release_task(). 704 * 705 * If this struct pid has at least once been referred to by a pidfd then 706 * pid->attr will be allocated. If not we mark the struct pid as dead so 707 * anyone who is trying to register it with pidfs will fail to do so. 708 * Otherwise we would hand out pidfs for reaped tasks without having 709 * exit information available. 710 * 711 * Worst case is that we've filled in the info and the pid gets freed 712 * right away in free_pid() when no one holds a pidfd anymore. Since 713 * pidfs_exit() currently is placed after exit_task_work() we know that 714 * it cannot be us aka the exiting task holding a pidfd to itself. 715 */ 716 void pidfs_exit(struct task_struct *tsk) 717 { 718 struct pid *pid = task_pid(tsk); 719 struct pidfs_attr *attr; 720 #ifdef CONFIG_CGROUPS 721 struct cgroup *cgrp; 722 #endif 723 724 might_sleep(); 725 726 /* Synchronize with pidfs_register_pid(). */ 727 scoped_guard(spinlock_irq, &pid->wait_pidfd.lock) { 728 attr = pid->attr; 729 if (!attr) { 730 /* 731 * No one ever held a pidfd for this struct pid. 732 * Mark it as dead so no one can add a pidfs 733 * entry anymore. We're about to be reaped and 734 * so no exit information would be available. 735 */ 736 pid->attr = PIDFS_PID_DEAD; 737 return; 738 } 739 } 740 741 /* 742 * If @pid->attr is set someone might still legitimately hold a 743 * pidfd to @pid or someone might concurrently still be getting 744 * a reference to an already stashed dentry from @pid->stashed. 745 * So defer cleaning @pid->attr until the last reference to @pid 746 * is put 747 */ 748 749 #ifdef CONFIG_CGROUPS 750 rcu_read_lock(); 751 cgrp = task_dfl_cgroup(tsk); 752 attr->cgroupid = cgroup_id(cgrp); 753 rcu_read_unlock(); 754 #endif 755 attr->exit_code = tsk->exit_code; 756 757 /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ 758 smp_wmb(); 759 set_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask); 760 } 761 762 #ifdef CONFIG_COREDUMP 763 void pidfs_coredump(const struct coredump_params *cprm) 764 { 765 struct pid *pid = cprm->pid; 766 struct pidfs_attr *attr; 767 768 attr = READ_ONCE(pid->attr); 769 770 VFS_WARN_ON_ONCE(!attr); 771 VFS_WARN_ON_ONCE(attr == PIDFS_PID_DEAD); 772 773 /* Note how we were coredumped and that we coredumped. */ 774 attr->coredump_mask = pidfs_coredump_mask(cprm->dumpable) | 775 PIDFD_COREDUMPED; 776 /* If coredumping is set to skip we should never end up here. */ 777 VFS_WARN_ON_ONCE(attr->coredump_mask & PIDFD_COREDUMP_SKIP); 778 /* Expose the signal number and code that caused the coredump. */ 779 attr->coredump_signal = cprm->siginfo->si_signo; 780 attr->coredump_code = cprm->siginfo->si_code; 781 smp_wmb(); 782 set_bit(PIDFS_ATTR_BIT_COREDUMP, &attr->attr_mask); 783 } 784 #endif 785 786 static struct vfsmount *pidfs_mnt __ro_after_init; 787 788 /* 789 * The vfs falls back to simple_setattr() if i_op->setattr() isn't 790 * implemented. Let's reject it completely until we have a clean 791 * permission concept for pidfds. 792 */ 793 static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 794 struct iattr *attr) 795 { 796 return anon_inode_setattr(idmap, dentry, attr); 797 } 798 799 static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, 800 struct kstat *stat, u32 request_mask, 801 unsigned int query_flags) 802 { 803 return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); 804 } 805 806 static ssize_t pidfs_listxattr(struct dentry *dentry, char *buf, size_t size) 807 { 808 struct inode *inode = d_inode(dentry); 809 struct pid *pid = inode->i_private; 810 811 return simple_xattr_list(inode, &pid->attr->xattrs, buf, size); 812 } 813 814 static const struct inode_operations pidfs_inode_operations = { 815 .getattr = pidfs_getattr, 816 .setattr = pidfs_setattr, 817 .listxattr = pidfs_listxattr, 818 }; 819 820 static void pidfs_evict_inode(struct inode *inode) 821 { 822 struct pid *pid = inode->i_private; 823 824 clear_inode(inode); 825 put_pid(pid); 826 } 827 828 static const struct super_operations pidfs_sops = { 829 .drop_inode = inode_just_drop, 830 .evict_inode = pidfs_evict_inode, 831 .statfs = simple_statfs, 832 }; 833 834 /* 835 * 'lsof' has knowledge of out historical anon_inode use, and expects 836 * the pidfs dentry name to start with 'anon_inode'. 837 */ 838 static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) 839 { 840 return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); 841 } 842 843 const struct dentry_operations pidfs_dentry_operations = { 844 .d_dname = pidfs_dname, 845 .d_prune = stashed_dentry_prune, 846 }; 847 848 static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, 849 struct inode *parent) 850 { 851 const struct pid *pid = inode->i_private; 852 853 if (*max_len < 2) { 854 *max_len = 2; 855 return FILEID_INVALID; 856 } 857 858 *max_len = 2; 859 *(u64 *)fh = pid->ino; 860 return FILEID_KERNFS; 861 } 862 863 /* Find a struct pid based on the inode number. */ 864 static struct pid *pidfs_ino_get_pid(u64 ino) 865 { 866 struct pid *pid; 867 struct pidfs_attr *attr; 868 869 guard(rcu)(); 870 pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params); 871 if (!pid) 872 return NULL; 873 attr = READ_ONCE(pid->attr); 874 if (IS_ERR_OR_NULL(attr)) 875 return NULL; 876 if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) 877 return NULL; 878 /* Within our pid namespace hierarchy? */ 879 if (pid_vnr(pid) == 0) 880 return NULL; 881 return get_pid(pid); 882 } 883 884 static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, 885 struct fid *fid, int fh_len, 886 int fh_type) 887 { 888 int ret; 889 u64 pid_ino; 890 struct path path; 891 struct pid *pid; 892 893 if (fh_len < 2) 894 return NULL; 895 896 switch (fh_type) { 897 case FILEID_KERNFS: 898 pid_ino = *(u64 *)fid; 899 break; 900 default: 901 return NULL; 902 } 903 904 pid = pidfs_ino_get_pid(pid_ino); 905 if (!pid) 906 return NULL; 907 908 ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); 909 if (ret < 0) 910 return ERR_PTR(ret); 911 912 VFS_WARN_ON_ONCE(!pid->attr); 913 914 mntput(path.mnt); 915 return path.dentry; 916 } 917 918 /* 919 * Make sure that we reject any nonsensical flags that users pass via 920 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and 921 * PIDFD_NONBLOCK as O_NONBLOCK. 922 */ 923 #define VALID_FILE_HANDLE_OPEN_FLAGS \ 924 (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) 925 926 static int pidfs_export_permission(struct handle_to_path_ctx *ctx, 927 unsigned int oflags) 928 { 929 if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) 930 return -EINVAL; 931 932 /* 933 * pidfd_ino_get_pid() will verify that the struct pid is part 934 * of the caller's pid namespace hierarchy. No further 935 * permission checks are needed. 936 */ 937 return 0; 938 } 939 940 static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) 941 { 942 /* 943 * Clear O_LARGEFILE as open_by_handle_at() forces it and raise 944 * O_RDWR as pidfds always are. 945 */ 946 oflags &= ~O_LARGEFILE; 947 return dentry_open(path, oflags | O_RDWR, current_cred()); 948 } 949 950 static const struct export_operations pidfs_export_operations = { 951 .encode_fh = pidfs_encode_fh, 952 .fh_to_dentry = pidfs_fh_to_dentry, 953 .open = pidfs_export_open, 954 .permission = pidfs_export_permission, 955 }; 956 957 static int pidfs_init_inode(struct inode *inode, void *data) 958 { 959 const struct pid *pid = data; 960 961 inode->i_private = data; 962 inode->i_flags |= S_PRIVATE | S_ANON_INODE; 963 /* We allow to set xattrs. */ 964 inode->i_flags &= ~S_IMMUTABLE; 965 inode->i_mode |= S_IRWXU; 966 inode->i_op = &pidfs_inode_operations; 967 inode->i_fop = &pidfs_file_operations; 968 inode->i_ino = pidfs_ino(pid->ino); 969 inode->i_generation = pidfs_gen(pid->ino); 970 return 0; 971 } 972 973 static void pidfs_put_data(void *data) 974 { 975 struct pid *pid = data; 976 put_pid(pid); 977 } 978 979 /** 980 * pidfs_register_pid_gfp - register a struct pid in pidfs with custom GFP 981 * flags 982 * @pid: pid to pin 983 * @gfp: GFP flags for memory allocation 984 * 985 * Register a struct pid in pidfs with custom GFP flags. 986 * 987 * Return: On success zero, on error a negative error code is returned. 988 */ 989 int pidfs_register_pid_gfp(struct pid *pid, gfp_t gfp) 990 { 991 struct pidfs_attr *new_attr __free(kfree) = NULL; 992 struct pidfs_attr *attr; 993 994 might_sleep(); 995 996 if (!pid) 997 return 0; 998 999 attr = READ_ONCE(pid->attr); 1000 if (unlikely(attr == PIDFS_PID_DEAD)) 1001 return PTR_ERR(PIDFS_PID_DEAD); 1002 if (attr) 1003 return 0; 1004 1005 new_attr = kmem_cache_zalloc(pidfs_attr_cachep, gfp); 1006 if (!new_attr) 1007 return -ENOMEM; 1008 1009 INIT_LIST_HEAD_RCU(&new_attr->xattrs); 1010 1011 /* Synchronize with pidfs_exit(). */ 1012 guard(spinlock_irq)(&pid->wait_pidfd.lock); 1013 1014 attr = pid->attr; 1015 if (unlikely(attr == PIDFS_PID_DEAD)) 1016 return PTR_ERR(PIDFS_PID_DEAD); 1017 if (unlikely(attr)) 1018 return 0; 1019 1020 pid->attr = no_free_ptr(new_attr); 1021 return 0; 1022 } 1023 1024 static struct dentry *pidfs_stash_dentry(struct dentry **stashed, 1025 struct dentry *dentry) 1026 { 1027 int ret; 1028 struct pid *pid = d_inode(dentry)->i_private; 1029 1030 VFS_WARN_ON_ONCE(stashed != &pid->stashed); 1031 1032 ret = pidfs_register_pid(pid); 1033 if (ret) 1034 return ERR_PTR(ret); 1035 1036 return stash_dentry(stashed, dentry); 1037 } 1038 1039 static const struct stashed_operations pidfs_stashed_ops = { 1040 .stash_dentry = pidfs_stash_dentry, 1041 .init_inode = pidfs_init_inode, 1042 .put_data = pidfs_put_data, 1043 }; 1044 1045 static int pidfs_xattr_get(const struct xattr_handler *handler, 1046 struct dentry *unused, struct inode *inode, 1047 const char *suffix, void *value, size_t size) 1048 { 1049 struct pid *pid = inode->i_private; 1050 const char *name = xattr_full_name(handler, suffix); 1051 1052 return simple_xattr_get(&pidfs_xa_cache, &pid->attr->xattrs, name, value, size); 1053 } 1054 1055 static int pidfs_xattr_set(const struct xattr_handler *handler, 1056 struct mnt_idmap *idmap, struct dentry *unused, 1057 struct inode *inode, const char *suffix, 1058 const void *value, size_t size, int flags) 1059 { 1060 struct pid *pid = inode->i_private; 1061 const char *name = xattr_full_name(handler, suffix); 1062 struct simple_xattr *old_xattr; 1063 1064 /* Ensure we're the only one to set @attr->xattrs. */ 1065 WARN_ON_ONCE(!inode_is_locked(inode)); 1066 1067 old_xattr = simple_xattr_set(&pidfs_xa_cache, &pid->attr->xattrs, name, value, size, flags); 1068 if (IS_ERR(old_xattr)) 1069 return PTR_ERR(old_xattr); 1070 1071 simple_xattr_free_rcu(old_xattr); 1072 return 0; 1073 } 1074 1075 static const struct xattr_handler pidfs_trusted_xattr_handler = { 1076 .prefix = XATTR_TRUSTED_PREFIX, 1077 .get = pidfs_xattr_get, 1078 .set = pidfs_xattr_set, 1079 }; 1080 1081 static const struct xattr_handler *const pidfs_xattr_handlers[] = { 1082 &pidfs_trusted_xattr_handler, 1083 NULL 1084 }; 1085 1086 static int pidfs_init_fs_context(struct fs_context *fc) 1087 { 1088 struct pseudo_fs_context *ctx; 1089 1090 ctx = init_pseudo(fc, PID_FS_MAGIC); 1091 if (!ctx) 1092 return -ENOMEM; 1093 1094 ctx->s_d_flags |= DCACHE_DONTCACHE; 1095 ctx->ops = &pidfs_sops; 1096 ctx->eops = &pidfs_export_operations; 1097 ctx->dops = &pidfs_dentry_operations; 1098 ctx->xattr = pidfs_xattr_handlers; 1099 fc->s_fs_info = (void *)&pidfs_stashed_ops; 1100 return 0; 1101 } 1102 1103 static struct file_system_type pidfs_type = { 1104 .name = "pidfs", 1105 .init_fs_context = pidfs_init_fs_context, 1106 .kill_sb = kill_anon_super, 1107 }; 1108 1109 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) 1110 { 1111 struct file *pidfd_file; 1112 struct path path __free(path_put) = {}; 1113 int ret; 1114 1115 /* 1116 * Ensure that internal pidfd flags don't overlap with each 1117 * other or with uapi pidfd flags. 1118 */ 1119 BUILD_BUG_ON(hweight32(PIDFD_THREAD | PIDFD_NONBLOCK | 1120 PIDFD_STALE | PIDFD_AUTOKILL) != 4); 1121 1122 ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); 1123 if (ret < 0) 1124 return ERR_PTR(ret); 1125 1126 VFS_WARN_ON_ONCE(!pid->attr); 1127 1128 flags &= ~PIDFD_STALE; 1129 flags |= O_RDWR; 1130 pidfd_file = dentry_open(&path, flags, current_cred()); 1131 /* 1132 * Raise PIDFD_THREAD and PIDFD_AUTOKILL explicitly as 1133 * do_dentry_open() strips O_EXCL and O_TRUNC. 1134 */ 1135 if (!IS_ERR(pidfd_file)) 1136 pidfd_file->f_flags |= (flags & (PIDFD_THREAD | PIDFD_AUTOKILL)); 1137 1138 return pidfd_file; 1139 } 1140 1141 void __init pidfs_init(void) 1142 { 1143 if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params)) 1144 panic("Failed to initialize pidfs hashtable"); 1145 1146 pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, 1147 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | 1148 SLAB_ACCOUNT | SLAB_PANIC), NULL); 1149 1150 pidfs_mnt = kern_mount(&pidfs_type); 1151 if (IS_ERR(pidfs_mnt)) 1152 panic("Failed to mount pidfs pseudo filesystem"); 1153 1154 pidfs_root_path.mnt = pidfs_mnt; 1155 pidfs_root_path.dentry = pidfs_mnt->mnt_root; 1156 } 1157