1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/anon_inodes.h> 3 #include <linux/exportfs.h> 4 #include <linux/file.h> 5 #include <linux/fs.h> 6 #include <linux/cgroup.h> 7 #include <linux/magic.h> 8 #include <linux/mount.h> 9 #include <linux/pid.h> 10 #include <linux/pidfs.h> 11 #include <linux/pid_namespace.h> 12 #include <linux/poll.h> 13 #include <linux/proc_fs.h> 14 #include <linux/proc_ns.h> 15 #include <linux/pseudo_fs.h> 16 #include <linux/ptrace.h> 17 #include <linux/seq_file.h> 18 #include <uapi/linux/pidfd.h> 19 #include <linux/ipc_namespace.h> 20 #include <linux/time_namespace.h> 21 #include <linux/utsname.h> 22 #include <net/net_namespace.h> 23 #include <linux/coredump.h> 24 25 #include "internal.h" 26 #include "mount.h" 27 28 static struct kmem_cache *pidfs_cachep __ro_after_init; 29 30 /* 31 * Stashes information that userspace needs to access even after the 32 * process has been reaped. 33 */ 34 struct pidfs_exit_info { 35 __u64 cgroupid; 36 __s32 exit_code; 37 __u32 coredump_mask; 38 }; 39 40 struct pidfs_inode { 41 struct pidfs_exit_info __pei; 42 struct pidfs_exit_info *exit_info; 43 struct inode vfs_inode; 44 }; 45 46 static inline struct pidfs_inode *pidfs_i(struct inode *inode) 47 { 48 return container_of(inode, struct pidfs_inode, vfs_inode); 49 } 50 51 static struct rb_root pidfs_ino_tree = RB_ROOT; 52 53 #if BITS_PER_LONG == 32 54 static inline unsigned long pidfs_ino(u64 ino) 55 { 56 return lower_32_bits(ino); 57 } 58 59 /* On 32 bit the generation number are the upper 32 bits. */ 60 static inline u32 pidfs_gen(u64 ino) 61 { 62 return upper_32_bits(ino); 63 } 64 65 #else 66 67 /* On 64 bit simply return ino. */ 68 static inline unsigned long pidfs_ino(u64 ino) 69 { 70 return ino; 71 } 72 73 /* On 64 bit the generation number is 0. */ 74 static inline u32 pidfs_gen(u64 ino) 75 { 76 return 0; 77 } 78 #endif 79 80 static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) 81 { 82 struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); 83 struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); 84 u64 pid_ino_a = pid_a->ino; 85 u64 pid_ino_b = pid_b->ino; 86 87 if (pid_ino_a < pid_ino_b) 88 return -1; 89 if (pid_ino_a > pid_ino_b) 90 return 1; 91 return 0; 92 } 93 94 void pidfs_add_pid(struct pid *pid) 95 { 96 static u64 pidfs_ino_nr = 2; 97 98 /* 99 * On 64 bit nothing special happens. The 64bit number assigned 100 * to struct pid is the inode number. 101 * 102 * On 32 bit the 64 bit number assigned to struct pid is split 103 * into two 32 bit numbers. The lower 32 bits are used as the 104 * inode number and the upper 32 bits are used as the inode 105 * generation number. 106 * 107 * On 32 bit pidfs_ino() will return the lower 32 bit. When 108 * pidfs_ino() returns zero a wrap around happened. When a 109 * wraparound happens the 64 bit number will be incremented by 2 110 * so inode numbering starts at 2 again. 111 * 112 * On 64 bit comparing two pidfds is as simple as comparing 113 * inode numbers. 114 * 115 * When a wraparound happens on 32 bit multiple pidfds with the 116 * same inode number are likely to exist (This isn't a problem 117 * since before pidfs pidfds used the anonymous inode meaning 118 * all pidfds had the same inode number.). Userspace can 119 * reconstruct the 64 bit identifier by retrieving both the 120 * inode number and the inode generation number to compare or 121 * use file handles. 122 */ 123 if (pidfs_ino(pidfs_ino_nr) == 0) 124 pidfs_ino_nr += 2; 125 126 pid->ino = pidfs_ino_nr; 127 pid->stashed = NULL; 128 pidfs_ino_nr++; 129 130 write_seqcount_begin(&pidmap_lock_seq); 131 rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); 132 write_seqcount_end(&pidmap_lock_seq); 133 } 134 135 void pidfs_remove_pid(struct pid *pid) 136 { 137 write_seqcount_begin(&pidmap_lock_seq); 138 rb_erase(&pid->pidfs_node, &pidfs_ino_tree); 139 write_seqcount_end(&pidmap_lock_seq); 140 } 141 142 #ifdef CONFIG_PROC_FS 143 /** 144 * pidfd_show_fdinfo - print information about a pidfd 145 * @m: proc fdinfo file 146 * @f: file referencing a pidfd 147 * 148 * Pid: 149 * This function will print the pid that a given pidfd refers to in the 150 * pid namespace of the procfs instance. 151 * If the pid namespace of the process is not a descendant of the pid 152 * namespace of the procfs instance 0 will be shown as its pid. This is 153 * similar to calling getppid() on a process whose parent is outside of 154 * its pid namespace. 155 * 156 * NSpid: 157 * If pid namespaces are supported then this function will also print 158 * the pid of a given pidfd refers to for all descendant pid namespaces 159 * starting from the current pid namespace of the instance, i.e. the 160 * Pid field and the first entry in the NSpid field will be identical. 161 * If the pid namespace of the process is not a descendant of the pid 162 * namespace of the procfs instance 0 will be shown as its first NSpid 163 * entry and no others will be shown. 164 * Note that this differs from the Pid and NSpid fields in 165 * /proc/<pid>/status where Pid and NSpid are always shown relative to 166 * the pid namespace of the procfs instance. The difference becomes 167 * obvious when sending around a pidfd between pid namespaces from a 168 * different branch of the tree, i.e. where no ancestral relation is 169 * present between the pid namespaces: 170 * - create two new pid namespaces ns1 and ns2 in the initial pid 171 * namespace (also take care to create new mount namespaces in the 172 * new pid namespace and mount procfs) 173 * - create a process with a pidfd in ns1 174 * - send pidfd from ns1 to ns2 175 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid 176 * have exactly one entry, which is 0 177 */ 178 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) 179 { 180 struct pid *pid = pidfd_pid(f); 181 struct pid_namespace *ns; 182 pid_t nr = -1; 183 184 if (likely(pid_has_task(pid, PIDTYPE_PID))) { 185 ns = proc_pid_ns(file_inode(m->file)->i_sb); 186 nr = pid_nr_ns(pid, ns); 187 } 188 189 seq_put_decimal_ll(m, "Pid:\t", nr); 190 191 #ifdef CONFIG_PID_NS 192 seq_put_decimal_ll(m, "\nNSpid:\t", nr); 193 if (nr > 0) { 194 int i; 195 196 /* If nr is non-zero it means that 'pid' is valid and that 197 * ns, i.e. the pid namespace associated with the procfs 198 * instance, is in the pid namespace hierarchy of pid. 199 * Start at one below the already printed level. 200 */ 201 for (i = ns->level + 1; i <= pid->level; i++) 202 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); 203 } 204 #endif 205 seq_putc(m, '\n'); 206 } 207 #endif 208 209 /* 210 * Poll support for process exit notification. 211 */ 212 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) 213 { 214 struct pid *pid = pidfd_pid(file); 215 struct task_struct *task; 216 __poll_t poll_flags = 0; 217 218 poll_wait(file, &pid->wait_pidfd, pts); 219 /* 220 * Don't wake waiters if the thread-group leader exited 221 * prematurely. They either get notified when the last subthread 222 * exits or not at all if one of the remaining subthreads execs 223 * and assumes the struct pid of the old thread-group leader. 224 */ 225 guard(rcu)(); 226 task = pid_task(pid, PIDTYPE_PID); 227 if (!task) 228 poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; 229 else if (task->exit_state && !delay_group_leader(task)) 230 poll_flags = EPOLLIN | EPOLLRDNORM; 231 232 return poll_flags; 233 } 234 235 static inline bool pid_in_current_pidns(const struct pid *pid) 236 { 237 const struct pid_namespace *ns = task_active_pid_ns(current); 238 239 if (ns->level <= pid->level) 240 return pid->numbers[ns->level].ns == ns; 241 242 return false; 243 } 244 245 static __u32 pidfs_coredump_mask(unsigned long mm_flags) 246 { 247 switch (__get_dumpable(mm_flags)) { 248 case SUID_DUMP_USER: 249 return PIDFD_COREDUMP_USER; 250 case SUID_DUMP_ROOT: 251 return PIDFD_COREDUMP_ROOT; 252 case SUID_DUMP_DISABLE: 253 return PIDFD_COREDUMP_SKIP; 254 default: 255 WARN_ON_ONCE(true); 256 } 257 258 return 0; 259 } 260 261 static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) 262 { 263 struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; 264 struct inode *inode = file_inode(file); 265 struct pid *pid = pidfd_pid(file); 266 size_t usize = _IOC_SIZE(cmd); 267 struct pidfd_info kinfo = {}; 268 struct pidfs_exit_info *exit_info; 269 struct user_namespace *user_ns; 270 struct task_struct *task; 271 const struct cred *c; 272 __u64 mask; 273 274 if (!uinfo) 275 return -EINVAL; 276 if (usize < PIDFD_INFO_SIZE_VER0) 277 return -EINVAL; /* First version, no smaller struct possible */ 278 279 if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) 280 return -EFAULT; 281 282 /* 283 * Restrict information retrieval to tasks within the caller's pid 284 * namespace hierarchy. 285 */ 286 if (!pid_in_current_pidns(pid)) 287 return -ESRCH; 288 289 if (mask & PIDFD_INFO_EXIT) { 290 exit_info = READ_ONCE(pidfs_i(inode)->exit_info); 291 if (exit_info) { 292 kinfo.mask |= PIDFD_INFO_EXIT; 293 #ifdef CONFIG_CGROUPS 294 kinfo.cgroupid = exit_info->cgroupid; 295 kinfo.mask |= PIDFD_INFO_CGROUPID; 296 #endif 297 kinfo.exit_code = exit_info->exit_code; 298 } 299 } 300 301 if (mask & PIDFD_INFO_COREDUMP) { 302 kinfo.mask |= PIDFD_INFO_COREDUMP; 303 kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask); 304 } 305 306 task = get_pid_task(pid, PIDTYPE_PID); 307 if (!task) { 308 /* 309 * If the task has already been reaped, only exit 310 * information is available 311 */ 312 if (!(mask & PIDFD_INFO_EXIT)) 313 return -ESRCH; 314 315 goto copy_out; 316 } 317 318 c = get_task_cred(task); 319 if (!c) 320 return -ESRCH; 321 322 if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) { 323 task_lock(task); 324 if (task->mm) 325 kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags); 326 task_unlock(task); 327 } 328 329 /* Unconditionally return identifiers and credentials, the rest only on request */ 330 331 user_ns = current_user_ns(); 332 kinfo.ruid = from_kuid_munged(user_ns, c->uid); 333 kinfo.rgid = from_kgid_munged(user_ns, c->gid); 334 kinfo.euid = from_kuid_munged(user_ns, c->euid); 335 kinfo.egid = from_kgid_munged(user_ns, c->egid); 336 kinfo.suid = from_kuid_munged(user_ns, c->suid); 337 kinfo.sgid = from_kgid_munged(user_ns, c->sgid); 338 kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); 339 kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); 340 kinfo.mask |= PIDFD_INFO_CREDS; 341 put_cred(c); 342 343 #ifdef CONFIG_CGROUPS 344 if (!kinfo.cgroupid) { 345 struct cgroup *cgrp; 346 347 rcu_read_lock(); 348 cgrp = task_dfl_cgroup(task); 349 kinfo.cgroupid = cgroup_id(cgrp); 350 kinfo.mask |= PIDFD_INFO_CGROUPID; 351 rcu_read_unlock(); 352 } 353 #endif 354 355 /* 356 * Copy pid/tgid last, to reduce the chances the information might be 357 * stale. Note that it is not possible to ensure it will be valid as the 358 * task might return as soon as the copy_to_user finishes, but that's ok 359 * and userspace expects that might happen and can act accordingly, so 360 * this is just best-effort. What we can do however is checking that all 361 * the fields are set correctly, or return ESRCH to avoid providing 362 * incomplete information. */ 363 364 kinfo.ppid = task_ppid_nr_ns(task, NULL); 365 kinfo.tgid = task_tgid_vnr(task); 366 kinfo.pid = task_pid_vnr(task); 367 kinfo.mask |= PIDFD_INFO_PID; 368 369 if (kinfo.pid == 0 || kinfo.tgid == 0) 370 return -ESRCH; 371 372 copy_out: 373 /* 374 * If userspace and the kernel have the same struct size it can just 375 * be copied. If userspace provides an older struct, only the bits that 376 * userspace knows about will be copied. If userspace provides a new 377 * struct, only the bits that the kernel knows about will be copied. 378 */ 379 return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL); 380 } 381 382 static bool pidfs_ioctl_valid(unsigned int cmd) 383 { 384 switch (cmd) { 385 case FS_IOC_GETVERSION: 386 case PIDFD_GET_CGROUP_NAMESPACE: 387 case PIDFD_GET_IPC_NAMESPACE: 388 case PIDFD_GET_MNT_NAMESPACE: 389 case PIDFD_GET_NET_NAMESPACE: 390 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 391 case PIDFD_GET_TIME_NAMESPACE: 392 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 393 case PIDFD_GET_UTS_NAMESPACE: 394 case PIDFD_GET_USER_NAMESPACE: 395 case PIDFD_GET_PID_NAMESPACE: 396 return true; 397 } 398 399 /* Extensible ioctls require some more careful checks. */ 400 switch (_IOC_NR(cmd)) { 401 case _IOC_NR(PIDFD_GET_INFO): 402 /* 403 * Try to prevent performing a pidfd ioctl when someone 404 * erronously mistook the file descriptor for a pidfd. 405 * This is not perfect but will catch most cases. 406 */ 407 return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); 408 } 409 410 return false; 411 } 412 413 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 414 { 415 struct task_struct *task __free(put_task) = NULL; 416 struct nsproxy *nsp __free(put_nsproxy) = NULL; 417 struct ns_common *ns_common = NULL; 418 struct pid_namespace *pid_ns; 419 420 if (!pidfs_ioctl_valid(cmd)) 421 return -ENOIOCTLCMD; 422 423 if (cmd == FS_IOC_GETVERSION) { 424 if (!arg) 425 return -EINVAL; 426 427 __u32 __user *argp = (__u32 __user *)arg; 428 return put_user(file_inode(file)->i_generation, argp); 429 } 430 431 /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ 432 if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) 433 return pidfd_info(file, cmd, arg); 434 435 task = get_pid_task(pidfd_pid(file), PIDTYPE_PID); 436 if (!task) 437 return -ESRCH; 438 439 if (arg) 440 return -EINVAL; 441 442 scoped_guard(task_lock, task) { 443 nsp = task->nsproxy; 444 if (nsp) 445 get_nsproxy(nsp); 446 } 447 if (!nsp) 448 return -ESRCH; /* just pretend it didn't exist */ 449 450 /* 451 * We're trying to open a file descriptor to the namespace so perform a 452 * filesystem cred ptrace check. Also, we mirror nsfs behavior. 453 */ 454 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) 455 return -EACCES; 456 457 switch (cmd) { 458 /* Namespaces that hang of nsproxy. */ 459 case PIDFD_GET_CGROUP_NAMESPACE: 460 if (IS_ENABLED(CONFIG_CGROUPS)) { 461 get_cgroup_ns(nsp->cgroup_ns); 462 ns_common = to_ns_common(nsp->cgroup_ns); 463 } 464 break; 465 case PIDFD_GET_IPC_NAMESPACE: 466 if (IS_ENABLED(CONFIG_IPC_NS)) { 467 get_ipc_ns(nsp->ipc_ns); 468 ns_common = to_ns_common(nsp->ipc_ns); 469 } 470 break; 471 case PIDFD_GET_MNT_NAMESPACE: 472 get_mnt_ns(nsp->mnt_ns); 473 ns_common = to_ns_common(nsp->mnt_ns); 474 break; 475 case PIDFD_GET_NET_NAMESPACE: 476 if (IS_ENABLED(CONFIG_NET_NS)) { 477 ns_common = to_ns_common(nsp->net_ns); 478 get_net_ns(ns_common); 479 } 480 break; 481 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 482 if (IS_ENABLED(CONFIG_PID_NS)) { 483 get_pid_ns(nsp->pid_ns_for_children); 484 ns_common = to_ns_common(nsp->pid_ns_for_children); 485 } 486 break; 487 case PIDFD_GET_TIME_NAMESPACE: 488 if (IS_ENABLED(CONFIG_TIME_NS)) { 489 get_time_ns(nsp->time_ns); 490 ns_common = to_ns_common(nsp->time_ns); 491 } 492 break; 493 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 494 if (IS_ENABLED(CONFIG_TIME_NS)) { 495 get_time_ns(nsp->time_ns_for_children); 496 ns_common = to_ns_common(nsp->time_ns_for_children); 497 } 498 break; 499 case PIDFD_GET_UTS_NAMESPACE: 500 if (IS_ENABLED(CONFIG_UTS_NS)) { 501 get_uts_ns(nsp->uts_ns); 502 ns_common = to_ns_common(nsp->uts_ns); 503 } 504 break; 505 /* Namespaces that don't hang of nsproxy. */ 506 case PIDFD_GET_USER_NAMESPACE: 507 if (IS_ENABLED(CONFIG_USER_NS)) { 508 rcu_read_lock(); 509 ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); 510 rcu_read_unlock(); 511 } 512 break; 513 case PIDFD_GET_PID_NAMESPACE: 514 if (IS_ENABLED(CONFIG_PID_NS)) { 515 rcu_read_lock(); 516 pid_ns = task_active_pid_ns(task); 517 if (pid_ns) 518 ns_common = to_ns_common(get_pid_ns(pid_ns)); 519 rcu_read_unlock(); 520 } 521 break; 522 default: 523 return -ENOIOCTLCMD; 524 } 525 526 if (!ns_common) 527 return -EOPNOTSUPP; 528 529 /* open_namespace() unconditionally consumes the reference */ 530 return open_namespace(ns_common); 531 } 532 533 static const struct file_operations pidfs_file_operations = { 534 .poll = pidfd_poll, 535 #ifdef CONFIG_PROC_FS 536 .show_fdinfo = pidfd_show_fdinfo, 537 #endif 538 .unlocked_ioctl = pidfd_ioctl, 539 .compat_ioctl = compat_ptr_ioctl, 540 }; 541 542 struct pid *pidfd_pid(const struct file *file) 543 { 544 if (file->f_op != &pidfs_file_operations) 545 return ERR_PTR(-EBADF); 546 return file_inode(file)->i_private; 547 } 548 549 /* 550 * We're called from release_task(). We know there's at least one 551 * reference to struct pid being held that won't be released until the 552 * task has been reaped which cannot happen until we're out of 553 * release_task(). 554 * 555 * If this struct pid is referred to by a pidfd then 556 * stashed_dentry_get() will return the dentry and inode for that struct 557 * pid. Since we've taken a reference on it there's now an additional 558 * reference from the exit path on it. Which is fine. We're going to put 559 * it again in a second and we know that the pid is kept alive anyway. 560 * 561 * Worst case is that we've filled in the info and immediately free the 562 * dentry and inode afterwards since the pidfd has been closed. Since 563 * pidfs_exit() currently is placed after exit_task_work() we know that 564 * it cannot be us aka the exiting task holding a pidfd to ourselves. 565 */ 566 void pidfs_exit(struct task_struct *tsk) 567 { 568 struct dentry *dentry; 569 570 might_sleep(); 571 572 dentry = stashed_dentry_get(&task_pid(tsk)->stashed); 573 if (dentry) { 574 struct inode *inode = d_inode(dentry); 575 struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; 576 #ifdef CONFIG_CGROUPS 577 struct cgroup *cgrp; 578 579 rcu_read_lock(); 580 cgrp = task_dfl_cgroup(tsk); 581 exit_info->cgroupid = cgroup_id(cgrp); 582 rcu_read_unlock(); 583 #endif 584 exit_info->exit_code = tsk->exit_code; 585 586 /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ 587 smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); 588 dput(dentry); 589 } 590 } 591 592 #ifdef CONFIG_COREDUMP 593 void pidfs_coredump(const struct coredump_params *cprm) 594 { 595 struct pid *pid = cprm->pid; 596 struct pidfs_exit_info *exit_info; 597 struct dentry *dentry; 598 struct inode *inode; 599 __u32 coredump_mask = 0; 600 601 dentry = pid->stashed; 602 if (WARN_ON_ONCE(!dentry)) 603 return; 604 605 inode = d_inode(dentry); 606 exit_info = &pidfs_i(inode)->__pei; 607 /* Note how we were coredumped. */ 608 coredump_mask = pidfs_coredump_mask(cprm->mm_flags); 609 /* Note that we actually did coredump. */ 610 coredump_mask |= PIDFD_COREDUMPED; 611 /* If coredumping is set to skip we should never end up here. */ 612 VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP); 613 smp_store_release(&exit_info->coredump_mask, coredump_mask); 614 } 615 #endif 616 617 static struct vfsmount *pidfs_mnt __ro_after_init; 618 619 /* 620 * The vfs falls back to simple_setattr() if i_op->setattr() isn't 621 * implemented. Let's reject it completely until we have a clean 622 * permission concept for pidfds. 623 */ 624 static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 625 struct iattr *attr) 626 { 627 return anon_inode_setattr(idmap, dentry, attr); 628 } 629 630 static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, 631 struct kstat *stat, u32 request_mask, 632 unsigned int query_flags) 633 { 634 return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); 635 } 636 637 static const struct inode_operations pidfs_inode_operations = { 638 .getattr = pidfs_getattr, 639 .setattr = pidfs_setattr, 640 }; 641 642 static void pidfs_evict_inode(struct inode *inode) 643 { 644 struct pid *pid = inode->i_private; 645 646 clear_inode(inode); 647 put_pid(pid); 648 } 649 650 static struct inode *pidfs_alloc_inode(struct super_block *sb) 651 { 652 struct pidfs_inode *pi; 653 654 pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL); 655 if (!pi) 656 return NULL; 657 658 memset(&pi->__pei, 0, sizeof(pi->__pei)); 659 pi->exit_info = NULL; 660 661 return &pi->vfs_inode; 662 } 663 664 static void pidfs_free_inode(struct inode *inode) 665 { 666 kmem_cache_free(pidfs_cachep, pidfs_i(inode)); 667 } 668 669 static const struct super_operations pidfs_sops = { 670 .alloc_inode = pidfs_alloc_inode, 671 .drop_inode = generic_delete_inode, 672 .evict_inode = pidfs_evict_inode, 673 .free_inode = pidfs_free_inode, 674 .statfs = simple_statfs, 675 }; 676 677 /* 678 * 'lsof' has knowledge of out historical anon_inode use, and expects 679 * the pidfs dentry name to start with 'anon_inode'. 680 */ 681 static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) 682 { 683 return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); 684 } 685 686 const struct dentry_operations pidfs_dentry_operations = { 687 .d_dname = pidfs_dname, 688 .d_prune = stashed_dentry_prune, 689 }; 690 691 static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, 692 struct inode *parent) 693 { 694 const struct pid *pid = inode->i_private; 695 696 if (*max_len < 2) { 697 *max_len = 2; 698 return FILEID_INVALID; 699 } 700 701 *max_len = 2; 702 *(u64 *)fh = pid->ino; 703 return FILEID_KERNFS; 704 } 705 706 static int pidfs_ino_find(const void *key, const struct rb_node *node) 707 { 708 const u64 pid_ino = *(u64 *)key; 709 const struct pid *pid = rb_entry(node, struct pid, pidfs_node); 710 711 if (pid_ino < pid->ino) 712 return -1; 713 if (pid_ino > pid->ino) 714 return 1; 715 return 0; 716 } 717 718 /* Find a struct pid based on the inode number. */ 719 static struct pid *pidfs_ino_get_pid(u64 ino) 720 { 721 struct pid *pid; 722 struct rb_node *node; 723 unsigned int seq; 724 725 guard(rcu)(); 726 do { 727 seq = read_seqcount_begin(&pidmap_lock_seq); 728 node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); 729 if (node) 730 break; 731 } while (read_seqcount_retry(&pidmap_lock_seq, seq)); 732 733 if (!node) 734 return NULL; 735 736 pid = rb_entry(node, struct pid, pidfs_node); 737 738 /* Within our pid namespace hierarchy? */ 739 if (pid_vnr(pid) == 0) 740 return NULL; 741 742 return get_pid(pid); 743 } 744 745 static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, 746 struct fid *fid, int fh_len, 747 int fh_type) 748 { 749 int ret; 750 u64 pid_ino; 751 struct path path; 752 struct pid *pid; 753 754 if (fh_len < 2) 755 return NULL; 756 757 switch (fh_type) { 758 case FILEID_KERNFS: 759 pid_ino = *(u64 *)fid; 760 break; 761 default: 762 return NULL; 763 } 764 765 pid = pidfs_ino_get_pid(pid_ino); 766 if (!pid) 767 return NULL; 768 769 ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); 770 if (ret < 0) 771 return ERR_PTR(ret); 772 773 mntput(path.mnt); 774 return path.dentry; 775 } 776 777 /* 778 * Make sure that we reject any nonsensical flags that users pass via 779 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and 780 * PIDFD_NONBLOCK as O_NONBLOCK. 781 */ 782 #define VALID_FILE_HANDLE_OPEN_FLAGS \ 783 (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) 784 785 static int pidfs_export_permission(struct handle_to_path_ctx *ctx, 786 unsigned int oflags) 787 { 788 if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) 789 return -EINVAL; 790 791 /* 792 * pidfd_ino_get_pid() will verify that the struct pid is part 793 * of the caller's pid namespace hierarchy. No further 794 * permission checks are needed. 795 */ 796 return 0; 797 } 798 799 static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, 800 unsigned int flags) 801 { 802 enum pid_type type; 803 804 if (flags & PIDFD_STALE) 805 return true; 806 807 /* 808 * Make sure that if a pidfd is created PIDFD_INFO_EXIT 809 * information will be available. So after an inode for the 810 * pidfd has been allocated perform another check that the pid 811 * is still alive. If it is exit information is available even 812 * if the task gets reaped before the pidfd is returned to 813 * userspace. The only exception are indicated by PIDFD_STALE: 814 * 815 * (1) The kernel is in the middle of task creation and thus no 816 * task linkage has been established yet. 817 * (2) The caller knows @pid has been registered in pidfs at a 818 * time when the task was still alive. 819 * 820 * In both cases exit information will have been reported. 821 */ 822 if (flags & PIDFD_THREAD) 823 type = PIDTYPE_PID; 824 else 825 type = PIDTYPE_TGID; 826 827 /* 828 * Since pidfs_exit() is called before struct pid's task linkage 829 * is removed the case where the task got reaped but a dentry 830 * was already attached to struct pid and exit information was 831 * recorded and published can be handled correctly. 832 */ 833 if (unlikely(!pid_has_task(pid, type))) { 834 struct inode *inode = d_inode(path->dentry); 835 return !!READ_ONCE(pidfs_i(inode)->exit_info); 836 } 837 838 return true; 839 } 840 841 static struct file *pidfs_export_open(struct path *path, unsigned int oflags) 842 { 843 if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags)) 844 return ERR_PTR(-ESRCH); 845 846 /* 847 * Clear O_LARGEFILE as open_by_handle_at() forces it and raise 848 * O_RDWR as pidfds always are. 849 */ 850 oflags &= ~O_LARGEFILE; 851 return dentry_open(path, oflags | O_RDWR, current_cred()); 852 } 853 854 static const struct export_operations pidfs_export_operations = { 855 .encode_fh = pidfs_encode_fh, 856 .fh_to_dentry = pidfs_fh_to_dentry, 857 .open = pidfs_export_open, 858 .permission = pidfs_export_permission, 859 }; 860 861 static int pidfs_init_inode(struct inode *inode, void *data) 862 { 863 const struct pid *pid = data; 864 865 inode->i_private = data; 866 inode->i_flags |= S_PRIVATE | S_ANON_INODE; 867 inode->i_mode |= S_IRWXU; 868 inode->i_op = &pidfs_inode_operations; 869 inode->i_fop = &pidfs_file_operations; 870 inode->i_ino = pidfs_ino(pid->ino); 871 inode->i_generation = pidfs_gen(pid->ino); 872 return 0; 873 } 874 875 static void pidfs_put_data(void *data) 876 { 877 struct pid *pid = data; 878 put_pid(pid); 879 } 880 881 static const struct stashed_operations pidfs_stashed_ops = { 882 .init_inode = pidfs_init_inode, 883 .put_data = pidfs_put_data, 884 }; 885 886 static int pidfs_init_fs_context(struct fs_context *fc) 887 { 888 struct pseudo_fs_context *ctx; 889 890 ctx = init_pseudo(fc, PID_FS_MAGIC); 891 if (!ctx) 892 return -ENOMEM; 893 894 ctx->ops = &pidfs_sops; 895 ctx->eops = &pidfs_export_operations; 896 ctx->dops = &pidfs_dentry_operations; 897 fc->s_fs_info = (void *)&pidfs_stashed_ops; 898 return 0; 899 } 900 901 static struct file_system_type pidfs_type = { 902 .name = "pidfs", 903 .init_fs_context = pidfs_init_fs_context, 904 .kill_sb = kill_anon_super, 905 }; 906 907 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) 908 { 909 struct file *pidfd_file; 910 struct path path __free(path_put) = {}; 911 int ret; 912 913 /* 914 * Ensure that PIDFD_STALE can be passed as a flag without 915 * overloading other uapi pidfd flags. 916 */ 917 BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD); 918 BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK); 919 920 ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); 921 if (ret < 0) 922 return ERR_PTR(ret); 923 924 if (!pidfs_pid_valid(pid, &path, flags)) 925 return ERR_PTR(-ESRCH); 926 927 flags &= ~PIDFD_STALE; 928 flags |= O_RDWR; 929 pidfd_file = dentry_open(&path, flags, current_cred()); 930 /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */ 931 if (!IS_ERR(pidfd_file)) 932 pidfd_file->f_flags |= (flags & PIDFD_THREAD); 933 934 return pidfd_file; 935 } 936 937 /** 938 * pidfs_register_pid - register a struct pid in pidfs 939 * @pid: pid to pin 940 * 941 * Register a struct pid in pidfs. Needs to be paired with 942 * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. 943 * 944 * Return: On success zero, on error a negative error code is returned. 945 */ 946 int pidfs_register_pid(struct pid *pid) 947 { 948 struct path path __free(path_put) = {}; 949 int ret; 950 951 might_sleep(); 952 953 if (!pid) 954 return 0; 955 956 ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); 957 if (unlikely(ret)) 958 return ret; 959 /* Keep the dentry and only put the reference to the mount. */ 960 path.dentry = NULL; 961 return 0; 962 } 963 964 /** 965 * pidfs_get_pid - pin a struct pid through pidfs 966 * @pid: pid to pin 967 * 968 * Similar to pidfs_register_pid() but only valid if the caller knows 969 * there's a reference to the @pid through a dentry already that can't 970 * go away. 971 */ 972 void pidfs_get_pid(struct pid *pid) 973 { 974 if (!pid) 975 return; 976 WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed)); 977 } 978 979 /** 980 * pidfs_put_pid - drop a pidfs reference 981 * @pid: pid to drop 982 * 983 * Drop a reference to @pid via pidfs. This is only safe if the 984 * reference has been taken via pidfs_get_pid(). 985 */ 986 void pidfs_put_pid(struct pid *pid) 987 { 988 might_sleep(); 989 990 if (!pid) 991 return; 992 VFS_WARN_ON_ONCE(!pid->stashed); 993 dput(pid->stashed); 994 } 995 996 static void pidfs_inode_init_once(void *data) 997 { 998 struct pidfs_inode *pi = data; 999 1000 inode_init_once(&pi->vfs_inode); 1001 } 1002 1003 void __init pidfs_init(void) 1004 { 1005 pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0, 1006 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | 1007 SLAB_ACCOUNT | SLAB_PANIC), 1008 pidfs_inode_init_once); 1009 pidfs_mnt = kern_mount(&pidfs_type); 1010 if (IS_ERR(pidfs_mnt)) 1011 panic("Failed to mount pidfs pseudo filesystem"); 1012 } 1013