1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/anon_inodes.h> 3 #include <linux/file.h> 4 #include <linux/fs.h> 5 #include <linux/cgroup.h> 6 #include <linux/magic.h> 7 #include <linux/mount.h> 8 #include <linux/pid.h> 9 #include <linux/pidfs.h> 10 #include <linux/pid_namespace.h> 11 #include <linux/poll.h> 12 #include <linux/proc_fs.h> 13 #include <linux/proc_ns.h> 14 #include <linux/pseudo_fs.h> 15 #include <linux/ptrace.h> 16 #include <linux/seq_file.h> 17 #include <uapi/linux/pidfd.h> 18 #include <linux/ipc_namespace.h> 19 #include <linux/time_namespace.h> 20 #include <linux/utsname.h> 21 #include <net/net_namespace.h> 22 23 #include "internal.h" 24 #include "mount.h" 25 26 #ifdef CONFIG_PROC_FS 27 /** 28 * pidfd_show_fdinfo - print information about a pidfd 29 * @m: proc fdinfo file 30 * @f: file referencing a pidfd 31 * 32 * Pid: 33 * This function will print the pid that a given pidfd refers to in the 34 * pid namespace of the procfs instance. 35 * If the pid namespace of the process is not a descendant of the pid 36 * namespace of the procfs instance 0 will be shown as its pid. This is 37 * similar to calling getppid() on a process whose parent is outside of 38 * its pid namespace. 39 * 40 * NSpid: 41 * If pid namespaces are supported then this function will also print 42 * the pid of a given pidfd refers to for all descendant pid namespaces 43 * starting from the current pid namespace of the instance, i.e. the 44 * Pid field and the first entry in the NSpid field will be identical. 45 * If the pid namespace of the process is not a descendant of the pid 46 * namespace of the procfs instance 0 will be shown as its first NSpid 47 * entry and no others will be shown. 48 * Note that this differs from the Pid and NSpid fields in 49 * /proc/<pid>/status where Pid and NSpid are always shown relative to 50 * the pid namespace of the procfs instance. The difference becomes 51 * obvious when sending around a pidfd between pid namespaces from a 52 * different branch of the tree, i.e. where no ancestral relation is 53 * present between the pid namespaces: 54 * - create two new pid namespaces ns1 and ns2 in the initial pid 55 * namespace (also take care to create new mount namespaces in the 56 * new pid namespace and mount procfs) 57 * - create a process with a pidfd in ns1 58 * - send pidfd from ns1 to ns2 59 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid 60 * have exactly one entry, which is 0 61 */ 62 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) 63 { 64 struct pid *pid = pidfd_pid(f); 65 struct pid_namespace *ns; 66 pid_t nr = -1; 67 68 if (likely(pid_has_task(pid, PIDTYPE_PID))) { 69 ns = proc_pid_ns(file_inode(m->file)->i_sb); 70 nr = pid_nr_ns(pid, ns); 71 } 72 73 seq_put_decimal_ll(m, "Pid:\t", nr); 74 75 #ifdef CONFIG_PID_NS 76 seq_put_decimal_ll(m, "\nNSpid:\t", nr); 77 if (nr > 0) { 78 int i; 79 80 /* If nr is non-zero it means that 'pid' is valid and that 81 * ns, i.e. the pid namespace associated with the procfs 82 * instance, is in the pid namespace hierarchy of pid. 83 * Start at one below the already printed level. 84 */ 85 for (i = ns->level + 1; i <= pid->level; i++) 86 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); 87 } 88 #endif 89 seq_putc(m, '\n'); 90 } 91 #endif 92 93 /* 94 * Poll support for process exit notification. 95 */ 96 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) 97 { 98 struct pid *pid = pidfd_pid(file); 99 bool thread = file->f_flags & PIDFD_THREAD; 100 struct task_struct *task; 101 __poll_t poll_flags = 0; 102 103 poll_wait(file, &pid->wait_pidfd, pts); 104 /* 105 * Depending on PIDFD_THREAD, inform pollers when the thread 106 * or the whole thread-group exits. 107 */ 108 guard(rcu)(); 109 task = pid_task(pid, PIDTYPE_PID); 110 if (!task) 111 poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; 112 else if (task->exit_state && (thread || thread_group_empty(task))) 113 poll_flags = EPOLLIN | EPOLLRDNORM; 114 115 return poll_flags; 116 } 117 118 static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) 119 { 120 struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; 121 size_t usize = _IOC_SIZE(cmd); 122 struct pidfd_info kinfo = {}; 123 struct user_namespace *user_ns; 124 const struct cred *c; 125 __u64 mask; 126 #ifdef CONFIG_CGROUPS 127 struct cgroup *cgrp; 128 #endif 129 130 if (!uinfo) 131 return -EINVAL; 132 if (usize < PIDFD_INFO_SIZE_VER0) 133 return -EINVAL; /* First version, no smaller struct possible */ 134 135 if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) 136 return -EFAULT; 137 138 c = get_task_cred(task); 139 if (!c) 140 return -ESRCH; 141 142 /* Unconditionally return identifiers and credentials, the rest only on request */ 143 144 user_ns = current_user_ns(); 145 kinfo.ruid = from_kuid_munged(user_ns, c->uid); 146 kinfo.rgid = from_kgid_munged(user_ns, c->gid); 147 kinfo.euid = from_kuid_munged(user_ns, c->euid); 148 kinfo.egid = from_kgid_munged(user_ns, c->egid); 149 kinfo.suid = from_kuid_munged(user_ns, c->suid); 150 kinfo.sgid = from_kgid_munged(user_ns, c->sgid); 151 kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); 152 kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); 153 kinfo.mask |= PIDFD_INFO_CREDS; 154 put_cred(c); 155 156 #ifdef CONFIG_CGROUPS 157 rcu_read_lock(); 158 cgrp = task_dfl_cgroup(task); 159 kinfo.cgroupid = cgroup_id(cgrp); 160 kinfo.mask |= PIDFD_INFO_CGROUPID; 161 rcu_read_unlock(); 162 #endif 163 164 /* 165 * Copy pid/tgid last, to reduce the chances the information might be 166 * stale. Note that it is not possible to ensure it will be valid as the 167 * task might return as soon as the copy_to_user finishes, but that's ok 168 * and userspace expects that might happen and can act accordingly, so 169 * this is just best-effort. What we can do however is checking that all 170 * the fields are set correctly, or return ESRCH to avoid providing 171 * incomplete information. */ 172 173 kinfo.ppid = task_ppid_nr_ns(task, NULL); 174 kinfo.tgid = task_tgid_vnr(task); 175 kinfo.pid = task_pid_vnr(task); 176 kinfo.mask |= PIDFD_INFO_PID; 177 178 if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) 179 return -ESRCH; 180 181 /* 182 * If userspace and the kernel have the same struct size it can just 183 * be copied. If userspace provides an older struct, only the bits that 184 * userspace knows about will be copied. If userspace provides a new 185 * struct, only the bits that the kernel knows about will be copied. 186 */ 187 if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) 188 return -EFAULT; 189 190 return 0; 191 } 192 193 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 194 { 195 struct task_struct *task __free(put_task) = NULL; 196 struct nsproxy *nsp __free(put_nsproxy) = NULL; 197 struct pid *pid = pidfd_pid(file); 198 struct ns_common *ns_common = NULL; 199 struct pid_namespace *pid_ns; 200 201 task = get_pid_task(pid, PIDTYPE_PID); 202 if (!task) 203 return -ESRCH; 204 205 /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ 206 if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) 207 return pidfd_info(task, cmd, arg); 208 209 if (arg) 210 return -EINVAL; 211 212 scoped_guard(task_lock, task) { 213 nsp = task->nsproxy; 214 if (nsp) 215 get_nsproxy(nsp); 216 } 217 if (!nsp) 218 return -ESRCH; /* just pretend it didn't exist */ 219 220 /* 221 * We're trying to open a file descriptor to the namespace so perform a 222 * filesystem cred ptrace check. Also, we mirror nsfs behavior. 223 */ 224 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) 225 return -EACCES; 226 227 switch (cmd) { 228 /* Namespaces that hang of nsproxy. */ 229 case PIDFD_GET_CGROUP_NAMESPACE: 230 if (IS_ENABLED(CONFIG_CGROUPS)) { 231 get_cgroup_ns(nsp->cgroup_ns); 232 ns_common = to_ns_common(nsp->cgroup_ns); 233 } 234 break; 235 case PIDFD_GET_IPC_NAMESPACE: 236 if (IS_ENABLED(CONFIG_IPC_NS)) { 237 get_ipc_ns(nsp->ipc_ns); 238 ns_common = to_ns_common(nsp->ipc_ns); 239 } 240 break; 241 case PIDFD_GET_MNT_NAMESPACE: 242 get_mnt_ns(nsp->mnt_ns); 243 ns_common = to_ns_common(nsp->mnt_ns); 244 break; 245 case PIDFD_GET_NET_NAMESPACE: 246 if (IS_ENABLED(CONFIG_NET_NS)) { 247 ns_common = to_ns_common(nsp->net_ns); 248 get_net_ns(ns_common); 249 } 250 break; 251 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 252 if (IS_ENABLED(CONFIG_PID_NS)) { 253 get_pid_ns(nsp->pid_ns_for_children); 254 ns_common = to_ns_common(nsp->pid_ns_for_children); 255 } 256 break; 257 case PIDFD_GET_TIME_NAMESPACE: 258 if (IS_ENABLED(CONFIG_TIME_NS)) { 259 get_time_ns(nsp->time_ns); 260 ns_common = to_ns_common(nsp->time_ns); 261 } 262 break; 263 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 264 if (IS_ENABLED(CONFIG_TIME_NS)) { 265 get_time_ns(nsp->time_ns_for_children); 266 ns_common = to_ns_common(nsp->time_ns_for_children); 267 } 268 break; 269 case PIDFD_GET_UTS_NAMESPACE: 270 if (IS_ENABLED(CONFIG_UTS_NS)) { 271 get_uts_ns(nsp->uts_ns); 272 ns_common = to_ns_common(nsp->uts_ns); 273 } 274 break; 275 /* Namespaces that don't hang of nsproxy. */ 276 case PIDFD_GET_USER_NAMESPACE: 277 if (IS_ENABLED(CONFIG_USER_NS)) { 278 rcu_read_lock(); 279 ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); 280 rcu_read_unlock(); 281 } 282 break; 283 case PIDFD_GET_PID_NAMESPACE: 284 if (IS_ENABLED(CONFIG_PID_NS)) { 285 rcu_read_lock(); 286 pid_ns = task_active_pid_ns(task); 287 if (pid_ns) 288 ns_common = to_ns_common(get_pid_ns(pid_ns)); 289 rcu_read_unlock(); 290 } 291 break; 292 default: 293 return -ENOIOCTLCMD; 294 } 295 296 if (!ns_common) 297 return -EOPNOTSUPP; 298 299 /* open_namespace() unconditionally consumes the reference */ 300 return open_namespace(ns_common); 301 } 302 303 static const struct file_operations pidfs_file_operations = { 304 .poll = pidfd_poll, 305 #ifdef CONFIG_PROC_FS 306 .show_fdinfo = pidfd_show_fdinfo, 307 #endif 308 .unlocked_ioctl = pidfd_ioctl, 309 .compat_ioctl = compat_ptr_ioctl, 310 }; 311 312 struct pid *pidfd_pid(const struct file *file) 313 { 314 if (file->f_op != &pidfs_file_operations) 315 return ERR_PTR(-EBADF); 316 return file_inode(file)->i_private; 317 } 318 319 static struct vfsmount *pidfs_mnt __ro_after_init; 320 321 #if BITS_PER_LONG == 32 322 /* 323 * Provide a fallback mechanism for 32-bit systems so processes remain 324 * reliably comparable by inode number even on those systems. 325 */ 326 static DEFINE_IDA(pidfd_inum_ida); 327 328 static int pidfs_inum(struct pid *pid, unsigned long *ino) 329 { 330 int ret; 331 332 ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, 333 UINT_MAX, GFP_ATOMIC); 334 if (ret < 0) 335 return -ENOSPC; 336 337 *ino = ret; 338 return 0; 339 } 340 341 static inline void pidfs_free_inum(unsigned long ino) 342 { 343 if (ino > 0) 344 ida_free(&pidfd_inum_ida, ino); 345 } 346 #else 347 static inline int pidfs_inum(struct pid *pid, unsigned long *ino) 348 { 349 *ino = pid->ino; 350 return 0; 351 } 352 #define pidfs_free_inum(ino) ((void)(ino)) 353 #endif 354 355 /* 356 * The vfs falls back to simple_setattr() if i_op->setattr() isn't 357 * implemented. Let's reject it completely until we have a clean 358 * permission concept for pidfds. 359 */ 360 static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 361 struct iattr *attr) 362 { 363 return -EOPNOTSUPP; 364 } 365 366 367 /* 368 * User space expects pidfs inodes to have no file type in st_mode. 369 * 370 * In particular, 'lsof' has this legacy logic: 371 * 372 * type = s->st_mode & S_IFMT; 373 * switch (type) { 374 * ... 375 * case 0: 376 * if (!strcmp(p, "anon_inode")) 377 * Lf->ntype = Ntype = N_ANON_INODE; 378 * 379 * to detect our old anon_inode logic. 380 * 381 * Rather than mess with our internal sane inode data, just fix it 382 * up here in getattr() by masking off the format bits. 383 */ 384 static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, 385 struct kstat *stat, u32 request_mask, 386 unsigned int query_flags) 387 { 388 struct inode *inode = d_inode(path->dentry); 389 390 generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); 391 stat->mode &= ~S_IFMT; 392 return 0; 393 } 394 395 static const struct inode_operations pidfs_inode_operations = { 396 .getattr = pidfs_getattr, 397 .setattr = pidfs_setattr, 398 }; 399 400 static void pidfs_evict_inode(struct inode *inode) 401 { 402 struct pid *pid = inode->i_private; 403 404 clear_inode(inode); 405 put_pid(pid); 406 pidfs_free_inum(inode->i_ino); 407 } 408 409 static const struct super_operations pidfs_sops = { 410 .drop_inode = generic_delete_inode, 411 .evict_inode = pidfs_evict_inode, 412 .statfs = simple_statfs, 413 }; 414 415 /* 416 * 'lsof' has knowledge of out historical anon_inode use, and expects 417 * the pidfs dentry name to start with 'anon_inode'. 418 */ 419 static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) 420 { 421 return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); 422 } 423 424 static const struct dentry_operations pidfs_dentry_operations = { 425 .d_delete = always_delete_dentry, 426 .d_dname = pidfs_dname, 427 .d_prune = stashed_dentry_prune, 428 }; 429 430 static int pidfs_init_inode(struct inode *inode, void *data) 431 { 432 inode->i_private = data; 433 inode->i_flags |= S_PRIVATE; 434 inode->i_mode |= S_IRWXU; 435 inode->i_op = &pidfs_inode_operations; 436 inode->i_fop = &pidfs_file_operations; 437 /* 438 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This 439 * avoids collisions with the root inode which is 1 for pseudo 440 * filesystems. 441 */ 442 return pidfs_inum(data, &inode->i_ino); 443 } 444 445 static void pidfs_put_data(void *data) 446 { 447 struct pid *pid = data; 448 put_pid(pid); 449 } 450 451 static const struct stashed_operations pidfs_stashed_ops = { 452 .init_inode = pidfs_init_inode, 453 .put_data = pidfs_put_data, 454 }; 455 456 static int pidfs_init_fs_context(struct fs_context *fc) 457 { 458 struct pseudo_fs_context *ctx; 459 460 ctx = init_pseudo(fc, PID_FS_MAGIC); 461 if (!ctx) 462 return -ENOMEM; 463 464 ctx->ops = &pidfs_sops; 465 ctx->dops = &pidfs_dentry_operations; 466 fc->s_fs_info = (void *)&pidfs_stashed_ops; 467 return 0; 468 } 469 470 static struct file_system_type pidfs_type = { 471 .name = "pidfs", 472 .init_fs_context = pidfs_init_fs_context, 473 .kill_sb = kill_anon_super, 474 }; 475 476 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) 477 { 478 479 struct file *pidfd_file; 480 struct path path; 481 int ret; 482 483 ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); 484 if (ret < 0) 485 return ERR_PTR(ret); 486 487 pidfd_file = dentry_open(&path, flags, current_cred()); 488 path_put(&path); 489 return pidfd_file; 490 } 491 492 void __init pidfs_init(void) 493 { 494 pidfs_mnt = kern_mount(&pidfs_type); 495 if (IS_ERR(pidfs_mnt)) 496 panic("Failed to mount pidfs pseudo filesystem"); 497 } 498