1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/anon_inodes.h> 3 #include <linux/file.h> 4 #include <linux/fs.h> 5 #include <linux/magic.h> 6 #include <linux/mount.h> 7 #include <linux/pid.h> 8 #include <linux/pidfs.h> 9 #include <linux/pid_namespace.h> 10 #include <linux/poll.h> 11 #include <linux/proc_fs.h> 12 #include <linux/proc_ns.h> 13 #include <linux/pseudo_fs.h> 14 #include <linux/ptrace.h> 15 #include <linux/seq_file.h> 16 #include <uapi/linux/pidfd.h> 17 #include <linux/ipc_namespace.h> 18 #include <linux/time_namespace.h> 19 #include <linux/utsname.h> 20 #include <net/net_namespace.h> 21 22 #include "internal.h" 23 #include "mount.h" 24 25 #ifdef CONFIG_PROC_FS 26 /** 27 * pidfd_show_fdinfo - print information about a pidfd 28 * @m: proc fdinfo file 29 * @f: file referencing a pidfd 30 * 31 * Pid: 32 * This function will print the pid that a given pidfd refers to in the 33 * pid namespace of the procfs instance. 34 * If the pid namespace of the process is not a descendant of the pid 35 * namespace of the procfs instance 0 will be shown as its pid. This is 36 * similar to calling getppid() on a process whose parent is outside of 37 * its pid namespace. 38 * 39 * NSpid: 40 * If pid namespaces are supported then this function will also print 41 * the pid of a given pidfd refers to for all descendant pid namespaces 42 * starting from the current pid namespace of the instance, i.e. the 43 * Pid field and the first entry in the NSpid field will be identical. 44 * If the pid namespace of the process is not a descendant of the pid 45 * namespace of the procfs instance 0 will be shown as its first NSpid 46 * entry and no others will be shown. 47 * Note that this differs from the Pid and NSpid fields in 48 * /proc/<pid>/status where Pid and NSpid are always shown relative to 49 * the pid namespace of the procfs instance. The difference becomes 50 * obvious when sending around a pidfd between pid namespaces from a 51 * different branch of the tree, i.e. where no ancestral relation is 52 * present between the pid namespaces: 53 * - create two new pid namespaces ns1 and ns2 in the initial pid 54 * namespace (also take care to create new mount namespaces in the 55 * new pid namespace and mount procfs) 56 * - create a process with a pidfd in ns1 57 * - send pidfd from ns1 to ns2 58 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid 59 * have exactly one entry, which is 0 60 */ 61 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) 62 { 63 struct pid *pid = pidfd_pid(f); 64 struct pid_namespace *ns; 65 pid_t nr = -1; 66 67 if (likely(pid_has_task(pid, PIDTYPE_PID))) { 68 ns = proc_pid_ns(file_inode(m->file)->i_sb); 69 nr = pid_nr_ns(pid, ns); 70 } 71 72 seq_put_decimal_ll(m, "Pid:\t", nr); 73 74 #ifdef CONFIG_PID_NS 75 seq_put_decimal_ll(m, "\nNSpid:\t", nr); 76 if (nr > 0) { 77 int i; 78 79 /* If nr is non-zero it means that 'pid' is valid and that 80 * ns, i.e. the pid namespace associated with the procfs 81 * instance, is in the pid namespace hierarchy of pid. 82 * Start at one below the already printed level. 83 */ 84 for (i = ns->level + 1; i <= pid->level; i++) 85 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); 86 } 87 #endif 88 seq_putc(m, '\n'); 89 } 90 #endif 91 92 /* 93 * Poll support for process exit notification. 94 */ 95 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) 96 { 97 struct pid *pid = pidfd_pid(file); 98 bool thread = file->f_flags & PIDFD_THREAD; 99 struct task_struct *task; 100 __poll_t poll_flags = 0; 101 102 poll_wait(file, &pid->wait_pidfd, pts); 103 /* 104 * Depending on PIDFD_THREAD, inform pollers when the thread 105 * or the whole thread-group exits. 106 */ 107 guard(rcu)(); 108 task = pid_task(pid, PIDTYPE_PID); 109 if (!task) 110 poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; 111 else if (task->exit_state && (thread || thread_group_empty(task))) 112 poll_flags = EPOLLIN | EPOLLRDNORM; 113 114 return poll_flags; 115 } 116 117 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 118 { 119 struct task_struct *task __free(put_task) = NULL; 120 struct nsproxy *nsp __free(put_nsproxy) = NULL; 121 struct pid *pid = pidfd_pid(file); 122 struct ns_common *ns_common; 123 124 if (arg) 125 return -EINVAL; 126 127 task = get_pid_task(pid, PIDTYPE_PID); 128 if (!task) 129 return -ESRCH; 130 131 scoped_guard(task_lock, task) { 132 nsp = task->nsproxy; 133 if (nsp) 134 get_nsproxy(nsp); 135 } 136 if (!nsp) 137 return -ESRCH; /* just pretend it didn't exist */ 138 139 /* 140 * We're trying to open a file descriptor to the namespace so perform a 141 * filesystem cred ptrace check. Also, we mirror nsfs behavior. 142 */ 143 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) 144 return -EACCES; 145 146 switch (cmd) { 147 /* Namespaces that hang of nsproxy. */ 148 case PIDFD_GET_CGROUP_NAMESPACE: 149 get_cgroup_ns(nsp->cgroup_ns); 150 ns_common = to_ns_common(nsp->cgroup_ns); 151 break; 152 case PIDFD_GET_IPC_NAMESPACE: 153 get_ipc_ns(nsp->ipc_ns); 154 ns_common = to_ns_common(nsp->ipc_ns); 155 break; 156 case PIDFD_GET_MNT_NAMESPACE: 157 get_mnt_ns(nsp->mnt_ns); 158 ns_common = to_ns_common(nsp->mnt_ns); 159 break; 160 case PIDFD_GET_NET_NAMESPACE: 161 ns_common = to_ns_common(nsp->net_ns); 162 get_net_ns(ns_common); 163 break; 164 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 165 get_pid_ns(nsp->pid_ns_for_children); 166 ns_common = to_ns_common(nsp->pid_ns_for_children); 167 break; 168 case PIDFD_GET_TIME_NAMESPACE: 169 get_time_ns(nsp->time_ns); 170 ns_common = to_ns_common(nsp->time_ns); 171 break; 172 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 173 get_time_ns(nsp->time_ns_for_children); 174 ns_common = to_ns_common(nsp->time_ns_for_children); 175 break; 176 case PIDFD_GET_UTS_NAMESPACE: 177 get_uts_ns(nsp->uts_ns); 178 ns_common = to_ns_common(nsp->uts_ns); 179 break; 180 /* Namespaces that don't hang of nsproxy. */ 181 case PIDFD_GET_USER_NAMESPACE: 182 rcu_read_lock(); 183 ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); 184 rcu_read_unlock(); 185 break; 186 case PIDFD_GET_PID_NAMESPACE: 187 rcu_read_lock(); 188 ns_common = to_ns_common(get_pid_ns(task_active_pid_ns(task))); 189 rcu_read_unlock(); 190 break; 191 default: 192 return -ENOIOCTLCMD; 193 } 194 195 /* open_namespace() unconditionally consumes the reference */ 196 return open_namespace(ns_common); 197 } 198 199 static const struct file_operations pidfs_file_operations = { 200 .poll = pidfd_poll, 201 #ifdef CONFIG_PROC_FS 202 .show_fdinfo = pidfd_show_fdinfo, 203 #endif 204 .unlocked_ioctl = pidfd_ioctl, 205 .compat_ioctl = compat_ptr_ioctl, 206 }; 207 208 struct pid *pidfd_pid(const struct file *file) 209 { 210 if (file->f_op != &pidfs_file_operations) 211 return ERR_PTR(-EBADF); 212 return file_inode(file)->i_private; 213 } 214 215 static struct vfsmount *pidfs_mnt __ro_after_init; 216 217 #if BITS_PER_LONG == 32 218 /* 219 * Provide a fallback mechanism for 32-bit systems so processes remain 220 * reliably comparable by inode number even on those systems. 221 */ 222 static DEFINE_IDA(pidfd_inum_ida); 223 224 static int pidfs_inum(struct pid *pid, unsigned long *ino) 225 { 226 int ret; 227 228 ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, 229 UINT_MAX, GFP_ATOMIC); 230 if (ret < 0) 231 return -ENOSPC; 232 233 *ino = ret; 234 return 0; 235 } 236 237 static inline void pidfs_free_inum(unsigned long ino) 238 { 239 if (ino > 0) 240 ida_free(&pidfd_inum_ida, ino); 241 } 242 #else 243 static inline int pidfs_inum(struct pid *pid, unsigned long *ino) 244 { 245 *ino = pid->ino; 246 return 0; 247 } 248 #define pidfs_free_inum(ino) ((void)(ino)) 249 #endif 250 251 /* 252 * The vfs falls back to simple_setattr() if i_op->setattr() isn't 253 * implemented. Let's reject it completely until we have a clean 254 * permission concept for pidfds. 255 */ 256 static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 257 struct iattr *attr) 258 { 259 return -EOPNOTSUPP; 260 } 261 262 263 /* 264 * User space expects pidfs inodes to have no file type in st_mode. 265 * 266 * In particular, 'lsof' has this legacy logic: 267 * 268 * type = s->st_mode & S_IFMT; 269 * switch (type) { 270 * ... 271 * case 0: 272 * if (!strcmp(p, "anon_inode")) 273 * Lf->ntype = Ntype = N_ANON_INODE; 274 * 275 * to detect our old anon_inode logic. 276 * 277 * Rather than mess with our internal sane inode data, just fix it 278 * up here in getattr() by masking off the format bits. 279 */ 280 static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, 281 struct kstat *stat, u32 request_mask, 282 unsigned int query_flags) 283 { 284 struct inode *inode = d_inode(path->dentry); 285 286 generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); 287 stat->mode &= ~S_IFMT; 288 return 0; 289 } 290 291 static const struct inode_operations pidfs_inode_operations = { 292 .getattr = pidfs_getattr, 293 .setattr = pidfs_setattr, 294 }; 295 296 static void pidfs_evict_inode(struct inode *inode) 297 { 298 struct pid *pid = inode->i_private; 299 300 clear_inode(inode); 301 put_pid(pid); 302 pidfs_free_inum(inode->i_ino); 303 } 304 305 static const struct super_operations pidfs_sops = { 306 .drop_inode = generic_delete_inode, 307 .evict_inode = pidfs_evict_inode, 308 .statfs = simple_statfs, 309 }; 310 311 /* 312 * 'lsof' has knowledge of out historical anon_inode use, and expects 313 * the pidfs dentry name to start with 'anon_inode'. 314 */ 315 static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) 316 { 317 return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); 318 } 319 320 static const struct dentry_operations pidfs_dentry_operations = { 321 .d_delete = always_delete_dentry, 322 .d_dname = pidfs_dname, 323 .d_prune = stashed_dentry_prune, 324 }; 325 326 static int pidfs_init_inode(struct inode *inode, void *data) 327 { 328 inode->i_private = data; 329 inode->i_flags |= S_PRIVATE; 330 inode->i_mode |= S_IRWXU; 331 inode->i_op = &pidfs_inode_operations; 332 inode->i_fop = &pidfs_file_operations; 333 /* 334 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This 335 * avoids collisions with the root inode which is 1 for pseudo 336 * filesystems. 337 */ 338 return pidfs_inum(data, &inode->i_ino); 339 } 340 341 static void pidfs_put_data(void *data) 342 { 343 struct pid *pid = data; 344 put_pid(pid); 345 } 346 347 static const struct stashed_operations pidfs_stashed_ops = { 348 .init_inode = pidfs_init_inode, 349 .put_data = pidfs_put_data, 350 }; 351 352 static int pidfs_init_fs_context(struct fs_context *fc) 353 { 354 struct pseudo_fs_context *ctx; 355 356 ctx = init_pseudo(fc, PID_FS_MAGIC); 357 if (!ctx) 358 return -ENOMEM; 359 360 ctx->ops = &pidfs_sops; 361 ctx->dops = &pidfs_dentry_operations; 362 fc->s_fs_info = (void *)&pidfs_stashed_ops; 363 return 0; 364 } 365 366 static struct file_system_type pidfs_type = { 367 .name = "pidfs", 368 .init_fs_context = pidfs_init_fs_context, 369 .kill_sb = kill_anon_super, 370 }; 371 372 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) 373 { 374 375 struct file *pidfd_file; 376 struct path path; 377 int ret; 378 379 ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); 380 if (ret < 0) 381 return ERR_PTR(ret); 382 383 pidfd_file = dentry_open(&path, flags, current_cred()); 384 path_put(&path); 385 return pidfd_file; 386 } 387 388 void __init pidfs_init(void) 389 { 390 pidfs_mnt = kern_mount(&pidfs_type); 391 if (IS_ERR(pidfs_mnt)) 392 panic("Failed to mount pidfs pseudo filesystem"); 393 } 394