1 // SPDX-License-Identifier: GPL-2.0 2 #include <linux/anon_inodes.h> 3 #include <linux/file.h> 4 #include <linux/fs.h> 5 #include <linux/magic.h> 6 #include <linux/mount.h> 7 #include <linux/pid.h> 8 #include <linux/pidfs.h> 9 #include <linux/pid_namespace.h> 10 #include <linux/poll.h> 11 #include <linux/proc_fs.h> 12 #include <linux/proc_ns.h> 13 #include <linux/pseudo_fs.h> 14 #include <linux/ptrace.h> 15 #include <linux/seq_file.h> 16 #include <uapi/linux/pidfd.h> 17 #include <linux/ipc_namespace.h> 18 #include <linux/time_namespace.h> 19 #include <linux/utsname.h> 20 #include <net/net_namespace.h> 21 22 #include "internal.h" 23 #include "mount.h" 24 25 #ifdef CONFIG_PROC_FS 26 /** 27 * pidfd_show_fdinfo - print information about a pidfd 28 * @m: proc fdinfo file 29 * @f: file referencing a pidfd 30 * 31 * Pid: 32 * This function will print the pid that a given pidfd refers to in the 33 * pid namespace of the procfs instance. 34 * If the pid namespace of the process is not a descendant of the pid 35 * namespace of the procfs instance 0 will be shown as its pid. This is 36 * similar to calling getppid() on a process whose parent is outside of 37 * its pid namespace. 38 * 39 * NSpid: 40 * If pid namespaces are supported then this function will also print 41 * the pid of a given pidfd refers to for all descendant pid namespaces 42 * starting from the current pid namespace of the instance, i.e. the 43 * Pid field and the first entry in the NSpid field will be identical. 44 * If the pid namespace of the process is not a descendant of the pid 45 * namespace of the procfs instance 0 will be shown as its first NSpid 46 * entry and no others will be shown. 47 * Note that this differs from the Pid and NSpid fields in 48 * /proc/<pid>/status where Pid and NSpid are always shown relative to 49 * the pid namespace of the procfs instance. The difference becomes 50 * obvious when sending around a pidfd between pid namespaces from a 51 * different branch of the tree, i.e. where no ancestral relation is 52 * present between the pid namespaces: 53 * - create two new pid namespaces ns1 and ns2 in the initial pid 54 * namespace (also take care to create new mount namespaces in the 55 * new pid namespace and mount procfs) 56 * - create a process with a pidfd in ns1 57 * - send pidfd from ns1 to ns2 58 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid 59 * have exactly one entry, which is 0 60 */ 61 static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) 62 { 63 struct pid *pid = pidfd_pid(f); 64 struct pid_namespace *ns; 65 pid_t nr = -1; 66 67 if (likely(pid_has_task(pid, PIDTYPE_PID))) { 68 ns = proc_pid_ns(file_inode(m->file)->i_sb); 69 nr = pid_nr_ns(pid, ns); 70 } 71 72 seq_put_decimal_ll(m, "Pid:\t", nr); 73 74 #ifdef CONFIG_PID_NS 75 seq_put_decimal_ll(m, "\nNSpid:\t", nr); 76 if (nr > 0) { 77 int i; 78 79 /* If nr is non-zero it means that 'pid' is valid and that 80 * ns, i.e. the pid namespace associated with the procfs 81 * instance, is in the pid namespace hierarchy of pid. 82 * Start at one below the already printed level. 83 */ 84 for (i = ns->level + 1; i <= pid->level; i++) 85 seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); 86 } 87 #endif 88 seq_putc(m, '\n'); 89 } 90 #endif 91 92 /* 93 * Poll support for process exit notification. 94 */ 95 static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) 96 { 97 struct pid *pid = pidfd_pid(file); 98 bool thread = file->f_flags & PIDFD_THREAD; 99 struct task_struct *task; 100 __poll_t poll_flags = 0; 101 102 poll_wait(file, &pid->wait_pidfd, pts); 103 /* 104 * Depending on PIDFD_THREAD, inform pollers when the thread 105 * or the whole thread-group exits. 106 */ 107 guard(rcu)(); 108 task = pid_task(pid, PIDTYPE_PID); 109 if (!task) 110 poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; 111 else if (task->exit_state && (thread || thread_group_empty(task))) 112 poll_flags = EPOLLIN | EPOLLRDNORM; 113 114 return poll_flags; 115 } 116 117 static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 118 { 119 struct task_struct *task __free(put_task) = NULL; 120 struct nsproxy *nsp __free(put_nsproxy) = NULL; 121 struct pid *pid = pidfd_pid(file); 122 struct ns_common *ns_common = NULL; 123 124 if (arg) 125 return -EINVAL; 126 127 task = get_pid_task(pid, PIDTYPE_PID); 128 if (!task) 129 return -ESRCH; 130 131 scoped_guard(task_lock, task) { 132 nsp = task->nsproxy; 133 if (nsp) 134 get_nsproxy(nsp); 135 } 136 if (!nsp) 137 return -ESRCH; /* just pretend it didn't exist */ 138 139 /* 140 * We're trying to open a file descriptor to the namespace so perform a 141 * filesystem cred ptrace check. Also, we mirror nsfs behavior. 142 */ 143 if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) 144 return -EACCES; 145 146 switch (cmd) { 147 /* Namespaces that hang of nsproxy. */ 148 case PIDFD_GET_CGROUP_NAMESPACE: 149 if (IS_ENABLED(CONFIG_CGROUPS)) { 150 get_cgroup_ns(nsp->cgroup_ns); 151 ns_common = to_ns_common(nsp->cgroup_ns); 152 } 153 break; 154 case PIDFD_GET_IPC_NAMESPACE: 155 if (IS_ENABLED(CONFIG_IPC_NS)) { 156 get_ipc_ns(nsp->ipc_ns); 157 ns_common = to_ns_common(nsp->ipc_ns); 158 } 159 break; 160 case PIDFD_GET_MNT_NAMESPACE: 161 get_mnt_ns(nsp->mnt_ns); 162 ns_common = to_ns_common(nsp->mnt_ns); 163 break; 164 case PIDFD_GET_NET_NAMESPACE: 165 if (IS_ENABLED(CONFIG_NET_NS)) { 166 ns_common = to_ns_common(nsp->net_ns); 167 get_net_ns(ns_common); 168 } 169 break; 170 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 171 if (IS_ENABLED(CONFIG_PID_NS)) { 172 get_pid_ns(nsp->pid_ns_for_children); 173 ns_common = to_ns_common(nsp->pid_ns_for_children); 174 } 175 break; 176 case PIDFD_GET_TIME_NAMESPACE: 177 if (IS_ENABLED(CONFIG_TIME_NS)) { 178 get_time_ns(nsp->time_ns); 179 ns_common = to_ns_common(nsp->time_ns); 180 } 181 break; 182 case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 183 if (IS_ENABLED(CONFIG_TIME_NS)) { 184 get_time_ns(nsp->time_ns_for_children); 185 ns_common = to_ns_common(nsp->time_ns_for_children); 186 } 187 break; 188 case PIDFD_GET_UTS_NAMESPACE: 189 if (IS_ENABLED(CONFIG_UTS_NS)) { 190 get_uts_ns(nsp->uts_ns); 191 ns_common = to_ns_common(nsp->uts_ns); 192 } 193 break; 194 /* Namespaces that don't hang of nsproxy. */ 195 case PIDFD_GET_USER_NAMESPACE: 196 if (IS_ENABLED(CONFIG_USER_NS)) { 197 rcu_read_lock(); 198 ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); 199 rcu_read_unlock(); 200 } 201 break; 202 case PIDFD_GET_PID_NAMESPACE: 203 if (IS_ENABLED(CONFIG_PID_NS)) { 204 rcu_read_lock(); 205 ns_common = to_ns_common( get_pid_ns(task_active_pid_ns(task))); 206 rcu_read_unlock(); 207 } 208 break; 209 default: 210 return -ENOIOCTLCMD; 211 } 212 213 if (!ns_common) 214 return -EOPNOTSUPP; 215 216 /* open_namespace() unconditionally consumes the reference */ 217 return open_namespace(ns_common); 218 } 219 220 static const struct file_operations pidfs_file_operations = { 221 .poll = pidfd_poll, 222 #ifdef CONFIG_PROC_FS 223 .show_fdinfo = pidfd_show_fdinfo, 224 #endif 225 .unlocked_ioctl = pidfd_ioctl, 226 .compat_ioctl = compat_ptr_ioctl, 227 }; 228 229 struct pid *pidfd_pid(const struct file *file) 230 { 231 if (file->f_op != &pidfs_file_operations) 232 return ERR_PTR(-EBADF); 233 return file_inode(file)->i_private; 234 } 235 236 static struct vfsmount *pidfs_mnt __ro_after_init; 237 238 #if BITS_PER_LONG == 32 239 /* 240 * Provide a fallback mechanism for 32-bit systems so processes remain 241 * reliably comparable by inode number even on those systems. 242 */ 243 static DEFINE_IDA(pidfd_inum_ida); 244 245 static int pidfs_inum(struct pid *pid, unsigned long *ino) 246 { 247 int ret; 248 249 ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1, 250 UINT_MAX, GFP_ATOMIC); 251 if (ret < 0) 252 return -ENOSPC; 253 254 *ino = ret; 255 return 0; 256 } 257 258 static inline void pidfs_free_inum(unsigned long ino) 259 { 260 if (ino > 0) 261 ida_free(&pidfd_inum_ida, ino); 262 } 263 #else 264 static inline int pidfs_inum(struct pid *pid, unsigned long *ino) 265 { 266 *ino = pid->ino; 267 return 0; 268 } 269 #define pidfs_free_inum(ino) ((void)(ino)) 270 #endif 271 272 /* 273 * The vfs falls back to simple_setattr() if i_op->setattr() isn't 274 * implemented. Let's reject it completely until we have a clean 275 * permission concept for pidfds. 276 */ 277 static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 278 struct iattr *attr) 279 { 280 return -EOPNOTSUPP; 281 } 282 283 284 /* 285 * User space expects pidfs inodes to have no file type in st_mode. 286 * 287 * In particular, 'lsof' has this legacy logic: 288 * 289 * type = s->st_mode & S_IFMT; 290 * switch (type) { 291 * ... 292 * case 0: 293 * if (!strcmp(p, "anon_inode")) 294 * Lf->ntype = Ntype = N_ANON_INODE; 295 * 296 * to detect our old anon_inode logic. 297 * 298 * Rather than mess with our internal sane inode data, just fix it 299 * up here in getattr() by masking off the format bits. 300 */ 301 static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, 302 struct kstat *stat, u32 request_mask, 303 unsigned int query_flags) 304 { 305 struct inode *inode = d_inode(path->dentry); 306 307 generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); 308 stat->mode &= ~S_IFMT; 309 return 0; 310 } 311 312 static const struct inode_operations pidfs_inode_operations = { 313 .getattr = pidfs_getattr, 314 .setattr = pidfs_setattr, 315 }; 316 317 static void pidfs_evict_inode(struct inode *inode) 318 { 319 struct pid *pid = inode->i_private; 320 321 clear_inode(inode); 322 put_pid(pid); 323 pidfs_free_inum(inode->i_ino); 324 } 325 326 static const struct super_operations pidfs_sops = { 327 .drop_inode = generic_delete_inode, 328 .evict_inode = pidfs_evict_inode, 329 .statfs = simple_statfs, 330 }; 331 332 /* 333 * 'lsof' has knowledge of out historical anon_inode use, and expects 334 * the pidfs dentry name to start with 'anon_inode'. 335 */ 336 static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) 337 { 338 return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); 339 } 340 341 static const struct dentry_operations pidfs_dentry_operations = { 342 .d_delete = always_delete_dentry, 343 .d_dname = pidfs_dname, 344 .d_prune = stashed_dentry_prune, 345 }; 346 347 static int pidfs_init_inode(struct inode *inode, void *data) 348 { 349 inode->i_private = data; 350 inode->i_flags |= S_PRIVATE; 351 inode->i_mode |= S_IRWXU; 352 inode->i_op = &pidfs_inode_operations; 353 inode->i_fop = &pidfs_file_operations; 354 /* 355 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This 356 * avoids collisions with the root inode which is 1 for pseudo 357 * filesystems. 358 */ 359 return pidfs_inum(data, &inode->i_ino); 360 } 361 362 static void pidfs_put_data(void *data) 363 { 364 struct pid *pid = data; 365 put_pid(pid); 366 } 367 368 static const struct stashed_operations pidfs_stashed_ops = { 369 .init_inode = pidfs_init_inode, 370 .put_data = pidfs_put_data, 371 }; 372 373 static int pidfs_init_fs_context(struct fs_context *fc) 374 { 375 struct pseudo_fs_context *ctx; 376 377 ctx = init_pseudo(fc, PID_FS_MAGIC); 378 if (!ctx) 379 return -ENOMEM; 380 381 ctx->ops = &pidfs_sops; 382 ctx->dops = &pidfs_dentry_operations; 383 fc->s_fs_info = (void *)&pidfs_stashed_ops; 384 return 0; 385 } 386 387 static struct file_system_type pidfs_type = { 388 .name = "pidfs", 389 .init_fs_context = pidfs_init_fs_context, 390 .kill_sb = kill_anon_super, 391 }; 392 393 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) 394 { 395 396 struct file *pidfd_file; 397 struct path path; 398 int ret; 399 400 ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); 401 if (ret < 0) 402 return ERR_PTR(ret); 403 404 pidfd_file = dentry_open(&path, flags, current_cred()); 405 path_put(&path); 406 return pidfd_file; 407 } 408 409 void __init pidfs_init(void) 410 { 411 pidfs_mnt = kern_mount(&pidfs_type); 412 if (IS_ERR(pidfs_mnt)) 413 panic("Failed to mount pidfs pseudo filesystem"); 414 } 415