// SPDX-License-Identifier: GPL-2.0
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
#include <linux/pidfs.h>
#include <linux/pid_namespace.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
#include <linux/pseudo_fs.h>
#include <linux/ptrace.h>
#include <linux/seq_file.h>
#include <uapi/linux/pidfd.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
#include <linux/utsname.h>
#include <net/net_namespace.h>

#include "internal.h"
#include "mount.h"

#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 * If the pid no longer has a PIDTYPE_PID task attached, -1 is shown.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid that a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = pidfd_pid(f);
	struct pid_namespace *ns;
	pid_t nr = -1;

	/*
	 * 'ns' is only assigned on this path. It is only read further down
	 * when nr > 0, which can only happen if this branch was taken.
	 */
	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	/* The first NSpid entry always repeats the Pid value. */
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/* If nr is positive it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif

/*
 * Poll support for process exit notification.
 *
 * Registers the poller on pid->wait_pidfd and then reports:
 * - EPOLLIN | EPOLLRDNORM | EPOLLHUP if no PIDTYPE_PID task is attached
 *   to the pid anymore,
 * - EPOLLIN | EPOLLRDNORM if the task has a non-zero exit_state and
 *   either the pidfd was opened with PIDFD_THREAD or the thread-group
 *   is empty,
 * - 0 otherwise.
 */
static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
	struct pid *pid = pidfd_pid(file);
	bool thread = file->f_flags & PIDFD_THREAD;
	struct task_struct *task;
	__poll_t poll_flags = 0;

	poll_wait(file, &pid->wait_pidfd, pts);
	/*
	 * Depending on PIDFD_THREAD, inform pollers when the thread
	 * or the whole thread-group exits.
	 */
	/* The task lookup and the dereferences of 'task' below follow the rcu guard. */
	guard(rcu)();
	task = pid_task(pid, PIDTYPE_PID);
	if (!task)
		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
	else if (task->exit_state && (thread || thread_group_empty(task)))
		poll_flags = EPOLLIN | EPOLLRDNORM;

	return poll_flags;
}

/*
 * ioctl handler for pidfds: open a file descriptor for one of the
 * namespaces of the task the pidfd refers to.
 *
 * Return values, in the order the checks are made:
 * - -EINVAL if @arg is non-zero,
 * - -ESRCH if the pid has no task or the task has no nsproxy,
 * - -EACCES if the ptrace access check fails,
 * - -ENOIOCTLCMD if @cmd is not one of the PIDFD_GET_*_NAMESPACE commands,
 * - -EOPNOTSUPP if no namespace was selected, i.e. the matching config
 *   option is disabled or, for PIDFD_GET_PID_NAMESPACE, the task has no
 *   active pid namespace,
 * - otherwise whatever open_namespace() returns.
 *
 * 'task' and 'nsp' are declared with __free() cleanup annotations, so
 * none of the return paths below drop those references by hand.
 */
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct task_struct *task __free(put_task) = NULL;
	struct nsproxy *nsp __free(put_nsproxy) = NULL;
	struct pid *pid = pidfd_pid(file);
	struct ns_common *ns_common = NULL;
	struct pid_namespace *pid_ns;

	/* None of the supported commands takes an argument. */
	if (arg)
		return -EINVAL;

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task)
		return -ESRCH;

	/* Sample task->nsproxy and take a reference while holding task_lock. */
	scoped_guard(task_lock, task) {
		nsp = task->nsproxy;
		if (nsp)
			get_nsproxy(nsp);
	}
	if (!nsp)
		return -ESRCH; /* just pretend it didn't exist */

	/*
	 * We're trying to open a file descriptor to the namespace so perform a
	 * filesystem cred ptrace check. Also, we mirror nsfs behavior.
	 */
	if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS))
		return -EACCES;

	/*
	 * Each case takes a reference on the selected namespace and stores
	 * it in ns_common. If the namespace type is compiled out ns_common
	 * stays NULL.
	 */
	switch (cmd) {
	/* Namespaces that hang off nsproxy. */
	case PIDFD_GET_CGROUP_NAMESPACE:
		if (IS_ENABLED(CONFIG_CGROUPS)) {
			get_cgroup_ns(nsp->cgroup_ns);
			ns_common = to_ns_common(nsp->cgroup_ns);
		}
		break;
	case PIDFD_GET_IPC_NAMESPACE:
		if (IS_ENABLED(CONFIG_IPC_NS)) {
			get_ipc_ns(nsp->ipc_ns);
			ns_common = to_ns_common(nsp->ipc_ns);
		}
		break;
	case PIDFD_GET_MNT_NAMESPACE:
		/* Not guarded by a config option, unlike the other cases. */
		get_mnt_ns(nsp->mnt_ns);
		ns_common = to_ns_common(nsp->mnt_ns);
		break;
	case PIDFD_GET_NET_NAMESPACE:
		if (IS_ENABLED(CONFIG_NET_NS)) {
			/* get_net_ns() is called on the ns_common, so convert first. */
			ns_common = to_ns_common(nsp->net_ns);
			get_net_ns(ns_common);
		}
		break;
	case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE:
		if (IS_ENABLED(CONFIG_PID_NS)) {
			get_pid_ns(nsp->pid_ns_for_children);
			ns_common = to_ns_common(nsp->pid_ns_for_children);
		}
		break;
	case PIDFD_GET_TIME_NAMESPACE:
		if (IS_ENABLED(CONFIG_TIME_NS)) {
			get_time_ns(nsp->time_ns);
			ns_common = to_ns_common(nsp->time_ns);
		}
		break;
	case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE:
		if (IS_ENABLED(CONFIG_TIME_NS)) {
			get_time_ns(nsp->time_ns_for_children);
			ns_common = to_ns_common(nsp->time_ns_for_children);
		}
		break;
	case PIDFD_GET_UTS_NAMESPACE:
		if (IS_ENABLED(CONFIG_UTS_NS)) {
			get_uts_ns(nsp->uts_ns);
			ns_common = to_ns_common(nsp->uts_ns);
		}
		break;
	/* Namespaces that don't hang off nsproxy. */
	case PIDFD_GET_USER_NAMESPACE:
		if (IS_ENABLED(CONFIG_USER_NS)) {
			/* The user namespace is read from the task's creds under RCU. */
			rcu_read_lock();
			ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
			rcu_read_unlock();
		}
		break;
	case PIDFD_GET_PID_NAMESPACE:
		if (IS_ENABLED(CONFIG_PID_NS)) {
			rcu_read_lock();
			pid_ns = task_active_pid_ns(task);
			/* A NULL result leaves ns_common unset, i.e. -EOPNOTSUPP below. */
			if (pid_ns)
				ns_common = to_ns_common(get_pid_ns(pid_ns));
			rcu_read_unlock();
		}
		break;
	default:
		return -ENOIOCTLCMD;
	}

	if (!ns_common)
		return -EOPNOTSUPP;

	/* open_namespace() unconditionally consumes the reference */
	return open_namespace(ns_common);
}

/*
 * File operations for pidfds. pidfd_pid() below identifies pidfs files
 * by comparing f_op against the address of this table.
 */
static const struct file_operations pidfs_file_operations = {
	.poll		= pidfd_poll,
#ifdef CONFIG_PROC_FS
	.show_fdinfo	= pidfd_show_fdinfo,
#endif
	.unlocked_ioctl	= pidfd_ioctl,
	.compat_ioctl   = compat_ptr_ioctl,
};

/*
 * Return the struct pid stored in the inode's i_private for @file, or
 * ERR_PTR(-EBADF) if @file does not use pidfs_file_operations. No
 * reference is taken here.
 */
struct pid *pidfd_pid(const struct file *file)
{
	if (file->f_op != &pidfs_file_operations)
		return ERR_PTR(-EBADF);
	return file_inode(file)->i_private;
}

/* The pidfs mount, set up once by pidfs_init(). */
static struct vfsmount *pidfs_mnt __ro_after_init;

#if BITS_PER_LONG == 32
/*
 * Provide a fallback mechanism for 32-bit systems so processes remain
 * reliably comparable by inode number even on those systems.
 */
static DEFINE_IDA(pidfd_inum_ida);

/*
 * Allocate an inode number from the IDA in the range
 * [RESERVED_PIDS + 1, UINT_MAX] and store it in @ino. @pid is unused in
 * this variant. Any allocation failure is reported as -ENOSPC.
 */
static int pidfs_inum(struct pid *pid, unsigned long *ino)
{
	int ret;

	ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
			      UINT_MAX, GFP_ATOMIC);
	if (ret < 0)
		return -ENOSPC;

	*ino = ret;
	return 0;
}

/* Return @ino to the IDA; an inode number of 0 is ignored. */
static inline void pidfs_free_inum(unsigned long ino)
{
	if (ino > 0)
		ida_free(&pidfd_inum_ida, ino);
}
#else
/* Use the inode number already stored in the struct pid; always returns 0. */
static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
{
	*ino = pid->ino;
	return 0;
}
/* Nothing was allocated in this variant, so freeing is a no-op. */
#define pidfs_free_inum(ino) ((void)(ino))
#endif

/*
 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
 * implemented. Let's reject it completely until we have a clean
 * permission concept for pidfds.
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	return -EOPNOTSUPP;
}


/*
 * User space expects pidfs inodes to have no file type in st_mode.
 *
 * In particular, 'lsof' has this legacy logic:
 *
 *	type = s->st_mode & S_IFMT;
 *	switch (type) {
 *	  ...
 *	case 0:
 *		if (!strcmp(p, "anon_inode"))
 *			Lf->ntype = Ntype = N_ANON_INODE;
 *
 * to detect our old anon_inode logic.
 *
 * Rather than mess with our internal sane inode data, just fix it
 * up here in getattr() by masking off the format bits.
 */
static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
			 struct kstat *stat, u32 request_mask,
			 unsigned int query_flags)
{
	struct inode *inode = d_inode(path->dentry);

	/* Note: &nop_mnt_idmap is passed here, not the @idmap argument. */
	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
	stat->mode &= ~S_IFMT;
	return 0;
}

static const struct inode_operations pidfs_inode_operations = {
	.getattr = pidfs_getattr,
	.setattr = pidfs_setattr,
};

/*
 * Tear down a pidfs inode: drop the struct pid reference stored in
 * i_private and release the inode number (a no-op unless the 32-bit
 * IDA fallback is in use).
 */
static void pidfs_evict_inode(struct inode *inode)
{
	/* Fetch the pid before clear_inode() is called on the inode. */
	struct pid *pid = inode->i_private;

	clear_inode(inode);
	put_pid(pid);
	pidfs_free_inum(inode->i_ino);
}

static const struct super_operations pidfs_sops = {
	.drop_inode	= generic_delete_inode,
	.evict_inode	= pidfs_evict_inode,
	.statfs		= simple_statfs,
};

/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}

static const struct dentry_operations pidfs_dentry_operations = {
	.d_delete	= always_delete_dentry,
	.d_dname	= pidfs_dname,
	.d_prune	= stashed_dentry_prune,
};

/*
 * stashed_operations->init_inode callback: set up a new pidfs inode.
 * @data is stored in i_private and passed to pidfs_inum() as the
 * struct pid. Returns the result of pidfs_inum().
 */
static int pidfs_init_inode(struct inode *inode, void *data)
{
	inode->i_private = data;
	inode->i_flags |= S_PRIVATE;
	inode->i_mode |= S_IRWXU;
	inode->i_op = &pidfs_inode_operations;
	inode->i_fop = &pidfs_file_operations;
	/*
	 * Inode numbering for pidfs starts at RESERVED_PIDS + 1. This
	 * avoids collisions with the root inode which is 1 for pseudo
	 * filesystems.
	 */
	return pidfs_inum(data, &inode->i_ino);
}

/* stashed_operations->put_data callback: drop the struct pid reference in @data. */
static void pidfs_put_data(void *data)
{
	struct pid *pid = data;
	put_pid(pid);
}

static const struct stashed_operations pidfs_stashed_ops = {
	.init_inode = pidfs_init_inode,
	.put_data = pidfs_put_data,
};

/*
 * Set up the pseudo filesystem context for pidfs: install the super and
 * dentry operations and hand the stashed operations over via s_fs_info.
 * Returns -ENOMEM if init_pseudo() fails, 0 otherwise.
 */
static int pidfs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx;

	ctx = init_pseudo(fc, PID_FS_MAGIC);
	if (!ctx)
		return -ENOMEM;

	ctx->ops = &pidfs_sops;
	ctx->dops = &pidfs_dentry_operations;
	/* The cast drops the const qualifier of pidfs_stashed_ops. */
	fc->s_fs_info = (void *)&pidfs_stashed_ops;
	return 0;
}

static struct file_system_type pidfs_type = {
	.name			= "pidfs",
	.init_fs_context	= pidfs_init_fs_context,
	.kill_sb		= kill_anon_super,
};

/*
 * Allocate a pidfd file for @pid, opened with @flags and the caller's
 * credentials.
 *
 * A reference on @pid is taken with get_pid() and handed to
 * path_from_stashed(). If that fails, ERR_PTR() of its error is
 * returned. Otherwise the result of dentry_open() is returned as is
 * (NOTE(review): presumably an ERR_PTR on failure - callers should check
 * with IS_ERR()). The path reference is dropped in either case.
 */
struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
{

	struct file *pidfd_file;
	struct path path;
	int ret;

	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
	if (ret < 0)
		return ERR_PTR(ret);

	pidfd_file = dentry_open(&path, flags, current_cred());
	path_put(&path);
	return pidfd_file;
}

/* Mount the pidfs pseudo filesystem at init time; failure to do so is fatal. */
void __init pidfs_init(void)
{
	pidfs_mnt = kern_mount(&pidfs_type);
	if (IS_ERR(pidfs_mnt))
		panic("Failed to mount pidfs pseudo filesystem");
}