xref: /linux/fs/pidfs.c (revision 196e9f5bf2c566682f52fb6a25276794dded2fe9)
1  // SPDX-License-Identifier: GPL-2.0
2  #include <linux/anon_inodes.h>
3  #include <linux/file.h>
4  #include <linux/fs.h>
5  #include <linux/magic.h>
6  #include <linux/mount.h>
7  #include <linux/pid.h>
8  #include <linux/pidfs.h>
9  #include <linux/pid_namespace.h>
10  #include <linux/poll.h>
11  #include <linux/proc_fs.h>
12  #include <linux/proc_ns.h>
13  #include <linux/pseudo_fs.h>
14  #include <linux/seq_file.h>
15  #include <uapi/linux/pidfd.h>
16  
17  #include "internal.h"
18  
19  #ifdef CONFIG_PROC_FS
20  /**
21   * pidfd_show_fdinfo - print information about a pidfd
22   * @m: proc fdinfo file
23   * @f: file referencing a pidfd
24   *
25   * Pid:
26   * This function will print the pid that a given pidfd refers to in the
27   * pid namespace of the procfs instance.
28   * If the pid namespace of the process is not a descendant of the pid
29   * namespace of the procfs instance 0 will be shown as its pid. This is
30   * similar to calling getppid() on a process whose parent is outside of
31   * its pid namespace.
32   *
33   * NSpid:
34   * If pid namespaces are supported then this function will also print
 * the pid that a given pidfd refers to for all descendant pid namespaces
36   * starting from the current pid namespace of the instance, i.e. the
37   * Pid field and the first entry in the NSpid field will be identical.
38   * If the pid namespace of the process is not a descendant of the pid
39   * namespace of the procfs instance 0 will be shown as its first NSpid
40   * entry and no others will be shown.
41   * Note that this differs from the Pid and NSpid fields in
42   * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
44   * obvious when sending around a pidfd between pid namespaces from a
45   * different branch of the tree, i.e. where no ancestral relation is
46   * present between the pid namespaces:
47   * - create two new pid namespaces ns1 and ns2 in the initial pid
48   *   namespace (also take care to create new mount namespaces in the
49   *   new pid namespace and mount procfs)
50   * - create a process with a pidfd in ns1
51   * - send pidfd from ns1 to ns2
52   * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
53   *   have exactly one entry, which is 0
54   */
55  static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
56  {
57  	struct pid *pid = pidfd_pid(f);
58  	struct pid_namespace *ns;
59  	pid_t nr = -1;
60  
61  	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
62  		ns = proc_pid_ns(file_inode(m->file)->i_sb);
63  		nr = pid_nr_ns(pid, ns);
64  	}
65  
66  	seq_put_decimal_ll(m, "Pid:\t", nr);
67  
68  #ifdef CONFIG_PID_NS
69  	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
70  	if (nr > 0) {
71  		int i;
72  
73  		/* If nr is non-zero it means that 'pid' is valid and that
74  		 * ns, i.e. the pid namespace associated with the procfs
75  		 * instance, is in the pid namespace hierarchy of pid.
76  		 * Start at one below the already printed level.
77  		 */
78  		for (i = ns->level + 1; i <= pid->level; i++)
79  			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
80  	}
81  #endif
82  	seq_putc(m, '\n');
83  }
84  #endif
85  
86  /*
87   * Poll support for process exit notification.
88   */
89  static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
90  {
91  	struct pid *pid = pidfd_pid(file);
92  	bool thread = file->f_flags & PIDFD_THREAD;
93  	struct task_struct *task;
94  	__poll_t poll_flags = 0;
95  
96  	poll_wait(file, &pid->wait_pidfd, pts);
97  	/*
98  	 * Depending on PIDFD_THREAD, inform pollers when the thread
99  	 * or the whole thread-group exits.
100  	 */
101  	guard(rcu)();
102  	task = pid_task(pid, PIDTYPE_PID);
103  	if (!task)
104  		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
105  	else if (task->exit_state && (thread || thread_group_empty(task)))
106  		poll_flags = EPOLLIN | EPOLLRDNORM;
107  
108  	return poll_flags;
109  }
110  
111  static const struct file_operations pidfs_file_operations = {
112  	.poll		= pidfd_poll,
113  #ifdef CONFIG_PROC_FS
114  	.show_fdinfo	= pidfd_show_fdinfo,
115  #endif
116  };
117  
118  struct pid *pidfd_pid(const struct file *file)
119  {
120  	if (file->f_op != &pidfs_file_operations)
121  		return ERR_PTR(-EBADF);
122  	return file_inode(file)->i_private;
123  }
124  
/*
 * Kernel-internal mount of pidfs. Assigned by pidfs_init() and handed to
 * path_from_stashed() by pidfs_alloc_file().
 */
static struct vfsmount *pidfs_mnt __ro_after_init;
126  
127  #if BITS_PER_LONG == 32
128  /*
129   * Provide a fallback mechanism for 32-bit systems so processes remain
130   * reliably comparable by inode number even on those systems.
131   */
132  static DEFINE_IDA(pidfd_inum_ida);
133  
134  static int pidfs_inum(struct pid *pid, unsigned long *ino)
135  {
136  	int ret;
137  
138  	ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
139  			      UINT_MAX, GFP_ATOMIC);
140  	if (ret < 0)
141  		return -ENOSPC;
142  
143  	*ino = ret;
144  	return 0;
145  }
146  
147  static inline void pidfs_free_inum(unsigned long ino)
148  {
149  	if (ino > 0)
150  		ida_free(&pidfd_inum_ida, ino);
151  }
152  #else
153  static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
154  {
155  	*ino = pid->ino;
156  	return 0;
157  }
158  #define pidfs_free_inum(ino) ((void)(ino))
159  #endif
160  
161  /*
162   * The vfs falls back to simple_setattr() if i_op->setattr() isn't
163   * implemented. Let's reject it completely until we have a clean
164   * permission concept for pidfds.
165   */
166  static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
167  			 struct iattr *attr)
168  {
169  	return -EOPNOTSUPP;
170  }
171  
172  
173  /*
174   * User space expects pidfs inodes to have no file type in st_mode.
175   *
176   * In particular, 'lsof' has this legacy logic:
177   *
178   *	type = s->st_mode & S_IFMT;
179   *	switch (type) {
180   *	  ...
181   *	case 0:
182   *		if (!strcmp(p, "anon_inode"))
183   *			Lf->ntype = Ntype = N_ANON_INODE;
184   *
185   * to detect our old anon_inode logic.
186   *
187   * Rather than mess with our internal sane inode data, just fix it
188   * up here in getattr() by masking off the format bits.
189   */
190  static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
191  			 struct kstat *stat, u32 request_mask,
192  			 unsigned int query_flags)
193  {
194  	struct inode *inode = d_inode(path->dentry);
195  
196  	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
197  	stat->mode &= ~S_IFMT;
198  	return 0;
199  }
200  
201  static const struct inode_operations pidfs_inode_operations = {
202  	.getattr = pidfs_getattr,
203  	.setattr = pidfs_setattr,
204  };
205  
206  static void pidfs_evict_inode(struct inode *inode)
207  {
208  	struct pid *pid = inode->i_private;
209  
210  	clear_inode(inode);
211  	put_pid(pid);
212  	pidfs_free_inum(inode->i_ino);
213  }
214  
215  static const struct super_operations pidfs_sops = {
216  	.drop_inode	= generic_delete_inode,
217  	.evict_inode	= pidfs_evict_inode,
218  	.statfs		= simple_statfs,
219  };
220  
/*
 * 'lsof' knows about our historical anon_inode use and expects the
 * pidfs dentry name to start with 'anon_inode', so keep reporting the
 * old name.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	/* The name is a fixed string; @dentry does not influence it. */
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
229  
230  static const struct dentry_operations pidfs_dentry_operations = {
231  	.d_delete	= always_delete_dentry,
232  	.d_dname	= pidfs_dname,
233  	.d_prune	= stashed_dentry_prune,
234  };
235  
236  static int pidfs_init_inode(struct inode *inode, void *data)
237  {
238  	inode->i_private = data;
239  	inode->i_flags |= S_PRIVATE;
240  	inode->i_mode |= S_IRWXU;
241  	inode->i_op = &pidfs_inode_operations;
242  	inode->i_fop = &pidfs_file_operations;
243  	/*
244  	 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
245  	 * avoids collisions with the root inode which is 1 for pseudo
246  	 * filesystems.
247  	 */
248  	return pidfs_inum(data, &inode->i_ino);
249  }
250  
/* Release hook for stashed data: @data is a struct pid, drop one reference. */
static void pidfs_put_data(void *data)
{
	put_pid(data);
}
256  
257  static const struct stashed_operations pidfs_stashed_ops = {
258  	.init_inode = pidfs_init_inode,
259  	.put_data = pidfs_put_data,
260  };
261  
262  static int pidfs_init_fs_context(struct fs_context *fc)
263  {
264  	struct pseudo_fs_context *ctx;
265  
266  	ctx = init_pseudo(fc, PID_FS_MAGIC);
267  	if (!ctx)
268  		return -ENOMEM;
269  
270  	ctx->ops = &pidfs_sops;
271  	ctx->dops = &pidfs_dentry_operations;
272  	fc->s_fs_info = (void *)&pidfs_stashed_ops;
273  	return 0;
274  }
275  
276  static struct file_system_type pidfs_type = {
277  	.name			= "pidfs",
278  	.init_fs_context	= pidfs_init_fs_context,
279  	.kill_sb		= kill_anon_super,
280  };
281  
282  struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
283  {
284  
285  	struct file *pidfd_file;
286  	struct path path;
287  	int ret;
288  
289  	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
290  	if (ret < 0)
291  		return ERR_PTR(ret);
292  
293  	pidfd_file = dentry_open(&path, flags, current_cred());
294  	path_put(&path);
295  	return pidfd_file;
296  }
297  
298  void __init pidfs_init(void)
299  {
300  	pidfs_mnt = kern_mount(&pidfs_type);
301  	if (IS_ERR(pidfs_mnt))
302  		panic("Failed to mount pidfs pseudo filesystem");
303  }
304