xref: /linux/fs/namespace.c (revision adc4fb9c814b5d5cc6021022900fd5eb0b3c8165)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   *  linux/fs/namespace.c
4   *
5   * (C) Copyright Al Viro 2000, 2001
6   *
7   * Based on code from fs/super.c, copyright Linus Torvalds and others.
8   * Heavily rewritten.
9   */
10  
11  #include <linux/syscalls.h>
12  #include <linux/export.h>
13  #include <linux/capability.h>
14  #include <linux/mnt_namespace.h>
15  #include <linux/user_namespace.h>
16  #include <linux/namei.h>
17  #include <linux/security.h>
18  #include <linux/cred.h>
19  #include <linux/idr.h>
20  #include <linux/init.h>		/* init_rootfs */
21  #include <linux/fs_struct.h>	/* get_fs_root et.al. */
22  #include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
23  #include <linux/file.h>
24  #include <linux/uaccess.h>
25  #include <linux/proc_ns.h>
26  #include <linux/magic.h>
27  #include <linux/memblock.h>
28  #include <linux/proc_fs.h>
29  #include <linux/task_work.h>
30  #include <linux/sched/task.h>
31  #include <uapi/linux/mount.h>
32  #include <linux/fs_context.h>
33  #include <linux/shmem_fs.h>
34  #include <linux/mnt_idmapping.h>
35  #include <linux/pidfs.h>
36  
37  #include "pnode.h"
38  #include "internal.h"
39  
40  /* Maximum number of mounts in a mount namespace */
41  static unsigned int sysctl_mount_max __read_mostly = 100000;
42  
43  static unsigned int m_hash_mask __ro_after_init;
44  static unsigned int m_hash_shift __ro_after_init;
45  static unsigned int mp_hash_mask __ro_after_init;
46  static unsigned int mp_hash_shift __ro_after_init;
47  
48  static __initdata unsigned long mhash_entries;
49  static int __init set_mhash_entries(char *str)
50  {
51  	if (!str)
52  		return 0;
53  	mhash_entries = simple_strtoul(str, &str, 0);
54  	return 1;
55  }
56  __setup("mhash_entries=", set_mhash_entries);
57  
58  static __initdata unsigned long mphash_entries;
59  static int __init set_mphash_entries(char *str)
60  {
61  	if (!str)
62  		return 0;
63  	mphash_entries = simple_strtoul(str, &str, 0);
64  	return 1;
65  }
66  __setup("mphash_entries=", set_mphash_entries);
67  
68  static u64 event;
69  static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
70  static DEFINE_IDA(mnt_group_ida);
71  
72  /* Don't allow confusion with old 32bit mount ID */
73  #define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
74  static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
75  
76  static struct hlist_head *mount_hashtable __ro_after_init;
77  static struct hlist_head *mountpoint_hashtable __ro_after_init;
78  static struct kmem_cache *mnt_cache __ro_after_init;
79  static DECLARE_RWSEM(namespace_sem);
80  static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
81  static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
82  static DEFINE_SEQLOCK(mnt_ns_tree_lock);
83  
84  #ifdef CONFIG_FSNOTIFY
85  LIST_HEAD(notify_list); /* protected by namespace_sem */
86  #endif
87  static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
88  static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
89  
90  enum mount_kattr_flags_t {
91  	MOUNT_KATTR_RECURSE		= (1 << 0),
92  	MOUNT_KATTR_IDMAP_REPLACE	= (1 << 1),
93  };
94  
95  struct mount_kattr {
96  	unsigned int attr_set;
97  	unsigned int attr_clr;
98  	unsigned int propagation;
99  	unsigned int lookup_flags;
100  	enum mount_kattr_flags_t kflags;
101  	struct user_namespace *mnt_userns;
102  	struct mnt_idmap *mnt_idmap;
103  };
104  
105  /* /sys/fs */
106  struct kobject *fs_kobj __ro_after_init;
107  EXPORT_SYMBOL_GPL(fs_kobj);
108  
109  /*
110   * vfsmount lock may be taken for read to prevent changes to the
111   * vfsmount hash, ie. during mountpoint lookups or walking back
112   * up the tree.
113   *
114   * It should be taken for write in all cases where the vfsmount
115   * tree or hash is modified or when a vfsmount structure is modified.
116   */
117  __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
118  
119  static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
120  {
121  	if (!node)
122  		return NULL;
123  	return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
124  }
125  
126  static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
127  {
128  	struct mnt_namespace *ns_a = node_to_mnt_ns(a);
129  	struct mnt_namespace *ns_b = node_to_mnt_ns(b);
130  	u64 seq_a = ns_a->seq;
131  	u64 seq_b = ns_b->seq;
132  
133  	if (seq_a < seq_b)
134  		return -1;
135  	if (seq_a > seq_b)
136  		return 1;
137  	return 0;
138  }
139  
140  static inline void mnt_ns_tree_write_lock(void)
141  {
142  	write_seqlock(&mnt_ns_tree_lock);
143  }
144  
145  static inline void mnt_ns_tree_write_unlock(void)
146  {
147  	write_sequnlock(&mnt_ns_tree_lock);
148  }
149  
150  static void mnt_ns_tree_add(struct mnt_namespace *ns)
151  {
152  	struct rb_node *node, *prev;
153  
154  	mnt_ns_tree_write_lock();
155  	node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
156  	/*
157  	 * If there's no previous entry, simply add it after the
158  	 * head; if there is one, add it after the previous entry.
159  	 */
160  	prev = rb_prev(&ns->mnt_ns_tree_node);
161  	if (!prev)
162  		list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
163  	else
164  		list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
165  	mnt_ns_tree_write_unlock();
166  
167  	WARN_ON_ONCE(node);
168  }
169  
170  static void mnt_ns_release(struct mnt_namespace *ns)
171  {
172  	/* keep alive for {list,stat}mount() */
173  	if (refcount_dec_and_test(&ns->passive)) {
174  		fsnotify_mntns_delete(ns);
175  		put_user_ns(ns->user_ns);
176  		kfree(ns);
177  	}
178  }
179  DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
180  
181  static void mnt_ns_release_rcu(struct rcu_head *rcu)
182  {
183  	mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
184  }
185  
186  static void mnt_ns_tree_remove(struct mnt_namespace *ns)
187  {
188  	/* remove from global mount namespace list */
189  	if (!is_anon_ns(ns)) {
190  		mnt_ns_tree_write_lock();
191  		rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
192  		list_bidir_del_rcu(&ns->mnt_ns_list);
193  		mnt_ns_tree_write_unlock();
194  	}
195  
196  	call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
197  }
198  
199  static int mnt_ns_find(const void *key, const struct rb_node *node)
200  {
201  	const u64 mnt_ns_id = *(u64 *)key;
202  	const struct mnt_namespace *ns = node_to_mnt_ns(node);
203  
204  	if (mnt_ns_id < ns->seq)
205  		return -1;
206  	if (mnt_ns_id > ns->seq)
207  		return 1;
208  	return 0;
209  }
210  
211  /*
212   * Lookup a mount namespace by id and take a passive reference count. Taking a
213   * passive reference means the mount namespace can be emptied if e.g., the last
214   * passive reference means the mount namespace can be emptied if, e.g., the last
215   * namespace the @namespace_sem must first be acquired. If the namespace has
216   * already shut down before acquiring @namespace_sem, {list,stat}mount() will
217   * see that the mount rbtree of the namespace is empty.
218   *
219   * Note the lookup is lockless protected by a sequence counter. We only
220   * need to guard against false negatives as false positives aren't
221   * possible. So if we didn't find a mount namespace and the sequence
222   * counter has changed we need to retry. If the sequence counter is
223   * still the same we know the search actually failed.
224   */
225  static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
226  {
227  	struct mnt_namespace *ns;
228  	struct rb_node *node;
229  	unsigned int seq;
230  
231  	guard(rcu)();
232  	do {
233  		seq = read_seqbegin(&mnt_ns_tree_lock);
234  		node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
235  		if (node)
236  			break;
237  	} while (read_seqretry(&mnt_ns_tree_lock, seq));
238  
239  	if (!node)
240  		return NULL;
241  
242  	/*
243  	 * The last reference count is put with RCU delay so we can
244  	 * unconditionally acquire a reference here.
245  	 */
246  	ns = node_to_mnt_ns(node);
247  	refcount_inc(&ns->passive);
248  	return ns;
249  }
250  
251  static inline void lock_mount_hash(void)
252  {
253  	write_seqlock(&mount_lock);
254  }
255  
256  static inline void unlock_mount_hash(void)
257  {
258  	write_sequnlock(&mount_lock);
259  }
260  
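/*
 * Hash a (parent mount, mountpoint dentry) pair into mount_hashtable;
 * mp_hash() below does the same for mountpoint_hashtable using the dentry
 * alone. The pointers are divided by L1_CACHE_BYTES first to discard low
 * bits that carry little entropy due to object alignment.
 */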
261  static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
262  {
263  	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
264  	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
265  	tmp = tmp + (tmp >> m_hash_shift);
266  	return &mount_hashtable[tmp & m_hash_mask];
267  }
268  
269  static inline struct hlist_head *mp_hash(struct dentry *dentry)
270  {
271  	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
272  	tmp = tmp + (tmp >> mp_hash_shift);
273  	return &mountpoint_hashtable[tmp & mp_hash_mask];
274  }
275  
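/*
 * Every mount carries two identifiers: mnt_id, a small value allocated from
 * mnt_id_xa that can be reused once the mount is gone (the classic mount ID
 * userspace sees in /proc/<pid>/mountinfo), and mnt_id_unique, taken from
 * mnt_id_ctr, which only ever grows and starts above the 32-bit range
 * (MNT_UNIQUE_ID_OFFSET) so the two ID spaces cannot be confused.
 */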
276  static int mnt_alloc_id(struct mount *mnt)
277  {
278  	int res;
279  
280  	xa_lock(&mnt_id_xa);
281  	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
282  	if (!res)
283  		mnt->mnt_id_unique = ++mnt_id_ctr;
284  	xa_unlock(&mnt_id_xa);
285  	return res;
286  }
287  
288  static void mnt_free_id(struct mount *mnt)
289  {
290  	xa_erase(&mnt_id_xa, mnt->mnt_id);
291  }
292  
293  /*
294   * Allocate a new peer group ID
295   */
296  static int mnt_alloc_group_id(struct mount *mnt)
297  {
298  	int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);
299  
300  	if (res < 0)
301  		return res;
302  	mnt->mnt_group_id = res;
303  	return 0;
304  }
305  
306  /*
307   * Release a peer group ID
308   */
309  void mnt_release_group_id(struct mount *mnt)
310  {
311  	ida_free(&mnt_group_ida, mnt->mnt_group_id);
312  	mnt->mnt_group_id = 0;
313  }
314  
315  /*
316   * vfsmount lock must be held for read
317   */
318  static inline void mnt_add_count(struct mount *mnt, int n)
319  {
320  #ifdef CONFIG_SMP
321  	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
322  #else
323  	preempt_disable();
324  	mnt->mnt_count += n;
325  	preempt_enable();
326  #endif
327  }
328  
329  /*
330   * vfsmount lock must be held for write
331   */
332  int mnt_get_count(struct mount *mnt)
333  {
334  #ifdef CONFIG_SMP
335  	int count = 0;
336  	int cpu;
337  
338  	for_each_possible_cpu(cpu) {
339  		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
340  	}
341  
342  	return count;
343  #else
344  	return mnt->mnt_count;
345  #endif
346  }
347  
348  static struct mount *alloc_vfsmnt(const char *name)
349  {
350  	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
351  	if (mnt) {
352  		int err;
353  
354  		err = mnt_alloc_id(mnt);
355  		if (err)
356  			goto out_free_cache;
357  
358  		if (name) {
359  			mnt->mnt_devname = kstrdup_const(name,
360  							 GFP_KERNEL_ACCOUNT);
361  			if (!mnt->mnt_devname)
362  				goto out_free_id;
363  		}
364  
365  #ifdef CONFIG_SMP
366  		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
367  		if (!mnt->mnt_pcp)
368  			goto out_free_devname;
369  
370  		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
371  #else
372  		mnt->mnt_count = 1;
373  		mnt->mnt_writers = 0;
374  #endif
375  
376  		INIT_HLIST_NODE(&mnt->mnt_hash);
377  		INIT_LIST_HEAD(&mnt->mnt_child);
378  		INIT_LIST_HEAD(&mnt->mnt_mounts);
379  		INIT_LIST_HEAD(&mnt->mnt_list);
380  		INIT_LIST_HEAD(&mnt->mnt_expire);
381  		INIT_LIST_HEAD(&mnt->mnt_share);
382  		INIT_LIST_HEAD(&mnt->mnt_slave_list);
383  		INIT_LIST_HEAD(&mnt->mnt_slave);
384  		INIT_HLIST_NODE(&mnt->mnt_mp_list);
385  		INIT_LIST_HEAD(&mnt->mnt_umounting);
386  		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
387  		RB_CLEAR_NODE(&mnt->mnt_node);
388  		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
389  	}
390  	return mnt;
391  
392  #ifdef CONFIG_SMP
393  out_free_devname:
394  	kfree_const(mnt->mnt_devname);
395  #endif
396  out_free_id:
397  	mnt_free_id(mnt);
398  out_free_cache:
399  	kmem_cache_free(mnt_cache, mnt);
400  	return NULL;
401  }
402  
403  /*
404   * Most r/o checks on a fs are for operations that take
405   * discrete amounts of time, like a write() or unlink().
406   * We must keep track of when those operations start
407   * (for permission checks) and when they end, so that
408   * we can determine when writes are able to occur to
409   * a filesystem.
410   */
411  /*
412   * __mnt_is_readonly: check whether a mount is read-only
413   * @mnt: the mount to check for its write status
414   *
415   * This shouldn't be used directly outside of the VFS.
416   * It does not guarantee that the filesystem will stay
417   * r/w, just that it is right *now*.  This can not and
418   * should not be used in place of IS_RDONLY(inode).
419   * mnt_want/drop_write() will _keep_ the filesystem
420   * r/w.
421   */
422  bool __mnt_is_readonly(struct vfsmount *mnt)
423  {
424  	return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
425  }
426  EXPORT_SYMBOL_GPL(__mnt_is_readonly);
427  
428  static inline void mnt_inc_writers(struct mount *mnt)
429  {
430  #ifdef CONFIG_SMP
431  	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
432  #else
433  	mnt->mnt_writers++;
434  #endif
435  }
436  
437  static inline void mnt_dec_writers(struct mount *mnt)
438  {
439  #ifdef CONFIG_SMP
440  	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
441  #else
442  	mnt->mnt_writers--;
443  #endif
444  }
445  
446  static unsigned int mnt_get_writers(struct mount *mnt)
447  {
448  #ifdef CONFIG_SMP
449  	unsigned int count = 0;
450  	int cpu;
451  
452  	for_each_possible_cpu(cpu) {
453  		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
454  	}
455  
456  	return count;
457  #else
458  	return mnt->mnt_writers;
459  #endif
460  }
461  
462  static int mnt_is_readonly(struct vfsmount *mnt)
463  {
464  	if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
465  		return 1;
466  	/*
467  	 * The barrier pairs with the barrier in sb_start_ro_state_change()
468  	 * making sure if we don't see s_readonly_remount set yet, we also will
469  	 * not see any superblock / mount flag changes done by remount.
470  	 * It also pairs with the barrier in sb_end_ro_state_change()
471  	 * assuring that if we see s_readonly_remount already cleared, we will
472  	 * see the values of superblock / mount flags updated by remount.
473  	 */
474  	smp_rmb();
475  	return __mnt_is_readonly(mnt);
476  }
477  
478  /*
479   * Most r/o & frozen checks on a fs are for operations that take discrete
480   * amounts of time, like a write() or unlink().  We must keep track of when
481   * those operations start (for permission checks) and when they end, so that we
482   * can determine when writes are able to occur to a filesystem.
483   */
484  /**
485   * mnt_get_write_access - get write access to a mount without freeze protection
486   * @m: the mount on which to take a write
487   *
488   * This tells the low-level filesystem that a write is about to be performed to
489   * it, and makes sure that writes are allowed (the mount is read-write) before
490   * returning success. This operation does not protect against filesystem being
491   * frozen. When the write operation is finished, mnt_put_write_access() must be
492   * called. This is effectively a refcount.
493   */
494  int mnt_get_write_access(struct vfsmount *m)
495  {
496  	struct mount *mnt = real_mount(m);
497  	int ret = 0;
498  
499  	preempt_disable();
500  	mnt_inc_writers(mnt);
501  	/*
502  	 * The store to mnt_inc_writers must be visible before we pass
503  	 * The store done by mnt_inc_writers() above must be visible before we
504  	 * pass the MNT_WRITE_HOLD loop below, so that the slowpath can see our
505  	 */
506  	smp_mb();
507  	might_lock(&mount_lock.lock);
508  	while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) {
509  		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
510  			cpu_relax();
511  		} else {
512  			/*
513  			 * This prevents priority inversion, if the task
514  			 * setting MNT_WRITE_HOLD got preempted on a remote
515  			 * CPU, and it prevents live lock if the task setting
516  			 * MNT_WRITE_HOLD has a lower priority and is bound to
517  			 * the same CPU as the task that is spinning here.
518  			 */
519  			preempt_enable();
520  			lock_mount_hash();
521  			unlock_mount_hash();
522  			preempt_disable();
523  		}
524  	}
525  	/*
526  	 * The barrier pairs with the barrier in sb_start_ro_state_change() making
527  	 * sure that if we see MNT_WRITE_HOLD cleared, we will also see
528  	 * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
529  	 * mnt_is_readonly() and bail in case we are racing with remount
530  	 * read-only.
531  	 */
532  	smp_rmb();
533  	if (mnt_is_readonly(m)) {
534  		mnt_dec_writers(mnt);
535  		ret = -EROFS;
536  	}
537  	preempt_enable();
538  
539  	return ret;
540  }
541  EXPORT_SYMBOL_GPL(mnt_get_write_access);
542  
543  /**
544   * mnt_want_write - get write access to a mount
545   * @m: the mount on which to take a write
546   *
547   * This tells the low-level filesystem that a write is about to be performed to
548   * it, and makes sure that writes are allowed (mount is read-write, filesystem
549   * is not frozen) before returning success.  When the write operation is
550   * finished, mnt_drop_write() must be called.  This is effectively a refcount.
551   */
552  int mnt_want_write(struct vfsmount *m)
553  {
554  	int ret;
555  
556  	sb_start_write(m->mnt_sb);
557  	ret = mnt_get_write_access(m);
558  	if (ret)
559  		sb_end_write(m->mnt_sb);
560  	return ret;
561  }
562  EXPORT_SYMBOL_GPL(mnt_want_write);
563  
564  /**
565   * mnt_get_write_access_file - get write access to a file's mount
566   * @file: the file whose mount to take a write on
567   *
568   * This is like mnt_get_write_access, but if @file is already open for write it
569   * skips incrementing mnt_writers (since the open file already has a reference)
570   * and instead only does the check for emergency r/o remounts.  This must be
571   * paired with mnt_put_write_access_file.
572   */
573  int mnt_get_write_access_file(struct file *file)
574  {
575  	if (file->f_mode & FMODE_WRITER) {
576  		/*
577  		 * Superblock may have become readonly while there are still
578  		 * writable fd's, e.g. due to a fs error with errors=remount-ro
579  		 */
580  		if (__mnt_is_readonly(file->f_path.mnt))
581  			return -EROFS;
582  		return 0;
583  	}
584  	return mnt_get_write_access(file->f_path.mnt);
585  }
586  
587  /**
588   * mnt_want_write_file - get write access to a file's mount
589   * @file: the file whose mount to take a write on
590   *
591   * This is like mnt_want_write, but if the file is already open for writing it
592   * skips incrementing mnt_writers (since the open file already has a reference)
593   * and instead only does the freeze protection and the check for emergency r/o
594   * remounts.  This must be paired with mnt_drop_write_file.
595   */
596  int mnt_want_write_file(struct file *file)
597  {
598  	int ret;
599  
600  	sb_start_write(file_inode(file)->i_sb);
601  	ret = mnt_get_write_access_file(file);
602  	if (ret)
603  		sb_end_write(file_inode(file)->i_sb);
604  	return ret;
605  }
606  EXPORT_SYMBOL_GPL(mnt_want_write_file);
607  
608  /**
609   * mnt_put_write_access - give up write access to a mount
610   * @mnt: the mount on which to give up write access
611   *
612   * Tells the low-level filesystem that we are done
613   * performing writes to it.  Must be matched with
614   * mnt_get_write_access() call above.
615   */
616  void mnt_put_write_access(struct vfsmount *mnt)
617  {
618  	preempt_disable();
619  	mnt_dec_writers(real_mount(mnt));
620  	preempt_enable();
621  }
622  EXPORT_SYMBOL_GPL(mnt_put_write_access);
623  
624  /**
625   * mnt_drop_write - give up write access to a mount
626   * @mnt: the mount on which to give up write access
627   *
628   * Tells the low-level filesystem that we are done performing writes to it and
629   * also allows filesystem to be frozen again.  Must be matched with
630   * mnt_want_write() call above.
631   */
632  void mnt_drop_write(struct vfsmount *mnt)
633  {
634  	mnt_put_write_access(mnt);
635  	sb_end_write(mnt->mnt_sb);
636  }
637  EXPORT_SYMBOL_GPL(mnt_drop_write);
638  
639  void mnt_put_write_access_file(struct file *file)
640  {
641  	if (!(file->f_mode & FMODE_WRITER))
642  		mnt_put_write_access(file->f_path.mnt);
643  }
644  
645  void mnt_drop_write_file(struct file *file)
646  {
647  	mnt_put_write_access_file(file);
648  	sb_end_write(file_inode(file)->i_sb);
649  }
650  EXPORT_SYMBOL(mnt_drop_write_file);
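/*
 * Illustrative pairing of the write-access helpers above (a sketch, not a
 * caller in this file): a typical modification path takes write access on
 * the mount before changing anything and drops it when done, e.g.
 *
 *	err = mnt_want_write(path->mnt);
 *	if (err)
 *		return err;
 *	... modify the filesystem ...
 *	mnt_drop_write(path->mnt);
 *
 * mnt_get_write_access()/mnt_put_write_access() are the same pairing minus
 * the sb_start_write()/sb_end_write() freeze protection, and the *_file()
 * variants operate on the mount of an already-open struct file.
 */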
651  
652  /**
653   * mnt_hold_writers - prevent write access to the given mount
654   * @mnt: mnt to prevent write access to
655   *
656   * Prevents write access to @mnt if there are no active writers for @mnt.
657   * This function needs to be called and return successfully before changing
658   * properties of @mnt that need to remain stable for callers with write access
659   * to @mnt.
660   *
661   * After this function has been called successfully, callers must pair it with
662   * a call to mnt_unhold_writers() in order to stop preventing write access to
663   * @mnt.
664   *
665   * Context: This function expects lock_mount_hash() to be held serializing
666   *          setting MNT_WRITE_HOLD.
667   * Return: On success 0 is returned.
668   *	   On error, -EBUSY is returned.
669   */
670  static inline int mnt_hold_writers(struct mount *mnt)
671  {
672  	mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
673  	/*
674  	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
675  	 * should be visible before we do.
676  	 */
677  	smp_mb();
678  
679  	/*
680  	 * With writers on hold, if this value is zero, then there are
681  	 * definitely no active writers (although held writers may subsequently
682  	 * increment the count, they'll have to wait, and decrement it after
683  	 * seeing MNT_READONLY).
684  	 *
685  	 * It is OK to have counter incremented on one CPU and decremented on
686  	 * another: the sum will add up correctly. The danger would be when we
687  	 * sum up each counter, if we read a counter before it is incremented,
688  	 * but then read another CPU's count which it has been subsequently
689  	 * decremented from -- we would see more decrements than we should.
690  	 * MNT_WRITE_HOLD protects against this scenario, because
691  	 * mnt_want_write first increments count, then smp_mb, then spins on
692  	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
693  	 * we're counting up here.
694  	 */
695  	if (mnt_get_writers(mnt) > 0)
696  		return -EBUSY;
697  
698  	return 0;
699  }
700  
701  /**
702   * mnt_unhold_writers - stop preventing write access to the given mount
703   * @mnt: mnt to stop preventing write access to
704   *
705   * Stop preventing write access to @mnt allowing callers to gain write access
706   * to @mnt again.
707   *
708   * This function can only be called after a successful call to
709   * mnt_hold_writers().
710   *
711   * Context: This function expects lock_mount_hash() to be held.
712   */
713  static inline void mnt_unhold_writers(struct mount *mnt)
714  {
715  	/*
716  	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
717  	 * that become unheld will see MNT_READONLY.
718  	 */
719  	smp_wmb();
720  	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
721  }
722  
723  static int mnt_make_readonly(struct mount *mnt)
724  {
725  	int ret;
726  
727  	ret = mnt_hold_writers(mnt);
728  	if (!ret)
729  		mnt->mnt.mnt_flags |= MNT_READONLY;
730  	mnt_unhold_writers(mnt);
731  	return ret;
732  }
733  
734  int sb_prepare_remount_readonly(struct super_block *sb)
735  {
736  	struct mount *mnt;
737  	int err = 0;
738  
739  	/* Racy optimization.  Recheck the counter under MNT_WRITE_HOLD */
740  	if (atomic_long_read(&sb->s_remove_count))
741  		return -EBUSY;
742  
743  	lock_mount_hash();
744  	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
745  		if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
746  			err = mnt_hold_writers(mnt);
747  			if (err)
748  				break;
749  		}
750  	}
751  	if (!err && atomic_long_read(&sb->s_remove_count))
752  		err = -EBUSY;
753  
754  	if (!err)
755  		sb_start_ro_state_change(sb);
756  	list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
757  		if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
758  			mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
759  	}
760  	unlock_mount_hash();
761  
762  	return err;
763  }
764  
765  static void free_vfsmnt(struct mount *mnt)
766  {
767  	mnt_idmap_put(mnt_idmap(&mnt->mnt));
768  	kfree_const(mnt->mnt_devname);
769  #ifdef CONFIG_SMP
770  	free_percpu(mnt->mnt_pcp);
771  #endif
772  	kmem_cache_free(mnt_cache, mnt);
773  }
774  
775  static void delayed_free_vfsmnt(struct rcu_head *head)
776  {
777  	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
778  }
779  
780  /* call under rcu_read_lock */
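/*
 * Return values: 0 means @bastard is legitimate and, unless it was NULL, now
 * holds an extra reference; 1 means no reference is held and the caller
 * should retry the lookup; -1 means a reference was taken but the mount
 * could not be legitimized, so the caller must drop it with mntput().
 */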
781  int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
782  {
783  	struct mount *mnt;
784  	if (read_seqretry(&mount_lock, seq))
785  		return 1;
786  	if (bastard == NULL)
787  		return 0;
788  	mnt = real_mount(bastard);
789  	mnt_add_count(mnt, 1);
790  	smp_mb();			// see mntput_no_expire()
791  	if (likely(!read_seqretry(&mount_lock, seq)))
792  		return 0;
793  	if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
794  		mnt_add_count(mnt, -1);
795  		return 1;
796  	}
797  	lock_mount_hash();
798  	if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
799  		mnt_add_count(mnt, -1);
800  		unlock_mount_hash();
801  		return 1;
802  	}
803  	unlock_mount_hash();
804  	/* caller will mntput() */
805  	return -1;
806  }
807  
808  /* call under rcu_read_lock */
809  static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
810  {
811  	int res = __legitimize_mnt(bastard, seq);
812  	if (likely(!res))
813  		return true;
814  	if (unlikely(res < 0)) {
815  		rcu_read_unlock();
816  		mntput(bastard);
817  		rcu_read_lock();
818  	}
819  	return false;
820  }
821  
822  /**
823   * __lookup_mnt - find first child mount
824   * @mnt:	parent mount
825   * @dentry:	mountpoint
826   *
827   * If @mnt has a child mount @c mounted at @dentry, find and return it.
828   *
829   * Note that the child mount @c need not be unique. There are cases
830   * where shadow mounts are created. For example, during mount
831   * propagation when a source mount @mnt whose root got overmounted by a
832   * mount @o after path lookup but before @namespace_sem could be
833   * acquired gets copied and propagated. So @mnt gets copied including
834   * @o. When @mnt is propagated to a destination mount @d that already
835   * has another mount @n mounted at the same mountpoint then the source
836   * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on
837   * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt
838   * on @dentry.
839   *
840   * Return: The first child of @mnt mounted at @dentry, or NULL.
841   */
842  struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
843  {
844  	struct hlist_head *head = m_hash(mnt, dentry);
845  	struct mount *p;
846  
847  	hlist_for_each_entry_rcu(p, head, mnt_hash)
848  		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
849  			return p;
850  	return NULL;
851  }
852  
853  /*
854   * lookup_mnt - Return the first child mount mounted at path
855   *
856   * "First" means first mounted chronologically.  If you create the
857   * following mounts:
858   *
859   * mount /dev/sda1 /mnt
860   * mount /dev/sda2 /mnt
861   * mount /dev/sda3 /mnt
862   *
863   * Then lookup_mnt() on the base /mnt dentry in the root mount will
864   * return successively the root dentry and vfsmount of /dev/sda1, then
865   * /dev/sda2, then /dev/sda3, then NULL.
866   *
867   * lookup_mnt takes a reference to the found vfsmount.
868   */
869  struct vfsmount *lookup_mnt(const struct path *path)
870  {
871  	struct mount *child_mnt;
872  	struct vfsmount *m;
873  	unsigned seq;
874  
875  	rcu_read_lock();
876  	do {
877  		seq = read_seqbegin(&mount_lock);
878  		child_mnt = __lookup_mnt(path->mnt, path->dentry);
879  		m = child_mnt ? &child_mnt->mnt : NULL;
880  	} while (!legitimize_mnt(m, seq));
881  	rcu_read_unlock();
882  	return m;
883  }
884  
885  /*
886   * __is_local_mountpoint - Test to see if dentry is a mountpoint in the
887   *                         current mount namespace.
888   *
889   * The common case is dentries are not mountpoints at all and that
890   * test is handled inline.  For the slow case when we are actually
891   * dealing with a mountpoint of some kind, walk through all of the
892   * mounts in the current mount namespace and test to see if the dentry
893   * is a mountpoint.
894   *
895   * The mount_hashtable is not usable in this context because we
896   * need to identify all mounts that may be in the current mount
897   * namespace, not just a mount that happens to have some specified
898   * parent mount.
899   */
900  bool __is_local_mountpoint(struct dentry *dentry)
901  {
902  	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
903  	struct mount *mnt, *n;
904  	bool is_covered = false;
905  
906  	down_read(&namespace_sem);
907  	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
908  		is_covered = (mnt->mnt_mountpoint == dentry);
909  		if (is_covered)
910  			break;
911  	}
912  	up_read(&namespace_sem);
913  
914  	return is_covered;
915  }
916  
917  static struct mountpoint *lookup_mountpoint(struct dentry *dentry)
918  {
919  	struct hlist_head *chain = mp_hash(dentry);
920  	struct mountpoint *mp;
921  
922  	hlist_for_each_entry(mp, chain, m_hash) {
923  		if (mp->m_dentry == dentry) {
924  			mp->m_count++;
925  			return mp;
926  		}
927  	}
928  	return NULL;
929  }
930  
931  static struct mountpoint *get_mountpoint(struct dentry *dentry)
932  {
933  	struct mountpoint *mp, *new = NULL;
934  	int ret;
935  
936  	if (d_mountpoint(dentry)) {
937  		/* might be worth a WARN_ON() */
938  		if (d_unlinked(dentry))
939  			return ERR_PTR(-ENOENT);
940  mountpoint:
941  		read_seqlock_excl(&mount_lock);
942  		mp = lookup_mountpoint(dentry);
943  		read_sequnlock_excl(&mount_lock);
944  		if (mp)
945  			goto done;
946  	}
947  
948  	if (!new)
949  		new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
950  	if (!new)
951  		return ERR_PTR(-ENOMEM);
952  
953  
954  	/* Exactly one process may set d_mounted */
955  	ret = d_set_mounted(dentry);
956  
957  	/* Someone else set d_mounted? */
958  	if (ret == -EBUSY)
959  		goto mountpoint;
960  
961  	/* The dentry is not available as a mountpoint? */
962  	mp = ERR_PTR(ret);
963  	if (ret)
964  		goto done;
965  
966  	/* Add the new mountpoint to the hash table */
967  	read_seqlock_excl(&mount_lock);
968  	new->m_dentry = dget(dentry);
969  	new->m_count = 1;
970  	hlist_add_head(&new->m_hash, mp_hash(dentry));
971  	INIT_HLIST_HEAD(&new->m_list);
972  	read_sequnlock_excl(&mount_lock);
973  
974  	mp = new;
975  	new = NULL;
976  done:
977  	kfree(new);
978  	return mp;
979  }
980  
981  /*
982   * vfsmount lock must be held.  Additionally, the caller is responsible
983   * for serializing calls for given disposal list.
984   */
985  static void __put_mountpoint(struct mountpoint *mp, struct list_head *list)
986  {
987  	if (!--mp->m_count) {
988  		struct dentry *dentry = mp->m_dentry;
989  		BUG_ON(!hlist_empty(&mp->m_list));
990  		spin_lock(&dentry->d_lock);
991  		dentry->d_flags &= ~DCACHE_MOUNTED;
992  		spin_unlock(&dentry->d_lock);
993  		dput_to_list(dentry, list);
994  		hlist_del(&mp->m_hash);
995  		kfree(mp);
996  	}
997  }
998  
999  /* called with namespace_lock and vfsmount lock */
1000  static void put_mountpoint(struct mountpoint *mp)
1001  {
1002  	__put_mountpoint(mp, &ex_mountpoints);
1003  }
1004  
1005  static inline int check_mnt(struct mount *mnt)
1006  {
1007  	return mnt->mnt_ns == current->nsproxy->mnt_ns;
1008  }
1009  
1010  static inline bool check_anonymous_mnt(struct mount *mnt)
1011  {
1012  	u64 seq;
1013  
1014  	if (!is_anon_ns(mnt->mnt_ns))
1015  		return false;
1016  
1017  	seq = mnt->mnt_ns->seq_origin;
1018  	return !seq || (seq == current->nsproxy->mnt_ns->seq);
1019  }
1020  
1021  /*
1022   * vfsmount lock must be held for write
1023   */
1024  static void touch_mnt_namespace(struct mnt_namespace *ns)
1025  {
1026  	if (ns) {
1027  		ns->event = ++event;
1028  		wake_up_interruptible(&ns->poll);
1029  	}
1030  }
1031  
1032  /*
1033   * vfsmount lock must be held for write
1034   */
1035  static void __touch_mnt_namespace(struct mnt_namespace *ns)
1036  {
1037  	if (ns && ns->event != event) {
1038  		ns->event = event;
1039  		wake_up_interruptible(&ns->poll);
1040  	}
1041  }
1042  
1043  /*
1044   * vfsmount lock must be held for write
1045   */
1046  static struct mountpoint *unhash_mnt(struct mount *mnt)
1047  {
1048  	struct mountpoint *mp;
1049  	mnt->mnt_parent = mnt;
1050  	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1051  	list_del_init(&mnt->mnt_child);
1052  	hlist_del_init_rcu(&mnt->mnt_hash);
1053  	hlist_del_init(&mnt->mnt_mp_list);
1054  	mp = mnt->mnt_mp;
1055  	mnt->mnt_mp = NULL;
1056  	return mp;
1057  }
1058  
1059  /*
1060   * vfsmount lock must be held for write
1061   */
1062  static void umount_mnt(struct mount *mnt)
1063  {
1064  	put_mountpoint(unhash_mnt(mnt));
1065  }
1066  
1067  /*
1068   * vfsmount lock must be held for write
1069   */
1070  void mnt_set_mountpoint(struct mount *mnt,
1071  			struct mountpoint *mp,
1072  			struct mount *child_mnt)
1073  {
1074  	mp->m_count++;
1075  	mnt_add_count(mnt, 1);	/* essentially, that's mntget */
1076  	child_mnt->mnt_mountpoint = mp->m_dentry;
1077  	child_mnt->mnt_parent = mnt;
1078  	child_mnt->mnt_mp = mp;
1079  	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
1080  }
1081  
1082  /**
1083   * mnt_set_mountpoint_beneath - mount a mount beneath another one
1084   *
1085   * @new_parent: the source mount
1086   * @top_mnt:    the mount beneath which @new_parent is mounted
1087   * @new_mp:     the new mountpoint of @top_mnt on @new_parent
1088   *
1089   * Remove @top_mnt from its current mountpoint @top_mnt->mnt_mp and
1090   * parent @top_mnt->mnt_parent and mount it on top of @new_parent at
1091   * @new_mp. And mount @new_parent on the old parent and old
1092   * mountpoint of @top_mnt.
1093   *
1094   * Context: This function expects namespace_lock() and lock_mount_hash()
1095   *          to have been acquired in that order.
1096   */
1097  static void mnt_set_mountpoint_beneath(struct mount *new_parent,
1098  				       struct mount *top_mnt,
1099  				       struct mountpoint *new_mp)
1100  {
1101  	struct mount *old_top_parent = top_mnt->mnt_parent;
1102  	struct mountpoint *old_top_mp = top_mnt->mnt_mp;
1103  
1104  	mnt_set_mountpoint(old_top_parent, old_top_mp, new_parent);
1105  	mnt_change_mountpoint(new_parent, new_mp, top_mnt);
1106  }
1107  
1108  
1109  static void __attach_mnt(struct mount *mnt, struct mount *parent)
1110  {
1111  	hlist_add_head_rcu(&mnt->mnt_hash,
1112  			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
1113  	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
1114  }
1115  
1116  /**
1117   * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
1118   *              list of child mounts
1119   * @parent:  the parent
1120   * @mnt:     the new mount
1121   * @mp:      the new mountpoint
1122   * @beneath: whether to mount @mnt beneath or on top of @parent
1123   *
1124   * If @beneath is false, mount @mnt at @mp on @parent. Then attach @mnt
1125   * to @parent's child mount list and to @mount_hashtable.
1126   *
1127   * If @beneath is true, remove @mnt from its current parent and
1128   * mountpoint and mount it on @mp on @parent, and mount @parent on the
1129   * old parent and old mountpoint of @mnt. Finally, attach @parent to
1130   * @mnt_hashtable and @parent->mnt_parent->mnt_mounts.
1131   *
1132   * Note, when __attach_mnt() is called @mnt->mnt_parent already points
1133   * to the correct parent.
1134   *
1135   * Context: This function expects namespace_lock() and lock_mount_hash()
1136   *          to have been acquired in that order.
1137   */
1138  static void attach_mnt(struct mount *mnt, struct mount *parent,
1139  		       struct mountpoint *mp, bool beneath)
1140  {
1141  	if (beneath)
1142  		mnt_set_mountpoint_beneath(mnt, parent, mp);
1143  	else
1144  		mnt_set_mountpoint(parent, mp, mnt);
1145  	/*
1146  	 * Note, @mnt->mnt_parent has to be used. If @mnt was mounted
1147  	 * beneath @parent then @mnt will need to be attached to
1148  	 * @parent's old parent, not @parent. IOW, @mnt->mnt_parent
1149  	 * isn't the same mount as @parent.
1150  	 */
1151  	__attach_mnt(mnt, mnt->mnt_parent);
1152  }
1153  
1154  void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
1155  {
1156  	struct mountpoint *old_mp = mnt->mnt_mp;
1157  	struct mount *old_parent = mnt->mnt_parent;
1158  
1159  	list_del_init(&mnt->mnt_child);
1160  	hlist_del_init(&mnt->mnt_mp_list);
1161  	hlist_del_init_rcu(&mnt->mnt_hash);
1162  
1163  	attach_mnt(mnt, parent, mp, false);
1164  
1165  	put_mountpoint(old_mp);
1166  	mnt_add_count(old_parent, -1);
1167  }
1168  
1169  static inline struct mount *node_to_mount(struct rb_node *node)
1170  {
1171  	return node ? rb_entry(node, struct mount, mnt_node) : NULL;
1172  }
1173  
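/*
 * Insert @mnt into the namespace's rbtree of mounts, which is keyed by
 * mnt_id_unique, and keep the cached pointers to the tree's first and last
 * nodes current so walks over the namespace can start at either end without
 * descending the tree.
 */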
1174  static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
1175  {
1176  	struct rb_node **link = &ns->mounts.rb_node;
1177  	struct rb_node *parent = NULL;
1178  	bool mnt_first_node = true, mnt_last_node = true;
1179  
1180  	WARN_ON(mnt_ns_attached(mnt));
1181  	mnt->mnt_ns = ns;
1182  	while (*link) {
1183  		parent = *link;
1184  		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
1185  			link = &parent->rb_left;
1186  			mnt_last_node = false;
1187  		} else {
1188  			link = &parent->rb_right;
1189  			mnt_first_node = false;
1190  		}
1191  	}
1192  
1193  	if (mnt_last_node)
1194  		ns->mnt_last_node = &mnt->mnt_node;
1195  	if (mnt_first_node)
1196  		ns->mnt_first_node = &mnt->mnt_node;
1197  	rb_link_node(&mnt->mnt_node, parent, link);
1198  	rb_insert_color(&mnt->mnt_node, &ns->mounts);
1199  
1200  	mnt_notify_add(mnt);
1201  }
1202  
1203  /*
1204   * vfsmount lock must be held for write
1205   */
1206  static void commit_tree(struct mount *mnt)
1207  {
1208  	struct mount *parent = mnt->mnt_parent;
1209  	struct mount *m;
1210  	LIST_HEAD(head);
1211  	struct mnt_namespace *n = parent->mnt_ns;
1212  
1213  	BUG_ON(parent == mnt);
1214  
1215  	list_add_tail(&head, &mnt->mnt_list);
1216  	while (!list_empty(&head)) {
1217  		m = list_first_entry(&head, typeof(*m), mnt_list);
1218  		list_del(&m->mnt_list);
1219  
1220  		mnt_add_to_ns(n, m);
1221  	}
1222  	n->nr_mounts += n->pending_mounts;
1223  	n->pending_mounts = 0;
1224  
1225  	__attach_mnt(mnt, parent);
1226  	touch_mnt_namespace(n);
1227  }
1228  
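/*
 * Walk the mount tree rooted at @root in depth-first order: descend into the
 * first child if there is one, otherwise advance to the next sibling,
 * climbing back towards @root until a sibling is found or the walk is done.
 */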
1229  static struct mount *next_mnt(struct mount *p, struct mount *root)
1230  {
1231  	struct list_head *next = p->mnt_mounts.next;
1232  	if (next == &p->mnt_mounts) {
1233  		while (1) {
1234  			if (p == root)
1235  				return NULL;
1236  			next = p->mnt_child.next;
1237  			if (next != &p->mnt_parent->mnt_mounts)
1238  				break;
1239  			p = p->mnt_parent;
1240  		}
1241  	}
1242  	return list_entry(next, struct mount, mnt_child);
1243  }
1244  
1245  static struct mount *skip_mnt_tree(struct mount *p)
1246  {
1247  	struct list_head *prev = p->mnt_mounts.prev;
1248  	while (prev != &p->mnt_mounts) {
1249  		p = list_entry(prev, struct mount, mnt_child);
1250  		prev = p->mnt_mounts.prev;
1251  	}
1252  	return p;
1253  }
1254  
1255  /**
1256   * vfs_create_mount - Create a mount for a configured superblock
1257   * @fc: The configuration context with the superblock attached
1258   *
1259   * Create a mount to an already configured superblock.  If necessary, the
1260   * caller should invoke vfs_get_tree() before calling this.
1261   *
1262   * Note that this does not attach the mount to anything.
1263   */
1264  struct vfsmount *vfs_create_mount(struct fs_context *fc)
1265  {
1266  	struct mount *mnt;
1267  
1268  	if (!fc->root)
1269  		return ERR_PTR(-EINVAL);
1270  
1271  	mnt = alloc_vfsmnt(fc->source ?: "none");
1272  	if (!mnt)
1273  		return ERR_PTR(-ENOMEM);
1274  
1275  	if (fc->sb_flags & SB_KERNMOUNT)
1276  		mnt->mnt.mnt_flags = MNT_INTERNAL;
1277  
1278  	atomic_inc(&fc->root->d_sb->s_active);
1279  	mnt->mnt.mnt_sb		= fc->root->d_sb;
1280  	mnt->mnt.mnt_root	= dget(fc->root);
1281  	mnt->mnt_mountpoint	= mnt->mnt.mnt_root;
1282  	mnt->mnt_parent		= mnt;
1283  
1284  	lock_mount_hash();
1285  	list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts);
1286  	unlock_mount_hash();
1287  	return &mnt->mnt;
1288  }
1289  EXPORT_SYMBOL(vfs_create_mount);
1290  
1291  struct vfsmount *fc_mount(struct fs_context *fc)
1292  {
1293  	int err = vfs_get_tree(fc);
1294  	if (!err) {
1295  		up_write(&fc->root->d_sb->s_umount);
1296  		return vfs_create_mount(fc);
1297  	}
1298  	return ERR_PTR(err);
1299  }
1300  EXPORT_SYMBOL(fc_mount);
1301  
1302  struct vfsmount *vfs_kern_mount(struct file_system_type *type,
1303  				int flags, const char *name,
1304  				void *data)
1305  {
1306  	struct fs_context *fc;
1307  	struct vfsmount *mnt;
1308  	int ret = 0;
1309  
1310  	if (!type)
1311  		return ERR_PTR(-EINVAL);
1312  
1313  	fc = fs_context_for_mount(type, flags);
1314  	if (IS_ERR(fc))
1315  		return ERR_CAST(fc);
1316  
1317  	if (name)
1318  		ret = vfs_parse_fs_string(fc, "source",
1319  					  name, strlen(name));
1320  	if (!ret)
1321  		ret = parse_monolithic_mount_data(fc, data);
1322  	if (!ret)
1323  		mnt = fc_mount(fc);
1324  	else
1325  		mnt = ERR_PTR(ret);
1326  
1327  	put_fs_context(fc);
1328  	return mnt;
1329  }
1330  EXPORT_SYMBOL_GPL(vfs_kern_mount);
1331  
1332  struct vfsmount *
1333  vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
1334  	     const char *name, void *data)
1335  {
1336  	/* Until it is worked out how to pass the user namespace
1337  	 * through from the parent mount to the submount don't support
1338  	 * through from the parent mount to the submount, don't support
1339  	 */
1340  	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
1341  		return ERR_PTR(-EPERM);
1342  
1343  	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
1344  }
1345  EXPORT_SYMBOL_GPL(vfs_submount);
1346  
1347  static struct mount *clone_mnt(struct mount *old, struct dentry *root,
1348  					int flag)
1349  {
1350  	struct super_block *sb = old->mnt.mnt_sb;
1351  	struct mount *mnt;
1352  	int err;
1353  
1354  	mnt = alloc_vfsmnt(old->mnt_devname);
1355  	if (!mnt)
1356  		return ERR_PTR(-ENOMEM);
1357  
1358  	if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
1359  		mnt->mnt_group_id = 0; /* not a peer of original */
1360  	else
1361  		mnt->mnt_group_id = old->mnt_group_id;
1362  
1363  	if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
1364  		err = mnt_alloc_group_id(mnt);
1365  		if (err)
1366  			goto out_free;
1367  	}
1368  
1369  	mnt->mnt.mnt_flags = old->mnt.mnt_flags;
1370  	mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
1371  
1372  	atomic_inc(&sb->s_active);
1373  	mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
1374  
1375  	mnt->mnt.mnt_sb = sb;
1376  	mnt->mnt.mnt_root = dget(root);
1377  	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
1378  	mnt->mnt_parent = mnt;
1379  	lock_mount_hash();
1380  	list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
1381  	unlock_mount_hash();
1382  
1383  	if ((flag & CL_SLAVE) ||
1384  	    ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) {
1385  		list_add(&mnt->mnt_slave, &old->mnt_slave_list);
1386  		mnt->mnt_master = old;
1387  		CLEAR_MNT_SHARED(mnt);
1388  	} else if (!(flag & CL_PRIVATE)) {
1389  		if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old))
1390  			list_add(&mnt->mnt_share, &old->mnt_share);
1391  		if (IS_MNT_SLAVE(old))
1392  			list_add(&mnt->mnt_slave, &old->mnt_slave);
1393  		mnt->mnt_master = old->mnt_master;
1394  	} else {
1395  		CLEAR_MNT_SHARED(mnt);
1396  	}
1397  	if (flag & CL_MAKE_SHARED)
1398  		set_mnt_shared(mnt);
1399  
1400  	/* stick the duplicate mount on the same expiry list
1401  	 * as the original if that was on one */
1402  	if (flag & CL_EXPIRE) {
1403  		if (!list_empty(&old->mnt_expire))
1404  			list_add(&mnt->mnt_expire, &old->mnt_expire);
1405  	}
1406  
1407  	return mnt;
1408  
1409   out_free:
1410  	mnt_free_id(mnt);
1411  	free_vfsmnt(mnt);
1412  	return ERR_PTR(err);
1413  }
1414  
1415  static void cleanup_mnt(struct mount *mnt)
1416  {
1417  	struct hlist_node *p;
1418  	struct mount *m;
1419  	/*
1420  	 * The warning here probably indicates that somebody messed
1421  	 * up a mnt_want/drop_write() pair.  If this happens, the
1422  	 * filesystem was probably unable to make r/w->r/o transitions.
1423  	 * The locking used to deal with mnt_count decrement provides barriers,
1424  	 * so mnt_get_writers() below is safe.
1425  	 */
1426  	WARN_ON(mnt_get_writers(mnt));
1427  	if (unlikely(mnt->mnt_pins.first))
1428  		mnt_pin_kill(mnt);
1429  	hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) {
1430  		hlist_del(&m->mnt_umount);
1431  		mntput(&m->mnt);
1432  	}
1433  	fsnotify_vfsmount_delete(&mnt->mnt);
1434  	dput(mnt->mnt.mnt_root);
1435  	deactivate_super(mnt->mnt.mnt_sb);
1436  	mnt_free_id(mnt);
1437  	call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt);
1438  }
1439  
1440  static void __cleanup_mnt(struct rcu_head *head)
1441  {
1442  	cleanup_mnt(container_of(head, struct mount, mnt_rcu));
1443  }
1444  
1445  static LLIST_HEAD(delayed_mntput_list);
1446  static void delayed_mntput(struct work_struct *unused)
1447  {
1448  	struct llist_node *node = llist_del_all(&delayed_mntput_list);
1449  	struct mount *m, *t;
1450  
1451  	llist_for_each_entry_safe(m, t, node, mnt_llist)
1452  		cleanup_mnt(m);
1453  }
1454  static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput);
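/*
 * Dropping the final reference ends up in cleanup_mnt(), which can sleep
 * (it dputs the root dentry and deactivates the superblock). For mounts that
 * are not MNT_INTERNAL the cleanup is therefore deferred: normal tasks queue
 * it as task_work to run on return to userspace, while kernel threads punt
 * it to the delayed work declared above.
 */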
1455  
1456  static void mntput_no_expire(struct mount *mnt)
1457  {
1458  	LIST_HEAD(list);
1459  	int count;
1460  
1461  	rcu_read_lock();
1462  	if (likely(READ_ONCE(mnt->mnt_ns))) {
1463  		/*
1464  		 * Since we don't do lock_mount_hash() here,
1465  		 * ->mnt_ns can change under us.  However, if it's
1466  		 * non-NULL, then there's a reference that won't
1467  		 * be dropped until after an RCU delay done after
1468  		 * turning ->mnt_ns NULL.  So if we observe it
1469  		 * non-NULL under rcu_read_lock(), the reference
1470  		 * we are dropping is not the final one.
1471  		 */
1472  		mnt_add_count(mnt, -1);
1473  		rcu_read_unlock();
1474  		return;
1475  	}
1476  	lock_mount_hash();
1477  	/*
1478  	 * make sure that if __legitimize_mnt() has not seen us grab
1479  	 * mount_lock, we'll see their refcount increment here.
1480  	 */
1481  	smp_mb();
1482  	mnt_add_count(mnt, -1);
1483  	count = mnt_get_count(mnt);
1484  	if (count != 0) {
1485  		WARN_ON(count < 0);
1486  		rcu_read_unlock();
1487  		unlock_mount_hash();
1488  		return;
1489  	}
1490  	if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) {
1491  		rcu_read_unlock();
1492  		unlock_mount_hash();
1493  		return;
1494  	}
1495  	mnt->mnt.mnt_flags |= MNT_DOOMED;
1496  	rcu_read_unlock();
1497  
1498  	list_del(&mnt->mnt_instance);
1499  
1500  	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
1501  		struct mount *p, *tmp;
1502  		list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts,  mnt_child) {
1503  			__put_mountpoint(unhash_mnt(p), &list);
1504  			hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children);
1505  		}
1506  	}
1507  	unlock_mount_hash();
1508  	shrink_dentry_list(&list);
1509  
1510  	if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) {
1511  		struct task_struct *task = current;
1512  		if (likely(!(task->flags & PF_KTHREAD))) {
1513  			init_task_work(&mnt->mnt_rcu, __cleanup_mnt);
1514  			if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME))
1515  				return;
1516  		}
1517  		if (llist_add(&mnt->mnt_llist, &delayed_mntput_list))
1518  			schedule_delayed_work(&delayed_mntput_work, 1);
1519  		return;
1520  	}
1521  	cleanup_mnt(mnt);
1522  }
1523  
1524  void mntput(struct vfsmount *mnt)
1525  {
1526  	if (mnt) {
1527  		struct mount *m = real_mount(mnt);
1528  		/* avoid cacheline pingpong */
1529  		if (unlikely(m->mnt_expiry_mark))
1530  			WRITE_ONCE(m->mnt_expiry_mark, 0);
1531  		mntput_no_expire(m);
1532  	}
1533  }
1534  EXPORT_SYMBOL(mntput);
1535  
1536  struct vfsmount *mntget(struct vfsmount *mnt)
1537  {
1538  	if (mnt)
1539  		mnt_add_count(real_mount(mnt), 1);
1540  	return mnt;
1541  }
1542  EXPORT_SYMBOL(mntget);
1543  
1544  /*
1545   * Make a mount point inaccessible to new lookups.
1546   * Because there may still be current users, the caller MUST WAIT
1547   * for an RCU grace period before destroying the mount point.
1548   */
1549  void mnt_make_shortterm(struct vfsmount *mnt)
1550  {
1551  	if (mnt)
1552  		real_mount(mnt)->mnt_ns = NULL;
1553  }
1554  
1555  /**
1556   * path_is_mountpoint() - Check if path is a mount in the current namespace.
1557   * @path: path to check
1558   *
1559   *  d_mountpoint() can only be used reliably to establish if a dentry is
1560   *  not mounted in any namespace and that common case is handled inline.
1561   *  d_mountpoint() isn't aware of the possibility there may be multiple
1562   *  mounts using a given dentry in a different namespace. This function
1563   *  checks if the passed in path is a mountpoint rather than the dentry
1564   *  alone.
1565   */
1566  bool path_is_mountpoint(const struct path *path)
1567  {
1568  	unsigned seq;
1569  	bool res;
1570  
1571  	if (!d_mountpoint(path->dentry))
1572  		return false;
1573  
1574  	rcu_read_lock();
1575  	do {
1576  		seq = read_seqbegin(&mount_lock);
1577  		res = __path_is_mountpoint(path);
1578  	} while (read_seqretry(&mount_lock, seq));
1579  	rcu_read_unlock();
1580  
1581  	return res;
1582  }
1583  EXPORT_SYMBOL(path_is_mountpoint);
1584  
1585  struct vfsmount *mnt_clone_internal(const struct path *path)
1586  {
1587  	struct mount *p;
1588  	p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE);
1589  	if (IS_ERR(p))
1590  		return ERR_CAST(p);
1591  	p->mnt.mnt_flags |= MNT_INTERNAL;
1592  	return &p->mnt;
1593  }
1594  
1595  /*
1596   * Returns the mount which either has the specified mnt_id, or has the next
1597   * smallest id after the specified one.
1598   */
1599  static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
1600  {
1601  	struct rb_node *node = ns->mounts.rb_node;
1602  	struct mount *ret = NULL;
1603  
1604  	while (node) {
1605  		struct mount *m = node_to_mount(node);
1606  
1607  		if (mnt_id <= m->mnt_id_unique) {
1608  			ret = node_to_mount(node);
1609  			if (mnt_id == m->mnt_id_unique)
1610  				break;
1611  			node = node->rb_left;
1612  		} else {
1613  			node = node->rb_right;
1614  		}
1615  	}
1616  	return ret;
1617  }
1618  
1619  /*
1620   * Returns the mount which either has the specified mnt_id, or has the next
1621   * smaller id before the specified one.
1622   */
1623  static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id)
1624  {
1625  	struct rb_node *node = ns->mounts.rb_node;
1626  	struct mount *ret = NULL;
1627  
1628  	while (node) {
1629  		struct mount *m = node_to_mount(node);
1630  
1631  		if (mnt_id >= m->mnt_id_unique) {
1632  			ret = node_to_mount(node);
1633  			if (mnt_id == m->mnt_id_unique)
1634  				break;
1635  			node = node->rb_right;
1636  		} else {
1637  			node = node->rb_left;
1638  		}
1639  	}
1640  	return ret;
1641  }
1642  
1643  #ifdef CONFIG_PROC_FS
1644  
1645  /* iterator; we want it to have access to namespace_sem, thus here... */
1646  static void *m_start(struct seq_file *m, loff_t *pos)
1647  {
1648  	struct proc_mounts *p = m->private;
1649  
1650  	down_read(&namespace_sem);
1651  
1652  	return mnt_find_id_at(p->ns, *pos);
1653  }
1654  
1655  static void *m_next(struct seq_file *m, void *v, loff_t *pos)
1656  {
1657  	struct mount *next = NULL, *mnt = v;
1658  	struct rb_node *node = rb_next(&mnt->mnt_node);
1659  
1660  	++*pos;
1661  	if (node) {
1662  		next = node_to_mount(node);
1663  		*pos = next->mnt_id_unique;
1664  	}
1665  	return next;
1666  }
1667  
1668  static void m_stop(struct seq_file *m, void *v)
1669  {
1670  	up_read(&namespace_sem);
1671  }
1672  
1673  static int m_show(struct seq_file *m, void *v)
1674  {
1675  	struct proc_mounts *p = m->private;
1676  	struct mount *r = v;
1677  	return p->show(m, &r->mnt);
1678  }
1679  
1680  const struct seq_operations mounts_op = {
1681  	.start	= m_start,
1682  	.next	= m_next,
1683  	.stop	= m_stop,
1684  	.show	= m_show,
1685  };
1686  
1687  #endif  /* CONFIG_PROC_FS */
1688  
1689  /**
1690   * may_umount_tree - check if a mount tree is busy
1691   * @m: root of mount tree
1692   *
1693   * This is called to check if a tree of mounts has any
1694   * open files, pwds, chroots or sub mounts that are
1695   * busy.
1696   */
may_umount_tree(struct vfsmount * m)1697  int may_umount_tree(struct vfsmount *m)
1698  {
1699  	struct mount *mnt = real_mount(m);
1700  	int actual_refs = 0;
1701  	int minimum_refs = 0;
1702  	struct mount *p;
1703  	BUG_ON(!m);
1704  
1705  	/* write lock needed for mnt_get_count */
1706  	lock_mount_hash();
1707  	for (p = mnt; p; p = next_mnt(p, mnt)) {
1708  		actual_refs += mnt_get_count(p);
1709  		minimum_refs += 2;
1710  	}
1711  	unlock_mount_hash();
1712  
1713  	if (actual_refs > minimum_refs)
1714  		return 0;
1715  
1716  	return 1;
1717  }
1718  
1719  EXPORT_SYMBOL(may_umount_tree);
1720  
1721  /**
1722   * may_umount - check if a mount point is busy
1723   * @mnt: root of mount
1724   *
1725   * This is called to check if a mount point has any
1726   * open files, pwds, chroots or sub mounts. If the
1727   * mount has sub mounts this will return busy
1728   * regardless of whether the sub mounts are busy.
1729   *
1730   * Doesn't take quota and stuff into account. IOW, in some cases it will
1731   * give false negatives. The main reason why it's here is that we need
1732   * a non-destructive way to look for easily umountable filesystems.
1733   */
may_umount(struct vfsmount * mnt)1734  int may_umount(struct vfsmount *mnt)
1735  {
1736  	int ret = 1;
1737  	down_read(&namespace_sem);
1738  	lock_mount_hash();
1739  	if (propagate_mount_busy(real_mount(mnt), 2))
1740  		ret = 0;
1741  	unlock_mount_hash();
1742  	up_read(&namespace_sem);
1743  	return ret;
1744  }
1745  
1746  EXPORT_SYMBOL(may_umount);
1747  
1748  #ifdef CONFIG_FSNOTIFY
mnt_notify(struct mount * p)1749  static void mnt_notify(struct mount *p)
1750  {
1751  	if (!p->prev_ns && p->mnt_ns) {
1752  		fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
1753  	} else if (p->prev_ns && !p->mnt_ns) {
1754  		fsnotify_mnt_detach(p->prev_ns, &p->mnt);
1755  	} else if (p->prev_ns == p->mnt_ns) {
1756  		fsnotify_mnt_move(p->mnt_ns, &p->mnt);
1757  	} else {
1758  		fsnotify_mnt_detach(p->prev_ns, &p->mnt);
1759  		fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
1760  	}
1761  	p->prev_ns = p->mnt_ns;
1762  }
1763  
notify_mnt_list(void)1764  static void notify_mnt_list(void)
1765  {
1766  	struct mount *m, *tmp;
1767  	/*
1768  	 * Notify about mounts that were added/reparented/detached/remain
1769  	 * connected after unmount.
1770  	 */
1771  	list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
1772  		mnt_notify(m);
1773  		list_del_init(&m->to_notify);
1774  	}
1775  }
1776  
need_notify_mnt_list(void)1777  static bool need_notify_mnt_list(void)
1778  {
1779  	return !list_empty(&notify_list);
1780  }
1781  #else
notify_mnt_list(void)1782  static void notify_mnt_list(void)
1783  {
1784  }
1785  
need_notify_mnt_list(void)1786  static bool need_notify_mnt_list(void)
1787  {
1788  	return false;
1789  }
1790  #endif
1791  
namespace_unlock(void)1792  static void namespace_unlock(void)
1793  {
1794  	struct hlist_head head;
1795  	struct hlist_node *p;
1796  	struct mount *m;
1797  	LIST_HEAD(list);
1798  
1799  	hlist_move_list(&unmounted, &head);
1800  	list_splice_init(&ex_mountpoints, &list);
1801  
1802  	if (need_notify_mnt_list()) {
1803  		/*
1804  		 * No point blocking out concurrent readers while notifications
1805  		 * are sent. This will also allow statmount()/listmount() to run
1806  		 * concurrently.
1807  		 */
1808  		downgrade_write(&namespace_sem);
1809  		notify_mnt_list();
1810  		up_read(&namespace_sem);
1811  	} else {
1812  		up_write(&namespace_sem);
1813  	}
1814  
1815  	shrink_dentry_list(&list);
1816  
1817  	if (likely(hlist_empty(&head)))
1818  		return;
1819  
1820  	synchronize_rcu_expedited();
1821  
1822  	hlist_for_each_entry_safe(m, p, &head, mnt_umount) {
1823  		hlist_del(&m->mnt_umount);
1824  		mntput(&m->mnt);
1825  	}
1826  }
1827  
namespace_lock(void)1828  static inline void namespace_lock(void)
1829  {
1830  	down_write(&namespace_sem);
1831  }
1832  
1833  enum umount_tree_flags {
1834  	UMOUNT_SYNC = 1,
1835  	UMOUNT_PROPAGATE = 2,
1836  	UMOUNT_CONNECTED = 4,
1837  };
1838  
disconnect_mount(struct mount * mnt,enum umount_tree_flags how)1839  static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how)
1840  {
1841  	/* Leaving mounts connected is only valid for lazy umounts */
1842  	if (how & UMOUNT_SYNC)
1843  		return true;
1844  
1845  	/* A mount without a parent has nothing to be connected to */
1846  	if (!mnt_has_parent(mnt))
1847  		return true;
1848  
1849  	/* Because the reference counting rules change when mounts are
1850  	 * unmounted and connected, unmounted mounts may not be
1851  	 * connected to mounted mounts.
1852  	 */
1853  	if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT))
1854  		return true;
1855  
1856  	/* Has it been requested that the mount remain connected? */
1857  	if (how & UMOUNT_CONNECTED)
1858  		return false;
1859  
1860  	/* Is the mount locked such that it needs to remain connected? */
1861  	if (IS_MNT_LOCKED(mnt))
1862  		return false;
1863  
1864  	/* By default disconnect the mount */
1865  	return true;
1866  }
1867  
1868  /*
1869   * mount_lock must be held
1870   * namespace_sem must be held for write
1871   */
umount_tree(struct mount * mnt,enum umount_tree_flags how)1872  static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
1873  {
1874  	LIST_HEAD(tmp_list);
1875  	struct mount *p;
1876  
1877  	if (how & UMOUNT_PROPAGATE)
1878  		propagate_mount_unlock(mnt);
1879  
1880  	/* Gather the mounts to umount */
1881  	for (p = mnt; p; p = next_mnt(p, mnt)) {
1882  		p->mnt.mnt_flags |= MNT_UMOUNT;
1883  		if (mnt_ns_attached(p))
1884  			move_from_ns(p, &tmp_list);
1885  		else
1886  			list_move(&p->mnt_list, &tmp_list);
1887  	}
1888  
1889  	/* Hide the mounts from mnt_mounts */
1890  	list_for_each_entry(p, &tmp_list, mnt_list) {
1891  		list_del_init(&p->mnt_child);
1892  	}
1893  
1894  	/* Add propagated mounts to the tmp_list */
1895  	if (how & UMOUNT_PROPAGATE)
1896  		propagate_umount(&tmp_list);
1897  
1898  	while (!list_empty(&tmp_list)) {
1899  		struct mnt_namespace *ns;
1900  		bool disconnect;
1901  		p = list_first_entry(&tmp_list, struct mount, mnt_list);
1902  		list_del_init(&p->mnt_expire);
1903  		list_del_init(&p->mnt_list);
1904  		ns = p->mnt_ns;
1905  		if (ns) {
1906  			ns->nr_mounts--;
1907  			__touch_mnt_namespace(ns);
1908  		}
1909  		p->mnt_ns = NULL;
1910  		if (how & UMOUNT_SYNC)
1911  			p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1912  
1913  		disconnect = disconnect_mount(p, how);
1914  		if (mnt_has_parent(p)) {
1915  			mnt_add_count(p->mnt_parent, -1);
1916  			if (!disconnect) {
1917  				/* Don't forget about p */
1918  				list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts);
1919  			} else {
1920  				umount_mnt(p);
1921  			}
1922  		}
1923  		change_mnt_propagation(p, MS_PRIVATE);
1924  		if (disconnect)
1925  			hlist_add_head(&p->mnt_umount, &unmounted);
1926  
1927  		/*
1928  		 * At this point p->mnt_ns is NULL, notification will be queued
1929  		 * only if
1930  		 *
1931  		 *  - p->prev_ns is non-NULL *and*
1932  		 *  - p->prev_ns->n_fsnotify_marks is non-NULL
1933  		 *
1934  		 * This will preclude queuing the mount if this is a cleanup
1935  		 * after a failed copy_tree() or destruction of an anonymous
1936  		 * namespace, etc.
1937  		 */
1938  		mnt_notify_add(p);
1939  	}
1940  }
1941  
1942  static void shrink_submounts(struct mount *mnt);
1943  
do_umount_root(struct super_block * sb)1944  static int do_umount_root(struct super_block *sb)
1945  {
1946  	int ret = 0;
1947  
1948  	down_write(&sb->s_umount);
1949  	if (!sb_rdonly(sb)) {
1950  		struct fs_context *fc;
1951  
1952  		fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY,
1953  						SB_RDONLY);
1954  		if (IS_ERR(fc)) {
1955  			ret = PTR_ERR(fc);
1956  		} else {
1957  			ret = parse_monolithic_mount_data(fc, NULL);
1958  			if (!ret)
1959  				ret = reconfigure_super(fc);
1960  			put_fs_context(fc);
1961  		}
1962  	}
1963  	up_write(&sb->s_umount);
1964  	return ret;
1965  }
1966  
do_umount(struct mount * mnt,int flags)1967  static int do_umount(struct mount *mnt, int flags)
1968  {
1969  	struct super_block *sb = mnt->mnt.mnt_sb;
1970  	int retval;
1971  
1972  	retval = security_sb_umount(&mnt->mnt, flags);
1973  	if (retval)
1974  		return retval;
1975  
1976  	/*
1977  	 * Allow userspace to request a mountpoint be expired rather than
1978  	 * unmounting unconditionally. Unmount only happens if:
1979  	 *  (1) the mark is already set (the mark is cleared by mntput())
1980  	 *  (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount]
1981  	 */
1982  	if (flags & MNT_EXPIRE) {
1983  		if (&mnt->mnt == current->fs->root.mnt ||
1984  		    flags & (MNT_FORCE | MNT_DETACH))
1985  			return -EINVAL;
1986  
1987  		/*
1988  		 * probably don't strictly need the lock here if we examined
1989  		 * all race cases, but it's a slowpath.
1990  		 */
1991  		lock_mount_hash();
1992  		if (mnt_get_count(mnt) != 2) {
1993  			unlock_mount_hash();
1994  			return -EBUSY;
1995  		}
1996  		unlock_mount_hash();
1997  
1998  		if (!xchg(&mnt->mnt_expiry_mark, 1))
1999  			return -EAGAIN;
2000  	}
2001  
2002  	/*
2003  	 * If we may have to abort operations to get out of this
2004  	 * mount, and they will themselves hold resources we must
2005  	 * allow the fs to do things. In the Unix tradition of
2006  	 * 'Gee, that's tricky, let's do it in userspace' the umount_begin
2007  	 * might fail to complete on the first run through as other tasks
2008  	 * must return, and the like. That's for the mount program to worry
2009  	 * about for the moment.
2010  	 */
2011  
2012  	if (flags & MNT_FORCE && sb->s_op->umount_begin) {
2013  		sb->s_op->umount_begin(sb);
2014  	}
2015  
2016  	/*
2017  	 * No sense to grab the lock for this test, but test itself looks
2018  	 * somewhat bogus. Suggestions for better replacement?
2019  	 * Ho-hum... In principle, we might treat that as umount + switch
2020  	 * to rootfs. GC would eventually take care of the old vfsmount.
2021  	 * Actually it makes sense, especially if rootfs would contain a
2022  	 * /reboot - static binary that would close all descriptors and
2023  	 * call reboot(9). Then init(8) could umount root and exec /reboot.
2024  	 */
2025  	if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) {
2026  		/*
2027  		 * Special case for "unmounting" root ...
2028  		 * we just try to remount it readonly.
2029  		 */
2030  		if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
2031  			return -EPERM;
2032  		return do_umount_root(sb);
2033  	}
2034  
2035  	namespace_lock();
2036  	lock_mount_hash();
2037  
2038  	/* Recheck MNT_LOCKED with the locks held */
2039  	retval = -EINVAL;
2040  	if (mnt->mnt.mnt_flags & MNT_LOCKED)
2041  		goto out;
2042  
2043  	event++;
2044  	if (flags & MNT_DETACH) {
2045  		if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
2046  			umount_tree(mnt, UMOUNT_PROPAGATE);
2047  		retval = 0;
2048  	} else {
2049  		shrink_submounts(mnt);
2050  		retval = -EBUSY;
2051  		if (!propagate_mount_busy(mnt, 2)) {
2052  			if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
2053  				umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
2054  			retval = 0;
2055  		}
2056  	}
2057  out:
2058  	unlock_mount_hash();
2059  	namespace_unlock();
2060  	return retval;
2061  }
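/*
 * Editorial sketch of the MNT_EXPIRE protocol above, seen from userspace
 * (assumes glibc's umount2() and the MNT_* flags from <sys/mount.h>; the
 * target path is made up):
 *
 *	#include <sys/mount.h>
 *	#include <errno.h>
 *
 *	// The first call on an idle mount only marks it and fails with
 *	// EAGAIN; a later call succeeds only if the mount stayed unused
 *	// in between (any use clears the mark), otherwise EBUSY/EAGAIN.
 *	int expire_once(const char *target)
 *	{
 *		if (umount2(target, MNT_EXPIRE) == 0)
 *			return 0;
 *		return -errno;	// -EAGAIN: marked, retry later; -EBUSY: in use
 *	}
 *
 * MNT_EXPIRE cannot be combined with MNT_FORCE or MNT_DETACH; a plain
 * lazy detach is a single umount2(target, MNT_DETACH) call.
 */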
2062  
2063  /*
2064   * __detach_mounts - lazily unmount all mounts on the specified dentry
2065   *
2066   * During unlink, rmdir, and d_drop it is possible to lose the path
2067   * to an existing mountpoint, and wind up leaking the mount.
2068   * detach_mounts allows lazily unmounting those mounts instead of
2069   * leaking them.
2070   *
2071   * The caller may hold dentry->d_inode->i_mutex.
2072   */
__detach_mounts(struct dentry * dentry)2073  void __detach_mounts(struct dentry *dentry)
2074  {
2075  	struct mountpoint *mp;
2076  	struct mount *mnt;
2077  
2078  	namespace_lock();
2079  	lock_mount_hash();
2080  	mp = lookup_mountpoint(dentry);
2081  	if (!mp)
2082  		goto out_unlock;
2083  
2084  	event++;
2085  	while (!hlist_empty(&mp->m_list)) {
2086  		mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
2087  		if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
2088  			umount_mnt(mnt);
2089  			hlist_add_head(&mnt->mnt_umount, &unmounted);
2090  		}
2091  		else umount_tree(mnt, UMOUNT_CONNECTED);
2092  	}
2093  	put_mountpoint(mp);
2094  out_unlock:
2095  	unlock_mount_hash();
2096  	namespace_unlock();
2097  }
2098  
2099  /*
2100   * Is the caller allowed to modify his namespace?
2101   */
may_mount(void)2102  bool may_mount(void)
2103  {
2104  	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
2105  }
2106  
warn_mandlock(void)2107  static void warn_mandlock(void)
2108  {
2109  	pr_warn_once("=======================================================\n"
2110  		     "WARNING: The mand mount option has been deprecated and\n"
2111  		     "         and is ignored by this kernel. Remove the mand\n"
2112  		     "         is ignored by this kernel. Remove the mand\n"
2113  		     "=======================================================\n");
2114  }
2115  
can_umount(const struct path * path,int flags)2116  static int can_umount(const struct path *path, int flags)
2117  {
2118  	struct mount *mnt = real_mount(path->mnt);
2119  	struct super_block *sb = path->dentry->d_sb;
2120  
2121  	if (!may_mount())
2122  		return -EPERM;
2123  	if (!path_mounted(path))
2124  		return -EINVAL;
2125  	if (!check_mnt(mnt))
2126  		return -EINVAL;
2127  	if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
2128  		return -EINVAL;
2129  	if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
2130  		return -EPERM;
2131  	return 0;
2132  }
2133  
2134  // caller is responsible for flags being sane
path_umount(struct path * path,int flags)2135  int path_umount(struct path *path, int flags)
2136  {
2137  	struct mount *mnt = real_mount(path->mnt);
2138  	int ret;
2139  
2140  	ret = can_umount(path, flags);
2141  	if (!ret)
2142  		ret = do_umount(mnt, flags);
2143  
2144  	/* we mustn't call path_put() as that would clear mnt_expiry_mark */
2145  	dput(path->dentry);
2146  	mntput_no_expire(mnt);
2147  	return ret;
2148  }
2149  
ksys_umount(char __user * name,int flags)2150  static int ksys_umount(char __user *name, int flags)
2151  {
2152  	int lookup_flags = LOOKUP_MOUNTPOINT;
2153  	struct path path;
2154  	int ret;
2155  
2156  	// basic validity checks done first
2157  	if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW))
2158  		return -EINVAL;
2159  
2160  	if (!(flags & UMOUNT_NOFOLLOW))
2161  		lookup_flags |= LOOKUP_FOLLOW;
2162  	ret = user_path_at(AT_FDCWD, name, lookup_flags, &path);
2163  	if (ret)
2164  		return ret;
2165  	return path_umount(&path, flags);
2166  }
2167  
SYSCALL_DEFINE2(umount,char __user *,name,int,flags)2168  SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
2169  {
2170  	return ksys_umount(name, flags);
2171  }
2172  
2173  #ifdef __ARCH_WANT_SYS_OLDUMOUNT
2174  
2175  /*
2176   *	The 2.0 compatible umount. No flags.
2177   */
SYSCALL_DEFINE1(oldumount,char __user *,name)2178  SYSCALL_DEFINE1(oldumount, char __user *, name)
2179  {
2180  	return ksys_umount(name, 0);
2181  }
2182  
2183  #endif
2184  
is_mnt_ns_file(struct dentry * dentry)2185  static bool is_mnt_ns_file(struct dentry *dentry)
2186  {
2187  	struct ns_common *ns;
2188  
2189  	/* Is this a proxy for a mount namespace? */
2190  	if (dentry->d_op != &ns_dentry_operations)
2191  		return false;
2192  
2193  	ns = d_inode(dentry)->i_private;
2194  
2195  	return ns->ops == &mntns_operations;
2196  }
2197  
from_mnt_ns(struct mnt_namespace * mnt)2198  struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
2199  {
2200  	return &mnt->ns;
2201  }
2202  
get_sequential_mnt_ns(struct mnt_namespace * mntns,bool previous)2203  struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
2204  {
2205  	guard(rcu)();
2206  
2207  	for (;;) {
2208  		struct list_head *list;
2209  
2210  		if (previous)
2211  			list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
2212  		else
2213  			list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
2214  		if (list_is_head(list, &mnt_ns_list))
2215  			return ERR_PTR(-ENOENT);
2216  
2217  		mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
2218  
2219  		/*
2220  		 * The last passive reference count is put with RCU
2221  		 * delay so accessing the mount namespace is not just
2222  		 * safe but all relevant members are still valid.
2223  		 */
2224  		if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
2225  			continue;
2226  
2227  		/*
2228  		 * We need an active reference count as we're persisting
2229  		 * the mount namespace and it might already be on its
2230  		 * deathbed.
2231  		 */
2232  		if (!refcount_inc_not_zero(&mntns->ns.count))
2233  			continue;
2234  
2235  		return mntns;
2236  	}
2237  }
2238  
mnt_ns_from_dentry(struct dentry * dentry)2239  struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
2240  {
2241  	if (!is_mnt_ns_file(dentry))
2242  		return NULL;
2243  
2244  	return to_mnt_ns(get_proc_ns(dentry->d_inode));
2245  }
2246  
mnt_ns_loop(struct dentry * dentry)2247  static bool mnt_ns_loop(struct dentry *dentry)
2248  {
2249  	/* Could bind mounting the mount namespace inode cause a
2250  	 * mount namespace loop?
2251  	 */
2252  	struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
2253  
2254  	if (!mnt_ns)
2255  		return false;
2256  
2257  	return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
2258  }
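/*
 * Editorial note: mount namespace sequence numbers are handed out in
 * creation order, so the check above only permits bind-mounting an nsfs
 * file that refers to a *younger* namespace.  Example: a task in
 * namespace A (seq 10) may pin namespace B only if B->seq > 10; pinning
 * an equal or older namespace could form a reference cycle between the
 * namespace and a mount inside it that would never be torn down.
 */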
2259  
copy_tree(struct mount * src_root,struct dentry * dentry,int flag)2260  struct mount *copy_tree(struct mount *src_root, struct dentry *dentry,
2261  					int flag)
2262  {
2263  	struct mount *res, *src_parent, *src_root_child, *src_mnt,
2264  		*dst_parent, *dst_mnt;
2265  
2266  	if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root))
2267  		return ERR_PTR(-EINVAL);
2268  
2269  	if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry))
2270  		return ERR_PTR(-EINVAL);
2271  
2272  	res = dst_mnt = clone_mnt(src_root, dentry, flag);
2273  	if (IS_ERR(dst_mnt))
2274  		return dst_mnt;
2275  
2276  	src_parent = src_root;
2277  	dst_mnt->mnt_mountpoint = src_root->mnt_mountpoint;
2278  
2279  	list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) {
2280  		if (!is_subdir(src_root_child->mnt_mountpoint, dentry))
2281  			continue;
2282  
2283  		for (src_mnt = src_root_child; src_mnt;
2284  		    src_mnt = next_mnt(src_mnt, src_root_child)) {
2285  			if (!(flag & CL_COPY_UNBINDABLE) &&
2286  			    IS_MNT_UNBINDABLE(src_mnt)) {
2287  				if (src_mnt->mnt.mnt_flags & MNT_LOCKED) {
2288  					/* Both unbindable and locked. */
2289  					dst_mnt = ERR_PTR(-EPERM);
2290  					goto out;
2291  				} else {
2292  					src_mnt = skip_mnt_tree(src_mnt);
2293  					continue;
2294  				}
2295  			}
2296  			if (!(flag & CL_COPY_MNT_NS_FILE) &&
2297  			    is_mnt_ns_file(src_mnt->mnt.mnt_root)) {
2298  				src_mnt = skip_mnt_tree(src_mnt);
2299  				continue;
2300  			}
2301  			while (src_parent != src_mnt->mnt_parent) {
2302  				src_parent = src_parent->mnt_parent;
2303  				dst_mnt = dst_mnt->mnt_parent;
2304  			}
2305  
2306  			src_parent = src_mnt;
2307  			dst_parent = dst_mnt;
2308  			dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag);
2309  			if (IS_ERR(dst_mnt))
2310  				goto out;
2311  			lock_mount_hash();
2312  			list_add_tail(&dst_mnt->mnt_list, &res->mnt_list);
2313  			attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp, false);
2314  			unlock_mount_hash();
2315  		}
2316  	}
2317  	return res;
2318  
2319  out:
2320  	if (res) {
2321  		lock_mount_hash();
2322  		umount_tree(res, UMOUNT_SYNC);
2323  		unlock_mount_hash();
2324  	}
2325  	return dst_mnt;
2326  }
2327  
2328  /* Caller should check returned pointer for errors */
2329  
collect_mounts(const struct path * path)2330  struct vfsmount *collect_mounts(const struct path *path)
2331  {
2332  	struct mount *tree;
2333  	namespace_lock();
2334  	if (!check_mnt(real_mount(path->mnt)))
2335  		tree = ERR_PTR(-EINVAL);
2336  	else
2337  		tree = copy_tree(real_mount(path->mnt), path->dentry,
2338  				 CL_COPY_ALL | CL_PRIVATE);
2339  	namespace_unlock();
2340  	if (IS_ERR(tree))
2341  		return ERR_CAST(tree);
2342  	return &tree->mnt;
2343  }
2344  
2345  static void free_mnt_ns(struct mnt_namespace *);
2346  static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
2347  
must_dissolve(struct mnt_namespace * mnt_ns)2348  static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
2349  {
2350  	/*
2351  	 * This mount belonged to an anonymous mount namespace
2352  	 * but was moved to a non-anonymous mount namespace and
2353  	 * then unmounted.
2354  	 */
2355  	if (unlikely(!mnt_ns))
2356  		return false;
2357  
2358  	/*
2359  	 * This mount belongs to a non-anonymous mount namespace
2360  	 * and we know that such a mount can never transition to
2361  	 * an anonymous mount namespace again.
2362  	 */
2363  	if (!is_anon_ns(mnt_ns)) {
2364  		/*
2365  		 * A detached mount either belongs to an anonymous mount
2366  		 * namespace or a non-anonymous mount namespace. It
2367  		 * should never belong to something purely internal.
2368  		 */
2369  		VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
2370  		return false;
2371  	}
2372  
2373  	return true;
2374  }
2375  
dissolve_on_fput(struct vfsmount * mnt)2376  void dissolve_on_fput(struct vfsmount *mnt)
2377  {
2378  	struct mnt_namespace *ns;
2379  	struct mount *m = real_mount(mnt);
2380  
2381  	scoped_guard(rcu) {
2382  		if (!must_dissolve(READ_ONCE(m->mnt_ns)))
2383  			return;
2384  	}
2385  
2386  	scoped_guard(rwsem_write, &namespace_sem) {
2387  		ns = m->mnt_ns;
2388  		if (!must_dissolve(ns))
2389  			return;
2390  
2391  		/*
2392  		 * After must_dissolve() we know that this is a detached
2393  		 * mount in an anonymous mount namespace.
2394  		 *
2395  		 * Now when mnt_has_parent() reports that this mount
2396  		 * tree has a parent, we know that this anonymous mount
2397  		 * tree has been moved to another anonymous mount
2398  		 * namespace.
2399  		 *
2400  		 * So when closing this file we cannot unmount the mount
2401  		 * tree. This will be done when the file referring to
2402  		 * the root of the anonymous mount namespace will be
2403  		 * closed (It could already be closed but it would sync
2404  		 * on @namespace_sem and wait for us to finish.).
2405  		 */
2406  		if (mnt_has_parent(m))
2407  			return;
2408  
2409  		lock_mount_hash();
2410  		umount_tree(m, UMOUNT_CONNECTED);
2411  		unlock_mount_hash();
2412  	}
2413  
2414  	/* Make sure we notice when we leak mounts. */
2415  	VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
2416  	free_mnt_ns(ns);
2417  }
2418  
drop_collected_mounts(struct vfsmount * mnt)2419  void drop_collected_mounts(struct vfsmount *mnt)
2420  {
2421  	namespace_lock();
2422  	lock_mount_hash();
2423  	umount_tree(real_mount(mnt), 0);
2424  	unlock_mount_hash();
2425  	namespace_unlock();
2426  }
2427  
has_locked_children(struct mount * mnt,struct dentry * dentry)2428  bool has_locked_children(struct mount *mnt, struct dentry *dentry)
2429  {
2430  	struct mount *child;
2431  
2432  	list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
2433  		if (!is_subdir(child->mnt_mountpoint, dentry))
2434  			continue;
2435  
2436  		if (child->mnt.mnt_flags & MNT_LOCKED)
2437  			return true;
2438  	}
2439  	return false;
2440  }
2441  
2442  /*
2443   * Check that there aren't references to earlier/same mount namespaces in the
2444   * specified subtree.  Such references can act as pins for mount namespaces
2445   * that aren't checked by the mount-cycle checking code, thereby allowing
2446   * cycles to be made.
2447   */
check_for_nsfs_mounts(struct mount * subtree)2448  static bool check_for_nsfs_mounts(struct mount *subtree)
2449  {
2450  	struct mount *p;
2451  	bool ret = false;
2452  
2453  	lock_mount_hash();
2454  	for (p = subtree; p; p = next_mnt(p, subtree))
2455  		if (mnt_ns_loop(p->mnt.mnt_root))
2456  			goto out;
2457  
2458  	ret = true;
2459  out:
2460  	unlock_mount_hash();
2461  	return ret;
2462  }
2463  
2464  /**
2465   * clone_private_mount - create a private clone of a path
2466   * @path: path to clone
2467   *
2468   * This creates a new vfsmount, which will be the clone of @path.  The new mount
2469   * will not be attached anywhere in the namespace and will be private (i.e.
2470   * changes to the originating mount won't be propagated into this).
2471   *
2472   * This assumes caller has called or done the equivalent of may_mount().
2473   *
2474   * Release with mntput().
2475   */
clone_private_mount(const struct path * path)2476  struct vfsmount *clone_private_mount(const struct path *path)
2477  {
2478  	struct mount *old_mnt = real_mount(path->mnt);
2479  	struct mount *new_mnt;
2480  
2481  	scoped_guard(rwsem_read, &namespace_sem)
2482  	if (IS_MNT_UNBINDABLE(old_mnt))
2483  		return ERR_PTR(-EINVAL);
2484  
2485  	if (mnt_has_parent(old_mnt)) {
2486  		if (!check_mnt(old_mnt))
2487  			return ERR_PTR(-EINVAL);
2488  	} else {
2489  		if (!is_mounted(&old_mnt->mnt))
2490  			return ERR_PTR(-EINVAL);
2491  
2492  		/* Make sure this isn't something purely kernel internal. */
2493  		if (!is_anon_ns(old_mnt->mnt_ns))
2494  			return ERR_PTR(-EINVAL);
2495  
2496  		/* Make sure we don't create mount namespace loops. */
2497  		if (!check_for_nsfs_mounts(old_mnt))
2498  			return ERR_PTR(-EINVAL);
2499  	}
2500  
2501  	if (has_locked_children(old_mnt, path->dentry))
2502  		return ERR_PTR(-EINVAL);
2503  
2504  	new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
2505  	if (IS_ERR(new_mnt))
2506  		return ERR_PTR(-EINVAL);
2507  
2508  	/* Longterm mount to be removed by kern_unmount*() */
2509  	new_mnt->mnt_ns = MNT_NS_INTERNAL;
2510  	return &new_mnt->mnt;
2511  }
2512  EXPORT_SYMBOL_GPL(clone_private_mount);
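/*
 * Editorial usage sketch (variable names are made up; overlayfs is one
 * in-tree user that clones the mounts of its layer directories this way):
 *
 *	struct vfsmount *m = clone_private_mount(&layer_path);
 *	if (IS_ERR(m))
 *		return PTR_ERR(m);
 *	... lookups against m / m->mnt_root; nothing propagates from the source ...
 *
 * Because the clone is tagged MNT_NS_INTERNAL it is a long-term mount and
 * is eventually released through the kern_unmount*() helpers.
 */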
2513  
iterate_mounts(int (* f)(struct vfsmount *,void *),void * arg,struct vfsmount * root)2514  int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
2515  		   struct vfsmount *root)
2516  {
2517  	struct mount *mnt;
2518  	int res = f(root, arg);
2519  	if (res)
2520  		return res;
2521  	list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) {
2522  		res = f(&mnt->mnt, arg);
2523  		if (res)
2524  			return res;
2525  	}
2526  	return 0;
2527  }
2528  
lock_mnt_tree(struct mount * mnt)2529  static void lock_mnt_tree(struct mount *mnt)
2530  {
2531  	struct mount *p;
2532  
2533  	for (p = mnt; p; p = next_mnt(p, mnt)) {
2534  		int flags = p->mnt.mnt_flags;
2535  		/* Don't allow unprivileged users to change mount flags */
2536  		flags |= MNT_LOCK_ATIME;
2537  
2538  		if (flags & MNT_READONLY)
2539  			flags |= MNT_LOCK_READONLY;
2540  
2541  		if (flags & MNT_NODEV)
2542  			flags |= MNT_LOCK_NODEV;
2543  
2544  		if (flags & MNT_NOSUID)
2545  			flags |= MNT_LOCK_NOSUID;
2546  
2547  		if (flags & MNT_NOEXEC)
2548  			flags |= MNT_LOCK_NOEXEC;
2549  		/* Don't allow unprivileged users to reveal what is under a mount */
2550  		if (list_empty(&p->mnt_expire))
2551  			flags |= MNT_LOCKED;
2552  		p->mnt.mnt_flags = flags;
2553  	}
2554  }
2555  
cleanup_group_ids(struct mount * mnt,struct mount * end)2556  static void cleanup_group_ids(struct mount *mnt, struct mount *end)
2557  {
2558  	struct mount *p;
2559  
2560  	for (p = mnt; p != end; p = next_mnt(p, mnt)) {
2561  		if (p->mnt_group_id && !IS_MNT_SHARED(p))
2562  			mnt_release_group_id(p);
2563  	}
2564  }
2565  
invent_group_ids(struct mount * mnt,bool recurse)2566  static int invent_group_ids(struct mount *mnt, bool recurse)
2567  {
2568  	struct mount *p;
2569  
2570  	for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
2571  		if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
2572  			int err = mnt_alloc_group_id(p);
2573  			if (err) {
2574  				cleanup_group_ids(mnt, p);
2575  				return err;
2576  			}
2577  		}
2578  	}
2579  
2580  	return 0;
2581  }
2582  
count_mounts(struct mnt_namespace * ns,struct mount * mnt)2583  int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
2584  {
2585  	unsigned int max = READ_ONCE(sysctl_mount_max);
2586  	unsigned int mounts = 0;
2587  	struct mount *p;
2588  
2589  	if (ns->nr_mounts >= max)
2590  		return -ENOSPC;
2591  	max -= ns->nr_mounts;
2592  	if (ns->pending_mounts >= max)
2593  		return -ENOSPC;
2594  	max -= ns->pending_mounts;
2595  
2596  	for (p = mnt; p; p = next_mnt(p, mnt))
2597  		mounts++;
2598  
2599  	if (mounts > max)
2600  		return -ENOSPC;
2601  
2602  	ns->pending_mounts += mounts;
2603  	return 0;
2604  }
2605  
2606  enum mnt_tree_flags_t {
2607  	MNT_TREE_MOVE = BIT(0),
2608  	MNT_TREE_BENEATH = BIT(1),
2609  	MNT_TREE_PROPAGATION = BIT(2),
2610  };
2611  
2612  /**
2613   * attach_recursive_mnt - attach a source mount tree
2614   * @source_mnt: mount tree to be attached
2615   * @top_mnt:    mount that @source_mnt will be mounted on or mounted beneath
2616   * @dest_mp:    the mountpoint @source_mnt will be mounted at
2617   * @flags:      modify how @source_mnt is supposed to be attached
2618   *
2619   *  NOTE: the table below explains the semantics when a source mount
2620   *  of a given type is attached to a destination mount of a given type.
2621   * ---------------------------------------------------------------------------
2622   * |         BIND MOUNT OPERATION                                            |
2623   * |**************************************************************************
2624   * | source-->| shared        |       private  |       slave    | unbindable |
2625   * | dest     |               |                |                |            |
2626   * |   |      |               |                |                |            |
2627   * |   v      |               |                |                |            |
2628   * |**************************************************************************
2629   * |  shared  | shared (++)   |     shared (+) |     shared(+++)|  invalid   |
2630   * |          |               |                |                |            |
2631   * |non-shared| shared (+)    |      private   |      slave (*) |  invalid   |
2632   * ***************************************************************************
2633   * A bind operation clones the source mount and mounts the clone on the
2634   * destination mount.
2635   *
2636   * (++)  the cloned mount is propagated to all the mounts in the propagation
2637   * 	 tree of the destination mount and the cloned mount is added to
2638   * 	 the peer group of the source mount.
2639   * (+)   the cloned mount is created under the destination mount and is marked
2640   *       as shared. The cloned mount is added to the peer group of the source
2641   *       mount.
2642   * (+++) the mount is propagated to all the mounts in the propagation tree
2643   *       of the destination mount and the cloned mount is made slave
2644   *       of the same master as that of the source mount. The cloned mount
2645   *       is marked as 'shared and slave'.
2646   * (*)   the cloned mount is made a slave of the same master as that of the
2647   * 	 source mount.
2648   *
2649   * ---------------------------------------------------------------------------
2650   * |         		MOVE MOUNT OPERATION                                 |
2651   * |**************************************************************************
2652   * | source-->| shared        |       private  |       slave    | unbindable |
2653   * | dest     |               |                |                |            |
2654   * |   |      |               |                |                |            |
2655   * |   v      |               |                |                |            |
2656   * |**************************************************************************
2657   * |  shared  | shared (+)    |     shared (+) |    shared(+++) |  invalid   |
2658   * |          |               |                |                |            |
2659   * |non-shared| shared (+*)   |      private   |    slave (*)   | unbindable |
2660   * ***************************************************************************
2661   *
2662   * (+)  the mount is moved to the destination. And is then propagated to
2663   * 	all the mounts in the propagation tree of the destination mount.
2664   * (+*)  the mount is moved to the destination.
2665   * (+++)  the mount is moved to the destination and is then propagated to
2666   * 	all the mounts belonging to the destination mount's propagation tree.
2667   * 	the mount is marked as 'shared and slave'.
2668   * (*)	the mount continues to be a slave at the new location.
2669   *
2670   * If the source mount is a tree, the operations explained above are
2671   * applied to each mount in the tree.
2672   * Must be called without spinlocks held, since this function can sleep
2673   * in allocations.
2674   *
2675   * Context: The function expects namespace_lock() to be held.
2676   * Return: If @source_mnt was successfully attached 0 is returned.
2677   *         Otherwise a negative error code is returned.
2678   */
attach_recursive_mnt(struct mount * source_mnt,struct mount * top_mnt,struct mountpoint * dest_mp,enum mnt_tree_flags_t flags)2679  static int attach_recursive_mnt(struct mount *source_mnt,
2680  				struct mount *top_mnt,
2681  				struct mountpoint *dest_mp,
2682  				enum mnt_tree_flags_t flags)
2683  {
2684  	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
2685  	HLIST_HEAD(tree_list);
2686  	struct mnt_namespace *ns = top_mnt->mnt_ns;
2687  	struct mountpoint *smp;
2688  	struct mount *child, *dest_mnt, *p;
2689  	struct hlist_node *n;
2690  	int err = 0;
2691  	bool moving = flags & MNT_TREE_MOVE, beneath = flags & MNT_TREE_BENEATH;
2692  
2693  	/*
2694  	 * Preallocate a mountpoint in case the new mounts need to be
2695  	 * mounted beneath mounts on the same mountpoint.
2696  	 */
2697  	smp = get_mountpoint(source_mnt->mnt.mnt_root);
2698  	if (IS_ERR(smp))
2699  		return PTR_ERR(smp);
2700  
2701  	/* Is there space to add these mounts to the mount namespace? */
2702  	if (!moving) {
2703  		err = count_mounts(ns, source_mnt);
2704  		if (err)
2705  			goto out;
2706  	}
2707  
2708  	if (beneath)
2709  		dest_mnt = top_mnt->mnt_parent;
2710  	else
2711  		dest_mnt = top_mnt;
2712  
2713  	if (IS_MNT_SHARED(dest_mnt)) {
2714  		err = invent_group_ids(source_mnt, true);
2715  		if (err)
2716  			goto out;
2717  		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
2718  	}
2719  	lock_mount_hash();
2720  	if (err)
2721  		goto out_cleanup_ids;
2722  
2723  	if (IS_MNT_SHARED(dest_mnt)) {
2724  		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
2725  			set_mnt_shared(p);
2726  	}
2727  
2728  	if (moving) {
2729  		if (beneath)
2730  			dest_mp = smp;
2731  		unhash_mnt(source_mnt);
2732  		attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
2733  		mnt_notify_add(source_mnt);
2734  		touch_mnt_namespace(source_mnt->mnt_ns);
2735  	} else {
2736  		if (source_mnt->mnt_ns) {
2737  			LIST_HEAD(head);
2738  
2739  			/* move from anon - the caller will destroy */
2740  			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
2741  				move_from_ns(p, &head);
2742  			list_del_init(&head);
2743  		}
2744  		if (beneath)
2745  			mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
2746  		else
2747  			mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
2748  		commit_tree(source_mnt);
2749  	}
2750  
2751  	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
2752  		struct mount *q;
2753  		hlist_del_init(&child->mnt_hash);
2754  		q = __lookup_mnt(&child->mnt_parent->mnt,
2755  				 child->mnt_mountpoint);
2756  		if (q)
2757  			mnt_change_mountpoint(child, smp, q);
2758  		/* Notice when we are propagating across user namespaces */
2759  		if (child->mnt_parent->mnt_ns->user_ns != user_ns)
2760  			lock_mnt_tree(child);
2761  		child->mnt.mnt_flags &= ~MNT_LOCKED;
2762  		commit_tree(child);
2763  	}
2764  	put_mountpoint(smp);
2765  	unlock_mount_hash();
2766  
2767  	return 0;
2768  
2769   out_cleanup_ids:
2770  	while (!hlist_empty(&tree_list)) {
2771  		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
2772  		child->mnt_parent->mnt_ns->pending_mounts = 0;
2773  		umount_tree(child, UMOUNT_SYNC);
2774  	}
2775  	unlock_mount_hash();
2776  	cleanup_group_ids(source_mnt, NULL);
2777   out:
2778  	ns->pending_mounts = 0;
2779  
2780  	read_seqlock_excl(&mount_lock);
2781  	put_mountpoint(smp);
2782  	read_sequnlock_excl(&mount_lock);
2783  
2784  	return err;
2785  }
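/*
 * Editorial example for the propagation table above (paths are made up):
 * if /mnt is a shared mount with a peer in another mount namespace, then
 *
 *	mount("/dev/sdb1", "/mnt/disk", "ext4", 0, NULL);
 *
 * attaches the new mount under /mnt and, per the "shared" destination row,
 * propagate_mnt() hangs a clone of it on every mount in /mnt's propagation
 * tree (its peers and their slaves), so the other namespace observes the
 * same mount appearing.
 */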
2786  
2787  /**
2788   * do_lock_mount - lock mount and mountpoint
2789   * @path:    target path
2790   * @beneath: whether the intention is to mount beneath @path
2791   *
2792   * Follow the mount stack on @path until the top mount @mnt is found. If
2793   * the initial @path->{mnt,dentry} is a mountpoint, look up the first
2794   * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root}
2795   * until nothing is stacked on top of it anymore.
2796   *
2797   * Acquire the inode_lock() on the top mount's ->mnt_root to protect
2798   * against concurrent removal of the new mountpoint from another mount
2799   * namespace.
2800   *
2801   * If @beneath is requested, the inode_lock() of @mnt's mountpoint @mp
2802   * on @mnt->mnt_parent must be acquired instead. This protects against a
2803   * concurrent unlink of @mp->mnt_dentry from another mount namespace
2804   * where @mnt doesn't have a child mount mounted on @mp. A concurrent
2805   * removal of @mnt->mnt_root doesn't matter as nothing will be mounted
2806   * on top of it for @beneath.
2807   *
2808   * In addition, @beneath needs to make sure that @mnt hasn't been
2809   * unmounted or moved from its current mountpoint in between dropping
2810   * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt
2811   * being unmounted would be detected later by e.g., calling
2812   * check_mnt(mnt) in the function it's called from. For the @beneath
2813   * case however, it's useful to detect it directly in do_lock_mount().
2814   * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points
2815   * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will
2816   * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL.
2817   *
2818   * Return: Either the target mountpoint on the top mount or the top
2819   *         mount's mountpoint.
2820   */
do_lock_mount(struct path * path,bool beneath)2821  static struct mountpoint *do_lock_mount(struct path *path, bool beneath)
2822  {
2823  	struct vfsmount *mnt = path->mnt;
2824  	struct dentry *dentry;
2825  	struct mountpoint *mp = ERR_PTR(-ENOENT);
2826  
2827  	for (;;) {
2828  		struct mount *m;
2829  
2830  		if (beneath) {
2831  			m = real_mount(mnt);
2832  			read_seqlock_excl(&mount_lock);
2833  			dentry = dget(m->mnt_mountpoint);
2834  			read_sequnlock_excl(&mount_lock);
2835  		} else {
2836  			dentry = path->dentry;
2837  		}
2838  
2839  		inode_lock(dentry->d_inode);
2840  		if (unlikely(cant_mount(dentry))) {
2841  			inode_unlock(dentry->d_inode);
2842  			goto out;
2843  		}
2844  
2845  		namespace_lock();
2846  
2847  		if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) {
2848  			namespace_unlock();
2849  			inode_unlock(dentry->d_inode);
2850  			goto out;
2851  		}
2852  
2853  		mnt = lookup_mnt(path);
2854  		if (likely(!mnt))
2855  			break;
2856  
2857  		namespace_unlock();
2858  		inode_unlock(dentry->d_inode);
2859  		if (beneath)
2860  			dput(dentry);
2861  		path_put(path);
2862  		path->mnt = mnt;
2863  		path->dentry = dget(mnt->mnt_root);
2864  	}
2865  
2866  	mp = get_mountpoint(dentry);
2867  	if (IS_ERR(mp)) {
2868  		namespace_unlock();
2869  		inode_unlock(dentry->d_inode);
2870  	}
2871  
2872  out:
2873  	if (beneath)
2874  		dput(dentry);
2875  
2876  	return mp;
2877  }
2878  
lock_mount(struct path * path)2879  static inline struct mountpoint *lock_mount(struct path *path)
2880  {
2881  	return do_lock_mount(path, false);
2882  }
2883  
unlock_mount(struct mountpoint * where)2884  static void unlock_mount(struct mountpoint *where)
2885  {
2886  	struct dentry *dentry = where->m_dentry;
2887  
2888  	read_seqlock_excl(&mount_lock);
2889  	put_mountpoint(where);
2890  	read_sequnlock_excl(&mount_lock);
2891  
2892  	namespace_unlock();
2893  	inode_unlock(dentry->d_inode);
2894  }
2895  
graft_tree(struct mount * mnt,struct mount * p,struct mountpoint * mp)2896  static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
2897  {
2898  	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
2899  		return -EINVAL;
2900  
2901  	if (d_is_dir(mp->m_dentry) !=
2902  	      d_is_dir(mnt->mnt.mnt_root))
2903  		return -ENOTDIR;
2904  
2905  	return attach_recursive_mnt(mnt, p, mp, 0);
2906  }
2907  
2908  /*
2909   * Sanity check the flags to change_mnt_propagation.
2910   */
2911  
flags_to_propagation_type(int ms_flags)2912  static int flags_to_propagation_type(int ms_flags)
2913  {
2914  	int type = ms_flags & ~(MS_REC | MS_SILENT);
2915  
2916  	/* Fail if any non-propagation flags are set */
2917  	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
2918  		return 0;
2919  	/* Only one propagation flag should be set */
2920  	if (!is_power_of_2(type))
2921  		return 0;
2922  	return type;
2923  }
2924  
2925  /*
2926   * recursively change the type of the mountpoint.
2927   */
do_change_type(struct path * path,int ms_flags)2928  static int do_change_type(struct path *path, int ms_flags)
2929  {
2930  	struct mount *m;
2931  	struct mount *mnt = real_mount(path->mnt);
2932  	int recurse = ms_flags & MS_REC;
2933  	int type;
2934  	int err = 0;
2935  
2936  	if (!path_mounted(path))
2937  		return -EINVAL;
2938  
2939  	type = flags_to_propagation_type(ms_flags);
2940  	if (!type)
2941  		return -EINVAL;
2942  
2943  	namespace_lock();
2944  	if (type == MS_SHARED) {
2945  		err = invent_group_ids(mnt, recurse);
2946  		if (err)
2947  			goto out_unlock;
2948  	}
2949  
2950  	lock_mount_hash();
2951  	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
2952  		change_mnt_propagation(m, type);
2953  	unlock_mount_hash();
2954  
2955   out_unlock:
2956  	namespace_unlock();
2957  	return err;
2958  }
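/*
 * Editorial userspace sketch: this is the path taken by
 * "mount --make-rshared /" and friends (assumes <sys/mount.h>):
 *
 *	if (mount(NULL, "/", NULL, MS_REC | MS_SHARED, NULL) < 0)
 *		perror("make-rshared");
 *
 * Exactly one of MS_SHARED, MS_PRIVATE, MS_SLAVE or MS_UNBINDABLE may be
 * given, optionally together with MS_REC, as enforced by
 * flags_to_propagation_type() above.
 */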
2959  
2960  /* may_copy_tree() - check if a mount tree can be copied
2961   * @path: path to the mount tree to be copied
2962   *
2963   * This helper checks if the caller may copy the mount tree starting
2964   * from @path->mnt. The caller may copy the mount tree under the
2965   * following circumstances:
2966   *
2967   * (1) The caller is located in the mount namespace of the mount tree.
2968   *     This also implies that the mount does not belong to an anonymous
2969   *     mount namespace.
2970   * (2) The caller tries to copy an nfs mount referring to a mount
2971   *     namespace, i.e., the caller is trying to copy a mount namespace
2972   *     entry from nsfs.
2973   * (3) The caller tries to copy a pidfs mount referring to a pidfd.
2974   * (4) The caller is trying to copy a mount tree that belongs to an
2975   *     anonymous mount namespace.
2976   *
2977   *     For that to be safe, this helper enforces that the origin mount
2978   *     namespace the anonymous mount namespace was created from is the
2979   *     same as the caller's mount namespace by comparing the sequence
2980   *     numbers.
2981   *
2982   *     This is not strictly necessary. The current semantics of the new
2983   *     mount api enforce that the caller must be located in the same
2984   *     mount namespace as the mount tree it interacts with. Using the
2985   *     origin sequence number preserves these semantics even for
2986   *     anonymous mount namespaces. However, one could envision extending
2987   *     the api to directly operate across mount namespace if needed.
2988   *
2989   *     The ownership of a non-anonymous mount namespace such as the
2990   *     caller's cannot change.
2991   *     => We know that the caller's mount namespace is stable.
2992   *
2993   *     If the origin sequence number of the anonymous mount namespace is
2994   *     the same as the sequence number of the caller's mount namespace.
2995   *     => The owning namespaces are the same.
2996   *
2997   *     ==> The earlier capability check on the owning namespace of the
2998   *         caller's mount namespace ensures that the caller has the
2999   *         ability to copy the mount tree.
3000   *
3001   * Returns true if the mount tree can be copied, false otherwise.
3002   */
may_copy_tree(struct path * path)3003  static inline bool may_copy_tree(struct path *path)
3004  {
3005  	struct mount *mnt = real_mount(path->mnt);
3006  	const struct dentry_operations *d_op;
3007  
3008  	if (check_mnt(mnt))
3009  		return true;
3010  
3011  	d_op = path->dentry->d_op;
3012  	if (d_op == &ns_dentry_operations)
3013  		return true;
3014  
3015  	if (d_op == &pidfs_dentry_operations)
3016  		return true;
3017  
3018  	if (!is_mounted(path->mnt))
3019  		return false;
3020  
3021  	return check_anonymous_mnt(mnt);
3022  }
3023  
3024  
__do_loopback(struct path * old_path,int recurse)3025  static struct mount *__do_loopback(struct path *old_path, int recurse)
3026  {
3027  	struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
3028  
3029  	if (IS_MNT_UNBINDABLE(old))
3030  		return mnt;
3031  
3032  	if (!may_copy_tree(old_path))
3033  		return mnt;
3034  
3035  	if (!recurse && has_locked_children(old, old_path->dentry))
3036  		return mnt;
3037  
3038  	if (recurse)
3039  		mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE);
3040  	else
3041  		mnt = clone_mnt(old, old_path->dentry, 0);
3042  
3043  	if (!IS_ERR(mnt))
3044  		mnt->mnt.mnt_flags &= ~MNT_LOCKED;
3045  
3046  	return mnt;
3047  }
3048  
3049  /*
3050   * do loopback mount.
3051   */
do_loopback(struct path * path,const char * old_name,int recurse)3052  static int do_loopback(struct path *path, const char *old_name,
3053  				int recurse)
3054  {
3055  	struct path old_path;
3056  	struct mount *mnt = NULL, *parent;
3057  	struct mountpoint *mp;
3058  	int err;
3059  	if (!old_name || !*old_name)
3060  		return -EINVAL;
3061  	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
3062  	if (err)
3063  		return err;
3064  
3065  	err = -EINVAL;
3066  	if (mnt_ns_loop(old_path.dentry))
3067  		goto out;
3068  
3069  	mp = lock_mount(path);
3070  	if (IS_ERR(mp)) {
3071  		err = PTR_ERR(mp);
3072  		goto out;
3073  	}
3074  
3075  	parent = real_mount(path->mnt);
3076  	if (!check_mnt(parent))
3077  		goto out2;
3078  
3079  	mnt = __do_loopback(&old_path, recurse);
3080  	if (IS_ERR(mnt)) {
3081  		err = PTR_ERR(mnt);
3082  		goto out2;
3083  	}
3084  
3085  	err = graft_tree(mnt, parent, mp);
3086  	if (err) {
3087  		lock_mount_hash();
3088  		umount_tree(mnt, UMOUNT_SYNC);
3089  		unlock_mount_hash();
3090  	}
3091  out2:
3092  	unlock_mount(mp);
3093  out:
3094  	path_put(&old_path);
3095  	return err;
3096  }
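/*
 * Editorial userspace sketch of the bind mounts handled here (paths made
 * up; assumes <sys/mount.h>):
 *
 *	mount("/srv/data", "/mnt/data", NULL, MS_BIND, NULL);
 *	mount("/srv/data", "/mnt/data", NULL, MS_BIND | MS_REC, NULL);
 *
 * The non-recursive form is refused with -EINVAL when the source has
 * children locked to it (see has_locked_children() via __do_loopback()),
 * since a flat copy would expose what lies beneath them.
 */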
3097  
open_detached_copy(struct path * path,bool recursive)3098  static struct file *open_detached_copy(struct path *path, bool recursive)
3099  {
3100  	struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
3101  	struct user_namespace *user_ns = mnt_ns->user_ns;
3102  	struct mount *mnt, *p;
3103  	struct file *file;
3104  
3105  	ns = alloc_mnt_ns(user_ns, true);
3106  	if (IS_ERR(ns))
3107  		return ERR_CAST(ns);
3108  
3109  	namespace_lock();
3110  
3111  	/*
3112  	 * Record the sequence number of the source mount namespace.
3113  	 * This needs to hold namespace_sem to ensure that the mount
3114  	 * doesn't get attached.
3115  	 */
3116  	if (is_mounted(path->mnt)) {
3117  		src_mnt_ns = real_mount(path->mnt)->mnt_ns;
3118  		if (is_anon_ns(src_mnt_ns))
3119  			ns->seq_origin = src_mnt_ns->seq_origin;
3120  		else
3121  			ns->seq_origin = src_mnt_ns->seq;
3122  	}
3123  
3124  	mnt = __do_loopback(path, recursive);
3125  	if (IS_ERR(mnt)) {
3126  		namespace_unlock();
3127  		free_mnt_ns(ns);
3128  		return ERR_CAST(mnt);
3129  	}
3130  
3131  	lock_mount_hash();
3132  	for (p = mnt; p; p = next_mnt(p, mnt)) {
3133  		mnt_add_to_ns(ns, p);
3134  		ns->nr_mounts++;
3135  	}
3136  	ns->root = mnt;
3137  	mntget(&mnt->mnt);
3138  	unlock_mount_hash();
3139  	namespace_unlock();
3140  
3141  	mntput(path->mnt);
3142  	path->mnt = &mnt->mnt;
3143  	file = dentry_open(path, O_PATH, current_cred());
3144  	if (IS_ERR(file))
3145  		dissolve_on_fput(path->mnt);
3146  	else
3147  		file->f_mode |= FMODE_NEED_UNMOUNT;
3148  	return file;
3149  }
3150  
vfs_open_tree(int dfd,const char __user * filename,unsigned int flags)3151  static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
3152  {
3153  	int ret;
3154  	struct path path __free(path_put) = {};
3155  	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
3156  	bool detached = flags & OPEN_TREE_CLONE;
3157  
3158  	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
3159  
3160  	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
3161  		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
3162  		      OPEN_TREE_CLOEXEC))
3163  		return ERR_PTR(-EINVAL);
3164  
3165  	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
3166  		return ERR_PTR(-EINVAL);
3167  
3168  	if (flags & AT_NO_AUTOMOUNT)
3169  		lookup_flags &= ~LOOKUP_AUTOMOUNT;
3170  	if (flags & AT_SYMLINK_NOFOLLOW)
3171  		lookup_flags &= ~LOOKUP_FOLLOW;
3172  	if (flags & AT_EMPTY_PATH)
3173  		lookup_flags |= LOOKUP_EMPTY;
3174  
3175  	if (detached && !may_mount())
3176  		return ERR_PTR(-EPERM);
3177  
3178  	ret = user_path_at(dfd, filename, lookup_flags, &path);
3179  	if (unlikely(ret))
3180  		return ERR_PTR(ret);
3181  
3182  	if (detached)
3183  		return open_detached_copy(&path, flags & AT_RECURSIVE);
3184  
3185  	return dentry_open(&path, O_PATH, current_cred());
3186  }
3187  
SYSCALL_DEFINE3(open_tree,int,dfd,const char __user *,filename,unsigned,flags)3188  SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
3189  {
3190  	int fd;
3191  	struct file *file __free(fput) = NULL;
3192  
3193  	file = vfs_open_tree(dfd, filename, flags);
3194  	if (IS_ERR(file))
3195  		return PTR_ERR(file);
3196  
3197  	fd = get_unused_fd_flags(flags & O_CLOEXEC);
3198  	if (fd < 0)
3199  		return fd;
3200  
3201  	fd_install(fd, no_free_ptr(file));
3202  	return fd;
3203  }
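/*
 * Editorial userspace sketch (assumes a libc exposing open_tree(2) and
 * move_mount(2), or raw syscall(2) wrappers; paths made up): grab a
 * detached, recursive copy of a subtree and attach it elsewhere.
 *
 *	int fd = open_tree(AT_FDCWD, "/srv/data",
 *			   OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
 *	if (fd >= 0) {
 *		move_mount(fd, "", AT_FDCWD, "/mnt/data",
 *			   MOVE_MOUNT_F_EMPTY_PATH);
 *		close(fd);
 *	}
 *
 * Without OPEN_TREE_CLONE the fd is merely an O_PATH handle on the
 * existing tree, and AT_RECURSIVE on its own is rejected with -EINVAL, as
 * checked in vfs_open_tree() above.
 */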
3204  
3205  /*
3206   * Don't allow locked mount flags to be cleared.
3207   *
3208   * No locks need to be held here while testing the various MNT_LOCK
3209   * flags because those flags can never be cleared once they are set.
3210   */
can_change_locked_flags(struct mount * mnt,unsigned int mnt_flags)3211  static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags)
3212  {
3213  	unsigned int fl = mnt->mnt.mnt_flags;
3214  
3215  	if ((fl & MNT_LOCK_READONLY) &&
3216  	    !(mnt_flags & MNT_READONLY))
3217  		return false;
3218  
3219  	if ((fl & MNT_LOCK_NODEV) &&
3220  	    !(mnt_flags & MNT_NODEV))
3221  		return false;
3222  
3223  	if ((fl & MNT_LOCK_NOSUID) &&
3224  	    !(mnt_flags & MNT_NOSUID))
3225  		return false;
3226  
3227  	if ((fl & MNT_LOCK_NOEXEC) &&
3228  	    !(mnt_flags & MNT_NOEXEC))
3229  		return false;
3230  
3231  	if ((fl & MNT_LOCK_ATIME) &&
3232  	    ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK)))
3233  		return false;
3234  
3235  	return true;
3236  }
3237  
change_mount_ro_state(struct mount * mnt,unsigned int mnt_flags)3238  static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags)
3239  {
3240  	bool readonly_request = (mnt_flags & MNT_READONLY);
3241  
3242  	if (readonly_request == __mnt_is_readonly(&mnt->mnt))
3243  		return 0;
3244  
3245  	if (readonly_request)
3246  		return mnt_make_readonly(mnt);
3247  
3248  	mnt->mnt.mnt_flags &= ~MNT_READONLY;
3249  	return 0;
3250  }
3251  
set_mount_attributes(struct mount * mnt,unsigned int mnt_flags)3252  static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
3253  {
3254  	mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
3255  	mnt->mnt.mnt_flags = mnt_flags;
3256  	touch_mnt_namespace(mnt->mnt_ns);
3257  }
3258  
mnt_warn_timestamp_expiry(struct path * mountpoint,struct vfsmount * mnt)3259  static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
3260  {
3261  	struct super_block *sb = mnt->mnt_sb;
3262  
3263  	if (!__mnt_is_readonly(mnt) &&
3264  	   (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) &&
3265  	   (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) {
3266  		char *buf, *mntpath;
3267  
3268  		buf = (char *)__get_free_page(GFP_KERNEL);
3269  		if (buf)
3270  			mntpath = d_path(mountpoint, buf, PAGE_SIZE);
3271  		else
3272  			mntpath = ERR_PTR(-ENOMEM);
3273  		if (IS_ERR(mntpath))
3274  			mntpath = "(unknown)";
3275  
3276  		pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n",
3277  			sb->s_type->name,
3278  			is_mounted(mnt) ? "remounted" : "mounted",
3279  			mntpath, &sb->s_time_max,
3280  			(unsigned long long)sb->s_time_max);
3281  
3282  		sb->s_iflags |= SB_I_TS_EXPIRY_WARNED;
3283  		if (buf)
3284  			free_page((unsigned long)buf);
3285  	}
3286  }
3287  
3288  /*
3289   * Handle reconfiguration of the mountpoint only without alteration of the
3290   * superblock it refers to.  This is triggered by specifying MS_REMOUNT|MS_BIND
3291   * to mount(2).
3292   */
do_reconfigure_mnt(struct path * path,unsigned int mnt_flags)3293  static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags)
3294  {
3295  	struct super_block *sb = path->mnt->mnt_sb;
3296  	struct mount *mnt = real_mount(path->mnt);
3297  	int ret;
3298  
3299  	if (!check_mnt(mnt))
3300  		return -EINVAL;
3301  
3302  	if (!path_mounted(path))
3303  		return -EINVAL;
3304  
3305  	if (!can_change_locked_flags(mnt, mnt_flags))
3306  		return -EPERM;
3307  
3308  	/*
3309  	 * We're only checking whether the superblock is read-only not
3310  	 * changing it, so only take down_read(&sb->s_umount).
3311  	 */
3312  	down_read(&sb->s_umount);
3313  	lock_mount_hash();
3314  	ret = change_mount_ro_state(mnt, mnt_flags);
3315  	if (ret == 0)
3316  		set_mount_attributes(mnt, mnt_flags);
3317  	unlock_mount_hash();
3318  	up_read(&sb->s_umount);
3319  
3320  	mnt_warn_timestamp_expiry(path, &mnt->mnt);
3321  
3322  	return ret;
3323  }
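/*
 * Editorial userspace sketch (assumes <sys/mount.h>; paths made up): the
 * read-only bind mount is the classic user of this path - first the bind,
 * then a per-mountpoint remount:
 *
 *	mount("/srv/data", "/mnt/ro", NULL, MS_BIND, NULL);
 *	mount(NULL, "/mnt/ro", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
 *
 * Only the vfsmount flags change; the superblock itself is untouched,
 * which is why do_reconfigure_mnt() only takes down_read(&sb->s_umount).
 */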
3324  
3325  /*
3326   * change filesystem flags. dir should be a physical root of filesystem.
3327   * If you've mounted a non-root directory somewhere and want to do remount
3328   * on it - tough luck.
3329   */
do_remount(struct path * path,int ms_flags,int sb_flags,int mnt_flags,void * data)3330  static int do_remount(struct path *path, int ms_flags, int sb_flags,
3331  		      int mnt_flags, void *data)
3332  {
3333  	int err;
3334  	struct super_block *sb = path->mnt->mnt_sb;
3335  	struct mount *mnt = real_mount(path->mnt);
3336  	struct fs_context *fc;
3337  
3338  	if (!check_mnt(mnt))
3339  		return -EINVAL;
3340  
3341  	if (!path_mounted(path))
3342  		return -EINVAL;
3343  
3344  	if (!can_change_locked_flags(mnt, mnt_flags))
3345  		return -EPERM;
3346  
3347  	fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK);
3348  	if (IS_ERR(fc))
3349  		return PTR_ERR(fc);
3350  
3351  	/*
3352  	 * Indicate to the filesystem that the remount request is coming
3353  	 * from the legacy mount system call.
3354  	 */
3355  	fc->oldapi = true;
3356  
3357  	err = parse_monolithic_mount_data(fc, data);
3358  	if (!err) {
3359  		down_write(&sb->s_umount);
3360  		err = -EPERM;
3361  		if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) {
3362  			err = reconfigure_super(fc);
3363  			if (!err) {
3364  				lock_mount_hash();
3365  				set_mount_attributes(mnt, mnt_flags);
3366  				unlock_mount_hash();
3367  			}
3368  		}
3369  		up_write(&sb->s_umount);
3370  	}
3371  
3372  	mnt_warn_timestamp_expiry(path, &mnt->mnt);
3373  
3374  	put_fs_context(fc);
3375  	return err;
3376  }
3377  
tree_contains_unbindable(struct mount * mnt)3378  static inline int tree_contains_unbindable(struct mount *mnt)
3379  {
3380  	struct mount *p;
3381  	for (p = mnt; p; p = next_mnt(p, mnt)) {
3382  		if (IS_MNT_UNBINDABLE(p))
3383  			return 1;
3384  	}
3385  	return 0;
3386  }
3387  
do_set_group(struct path * from_path,struct path * to_path)3388  static int do_set_group(struct path *from_path, struct path *to_path)
3389  {
3390  	struct mount *from, *to;
3391  	int err;
3392  
3393  	from = real_mount(from_path->mnt);
3394  	to = real_mount(to_path->mnt);
3395  
3396  	namespace_lock();
3397  
3398  	err = -EINVAL;
3399  	/* To and From must be mounted */
3400  	if (!is_mounted(&from->mnt))
3401  		goto out;
3402  	if (!is_mounted(&to->mnt))
3403  		goto out;
3404  
3405  	err = -EPERM;
3406  	/* We should be allowed to modify mount namespaces of both mounts */
3407  	if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN))
3408  		goto out;
3409  	if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN))
3410  		goto out;
3411  
3412  	err = -EINVAL;
3413  	/* To and From paths should be mount roots */
3414  	if (!path_mounted(from_path))
3415  		goto out;
3416  	if (!path_mounted(to_path))
3417  		goto out;
3418  
3419  	/* Setting sharing groups is only allowed across same superblock */
3420  	if (from->mnt.mnt_sb != to->mnt.mnt_sb)
3421  		goto out;
3422  
3423  	/* From mount root should be wider than To mount root */
3424  	if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root))
3425  		goto out;
3426  
3427  	/* From mount should not have locked children in place of To's root */
3428  	if (has_locked_children(from, to->mnt.mnt_root))
3429  		goto out;
3430  
3431  	/* Setting sharing groups is only allowed on private mounts */
3432  	if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to))
3433  		goto out;
3434  
3435  	/* From should not be private */
3436  	if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from))
3437  		goto out;
3438  
3439  	if (IS_MNT_SLAVE(from)) {
3440  		struct mount *m = from->mnt_master;
3441  
3442  		list_add(&to->mnt_slave, &m->mnt_slave_list);
3443  		to->mnt_master = m;
3444  	}
3445  
3446  	if (IS_MNT_SHARED(from)) {
3447  		to->mnt_group_id = from->mnt_group_id;
3448  		list_add(&to->mnt_share, &from->mnt_share);
3449  		lock_mount_hash();
3450  		set_mnt_shared(to);
3451  		unlock_mount_hash();
3452  	}
3453  
3454  	err = 0;
3455  out:
3456  	namespace_unlock();
3457  	return err;
3458  }
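
/*
 * Illustrative sketch (not part of this file): userspace reaches this via
 * move_mount(2) with MOVE_MOUNT_SET_GROUP, typically to make the mount
 * referred to by to_fd a propagation peer of the one referred to by
 * from_fd.  from_fd and to_fd are placeholder O_PATH-like fds for the two
 * mount roots:
 *
 *	#include <linux/mount.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	syscall(SYS_move_mount, from_fd, "", to_fd, "",
 *		MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH |
 *		MOVE_MOUNT_SET_GROUP);
 */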
3459  
3460  /**
3461   * path_overmounted - check if path is overmounted
3462   * @path: path to check
3463   *
3464   * Check if path is overmounted, i.e., if there's a mount on top of
3465   * @path->mnt with @path->dentry as mountpoint.
3466   *
3467   * Context: This function expects namespace_lock() to be held.
3468   * Return: If path is overmounted true is returned, false if not.
3469   */
path_overmounted(const struct path * path)3470  static inline bool path_overmounted(const struct path *path)
3471  {
3472  	rcu_read_lock();
3473  	if (unlikely(__lookup_mnt(path->mnt, path->dentry))) {
3474  		rcu_read_unlock();
3475  		return true;
3476  	}
3477  	rcu_read_unlock();
3478  	return false;
3479  }
3480  
3481  /**
3482   * can_move_mount_beneath - check that we can mount beneath the top mount
3483   * @from: mount to mount beneath
3484   * @to:   mount under which to mount
3485   * @mp:   mountpoint of @to
3486   *
3487   * - Make sure that @to->dentry is actually the root of a mount under
3488   *   which we can mount another mount.
3489   * - Make sure that nothing can be mounted beneath the caller's current
3490   *   root or the rootfs of the namespace.
3491   * - Make sure that the caller can unmount the topmost mount ensuring
3492   *   that the caller could reveal the underlying mountpoint.
3493   * - Ensure that nothing has been mounted on top of @from before we
3494   *   grabbed @namespace_sem to avoid creating pointless shadow mounts.
3495   * - Prevent mounting beneath a mount if the propagation relationship
3496   *   between the source mount, parent mount, and top mount would lead to
3497   *   nonsensical mount trees.
3498   *
3499   * Context: This function expects namespace_lock() to be held.
3500   * Return: On success 0, and on error a negative error code is returned.
3501   */
can_move_mount_beneath(const struct path * from,const struct path * to,const struct mountpoint * mp)3502  static int can_move_mount_beneath(const struct path *from,
3503  				  const struct path *to,
3504  				  const struct mountpoint *mp)
3505  {
3506  	struct mount *mnt_from = real_mount(from->mnt),
3507  		     *mnt_to = real_mount(to->mnt),
3508  		     *parent_mnt_to = mnt_to->mnt_parent;
3509  
3510  	if (!mnt_has_parent(mnt_to))
3511  		return -EINVAL;
3512  
3513  	if (!path_mounted(to))
3514  		return -EINVAL;
3515  
3516  	if (IS_MNT_LOCKED(mnt_to))
3517  		return -EINVAL;
3518  
3519  	/* Avoid creating shadow mounts during mount propagation. */
3520  	if (path_overmounted(from))
3521  		return -EINVAL;
3522  
3523  	/*
3524  	 * Mounting beneath the rootfs only makes sense when the
3525  	 * semantics of pivot_root(".", ".") are used.
3526  	 */
3527  	if (&mnt_to->mnt == current->fs->root.mnt)
3528  		return -EINVAL;
3529  	if (parent_mnt_to == current->nsproxy->mnt_ns->root)
3530  		return -EINVAL;
3531  
3532  	for (struct mount *p = mnt_from; mnt_has_parent(p); p = p->mnt_parent)
3533  		if (p == mnt_to)
3534  			return -EINVAL;
3535  
3536  	/*
3537  	 * If the parent mount propagates to the child mount this would
3538  	 * mean mounting @mnt_from on @mnt_to->mnt_parent and then
3539  	 * propagating a copy @c of @mnt_from on top of @mnt_to. This
3540  	 * defeats the whole purpose of mounting beneath another mount.
3541  	 */
3542  	if (propagation_would_overmount(parent_mnt_to, mnt_to, mp))
3543  		return -EINVAL;
3544  
3545  	/*
3546  	 * If @mnt_to->mnt_parent propagates to @mnt_from this would
3547  	 * mean propagating a copy @c of @mnt_from on top of @mnt_from.
3548  	 * Afterwards @mnt_from would be mounted on top of
3549  	 * @mnt_to->mnt_parent and @mnt_to would be unmounted from
3550  	 * @mnt_to->mnt_parent and remounted on @mnt_from. But since @c is
3551  	 * already mounted on @mnt_from, @mnt_to would ultimately be
3552  	 * remounted on top of @c. Afterwards, @mnt_from would be
3553  	 * covered by a copy @c of @mnt_from and @c would be covered by
3554  	 * @mnt_from itself. This defeats the whole purpose of mounting
3555  	 * @mnt_from beneath @mnt_to.
3556  	 */
3557  	if (propagation_would_overmount(parent_mnt_to, mnt_from, mp))
3558  		return -EINVAL;
3559  
3560  	return 0;
3561  }
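
/*
 * Illustrative sketch (not part of this file): mounting beneath a top mount
 * is requested from userspace with move_mount(2) and MOVE_MOUNT_BENEATH.
 * Assuming a detached tree created by fsmount() in mfd (a placeholder) and
 * a mount stacked on /mnt whose replacement should later be revealed by
 * unmounting the top mount:
 *
 *	#include <linux/mount.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *
 *	syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt",
 *		MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_BENEATH);
 */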
3562  
3563  /* may_use_mount() - check if a mount tree can be used
3564   * @mnt: vfsmount to be used
3565   *
3566   * This helper checks if the caller may use the mount tree starting
3567   * from @mnt. The caller may use the mount tree under the
3568   * following circumstances:
3569   *
3570   * (1) The caller is located in the mount namespace of the mount tree.
3571   *     This also implies that the mount does not belong to an anonymous
3572   *     mount namespace.
3573   * (2) The caller is trying to use a mount tree that belongs to an
3574   *     anonymous mount namespace.
3575   *
3576   *     For that to be safe, this helper enforces that the origin mount
3577   *     namespace the anonymous mount namespace was created from is the
3578   *     same as the caller's mount namespace by comparing the sequence
3579   *     numbers.
3580   *
3581   *     The ownership of a non-anonymous mount namespace such as the
3582   *     caller's cannot change.
3583   *     => We know that the caller's mount namespace is stable.
3584   *
3585   *     If the origin sequence number of the anonymous mount namespace is
3586   *     the same as the sequence number of the caller's mount namespace.
3587   *     the same as the sequence number of the caller's mount namespace,
3588   *
3589   *     ==> The earlier capability check on the owning namespace of the
3590   *         caller's mount namespace ensures that the caller has the
3591   *         ability to use the mount tree.
3592   *
3593   * Returns true if the mount tree can be used, false otherwise.
3594   */
may_use_mount(struct mount * mnt)3595  static inline bool may_use_mount(struct mount *mnt)
3596  {
3597  	if (check_mnt(mnt))
3598  		return true;
3599  
3600  	/*
3601  	 * Make sure that no one unmounted the target path or somehow
3602  	 * managed to get their hands on something purely kernel
3603  	 * internal.
3604  	 */
3605  	if (!is_mounted(&mnt->mnt))
3606  		return false;
3607  
3608  	return check_anonymous_mnt(mnt);
3609  }
3610  
do_move_mount(struct path * old_path,struct path * new_path,enum mnt_tree_flags_t flags)3611  static int do_move_mount(struct path *old_path,
3612  			 struct path *new_path, enum mnt_tree_flags_t flags)
3613  {
3614  	struct mnt_namespace *ns;
3615  	struct mount *p;
3616  	struct mount *old;
3617  	struct mount *parent;
3618  	struct mountpoint *mp, *old_mp;
3619  	int err;
3620  	bool attached, beneath = flags & MNT_TREE_BENEATH;
3621  
3622  	mp = do_lock_mount(new_path, beneath);
3623  	if (IS_ERR(mp))
3624  		return PTR_ERR(mp);
3625  
3626  	old = real_mount(old_path->mnt);
3627  	p = real_mount(new_path->mnt);
3628  	parent = old->mnt_parent;
3629  	attached = mnt_has_parent(old);
3630  	if (attached)
3631  		flags |= MNT_TREE_MOVE;
3632  	old_mp = old->mnt_mp;
3633  	ns = old->mnt_ns;
3634  
3635  	err = -EINVAL;
3636  	if (!may_use_mount(p))
3637  		goto out;
3638  
3639  	/* The thing moved must be mounted... */
3640  	if (!is_mounted(&old->mnt))
3641  		goto out;
3642  
3643  	/* ... and either ours or the root of anon namespace */
3644  	if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
3645  		goto out;
3646  
3647  	if (is_anon_ns(ns)) {
3648  		/*
3649  		 * Ending up with two files referring to the root of the
3650  		 * same anonymous mount namespace would cause an error
3651  		 * as this would mean trying to move the same mount
3652  		 * twice into the mount tree which would be rejected
3653  		 * later. But be explicit about it right here.
3654  		 */
3655  		if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns))
3656  			goto out;
3657  
3658  		/*
3659  		 * If this is an anonymous mount tree ensure that mount
3660  		 * propagation can detect mounts that were just
3661  		 * propagated to the target mount tree so we don't
3662  		 * propagate onto them.
3663  		 */
3664  		ns->mntns_flags |= MNTNS_PROPAGATING;
3665  	} else if (is_anon_ns(p->mnt_ns)) {
3666  		/*
3667  		 * Don't allow moving an attached mount tree to an
3668  		 * anonymous mount tree.
3669  		 */
3670  		goto out;
3671  	}
3672  
3673  	if (old->mnt.mnt_flags & MNT_LOCKED)
3674  		goto out;
3675  
3676  	if (!path_mounted(old_path))
3677  		goto out;
3678  
3679  	if (d_is_dir(new_path->dentry) !=
3680  	    d_is_dir(old_path->dentry))
3681  		goto out;
3682  	/*
3683  	 * Don't move a mount residing in a shared parent.
3684  	 */
3685  	if (attached && IS_MNT_SHARED(parent))
3686  		goto out;
3687  
3688  	if (beneath) {
3689  		err = can_move_mount_beneath(old_path, new_path, mp);
3690  		if (err)
3691  			goto out;
3692  
3693  		err = -EINVAL;
3694  		p = p->mnt_parent;
3695  		flags |= MNT_TREE_BENEATH;
3696  	}
3697  
3698  	/*
3699  	 * Don't move a mount tree containing unbindable mounts to a destination
3700  	 * mount which is shared.
3701  	 */
3702  	if (IS_MNT_SHARED(p) && tree_contains_unbindable(old))
3703  		goto out;
3704  	err = -ELOOP;
3705  	if (!check_for_nsfs_mounts(old))
3706  		goto out;
3707  	for (; mnt_has_parent(p); p = p->mnt_parent)
3708  		if (p == old)
3709  			goto out;
3710  
3711  	err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, flags);
3712  	if (err)
3713  		goto out;
3714  
3715  	if (is_anon_ns(ns))
3716  		ns->mntns_flags &= ~MNTNS_PROPAGATING;
3717  
3718  	/* if the mount is moved, it should no longer be expire
3719  	/* if the mount is moved, it should no longer expire
3720  	 * automatically */
3721  	if (attached)
3722  		put_mountpoint(old_mp);
3723  out:
3724  	unlock_mount(mp);
3725  	if (!err) {
3726  		if (attached) {
3727  			mntput_no_expire(parent);
3728  		} else {
3729  			/* Make sure we notice when we leak mounts. */
3730  			VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
3731  			free_mnt_ns(ns);
3732  		}
3733  	}
3734  	return err;
3735  }
3736  
do_move_mount_old(struct path * path,const char * old_name)3737  static int do_move_mount_old(struct path *path, const char *old_name)
3738  {
3739  	struct path old_path;
3740  	int err;
3741  
3742  	if (!old_name || !*old_name)
3743  		return -EINVAL;
3744  
3745  	err = kern_path(old_name, LOOKUP_FOLLOW, &old_path);
3746  	if (err)
3747  		return err;
3748  
3749  	err = do_move_mount(&old_path, path, 0);
3750  	path_put(&old_path);
3751  	return err;
3752  }
3753  
3754  /*
3755   * add a mount into a namespace's mount tree
3756   */
do_add_mount(struct mount * newmnt,struct mountpoint * mp,const struct path * path,int mnt_flags)3757  static int do_add_mount(struct mount *newmnt, struct mountpoint *mp,
3758  			const struct path *path, int mnt_flags)
3759  {
3760  	struct mount *parent = real_mount(path->mnt);
3761  
3762  	mnt_flags &= ~MNT_INTERNAL_FLAGS;
3763  
3764  	if (unlikely(!check_mnt(parent))) {
3765  		/* that's acceptable only for automounts done in private ns */
3766  		if (!(mnt_flags & MNT_SHRINKABLE))
3767  			return -EINVAL;
3768  		/* ... and for those we'd better have mountpoint still alive */
3769  		if (!parent->mnt_ns)
3770  			return -EINVAL;
3771  	}
3772  
3773  	/* Refuse the same filesystem on the same mount point */
3774  	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path))
3775  		return -EBUSY;
3776  
3777  	if (d_is_symlink(newmnt->mnt.mnt_root))
3778  		return -EINVAL;
3779  
3780  	newmnt->mnt.mnt_flags = mnt_flags;
3781  	return graft_tree(newmnt, parent, mp);
3782  }
3783  
3784  static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags);
3785  
3786  /*
3787   * Create a new mount using a superblock configuration and request it
3788   * be added to the namespace tree.
3789   */
do_new_mount_fc(struct fs_context * fc,struct path * mountpoint,unsigned int mnt_flags)3790  static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint,
3791  			   unsigned int mnt_flags)
3792  {
3793  	struct vfsmount *mnt;
3794  	struct mountpoint *mp;
3795  	struct super_block *sb = fc->root->d_sb;
3796  	int error;
3797  
3798  	error = security_sb_kern_mount(sb);
3799  	if (!error && mount_too_revealing(sb, &mnt_flags))
3800  		error = -EPERM;
3801  
3802  	if (unlikely(error)) {
3803  		fc_drop_locked(fc);
3804  		return error;
3805  	}
3806  
3807  	up_write(&sb->s_umount);
3808  
3809  	mnt = vfs_create_mount(fc);
3810  	if (IS_ERR(mnt))
3811  		return PTR_ERR(mnt);
3812  
3813  	mnt_warn_timestamp_expiry(mountpoint, mnt);
3814  
3815  	mp = lock_mount(mountpoint);
3816  	if (IS_ERR(mp)) {
3817  		mntput(mnt);
3818  		return PTR_ERR(mp);
3819  	}
3820  	error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags);
3821  	unlock_mount(mp);
3822  	if (error < 0)
3823  		mntput(mnt);
3824  	return error;
3825  }
3826  
3827  /*
3828   * create a new mount for userspace and request it to be added into the
3829   * namespace's tree
3830   */
do_new_mount(struct path * path,const char * fstype,int sb_flags,int mnt_flags,const char * name,void * data)3831  static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
3832  			int mnt_flags, const char *name, void *data)
3833  {
3834  	struct file_system_type *type;
3835  	struct fs_context *fc;
3836  	const char *subtype = NULL;
3837  	int err = 0;
3838  
3839  	if (!fstype)
3840  		return -EINVAL;
3841  
3842  	type = get_fs_type(fstype);
3843  	if (!type)
3844  		return -ENODEV;
3845  
3846  	if (type->fs_flags & FS_HAS_SUBTYPE) {
3847  		subtype = strchr(fstype, '.');
3848  		if (subtype) {
3849  			subtype++;
3850  			if (!*subtype) {
3851  				put_filesystem(type);
3852  				return -EINVAL;
3853  			}
3854  		}
3855  	}
3856  
3857  	fc = fs_context_for_mount(type, sb_flags);
3858  	put_filesystem(type);
3859  	if (IS_ERR(fc))
3860  		return PTR_ERR(fc);
3861  
3862  	/*
3863  	 * Indicate to the filesystem that the mount request is coming
3864  	 * from the legacy mount system call.
3865  	 */
3866  	fc->oldapi = true;
3867  
3868  	if (subtype)
3869  		err = vfs_parse_fs_string(fc, "subtype",
3870  					  subtype, strlen(subtype));
3871  	if (!err && name)
3872  		err = vfs_parse_fs_string(fc, "source", name, strlen(name));
3873  	if (!err)
3874  		err = parse_monolithic_mount_data(fc, data);
3875  	if (!err && !mount_capable(fc))
3876  		err = -EPERM;
3877  	if (!err)
3878  		err = vfs_get_tree(fc);
3879  	if (!err)
3880  		err = do_new_mount_fc(fc, path, mnt_flags);
3881  
3882  	put_fs_context(fc);
3883  	return err;
3884  }
3885  
finish_automount(struct vfsmount * m,const struct path * path)3886  int finish_automount(struct vfsmount *m, const struct path *path)
3887  {
3888  	struct dentry *dentry = path->dentry;
3889  	struct mountpoint *mp;
3890  	struct mount *mnt;
3891  	int err;
3892  
3893  	if (!m)
3894  		return 0;
3895  	if (IS_ERR(m))
3896  		return PTR_ERR(m);
3897  
3898  	mnt = real_mount(m);
3899  	/* The new mount record should have at least 2 refs to prevent it being
3900  	 * expired before we get a chance to add it
3901  	 */
3902  	BUG_ON(mnt_get_count(mnt) < 2);
3903  
3904  	if (m->mnt_sb == path->mnt->mnt_sb &&
3905  	    m->mnt_root == dentry) {
3906  		err = -ELOOP;
3907  		goto discard;
3908  	}
3909  
3910  	/*
3911  	 * we don't want to use lock_mount() - in this case finding something
3912  	 * that overmounts our mountpoint means "quietly drop what we've
3913  	 * got", not "try to mount it on top".
3914  	 */
3915  	inode_lock(dentry->d_inode);
3916  	namespace_lock();
3917  	if (unlikely(cant_mount(dentry))) {
3918  		err = -ENOENT;
3919  		goto discard_locked;
3920  	}
3921  	if (path_overmounted(path)) {
3922  		err = 0;
3923  		goto discard_locked;
3924  	}
3925  	mp = get_mountpoint(dentry);
3926  	if (IS_ERR(mp)) {
3927  		err = PTR_ERR(mp);
3928  		goto discard_locked;
3929  	}
3930  
3931  	err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE);
3932  	unlock_mount(mp);
3933  	if (unlikely(err))
3934  		goto discard;
3935  	mntput(m);
3936  	return 0;
3937  
3938  discard_locked:
3939  	namespace_unlock();
3940  	inode_unlock(dentry->d_inode);
3941  discard:
3942  	/* remove m from any expiration list it may be on */
3943  	if (!list_empty(&mnt->mnt_expire)) {
3944  		namespace_lock();
3945  		list_del_init(&mnt->mnt_expire);
3946  		namespace_unlock();
3947  	}
3948  	mntput(m);
3949  	mntput(m);
3950  	return err;
3951  }
3952  
3953  /**
3954   * mnt_set_expiry - Put a mount on an expiration list
3955   * @mnt: The mount to list.
3956   * @expiry_list: The list to add the mount to.
3957   */
mnt_set_expiry(struct vfsmount * mnt,struct list_head * expiry_list)3958  void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
3959  {
3960  	namespace_lock();
3961  
3962  	list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
3963  
3964  	namespace_unlock();
3965  }
3966  EXPORT_SYMBOL(mnt_set_expiry);
3967  
3968  /*
3969   * process a list of expirable mountpoints with the intent of discarding any
3970   * mountpoints that aren't in use and haven't been touched since last we came
3971   * here
3972   */
mark_mounts_for_expiry(struct list_head * mounts)3973  void mark_mounts_for_expiry(struct list_head *mounts)
3974  {
3975  	struct mount *mnt, *next;
3976  	LIST_HEAD(graveyard);
3977  
3978  	if (list_empty(mounts))
3979  		return;
3980  
3981  	namespace_lock();
3982  	lock_mount_hash();
3983  
3984  	/* extract from the expiration list every vfsmount that matches the
3985  	 * following criteria:
3986  	 * - only referenced by its parent vfsmount
3987  	 * - still marked for expiry (marked on the last call here; marks are
3988  	 *   cleared by mntput())
3989  	 */
3990  	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
3991  		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
3992  			propagate_mount_busy(mnt, 1))
3993  			continue;
3994  		list_move(&mnt->mnt_expire, &graveyard);
3995  	}
3996  	while (!list_empty(&graveyard)) {
3997  		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
3998  		touch_mnt_namespace(mnt->mnt_ns);
3999  		umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
4000  	}
4001  	unlock_mount_hash();
4002  	namespace_unlock();
4003  }
4004  
4005  EXPORT_SYMBOL_GPL(mark_mounts_for_expiry);
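
/*
 * Illustrative sketch (not part of this file): a filesystem that creates
 * automounts typically parks them on a private expiry list from its
 * ->d_automount() implementation and prunes the list from a periodic
 * worker.  The names below (my_fs_automounts, my_fs_expire) are
 * hypothetical:
 *
 *	static LIST_HEAD(my_fs_automounts);
 *
 *	In ->d_automount(), once the new mount has been created:
 *		mnt_set_expiry(newmnt, &my_fs_automounts);
 *
 *	From a periodically requeued worker:
 *		static void my_fs_expire(struct work_struct *work)
 *		{
 *			mark_mounts_for_expiry(&my_fs_automounts);
 *		}
 */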
4006  
4007  /*
4008   * Ripoff of 'select_parent()'
4009   *
4010   * search the list of submounts for a given mountpoint, and move any
4011   * shrinkable submounts to the 'graveyard' list.
4012   */
select_submounts(struct mount * parent,struct list_head * graveyard)4013  static int select_submounts(struct mount *parent, struct list_head *graveyard)
4014  {
4015  	struct mount *this_parent = parent;
4016  	struct list_head *next;
4017  	int found = 0;
4018  
4019  repeat:
4020  	next = this_parent->mnt_mounts.next;
4021  resume:
4022  	while (next != &this_parent->mnt_mounts) {
4023  		struct list_head *tmp = next;
4024  		struct mount *mnt = list_entry(tmp, struct mount, mnt_child);
4025  
4026  		next = tmp->next;
4027  		if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE))
4028  			continue;
4029  		/*
4030  		 * Descend a level if the mnt_mounts list is non-empty.
4031  		 */
4032  		if (!list_empty(&mnt->mnt_mounts)) {
4033  			this_parent = mnt;
4034  			goto repeat;
4035  		}
4036  
4037  		if (!propagate_mount_busy(mnt, 1)) {
4038  			list_move_tail(&mnt->mnt_expire, graveyard);
4039  			found++;
4040  		}
4041  	}
4042  	/*
4043  	 * All done at this level ... ascend and resume the search
4044  	 */
4045  	if (this_parent != parent) {
4046  		next = this_parent->mnt_child.next;
4047  		this_parent = this_parent->mnt_parent;
4048  		goto resume;
4049  	}
4050  	return found;
4051  }
4052  
4053  /*
4054   * process a list of expirable mountpoints with the intent of discarding any
4055   * submounts of a specific parent mountpoint
4056   *
4057   * mount_lock must be held for write
4058   */
shrink_submounts(struct mount * mnt)4059  static void shrink_submounts(struct mount *mnt)
4060  {
4061  	LIST_HEAD(graveyard);
4062  	struct mount *m;
4063  
4064  	/* extract submounts of 'mountpoint' from the expiration list */
4065  	while (select_submounts(mnt, &graveyard)) {
4066  		while (!list_empty(&graveyard)) {
4067  			m = list_first_entry(&graveyard, struct mount,
4068  						mnt_expire);
4069  			touch_mnt_namespace(m->mnt_ns);
4070  			umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC);
4071  		}
4072  	}
4073  }
4074  
copy_mount_options(const void __user * data)4075  static void *copy_mount_options(const void __user * data)
4076  {
4077  	char *copy;
4078  	unsigned left, offset;
4079  
4080  	if (!data)
4081  		return NULL;
4082  
4083  	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
4084  	if (!copy)
4085  		return ERR_PTR(-ENOMEM);
4086  
4087  	left = copy_from_user(copy, data, PAGE_SIZE);
4088  
4089  	/*
4090  	 * Not all architectures have an exact copy_from_user(). Resort to
4091  	 * byte at a time.
4092  	 */
4093  	offset = PAGE_SIZE - left;
4094  	while (left) {
4095  		char c;
4096  		if (get_user(c, (const char __user *)data + offset))
4097  			break;
4098  		copy[offset] = c;
4099  		left--;
4100  		offset++;
4101  	}
4102  
4103  	if (left == PAGE_SIZE) {
4104  		kfree(copy);
4105  		return ERR_PTR(-EFAULT);
4106  	}
4107  
4108  	return copy;
4109  }
4110  
copy_mount_string(const void __user * data)4111  static char *copy_mount_string(const void __user *data)
4112  {
4113  	return data ? strndup_user(data, PATH_MAX) : NULL;
4114  }
4115  
4116  /*
4117   * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to
4118   * be given to the mount() call (i.e. read-only, no-dev, no-suid etc.).
4119   *
4120   * data is a (void *) that can point to any structure up to
4121   * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent
4122   * information (or be NULL).
4123   *
4124   * Pre-0.97 versions of mount() didn't have a flags word.
4125   * When the flags word was introduced its top half was required
4126   * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9.
4127   * Therefore, if this magic number is present, it carries no information
4128   * and must be discarded.
4129   */
path_mount(const char * dev_name,struct path * path,const char * type_page,unsigned long flags,void * data_page)4130  int path_mount(const char *dev_name, struct path *path,
4131  		const char *type_page, unsigned long flags, void *data_page)
4132  {
4133  	unsigned int mnt_flags = 0, sb_flags;
4134  	int ret;
4135  
4136  	/* Discard magic */
4137  	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
4138  		flags &= ~MS_MGC_MSK;
4139  
4140  	/* Basic sanity checks */
4141  	if (data_page)
4142  		((char *)data_page)[PAGE_SIZE - 1] = 0;
4143  
4144  	if (flags & MS_NOUSER)
4145  		return -EINVAL;
4146  
4147  	ret = security_sb_mount(dev_name, path, type_page, flags, data_page);
4148  	if (ret)
4149  		return ret;
4150  	if (!may_mount())
4151  		return -EPERM;
4152  	if (flags & SB_MANDLOCK)
4153  		warn_mandlock();
4154  
4155  	/* Default to relatime unless overridden */
4156  	if (!(flags & MS_NOATIME))
4157  		mnt_flags |= MNT_RELATIME;
4158  
4159  	/* Separate the per-mountpoint flags */
4160  	if (flags & MS_NOSUID)
4161  		mnt_flags |= MNT_NOSUID;
4162  	if (flags & MS_NODEV)
4163  		mnt_flags |= MNT_NODEV;
4164  	if (flags & MS_NOEXEC)
4165  		mnt_flags |= MNT_NOEXEC;
4166  	if (flags & MS_NOATIME)
4167  		mnt_flags |= MNT_NOATIME;
4168  	if (flags & MS_NODIRATIME)
4169  		mnt_flags |= MNT_NODIRATIME;
4170  	if (flags & MS_STRICTATIME)
4171  		mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME);
4172  	if (flags & MS_RDONLY)
4173  		mnt_flags |= MNT_READONLY;
4174  	if (flags & MS_NOSYMFOLLOW)
4175  		mnt_flags |= MNT_NOSYMFOLLOW;
4176  
4177  	/* The default atime for remount is preservation */
4178  	if ((flags & MS_REMOUNT) &&
4179  	    ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME |
4180  		       MS_STRICTATIME)) == 0)) {
4181  		mnt_flags &= ~MNT_ATIME_MASK;
4182  		mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK;
4183  	}
4184  
4185  	sb_flags = flags & (SB_RDONLY |
4186  			    SB_SYNCHRONOUS |
4187  			    SB_MANDLOCK |
4188  			    SB_DIRSYNC |
4189  			    SB_SILENT |
4190  			    SB_POSIXACL |
4191  			    SB_LAZYTIME |
4192  			    SB_I_VERSION);
4193  
4194  	if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND))
4195  		return do_reconfigure_mnt(path, mnt_flags);
4196  	if (flags & MS_REMOUNT)
4197  		return do_remount(path, flags, sb_flags, mnt_flags, data_page);
4198  	if (flags & MS_BIND)
4199  		return do_loopback(path, dev_name, flags & MS_REC);
4200  	if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
4201  		return do_change_type(path, flags);
4202  	if (flags & MS_MOVE)
4203  		return do_move_mount_old(path, dev_name);
4204  
4205  	return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name,
4206  			    data_page);
4207  }
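
/*
 * Illustrative sketch (not part of this file): the classic entry point for
 * all of the above is mount(2), where the @type and @data pages select the
 * path taken here.  A fresh tmpfs mount with filesystem-specific options,
 * for example:
 *
 *	#include <sys/mount.h>
 *
 *	if (mount("none", "/mnt/scratch", "tmpfs",
 *		  MS_NOSUID | MS_NODEV, "size=64m,mode=0755") < 0)
 *		perror("mount");
 */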
4208  
do_mount(const char * dev_name,const char __user * dir_name,const char * type_page,unsigned long flags,void * data_page)4209  int do_mount(const char *dev_name, const char __user *dir_name,
4210  		const char *type_page, unsigned long flags, void *data_page)
4211  {
4212  	struct path path;
4213  	int ret;
4214  
4215  	ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path);
4216  	if (ret)
4217  		return ret;
4218  	ret = path_mount(dev_name, &path, type_page, flags, data_page);
4219  	path_put(&path);
4220  	return ret;
4221  }
4222  
inc_mnt_namespaces(struct user_namespace * ns)4223  static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns)
4224  {
4225  	return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES);
4226  }
4227  
dec_mnt_namespaces(struct ucounts * ucounts)4228  static void dec_mnt_namespaces(struct ucounts *ucounts)
4229  {
4230  	dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES);
4231  }
4232  
free_mnt_ns(struct mnt_namespace * ns)4233  static void free_mnt_ns(struct mnt_namespace *ns)
4234  {
4235  	if (!is_anon_ns(ns))
4236  		ns_free_inum(&ns->ns);
4237  	dec_mnt_namespaces(ns->ucounts);
4238  	mnt_ns_tree_remove(ns);
4239  }
4240  
4241  /*
4242   * Assign a sequence number so we can detect when we attempt to bind
4243   * mount a reference to an older mount namespace into the current
4244   * mount namespace, preventing reference counting loops.  A 64-bit
4245   * counter incrementing even at 10GHz would take about 58 years to
4246   * wrap, and real creation rates are far lower, so we can ignore the possibility.
4247   */
4248  static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1);
4249  
alloc_mnt_ns(struct user_namespace * user_ns,bool anon)4250  static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon)
4251  {
4252  	struct mnt_namespace *new_ns;
4253  	struct ucounts *ucounts;
4254  	int ret;
4255  
4256  	ucounts = inc_mnt_namespaces(user_ns);
4257  	if (!ucounts)
4258  		return ERR_PTR(-ENOSPC);
4259  
4260  	new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT);
4261  	if (!new_ns) {
4262  		dec_mnt_namespaces(ucounts);
4263  		return ERR_PTR(-ENOMEM);
4264  	}
4265  	if (!anon) {
4266  		ret = ns_alloc_inum(&new_ns->ns);
4267  		if (ret) {
4268  			kfree(new_ns);
4269  			dec_mnt_namespaces(ucounts);
4270  			return ERR_PTR(ret);
4271  		}
4272  	}
4273  	new_ns->ns.ops = &mntns_operations;
4274  	if (!anon)
4275  		new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
4276  	refcount_set(&new_ns->ns.count, 1);
4277  	refcount_set(&new_ns->passive, 1);
4278  	new_ns->mounts = RB_ROOT;
4279  	INIT_LIST_HEAD(&new_ns->mnt_ns_list);
4280  	RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
4281  	init_waitqueue_head(&new_ns->poll);
4282  	new_ns->user_ns = get_user_ns(user_ns);
4283  	new_ns->ucounts = ucounts;
4284  	return new_ns;
4285  }
4286  
4287  __latent_entropy
copy_mnt_ns(unsigned long flags,struct mnt_namespace * ns,struct user_namespace * user_ns,struct fs_struct * new_fs)4288  struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
4289  		struct user_namespace *user_ns, struct fs_struct *new_fs)
4290  {
4291  	struct mnt_namespace *new_ns;
4292  	struct vfsmount *rootmnt = NULL, *pwdmnt = NULL;
4293  	struct mount *p, *q;
4294  	struct mount *old;
4295  	struct mount *new;
4296  	int copy_flags;
4297  
4298  	BUG_ON(!ns);
4299  
4300  	if (likely(!(flags & CLONE_NEWNS))) {
4301  		get_mnt_ns(ns);
4302  		return ns;
4303  	}
4304  
4305  	old = ns->root;
4306  
4307  	new_ns = alloc_mnt_ns(user_ns, false);
4308  	if (IS_ERR(new_ns))
4309  		return new_ns;
4310  
4311  	namespace_lock();
4312  	/* First pass: copy the tree topology */
4313  	copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE;
4314  	if (user_ns != ns->user_ns)
4315  		copy_flags |= CL_SHARED_TO_SLAVE;
4316  	new = copy_tree(old, old->mnt.mnt_root, copy_flags);
4317  	if (IS_ERR(new)) {
4318  		namespace_unlock();
4319  		ns_free_inum(&new_ns->ns);
4320  		dec_mnt_namespaces(new_ns->ucounts);
4321  		mnt_ns_release(new_ns);
4322  		return ERR_CAST(new);
4323  	}
4324  	if (user_ns != ns->user_ns) {
4325  		lock_mount_hash();
4326  		lock_mnt_tree(new);
4327  		unlock_mount_hash();
4328  	}
4329  	new_ns->root = new;
4330  
4331  	/*
4332  	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
4333  	 * as belonging to new namespace.  We have already acquired a private
4334  	 * fs_struct, so tsk->fs->lock is not needed.
4335  	 */
4336  	p = old;
4337  	q = new;
4338  	while (p) {
4339  		mnt_add_to_ns(new_ns, q);
4340  		new_ns->nr_mounts++;
4341  		if (new_fs) {
4342  			if (&p->mnt == new_fs->root.mnt) {
4343  				new_fs->root.mnt = mntget(&q->mnt);
4344  				rootmnt = &p->mnt;
4345  			}
4346  			if (&p->mnt == new_fs->pwd.mnt) {
4347  				new_fs->pwd.mnt = mntget(&q->mnt);
4348  				pwdmnt = &p->mnt;
4349  			}
4350  		}
4351  		p = next_mnt(p, old);
4352  		q = next_mnt(q, new);
4353  		if (!q)
4354  			break;
4355  		// an mntns binding we'd skipped?
4356  		while (p->mnt.mnt_root != q->mnt.mnt_root)
4357  			p = next_mnt(skip_mnt_tree(p), old);
4358  	}
4359  	namespace_unlock();
4360  
4361  	if (rootmnt)
4362  		mntput(rootmnt);
4363  	if (pwdmnt)
4364  		mntput(pwdmnt);
4365  
4366  	mnt_ns_tree_add(new_ns);
4367  	return new_ns;
4368  }
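
/*
 * Illustrative sketch (not part of this file): besides clone(CLONE_NEWNS),
 * the path above is also reached via unshare(2).  A process that wants a
 * private copy of its mount tree, with propagation back to the old
 * namespace cut off, could do:
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <sys/mount.h>
 *
 *	if (unshare(CLONE_NEWNS) < 0)
 *		perror("unshare");
 *	else
 *		mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL);
 */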
4369  
mount_subtree(struct vfsmount * m,const char * name)4370  struct dentry *mount_subtree(struct vfsmount *m, const char *name)
4371  {
4372  	struct mount *mnt = real_mount(m);
4373  	struct mnt_namespace *ns;
4374  	struct super_block *s;
4375  	struct path path;
4376  	int err;
4377  
4378  	ns = alloc_mnt_ns(&init_user_ns, true);
4379  	if (IS_ERR(ns)) {
4380  		mntput(m);
4381  		return ERR_CAST(ns);
4382  	}
4383  	ns->root = mnt;
4384  	ns->nr_mounts++;
4385  	mnt_add_to_ns(ns, mnt);
4386  
4387  	err = vfs_path_lookup(m->mnt_root, m,
4388  			name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
4389  
4390  	put_mnt_ns(ns);
4391  
4392  	if (err)
4393  		return ERR_PTR(err);
4394  
4395  	/* trade a vfsmount reference for active sb one */
4396  	s = path.mnt->mnt_sb;
4397  	atomic_inc(&s->s_active);
4398  	mntput(path.mnt);
4399  	/* lock the sucker */
4400  	down_write(&s->s_umount);
4401  	/* ... and return the root of (sub)tree on it */
4402  	return path.dentry;
4403  }
4404  EXPORT_SYMBOL(mount_subtree);
4405  
SYSCALL_DEFINE5(mount,char __user *,dev_name,char __user *,dir_name,char __user *,type,unsigned long,flags,void __user *,data)4406  SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
4407  		char __user *, type, unsigned long, flags, void __user *, data)
4408  {
4409  	int ret;
4410  	char *kernel_type;
4411  	char *kernel_dev;
4412  	void *options;
4413  
4414  	kernel_type = copy_mount_string(type);
4415  	ret = PTR_ERR(kernel_type);
4416  	if (IS_ERR(kernel_type))
4417  		goto out_type;
4418  
4419  	kernel_dev = copy_mount_string(dev_name);
4420  	ret = PTR_ERR(kernel_dev);
4421  	if (IS_ERR(kernel_dev))
4422  		goto out_dev;
4423  
4424  	options = copy_mount_options(data);
4425  	ret = PTR_ERR(options);
4426  	if (IS_ERR(options))
4427  		goto out_data;
4428  
4429  	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
4430  
4431  	kfree(options);
4432  out_data:
4433  	kfree(kernel_dev);
4434  out_dev:
4435  	kfree(kernel_type);
4436  out_type:
4437  	return ret;
4438  }
4439  
4440  #define FSMOUNT_VALID_FLAGS                                                    \
4441  	(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV |            \
4442  	 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME |       \
4443  	 MOUNT_ATTR_NOSYMFOLLOW)
4444  
4445  #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
4446  
4447  #define MOUNT_SETATTR_PROPAGATION_FLAGS \
4448  	(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
4449  
attr_flags_to_mnt_flags(u64 attr_flags)4450  static unsigned int attr_flags_to_mnt_flags(u64 attr_flags)
4451  {
4452  	unsigned int mnt_flags = 0;
4453  
4454  	if (attr_flags & MOUNT_ATTR_RDONLY)
4455  		mnt_flags |= MNT_READONLY;
4456  	if (attr_flags & MOUNT_ATTR_NOSUID)
4457  		mnt_flags |= MNT_NOSUID;
4458  	if (attr_flags & MOUNT_ATTR_NODEV)
4459  		mnt_flags |= MNT_NODEV;
4460  	if (attr_flags & MOUNT_ATTR_NOEXEC)
4461  		mnt_flags |= MNT_NOEXEC;
4462  	if (attr_flags & MOUNT_ATTR_NODIRATIME)
4463  		mnt_flags |= MNT_NODIRATIME;
4464  	if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW)
4465  		mnt_flags |= MNT_NOSYMFOLLOW;
4466  
4467  	return mnt_flags;
4468  }
4469  
4470  /*
4471   * Create a kernel mount representation for a new, prepared superblock
4472   * (specified by fs_fd) and attach to an open_tree-like file descriptor.
4473   */
SYSCALL_DEFINE3(fsmount,int,fs_fd,unsigned int,flags,unsigned int,attr_flags)4474  SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
4475  		unsigned int, attr_flags)
4476  {
4477  	struct mnt_namespace *ns;
4478  	struct fs_context *fc;
4479  	struct file *file;
4480  	struct path newmount;
4481  	struct mount *mnt;
4482  	unsigned int mnt_flags = 0;
4483  	long ret;
4484  
4485  	if (!may_mount())
4486  		return -EPERM;
4487  
4488  	if ((flags & ~(FSMOUNT_CLOEXEC)) != 0)
4489  		return -EINVAL;
4490  
4491  	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
4492  		return -EINVAL;
4493  
4494  	mnt_flags = attr_flags_to_mnt_flags(attr_flags);
4495  
4496  	switch (attr_flags & MOUNT_ATTR__ATIME) {
4497  	case MOUNT_ATTR_STRICTATIME:
4498  		break;
4499  	case MOUNT_ATTR_NOATIME:
4500  		mnt_flags |= MNT_NOATIME;
4501  		break;
4502  	case MOUNT_ATTR_RELATIME:
4503  		mnt_flags |= MNT_RELATIME;
4504  		break;
4505  	default:
4506  		return -EINVAL;
4507  	}
4508  
4509  	CLASS(fd, f)(fs_fd);
4510  	if (fd_empty(f))
4511  		return -EBADF;
4512  
4513  	if (fd_file(f)->f_op != &fscontext_fops)
4514  		return -EINVAL;
4515  
4516  	fc = fd_file(f)->private_data;
4517  
4518  	ret = mutex_lock_interruptible(&fc->uapi_mutex);
4519  	if (ret < 0)
4520  		return ret;
4521  
4522  	/* There must be a valid superblock or we can't mount it */
4523  	ret = -EINVAL;
4524  	if (!fc->root)
4525  		goto err_unlock;
4526  
4527  	ret = -EPERM;
4528  	if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
4529  		pr_warn("VFS: Mount too revealing\n");
4530  		goto err_unlock;
4531  	}
4532  
4533  	ret = -EBUSY;
4534  	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
4535  		goto err_unlock;
4536  
4537  	if (fc->sb_flags & SB_MANDLOCK)
4538  		warn_mandlock();
4539  
4540  	newmount.mnt = vfs_create_mount(fc);
4541  	if (IS_ERR(newmount.mnt)) {
4542  		ret = PTR_ERR(newmount.mnt);
4543  		goto err_unlock;
4544  	}
4545  	newmount.dentry = dget(fc->root);
4546  	newmount.mnt->mnt_flags = mnt_flags;
4547  
4548  	/* We've done the mount bit - now move the file context into more or
4549  	 * less the same state as if we'd done an fspick().  We don't want to
4550  	 * do any memory allocation or anything like that at this point as we
4551  	 * don't want to have to handle any errors incurred.
4552  	 */
4553  	vfs_clean_context(fc);
4554  
4555  	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
4556  	if (IS_ERR(ns)) {
4557  		ret = PTR_ERR(ns);
4558  		goto err_path;
4559  	}
4560  	mnt = real_mount(newmount.mnt);
4561  	ns->root = mnt;
4562  	ns->nr_mounts = 1;
4563  	mnt_add_to_ns(ns, mnt);
4564  	mntget(newmount.mnt);
4565  
4566  	/* Attach to an apparent O_PATH fd with a note that we need to unmount
4567  	 * it, not just simply put it.
4568  	 */
4569  	file = dentry_open(&newmount, O_PATH, fc->cred);
4570  	if (IS_ERR(file)) {
4571  		dissolve_on_fput(newmount.mnt);
4572  		ret = PTR_ERR(file);
4573  		goto err_path;
4574  	}
4575  	file->f_mode |= FMODE_NEED_UNMOUNT;
4576  
4577  	ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0);
4578  	if (ret >= 0)
4579  		fd_install(ret, file);
4580  	else
4581  		fput(file);
4582  
4583  err_path:
4584  	path_put(&newmount);
4585  err_unlock:
4586  	mutex_unlock(&fc->uapi_mutex);
4587  	return ret;
4588  }
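
/*
 * Illustrative sketch (not part of this file): fsmount() is the second half
 * of the new mount API.  A typical sequence creates a superblock with
 * fsopen()/fsconfig(), turns it into a detached mount with fsmount() and
 * attaches it with move_mount().  Raw syscall(2) is used in case libc does
 * not provide wrappers; /dev/vdb and /mnt are placeholders and error
 * checking is omitted:
 *
 *	#include <linux/mount.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <fcntl.h>
 *
 *	int fsfd = syscall(SYS_fsopen, "ext4", FSOPEN_CLOEXEC);
 *	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", "/dev/vdb", 0);
 *	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
 *	int mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_NODEV);
 *	syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/mnt", MOVE_MOUNT_F_EMPTY_PATH);
 */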
4589  
vfs_move_mount(struct path * from_path,struct path * to_path,enum mnt_tree_flags_t mflags)4590  static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
4591  				 enum mnt_tree_flags_t mflags)
4592  {
4593  	int ret;
4594  
4595  	ret = security_move_mount(from_path, to_path);
4596  	if (ret)
4597  		return ret;
4598  
4599  	if (mflags & MNT_TREE_PROPAGATION)
4600  		return do_set_group(from_path, to_path);
4601  
4602  	return do_move_mount(from_path, to_path, mflags);
4603  }
4604  
4605  /*
4606   * Move a mount from one place to another.  In combination with
4607   * fsopen()/fsmount() this is used to install a new mount and in combination
4608   * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
4609   * a mount subtree.
4610   *
4611   * Note the flags value is a combination of MOVE_MOUNT_* flags.
4612   */
SYSCALL_DEFINE5(move_mount,int,from_dfd,const char __user *,from_pathname,int,to_dfd,const char __user *,to_pathname,unsigned int,flags)4613  SYSCALL_DEFINE5(move_mount,
4614  		int, from_dfd, const char __user *, from_pathname,
4615  		int, to_dfd, const char __user *, to_pathname,
4616  		unsigned int, flags)
4617  {
4618  	struct path to_path __free(path_put) = {};
4619  	struct path from_path __free(path_put) = {};
4620  	struct filename *to_name __free(putname) = NULL;
4621  	struct filename *from_name __free(putname) = NULL;
4622  	unsigned int lflags, uflags;
4623  	enum mnt_tree_flags_t mflags = 0;
4624  	int ret = 0;
4625  
4626  	if (!may_mount())
4627  		return -EPERM;
4628  
4629  	if (flags & ~MOVE_MOUNT__MASK)
4630  		return -EINVAL;
4631  
4632  	if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
4633  	    (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
4634  		return -EINVAL;
4635  
4636  	if (flags & MOVE_MOUNT_SET_GROUP)	mflags |= MNT_TREE_PROPAGATION;
4637  	if (flags & MOVE_MOUNT_BENEATH)		mflags |= MNT_TREE_BENEATH;
4638  
4639  	lflags = 0;
4640  	if (flags & MOVE_MOUNT_F_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
4641  	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
4642  	uflags = 0;
4643  	if (flags & MOVE_MOUNT_F_EMPTY_PATH)	uflags = AT_EMPTY_PATH;
4644  	from_name = getname_maybe_null(from_pathname, uflags);
4645  	if (IS_ERR(from_name))
4646  		return PTR_ERR(from_name);
4647  
4648  	lflags = 0;
4649  	if (flags & MOVE_MOUNT_T_SYMLINKS)	lflags |= LOOKUP_FOLLOW;
4650  	if (flags & MOVE_MOUNT_T_AUTOMOUNTS)	lflags |= LOOKUP_AUTOMOUNT;
4651  	uflags = 0;
4652  	if (flags & MOVE_MOUNT_T_EMPTY_PATH)	uflags = AT_EMPTY_PATH;
4653  	to_name = getname_maybe_null(to_pathname, uflags);
4654  	if (IS_ERR(to_name))
4655  		return PTR_ERR(to_name);
4656  
4657  	if (!to_name && to_dfd >= 0) {
4658  		CLASS(fd_raw, f_to)(to_dfd);
4659  		if (fd_empty(f_to))
4660  			return -EBADF;
4661  
4662  		to_path = fd_file(f_to)->f_path;
4663  		path_get(&to_path);
4664  	} else {
4665  		ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
4666  		if (ret)
4667  			return ret;
4668  	}
4669  
4670  	if (!from_name && from_dfd >= 0) {
4671  		CLASS(fd_raw, f_from)(from_dfd);
4672  		if (fd_empty(f_from))
4673  			return -EBADF;
4674  
4675  		return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
4676  	}
4677  
4678  	ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
4679  	if (ret)
4680  		return ret;
4681  
4682  	return vfs_move_mount(&from_path, &to_path, mflags);
4683  }
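
/*
 * Illustrative sketch (not part of this file): combined with
 * open_tree(OPEN_TREE_CLONE), move_mount() attaches a detached copy of an
 * existing subtree, i.e. a recursive bind mount done with the new API.
 * /src and /dst are placeholders and error checking is omitted:
 *
 *	#include <linux/mount.h>
 *	#include <linux/fcntl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	int tfd = syscall(SYS_open_tree, AT_FDCWD, "/src",
 *			  OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
 *	syscall(SYS_move_mount, tfd, "", AT_FDCWD, "/dst",
 *		MOVE_MOUNT_F_EMPTY_PATH);
 */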
4684  
4685  /*
4686   * Return true if path is reachable from root
4687   *
4688   * namespace_sem or mount_lock is held
4689   */
is_path_reachable(struct mount * mnt,struct dentry * dentry,const struct path * root)4690  bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
4691  			 const struct path *root)
4692  {
4693  	while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) {
4694  		dentry = mnt->mnt_mountpoint;
4695  		mnt = mnt->mnt_parent;
4696  	}
4697  	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
4698  }
4699  
path_is_under(const struct path * path1,const struct path * path2)4700  bool path_is_under(const struct path *path1, const struct path *path2)
4701  {
4702  	bool res;
4703  	read_seqlock_excl(&mount_lock);
4704  	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
4705  	read_sequnlock_excl(&mount_lock);
4706  	return res;
4707  }
4708  EXPORT_SYMBOL(path_is_under);
4709  
4710  /*
4711   * pivot_root Semantics:
4712   * Moves the root file system of the current process to the directory put_old,
4713   * makes new_root as the new root file system of the current process, and sets
4714   * root/cwd of all processes which had them on the current root to new_root.
4715   *
4716   * Restrictions:
4717   * The new_root and put_old must be directories, and must not be on the
4718   * same file system as the current process root. The put_old must be
4719   * underneath new_root, i.e. adding a non-zero number of /.. to the string
4720   * pointed to by put_old must yield the same directory as new_root. No other
4721   * file system may be mounted on put_old. After all, new_root is a mountpoint.
4722   *
4723   * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem.
4724   * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives
4725   * in this situation.
4726   *
4727   * Notes:
4728   *  - we don't move root/cwd if they are not at the root (reason: if something
4729   *    cared enough to change them, it's probably wrong to force them elsewhere)
4730   *  - it's okay to pick a root that isn't the root of a file system, e.g.
4731   *    /nfs/my_root where /nfs is the mount point. It must be a mountpoint,
4732   *    though, so you may need to say mount --bind /nfs/my_root /nfs/my_root
4733   *    first.
4734   */
SYSCALL_DEFINE2(pivot_root,const char __user *,new_root,const char __user *,put_old)4735  SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
4736  		const char __user *, put_old)
4737  {
4738  	struct path new, old, root;
4739  	struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent;
4740  	struct mountpoint *old_mp, *root_mp;
4741  	int error;
4742  
4743  	if (!may_mount())
4744  		return -EPERM;
4745  
4746  	error = user_path_at(AT_FDCWD, new_root,
4747  			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new);
4748  	if (error)
4749  		goto out0;
4750  
4751  	error = user_path_at(AT_FDCWD, put_old,
4752  			     LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old);
4753  	if (error)
4754  		goto out1;
4755  
4756  	error = security_sb_pivotroot(&old, &new);
4757  	if (error)
4758  		goto out2;
4759  
4760  	get_fs_root(current->fs, &root);
4761  	old_mp = lock_mount(&old);
4762  	error = PTR_ERR(old_mp);
4763  	if (IS_ERR(old_mp))
4764  		goto out3;
4765  
4766  	error = -EINVAL;
4767  	new_mnt = real_mount(new.mnt);
4768  	root_mnt = real_mount(root.mnt);
4769  	old_mnt = real_mount(old.mnt);
4770  	ex_parent = new_mnt->mnt_parent;
4771  	root_parent = root_mnt->mnt_parent;
4772  	if (IS_MNT_SHARED(old_mnt) ||
4773  		IS_MNT_SHARED(ex_parent) ||
4774  		IS_MNT_SHARED(root_parent))
4775  		goto out4;
4776  	if (!check_mnt(root_mnt) || !check_mnt(new_mnt))
4777  		goto out4;
4778  	if (new_mnt->mnt.mnt_flags & MNT_LOCKED)
4779  		goto out4;
4780  	error = -ENOENT;
4781  	if (d_unlinked(new.dentry))
4782  		goto out4;
4783  	error = -EBUSY;
4784  	if (new_mnt == root_mnt || old_mnt == root_mnt)
4785  		goto out4; /* loop, on the same file system  */
4786  	error = -EINVAL;
4787  	if (!path_mounted(&root))
4788  		goto out4; /* not a mountpoint */
4789  	if (!mnt_has_parent(root_mnt))
4790  		goto out4; /* not attached */
4791  	if (!path_mounted(&new))
4792  		goto out4; /* not a mountpoint */
4793  	if (!mnt_has_parent(new_mnt))
4794  		goto out4; /* not attached */
4795  	/* make sure we can reach put_old from new_root */
4796  	if (!is_path_reachable(old_mnt, old.dentry, &new))
4797  		goto out4;
4798  	/* make certain new is below the root */
4799  	if (!is_path_reachable(new_mnt, new.dentry, &root))
4800  		goto out4;
4801  	lock_mount_hash();
4802  	umount_mnt(new_mnt);
4803  	root_mp = unhash_mnt(root_mnt);  /* we'll need its mountpoint */
4804  	if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
4805  		new_mnt->mnt.mnt_flags |= MNT_LOCKED;
4806  		root_mnt->mnt.mnt_flags &= ~MNT_LOCKED;
4807  	}
4808  	/* mount old root on put_old */
4809  	attach_mnt(root_mnt, old_mnt, old_mp, false);
4810  	/* mount new_root on / */
4811  	attach_mnt(new_mnt, root_parent, root_mp, false);
4812  	mnt_add_count(root_parent, -1);
4813  	touch_mnt_namespace(current->nsproxy->mnt_ns);
4814  	/* A moved mount should not expire automatically */
4815  	list_del_init(&new_mnt->mnt_expire);
4816  	put_mountpoint(root_mp);
4817  	unlock_mount_hash();
4818  	mnt_notify_add(root_mnt);
4819  	mnt_notify_add(new_mnt);
4820  	chroot_fs_refs(&root, &new);
4821  	error = 0;
4822  out4:
4823  	unlock_mount(old_mp);
4824  	if (!error)
4825  		mntput_no_expire(ex_parent);
4826  out3:
4827  	path_put(&root);
4828  out2:
4829  	path_put(&old);
4830  out1:
4831  	path_put(&new);
4832  out0:
4833  	return error;
4834  }
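
/*
 * Illustrative sketch (not part of this file): the sequence documented in
 * pivot_root(2) for container setup, run in a fresh mount namespace where
 * /newroot is already a mount point (e.g. a bind mount of the new root).
 * The old root ends up stacked on the new one and is then detached:
 *
 *	#include <sys/syscall.h>
 *	#include <sys/mount.h>
 *	#include <unistd.h>
 *
 *	chdir("/newroot");
 *	syscall(SYS_pivot_root, ".", ".");
 *	umount2(".", MNT_DETACH);
 *	chdir("/");
 */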
4835  
recalc_flags(struct mount_kattr * kattr,struct mount * mnt)4836  static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
4837  {
4838  	unsigned int flags = mnt->mnt.mnt_flags;
4839  
4840  	/*  flags to clear */
4841  	flags &= ~kattr->attr_clr;
4842  	/* flags to raise */
4843  	flags |= kattr->attr_set;
4844  
4845  	return flags;
4846  }
4847  
can_idmap_mount(const struct mount_kattr * kattr,struct mount * mnt)4848  static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4849  {
4850  	struct vfsmount *m = &mnt->mnt;
4851  	struct user_namespace *fs_userns = m->mnt_sb->s_user_ns;
4852  
4853  	if (!kattr->mnt_idmap)
4854  		return 0;
4855  
4856  	/*
4857  	 * Creating an idmapped mount with the filesystem-wide idmapping
4858  	 * doesn't make sense, so block that. We don't allow mushy semantics.
4859  	 */
4860  	if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
4861  		return -EINVAL;
4862  
4863  	/*
4864  	 * We only allow a mount to change its idmapping if it has
4865  	 * never been accessible to userspace.
4866  	 */
4867  	if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
4868  		return -EPERM;
4869  
4870  	/* The underlying filesystem doesn't support idmapped mounts yet. */
4871  	if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
4872  		return -EINVAL;
4873  
4874  	/* The filesystem has turned off idmapped mounts. */
4875  	if (m->mnt_sb->s_iflags & SB_I_NOIDMAP)
4876  		return -EINVAL;
4877  
4878  	/* We're not controlling the superblock. */
4879  	if (!ns_capable(fs_userns, CAP_SYS_ADMIN))
4880  		return -EPERM;
4881  
4882  	/* Mount has already been visible in the filesystem hierarchy. */
4883  	if (!is_anon_ns(mnt->mnt_ns))
4884  		return -EINVAL;
4885  
4886  	return 0;
4887  }
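
/*
 * Illustrative sketch (not part of this file): because of the checks above,
 * an idmapped mount is set up on a detached tree and only attached
 * afterwards.  userns_fd is a placeholder fd for a user namespace the
 * caller controls; /src and /dst are placeholders and error checking is
 * omitted:
 *
 *	#include <linux/mount.h>
 *	#include <linux/fcntl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *	#include <string.h>
 *
 *	struct mount_attr attr;
 *	memset(&attr, 0, sizeof(attr));
 *	attr.attr_set = MOUNT_ATTR_IDMAP;
 *	attr.userns_fd = userns_fd;
 *
 *	int tfd = syscall(SYS_open_tree, AT_FDCWD, "/src",
 *			  OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
 *	syscall(SYS_mount_setattr, tfd, "", AT_EMPTY_PATH, &attr, sizeof(attr));
 *	syscall(SYS_move_mount, tfd, "", AT_FDCWD, "/dst",
 *		MOVE_MOUNT_F_EMPTY_PATH);
 */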
4888  
4889  /**
4890   * mnt_allow_writers() - check whether the attribute change allows writers
4891   * @kattr: the new mount attributes
4892   * @mnt: the mount to which @kattr will be applied
4893   *
4894   * Check whether the new mount attributes in @kattr allow concurrent writers.
4895   *
4896   * Return: true if concurrent writers can be allowed, false if they need to be held
4897   */
mnt_allow_writers(const struct mount_kattr * kattr,const struct mount * mnt)4898  static inline bool mnt_allow_writers(const struct mount_kattr *kattr,
4899  				     const struct mount *mnt)
4900  {
4901  	return (!(kattr->attr_set & MNT_READONLY) ||
4902  		(mnt->mnt.mnt_flags & MNT_READONLY)) &&
4903  	       !kattr->mnt_idmap;
4904  }
4905  
mount_setattr_prepare(struct mount_kattr * kattr,struct mount * mnt)4906  static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
4907  {
4908  	struct mount *m;
4909  	int err;
4910  
4911  	for (m = mnt; m; m = next_mnt(m, mnt)) {
4912  		if (!can_change_locked_flags(m, recalc_flags(kattr, m))) {
4913  			err = -EPERM;
4914  			break;
4915  		}
4916  
4917  		err = can_idmap_mount(kattr, m);
4918  		if (err)
4919  			break;
4920  
4921  		if (!mnt_allow_writers(kattr, m)) {
4922  			err = mnt_hold_writers(m);
4923  			if (err)
4924  				break;
4925  		}
4926  
4927  		if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4928  			return 0;
4929  	}
4930  
4931  	if (err) {
4932  		struct mount *p;
4933  
4934  		/*
4935  		 * If we had to call mnt_hold_writers(), MNT_WRITE_HOLD will
4936  		 * be set on the affected mounts. The loop unsets MNT_WRITE_HOLD for all
4937  		 * mounts and needs to take care to include the first mount.
4938  		 */
4939  		for (p = mnt; p; p = next_mnt(p, mnt)) {
4940  			/* If we had to hold writers unblock them. */
4941  			if (p->mnt.mnt_flags & MNT_WRITE_HOLD)
4942  				mnt_unhold_writers(p);
4943  
4944  			/*
4945  			 * We're done once the first mount we changed got
4946  			 * MNT_WRITE_HOLD unset.
4947  			 */
4948  			if (p == m)
4949  				break;
4950  		}
4951  	}
4952  	return err;
4953  }
4954  
do_idmap_mount(const struct mount_kattr * kattr,struct mount * mnt)4955  static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
4956  {
4957  	struct mnt_idmap *old_idmap;
4958  
4959  	if (!kattr->mnt_idmap)
4960  		return;
4961  
4962  	old_idmap = mnt_idmap(&mnt->mnt);
4963  
4964  	/* Pairs with smp_load_acquire() in mnt_idmap(). */
4965  	smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
4966  	mnt_idmap_put(old_idmap);
4967  }
4968  
mount_setattr_commit(struct mount_kattr * kattr,struct mount * mnt)4969  static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
4970  {
4971  	struct mount *m;
4972  
4973  	for (m = mnt; m; m = next_mnt(m, mnt)) {
4974  		unsigned int flags;
4975  
4976  		do_idmap_mount(kattr, m);
4977  		flags = recalc_flags(kattr, m);
4978  		WRITE_ONCE(m->mnt.mnt_flags, flags);
4979  
4980  		/* If we had to hold writers unblock them. */
4981  		if (m->mnt.mnt_flags & MNT_WRITE_HOLD)
4982  			mnt_unhold_writers(m);
4983  
4984  		if (kattr->propagation)
4985  			change_mnt_propagation(m, kattr->propagation);
4986  		if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
4987  			break;
4988  	}
4989  	touch_mnt_namespace(mnt->mnt_ns);
4990  }
4991  
do_mount_setattr(struct path * path,struct mount_kattr * kattr)4992  static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
4993  {
4994  	struct mount *mnt = real_mount(path->mnt);
4995  	int err = 0;
4996  
4997  	if (!path_mounted(path))
4998  		return -EINVAL;
4999  
5000  	if (kattr->mnt_userns) {
5001  		struct mnt_idmap *mnt_idmap;
5002  
5003  		mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
5004  		if (IS_ERR(mnt_idmap))
5005  			return PTR_ERR(mnt_idmap);
5006  		kattr->mnt_idmap = mnt_idmap;
5007  	}
5008  
5009  	if (kattr->propagation) {
5010  		/*
5011  		 * Only take namespace_lock() if we're actually changing
5012  		 * propagation.
5013  		 */
5014  		namespace_lock();
5015  		if (kattr->propagation == MS_SHARED) {
5016  			err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
5017  			if (err) {
5018  				namespace_unlock();
5019  				return err;
5020  			}
5021  		}
5022  	}
5023  
5024  	err = -EINVAL;
5025  	lock_mount_hash();
5026  
5027  	/* Ensure that this isn't anything purely vfs internal. */
5028  	if (!is_mounted(&mnt->mnt))
5029  		goto out;
5030  
5031  	/*
5032  	 * If this is an attached mount make sure it's located in the caller's
5033  	 * mount namespace. If it's not, don't let the caller interact with it.
5034  	 *
5035  	 * If this mount doesn't have a parent it's most often simply a
5036  	 * detached mount with an anonymous mount namespace. IOW, something
5037  	 * that's simply not attached yet. But there are apparently also users
5038  	 * that do change mount properties on the rootfs itself. That obviously
5039  	 * neither has a parent nor is it a detached mount so we cannot
5040  	 * unconditionally check for detached mounts.
5041  	 */
5042  	if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
5043  		goto out;
5044  
5045  	/*
5046  	 * First, we get the mount tree in a shape where we can change mount
5047  	 * properties without failure. If we succeeded to do so we commit all
5048  	 * changes and if we failed we clean up.
5049  	 */
5050  	err = mount_setattr_prepare(kattr, mnt);
5051  	if (!err)
5052  		mount_setattr_commit(kattr, mnt);
5053  
5054  out:
5055  	unlock_mount_hash();
5056  
5057  	if (kattr->propagation) {
5058  		if (err)
5059  			cleanup_group_ids(mnt, NULL);
5060  		namespace_unlock();
5061  	}
5062  
5063  	return err;
5064  }
5065  
build_mount_idmapped(const struct mount_attr * attr,size_t usize,struct mount_kattr * kattr)5066  static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
5067  				struct mount_kattr *kattr)
5068  {
5069  	struct ns_common *ns;
5070  	struct user_namespace *mnt_userns;
5071  
5072  	if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
5073  		return 0;
5074  
5075  	if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
5076  		/*
5077  		 * We can only remove an idmapping if it's never been
5078  		 * exposed to userspace.
5079  		 */
5080  		if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
5081  			return -EINVAL;
5082  
5083  		/*
5084  		 * Removal of idmappings is equivalent to setting
5085  		 * nop_mnt_idmap.
5086  		 */
5087  		if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
5088  			kattr->mnt_idmap = &nop_mnt_idmap;
5089  			return 0;
5090  		}
5091  	}
5092  
5093  	if (attr->userns_fd > INT_MAX)
5094  		return -EINVAL;
5095  
5096  	CLASS(fd, f)(attr->userns_fd);
5097  	if (fd_empty(f))
5098  		return -EBADF;
5099  
5100  	if (!proc_ns_file(fd_file(f)))
5101  		return -EINVAL;
5102  
5103  	ns = get_proc_ns(file_inode(fd_file(f)));
5104  	if (ns->ops->type != CLONE_NEWUSER)
5105  		return -EINVAL;
5106  
5107  	/*
5108  	 * The initial idmapping cannot be used to create an idmapped
5109  	 * mount. We use the initial idmapping as an indicator of a mount
5110  	 * that is not idmapped. It can simply be passed into helpers that
5111  	 * are aware of idmapped mounts as a convenient shortcut. A user
5112  	 * can just create a dedicated identity mapping to achieve the same
5113  	 * result.
5114  	 */
5115  	mnt_userns = container_of(ns, struct user_namespace, ns);
5116  	if (mnt_userns == &init_user_ns)
5117  		return -EPERM;
5118  
5119  	/* We're not controlling the target namespace. */
5120  	if (!ns_capable(mnt_userns, CAP_SYS_ADMIN))
5121  		return -EPERM;
5122  
5123  	kattr->mnt_userns = get_user_ns(mnt_userns);
5124  	return 0;
5125  }
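
/*
 * Illustrative userspace sketch, not part of this file: one way a caller can
 * hand a user namespace file descriptor to MOUNT_ATTR_IDMAP, which the helper
 * above resolves via get_proc_ns().  Obtaining userns_fd from
 * /proc/<pid>/ns/user of a helper process that already wrote its uid_map and
 * gid_map is an assumption for illustration, as is the raw syscall() wrapper.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/mount.h>
#include <unistd.h>

static int idmap_detached_tree(int tree_fd, pid_t helper_pid)
{
	char path[64];
	int userns_fd, ret;
	struct mount_attr attr = {
		.attr_set = MOUNT_ATTR_IDMAP,
	};

	/* User namespace of a helper that carries the desired idmapping. */
	snprintf(path, sizeof(path), "/proc/%d/ns/user", (int)helper_pid);
	userns_fd = open(path, O_RDONLY | O_CLOEXEC);
	if (userns_fd < 0)
		return -1;

	attr.userns_fd = userns_fd;
	/* tree_fd is a detached mount from open_tree(..., OPEN_TREE_CLONE). */
	ret = syscall(SYS_mount_setattr, tree_fd, "", AT_EMPTY_PATH,
		      &attr, sizeof(attr));
	close(userns_fd);
	return ret;
}
#endif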
5126  
5127  static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
5128  			     struct mount_kattr *kattr)
5129  {
5130  	if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
5131  		return -EINVAL;
5132  	if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
5133  		return -EINVAL;
5134  	kattr->propagation = attr->propagation;
5135  
5136  	if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
5137  		return -EINVAL;
5138  
5139  	kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
5140  	kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
5141  
5142  	/*
5143  	 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap,
5144  	 * users wanting to transition to a different atime setting cannot
5145  	 * simply specify the atime setting in @attr_set, but must also
5146  	 * specify MOUNT_ATTR__ATIME in the @attr_clr field.
5147  	 * So ensure that MOUNT_ATTR__ATIME can't be partially set in
5148  	 * @attr_clr and that @attr_set can't have any atime bits set if
5149  	 * MOUNT_ATTR__ATIME isn't set in @attr_clr.
5150  	 */
5151  	if (attr->attr_clr & MOUNT_ATTR__ATIME) {
5152  		if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME)
5153  			return -EINVAL;
5154  
5155  		/*
5156  		 * Clear all previous time settings as they are mutually
5157  		 * exclusive.
5158  		 */
5159  		kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME;
5160  		switch (attr->attr_set & MOUNT_ATTR__ATIME) {
5161  		case MOUNT_ATTR_RELATIME:
5162  			kattr->attr_set |= MNT_RELATIME;
5163  			break;
5164  		case MOUNT_ATTR_NOATIME:
5165  			kattr->attr_set |= MNT_NOATIME;
5166  			break;
5167  		case MOUNT_ATTR_STRICTATIME:
5168  			break;
5169  		default:
5170  			return -EINVAL;
5171  		}
5172  	} else {
5173  		if (attr->attr_set & MOUNT_ATTR__ATIME)
5174  			return -EINVAL;
5175  	}
5176  
5177  	return build_mount_idmapped(attr, usize, kattr);
5178  }
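
/*
 * Illustrative userspace sketch, not part of this file: as the comment above
 * explains, the atime modes are an enum, so changing them requires clearing
 * MOUNT_ATTR__ATIME in the same request that sets the new mode.  The raw
 * syscall() wrapper and the AT_FDCWD-relative path are assumptions for
 * illustration.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <linux/mount.h>
#include <unistd.h>

static int make_noatime(const char *path)
{
	struct mount_attr attr = {
		.attr_clr = MOUNT_ATTR__ATIME,	/* mandatory: drop the old mode */
		.attr_set = MOUNT_ATTR_NOATIME,	/* then pick the new one */
	};

	return syscall(SYS_mount_setattr, AT_FDCWD, path, 0,
		       &attr, sizeof(attr));
}
#endif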
5179  
5180  static void finish_mount_kattr(struct mount_kattr *kattr)
5181  {
5182  	if (kattr->mnt_userns) {
5183  		put_user_ns(kattr->mnt_userns);
5184  		kattr->mnt_userns = NULL;
5185  	}
5186  
5187  	if (kattr->mnt_idmap)
5188  		mnt_idmap_put(kattr->mnt_idmap);
5189  }
5190  
5191  static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
5192  			      struct mount_kattr *kattr)
5193  {
5194  	int ret;
5195  	struct mount_attr attr;
5196  
5197  	BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
5198  
5199  	if (unlikely(usize > PAGE_SIZE))
5200  		return -E2BIG;
5201  	if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
5202  		return -EINVAL;
5203  
5204  	if (!may_mount())
5205  		return -EPERM;
5206  
5207  	ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
5208  	if (ret)
5209  		return ret;
5210  
5211  	/* Don't bother walking through the mounts if this is a nop. */
5212  	if (attr.attr_set == 0 &&
5213  	    attr.attr_clr == 0 &&
5214  	    attr.propagation == 0)
5215  		return 0;
5216  
5217  	return build_mount_kattr(&attr, usize, kattr);
5218  }
5219  
5220  SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
5221  		unsigned int, flags, struct mount_attr __user *, uattr,
5222  		size_t, usize)
5223  {
5224  	int err;
5225  	struct path target;
5226  	struct mount_kattr kattr;
5227  	unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
5228  
5229  	if (flags & ~(AT_EMPTY_PATH |
5230  		      AT_RECURSIVE |
5231  		      AT_SYMLINK_NOFOLLOW |
5232  		      AT_NO_AUTOMOUNT))
5233  		return -EINVAL;
5234  
5235  	if (flags & AT_NO_AUTOMOUNT)
5236  		lookup_flags &= ~LOOKUP_AUTOMOUNT;
5237  	if (flags & AT_SYMLINK_NOFOLLOW)
5238  		lookup_flags &= ~LOOKUP_FOLLOW;
5239  	if (flags & AT_EMPTY_PATH)
5240  		lookup_flags |= LOOKUP_EMPTY;
5241  
5242  	kattr = (struct mount_kattr) {
5243  		.lookup_flags	= lookup_flags,
5244  	};
5245  
5246  	if (flags & AT_RECURSIVE)
5247  		kattr.kflags |= MOUNT_KATTR_RECURSE;
5248  
5249  	err = copy_mount_setattr(uattr, usize, &kattr);
5250  	if (err)
5251  		return err;
5252  
5253  	err = user_path_at(dfd, path, kattr.lookup_flags, &target);
5254  	if (!err) {
5255  		err = do_mount_setattr(&target, &kattr);
5256  		path_put(&target);
5257  	}
5258  	finish_mount_kattr(&kattr);
5259  	return err;
5260  }
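
/*
 * Illustrative userspace sketch, not part of this file: AT_RECURSIVE, handled
 * above, turns a single mount_setattr(2) call into an attribute change over a
 * whole subtree.  The mount point and the raw syscall() wrapper are
 * assumptions for illustration; older userspace headers may provide
 * AT_RECURSIVE only via <linux/fcntl.h>, hence the fallback define.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <linux/mount.h>
#include <unistd.h>

#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000
#endif

int main(void)
{
	struct mount_attr attr = {
		.attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID,
	};

	/* Make /mnt/data and every mount below it read-only and nosuid. */
	return syscall(SYS_mount_setattr, AT_FDCWD, "/mnt/data",
		       AT_RECURSIVE, &attr, sizeof(attr));
}
#endif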
5261  
5262  SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
5263  		unsigned, flags, struct mount_attr __user *, uattr,
5264  		size_t, usize)
5265  {
5266  	struct file __free(fput) *file = NULL;
5267  	int fd;
5268  
5269  	if (!uattr && usize)
5270  		return -EINVAL;
5271  
5272  	file = vfs_open_tree(dfd, filename, flags);
5273  	if (IS_ERR(file))
5274  		return PTR_ERR(file);
5275  
5276  	if (uattr) {
5277  		int ret;
5278  		struct mount_kattr kattr = {};
5279  
5280  		kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
5281  		if (flags & AT_RECURSIVE)
5282  			kattr.kflags |= MOUNT_KATTR_RECURSE;
5283  
5284  		ret = copy_mount_setattr(uattr, usize, &kattr);
5285  		if (ret)
5286  			return ret;
5287  
5288  		ret = do_mount_setattr(&file->f_path, &kattr);
5289  		if (ret)
5290  			return ret;
5291  
5292  		finish_mount_kattr(&kattr);
5293  	}
5294  
5295  	fd = get_unused_fd_flags(flags & O_CLOEXEC);
5296  	if (fd < 0)
5297  		return fd;
5298  
5299  	fd_install(fd, no_free_ptr(file));
5300  	return fd;
5301  }
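
/*
 * Illustrative userspace sketch, not part of this file: open_tree_attr(2)
 * combines open_tree(2) with a mount_setattr(2)-style change, so a detached
 * copy of a subtree can have its attributes (including an idmapping) applied
 * before it is ever attached anywhere.  The SYS_open_tree_attr macro and the
 * userns_fd setup are assumptions for illustration.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <linux/mount.h>
#include <unistd.h>

#ifndef AT_RECURSIVE
#define AT_RECURSIVE 0x8000
#endif

static int clone_idmapped_tree(const char *path, int userns_fd)
{
	struct mount_attr attr = {
		.attr_set  = MOUNT_ATTR_IDMAP,
		.userns_fd = userns_fd,
	};

	/* Returns an fd for a detached, recursive, idmapped copy of @path. */
	return syscall(SYS_open_tree_attr, AT_FDCWD, path,
		       OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE,
		       &attr, sizeof(attr));
}
#endif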
5302  
5303  int show_path(struct seq_file *m, struct dentry *root)
5304  {
5305  	if (root->d_sb->s_op->show_path)
5306  		return root->d_sb->s_op->show_path(m, root);
5307  
5308  	seq_dentry(m, root, " \t\n\\");
5309  	return 0;
5310  }
5311  
5312  static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
5313  {
5314  	struct mount *mnt = mnt_find_id_at(ns, id);
5315  
5316  	if (!mnt || mnt->mnt_id_unique != id)
5317  		return NULL;
5318  
5319  	return &mnt->mnt;
5320  }
5321  
5322  struct kstatmount {
5323  	struct statmount __user *buf;
5324  	size_t bufsize;
5325  	struct vfsmount *mnt;
5326  	struct mnt_idmap *idmap;
5327  	u64 mask;
5328  	struct path root;
5329  	struct statmount sm;
5330  	struct seq_file seq;
5331  };
5332  
5333  static u64 mnt_to_attr_flags(struct vfsmount *mnt)
5334  {
5335  	unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
5336  	u64 attr_flags = 0;
5337  
5338  	if (mnt_flags & MNT_READONLY)
5339  		attr_flags |= MOUNT_ATTR_RDONLY;
5340  	if (mnt_flags & MNT_NOSUID)
5341  		attr_flags |= MOUNT_ATTR_NOSUID;
5342  	if (mnt_flags & MNT_NODEV)
5343  		attr_flags |= MOUNT_ATTR_NODEV;
5344  	if (mnt_flags & MNT_NOEXEC)
5345  		attr_flags |= MOUNT_ATTR_NOEXEC;
5346  	if (mnt_flags & MNT_NODIRATIME)
5347  		attr_flags |= MOUNT_ATTR_NODIRATIME;
5348  	if (mnt_flags & MNT_NOSYMFOLLOW)
5349  		attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
5350  
5351  	if (mnt_flags & MNT_NOATIME)
5352  		attr_flags |= MOUNT_ATTR_NOATIME;
5353  	else if (mnt_flags & MNT_RELATIME)
5354  		attr_flags |= MOUNT_ATTR_RELATIME;
5355  	else
5356  		attr_flags |= MOUNT_ATTR_STRICTATIME;
5357  
5358  	if (is_idmapped_mnt(mnt))
5359  		attr_flags |= MOUNT_ATTR_IDMAP;
5360  
5361  	return attr_flags;
5362  }
5363  
5364  static u64 mnt_to_propagation_flags(struct mount *m)
5365  {
5366  	u64 propagation = 0;
5367  
5368  	if (IS_MNT_SHARED(m))
5369  		propagation |= MS_SHARED;
5370  	if (IS_MNT_SLAVE(m))
5371  		propagation |= MS_SLAVE;
5372  	if (IS_MNT_UNBINDABLE(m))
5373  		propagation |= MS_UNBINDABLE;
5374  	if (!propagation)
5375  		propagation |= MS_PRIVATE;
5376  
5377  	return propagation;
5378  }
5379  
5380  static void statmount_sb_basic(struct kstatmount *s)
5381  {
5382  	struct super_block *sb = s->mnt->mnt_sb;
5383  
5384  	s->sm.mask |= STATMOUNT_SB_BASIC;
5385  	s->sm.sb_dev_major = MAJOR(sb->s_dev);
5386  	s->sm.sb_dev_minor = MINOR(sb->s_dev);
5387  	s->sm.sb_magic = sb->s_magic;
5388  	s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
5389  }
5390  
5391  static void statmount_mnt_basic(struct kstatmount *s)
5392  {
5393  	struct mount *m = real_mount(s->mnt);
5394  
5395  	s->sm.mask |= STATMOUNT_MNT_BASIC;
5396  	s->sm.mnt_id = m->mnt_id_unique;
5397  	s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
5398  	s->sm.mnt_id_old = m->mnt_id;
5399  	s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
5400  	s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
5401  	s->sm.mnt_propagation = mnt_to_propagation_flags(m);
5402  	s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
5403  	s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
5404  }
5405  
5406  static void statmount_propagate_from(struct kstatmount *s)
5407  {
5408  	struct mount *m = real_mount(s->mnt);
5409  
5410  	s->sm.mask |= STATMOUNT_PROPAGATE_FROM;
5411  	if (IS_MNT_SLAVE(m))
5412  		s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
5413  }
5414  
5415  static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
5416  {
5417  	int ret;
5418  	size_t start = seq->count;
5419  
5420  	ret = show_path(seq, s->mnt->mnt_root);
5421  	if (ret)
5422  		return ret;
5423  
5424  	if (unlikely(seq_has_overflowed(seq)))
5425  		return -EAGAIN;
5426  
5427  	/*
5428  	 * Unescape the result. It would be better if the supplied string were
5429  	 * not escaped in the first place, but that's a pretty invasive change.
5430  	 */
5431  	seq->buf[seq->count] = '\0';
5432  	seq->count = start;
5433  	seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
5434  	return 0;
5435  }
5436  
5437  static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
5438  {
5439  	struct vfsmount *mnt = s->mnt;
5440  	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
5441  	int err;
5442  
5443  	err = seq_path_root(seq, &mnt_path, &s->root, "");
5444  	return err == SEQ_SKIP ? 0 : err;
5445  }
5446  
5447  static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
5448  {
5449  	struct super_block *sb = s->mnt->mnt_sb;
5450  
5451  	seq_puts(seq, sb->s_type->name);
5452  	return 0;
5453  }
5454  
5455  static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
5456  {
5457  	struct super_block *sb = s->mnt->mnt_sb;
5458  
5459  	if (sb->s_subtype)
5460  		seq_puts(seq, sb->s_subtype);
5461  }
5462  
5463  static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
5464  {
5465  	struct super_block *sb = s->mnt->mnt_sb;
5466  	struct mount *r = real_mount(s->mnt);
5467  
5468  	if (sb->s_op->show_devname) {
5469  		size_t start = seq->count;
5470  		int ret;
5471  
5472  		ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
5473  		if (ret)
5474  			return ret;
5475  
5476  		if (unlikely(seq_has_overflowed(seq)))
5477  			return -EAGAIN;
5478  
5479  		/* Unescape the result */
5480  		seq->buf[seq->count] = '\0';
5481  		seq->count = start;
5482  		seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
5483  	} else if (r->mnt_devname) {
5484  		seq_puts(seq, r->mnt_devname);
5485  	}
5486  	return 0;
5487  }
5488  
5489  static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
5490  {
5491  	s->sm.mask |= STATMOUNT_MNT_NS_ID;
5492  	s->sm.mnt_ns_id = ns->seq;
5493  }
5494  
5495  static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
5496  {
5497  	struct vfsmount *mnt = s->mnt;
5498  	struct super_block *sb = mnt->mnt_sb;
5499  	size_t start = seq->count;
5500  	int err;
5501  
5502  	err = security_sb_show_options(seq, sb);
5503  	if (err)
5504  		return err;
5505  
5506  	if (sb->s_op->show_options) {
5507  		err = sb->s_op->show_options(seq, mnt->mnt_root);
5508  		if (err)
5509  			return err;
5510  	}
5511  
5512  	if (unlikely(seq_has_overflowed(seq)))
5513  		return -EAGAIN;
5514  
5515  	if (seq->count == start)
5516  		return 0;
5517  
5518  	/* skip leading comma */
5519  	memmove(seq->buf + start, seq->buf + start + 1,
5520  		seq->count - start - 1);
5521  	seq->count--;
5522  
5523  	return 0;
5524  }
5525  
5526  static inline int statmount_opt_process(struct seq_file *seq, size_t start)
5527  {
5528  	char *buf_end, *opt_end, *src, *dst;
5529  	int count = 0;
5530  
5531  	if (unlikely(seq_has_overflowed(seq)))
5532  		return -EAGAIN;
5533  
5534  	buf_end = seq->buf + seq->count;
5535  	dst = seq->buf + start;
5536  	src = dst + 1;	/* skip initial comma */
5537  
5538  	if (src >= buf_end) {
5539  		seq->count = start;
5540  		return 0;
5541  	}
5542  
5543  	*buf_end = '\0';
5544  	for (; src < buf_end; src = opt_end + 1) {
5545  		opt_end = strchrnul(src, ',');
5546  		*opt_end = '\0';
5547  		dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1;
5548  		if (WARN_ON_ONCE(++count == INT_MAX))
5549  			return -EOVERFLOW;
5550  	}
5551  	seq->count = dst - 1 - seq->buf;
5552  	return count;
5553  }
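
/*
 * A worked example of the helper above, with an illustrative option string:
 * if show_options() emitted ",ro,relatime,size=10240k" into the seq buffer,
 * the in-place rewrite leaves the NUL-separated strings "ro", "relatime" and
 * "size=10240k" behind (octal escapes unescaped) and returns 3, which the
 * callers below store in opt_num or opt_sec_num.
 */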
5554  
5555  static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
5556  {
5557  	struct vfsmount *mnt = s->mnt;
5558  	struct super_block *sb = mnt->mnt_sb;
5559  	size_t start = seq->count;
5560  	int err;
5561  
5562  	if (!sb->s_op->show_options)
5563  		return 0;
5564  
5565  	err = sb->s_op->show_options(seq, mnt->mnt_root);
5566  	if (err)
5567  		return err;
5568  
5569  	err = statmount_opt_process(seq, start);
5570  	if (err < 0)
5571  		return err;
5572  
5573  	s->sm.opt_num = err;
5574  	return 0;
5575  }
5576  
5577  static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
5578  {
5579  	struct vfsmount *mnt = s->mnt;
5580  	struct super_block *sb = mnt->mnt_sb;
5581  	size_t start = seq->count;
5582  	int err;
5583  
5584  	err = security_sb_show_options(seq, sb);
5585  	if (err)
5586  		return err;
5587  
5588  	err = statmount_opt_process(seq, start);
5589  	if (err < 0)
5590  		return err;
5591  
5592  	s->sm.opt_sec_num = err;
5593  	return 0;
5594  }
5595  
5596  static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
5597  {
5598  	int ret;
5599  
5600  	ret = statmount_mnt_idmap(s->idmap, seq, true);
5601  	if (ret < 0)
5602  		return ret;
5603  
5604  	s->sm.mnt_uidmap_num = ret;
5605  	/*
5606  	 * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
5607  	 * mappings. This allows userspace to distinguish between a
5608  	 * non-idmapped mount and an idmapped mount where none of the
5609  	 * individual mappings are valid in the caller's idmapping.
5610  	 */
5611  	if (is_valid_mnt_idmap(s->idmap))
5612  		s->sm.mask |= STATMOUNT_MNT_UIDMAP;
5613  	return 0;
5614  }
5615  
5616  static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
5617  {
5618  	int ret;
5619  
5620  	ret = statmount_mnt_idmap(s->idmap, seq, false);
5621  	if (ret < 0)
5622  		return ret;
5623  
5624  	s->sm.mnt_gidmap_num = ret;
5625  	/*
5626  	 * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
5627  	 * mappings. This allows userspace to distinguish between a
5628  	 * non-idmapped mount and an idmapped mount where none of the
5629  	 * individual mappings are valid in the caller's idmapping.
5630  	 */
5631  	if (is_valid_mnt_idmap(s->idmap))
5632  		s->sm.mask |= STATMOUNT_MNT_GIDMAP;
5633  	return 0;
5634  }
5635  
5636  static int statmount_string(struct kstatmount *s, u64 flag)
5637  {
5638  	int ret = 0;
5639  	size_t kbufsize;
5640  	struct seq_file *seq = &s->seq;
5641  	struct statmount *sm = &s->sm;
5642  	u32 start, *offp;
5643  
5644  	/* Reserve an empty string at the beginning for any unset offsets */
5645  	if (!seq->count)
5646  		seq_putc(seq, 0);
5647  
5648  	start = seq->count;
5649  
5650  	switch (flag) {
5651  	case STATMOUNT_FS_TYPE:
5652  		offp = &sm->fs_type;
5653  		ret = statmount_fs_type(s, seq);
5654  		break;
5655  	case STATMOUNT_MNT_ROOT:
5656  		offp = &sm->mnt_root;
5657  		ret = statmount_mnt_root(s, seq);
5658  		break;
5659  	case STATMOUNT_MNT_POINT:
5660  		offp = &sm->mnt_point;
5661  		ret = statmount_mnt_point(s, seq);
5662  		break;
5663  	case STATMOUNT_MNT_OPTS:
5664  		offp = &sm->mnt_opts;
5665  		ret = statmount_mnt_opts(s, seq);
5666  		break;
5667  	case STATMOUNT_OPT_ARRAY:
5668  		offp = &sm->opt_array;
5669  		ret = statmount_opt_array(s, seq);
5670  		break;
5671  	case STATMOUNT_OPT_SEC_ARRAY:
5672  		offp = &sm->opt_sec_array;
5673  		ret = statmount_opt_sec_array(s, seq);
5674  		break;
5675  	case STATMOUNT_FS_SUBTYPE:
5676  		offp = &sm->fs_subtype;
5677  		statmount_fs_subtype(s, seq);
5678  		break;
5679  	case STATMOUNT_SB_SOURCE:
5680  		offp = &sm->sb_source;
5681  		ret = statmount_sb_source(s, seq);
5682  		break;
5683  	case STATMOUNT_MNT_UIDMAP:
5684  		sm->mnt_uidmap = start;
5685  		ret = statmount_mnt_uidmap(s, seq);
5686  		break;
5687  	case STATMOUNT_MNT_GIDMAP:
5688  		sm->mnt_gidmap = start;
5689  		ret = statmount_mnt_gidmap(s, seq);
5690  		break;
5691  	default:
5692  		WARN_ON_ONCE(true);
5693  		return -EINVAL;
5694  	}
5695  
5696  	/*
5697  	 * If nothing was emitted, return to avoid setting the flag
5698  	 * and terminating the buffer.
5699  	 */
5700  	if (seq->count == start)
5701  		return ret;
5702  	if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
5703  		return -EOVERFLOW;
5704  	if (kbufsize >= s->bufsize)
5705  		return -EOVERFLOW;
5706  
5707  	/* signal a retry */
5708  	if (unlikely(seq_has_overflowed(seq)))
5709  		return -EAGAIN;
5710  
5711  	if (ret)
5712  		return ret;
5713  
5714  	seq->buf[seq->count++] = '\0';
5715  	sm->mask |= flag;
5716  	*offp = start;
5717  	return 0;
5718  }
5719  
5720  static int copy_statmount_to_user(struct kstatmount *s)
5721  {
5722  	struct statmount *sm = &s->sm;
5723  	struct seq_file *seq = &s->seq;
5724  	char __user *str = ((char __user *)s->buf) + sizeof(*sm);
5725  	size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
5726  
5727  	if (seq->count && copy_to_user(str, seq->buf, seq->count))
5728  		return -EFAULT;
5729  
5730  	/* Return the number of bytes copied to the buffer */
5731  	sm->size = copysize + seq->count;
5732  	if (copy_to_user(s->buf, sm, copysize))
5733  		return -EFAULT;
5734  
5735  	return 0;
5736  }
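
/*
 * Illustrative userspace sketch, not part of this file: the reply written
 * above is the fixed struct statmount followed directly by the string table,
 * and every string offset set by statmount_string() indexes into that table.
 * This assumes the UAPI struct statmount exposes the table as its trailing
 * flexible array member str[].
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#include <stdio.h>
#include <linux/mount.h>

static void print_statmount_strings(const struct statmount *sm)
{
	if (sm->mask & STATMOUNT_FS_TYPE)
		printf("fstype: %s\n", sm->str + sm->fs_type);
	if (sm->mask & STATMOUNT_MNT_ROOT)
		printf("root:   %s\n", sm->str + sm->mnt_root);
	if (sm->mask & STATMOUNT_MNT_POINT)
		printf("mount:  %s\n", sm->str + sm->mnt_point);
}
#endif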
5737  
5738  static struct mount *listmnt_next(struct mount *curr, bool reverse)
5739  {
5740  	struct rb_node *node;
5741  
5742  	if (reverse)
5743  		node = rb_prev(&curr->mnt_node);
5744  	else
5745  		node = rb_next(&curr->mnt_node);
5746  
5747  	return node_to_mount(node);
5748  }
5749  
5750  static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
5751  {
5752  	struct mount *first, *child;
5753  
5754  	rwsem_assert_held(&namespace_sem);
5755  
5756  	/* We're looking at our own ns, just use get_fs_root. */
5757  	if (ns == current->nsproxy->mnt_ns) {
5758  		get_fs_root(current->fs, root);
5759  		return 0;
5760  	}
5761  
5762  	/*
5763  	 * We have to find the first mount in our ns and use that; however, it
5764  	 * may not exist, so handle that properly.
5765  	 */
5766  	if (mnt_ns_empty(ns))
5767  		return -ENOENT;
5768  
5769  	first = child = ns->root;
5770  	for (;;) {
5771  		child = listmnt_next(child, false);
5772  		if (!child)
5773  			return -ENOENT;
5774  		if (child->mnt_parent == first)
5775  			break;
5776  	}
5777  
5778  	root->mnt = mntget(&child->mnt);
5779  	root->dentry = dget(root->mnt->mnt_root);
5780  	return 0;
5781  }
5782  
5783  /* This must be updated whenever a new flag is added */
5784  #define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
5785  			     STATMOUNT_MNT_BASIC | \
5786  			     STATMOUNT_PROPAGATE_FROM | \
5787  			     STATMOUNT_MNT_ROOT | \
5788  			     STATMOUNT_MNT_POINT | \
5789  			     STATMOUNT_FS_TYPE | \
5790  			     STATMOUNT_MNT_NS_ID | \
5791  			     STATMOUNT_MNT_OPTS | \
5792  			     STATMOUNT_FS_SUBTYPE | \
5793  			     STATMOUNT_SB_SOURCE | \
5794  			     STATMOUNT_OPT_ARRAY | \
5795  			     STATMOUNT_OPT_SEC_ARRAY | \
5796  			     STATMOUNT_SUPPORTED_MASK)
5797  
5798  static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
5799  			struct mnt_namespace *ns)
5800  {
5801  	struct path root __free(path_put) = {};
5802  	struct mount *m;
5803  	int err;
5804  
5805  	/* Has the namespace already been emptied? */
5806  	if (mnt_ns_id && mnt_ns_empty(ns))
5807  		return -ENOENT;
5808  
5809  	s->mnt = lookup_mnt_in_ns(mnt_id, ns);
5810  	if (!s->mnt)
5811  		return -ENOENT;
5812  
5813  	err = grab_requested_root(ns, &root);
5814  	if (err)
5815  		return err;
5816  
5817  	/*
5818  	 * Don't trigger audit denials. We just want to determine what
5819  	 * mounts to show users.
5820  	 */
5821  	m = real_mount(s->mnt);
5822  	if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
5823  	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
5824  		return -EPERM;
5825  
5826  	err = security_sb_statfs(s->mnt->mnt_root);
5827  	if (err)
5828  		return err;
5829  
5830  	s->root = root;
5831  	s->idmap = mnt_idmap(s->mnt);
5832  	if (s->mask & STATMOUNT_SB_BASIC)
5833  		statmount_sb_basic(s);
5834  
5835  	if (s->mask & STATMOUNT_MNT_BASIC)
5836  		statmount_mnt_basic(s);
5837  
5838  	if (s->mask & STATMOUNT_PROPAGATE_FROM)
5839  		statmount_propagate_from(s);
5840  
5841  	if (s->mask & STATMOUNT_FS_TYPE)
5842  		err = statmount_string(s, STATMOUNT_FS_TYPE);
5843  
5844  	if (!err && s->mask & STATMOUNT_MNT_ROOT)
5845  		err = statmount_string(s, STATMOUNT_MNT_ROOT);
5846  
5847  	if (!err && s->mask & STATMOUNT_MNT_POINT)
5848  		err = statmount_string(s, STATMOUNT_MNT_POINT);
5849  
5850  	if (!err && s->mask & STATMOUNT_MNT_OPTS)
5851  		err = statmount_string(s, STATMOUNT_MNT_OPTS);
5852  
5853  	if (!err && s->mask & STATMOUNT_OPT_ARRAY)
5854  		err = statmount_string(s, STATMOUNT_OPT_ARRAY);
5855  
5856  	if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
5857  		err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
5858  
5859  	if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
5860  		err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
5861  
5862  	if (!err && s->mask & STATMOUNT_SB_SOURCE)
5863  		err = statmount_string(s, STATMOUNT_SB_SOURCE);
5864  
5865  	if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
5866  		err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
5867  
5868  	if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
5869  		err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
5870  
5871  	if (!err && s->mask & STATMOUNT_MNT_NS_ID)
5872  		statmount_mnt_ns_id(s, ns);
5873  
5874  	if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
5875  		s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
5876  		s->sm.supported_mask = STATMOUNT_SUPPORTED;
5877  	}
5878  
5879  	if (err)
5880  		return err;
5881  
5882  	/* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
5883  	WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
5884  
5885  	return 0;
5886  }
5887  
5888  static inline bool retry_statmount(const long ret, size_t *seq_size)
5889  {
5890  	if (likely(ret != -EAGAIN))
5891  		return false;
5892  	if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
5893  		return false;
5894  	if (unlikely(*seq_size > MAX_RW_COUNT))
5895  		return false;
5896  	return true;
5897  }
5898  
5899  #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
5900  			      STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
5901  			      STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
5902  			      STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
5903  			      STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
5904  
5905  static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
5906  			      struct statmount __user *buf, size_t bufsize,
5907  			      size_t seq_size)
5908  {
5909  	if (!access_ok(buf, bufsize))
5910  		return -EFAULT;
5911  
5912  	memset(ks, 0, sizeof(*ks));
5913  	ks->mask = kreq->param;
5914  	ks->buf = buf;
5915  	ks->bufsize = bufsize;
5916  
5917  	if (ks->mask & STATMOUNT_STRING_REQ) {
5918  		if (bufsize == sizeof(ks->sm))
5919  			return -EOVERFLOW;
5920  
5921  		ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
5922  		if (!ks->seq.buf)
5923  			return -ENOMEM;
5924  
5925  		ks->seq.size = seq_size;
5926  	}
5927  
5928  	return 0;
5929  }
5930  
5931  static int copy_mnt_id_req(const struct mnt_id_req __user *req,
5932  			   struct mnt_id_req *kreq)
5933  {
5934  	int ret;
5935  	size_t usize;
5936  
5937  	BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1);
5938  
5939  	ret = get_user(usize, &req->size);
5940  	if (ret)
5941  		return -EFAULT;
5942  	if (unlikely(usize > PAGE_SIZE))
5943  		return -E2BIG;
5944  	if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
5945  		return -EINVAL;
5946  	memset(kreq, 0, sizeof(*kreq));
5947  	ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
5948  	if (ret)
5949  		return ret;
5950  	if (kreq->spare != 0)
5951  		return -EINVAL;
5952  	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
5953  	if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET)
5954  		return -EINVAL;
5955  	return 0;
5956  }
5957  
5958  /*
5959   * If the user requested a specific mount namespace id, look that up and
5960   * return it; otherwise grab a passive reference on our mount namespace and
5961   * return that.
5962   */
5963  static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq)
5964  {
5965  	struct mnt_namespace *mnt_ns;
5966  
5967  	if (kreq->mnt_ns_id && kreq->spare)
5968  		return ERR_PTR(-EINVAL);
5969  
5970  	if (kreq->mnt_ns_id)
5971  		return lookup_mnt_ns(kreq->mnt_ns_id);
5972  
5973  	if (kreq->spare) {
5974  		struct ns_common *ns;
5975  
5976  		CLASS(fd, f)(kreq->spare);
5977  		if (fd_empty(f))
5978  			return ERR_PTR(-EBADF);
5979  
5980  		if (!proc_ns_file(fd_file(f)))
5981  			return ERR_PTR(-EINVAL);
5982  
5983  		ns = get_proc_ns(file_inode(fd_file(f)));
5984  		if (ns->ops->type != CLONE_NEWNS)
5985  			return ERR_PTR(-EINVAL);
5986  
5987  		mnt_ns = to_mnt_ns(ns);
5988  	} else {
5989  		mnt_ns = current->nsproxy->mnt_ns;
5990  	}
5991  
5992  	refcount_inc(&mnt_ns->passive);
5993  	return mnt_ns;
5994  }
5995  
5996  SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
5997  		struct statmount __user *, buf, size_t, bufsize,
5998  		unsigned int, flags)
5999  {
6000  	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
6001  	struct kstatmount *ks __free(kfree) = NULL;
6002  	struct mnt_id_req kreq;
6003  	/* We currently support retrieval of 3 strings. */
6004  	size_t seq_size = 3 * PATH_MAX;
6005  	int ret;
6006  
6007  	if (flags)
6008  		return -EINVAL;
6009  
6010  	ret = copy_mnt_id_req(req, &kreq);
6011  	if (ret)
6012  		return ret;
6013  
6014  	ns = grab_requested_mnt_ns(&kreq);
6015  	if (!ns)
6016  		return -ENOENT;
6017  
6018  	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
6019  	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
6020  		return -ENOENT;
6021  
6022  	ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
6023  	if (!ks)
6024  		return -ENOMEM;
6025  
6026  retry:
6027  	ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size);
6028  	if (ret)
6029  		return ret;
6030  
6031  	scoped_guard(rwsem_read, &namespace_sem)
6032  		ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns);
6033  
6034  	if (!ret)
6035  		ret = copy_statmount_to_user(ks);
6036  	kvfree(ks->seq.buf);
6037  	if (retry_statmount(ret, &seq_size))
6038  		goto retry;
6039  	return ret;
6040  }
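
/*
 * Illustrative userspace sketch, not part of this file: the request is a
 * struct mnt_id_req carrying the 64-bit mount ID (obtainable, for instance,
 * via statx() with STATX_MNT_ID_UNIQUE) and the STATMOUNT_* mask in .param;
 * the reply lands in a caller-supplied struct statmount buffer.  The buffer
 * size, the SYS_statmount macro and the raw syscall() wrapper are assumptions
 * for illustration.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <linux/mount.h>
#include <unistd.h>

static int stat_one_mount(__u64 mnt_id)
{
	struct mnt_id_req req = {
		.size	= MNT_ID_REQ_SIZE_VER0,
		.mnt_id	= mnt_id,
		.param	= STATMOUNT_MNT_BASIC | STATMOUNT_MNT_POINT,
	};
	size_t bufsize = 1 << 16;	/* roomy enough for a few strings */
	struct statmount *sm = malloc(bufsize);
	int ret = -1;

	if (!sm)
		return -1;
	if (syscall(SYS_statmount, &req, sm, bufsize, 0) == 0) {
		if (sm->mask & STATMOUNT_MNT_POINT)
			printf("mount id %llu is mounted on %s\n",
			       (unsigned long long)mnt_id,
			       sm->str + sm->mnt_point);
		ret = 0;
	}
	free(sm);
	return ret;
}
#endif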
6041  
6042  static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
6043  			    u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
6044  			    bool reverse)
6045  {
6046  	struct path root __free(path_put) = {};
6047  	struct path orig;
6048  	struct mount *r, *first;
6049  	ssize_t ret;
6050  
6051  	rwsem_assert_held(&namespace_sem);
6052  
6053  	ret = grab_requested_root(ns, &root);
6054  	if (ret)
6055  		return ret;
6056  
6057  	if (mnt_parent_id == LSMT_ROOT) {
6058  		orig = root;
6059  	} else {
6060  		orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
6061  		if (!orig.mnt)
6062  			return -ENOENT;
6063  		orig.dentry = orig.mnt->mnt_root;
6064  	}
6065  
6066  	/*
6067  	 * Don't trigger audit denials. We just want to determine what
6068  	 * mounts to show users.
6069  	 */
6070  	if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
6071  	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
6072  		return -EPERM;
6073  
6074  	ret = security_sb_statfs(orig.dentry);
6075  	if (ret)
6076  		return ret;
6077  
6078  	if (!last_mnt_id) {
6079  		if (reverse)
6080  			first = node_to_mount(ns->mnt_last_node);
6081  		else
6082  			first = node_to_mount(ns->mnt_first_node);
6083  	} else {
6084  		if (reverse)
6085  			first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
6086  		else
6087  			first = mnt_find_id_at(ns, last_mnt_id + 1);
6088  	}
6089  
6090  	for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) {
6091  		if (r->mnt_id_unique == mnt_parent_id)
6092  			continue;
6093  		if (!is_path_reachable(r, r->mnt.mnt_root, &orig))
6094  			continue;
6095  		*mnt_ids = r->mnt_id_unique;
6096  		mnt_ids++;
6097  		nr_mnt_ids--;
6098  		ret++;
6099  	}
6100  	return ret;
6101  }
6102  
6103  SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
6104  		u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
6105  {
6106  	u64 *kmnt_ids __free(kvfree) = NULL;
6107  	const size_t maxcount = 1000000;
6108  	struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
6109  	struct mnt_id_req kreq;
6110  	u64 last_mnt_id;
6111  	ssize_t ret;
6112  
6113  	if (flags & ~LISTMOUNT_REVERSE)
6114  		return -EINVAL;
6115  
6116  	/*
6117  	 * If the mount namespace really has more than 1 million mounts the
6118  	 * caller must iterate over the mount namespace (and reconsider their
6119  	 * system design...).
6120  	 */
6121  	if (unlikely(nr_mnt_ids > maxcount))
6122  		return -EOVERFLOW;
6123  
6124  	if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
6125  		return -EFAULT;
6126  
6127  	ret = copy_mnt_id_req(req, &kreq);
6128  	if (ret)
6129  		return ret;
6130  
6131  	last_mnt_id = kreq.param;
6132  	/* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
6133  	if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
6134  		return -EINVAL;
6135  
6136  	kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
6137  				  GFP_KERNEL_ACCOUNT);
6138  	if (!kmnt_ids)
6139  		return -ENOMEM;
6140  
6141  	ns = grab_requested_mnt_ns(&kreq);
6142  	if (!ns)
6143  		return -ENOENT;
6144  
6145  	if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
6146  	    !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
6147  		return -ENOENT;
6148  
6149  	scoped_guard(rwsem_read, &namespace_sem)
6150  		ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
6151  				   nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
6152  	if (ret <= 0)
6153  		return ret;
6154  
6155  	if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
6156  		return -EFAULT;
6157  
6158  	return ret;
6159  }
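
/*
 * Illustrative userspace sketch, not part of this file: LSMT_ROOT requests
 * the mounts reachable from the namespace root, and .param carries the last
 * mount ID already seen, which is how a caller pages through namespaces with
 * more mounts than fit into one call.  The array size, the SYS_listmount
 * macro and the raw syscall() wrapper are assumptions for illustration.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#include <stdio.h>
#include <sys/syscall.h>
#include <linux/mount.h>
#include <unistd.h>

static void list_all_mounts(void)
{
	__u64 mnt_ids[256];
	struct mnt_id_req req = {
		.size	= MNT_ID_REQ_SIZE_VER0,
		.mnt_id	= LSMT_ROOT,	/* whole namespace, not one parent */
		.param	= 0,		/* last mount ID seen so far */
	};
	long n;

	while ((n = syscall(SYS_listmount, &req, mnt_ids, 256, 0)) > 0) {
		for (long i = 0; i < n; i++)
			printf("%llu\n", (unsigned long long)mnt_ids[i]);
		req.param = mnt_ids[n - 1];	/* resume after the last ID */
	}
}
#endif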
6160  
6161  static void __init init_mount_tree(void)
6162  {
6163  	struct vfsmount *mnt;
6164  	struct mount *m;
6165  	struct mnt_namespace *ns;
6166  	struct path root;
6167  
6168  	mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
6169  	if (IS_ERR(mnt))
6170  		panic("Can't create rootfs");
6171  
6172  	ns = alloc_mnt_ns(&init_user_ns, false);
6173  	if (IS_ERR(ns))
6174  		panic("Can't allocate initial namespace");
6175  	m = real_mount(mnt);
6176  	ns->root = m;
6177  	ns->nr_mounts = 1;
6178  	mnt_add_to_ns(ns, m);
6179  	init_task.nsproxy->mnt_ns = ns;
6180  	get_mnt_ns(ns);
6181  
6182  	root.mnt = mnt;
6183  	root.dentry = mnt->mnt_root;
6184  	mnt->mnt_flags |= MNT_LOCKED;
6185  
6186  	set_fs_pwd(current->fs, &root);
6187  	set_fs_root(current->fs, &root);
6188  
6189  	mnt_ns_tree_add(ns);
6190  }
6191  
6192  void __init mnt_init(void)
6193  {
6194  	int err;
6195  
6196  	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
6197  			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
6198  
6199  	mount_hashtable = alloc_large_system_hash("Mount-cache",
6200  				sizeof(struct hlist_head),
6201  				mhash_entries, 19,
6202  				HASH_ZERO,
6203  				&m_hash_shift, &m_hash_mask, 0, 0);
6204  	mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache",
6205  				sizeof(struct hlist_head),
6206  				mphash_entries, 19,
6207  				HASH_ZERO,
6208  				&mp_hash_shift, &mp_hash_mask, 0, 0);
6209  
6210  	if (!mount_hashtable || !mountpoint_hashtable)
6211  		panic("Failed to allocate mount hash table\n");
6212  
6213  	kernfs_init();
6214  
6215  	err = sysfs_init();
6216  	if (err)
6217  		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
6218  			__func__, err);
6219  	fs_kobj = kobject_create_and_add("fs", NULL);
6220  	if (!fs_kobj)
6221  		printk(KERN_WARNING "%s: kobj create error\n", __func__);
6222  	shmem_init();
6223  	init_rootfs();
6224  	init_mount_tree();
6225  }
6226  
6227  void put_mnt_ns(struct mnt_namespace *ns)
6228  {
6229  	if (!refcount_dec_and_test(&ns->ns.count))
6230  		return;
6231  	drop_collected_mounts(&ns->root->mnt);
6232  	free_mnt_ns(ns);
6233  }
6234  
6235  struct vfsmount *kern_mount(struct file_system_type *type)
6236  {
6237  	struct vfsmount *mnt;
6238  	mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
6239  	if (!IS_ERR(mnt)) {
6240  		/*
6241  		 * It is a long-term mount; don't release mnt until
6242  		 * we unmount it, just before the filesystem is unregistered.
6243  		 */
6244  		real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL;
6245  	}
6246  	return mnt;
6247  }
6248  EXPORT_SYMBOL_GPL(kern_mount);
6249  
6250  void kern_unmount(struct vfsmount *mnt)
6251  {
6252  	/* Release the long-term mount so the mount point can be released. */
6253  	if (!IS_ERR(mnt)) {
6254  		mnt_make_shortterm(mnt);
6255  		synchronize_rcu();	/* yecchhh... */
6256  		mntput(mnt);
6257  	}
6258  }
6259  EXPORT_SYMBOL(kern_unmount);
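
/*
 * Illustrative in-kernel sketch, not part of this file: kern_mount() and
 * kern_unmount() bracket the lifetime of a long-term internal mount the way
 * pseudo filesystems typically use them.  The filesystem type and the
 * variable names are assumptions for illustration.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
static struct vfsmount *example_mnt;

static int __init example_fs_init(void)
{
	int err;

	err = register_filesystem(&example_fs_type);
	if (err)
		return err;

	example_mnt = kern_mount(&example_fs_type);
	if (IS_ERR(example_mnt)) {
		unregister_filesystem(&example_fs_type);
		return PTR_ERR(example_mnt);
	}
	return 0;
}

static void __exit example_fs_exit(void)
{
	kern_unmount(example_mnt);	/* release the long-term mount */
	unregister_filesystem(&example_fs_type);
}
#endif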
6260  
6261  void kern_unmount_array(struct vfsmount *mnt[], unsigned int num)
6262  {
6263  	unsigned int i;
6264  
6265  	for (i = 0; i < num; i++)
6266  		mnt_make_shortterm(mnt[i]);
6267  	synchronize_rcu_expedited();
6268  	for (i = 0; i < num; i++)
6269  		mntput(mnt[i]);
6270  }
6271  EXPORT_SYMBOL(kern_unmount_array);
6272  
6273  bool our_mnt(struct vfsmount *mnt)
6274  {
6275  	return check_mnt(real_mount(mnt));
6276  }
6277  
6278  bool current_chrooted(void)
6279  {
6280  	/* Does the current process have a non-standard root? */
6281  	struct path ns_root;
6282  	struct path fs_root;
6283  	bool chrooted;
6284  
6285  	/* Find the namespace root */
6286  	ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt;
6287  	ns_root.dentry = ns_root.mnt->mnt_root;
6288  	path_get(&ns_root);
6289  	while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root))
6290  		;
6291  
6292  	get_fs_root(current->fs, &fs_root);
6293  
6294  	chrooted = !path_equal(&fs_root, &ns_root);
6295  
6296  	path_put(&fs_root);
6297  	path_put(&ns_root);
6298  
6299  	return chrooted;
6300  }
6301  
6302  static bool mnt_already_visible(struct mnt_namespace *ns,
6303  				const struct super_block *sb,
6304  				int *new_mnt_flags)
6305  {
6306  	int new_flags = *new_mnt_flags;
6307  	struct mount *mnt, *n;
6308  	bool visible = false;
6309  
6310  	down_read(&namespace_sem);
6311  	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
6312  		struct mount *child;
6313  		int mnt_flags;
6314  
6315  		if (mnt->mnt.mnt_sb->s_type != sb->s_type)
6316  			continue;
6317  
6318  		/* This mount is not fully visible if its root directory
6319  		 * is not the root directory of the filesystem.
6320  		 */
6321  		if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
6322  			continue;
6323  
6324  		/* A local view of the mount flags */
6325  		mnt_flags = mnt->mnt.mnt_flags;
6326  
6327  		/* Don't miss readonly hidden in the superblock flags */
6328  		if (sb_rdonly(mnt->mnt.mnt_sb))
6329  			mnt_flags |= MNT_LOCK_READONLY;
6330  
6331  		/* Verify the mount flags are equal to or more permissive
6332  		 * than those of the proposed new mount.
6333  		 */
6334  		if ((mnt_flags & MNT_LOCK_READONLY) &&
6335  		    !(new_flags & MNT_READONLY))
6336  			continue;
6337  		if ((mnt_flags & MNT_LOCK_ATIME) &&
6338  		    ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
6339  			continue;
6340  
6341  		/* This mount is not fully visible if there are any
6342  		 * locked child mounts that cover anything except for
6343  		 * empty directories.
6344  		 */
6345  		list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
6346  			struct inode *inode = child->mnt_mountpoint->d_inode;
6347  			/* Only worry about locked mounts */
6348  			if (!(child->mnt.mnt_flags & MNT_LOCKED))
6349  				continue;
6350  			/* Is the directory permanently empty? */
6351  			if (!is_empty_dir_inode(inode))
6352  				goto next;
6353  		}
6354  		/* Preserve the locked attributes */
6355  		*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
6356  					       MNT_LOCK_ATIME);
6357  		visible = true;
6358  		goto found;
6359  	next:	;
6360  	}
6361  found:
6362  	up_read(&namespace_sem);
6363  	return visible;
6364  }
6365  
6366  static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags)
6367  {
6368  	const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
6369  	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
6370  	unsigned long s_iflags;
6371  
6372  	if (ns->user_ns == &init_user_ns)
6373  		return false;
6374  
6375  	/* Can this filesystem be too revealing? */
6376  	s_iflags = sb->s_iflags;
6377  	if (!(s_iflags & SB_I_USERNS_VISIBLE))
6378  		return false;
6379  
6380  	if ((s_iflags & required_iflags) != required_iflags) {
6381  		WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
6382  			  required_iflags);
6383  		return true;
6384  	}
6385  
6386  	return !mnt_already_visible(ns, sb, new_mnt_flags);
6387  }
6388  
6389  bool mnt_may_suid(struct vfsmount *mnt)
6390  {
6391  	/*
6392  	 * Foreign mounts (accessed via fchdir or through /proc
6393  	 * symlinks) are always treated as if they are nosuid.  This
6394  	 * prevents namespaces from trusting potentially unsafe
6395  	 * suid/sgid bits, file caps, or security labels that originate
6396  	 * in other namespaces.
6397  	 */
6398  	return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
6399  	       current_in_userns(mnt->mnt_sb->s_user_ns);
6400  }
6401  
6402  static struct ns_common *mntns_get(struct task_struct *task)
6403  {
6404  	struct ns_common *ns = NULL;
6405  	struct nsproxy *nsproxy;
6406  
6407  	task_lock(task);
6408  	nsproxy = task->nsproxy;
6409  	if (nsproxy) {
6410  		ns = &nsproxy->mnt_ns->ns;
6411  		get_mnt_ns(to_mnt_ns(ns));
6412  	}
6413  	task_unlock(task);
6414  
6415  	return ns;
6416  }
6417  
6418  static void mntns_put(struct ns_common *ns)
6419  {
6420  	put_mnt_ns(to_mnt_ns(ns));
6421  }
6422  
6423  static int mntns_install(struct nsset *nsset, struct ns_common *ns)
6424  {
6425  	struct nsproxy *nsproxy = nsset->nsproxy;
6426  	struct fs_struct *fs = nsset->fs;
6427  	struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns;
6428  	struct user_namespace *user_ns = nsset->cred->user_ns;
6429  	struct path root;
6430  	int err;
6431  
6432  	if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) ||
6433  	    !ns_capable(user_ns, CAP_SYS_CHROOT) ||
6434  	    !ns_capable(user_ns, CAP_SYS_ADMIN))
6435  		return -EPERM;
6436  
6437  	if (is_anon_ns(mnt_ns))
6438  		return -EINVAL;
6439  
6440  	if (fs->users != 1)
6441  		return -EINVAL;
6442  
6443  	get_mnt_ns(mnt_ns);
6444  	old_mnt_ns = nsproxy->mnt_ns;
6445  	nsproxy->mnt_ns = mnt_ns;
6446  
6447  	/* Find the root */
6448  	err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt,
6449  				"/", LOOKUP_DOWN, &root);
6450  	if (err) {
6451  		/* revert to old namespace */
6452  		nsproxy->mnt_ns = old_mnt_ns;
6453  		put_mnt_ns(mnt_ns);
6454  		return err;
6455  	}
6456  
6457  	put_mnt_ns(old_mnt_ns);
6458  
6459  	/* Update the pwd and root */
6460  	set_fs_pwd(fs, &root);
6461  	set_fs_root(fs, &root);
6462  
6463  	path_put(&root);
6464  	return 0;
6465  }
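
/*
 * Illustrative userspace sketch, not part of this file: mntns_install() is
 * what runs when a process calls setns(2) on a mount namespace file
 * descriptor; note that it also resets the caller's root and working
 * directory to the target namespace's root, so joining behaves like an
 * implicit chroot.  The /proc path is an assumption for illustration.
 */
#if 0	/* usage sketch only, never compiled as part of namespace.c */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>

static int join_mount_ns(pid_t pid)
{
	char path[64];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/ns/mnt", (int)pid);
	fd = open(path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return -1;

	ret = setns(fd, CLONE_NEWNS);	/* ends up in mntns_install() above */
	close(fd);
	return ret;
}
#endif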
6466  
6467  static struct user_namespace *mntns_owner(struct ns_common *ns)
6468  {
6469  	return to_mnt_ns(ns)->user_ns;
6470  }
6471  
6472  const struct proc_ns_operations mntns_operations = {
6473  	.name		= "mnt",
6474  	.type		= CLONE_NEWNS,
6475  	.get		= mntns_get,
6476  	.put		= mntns_put,
6477  	.install	= mntns_install,
6478  	.owner		= mntns_owner,
6479  };
6480  
6481  #ifdef CONFIG_SYSCTL
6482  static const struct ctl_table fs_namespace_sysctls[] = {
6483  	{
6484  		.procname	= "mount-max",
6485  		.data		= &sysctl_mount_max,
6486  		.maxlen		= sizeof(unsigned int),
6487  		.mode		= 0644,
6488  		.proc_handler	= proc_dointvec_minmax,
6489  		.extra1		= SYSCTL_ONE,
6490  	},
6491  };
6492  
6493  static int __init init_fs_namespace_sysctls(void)
6494  {
6495  	register_sysctl_init("fs", fs_namespace_sysctls);
6496  	return 0;
6497  }
6498  fs_initcall(init_fs_namespace_sysctls);
6499  
6500  #endif /* CONFIG_SYSCTL */
6501