// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/namespace.c
 *
 * (C) Copyright Al Viro 2000, 2001
 *
 * Based on code from fs/super.c, copyright Linus Torvalds and others.
 * Heavily rewritten.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/capability.h>
#include <linux/mnt_namespace.h>
#include <linux/user_namespace.h>
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/idr.h>
#include <linux/init.h>		/* init_rootfs */
#include <linux/fs_struct.h>	/* get_fs_root et.al. */
#include <linux/fsnotify.h>	/* fsnotify_vfsmount_delete */
#include <linux/file.h>
#include <linux/uaccess.h>
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
#include <linux/proc_fs.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
#include <linux/pidfs.h>
#include <linux/nstree.h>

#include "pnode.h"
#include "internal.h"

/* Maximum number of mounts in a mount namespace */
static unsigned int sysctl_mount_max __read_mostly = 100000;

/* Mask/shift pairs consumed by m_hash() and mp_hash() respectively. */
static unsigned int m_hash_mask __ro_after_init;
static unsigned int m_hash_shift __ro_after_init;
static unsigned int mp_hash_mask __ro_after_init;
static unsigned int mp_hash_shift __ro_after_init;

/*
 * "mhash_entries=" boot parameter.  The value is parsed with kstrtoul()
 * (base 0) into mhash_entries; the handler returns 1 when the string parsed
 * cleanly and 0 otherwise.
 */
static __initdata unsigned long mhash_entries;
static int __init set_mhash_entries(char *str)
{
	return kstrtoul(str, 0, &mhash_entries) == 0;
}
__setup("mhash_entries=", set_mhash_entries);

/*
 * "mphash_entries=" boot parameter; same parsing and return convention as
 * set_mhash_entries(), stored in mphash_entries.
 */
static __initdata unsigned long mphash_entries;
static int __init set_mphash_entries(char *str)
{
	return kstrtoul(str, 0, &mphash_entries) == 0;
}
__setup("mphash_entries=", set_mphash_entries);

/*
 * "initramfs_options=" boot parameter.  Only the pointer to the argument
 * string is remembered; nothing is parsed or copied here.  Always returns 1.
 */
static char * __initdata initramfs_options;
static int __init initramfs_options_setup(char *str)
{
	initramfs_options = str;
	return 1;
}

__setup("initramfs_options=", initramfs_options_setup);

/* Global event counter; bumped and published by touch_mnt_namespace(). */
static u64 event;
/* Backing store for mnt->mnt_id, see mnt_alloc_id() / mnt_free_id(). */
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
/* Allocator for peer group IDs, see mnt_alloc_group_id(). */
static DEFINE_IDA(mnt_group_ida);

/* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
/* Pre-incremented in mnt_alloc_id() to produce mnt->mnt_id_unique. */
static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;

static struct hlist_head *mount_hashtable __ro_after_init;	/* indexed by m_hash() */
static struct hlist_head *mountpoint_hashtable __ro_after_init;	/* indexed by mp_hash() */
static struct kmem_cache *mnt_cache __ro_after_init;	/* struct mount allocations */
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted);	/* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */

/*
 * Scope guards around namespace_sem: namespace_excl pairs
 * namespace_lock()/namespace_unlock(), namespace_shared pairs
 * down_read()/up_read() on the semaphore directly.
 */
static inline void namespace_lock(void);
static void namespace_unlock(void);
DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock())
DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem),
				      up_read(&namespace_sem))

/* __free(mntput): mntput() the pointer on scope exit unless it is an ERR_PTR. */
DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T))

#ifdef CONFIG_FSNOTIFY
LIST_HEAD(notify_list); /* protected by namespace_sem */
#endif

enum mount_kattr_flags_t {
	MOUNT_KATTR_RECURSE		= (1 << 0),
	MOUNT_KATTR_IDMAP_REPLACE	= (1 << 1),
};

/*
 * NOTE(review): the users of struct mount_kattr are outside this part of
 * the file; field meanings should be confirmed against them.
 */
struct mount_kattr {
	unsigned int attr_set;
	unsigned int attr_clr;
	unsigned int propagation;
	unsigned int lookup_flags;
	enum mount_kattr_flags_t kflags;
	struct user_namespace *mnt_userns;
	struct mnt_idmap *mnt_idmap;
};

/* /sys/fs */
struct kobject *fs_kobj __ro_after_init;
EXPORT_SYMBOL_GPL(fs_kobj);

/*
 * vfsmount lock may be taken for read to prevent changes to the
 * vfsmount hash, ie. during mountpoint lookups or walking back
 * up the tree.
123 * 124 * It should be taken for write in all cases where the vfsmount 125 * tree or hash is modified or when a vfsmount structure is modified. 126 */ 127 __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); 128 129 static void mnt_ns_release(struct mnt_namespace *ns) 130 { 131 /* keep alive for {list,stat}mount() */ 132 if (ns && refcount_dec_and_test(&ns->passive)) { 133 fsnotify_mntns_delete(ns); 134 put_user_ns(ns->user_ns); 135 kfree(ns); 136 } 137 } 138 DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, 139 if (!IS_ERR(_T)) mnt_ns_release(_T)) 140 141 static void mnt_ns_release_rcu(struct rcu_head *rcu) 142 { 143 mnt_ns_release(container_of(rcu, struct mnt_namespace, ns.ns_rcu)); 144 } 145 146 static void mnt_ns_tree_remove(struct mnt_namespace *ns) 147 { 148 /* remove from global mount namespace list */ 149 if (ns_tree_active(ns)) 150 ns_tree_remove(ns); 151 152 call_rcu(&ns->ns.ns_rcu, mnt_ns_release_rcu); 153 } 154 155 /* 156 * Lookup a mount namespace by id and take a passive reference count. Taking a 157 * passive reference means the mount namespace can be emptied if e.g., the last 158 * task holding an active reference exits. To access the mounts of the 159 * namespace the @namespace_sem must first be acquired. If the namespace has 160 * already shut down before acquiring @namespace_sem, {list,stat}mount() will 161 * see that the mount rbtree of the namespace is empty. 162 * 163 * Note the lookup is lockless protected by a sequence counter. We only 164 * need to guard against false negatives as false positives aren't 165 * possible. So if we didn't find a mount namespace and the sequence 166 * counter has changed we need to retry. If the sequence counter is 167 * still the same we know the search actually failed. 
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
	struct mnt_namespace *mnt_ns;
	struct ns_common *ns;

	/* The lookup and the refcount bump below both run under guard(rcu). */
	guard(rcu)();
	ns = ns_tree_lookup_rcu(mnt_ns_id, CLONE_NEWNS);
	if (!ns)
		return NULL;

	/*
	 * The last reference count is put with RCU delay so we can
	 * unconditionally acquire a reference here.
	 */
	mnt_ns = container_of(ns, struct mnt_namespace, ns);
	refcount_inc(&mnt_ns->passive);
	return mnt_ns;
}

/* Take mount_lock as a seqlock writer. */
static inline void lock_mount_hash(void)
{
	write_seqlock(&mount_lock);
}

/* Release the write side taken by lock_mount_hash(). */
static inline void unlock_mount_hash(void)
{
	write_sequnlock(&mount_lock);
}

/*
 * Pick the mount_hashtable bucket for the pair (@mnt, @dentry).  Both
 * pointers are divided by L1_CACHE_BYTES and added together; the sum is
 * then mixed with itself shifted right by m_hash_shift and reduced with
 * m_hash_mask.
 */
static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
	tmp += ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> m_hash_shift);
	return &mount_hashtable[tmp & m_hash_mask];
}

/*
 * Pick the mountpoint_hashtable bucket for @dentry, using the same
 * divide/fold/mask scheme as m_hash() with mp_hash_shift and mp_hash_mask.
 */
static inline struct hlist_head *mp_hash(struct dentry *dentry)
{
	unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES);
	tmp = tmp + (tmp >> mp_hash_shift);
	return &mountpoint_hashtable[tmp & mp_hash_mask];
}

/*
 * Assign both identifiers of a new mount.  mnt->mnt_id is allocated from
 * mnt_id_xa within xa_limit_31b; on success mnt->mnt_id_unique takes the
 * next value of mnt_id_ctr.  The counter is bumped while xa_lock is held.
 *
 * Returns the __xa_alloc() result: 0 on success, non-zero on failure, in
 * which case mnt_id_unique is left untouched.
 */
static int mnt_alloc_id(struct mount *mnt)
{
	int res;

	xa_lock(&mnt_id_xa);
	res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, xa_limit_31b, GFP_KERNEL);
	if (!res)
		mnt->mnt_id_unique = ++mnt_id_ctr;
	xa_unlock(&mnt_id_xa);
	return res;
}

/* Drop the mnt_id_xa entry stored for mnt->mnt_id by mnt_alloc_id(). */
static void mnt_free_id(struct mount *mnt)
{
	xa_erase(&mnt_id_xa, mnt->mnt_id);
}

/*
 * Allocate a new peer group ID
 *
 * The ID is taken from mnt_group_ida with a minimum value of 1 and stored
 * in mnt->mnt_group_id.  Returns 0 on success or the negative value
 * returned by ida_alloc_min().
 */
static int mnt_alloc_group_id(struct mount *mnt)
{
	int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL);

	if (res < 0)
		return res;
	mnt->mnt_group_id = res;
	return 0;
}

/*
 * Release a peer group ID
 *
 * Gives mnt->mnt_group_id back to mnt_group_ida and resets the field to 0.
 */
void mnt_release_group_id(struct mount *mnt)
{
	ida_free(&mnt_group_ida, mnt->mnt_group_id);
	mnt->mnt_group_id = 0;
}

/*
 * vfsmount lock must be held for read
 *
 * Adds @n (which may be negative) to the mount's reference count: the
 * per-CPU counter on SMP, or the plain mnt_count field with preemption
 * disabled otherwise.
 */
static inline void mnt_add_count(struct mount *mnt, int n)
{
#ifdef CONFIG_SMP
	this_cpu_add(mnt->mnt_pcp->mnt_count, n);
#else
	preempt_disable();
	mnt->mnt_count += n;
	preempt_enable();
#endif
}

/*
 * vfsmount lock must be held for write
 *
 * Returns the reference count: on SMP the sum of the per-CPU mnt_count
 * values over all possible CPUs, otherwise the single mnt_count field.
 */
int mnt_get_count(struct mount *mnt)
{
#ifdef CONFIG_SMP
	int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count;
	}

	return count;
#else
	return mnt->mnt_count;
#endif
}

/*
 * Allocate and initialise a struct mount.
 * @name: device name to duplicate, or NULL to use the literal "none"
 *
 * The object comes zeroed from mnt_cache, gets its ids from mnt_alloc_id(),
 * a device name, an initial reference count of 1 and initialised list/hash
 * linkage; its idmap is set to nop_mnt_idmap.
 *
 * Returns the new mount, or NULL if any allocation step failed (everything
 * acquired so far is released again on the error paths below).
 */
static struct mount *alloc_vfsmnt(const char *name)
{
	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
	if (mnt) {
		int err;

		err = mnt_alloc_id(mnt);
		if (err)
			goto out_free_cache;

		if (name)
			mnt->mnt_devname = kstrdup_const(name,
							 GFP_KERNEL_ACCOUNT);
		else
			mnt->mnt_devname = "none";
		/* only the kstrdup_const() branch can leave this NULL */
		if (!mnt->mnt_devname)
			goto out_free_id;

#ifdef CONFIG_SMP
		mnt->mnt_pcp = alloc_percpu(struct mnt_pcp);
		if (!mnt->mnt_pcp)
			goto out_free_devname;

		this_cpu_add(mnt->mnt_pcp->mnt_count, 1);
#else
		mnt->mnt_count = 1;
		mnt->mnt_writers = 0;
#endif

		INIT_HLIST_NODE(&mnt->mnt_hash);
		INIT_LIST_HEAD(&mnt->mnt_child);
		INIT_LIST_HEAD(&mnt->mnt_mounts);
		INIT_LIST_HEAD(&mnt->mnt_list);
		INIT_LIST_HEAD(&mnt->mnt_expire);
		INIT_LIST_HEAD(&mnt->mnt_share);
		INIT_HLIST_HEAD(&mnt->mnt_slave_list);
		INIT_HLIST_NODE(&mnt->mnt_slave);
		INIT_HLIST_NODE(&mnt->mnt_mp_list);
		INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
		INIT_HLIST_NODE(&mnt->mnt_ns_visible);
		RB_CLEAR_NODE(&mnt->mnt_node);
		mnt->mnt.mnt_idmap = &nop_mnt_idmap;
	}
	/* also reached with mnt == NULL when the cache allocation failed */
	return mnt;

	/* error unwinding, in reverse order of acquisition */
#ifdef CONFIG_SMP
out_free_devname:
	kfree_const(mnt->mnt_devname);
#endif
out_free_id:
	mnt_free_id(mnt);
out_free_cache:
	kmem_cache_free(mnt_cache, mnt);
	return NULL;
}

/*
 * Most r/o checks on a fs are for operations that take
 * discrete amounts of time, like a write() or unlink().
 * We must keep track of when those operations start
 * (for permission checks) and when they end, so that
 * we can determine when writes are able to occur to
 * a filesystem.
 */
/*
 * __mnt_is_readonly: check whether a mount is read-only
 * @mnt: the mount to check for its write status
 *
 * This shouldn't be used directly outside of the VFS.
 * It does not guarantee that the filesystem will stay
 * r/w, just that it is right *now*.  This can not and
 * should not be used in place of IS_RDONLY(inode).
 * mnt_want/drop_write() will _keep_ the filesystem
 * r/w.
 *
 * Returns true if either the mount has MNT_READONLY set or its
 * superblock is read-only according to sb_rdonly().
 */
bool __mnt_is_readonly(const struct vfsmount *mnt)
{
	return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(__mnt_is_readonly);

/* Bump the writer count (this CPU's counter on SMP). */
static inline void mnt_inc_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_inc(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers++;
#endif
}

/* Drop the writer count (this CPU's counter on SMP). */
static inline void mnt_dec_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	this_cpu_dec(mnt->mnt_pcp->mnt_writers);
#else
	mnt->mnt_writers--;
#endif
}

/*
 * Returns the writer count: on SMP the sum of the per-CPU mnt_writers
 * values over all possible CPUs, otherwise the single mnt_writers field.
 */
static unsigned int mnt_get_writers(struct mount *mnt)
{
#ifdef CONFIG_SMP
	unsigned int count = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers;
	}

	return count;
#else
	return mnt->mnt_writers;
#endif
}

/*
 * Returns 1 while the superblock has s_readonly_remount set; otherwise the
 * result of __mnt_is_readonly() evaluated after a read barrier.
 */
static int mnt_is_readonly(const struct vfsmount *mnt)
{
	if (READ_ONCE(mnt->mnt_sb->s_readonly_remount))
		return 1;
	/*
	 * The barrier pairs with the barrier in sb_start_ro_state_change()
	 * making sure if we don't see s_readonly_remount set yet, we also will
	 * not see any superblock / mount flag changes done by remount.
	 * It also pairs with the barrier in sb_end_ro_state_change()
	 * assuring that if we see s_readonly_remount already cleared, we will
	 * see the values of superblock / mount flags updated by remount.
	 */
	smp_rmb();
	return __mnt_is_readonly(mnt);
}

/*
 * Most r/o & frozen checks on a fs are for operations that take discrete
 * amounts of time, like a write() or unlink().  We must keep track of when
 * those operations start (for permission checks) and when they end, so that we
 * can determine when writes are able to occur to a filesystem.
 */
/**
 * mnt_get_write_access - get write access to a mount without freeze protection
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write) before
 * returning success. This operation does not protect against filesystem being
 * frozen. When the write operation is finished, mnt_put_write_access() must be
 * called. This is effectively a refcount.
 *
 * Return: 0 on success, -EROFS if the mount turned out to be read-only (the
 * writer count taken here is dropped again in that case).
 */
int mnt_get_write_access(struct vfsmount *m)
{
	struct mount *mnt = real_mount(m);
	int ret = 0;

	preempt_disable();
	mnt_inc_writers(mnt);
	/*
	 * The store to mnt_inc_writers must be visible before we pass
	 * WRITE_HOLD loop below, so that the slowpath can see our
	 * incremented count after it has set WRITE_HOLD.
	 */
	smp_mb();
	might_lock(&mount_lock.lock);
	while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) {
		if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
			cpu_relax();
		} else {
			/*
			 * This prevents priority inversion, if the task
			 * setting WRITE_HOLD got preempted on a remote
			 * CPU, and it prevents live lock if the task setting
			 * WRITE_HOLD has a lower priority and is bound to
			 * the same CPU as the task that is spinning here.
			 */
			preempt_enable();
			read_seqlock_excl(&mount_lock);
			read_sequnlock_excl(&mount_lock);
			preempt_disable();
		}
	}
	/*
	 * The barrier pairs with the barrier sb_start_ro_state_change() making
	 * sure that if we see WRITE_HOLD cleared, we will also see
	 * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in
	 * mnt_is_readonly() and bail in case we are racing with remount
	 * read-only.
	 */
	smp_rmb();
	if (mnt_is_readonly(m)) {
		mnt_dec_writers(mnt);
		ret = -EROFS;
	}
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(mnt_get_write_access);

/**
 * mnt_want_write - get write access to a mount
 * @m: the mount on which to take a write
 *
 * This tells the low-level filesystem that a write is about to be performed to
 * it, and makes sure that writes are allowed (mount is read-write, filesystem
 * is not frozen) before returning success.  When the write operation is
 * finished, mnt_drop_write() must be called.  This is effectively a refcount.
 *
 * Return: 0 on success, otherwise the error from mnt_get_write_access(); on
 * failure the sb_start_write() taken here has already been undone.
 */
int mnt_want_write(struct vfsmount *m)
{
	int ret;

	sb_start_write(m->mnt_sb);
	ret = mnt_get_write_access(m);
	if (ret)
		sb_end_write(m->mnt_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write);

/**
 * mnt_get_write_access_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like mnt_get_write_access, but if @file is already open for write it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the check for emergency r/o remounts.  This must be
 * paired with mnt_put_write_access_file.
 *
 * Return: 0 on success, -EROFS if the mount is read-only.
 */
int mnt_get_write_access_file(struct file *file)
{
	if (file->f_mode & FMODE_WRITER) {
		/*
		 * Superblock may have become readonly while there are still
		 * writable fd's, e.g. due to a fs error with errors=remount-ro
		 */
		if (__mnt_is_readonly(file->f_path.mnt))
			return -EROFS;
		return 0;
	}
	return mnt_get_write_access(file->f_path.mnt);
}

/**
 * mnt_want_write_file - get write access to a file's mount
 * @file: the file whose mount on which to take a write
 *
 * This is like mnt_want_write, but if the file is already open for writing it
 * skips incrementing mnt_writers (since the open file already has a reference)
 * and instead only does the freeze protection and the check for emergency r/o
 * remounts.  This must be paired with mnt_drop_write_file.
 *
 * Return: 0 on success, otherwise the error from mnt_get_write_access_file();
 * on failure the sb_start_write() taken here has already been undone.
 */
int mnt_want_write_file(struct file *file)
{
	int ret;

	sb_start_write(file_inode(file)->i_sb);
	ret = mnt_get_write_access_file(file);
	if (ret)
		sb_end_write(file_inode(file)->i_sb);
	return ret;
}
EXPORT_SYMBOL_GPL(mnt_want_write_file);

/**
 * mnt_put_write_access - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done
 * performing writes to it.  Must be matched with
 * mnt_get_write_access() call above.
 */
void mnt_put_write_access(struct vfsmount *mnt)
{
	/* the writer counter is decremented with preemption disabled */
	preempt_disable();
	mnt_dec_writers(real_mount(mnt));
	preempt_enable();
}
EXPORT_SYMBOL_GPL(mnt_put_write_access);

/**
 * mnt_drop_write - give up write access to a mount
 * @mnt: the mount on which to give up write access
 *
 * Tells the low-level filesystem that we are done performing writes to it and
 * also allows filesystem to be frozen again.  Must be matched with
 * mnt_want_write() call above.
*/
void mnt_drop_write(struct vfsmount *mnt)
{
	mnt_put_write_access(mnt);
	sb_end_write(mnt->mnt_sb);
}
EXPORT_SYMBOL_GPL(mnt_drop_write);

/*
 * Counterpart of mnt_get_write_access_file(): the writer count is only
 * dropped when @file lacks FMODE_WRITER, i.e. when the "get" side actually
 * took one.
 */
void mnt_put_write_access_file(struct file *file)
{
	if (!(file->f_mode & FMODE_WRITER))
		mnt_put_write_access(file->f_path.mnt);
}

/*
 * Counterpart of mnt_want_write_file(): give up the write access and end
 * the sb_start_write() section on the file's superblock.
 */
void mnt_drop_write_file(struct file *file)
{
	mnt_put_write_access_file(file);
	sb_end_write(file_inode(file)->i_sb);
}
EXPORT_SYMBOL(mnt_drop_write_file);

/**
 * mnt_hold_writers - prevent write access to the given mount
 * @mnt: mnt to prevent write access to
 *
 * Prevents write access to @mnt if there are no active writers for @mnt.
 * This function needs to be called and return successfully before changing
 * properties of @mnt that need to remain stable for callers with write access
 * to @mnt.
 *
 * After this function has been called successfully callers must pair it with
 * a call to mnt_unhold_writers() in order to stop preventing write access to
 * @mnt.
 *
 * Note that WRITE_HOLD is set before the writer count is examined, so it is
 * still set when -EBUSY is returned; both callers in this file clear it
 * afterwards regardless of the result.
 *
 * Context: This function expects to be in mount_locked_reader scope serializing
 *          setting WRITE_HOLD.
 * Return: On success 0 is returned.
 *	   On error, -EBUSY is returned.
 */
static inline int mnt_hold_writers(struct mount *mnt)
{
	set_write_hold(mnt);
	/*
	 * After storing WRITE_HOLD, we'll read the counters. This store
	 * should be visible before we do.
	 */
	smp_mb();

	/*
	 * With writers on hold, if this value is zero, then there are
	 * definitely no active writers (although held writers may subsequently
	 * increment the count, they'll have to wait, and decrement it after
	 * seeing MNT_READONLY).
	 *
	 * It is OK to have counter incremented on one CPU and decremented on
	 * another: the sum will add up correctly. The danger would be when we
	 * sum up each counter, if we read a counter before it is incremented,
	 * but then read another CPU's count which it has been subsequently
	 * decremented from -- we would see more decrements than we should.
	 * WRITE_HOLD protects against this scenario, because
	 * mnt_want_write first increments count, then smp_mb, then spins on
	 * WRITE_HOLD, so it can't be decremented by another CPU while
	 * we're counting up here.
	 */
	if (mnt_get_writers(mnt) > 0)
		return -EBUSY;

	return 0;
}

/**
 * mnt_unhold_writers - stop preventing write access to the given mount
 * @mnt: mnt to stop preventing write access to
 *
 * Stop preventing write access to @mnt allowing callers to gain write access
 * to @mnt again.
 *
 * This function can only be called after a call to mnt_hold_writers().
 * It does nothing if WRITE_HOLD is not currently set on @mnt.
 *
 * Context: This function expects to be in the same mount_locked_reader scope
 * as the matching mnt_hold_writers().
 */
static inline void mnt_unhold_writers(struct mount *mnt)
{
	if (!test_write_hold(mnt))
		return;
	/*
	 * MNT_READONLY must become visible before ~WRITE_HOLD, so writers
	 * that become unheld will see MNT_READONLY.
	 */
	smp_wmb();
	clear_write_hold(mnt);
}

/*
 * Unlink @m from its superblock's list of mounts.  The list is chained
 * through mnt_next_for_sb, with mnt_pprev_for_sb pointing at whatever
 * pointer currently refers to the entry.
 */
static inline void mnt_del_instance(struct mount *m)
{
	struct mount **p = m->mnt_pprev_for_sb;
	struct mount *next = m->mnt_next_for_sb;

	if (next)
		next->mnt_pprev_for_sb = p;
	*p = next;
}

/* Insert @m at the head of @s->s_mounts (see mnt_del_instance() for layout). */
static inline void mnt_add_instance(struct mount *m, struct super_block *s)
{
	struct mount *first = s->s_mounts;

	if (first)
		first->mnt_pprev_for_sb = &m->mnt_next_for_sb;
	m->mnt_next_for_sb = first;
	m->mnt_pprev_for_sb = &s->s_mounts;
	s->s_mounts = m;
}

/*
 * Set MNT_READONLY on @mnt provided no writers are active.
 *
 * Returns 0 on success or -EBUSY from mnt_hold_writers(); WRITE_HOLD is
 * released again in both cases.
 */
static int mnt_make_readonly(struct mount *mnt)
{
	int ret;

	ret = mnt_hold_writers(mnt);
	if (!ret)
		mnt->mnt.mnt_flags |= MNT_READONLY;
	mnt_unhold_writers(mnt);
	return ret;
}

/*
 * Check whether @sb can be switched to read-only.
 *
 * Every mount of @sb that is not already MNT_READONLY is put on write hold;
 * the walk stops at the first one with active writers.  s_remove_count is
 * checked once before taking the lock and again after the holds are in
 * place.  On success sb_start_ro_state_change() is called.  WRITE_HOLD is
 * cleared again on every mount of @sb before returning.
 *
 * Returns 0 on success, -EBUSY otherwise.
 */
int sb_prepare_remount_readonly(struct super_block *sb)
{
	int err = 0;

	/* Racy optimization.  Recheck the counter under WRITE_HOLD */
	if (atomic_long_read(&sb->s_remove_count))
		return -EBUSY;

	guard(mount_locked_reader)();

	for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
		if (!(m->mnt.mnt_flags & MNT_READONLY)) {
			err = mnt_hold_writers(m);
			if (err)
				break;
		}
	}
	if (!err && atomic_long_read(&sb->s_remove_count))
		err = -EBUSY;

	if (!err)
		sb_start_ro_state_change(sb);
	for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) {
		if (test_write_hold(m))
			clear_write_hold(m);
	}

	return err;
}

/*
 * Release everything owned by the struct mount itself: the idmap reference,
 * the device name, the per-CPU counters (SMP only) and finally the object.
 */
static void free_vfsmnt(struct mount *mnt)
{
	mnt_idmap_put(mnt_idmap(&mnt->mnt));
	kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
	free_percpu(mnt->mnt_pcp);
#endif
	kmem_cache_free(mnt_cache, mnt);
}

/* RCU callback wrapper around free_vfsmnt(); @head is the mount's mnt_rcu. */
static void delayed_free_vfsmnt(struct rcu_head *head)
{
	free_vfsmnt(container_of(head, struct mount, mnt_rcu));
}

/*
 * call under rcu_read_lock
 *
 * Try to turn @bastard, found under the mount_lock sequence @seq, into a
 * counted reference.
 *
 * Returns:
 *   0  - success: either @bastard is NULL, or a reference was taken and the
 *        sequence is still valid;
 *   1  - failure with no reference held (the sequence changed before the
 *        count was taken, or the mount has MNT_SYNC_UMOUNT / MNT_DOOMED set
 *        and the count was dropped again);
 *  -1  - failure with a reference still held; the caller has to mntput().
 */
int __legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	struct mount *mnt;
	if (read_seqretry(&mount_lock, seq))
		return 1;
	if (bastard == NULL)
		return 0;
	mnt = real_mount(bastard);
	mnt_add_count(mnt, 1);
	smp_mb();		// see mntput_no_expire() and do_umount()
	if (likely(!read_seqretry(&mount_lock, seq)))
		return 0;
	lock_mount_hash();
	if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) {
		mnt_add_count(mnt, -1);
		unlock_mount_hash();
		return 1;
	}
	unlock_mount_hash();
	/* caller will mntput() */
	return -1;
}

/*
 * call under rcu_read_lock
 *
 * Boolean wrapper around __legitimize_mnt().  When a reference was taken
 * but the attempt failed, the reference is dropped with mntput(); the RCU
 * read lock is released around that call and re-taken afterwards.
 */
static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq)
{
	int res = __legitimize_mnt(bastard, seq);
	if (likely(!res))
		return true;
	if (unlikely(res < 0)) {
		rcu_read_unlock();
		mntput(bastard);
		rcu_read_lock();
	}
	return false;
}

/**
 * __lookup_mnt - mount hash lookup
 * @mnt:	parent mount
 * @dentry:	dentry of mountpoint
 *
 * If @mnt has a child mount @c mounted on @dentry find and return it.
 * Caller must either hold the spinlock component of @mount_lock or
 * hold rcu_read_lock(), sample the seqcount component before the call
 * and recheck it afterwards.
 *
 * Return: The child of @mnt mounted on @dentry or %NULL.
 */
struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
{
	struct hlist_head *head = m_hash(mnt, dentry);
	struct mount *p;

	hlist_for_each_entry_rcu(p, head, mnt_hash)
		if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry)
			return p;
	return NULL;
}

/**
 * lookup_mnt - Return the child mount mounted at given location
 * @path: location in the namespace
 *
 * Acquires and returns a new reference to mount at given location
 * or %NULL if nothing is mounted there.
807 */ 808 struct vfsmount *lookup_mnt(const struct path *path) 809 { 810 struct mount *child_mnt; 811 struct vfsmount *m; 812 unsigned seq; 813 814 rcu_read_lock(); 815 do { 816 seq = read_seqbegin(&mount_lock); 817 child_mnt = __lookup_mnt(path->mnt, path->dentry); 818 m = child_mnt ? &child_mnt->mnt : NULL; 819 } while (!legitimize_mnt(m, seq)); 820 rcu_read_unlock(); 821 return m; 822 } 823 824 /* 825 * __is_local_mountpoint - Test to see if dentry is a mountpoint in the 826 * current mount namespace. 827 * 828 * The common case is dentries are not mountpoints at all and that 829 * test is handled inline. For the slow case when we are actually 830 * dealing with a mountpoint of some kind, walk through all of the 831 * mounts in the current mount namespace and test to see if the dentry 832 * is a mountpoint. 833 * 834 * The mount_hashtable is not usable in the context because we 835 * need to identify all mounts that may be in the current mount 836 * namespace not just a mount that happens to have some specified 837 * parent mount. 
*/
bool __is_local_mountpoint(const struct dentry *dentry)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct mount *mnt, *n;

	/* namespace_sem is held shared for the whole walk */
	guard(namespace_shared)();

	rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node)
		if (mnt->mnt_mountpoint == dentry)
			return true;

	return false;
}

/*
 * A caller-provided pin on a struct mountpoint: while @node is linked on
 * mp->m_list the list is not empty, so maybe_free_mountpoint() will not
 * free @mp.
 * NOTE(review): @parent is not touched in this part of the file.
 */
struct pinned_mountpoint {
	struct hlist_node node;
	struct mountpoint *mp;
	struct mount *parent;
};

/*
 * Search the mp_hash() chain for the mountpoint of @dentry.  On a hit the
 * pin @m is linked onto its m_list, m->mp is set and true is returned;
 * otherwise @m is left untouched and false is returned.  Both callers in
 * this file invoke it under read_seqlock_excl(&mount_lock).
 */
static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
{
	struct hlist_head *chain = mp_hash(dentry);
	struct mountpoint *mp;

	hlist_for_each_entry(mp, chain, m_hash) {
		if (mp->m_dentry == dentry) {
			hlist_add_head(&m->node, &mp->m_list);
			m->mp = mp;
			return true;
		}
	}
	return false;
}

/*
 * Find or create the struct mountpoint for @dentry and pin it through @m.
 *
 * Returns 0 with m->mp set on success, -ENOENT if @dentry is already a
 * mountpoint but unlinked, -ENOMEM if the allocation failed, or whatever
 * other error d_set_mounted() reports.
 */
static int get_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m)
{
	/* freed automatically on return unless ownership moves via no_free_ptr() */
	struct mountpoint *mp __free(kfree) = NULL;
	bool found;
	int ret;

	if (d_mountpoint(dentry)) {
		/* might be worth a WARN_ON() */
		if (d_unlinked(dentry))
			return -ENOENT;
mountpoint:
		/* also entered via goto when d_set_mounted() returned -EBUSY */
		read_seqlock_excl(&mount_lock);
		found = lookup_mountpoint(dentry, m);
		read_sequnlock_excl(&mount_lock);
		if (found)
			return 0;
	}

	/* on a retry the object allocated by the previous pass is reused */
	if (!mp)
		mp = kmalloc_obj(struct mountpoint);
	if (!mp)
		return -ENOMEM;

	/* Exactly one process may set d_mounted */
	ret = d_set_mounted(dentry);

	/* Someone else set d_mounted? */
	if (ret == -EBUSY)
		goto mountpoint;

	/* The dentry is not available as a mountpoint? */
	if (ret)
		return ret;

	/* Add the new mountpoint to the hash table */
	read_seqlock_excl(&mount_lock);
	mp->m_dentry = dget(dentry);
	hlist_add_head(&mp->m_hash, mp_hash(dentry));
	INIT_HLIST_HEAD(&mp->m_list);
	hlist_add_head(&m->node, &mp->m_list);
	m->mp = no_free_ptr(mp);
	read_sequnlock_excl(&mount_lock);
	return 0;
}

/*
 * vfsmount lock must be held.  Additionally, the caller is responsible
 * for serializing calls for given disposal list.
 *
 * If nothing is linked on mp->m_list any more, clear DCACHE_MOUNTED on the
 * dentry, hand the dentry to @list via dput_to_list(), unhash @mp and free
 * it.  Otherwise do nothing.
 */
static void maybe_free_mountpoint(struct mountpoint *mp, struct list_head *list)
{
	if (hlist_empty(&mp->m_list)) {
		struct dentry *dentry = mp->m_dentry;
		spin_lock(&dentry->d_lock);
		dentry->d_flags &= ~DCACHE_MOUNTED;
		spin_unlock(&dentry->d_lock);
		dput_to_list(dentry, list);
		hlist_del(&mp->m_hash);
		kfree(mp);
	}
}

/*
 * locks: mount_lock [read_seqlock_excl], namespace_sem [excl]
 *
 * Remove the pin @m (if it holds one) and free the mountpoint if that was
 * the last user; the dentry goes onto ex_mountpoints.
 */
static void unpin_mountpoint(struct pinned_mountpoint *m)
{
	if (m->mp) {
		hlist_del(&m->node);
		maybe_free_mountpoint(m->mp, &ex_mountpoints);
	}
}

/* Non-zero iff @mnt belongs to the mount namespace of the current task. */
static inline int check_mnt(const struct mount *mnt)
{
	return mnt->mnt_ns == current->nsproxy->mnt_ns;
}

/*
 * True iff @mnt sits in an anonymous namespace whose seq_origin is either
 * zero or equal to the ns_id of the current task's mount namespace.
 */
static inline bool check_anonymous_mnt(struct mount *mnt)
{
	u64 seq;

	if (!is_anon_ns(mnt->mnt_ns))
		return false;

	seq = mnt->mnt_ns->seq_origin;
	return !seq || (seq == current->nsproxy->mnt_ns->ns.ns_id);
}

/*
 * vfsmount lock must be held for write
 *
 * Advance the global event counter, record the new value in @ns and wake
 * up waiters on ns->poll.  A NULL @ns is ignored.
 */
static void touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns) {
		ns->event = ++event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * vfsmount lock must be held for write
 *
 * Like touch_mnt_namespace() but without advancing the global counter:
 * @ns is only brought up to the current value (and waiters woken) if it
 * lags behind.
 */
static void __touch_mnt_namespace(struct mnt_namespace *ns)
{
	if (ns && ns->event != event) {
		ns->event = event;
		wake_up_interruptible(&ns->poll);
	}
}

/*
 * locks: mount_lock[write_seqlock]
 *
 * Detach @mnt from its parent: clear the parent's overmount pointer if it
 * referred to @mnt, make @mnt its own parent with its own root as
 * mountpoint, unlink it from the child list, the mount hash and the
 * mountpoint's list, and let maybe_free_mountpoint() dispose of the
 * mountpoint onto @shrink_list if it became unused.
 */
static void __umount_mnt(struct mount *mnt, struct list_head *shrink_list)
{
	struct mountpoint *mp;
	struct mount *parent = mnt->mnt_parent;
	if (unlikely(parent->overmount == mnt))
		parent->overmount = NULL;
	mnt->mnt_parent = mnt;
	mnt->mnt_mountpoint = mnt->mnt.mnt_root;
	list_del_init(&mnt->mnt_child);
	hlist_del_init_rcu(&mnt->mnt_hash);
	hlist_del_init(&mnt->mnt_mp_list);
	mp = mnt->mnt_mp;
	mnt->mnt_mp = NULL;
	maybe_free_mountpoint(mp, shrink_list);
}

/*
 * locks: mount_lock[write_seqlock], namespace_sem[excl] (for ex_mountpoints)
 *
 * __umount_mnt() with ex_mountpoints as the disposal list.
 */
static void umount_mnt(struct mount *mnt)
{
	__umount_mnt(mnt, &ex_mountpoints);
}

/*
 * vfsmount lock must be held for write
 *
 * Record that @child_mnt is mounted on @mp of parent @mnt: sets the child's
 * mountpoint dentry, parent and mnt_mp, and links the child on mp->m_list.
 * The child is not yet added to the mount hash or the parent's child list;
 * make_visible() does that.
 */
void mnt_set_mountpoint(struct mount *mnt,
			struct mountpoint *mp,
			struct mount *child_mnt)
{
	child_mnt->mnt_mountpoint = mp->m_dentry;
	child_mnt->mnt_parent = mnt;
	child_mnt->mnt_mp = mp;
	hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}

/*
 * Publish @mnt under its (already set) parent: note it as the parent's
 * overmount when it sits on the parent's root dentry, add it to the mount
 * hash and append it to the parent's list of children.
 */
static void make_visible(struct mount *mnt)
{
	struct mount *parent = mnt->mnt_parent;
	if (unlikely(mnt->mnt_mountpoint == parent->mnt.mnt_root))
		parent->overmount = mnt;
	hlist_add_head_rcu(&mnt->mnt_hash,
			   m_hash(&parent->mnt, mnt->mnt_mountpoint));
	list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
}

/**
 * attach_mnt - mount a mount, attach to @mount_hashtable and parent's
 *              list of child mounts
 * @mnt:     the new mount
 * @parent:  the parent
 * @mp:      the new mountpoint
 *
 * Mount @mnt at @mp on @parent. Then attach @mnt
 * to @parent's child mount list and to @mount_hashtable.
 *
 * Note, when make_visible() is called @mnt->mnt_parent already points
 * to the correct parent.
*
 * Context: This function expects namespace_lock() and lock_mount_hash()
 *          to have been acquired in that order.
 */
static void attach_mnt(struct mount *mnt, struct mount *parent,
		       struct mountpoint *mp)
{
	/* note the argument order: mnt_set_mountpoint() takes the parent first */
	mnt_set_mountpoint(parent, mp, mnt);
	make_visible(mnt);
}

/*
 * Move @mnt to mountpoint @mp of @parent: unlink it from its current child
 * list, mountpoint list and hash chain, attach it at the new location and
 * finally let the old mountpoint be freed (onto ex_mountpoints) if @mnt was
 * its last user.
 */
void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
	struct mountpoint *old_mp = mnt->mnt_mp;

	list_del_init(&mnt->mnt_child);
	hlist_del_init(&mnt->mnt_mp_list);
	hlist_del_init_rcu(&mnt->mnt_hash);

	attach_mnt(mnt, parent, mp);

	maybe_free_mountpoint(old_mp, &ex_mountpoints);
}

/* Map an rbtree node to its struct mount; NULL stays NULL. */
static inline struct mount *node_to_mount(struct rb_node *node)
{
	return node ? rb_entry(node, struct mount, mnt_node) : NULL;
}

/*
 * Insert @mnt into the rbtree of @ns, which is ordered by mnt_id_unique,
 * and set mnt->mnt_ns.  Warns if @mnt is already attached to a namespace.
 *
 * While descending, the function tracks whether it ever went left or right:
 * never going left means @mnt has the largest id and becomes
 * ns->mnt_last_node, never going right means it has the smallest id and
 * becomes ns->mnt_first_node.
 */
static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{
	struct rb_node **link = &ns->mounts.rb_node;
	struct rb_node *parent = NULL;
	bool mnt_first_node = true, mnt_last_node = true;

	WARN_ON(mnt_ns_attached(mnt));
	WRITE_ONCE(mnt->mnt_ns, ns);
	while (*link) {
		parent = *link;
		if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
			link = &parent->rb_left;
			mnt_last_node = false;
		} else {
			link = &parent->rb_right;
			mnt_first_node = false;
		}
	}

	if (mnt_last_node)
		ns->mnt_last_node = &mnt->mnt_node;
	if (mnt_first_node)
		ns->mnt_first_node = &mnt->mnt_node;
	rb_link_node(&mnt->mnt_node, parent, link);
	rb_insert_color(&mnt->mnt_node, &ns->mounts);

	/*
	 * Mounts of the root of a FS_USERNS_MOUNT_RESTRICTED filesystem are
	 * additionally kept on ns->mnt_visible_mounts.
	 */
	if ((mnt->mnt.mnt_sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED) &&
	    mnt->mnt.mnt_root == mnt->mnt.mnt_sb->s_root)
		hlist_add_head(&mnt->mnt_ns_visible, &ns->mnt_visible_mounts);

	mnt_notify_add(mnt);
}

/*
 * Return the mount following @p in a pre-order walk of the tree rooted at
 * @root: the first child of @p if it has one, otherwise the next sibling of
 * @p or of its closest ancestor below @root that has one.  Returns NULL
 * once the walk has come back up to @root.
 */
static struct mount *next_mnt(struct mount *p, struct mount *root)
{
	struct list_head *next = p->mnt_mounts.next;
	if (next == &p->mnt_mounts) {
		/* no children: climb until a next sibling exists */
		while (1) {
			if (p == root)
				return NULL;
			next = p->mnt_child.next;
			if (next != &p->mnt_parent->mnt_mounts)
				break;
			p = p->mnt_parent;
		}
	}
	return list_entry(next, struct mount, mnt_child);
}

/*
 * Return the last mount of the subtree rooted at @p, found by repeatedly
 * descending into the last child; @p itself if it has no children.
 */
static struct mount *skip_mnt_tree(struct mount *p)
{
	struct list_head *prev = p->mnt_mounts.prev;
	while (prev != &p->mnt_mounts) {
		p = list_entry(prev, struct mount, mnt_child);
		prev = p->mnt_mounts.prev;
	}
	return p;
}

/*
 * vfsmount lock must be held for write
 *
 * Make the tree rooted at @mnt part of its parent's namespace.  Unless @mnt
 * is already attached to a namespace, every mount in the tree is added to
 * the namespace and the pending mount count is folded into nr_mounts.
 * Then @mnt is made visible under its parent and the namespace event
 * counter is bumped.
 */
static void commit_tree(struct mount *mnt)
{
	struct mnt_namespace *n = mnt->mnt_parent->mnt_ns;

	if (!mnt_ns_attached(mnt)) {
		for (struct mount *m = mnt; m; m = next_mnt(m, mnt))
			mnt_add_to_ns(n, m);
		n->nr_mounts += n->pending_mounts;
		n->pending_mounts = 0;
	}

	make_visible(mnt);
	touch_mnt_namespace(n);
}

/*
 * Bind the freshly allocated mount @m to @root and its superblock: take an
 * s_active reference and a dentry reference, make @m its own parent with
 * @root as mountpoint, and link @m on the superblock's mount list under the
 * mount_locked_reader guard.
 */
static void setup_mnt(struct mount *m, struct dentry *root)
{
	struct super_block *s = root->d_sb;

	atomic_inc(&s->s_active);
	m->mnt.mnt_sb = s;
	m->mnt.mnt_root = dget(root);
	m->mnt_mountpoint = m->mnt.mnt_root;
	m->mnt_parent = m;

	guard(mount_locked_reader)();
	mnt_add_instance(m, s);
}

/**
 * vfs_create_mount - Create a mount for a configured superblock
 * @fc: The configuration context with the superblock attached
 *
 * Create a mount to an already configured superblock.  If necessary, the
 * caller should invoke vfs_get_tree() before calling this.
 *
 * Note that this does not attach the mount to anything.
1175 */ 1176 struct vfsmount *vfs_create_mount(struct fs_context *fc) 1177 { 1178 struct mount *mnt; 1179 1180 if (!fc->root) 1181 return ERR_PTR(-EINVAL); 1182 1183 mnt = alloc_vfsmnt(fc->source); 1184 if (!mnt) 1185 return ERR_PTR(-ENOMEM); 1186 1187 if (fc->sb_flags & SB_KERNMOUNT) 1188 mnt->mnt.mnt_flags = MNT_INTERNAL; 1189 1190 setup_mnt(mnt, fc->root); 1191 1192 return &mnt->mnt; 1193 } 1194 EXPORT_SYMBOL(vfs_create_mount); 1195 1196 struct vfsmount *fc_mount(struct fs_context *fc) 1197 { 1198 int err = vfs_get_tree(fc); 1199 if (!err) { 1200 up_write(&fc->root->d_sb->s_umount); 1201 return vfs_create_mount(fc); 1202 } 1203 return ERR_PTR(err); 1204 } 1205 EXPORT_SYMBOL(fc_mount); 1206 1207 struct vfsmount *fc_mount_longterm(struct fs_context *fc) 1208 { 1209 struct vfsmount *mnt = fc_mount(fc); 1210 if (!IS_ERR(mnt)) 1211 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; 1212 return mnt; 1213 } 1214 EXPORT_SYMBOL(fc_mount_longterm); 1215 1216 struct vfsmount *vfs_kern_mount(struct file_system_type *type, 1217 int flags, const char *name, 1218 void *data) 1219 { 1220 struct fs_context *fc; 1221 struct vfsmount *mnt; 1222 int ret = 0; 1223 1224 if (!type) 1225 return ERR_PTR(-EINVAL); 1226 1227 fc = fs_context_for_mount(type, flags); 1228 if (IS_ERR(fc)) 1229 return ERR_CAST(fc); 1230 1231 if (name) 1232 ret = vfs_parse_fs_string(fc, "source", name); 1233 if (!ret) 1234 ret = parse_monolithic_mount_data(fc, data); 1235 if (!ret) 1236 mnt = fc_mount(fc); 1237 else 1238 mnt = ERR_PTR(ret); 1239 1240 put_fs_context(fc); 1241 return mnt; 1242 } 1243 EXPORT_SYMBOL_GPL(vfs_kern_mount); 1244 1245 static struct mount *clone_mnt(struct mount *old, struct dentry *root, 1246 int flag) 1247 { 1248 struct mount *mnt; 1249 int err; 1250 1251 mnt = alloc_vfsmnt(old->mnt_devname); 1252 if (!mnt) 1253 return ERR_PTR(-ENOMEM); 1254 1255 mnt->mnt.mnt_flags = READ_ONCE(old->mnt.mnt_flags) & 1256 ~MNT_INTERNAL_FLAGS; 1257 1258 if (flag & (CL_SLAVE | CL_PRIVATE)) 1259 mnt->mnt_group_id = 
0; /* not a peer of original */ 1260 else 1261 mnt->mnt_group_id = old->mnt_group_id; 1262 1263 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { 1264 err = mnt_alloc_group_id(mnt); 1265 if (err) 1266 goto out_free; 1267 } 1268 1269 if (mnt->mnt_group_id) 1270 set_mnt_shared(mnt); 1271 1272 mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); 1273 1274 setup_mnt(mnt, root); 1275 1276 if (flag & CL_PRIVATE) // we are done with it 1277 return mnt; 1278 1279 if (peers(mnt, old)) 1280 list_add(&mnt->mnt_share, &old->mnt_share); 1281 1282 if ((flag & CL_SLAVE) && old->mnt_group_id) { 1283 hlist_add_head(&mnt->mnt_slave, &old->mnt_slave_list); 1284 mnt->mnt_master = old; 1285 } else if (IS_MNT_SLAVE(old)) { 1286 hlist_add_behind(&mnt->mnt_slave, &old->mnt_slave); 1287 mnt->mnt_master = old->mnt_master; 1288 } 1289 return mnt; 1290 1291 out_free: 1292 mnt_free_id(mnt); 1293 free_vfsmnt(mnt); 1294 return ERR_PTR(err); 1295 } 1296 1297 static void cleanup_mnt(struct mount *mnt) 1298 { 1299 struct hlist_node *p; 1300 struct mount *m; 1301 /* 1302 * The warning here probably indicates that somebody messed 1303 * up a mnt_want/drop_write() pair. If this happens, the 1304 * filesystem was probably unable to make r/w->r/o transitions. 1305 * The locking used to deal with mnt_count decrement provides barriers, 1306 * so mnt_get_writers() below is safe. 
1307 */ 1308 WARN_ON(mnt_get_writers(mnt)); 1309 if (unlikely(mnt->mnt_pins.first)) 1310 mnt_pin_kill(mnt); 1311 hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { 1312 hlist_del(&m->mnt_umount); 1313 mntput(&m->mnt); 1314 } 1315 fsnotify_vfsmount_delete(&mnt->mnt); 1316 dput(mnt->mnt.mnt_root); 1317 deactivate_super(mnt->mnt.mnt_sb); 1318 mnt_free_id(mnt); 1319 call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); 1320 } 1321 1322 static void __cleanup_mnt(struct rcu_head *head) 1323 { 1324 cleanup_mnt(container_of(head, struct mount, mnt_rcu)); 1325 } 1326 1327 static LLIST_HEAD(delayed_mntput_list); 1328 static void delayed_mntput(struct work_struct *unused) 1329 { 1330 struct llist_node *node = llist_del_all(&delayed_mntput_list); 1331 struct mount *m, *t; 1332 1333 llist_for_each_entry_safe(m, t, node, mnt_llist) 1334 cleanup_mnt(m); 1335 } 1336 static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); 1337 1338 static void noinline mntput_no_expire_slowpath(struct mount *mnt) 1339 { 1340 LIST_HEAD(list); 1341 int count; 1342 1343 VFS_BUG_ON(mnt->mnt_ns); 1344 lock_mount_hash(); 1345 /* 1346 * make sure that if __legitimize_mnt() has not seen us grab 1347 * mount_lock, we'll see their refcount increment here. 
1348 */ 1349 smp_mb(); 1350 mnt_add_count(mnt, -1); 1351 count = mnt_get_count(mnt); 1352 if (count != 0) { 1353 WARN_ON(count < 0); 1354 rcu_read_unlock(); 1355 unlock_mount_hash(); 1356 return; 1357 } 1358 if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { 1359 rcu_read_unlock(); 1360 unlock_mount_hash(); 1361 return; 1362 } 1363 mnt->mnt.mnt_flags |= MNT_DOOMED; 1364 rcu_read_unlock(); 1365 1366 mnt_del_instance(mnt); 1367 if (unlikely(!list_empty(&mnt->mnt_expire))) 1368 list_del(&mnt->mnt_expire); 1369 1370 if (unlikely(!list_empty(&mnt->mnt_mounts))) { 1371 struct mount *p, *tmp; 1372 list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { 1373 __umount_mnt(p, &list); 1374 hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); 1375 } 1376 } 1377 unlock_mount_hash(); 1378 shrink_dentry_list(&list); 1379 1380 if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { 1381 struct task_struct *task = current; 1382 if (likely(!(task->flags & PF_KTHREAD))) { 1383 init_task_work(&mnt->mnt_rcu, __cleanup_mnt); 1384 if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) 1385 return; 1386 } 1387 if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) 1388 schedule_delayed_work(&delayed_mntput_work, 1); 1389 return; 1390 } 1391 cleanup_mnt(mnt); 1392 } 1393 1394 static void mntput_no_expire(struct mount *mnt) 1395 { 1396 rcu_read_lock(); 1397 if (likely(READ_ONCE(mnt->mnt_ns))) { 1398 /* 1399 * Since we don't do lock_mount_hash() here, 1400 * ->mnt_ns can change under us. However, if it's 1401 * non-NULL, then there's a reference that won't 1402 * be dropped until after an RCU delay done after 1403 * turning ->mnt_ns NULL. So if we observe it 1404 * non-NULL under rcu_read_lock(), the reference 1405 * we are dropping is not the final one. 
1406 */ 1407 mnt_add_count(mnt, -1); 1408 rcu_read_unlock(); 1409 return; 1410 } 1411 mntput_no_expire_slowpath(mnt); 1412 } 1413 1414 void mntput(struct vfsmount *mnt) 1415 { 1416 if (mnt) { 1417 struct mount *m = real_mount(mnt); 1418 /* avoid cacheline pingpong */ 1419 if (unlikely(m->mnt_expiry_mark)) 1420 WRITE_ONCE(m->mnt_expiry_mark, 0); 1421 mntput_no_expire(m); 1422 } 1423 } 1424 EXPORT_SYMBOL(mntput); 1425 1426 struct vfsmount *mntget(struct vfsmount *mnt) 1427 { 1428 if (mnt) 1429 mnt_add_count(real_mount(mnt), 1); 1430 return mnt; 1431 } 1432 EXPORT_SYMBOL(mntget); 1433 1434 /* 1435 * Make a mount point inaccessible to new lookups. 1436 * Because there may still be current users, the caller MUST WAIT 1437 * for an RCU grace period before destroying the mount point. 1438 */ 1439 void mnt_make_shortterm(struct vfsmount *mnt) 1440 { 1441 if (mnt) 1442 WRITE_ONCE(real_mount(mnt)->mnt_ns, NULL); 1443 } 1444 1445 /** 1446 * path_is_mountpoint() - Check if path is a mount in the current namespace. 1447 * @path: path to check 1448 * 1449 * d_mountpoint() can only be used reliably to establish if a dentry is 1450 * not mounted in any namespace and that common case is handled inline. 1451 * d_mountpoint() isn't aware of the possibility there may be multiple 1452 * mounts using a given dentry in a different namespace. This function 1453 * checks if the passed in path is a mountpoint rather than the dentry 1454 * alone. 
1455 */ 1456 bool path_is_mountpoint(const struct path *path) 1457 { 1458 unsigned seq; 1459 bool res; 1460 1461 if (!d_mountpoint(path->dentry)) 1462 return false; 1463 1464 rcu_read_lock(); 1465 do { 1466 seq = read_seqbegin(&mount_lock); 1467 res = __path_is_mountpoint(path); 1468 } while (read_seqretry(&mount_lock, seq)); 1469 rcu_read_unlock(); 1470 1471 return res; 1472 } 1473 EXPORT_SYMBOL(path_is_mountpoint); 1474 1475 struct vfsmount *mnt_clone_internal(const struct path *path) 1476 { 1477 struct mount *p; 1478 p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); 1479 if (IS_ERR(p)) 1480 return ERR_CAST(p); 1481 p->mnt.mnt_flags |= MNT_INTERNAL; 1482 return &p->mnt; 1483 } 1484 1485 /* 1486 * Returns the mount which either has the specified mnt_id, or has the next 1487 * smallest id afer the specified one. 1488 */ 1489 static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id) 1490 { 1491 struct rb_node *node = ns->mounts.rb_node; 1492 struct mount *ret = NULL; 1493 1494 while (node) { 1495 struct mount *m = node_to_mount(node); 1496 1497 if (mnt_id <= m->mnt_id_unique) { 1498 ret = node_to_mount(node); 1499 if (mnt_id == m->mnt_id_unique) 1500 break; 1501 node = node->rb_left; 1502 } else { 1503 node = node->rb_right; 1504 } 1505 } 1506 return ret; 1507 } 1508 1509 /* 1510 * Returns the mount which either has the specified mnt_id, or has the next 1511 * greater id before the specified one. 
1512 */ 1513 static struct mount *mnt_find_id_at_reverse(struct mnt_namespace *ns, u64 mnt_id) 1514 { 1515 struct rb_node *node = ns->mounts.rb_node; 1516 struct mount *ret = NULL; 1517 1518 while (node) { 1519 struct mount *m = node_to_mount(node); 1520 1521 if (mnt_id >= m->mnt_id_unique) { 1522 ret = node_to_mount(node); 1523 if (mnt_id == m->mnt_id_unique) 1524 break; 1525 node = node->rb_right; 1526 } else { 1527 node = node->rb_left; 1528 } 1529 } 1530 return ret; 1531 } 1532 1533 #ifdef CONFIG_PROC_FS 1534 1535 /* iterator; we want it to have access to namespace_sem, thus here... */ 1536 static void *m_start(struct seq_file *m, loff_t *pos) 1537 { 1538 struct proc_mounts *p = m->private; 1539 struct mount *mnt; 1540 1541 down_read(&namespace_sem); 1542 1543 mnt = mnt_find_id_at(p->ns, *pos); 1544 if (mnt) 1545 *pos = mnt->mnt_id_unique; 1546 return mnt; 1547 } 1548 1549 static void *m_next(struct seq_file *m, void *v, loff_t *pos) 1550 { 1551 struct mount *mnt = v; 1552 struct rb_node *node = rb_next(&mnt->mnt_node); 1553 1554 if (node) { 1555 struct mount *next = node_to_mount(node); 1556 *pos = next->mnt_id_unique; 1557 return next; 1558 } 1559 1560 /* 1561 * No more mounts. Set pos past current mount's ID so that if 1562 * iteration restarts, mnt_find_id_at() returns NULL. 
1563 */ 1564 *pos = mnt->mnt_id_unique + 1; 1565 return NULL; 1566 } 1567 1568 static void m_stop(struct seq_file *m, void *v) 1569 { 1570 up_read(&namespace_sem); 1571 } 1572 1573 static int m_show(struct seq_file *m, void *v) 1574 { 1575 struct proc_mounts *p = m->private; 1576 struct mount *r = v; 1577 return p->show(m, &r->mnt); 1578 } 1579 1580 const struct seq_operations mounts_op = { 1581 .start = m_start, 1582 .next = m_next, 1583 .stop = m_stop, 1584 .show = m_show, 1585 }; 1586 1587 #endif /* CONFIG_PROC_FS */ 1588 1589 /** 1590 * may_umount_tree - check if a mount tree is busy 1591 * @m: root of mount tree 1592 * 1593 * This is called to check if a tree of mounts has any 1594 * open files, pwds, chroots or sub mounts that are 1595 * busy. 1596 */ 1597 int may_umount_tree(struct vfsmount *m) 1598 { 1599 struct mount *mnt = real_mount(m); 1600 bool busy = false; 1601 1602 /* write lock needed for mnt_get_count */ 1603 lock_mount_hash(); 1604 for (struct mount *p = mnt; p; p = next_mnt(p, mnt)) { 1605 if (mnt_get_count(p) > (p == mnt ? 2 : 1)) { 1606 busy = true; 1607 break; 1608 } 1609 } 1610 unlock_mount_hash(); 1611 1612 return !busy; 1613 } 1614 1615 EXPORT_SYMBOL(may_umount_tree); 1616 1617 /** 1618 * may_umount - check if a mount point is busy 1619 * @mnt: root of mount 1620 * 1621 * This is called to check if a mount point has any 1622 * open files, pwds, chroots or sub mounts. If the 1623 * mount has sub mounts this will return busy 1624 * regardless of whether the sub mounts are busy. 1625 * 1626 * Doesn't take quota and stuff into account. IOW, in some cases it will 1627 * give false negatives. The main reason why it's here is that we need 1628 * a non-destructive way to look for easily umountable filesystems. 
1629 */ 1630 int may_umount(struct vfsmount *mnt) 1631 { 1632 int ret = 1; 1633 down_read(&namespace_sem); 1634 lock_mount_hash(); 1635 if (propagate_mount_busy(real_mount(mnt), 2)) 1636 ret = 0; 1637 unlock_mount_hash(); 1638 up_read(&namespace_sem); 1639 return ret; 1640 } 1641 1642 EXPORT_SYMBOL(may_umount); 1643 1644 #ifdef CONFIG_FSNOTIFY 1645 static void mnt_notify(struct mount *p) 1646 { 1647 if (!p->prev_ns && p->mnt_ns) { 1648 fsnotify_mnt_attach(p->mnt_ns, &p->mnt); 1649 } else if (p->prev_ns && !p->mnt_ns) { 1650 fsnotify_mnt_detach(p->prev_ns, &p->mnt); 1651 } else if (p->prev_ns == p->mnt_ns) { 1652 fsnotify_mnt_move(p->mnt_ns, &p->mnt); 1653 } else { 1654 fsnotify_mnt_detach(p->prev_ns, &p->mnt); 1655 fsnotify_mnt_attach(p->mnt_ns, &p->mnt); 1656 } 1657 p->prev_ns = p->mnt_ns; 1658 } 1659 1660 static void notify_mnt_list(void) 1661 { 1662 struct mount *m, *tmp; 1663 /* 1664 * Notify about mounts that were added/reparented/detached/remain 1665 * connected after unmount. 1666 */ 1667 list_for_each_entry_safe(m, tmp, ¬ify_list, to_notify) { 1668 mnt_notify(m); 1669 list_del_init(&m->to_notify); 1670 } 1671 } 1672 1673 static bool need_notify_mnt_list(void) 1674 { 1675 return !list_empty(¬ify_list); 1676 } 1677 #else 1678 static void notify_mnt_list(void) 1679 { 1680 } 1681 1682 static bool need_notify_mnt_list(void) 1683 { 1684 return false; 1685 } 1686 #endif 1687 1688 static void free_mnt_ns(struct mnt_namespace *); 1689 static void namespace_unlock(void) 1690 { 1691 struct hlist_head head; 1692 struct hlist_node *p; 1693 struct mount *m; 1694 struct mnt_namespace *ns = emptied_ns; 1695 LIST_HEAD(list); 1696 1697 hlist_move_list(&unmounted, &head); 1698 list_splice_init(&ex_mountpoints, &list); 1699 emptied_ns = NULL; 1700 1701 if (need_notify_mnt_list()) { 1702 /* 1703 * No point blocking out concurrent readers while notifications 1704 * are sent. This will also allow statmount()/listmount() to run 1705 * concurrently. 
1706 */ 1707 downgrade_write(&namespace_sem); 1708 notify_mnt_list(); 1709 up_read(&namespace_sem); 1710 } else { 1711 up_write(&namespace_sem); 1712 } 1713 if (unlikely(ns)) { 1714 /* Make sure we notice when we leak mounts. */ 1715 VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); 1716 free_mnt_ns(ns); 1717 } 1718 1719 shrink_dentry_list(&list); 1720 1721 if (likely(hlist_empty(&head))) 1722 return; 1723 1724 synchronize_rcu_expedited(); 1725 1726 hlist_for_each_entry_safe(m, p, &head, mnt_umount) { 1727 hlist_del(&m->mnt_umount); 1728 mntput(&m->mnt); 1729 } 1730 } 1731 1732 static inline void namespace_lock(void) 1733 { 1734 down_write(&namespace_sem); 1735 } 1736 1737 enum umount_tree_flags { 1738 UMOUNT_SYNC = 1, 1739 UMOUNT_PROPAGATE = 2, 1740 UMOUNT_CONNECTED = 4, 1741 }; 1742 1743 static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) 1744 { 1745 /* Leaving mounts connected is only valid for lazy umounts */ 1746 if (how & UMOUNT_SYNC) 1747 return true; 1748 1749 /* A mount without a parent has nothing to be connected to */ 1750 if (!mnt_has_parent(mnt)) 1751 return true; 1752 1753 /* Because the reference counting rules change when mounts are 1754 * unmounted and connected, umounted mounts may not be 1755 * connected to mounted mounts. 1756 */ 1757 if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) 1758 return true; 1759 1760 /* Has it been requested that the mount remain connected? */ 1761 if (how & UMOUNT_CONNECTED) 1762 return false; 1763 1764 /* Is the mount locked such that it needs to remain connected? 
*/ 1765 if (IS_MNT_LOCKED(mnt)) 1766 return false; 1767 1768 /* By default disconnect the mount */ 1769 return true; 1770 } 1771 1772 /* 1773 * mount_lock must be held 1774 * namespace_sem must be held for write 1775 */ 1776 static void umount_tree(struct mount *mnt, enum umount_tree_flags how) 1777 { 1778 LIST_HEAD(tmp_list); 1779 struct mount *p; 1780 1781 if (how & UMOUNT_PROPAGATE) 1782 propagate_mount_unlock(mnt); 1783 1784 /* Gather the mounts to umount */ 1785 for (p = mnt; p; p = next_mnt(p, mnt)) { 1786 p->mnt.mnt_flags |= MNT_UMOUNT; 1787 if (mnt_ns_attached(p)) 1788 move_from_ns(p); 1789 list_add_tail(&p->mnt_list, &tmp_list); 1790 } 1791 1792 /* Hide the mounts from mnt_mounts */ 1793 list_for_each_entry(p, &tmp_list, mnt_list) { 1794 list_del_init(&p->mnt_child); 1795 } 1796 1797 /* Add propagated mounts to the tmp_list */ 1798 if (how & UMOUNT_PROPAGATE) 1799 propagate_umount(&tmp_list); 1800 1801 bulk_make_private(&tmp_list); 1802 1803 while (!list_empty(&tmp_list)) { 1804 struct mnt_namespace *ns; 1805 bool disconnect; 1806 p = list_first_entry(&tmp_list, struct mount, mnt_list); 1807 list_del_init(&p->mnt_expire); 1808 list_del_init(&p->mnt_list); 1809 ns = p->mnt_ns; 1810 if (ns) { 1811 ns->nr_mounts--; 1812 __touch_mnt_namespace(ns); 1813 } 1814 WRITE_ONCE(p->mnt_ns, NULL); 1815 if (how & UMOUNT_SYNC) 1816 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1817 1818 disconnect = disconnect_mount(p, how); 1819 if (mnt_has_parent(p)) { 1820 if (!disconnect) { 1821 /* Don't forget about p */ 1822 list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); 1823 } else { 1824 umount_mnt(p); 1825 } 1826 } 1827 if (disconnect) 1828 hlist_add_head(&p->mnt_umount, &unmounted); 1829 1830 /* 1831 * At this point p->mnt_ns is NULL, notification will be queued 1832 * only if 1833 * 1834 * - p->prev_ns is non-NULL *and* 1835 * - p->prev_ns->n_fsnotify_marks is non-NULL 1836 * 1837 * This will preclude queuing the mount if this is a cleanup 1838 * after a failed copy_tree() or 
destruction of an anonymous 1839 * namespace, etc. 1840 */ 1841 mnt_notify_add(p); 1842 } 1843 } 1844 1845 static void shrink_submounts(struct mount *mnt); 1846 1847 static int do_umount_root(struct super_block *sb) 1848 { 1849 int ret = 0; 1850 1851 down_write(&sb->s_umount); 1852 if (!sb_rdonly(sb)) { 1853 struct fs_context *fc; 1854 1855 fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, 1856 SB_RDONLY); 1857 if (IS_ERR(fc)) { 1858 ret = PTR_ERR(fc); 1859 } else { 1860 ret = parse_monolithic_mount_data(fc, NULL); 1861 if (!ret) 1862 ret = reconfigure_super(fc); 1863 put_fs_context(fc); 1864 } 1865 } 1866 up_write(&sb->s_umount); 1867 return ret; 1868 } 1869 1870 static int do_umount(struct mount *mnt, int flags) 1871 { 1872 struct super_block *sb = mnt->mnt.mnt_sb; 1873 int retval; 1874 1875 retval = security_sb_umount(&mnt->mnt, flags); 1876 if (retval) 1877 return retval; 1878 1879 /* 1880 * Allow userspace to request a mountpoint be expired rather than 1881 * unmounting unconditionally. Unmount only happens if: 1882 * (1) the mark is already set (the mark is cleared by mntput()) 1883 * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] 1884 */ 1885 if (flags & MNT_EXPIRE) { 1886 if (&mnt->mnt == current->fs->root.mnt || 1887 flags & (MNT_FORCE | MNT_DETACH)) 1888 return -EINVAL; 1889 1890 /* 1891 * probably don't strictly need the lock here if we examined 1892 * all race cases, but it's a slowpath. 1893 */ 1894 lock_mount_hash(); 1895 if (!list_empty(&mnt->mnt_mounts) || mnt_get_count(mnt) != 2) { 1896 unlock_mount_hash(); 1897 return -EBUSY; 1898 } 1899 unlock_mount_hash(); 1900 1901 if (!xchg(&mnt->mnt_expiry_mark, 1)) 1902 return -EAGAIN; 1903 } 1904 1905 /* 1906 * If we may have to abort operations to get out of this 1907 * mount, and they will themselves hold resources we must 1908 * allow the fs to do things. 
In the Unix tradition of 1909 * 'Gee thats tricky lets do it in userspace' the umount_begin 1910 * might fail to complete on the first run through as other tasks 1911 * must return, and the like. Thats for the mount program to worry 1912 * about for the moment. 1913 */ 1914 1915 if (flags & MNT_FORCE && sb->s_op->umount_begin) { 1916 sb->s_op->umount_begin(sb); 1917 } 1918 1919 /* 1920 * No sense to grab the lock for this test, but test itself looks 1921 * somewhat bogus. Suggestions for better replacement? 1922 * Ho-hum... In principle, we might treat that as umount + switch 1923 * to rootfs. GC would eventually take care of the old vfsmount. 1924 * Actually it makes sense, especially if rootfs would contain a 1925 * /reboot - static binary that would close all descriptors and 1926 * call reboot(9). Then init(8) could umount root and exec /reboot. 1927 */ 1928 if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { 1929 /* 1930 * Special case for "unmounting" root ... 1931 * we just try to remount it readonly. 
1932 */ 1933 if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) 1934 return -EPERM; 1935 return do_umount_root(sb); 1936 } 1937 1938 namespace_lock(); 1939 lock_mount_hash(); 1940 1941 /* Repeat the earlier racy checks, now that we are holding the locks */ 1942 retval = -EINVAL; 1943 if (!check_mnt(mnt)) 1944 goto out; 1945 1946 if (mnt->mnt.mnt_flags & MNT_LOCKED) 1947 goto out; 1948 1949 if (!mnt_has_parent(mnt)) /* not the absolute root */ 1950 goto out; 1951 1952 event++; 1953 if (flags & MNT_DETACH) { 1954 umount_tree(mnt, UMOUNT_PROPAGATE); 1955 retval = 0; 1956 } else { 1957 smp_mb(); // paired with __legitimize_mnt() 1958 shrink_submounts(mnt); 1959 retval = -EBUSY; 1960 if (!propagate_mount_busy(mnt, 2)) { 1961 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); 1962 retval = 0; 1963 } 1964 } 1965 out: 1966 unlock_mount_hash(); 1967 namespace_unlock(); 1968 return retval; 1969 } 1970 1971 /* 1972 * __detach_mounts - lazily unmount all mounts on the specified dentry 1973 * 1974 * During unlink, rmdir, and d_drop it is possible to loose the path 1975 * to an existing mountpoint, and wind up leaking the mount. 1976 * detach_mounts allows lazily unmounting those mounts instead of 1977 * leaking them. 1978 * 1979 * The caller may hold dentry->d_inode->i_rwsem. 1980 */ 1981 void __detach_mounts(struct dentry *dentry) 1982 { 1983 struct pinned_mountpoint mp = {}; 1984 struct mount *mnt; 1985 1986 guard(namespace_excl)(); 1987 guard(mount_writer)(); 1988 1989 if (!lookup_mountpoint(dentry, &mp)) 1990 return; 1991 1992 event++; 1993 while (mp.node.next) { 1994 mnt = hlist_entry(mp.node.next, struct mount, mnt_mp_list); 1995 if (mnt->mnt.mnt_flags & MNT_UMOUNT) { 1996 umount_mnt(mnt); 1997 hlist_add_head(&mnt->mnt_umount, &unmounted); 1998 } 1999 else umount_tree(mnt, UMOUNT_CONNECTED); 2000 } 2001 unpin_mountpoint(&mp); 2002 } 2003 2004 /* 2005 * Is the caller allowed to modify his namespace? 
2006 */ 2007 bool may_mount(void) 2008 { 2009 return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); 2010 } 2011 2012 static void warn_mandlock(void) 2013 { 2014 pr_warn_once("=======================================================\n" 2015 "WARNING: The mand mount option has been deprecated and\n" 2016 " and is ignored by this kernel. Remove the mand\n" 2017 " option from the mount to silence this warning.\n" 2018 "=======================================================\n"); 2019 } 2020 2021 static int can_umount(const struct path *path, int flags) 2022 { 2023 struct mount *mnt = real_mount(path->mnt); 2024 struct super_block *sb = path->dentry->d_sb; 2025 2026 if (!may_mount()) 2027 return -EPERM; 2028 if (!path_mounted(path)) 2029 return -EINVAL; 2030 if (!check_mnt(mnt)) 2031 return -EINVAL; 2032 if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ 2033 return -EINVAL; 2034 if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) 2035 return -EPERM; 2036 return 0; 2037 } 2038 2039 // caller is responsible for flags being sane 2040 int path_umount(const struct path *path, int flags) 2041 { 2042 struct mount *mnt = real_mount(path->mnt); 2043 int ret; 2044 2045 ret = can_umount(path, flags); 2046 if (!ret) 2047 ret = do_umount(mnt, flags); 2048 2049 /* we mustn't call path_put() as that would clear mnt_expiry_mark */ 2050 dput(path->dentry); 2051 mntput_no_expire(mnt); 2052 return ret; 2053 } 2054 2055 static int ksys_umount(char __user *name, int flags) 2056 { 2057 int lookup_flags = LOOKUP_MOUNTPOINT; 2058 struct path path; 2059 int ret; 2060 2061 // basic validity checks done first 2062 if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) 2063 return -EINVAL; 2064 2065 if (!(flags & UMOUNT_NOFOLLOW)) 2066 lookup_flags |= LOOKUP_FOLLOW; 2067 ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); 2068 if (ret) 2069 return ret; 2070 return path_umount(&path, flags); 2071 } 2072 2073 SYSCALL_DEFINE2(umount, 
char __user *, name, int, flags) 2074 { 2075 return ksys_umount(name, flags); 2076 } 2077 2078 #ifdef __ARCH_WANT_SYS_OLDUMOUNT 2079 2080 /* 2081 * The 2.0 compatible umount. No flags. 2082 */ 2083 SYSCALL_DEFINE1(oldumount, char __user *, name) 2084 { 2085 return ksys_umount(name, 0); 2086 } 2087 2088 #endif 2089 2090 static bool is_mnt_ns_file(struct dentry *dentry) 2091 { 2092 struct ns_common *ns; 2093 2094 /* Is this a proxy for a mount namespace? */ 2095 if (dentry->d_op != &ns_dentry_operations) 2096 return false; 2097 2098 ns = d_inode(dentry)->i_private; 2099 2100 return ns->ops == &mntns_operations; 2101 } 2102 2103 struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) 2104 { 2105 return &mnt->ns; 2106 } 2107 2108 struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous) 2109 { 2110 struct ns_common *ns; 2111 2112 guard(rcu)(); 2113 2114 for (;;) { 2115 ns = ns_tree_adjoined_rcu(mntns, previous); 2116 if (IS_ERR(ns)) 2117 return ERR_CAST(ns); 2118 2119 mntns = to_mnt_ns(ns); 2120 2121 /* 2122 * The last passive reference count is put with RCU 2123 * delay so accessing the mount namespace is not just 2124 * safe but all relevant members are still valid. 2125 */ 2126 if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN)) 2127 continue; 2128 2129 /* 2130 * We need an active reference count as we're persisting 2131 * the mount namespace and it might already be on its 2132 * deathbed. 2133 */ 2134 if (!ns_ref_get(mntns)) 2135 continue; 2136 2137 return mntns; 2138 } 2139 } 2140 2141 struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) 2142 { 2143 if (!is_mnt_ns_file(dentry)) 2144 return NULL; 2145 2146 return to_mnt_ns(get_proc_ns(dentry->d_inode)); 2147 } 2148 2149 static bool mnt_ns_loop(struct dentry *dentry) 2150 { 2151 /* Could bind mounting the mount namespace inode cause a 2152 * mount namespace loop? 
2153 */ 2154 struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); 2155 2156 if (!mnt_ns) 2157 return false; 2158 2159 return current->nsproxy->mnt_ns->ns.ns_id >= mnt_ns->ns.ns_id; 2160 } 2161 2162 struct mount *copy_tree(struct mount *src_root, struct dentry *dentry, 2163 int flag) 2164 { 2165 struct mount *res, *src_parent, *src_root_child, *src_mnt, 2166 *dst_parent, *dst_mnt; 2167 2168 if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(src_root)) 2169 return ERR_PTR(-EINVAL); 2170 2171 if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) 2172 return ERR_PTR(-EINVAL); 2173 2174 res = dst_mnt = clone_mnt(src_root, dentry, flag); 2175 if (IS_ERR(dst_mnt)) 2176 return dst_mnt; 2177 2178 src_parent = src_root; 2179 2180 list_for_each_entry(src_root_child, &src_root->mnt_mounts, mnt_child) { 2181 if (!is_subdir(src_root_child->mnt_mountpoint, dentry)) 2182 continue; 2183 2184 for (src_mnt = src_root_child; src_mnt; 2185 src_mnt = next_mnt(src_mnt, src_root_child)) { 2186 if (!(flag & CL_COPY_UNBINDABLE) && 2187 IS_MNT_UNBINDABLE(src_mnt)) { 2188 if (src_mnt->mnt.mnt_flags & MNT_LOCKED) { 2189 /* Both unbindable and locked. 
*/ 2190 dst_mnt = ERR_PTR(-EPERM); 2191 goto out; 2192 } else { 2193 src_mnt = skip_mnt_tree(src_mnt); 2194 continue; 2195 } 2196 } 2197 if (!(flag & CL_COPY_MNT_NS_FILE) && 2198 is_mnt_ns_file(src_mnt->mnt.mnt_root)) { 2199 src_mnt = skip_mnt_tree(src_mnt); 2200 continue; 2201 } 2202 while (src_parent != src_mnt->mnt_parent) { 2203 src_parent = src_parent->mnt_parent; 2204 dst_mnt = dst_mnt->mnt_parent; 2205 } 2206 2207 src_parent = src_mnt; 2208 dst_parent = dst_mnt; 2209 dst_mnt = clone_mnt(src_mnt, src_mnt->mnt.mnt_root, flag); 2210 if (IS_ERR(dst_mnt)) 2211 goto out; 2212 lock_mount_hash(); 2213 if (src_mnt->mnt.mnt_flags & MNT_LOCKED) 2214 dst_mnt->mnt.mnt_flags |= MNT_LOCKED; 2215 if (unlikely(flag & CL_EXPIRE)) { 2216 /* stick the duplicate mount on the same expiry 2217 * list as the original if that was on one */ 2218 if (!list_empty(&src_mnt->mnt_expire)) 2219 list_add(&dst_mnt->mnt_expire, 2220 &src_mnt->mnt_expire); 2221 } 2222 attach_mnt(dst_mnt, dst_parent, src_parent->mnt_mp); 2223 unlock_mount_hash(); 2224 } 2225 } 2226 return res; 2227 2228 out: 2229 if (res) { 2230 lock_mount_hash(); 2231 umount_tree(res, UMOUNT_SYNC); 2232 unlock_mount_hash(); 2233 } 2234 return dst_mnt; 2235 } 2236 2237 static inline bool extend_array(struct path **res, struct path **to_free, 2238 unsigned n, unsigned *count, unsigned new_count) 2239 { 2240 struct path *p; 2241 2242 if (likely(n < *count)) 2243 return true; 2244 p = kmalloc_objs(struct path, new_count); 2245 if (p && *count) 2246 memcpy(p, *res, *count * sizeof(struct path)); 2247 *count = new_count; 2248 kfree(*to_free); 2249 *to_free = *res = p; 2250 return p; 2251 } 2252 2253 const struct path *collect_paths(const struct path *path, 2254 struct path *prealloc, unsigned count) 2255 { 2256 struct mount *root = real_mount(path->mnt); 2257 struct mount *child; 2258 struct path *res = prealloc, *to_free = NULL; 2259 unsigned n = 0; 2260 2261 guard(namespace_shared)(); 2262 2263 if (!check_mnt(root)) 2264 return 
ERR_PTR(-EINVAL); 2265 if (!extend_array(&res, &to_free, 0, &count, 32)) 2266 return ERR_PTR(-ENOMEM); 2267 res[n++] = *path; 2268 list_for_each_entry(child, &root->mnt_mounts, mnt_child) { 2269 if (!is_subdir(child->mnt_mountpoint, path->dentry)) 2270 continue; 2271 for (struct mount *m = child; m; m = next_mnt(m, child)) { 2272 if (!extend_array(&res, &to_free, n, &count, 2 * count)) 2273 return ERR_PTR(-ENOMEM); 2274 res[n].mnt = &m->mnt; 2275 res[n].dentry = m->mnt.mnt_root; 2276 n++; 2277 } 2278 } 2279 if (!extend_array(&res, &to_free, n, &count, count + 1)) 2280 return ERR_PTR(-ENOMEM); 2281 memset(res + n, 0, (count - n) * sizeof(struct path)); 2282 for (struct path *p = res; p->mnt; p++) 2283 path_get(p); 2284 return res; 2285 } 2286 2287 void drop_collected_paths(const struct path *paths, const struct path *prealloc) 2288 { 2289 for (const struct path *p = paths; p->mnt; p++) 2290 path_put(p); 2291 if (paths != prealloc) 2292 kfree(paths); 2293 } 2294 2295 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); 2296 2297 void dissolve_on_fput(struct vfsmount *mnt) 2298 { 2299 struct mount *m = real_mount(mnt); 2300 2301 /* 2302 * m used to be the root of anon namespace; if it still is one, 2303 * we need to dissolve the mount tree and free that namespace. 2304 * Let's try to avoid taking namespace_sem if we can determine 2305 * that there's nothing to do without it - rcu_read_lock() is 2306 * enough to make anon_ns_root() memory-safe and once m has 2307 * left its namespace, it's no longer our concern, since it will 2308 * never become a root of anon ns again. 
2309 */ 2310 2311 scoped_guard(rcu) { 2312 if (!anon_ns_root(m)) 2313 return; 2314 } 2315 2316 scoped_guard(namespace_excl) { 2317 if (!anon_ns_root(m)) 2318 return; 2319 2320 emptied_ns = m->mnt_ns; 2321 lock_mount_hash(); 2322 umount_tree(m, UMOUNT_CONNECTED); 2323 unlock_mount_hash(); 2324 } 2325 } 2326 2327 /* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ 2328 static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) 2329 { 2330 struct mount *child; 2331 2332 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 2333 if (!is_subdir(child->mnt_mountpoint, dentry)) 2334 continue; 2335 2336 if (child->mnt.mnt_flags & MNT_LOCKED) 2337 return true; 2338 } 2339 return false; 2340 } 2341 2342 bool has_locked_children(struct mount *mnt, struct dentry *dentry) 2343 { 2344 guard(mount_locked_reader)(); 2345 return __has_locked_children(mnt, dentry); 2346 } 2347 2348 /* 2349 * Check that there aren't references to earlier/same mount namespaces in the 2350 * specified subtree. Such references can act as pins for mount namespaces 2351 * that aren't checked by the mount-cycle checking code, thereby allowing 2352 * cycles to be made. 2353 * 2354 * locks: mount_locked_reader || namespace_shared && pinned(subtree) 2355 */ 2356 static bool check_for_nsfs_mounts(struct mount *subtree) 2357 { 2358 for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) 2359 if (mnt_ns_loop(p->mnt.mnt_root)) 2360 return false; 2361 return true; 2362 } 2363 2364 /** 2365 * clone_private_mount - create a private clone of a path 2366 * @path: path to clone 2367 * 2368 * This creates a new vfsmount, which will be the clone of @path. The new mount 2369 * will not be attached anywhere in the namespace and will be private (i.e. 2370 * changes to the originating mount won't be propagated into this). 2371 * 2372 * This assumes caller has called or done the equivalent of may_mount(). 2373 * 2374 * Release with mntput(). 
2375 */ 2376 struct vfsmount *clone_private_mount(const struct path *path) 2377 { 2378 struct mount *old_mnt = real_mount(path->mnt); 2379 struct mount *new_mnt; 2380 2381 guard(namespace_shared)(); 2382 2383 if (IS_MNT_UNBINDABLE(old_mnt)) 2384 return ERR_PTR(-EINVAL); 2385 2386 /* 2387 * Make sure the source mount is acceptable. 2388 * Anything mounted in our mount namespace is allowed. 2389 * Otherwise, it must be the root of an anonymous mount 2390 * namespace, and we need to make sure no namespace 2391 * loops get created. 2392 */ 2393 if (!check_mnt(old_mnt)) { 2394 if (!anon_ns_root(old_mnt)) 2395 return ERR_PTR(-EINVAL); 2396 2397 if (!check_for_nsfs_mounts(old_mnt)) 2398 return ERR_PTR(-EINVAL); 2399 } 2400 2401 if (!ns_capable(old_mnt->mnt_ns->user_ns, CAP_SYS_ADMIN)) 2402 return ERR_PTR(-EPERM); 2403 2404 if (__has_locked_children(old_mnt, path->dentry)) 2405 return ERR_PTR(-EINVAL); 2406 2407 new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); 2408 if (IS_ERR(new_mnt)) 2409 return ERR_PTR(-EINVAL); 2410 2411 /* Longterm mount to be removed by kern_unmount*() */ 2412 new_mnt->mnt_ns = MNT_NS_INTERNAL; 2413 return &new_mnt->mnt; 2414 } 2415 EXPORT_SYMBOL_GPL(clone_private_mount); 2416 2417 static void lock_mnt_tree(struct mount *mnt) 2418 { 2419 struct mount *p; 2420 2421 for (p = mnt; p; p = next_mnt(p, mnt)) { 2422 int flags = p->mnt.mnt_flags; 2423 /* Don't allow unprivileged users to change mount flags */ 2424 flags |= MNT_LOCK_ATIME; 2425 2426 if (flags & MNT_READONLY) 2427 flags |= MNT_LOCK_READONLY; 2428 2429 if (flags & MNT_NODEV) 2430 flags |= MNT_LOCK_NODEV; 2431 2432 if (flags & MNT_NOSUID) 2433 flags |= MNT_LOCK_NOSUID; 2434 2435 if (flags & MNT_NOEXEC) 2436 flags |= MNT_LOCK_NOEXEC; 2437 /* Don't allow unprivileged users to reveal what is under a mount */ 2438 if (list_empty(&p->mnt_expire) && p != mnt) 2439 flags |= MNT_LOCKED; 2440 p->mnt.mnt_flags = flags; 2441 } 2442 } 2443 2444 static void cleanup_group_ids(struct mount *mnt, struct 
mount *end) 2445 { 2446 struct mount *p; 2447 2448 for (p = mnt; p != end; p = next_mnt(p, mnt)) { 2449 if (p->mnt_group_id && !IS_MNT_SHARED(p)) 2450 mnt_release_group_id(p); 2451 } 2452 } 2453 2454 static int invent_group_ids(struct mount *mnt, bool recurse) 2455 { 2456 struct mount *p; 2457 2458 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { 2459 if (!p->mnt_group_id) { 2460 int err = mnt_alloc_group_id(p); 2461 if (err) { 2462 cleanup_group_ids(mnt, p); 2463 return err; 2464 } 2465 } 2466 } 2467 2468 return 0; 2469 } 2470 2471 int count_mounts(struct mnt_namespace *ns, struct mount *mnt) 2472 { 2473 unsigned int max = READ_ONCE(sysctl_mount_max); 2474 unsigned int mounts = 0; 2475 struct mount *p; 2476 2477 if (ns->nr_mounts >= max) 2478 return -ENOSPC; 2479 max -= ns->nr_mounts; 2480 if (ns->pending_mounts >= max) 2481 return -ENOSPC; 2482 max -= ns->pending_mounts; 2483 2484 for (p = mnt; p; p = next_mnt(p, mnt)) 2485 mounts++; 2486 2487 if (mounts > max) 2488 return -ENOSPC; 2489 2490 ns->pending_mounts += mounts; 2491 return 0; 2492 } 2493 2494 enum mnt_tree_flags_t { 2495 MNT_TREE_BENEATH = BIT(0), 2496 MNT_TREE_PROPAGATION = BIT(1), 2497 }; 2498 2499 /** 2500 * attach_recursive_mnt - attach a source mount tree 2501 * @source_mnt: mount tree to be attached 2502 * @dest: the context for mounting at the place where the tree should go 2503 * 2504 * NOTE: in the table below explains the semantics when a source mount 2505 * of a given type is attached to a destination mount of a given type. 
2506 * --------------------------------------------------------------------------- 2507 * | BIND MOUNT OPERATION | 2508 * |************************************************************************** 2509 * | source-->| shared | private | slave | unbindable | 2510 * | dest | | | | | 2511 * | | | | | | | 2512 * | v | | | | | 2513 * |************************************************************************** 2514 * | shared | shared (++) | shared (+) | shared(+++)| invalid | 2515 * | | | | | | 2516 * |non-shared| shared (+) | private | slave (*) | invalid | 2517 * *************************************************************************** 2518 * A bind operation clones the source mount and mounts the clone on the 2519 * destination mount. 2520 * 2521 * (++) the cloned mount is propagated to all the mounts in the propagation 2522 * tree of the destination mount and the cloned mount is added to 2523 * the peer group of the source mount. 2524 * (+) the cloned mount is created under the destination mount and is marked 2525 * as shared. The cloned mount is added to the peer group of the source 2526 * mount. 2527 * (+++) the mount is propagated to all the mounts in the propagation tree 2528 * of the destination mount and the cloned mount is made slave 2529 * of the same master as that of the source mount. The cloned mount 2530 * is marked as 'shared and slave'. 2531 * (*) the cloned mount is made a slave of the same master as that of the 2532 * source mount. 
2533 * 2534 * --------------------------------------------------------------------------- 2535 * | MOVE MOUNT OPERATION | 2536 * |************************************************************************** 2537 * | source-->| shared | private | slave | unbindable | 2538 * | dest | | | | | 2539 * | | | | | | | 2540 * | v | | | | | 2541 * |************************************************************************** 2542 * | shared | shared (+) | shared (+) | shared(+++) | invalid | 2543 * | | | | | | 2544 * |non-shared| shared (+*) | private | slave (*) | unbindable | 2545 * *************************************************************************** 2546 * 2547 * (+) the mount is moved to the destination. And is then propagated to 2548 * all the mounts in the propagation tree of the destination mount. 2549 * (+*) the mount is moved to the destination. 2550 * (+++) the mount is moved to the destination and is then propagated to 2551 * all the mounts belonging to the destination mount's propagation tree. 2552 * the mount is marked as 'shared and slave'. 2553 * (*) the mount continues to be a slave at the new location. 2554 * 2555 * if the source mount is a tree, the operations explained above are 2556 * applied to each mount in the tree. 2557 * Must be called without spinlocks held, since this function can sleep 2558 * in allocations. 2559 * 2560 * Context: The function expects namespace_lock() to be held. 2561 * Return: If @source_mnt was successfully attached 0 is returned. 2562 * Otherwise a negative error code is returned. 
/*
 * attach_recursive_mnt(): attach @source_mnt at dest->parent / dest->mp.
 * When the destination is shared, propagate_mnt() is also asked for
 * copies (collected on tree_list) and each of them is committed too.
 * The bind/move semantics table is in the kernel-doc comment that
 * precedes this function in the file.
 */
static int attach_recursive_mnt(struct mount *source_mnt,
				const struct pinned_mountpoint *dest)
{
	struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
	struct mount *dest_mnt = dest->parent;
	struct mountpoint *dest_mp = dest->mp;
	HLIST_HEAD(tree_list);	/* copies produced by propagate_mnt() */
	struct mnt_namespace *ns = dest_mnt->mnt_ns;
	struct pinned_mountpoint root = {};
	struct mountpoint *shorter = NULL;
	struct mount *child, *p;
	struct mount *top;
	struct hlist_node *n;
	int err = 0;
	/* a source that already has a parent is being moved, not newly attached */
	bool moving = mnt_has_parent(source_mnt);

	/*
	 * Preallocate a mountpoint in case the new mounts need to be
	 * mounted beneath mounts on the same mountpoint.
	 */
	for (top = source_mnt; unlikely(top->overmount); top = top->overmount) {
		/* remember the mountpoint of the first mnt-ns file in the stack */
		if (!shorter && is_mnt_ns_file(top->mnt.mnt_root))
			shorter = top->mnt_mp;
	}
	err = get_mountpoint(top->mnt.mnt_root, &root);
	if (err)
		return err;

	/* Is there space to add these mounts to the mount namespace? */
	if (!moving) {
		err = count_mounts(ns, source_mnt);
		if (err)
			goto out;
	}

	if (IS_MNT_SHARED(dest_mnt)) {
		err = invent_group_ids(source_mnt, true);
		if (err)
			goto out;
		err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list);
	}
	/*
	 * A propagate_mnt() failure is only acted upon after taking the
	 * mount hash lock: the cleanup path below calls umount_tree() and
	 * drops that lock itself.
	 */
	lock_mount_hash();
	if (err)
		goto out_cleanup_ids;

	if (IS_MNT_SHARED(dest_mnt)) {
		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
			set_mnt_shared(p);
	}

	if (moving) {
		umount_mnt(source_mnt);
		mnt_notify_add(source_mnt);
		/* if the mount is moved, it should no longer be expired
		 * automatically */
		list_del_init(&source_mnt->mnt_expire);
	} else {
		if (source_mnt->mnt_ns) {
			/* move from anon - the caller will destroy */
			emptied_ns = source_mnt->mnt_ns;
			for (p = source_mnt; p; p = next_mnt(p, source_mnt))
				move_from_ns(p);
		}
	}

	mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
	/*
	 * Now the original copy is in the same state as the secondaries -
	 * its root attached to mountpoint, but not hashed and all mounts
	 * in it are either in our namespace or in no namespace at all.
	 * Add the original to the list of copies and deal with the
	 * rest of work for all of them uniformly.
	 */
	hlist_add_head(&source_mnt->mnt_hash, &tree_list);

	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
		struct mount *q;
		hlist_del_init(&child->mnt_hash);
		/* Notice when we are propagating across user namespaces */
		if (child->mnt_parent->mnt_ns->user_ns != user_ns)
			lock_mnt_tree(child);
		/* look up what is already mounted there before committing @child */
		q = __lookup_mnt(&child->mnt_parent->mnt,
				 child->mnt_mountpoint);
		commit_tree(child);
		if (q) {
			struct mount *r = topmost_overmount(child);
			struct mountpoint *mp = root.mp;

			/* propagated copies use the mountpoint recorded in @shorter */
			if (unlikely(shorter) && child != source_mnt)
				mp = shorter;
			/*
			 * If @q was locked it was meant to hide
			 * whatever was under it. Let @child take over
			 * that job and lock it, then we can unlock @q.
			 * That'll allow another namespace to shed @q
			 * and reveal @child. Clearly, that mounter
			 * consented to this by not severing the mount
			 * relationship. Otherwise, what's the point.
			 */
			if (IS_MNT_LOCKED(q)) {
				child->mnt.mnt_flags |= MNT_LOCKED;
				q->mnt.mnt_flags &= ~MNT_LOCKED;
			}
			/* re-home @q on top of the topmost mount of @child's stack */
			mnt_change_mountpoint(r, mp, q);
		}
	}
	unpin_mountpoint(&root);
	unlock_mount_hash();

	return 0;

 out_cleanup_ids:
	/* propagation failed: tear down whatever copies were already made */
	while (!hlist_empty(&tree_list)) {
		child = hlist_entry(tree_list.first, struct mount, mnt_hash);
		child->mnt_parent->mnt_ns->pending_mounts = 0;
		umount_tree(child, UMOUNT_SYNC);
	}
	unlock_mount_hash();
	cleanup_group_ids(source_mnt, NULL);
 out:
	/* forget the reservation made by count_mounts() */
	ns->pending_mounts = 0;

	/*
	 * The mount hash lock is not held here, so mount_lock is taken
	 * around the unpin, as __unlock_mount() also does.
	 */
	read_seqlock_excl(&mount_lock);
	unpin_mountpoint(&root);
	read_sequnlock_excl(&mount_lock);

	return err;
}

/*
 * Work out which mount and dentry a request to mount at @path resolves to.
 *
 * With @beneath: the parent and mountpoint dentry of the topmost mount
 * stacked on @path->mnt.  Otherwise, if something is mounted on @path:
 * the topmost mount of that stack and its root dentry.  Otherwise
 * @path's own mount and dentry.
 *
 * The dentry is stored in *@dentry and the mount is returned.  The only
 * callers visible here invoke it under mount_locked_reader.
 */
static inline struct mount *where_to_mount(const struct path *path,
					   struct dentry **dentry,
					   bool beneath)
{
	struct mount *m;

	if (unlikely(beneath)) {
		m = topmost_overmount(real_mount(path->mnt));
		*dentry = m->mnt_mountpoint;
		return m->mnt_parent;
	}
	m = __lookup_mnt(path->mnt, path->dentry);
	if (unlikely(m)) {
		m = topmost_overmount(m);
		*dentry = m->mnt.mnt_root;
		return m;
	}
	*dentry = path->dentry;
	return real_mount(path->mnt);
}
/*
 * do_lock_mount(): take the inode lock and namespace_sem for mounting at
 * (or beneath) @path and pin the chosen mountpoint in @res.  On failure
 * no lock is left held and res->parent carries the ERR_PTR().  The full
 * contract is in the kernel-doc comment that precedes this function in
 * the file.
 */
static void do_lock_mount(const struct path *path,
			  struct pinned_mountpoint *res,
			  bool beneath)
{
	int err;

	/* mounting beneath only makes sense at the root of a mount */
	if (unlikely(beneath) && !path_mounted(path)) {
		res->parent = ERR_PTR(-EINVAL);
		return;
	}

	do {
		struct dentry *dentry, *d;
		struct mount *m, *n;

		/*
		 * Pick the target without the sleeping locks held; take
		 * temporary references if it is not @path's own mount so
		 * it stays around while we block on the locks below.
		 */
		scoped_guard(mount_locked_reader) {
			m = where_to_mount(path, &dentry, beneath);
			if (&m->mnt != path->mnt) {
				mntget(&m->mnt);
				dget(dentry);
			}
		}

		inode_lock(dentry->d_inode);
		namespace_lock();

		// check if the chain of mounts (if any) has changed.
		scoped_guard(mount_locked_reader)
			n = where_to_mount(path, &d, beneath);

		if (unlikely(n != m || dentry != d))
			err = -EAGAIN;		// something moved, retry
		else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt)))
			err = -ENOENT;		// not to be mounted on
		else if (beneath && &m->mnt == path->mnt && !m->overmount)
			err = -EINVAL;		// beneath: resolved to path->mnt itself with nothing stacked on it
		else
			err = get_mountpoint(dentry, res);

		if (unlikely(err)) {
			/* failure (including retry): drop both locks again */
			res->parent = ERR_PTR(err);
			namespace_unlock();
			inode_unlock(dentry->d_inode);
		} else {
			res->parent = m;
		}
		/*
		 * Drop the temporary references. This is subtle - on success
		 * we are doing that under namespace_sem, which would normally
		 * be forbidden. However, in that case we are guaranteed that
		 * refcounts won't reach zero, since we know that path->mnt
		 * is mounted and thus all mounts reachable from it are pinned
		 * and stable, along with their mountpoints and roots.
		 */
		if (&m->mnt != path->mnt) {
			dput(dentry);
			mntput(&m->mnt);
		}
	} while (err == -EAGAIN);
}

/*
 * Release what a successful lock_mount operation acquired: the inode lock
 * of the mountpoint dentry, the pin on the mountpoint (dropped under
 * mount_lock) and finally namespace_sem.  The inode is unlocked first,
 * while m->mp is still pinned.
 */
static void __unlock_mount(struct pinned_mountpoint *m)
{
	inode_unlock(m->mp->m_dentry->d_inode);
	read_seqlock_excl(&mount_lock);
	unpin_mountpoint(m);
	read_sequnlock_excl(&mount_lock);
	namespace_unlock();
}

/*
 * Cleanup handler used by the LOCK_MOUNT*() macros: does nothing if the
 * locking attempt failed (m->parent is an ERR_PTR), since no lock is held
 * in that case.
 */
static inline void unlock_mount(struct pinned_mountpoint *m)
{
	if (!IS_ERR(m->parent))
		__unlock_mount(m);
}

static void lock_mount_exact(const struct path *path,
			     struct pinned_mountpoint *mp, bool copy_mount,
			     unsigned int copy_flags);

/*
 * Declare a pinned_mountpoint named @mp in the current scope, lock it via
 * do_lock_mount() / lock_mount_exact(), and have unlock_mount() run
 * automatically when @mp goes out of scope.  Each macro expands to a
 * declaration followed by a call, so it must be used where a declaration
 * is allowed, and callers have to check IS_ERR(mp.parent) afterwards.
 */
#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	do_lock_mount((path), &mp, (beneath))
#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false)
#define LOCK_MOUNT_EXACT(mp, path) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp, false, 0)
#define LOCK_MOUNT_EXACT_COPY(mp, path, copy_flags) \
	struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \
	lock_mount_exact((path), &mp, true, (copy_flags))

/*
 * Attach the tree rooted at @mnt at the locked mountpoint @mp.
 *
 * Returns -EINVAL if the superblock of @mnt is flagged SB_NOUSER,
 * -ENOTDIR if exactly one of the mountpoint and the root of @mnt is a
 * directory, otherwise the result of attach_recursive_mnt().
 */
static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp)
{
	if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER)
		return -EINVAL;

	if (d_is_dir(mp->mp->m_dentry) !=
	      d_is_dir(mnt->mnt.mnt_root))
		return -ENOTDIR;

	return attach_recursive_mnt(mnt, mp);
}

/*
 * May the caller change the propagation settings of @m?
 *
 * Returns -EINVAL if @m is not mounted in any namespace, -EPERM if the
 * caller lacks CAP_SYS_ADMIN in the user namespace owning that mount
 * namespace, 0 otherwise.
 */
static int may_change_propagation(const struct mount *m)
{
        struct mnt_namespace *ns = m->mnt_ns;

	 // it must be mounted in some namespace
	 if (IS_ERR_OR_NULL(ns))         // is_mounted()
		 return -EINVAL;
	 // and the caller must be admin in userns of that namespace
	 if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
		 return -EPERM;
	 return 0;
}
change_mnt_propagation. 2863 */ 2864 2865 static int flags_to_propagation_type(int ms_flags) 2866 { 2867 int type = ms_flags & ~(MS_REC | MS_SILENT); 2868 2869 /* Fail if any non-propagation flags are set */ 2870 if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 2871 return 0; 2872 /* Only one propagation flag should be set */ 2873 if (!is_power_of_2(type)) 2874 return 0; 2875 return type; 2876 } 2877 2878 /* 2879 * recursively change the type of the mountpoint. 2880 */ 2881 static int do_change_type(const struct path *path, int ms_flags) 2882 { 2883 struct mount *m; 2884 struct mount *mnt = real_mount(path->mnt); 2885 int recurse = ms_flags & MS_REC; 2886 int type; 2887 int err; 2888 2889 if (!path_mounted(path)) 2890 return -EINVAL; 2891 2892 type = flags_to_propagation_type(ms_flags); 2893 if (!type) 2894 return -EINVAL; 2895 2896 guard(namespace_excl)(); 2897 2898 err = may_change_propagation(mnt); 2899 if (err) 2900 return err; 2901 2902 if (type == MS_SHARED) { 2903 err = invent_group_ids(mnt, recurse); 2904 if (err) 2905 return err; 2906 } 2907 2908 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 2909 change_mnt_propagation(m, type); 2910 2911 return 0; 2912 } 2913 2914 /* may_copy_tree() - check if a mount tree can be copied 2915 * @path: path to the mount tree to be copied 2916 * 2917 * This helper checks if the caller may copy the mount tree starting 2918 * from @path->mnt. The caller may copy the mount tree under the 2919 * following circumstances: 2920 * 2921 * (1) The caller is located in the mount namespace of the mount tree. 2922 * This also implies that the mount does not belong to an anonymous 2923 * mount namespace. 2924 * (2) The caller tries to copy an nfs mount referring to a mount 2925 * namespace, i.e., the caller is trying to copy a mount namespace 2926 * entry from nsfs. 2927 * (3) The caller tries to copy a pidfs mount referring to a pidfd. 
2928 * (4) The caller is trying to copy a mount tree that belongs to an 2929 * anonymous mount namespace. 2930 * 2931 * For that to be safe, this helper enforces that the origin mount 2932 * namespace the anonymous mount namespace was created from is the 2933 * same as the caller's mount namespace by comparing the sequence 2934 * numbers. 2935 * 2936 * This is not strictly necessary. The current semantics of the new 2937 * mount api enforce that the caller must be located in the same 2938 * mount namespace as the mount tree it interacts with. Using the 2939 * origin sequence number preserves these semantics even for 2940 * anonymous mount namespaces. However, one could envision extending 2941 * the api to directly operate across mount namespaces if needed. 2942 * 2943 * The ownership of a non-anonymous mount namespace such as the 2944 * caller's cannot change. 2945 * => We know that the caller's mount namespace is stable. 2946 * 2947 * If the origin sequence number of the anonymous mount namespace is 2948 * the same as the sequence number of the caller's mount namespace. 2949 * => The owning namespaces are the same. 2950 * 2951 * ==> The earlier capability check on the owning namespace of the 2952 * caller's mount namespace ensures that the caller has the 2953 * ability to copy the mount tree. 2954 * 2955 * Returns true if the mount tree can be copied, false otherwise. 
2956 */ 2957 static inline bool may_copy_tree(const struct path *path) 2958 { 2959 struct mount *mnt = real_mount(path->mnt); 2960 const struct dentry_operations *d_op; 2961 2962 if (check_mnt(mnt)) 2963 return true; 2964 2965 d_op = path->dentry->d_op; 2966 if (d_op == &ns_dentry_operations) 2967 return true; 2968 2969 if (d_op == &pidfs_dentry_operations) 2970 return true; 2971 2972 if (!is_mounted(path->mnt)) 2973 return false; 2974 2975 return check_anonymous_mnt(mnt); 2976 } 2977 2978 static struct mount *__do_loopback(const struct path *old_path, 2979 bool recurse, unsigned int copy_flags) 2980 { 2981 struct mount *old = real_mount(old_path->mnt); 2982 2983 if (IS_MNT_UNBINDABLE(old)) 2984 return ERR_PTR(-EINVAL); 2985 2986 if (!may_copy_tree(old_path)) 2987 return ERR_PTR(-EINVAL); 2988 2989 if (!recurse && __has_locked_children(old, old_path->dentry)) 2990 return ERR_PTR(-EINVAL); 2991 2992 if (recurse) 2993 return copy_tree(old, old_path->dentry, copy_flags); 2994 2995 return clone_mnt(old, old_path->dentry, copy_flags); 2996 } 2997 2998 /* 2999 * do loopback mount. 
/*
 * do loopback mount.
 *
 * Bind the object named by @old_name (the whole subtree if @recurse) on
 * top of @path.  Returns 0 or a negative errno; if grafting fails the
 * freshly made copy is torn down again.
 */
static int do_loopback(const struct path *path, const char *old_name,
		       int recurse)
{
	/* dropped automatically with path_put() when leaving the function */
	struct path old_path __free(path_put) = {};
	struct mount *mnt = NULL;
	int err;

	if (!old_name || !*old_name)
		return -EINVAL;
	err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
	if (err)
		return err;

	if (mnt_ns_loop(old_path.dentry))
		return -EINVAL;

	/* declares @mp; unlock_mount() runs on it at scope exit */
	LOCK_MOUNT(mp, path);
	if (IS_ERR(mp.parent))
		return PTR_ERR(mp.parent);

	if (!check_mnt(mp.parent))
		return -EINVAL;

	mnt = __do_loopback(&old_path, recurse, CL_COPY_MNT_NS_FILE);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	err = graft_tree(mnt, &mp);
	if (err) {
		lock_mount_hash();
		umount_tree(mnt, UMOUNT_SYNC);
		unlock_mount_hash();
	}
	return err;
}

/*
 * Copy the mount (or, with AT_RECURSIVE in @flags, the mount tree) at
 * @path into a newly allocated mount namespace owned by the caller's
 * user namespace.  alloc_mnt_ns() is called with 'true' as its second
 * argument - presumably requesting an anonymous namespace; confirm
 * against alloc_mnt_ns().
 *
 * Returns the new namespace or an ERR_PTR().
 */
static struct mnt_namespace *get_detached_copy(const struct path *path, unsigned int flags)
{
	struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
	struct user_namespace *user_ns = mnt_ns->user_ns;
	struct mount *mnt, *p;

	ns = alloc_mnt_ns(user_ns, true);
	if (IS_ERR(ns))
		return ns;

	guard(namespace_excl)();

	/*
	 * Record the sequence number of the source mount namespace.
	 * This needs to hold namespace_sem to ensure that the mount
	 * doesn't get attached.
	 */
	if (is_mounted(path->mnt)) {
		src_mnt_ns = real_mount(path->mnt)->mnt_ns;
		/* copies of copies keep pointing at the original origin */
		if (is_anon_ns(src_mnt_ns))
			ns->seq_origin = src_mnt_ns->seq_origin;
		else
			ns->seq_origin = src_mnt_ns->ns.ns_id;
	}

	mnt = __do_loopback(path, (flags & AT_RECURSIVE), CL_COPY_MNT_NS_FILE);
	if (IS_ERR(mnt)) {
		/*
		 * Hand the unused namespace over via emptied_ns; presumably
		 * it is disposed of when namespace_sem is dropped - confirm
		 * in namespace_unlock().
		 */
		emptied_ns = ns;
		return ERR_CAST(mnt);
	}

	for (p = mnt; p; p = next_mnt(p, mnt)) {
		mnt_add_to_ns(ns, p);
		ns->nr_mounts++;
	}
	ns->root = mnt;
	return ns;
}

/*
 * Make a detached copy of @path and open it as an O_PATH file.
 *
 * On success @path->mnt has been switched to the root of the copy and
 * the file is flagged FMODE_NEED_UNMOUNT.  If opening fails the copy is
 * dissolved again via dissolve_on_fput().
 */
static struct file *open_detached_copy(struct path *path, unsigned int flags)
{
	struct mnt_namespace *ns = get_detached_copy(path, flags);
	struct file *file;

	if (IS_ERR(ns))
		return ERR_CAST(ns);

	/* swap the caller's mount reference for one on the copy */
	mntput(path->mnt);
	path->mnt = mntget(&ns->root->mnt);
	file = dentry_open(path, O_PATH, current_cred());
	if (IS_ERR(file))
		dissolve_on_fput(path->mnt);
	else
		file->f_mode |= FMODE_NEED_UNMOUNT;
	return file;
}

/* Flags for create_new_namespace() / open_new_namespace(). */
enum mount_copy_flags_t {
	MOUNT_COPY_RECURSIVE	= (1 << 0),	/* copy the whole tree via __do_loopback() */
	MOUNT_COPY_NEW		= (1 << 1),	/* clone @path's mount directly with clone_mnt() */
};

/*
 * Create a new mount namespace, owned by the caller's current user
 * namespace, whose root is a copy of the current namespace's root mount
 * with a copy of @path mounted on top of it.
 *
 * @path must be a directory that can be looked up in, else -ENOTDIR.
 * Returns the new namespace or an ERR_PTR().
 */
static struct mnt_namespace *create_new_namespace(struct path *path,
						  enum mount_copy_flags_t flags)
{
	struct mnt_namespace *ns = current->nsproxy->mnt_ns;
	struct user_namespace *user_ns = current_user_ns();
	struct mnt_namespace *new_ns;
	struct mount *new_ns_root, *old_ns_root;
	struct path to_path;
	struct mount *mnt;
	unsigned int copy_flags = 0;
	bool locked = false, recurse = flags & MOUNT_COPY_RECURSIVE;

	if (unlikely(!d_can_lookup(path->dentry)))
		return ERR_PTR(-ENOTDIR);

	/* crossing into a different user namespace: copy with CL_SLAVE */
	if (user_ns != ns->user_ns)
		copy_flags |= CL_SLAVE;

	new_ns = alloc_mnt_ns(user_ns, false);
	if (IS_ERR(new_ns))
		return ERR_CAST(new_ns);

	old_ns_root = ns->root;
	to_path.mnt = &old_ns_root->mnt;
	to_path.dentry = old_ns_root->mnt.mnt_root;

	VFS_WARN_ON_ONCE(old_ns_root->mnt.mnt_sb->s_type != &nullfs_fs_type);

	/*
	 * Declares @mp with automatic unlock at scope exit.  On success
	 * mp.parent is used below as the root of the new namespace;
	 * NOTE(review): presumably the copy of the old root made by
	 * lock_mount_exact() - its body is not visible here.
	 */
	LOCK_MOUNT_EXACT_COPY(mp, &to_path, copy_flags);
	if (IS_ERR(mp.parent)) {
		free_mnt_ns(new_ns);
		return ERR_CAST(mp.parent);
	}
	new_ns_root = mp.parent;

	/*
	 * If the real rootfs had a locked mount on top of it somewhere
	 * in the stack, lock the new mount tree as well so it can't be
	 * exposed.
	 */
	mnt = old_ns_root;
	while (mnt->overmount) {
		mnt = mnt->overmount;
		if (mnt->mnt.mnt_flags & MNT_LOCKED)
			locked = true;
	}

	/*
	 * We don't emulate unshare()ing a mount namespace. We stick to
	 * the restrictions of creating detached bind-mounts. It has a
	 * lot saner and simpler semantics.
	 */
	if (flags & MOUNT_COPY_NEW)
		mnt = clone_mnt(real_mount(path->mnt), path->dentry, copy_flags);
	else
		mnt = __do_loopback(path, recurse, copy_flags);
	scoped_guard(mount_writer) {
		if (IS_ERR(mnt)) {
			/* copy failed: dispose of the namespace and the root copy */
			emptied_ns = new_ns;
			umount_tree(new_ns_root, 0);
			return ERR_CAST(mnt);
		}

		if (locked)
			mnt->mnt.mnt_flags |= MNT_LOCKED;
		/*
		 * now mount the detached tree on top of the copy
		 * of the real rootfs we created.
		 */
		attach_mnt(mnt, new_ns_root, mp.mp);
		if (user_ns != ns->user_ns)
			lock_mnt_tree(new_ns_root);
	}

	/* account every mount of the assembled tree to the new namespace */
	for (mnt = new_ns_root; mnt; mnt = next_mnt(mnt, new_ns_root)) {
		mnt_add_to_ns(new_ns, mnt);
		new_ns->nr_mounts++;
	}

	new_ns->root = new_ns_root;
	ns_tree_add_raw(new_ns);
	return new_ns;
}

/*
 * Create a new mount namespace from @path (see create_new_namespace())
 * and return a file referring to it, or an ERR_PTR().
 */
static struct file *open_new_namespace(struct path *path,
				       enum mount_copy_flags_t flags)
{
	struct mnt_namespace *new_ns;

	new_ns = create_new_namespace(path, flags);
	if (IS_ERR(new_ns))
		return ERR_CAST(new_ns);
	return open_namespace_file(to_ns_common(new_ns));
}

/*
 * Back end of open_tree(2): validate @flags, look up @filename relative
 * to @dfd and return
 *  - a file for a new mount namespace (OPEN_TREE_NAMESPACE),
 *  - an O_PATH file on a detached copy (OPEN_TREE_CLONE), or
 *  - a plain O_PATH file on the looked-up path.
 * Returns an ERR_PTR() on failure.
 */
static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
	int ret;
	struct path path __free(path_put) = {};
	int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;

	BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);

	if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
		      AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
		      OPEN_TREE_CLOEXEC | OPEN_TREE_NAMESPACE))
		return ERR_PTR(-EINVAL);

	/* AT_RECURSIVE is meaningless unless something is being copied */
	if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) ==
	    AT_RECURSIVE)
		return ERR_PTR(-EINVAL);

	/* OPEN_TREE_CLONE and OPEN_TREE_NAMESPACE are mutually exclusive */
	if (hweight32(flags & (OPEN_TREE_CLONE | OPEN_TREE_NAMESPACE)) > 1)
		return ERR_PTR(-EINVAL);

	if (flags & AT_NO_AUTOMOUNT)
		lookup_flags &= ~LOOKUP_AUTOMOUNT;
	if (flags & AT_SYMLINK_NOFOLLOW)
		lookup_flags &= ~LOOKUP_FOLLOW;

	/*
	 * If we create a new mount namespace with the cloned mount tree we
	 * just care about being privileged over our current user namespace.
	 * The new mount namespace will be owned by it.
	 */
	if ((flags & OPEN_TREE_NAMESPACE) &&
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return ERR_PTR(-EPERM);

	if ((flags & OPEN_TREE_CLONE) && !may_mount())
		return ERR_PTR(-EPERM);

	CLASS(filename_uflags, name)(filename, flags);
	ret = filename_lookup(dfd, name, lookup_flags, &path, NULL);
	if (unlikely(ret))
		return ERR_PTR(ret);

	if (flags & OPEN_TREE_NAMESPACE)
		return open_new_namespace(&path, (flags & AT_RECURSIVE) ? MOUNT_COPY_RECURSIVE : 0);

	if (flags & OPEN_TREE_CLONE)
		return open_detached_copy(&path, flags);

	return dentry_open(&path, O_PATH, current_cred());
}

/* open_tree(2): hand the file produced by vfs_open_tree() to FD_ADD(). */
SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
{
	return FD_ADD(flags, vfs_open_tree(dfd, filename, flags));
}
3255 */ 3256 static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) 3257 { 3258 unsigned int fl = mnt->mnt.mnt_flags; 3259 3260 if ((fl & MNT_LOCK_READONLY) && 3261 !(mnt_flags & MNT_READONLY)) 3262 return false; 3263 3264 if ((fl & MNT_LOCK_NODEV) && 3265 !(mnt_flags & MNT_NODEV)) 3266 return false; 3267 3268 if ((fl & MNT_LOCK_NOSUID) && 3269 !(mnt_flags & MNT_NOSUID)) 3270 return false; 3271 3272 if ((fl & MNT_LOCK_NOEXEC) && 3273 !(mnt_flags & MNT_NOEXEC)) 3274 return false; 3275 3276 if ((fl & MNT_LOCK_ATIME) && 3277 ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) 3278 return false; 3279 3280 return true; 3281 } 3282 3283 static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) 3284 { 3285 bool readonly_request = (mnt_flags & MNT_READONLY); 3286 3287 if (readonly_request == __mnt_is_readonly(&mnt->mnt)) 3288 return 0; 3289 3290 if (readonly_request) 3291 return mnt_make_readonly(mnt); 3292 3293 mnt->mnt.mnt_flags &= ~MNT_READONLY; 3294 return 0; 3295 } 3296 3297 static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) 3298 { 3299 mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; 3300 mnt->mnt.mnt_flags = mnt_flags; 3301 touch_mnt_namespace(mnt->mnt_ns); 3302 } 3303 3304 static void mnt_warn_timestamp_expiry(const struct path *mountpoint, 3305 struct vfsmount *mnt) 3306 { 3307 struct super_block *sb = mnt->mnt_sb; 3308 3309 if (!__mnt_is_readonly(mnt) && 3310 (!(sb->s_iflags & SB_I_TS_EXPIRY_WARNED)) && 3311 (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { 3312 char *buf, *mntpath; 3313 3314 buf = __getname(); 3315 if (buf) 3316 mntpath = d_path(mountpoint, buf, PATH_MAX); 3317 else 3318 mntpath = ERR_PTR(-ENOMEM); 3319 if (IS_ERR(mntpath)) 3320 mntpath = "(unknown)"; 3321 3322 pr_warn("%s filesystem being %s at %s supports timestamps until %ptTd (0x%llx)\n", 3323 sb->s_type->name, 3324 is_mounted(mnt) ? 
"remounted" : "mounted", 3325 mntpath, &sb->s_time_max, 3326 (unsigned long long)sb->s_time_max); 3327 3328 sb->s_iflags |= SB_I_TS_EXPIRY_WARNED; 3329 __putname(buf); 3330 } 3331 } 3332 3333 /* 3334 * Handle reconfiguration of the mountpoint only without alteration of the 3335 * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND 3336 * to mount(2). 3337 */ 3338 static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) 3339 { 3340 struct super_block *sb = path->mnt->mnt_sb; 3341 struct mount *mnt = real_mount(path->mnt); 3342 int ret; 3343 3344 if (!check_mnt(mnt)) 3345 return -EINVAL; 3346 3347 if (!path_mounted(path)) 3348 return -EINVAL; 3349 3350 if (!can_change_locked_flags(mnt, mnt_flags)) 3351 return -EPERM; 3352 3353 /* 3354 * We're only checking whether the superblock is read-only not 3355 * changing it, so only take down_read(&sb->s_umount). 3356 */ 3357 down_read(&sb->s_umount); 3358 lock_mount_hash(); 3359 ret = change_mount_ro_state(mnt, mnt_flags); 3360 if (ret == 0) 3361 set_mount_attributes(mnt, mnt_flags); 3362 unlock_mount_hash(); 3363 up_read(&sb->s_umount); 3364 3365 mnt_warn_timestamp_expiry(path, &mnt->mnt); 3366 3367 return ret; 3368 } 3369 3370 /* 3371 * change filesystem flags. dir should be a physical root of filesystem. 3372 * If you've mounted a non-root directory somewhere and want to do remount 3373 * on it - tough luck. 
3374 */ 3375 static int do_remount(const struct path *path, int sb_flags, 3376 int mnt_flags, void *data) 3377 { 3378 int err; 3379 struct super_block *sb = path->mnt->mnt_sb; 3380 struct mount *mnt = real_mount(path->mnt); 3381 struct fs_context *fc; 3382 3383 if (!check_mnt(mnt)) 3384 return -EINVAL; 3385 3386 if (!path_mounted(path)) 3387 return -EINVAL; 3388 3389 if (!can_change_locked_flags(mnt, mnt_flags)) 3390 return -EPERM; 3391 3392 fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); 3393 if (IS_ERR(fc)) 3394 return PTR_ERR(fc); 3395 3396 /* 3397 * Indicate to the filesystem that the remount request is coming 3398 * from the legacy mount system call. 3399 */ 3400 fc->oldapi = true; 3401 3402 err = parse_monolithic_mount_data(fc, data); 3403 if (!err) { 3404 down_write(&sb->s_umount); 3405 err = -EPERM; 3406 if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { 3407 err = reconfigure_super(fc); 3408 if (!err) { 3409 lock_mount_hash(); 3410 set_mount_attributes(mnt, mnt_flags); 3411 unlock_mount_hash(); 3412 } 3413 } 3414 up_write(&sb->s_umount); 3415 } 3416 3417 mnt_warn_timestamp_expiry(path, &mnt->mnt); 3418 3419 put_fs_context(fc); 3420 return err; 3421 } 3422 3423 static inline int tree_contains_unbindable(struct mount *mnt) 3424 { 3425 struct mount *p; 3426 for (p = mnt; p; p = next_mnt(p, mnt)) { 3427 if (IS_MNT_UNBINDABLE(p)) 3428 return 1; 3429 } 3430 return 0; 3431 } 3432 3433 static int do_set_group(const struct path *from_path, const struct path *to_path) 3434 { 3435 struct mount *from = real_mount(from_path->mnt); 3436 struct mount *to = real_mount(to_path->mnt); 3437 int err; 3438 3439 guard(namespace_excl)(); 3440 3441 err = may_change_propagation(from); 3442 if (err) 3443 return err; 3444 err = may_change_propagation(to); 3445 if (err) 3446 return err; 3447 3448 /* To and From paths should be mount roots */ 3449 if (!path_mounted(from_path)) 3450 return -EINVAL; 3451 if (!path_mounted(to_path)) 3452 return -EINVAL; 3453 3454 /* 
Setting sharing groups is only allowed across same superblock */ 3455 if (from->mnt.mnt_sb != to->mnt.mnt_sb) 3456 return -EINVAL; 3457 3458 /* From mount root should be wider than To mount root */ 3459 if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) 3460 return -EINVAL; 3461 3462 /* From mount should not have locked children in place of To's root */ 3463 if (__has_locked_children(from, to->mnt.mnt_root)) 3464 return -EINVAL; 3465 3466 /* Setting sharing groups is only allowed on private mounts */ 3467 if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) 3468 return -EINVAL; 3469 3470 /* From should not be private */ 3471 if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) 3472 return -EINVAL; 3473 3474 if (IS_MNT_SLAVE(from)) { 3475 hlist_add_behind(&to->mnt_slave, &from->mnt_slave); 3476 to->mnt_master = from->mnt_master; 3477 } 3478 3479 if (IS_MNT_SHARED(from)) { 3480 to->mnt_group_id = from->mnt_group_id; 3481 list_add(&to->mnt_share, &from->mnt_share); 3482 set_mnt_shared(to); 3483 } 3484 return 0; 3485 } 3486 3487 /** 3488 * path_overmounted - check if path is overmounted 3489 * @path: path to check 3490 * 3491 * Check if path is overmounted, i.e., if there's a mount on top of 3492 * @path->mnt with @path->dentry as mountpoint. 3493 * 3494 * Context: namespace_sem must be held at least shared. 3495 * MUST NOT be called under lock_mount_hash() (there one should just 3496 * call __lookup_mnt() and check if it returns NULL). 3497 * Return: If path is overmounted true is returned, false if not. 
3498 */ 3499 static inline bool path_overmounted(const struct path *path) 3500 { 3501 unsigned seq = read_seqbegin(&mount_lock); 3502 bool no_child; 3503 3504 rcu_read_lock(); 3505 no_child = !__lookup_mnt(path->mnt, path->dentry); 3506 rcu_read_unlock(); 3507 if (need_seqretry(&mount_lock, seq)) { 3508 read_seqlock_excl(&mount_lock); 3509 no_child = !__lookup_mnt(path->mnt, path->dentry); 3510 read_sequnlock_excl(&mount_lock); 3511 } 3512 return unlikely(!no_child); 3513 } 3514 3515 /* 3516 * Check if there is a possibly empty chain of descent from p1 to p2. 3517 * Locks: namespace_sem (shared) or mount_lock (read_seqlock_excl). 3518 */ 3519 static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) 3520 { 3521 while (p2 != p1 && mnt_has_parent(p2)) 3522 p2 = p2->mnt_parent; 3523 return p2 == p1; 3524 } 3525 3526 /** 3527 * can_move_mount_beneath - check that we can mount beneath the top mount 3528 * @mnt_from: mount we are trying to move 3529 * @mnt_to: mount under which to mount 3530 * @mp: mountpoint of @mnt_to 3531 * 3532 * - Make sure that the caller can unmount the topmost mount ensuring 3533 * that the caller could reveal the underlying mountpoint. 3534 * - Ensure that nothing has been mounted on top of @mnt_from before we 3535 * grabbed @namespace_sem to avoid creating pointless shadow mounts. 3536 * - Prevent mounting beneath a mount if the propagation relationship 3537 * between the source mount, parent mount, and top mount would lead to 3538 * nonsensical mount trees. 3539 * 3540 * Context: This function expects namespace_lock() to be held. 3541 * Return: On success 0, and on error a negative error code is returned. 3542 */ 3543 static int can_move_mount_beneath(const struct mount *mnt_from, 3544 const struct mount *mnt_to, 3545 struct pinned_mountpoint *mp) 3546 { 3547 struct mount *parent_mnt_to = mnt_to->mnt_parent; 3548 3549 /* Avoid creating shadow mounts during mount propagation. 
*/ 3550 if (mnt_from->overmount) 3551 return -EINVAL; 3552 3553 if (mount_is_ancestor(mnt_to, mnt_from)) 3554 return -EINVAL; 3555 3556 /* 3557 * If the parent mount propagates to the child mount this would 3558 * mean mounting @mnt_from on @mnt_to->mnt_parent and then 3559 * propagating a copy @c of @mnt_from on top of @mnt_to. This 3560 * defeats the whole purpose of mounting beneath another mount. 3561 */ 3562 if (propagation_would_overmount(parent_mnt_to, mnt_to, mp->mp)) 3563 return -EINVAL; 3564 3565 /* 3566 * If @mnt_to->mnt_parent propagates to @mnt_from this would 3567 * mean propagating a copy @c of @mnt_from on top of @mnt_from. 3568 * Afterwards @mnt_from would be mounted on top of 3569 * @mnt_to->mnt_parent and @mnt_to would be unmounted from 3570 * @mnt->mnt_parent and remounted on @mnt_from. But since @c is 3571 * already mounted on @mnt_from, @mnt_to would ultimately be 3572 * remounted on top of @c. Afterwards, @mnt_from would be 3573 * covered by a copy @c of @mnt_from and @c would be covered by 3574 * @mnt_from itself. This defeats the whole purpose of mounting 3575 * @mnt_from beneath @mnt_to. 3576 */ 3577 if (check_mnt(mnt_from) && 3578 propagation_would_overmount(parent_mnt_to, mnt_from, mp->mp)) 3579 return -EINVAL; 3580 3581 return 0; 3582 } 3583 3584 /* may_use_mount() - check if a mount tree can be used 3585 * @mnt: vfsmount to be used 3586 * 3587 * This helper checks if the caller may use the mount tree starting 3588 * from @path->mnt. The caller may use the mount tree under the 3589 * following circumstances: 3590 * 3591 * (1) The caller is located in the mount namespace of the mount tree. 3592 * This also implies that the mount does not belong to an anonymous 3593 * mount namespace. 3594 * (2) The caller is trying to use a mount tree that belongs to an 3595 * anonymous mount namespace. 
3596 * 3597 * For that to be safe, this helper enforces that the origin mount 3598 * namespace the anonymous mount namespace was created from is the 3599 * same as the caller's mount namespace by comparing the sequence 3600 * numbers. 3601 * 3602 * The ownership of a non-anonymous mount namespace such as the 3603 * caller's cannot change. 3604 * => We know that the caller's mount namespace is stable. 3605 * 3606 * If the origin sequence number of the anonymous mount namespace is 3607 * the same as the sequence number of the caller's mount namespace. 3608 * => The owning namespaces are the same. 3609 * 3610 * ==> The earlier capability check on the owning namespace of the 3611 * caller's mount namespace ensures that the caller has the 3612 * ability to use the mount tree. 3613 * 3614 * Returns true if the mount tree can be used, false otherwise. 3615 */ 3616 static inline bool may_use_mount(struct mount *mnt) 3617 { 3618 if (check_mnt(mnt)) 3619 return true; 3620 3621 /* 3622 * Make sure that noone unmounted the target path or somehow 3623 * managed to get their hands on something purely kernel 3624 * internal. 3625 */ 3626 if (!is_mounted(&mnt->mnt)) 3627 return false; 3628 3629 return check_anonymous_mnt(mnt); 3630 } 3631 3632 static int do_move_mount(const struct path *old_path, 3633 const struct path *new_path, 3634 enum mnt_tree_flags_t flags) 3635 { 3636 struct mount *old = real_mount(old_path->mnt); 3637 int err; 3638 bool beneath = flags & MNT_TREE_BENEATH; 3639 3640 if (!path_mounted(old_path)) 3641 return -EINVAL; 3642 3643 if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) 3644 return -EINVAL; 3645 3646 LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); 3647 if (IS_ERR(mp.parent)) 3648 return PTR_ERR(mp.parent); 3649 3650 if (check_mnt(old)) { 3651 /* if the source is in our namespace... */ 3652 /* ... it should be detachable from parent */ 3653 if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) 3654 return -EINVAL; 3655 /* ... 
which should not be shared */ 3656 if (IS_MNT_SHARED(old->mnt_parent)) 3657 return -EINVAL; 3658 /* ... and the target should be in our namespace */ 3659 if (!check_mnt(mp.parent)) 3660 return -EINVAL; 3661 } else { 3662 /* 3663 * otherwise the source must be the root of some anon namespace. 3664 */ 3665 if (!anon_ns_root(old)) 3666 return -EINVAL; 3667 /* 3668 * Bail out early if the target is within the same namespace - 3669 * subsequent checks would've rejected that, but they lose 3670 * some corner cases if we check it early. 3671 */ 3672 if (old->mnt_ns == mp.parent->mnt_ns) 3673 return -EINVAL; 3674 /* 3675 * Target should be either in our namespace or in an acceptable 3676 * anon namespace, sensu check_anonymous_mnt(). 3677 */ 3678 if (!may_use_mount(mp.parent)) 3679 return -EINVAL; 3680 } 3681 3682 if (beneath) { 3683 struct mount *over = real_mount(new_path->mnt); 3684 3685 if (mp.parent != over->mnt_parent) 3686 over = mp.parent->overmount; 3687 err = can_move_mount_beneath(old, over, &mp); 3688 if (err) 3689 return err; 3690 } 3691 3692 /* 3693 * Don't move a mount tree containing unbindable mounts to a destination 3694 * mount which is shared. 
3695 */ 3696 if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) 3697 return -EINVAL; 3698 if (!check_for_nsfs_mounts(old)) 3699 return -ELOOP; 3700 if (mount_is_ancestor(old, mp.parent)) 3701 return -ELOOP; 3702 3703 return attach_recursive_mnt(old, &mp); 3704 } 3705 3706 static int do_move_mount_old(const struct path *path, const char *old_name) 3707 { 3708 struct path old_path __free(path_put) = {}; 3709 int err; 3710 3711 if (!old_name || !*old_name) 3712 return -EINVAL; 3713 3714 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); 3715 if (err) 3716 return err; 3717 3718 return do_move_mount(&old_path, path, 0); 3719 } 3720 3721 /* 3722 * add a mount into a namespace's mount tree 3723 */ 3724 static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, 3725 int mnt_flags) 3726 { 3727 struct mount *parent = mp->parent; 3728 3729 if (IS_ERR(parent)) 3730 return PTR_ERR(parent); 3731 3732 mnt_flags &= ~MNT_INTERNAL_FLAGS; 3733 3734 if (unlikely(!check_mnt(parent))) { 3735 /* that's acceptable only for automounts done in private ns */ 3736 if (!(mnt_flags & MNT_SHRINKABLE)) 3737 return -EINVAL; 3738 /* ... and for those we'd better have mountpoint still alive */ 3739 if (!parent->mnt_ns) 3740 return -EINVAL; 3741 } 3742 3743 /* Refuse the same filesystem on the same mount point */ 3744 if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && 3745 parent->mnt.mnt_root == mp->mp->m_dentry) 3746 return -EBUSY; 3747 3748 if (d_is_symlink(newmnt->mnt.mnt_root)) 3749 return -EINVAL; 3750 3751 newmnt->mnt.mnt_flags = mnt_flags; 3752 return graft_tree(newmnt, mp); 3753 } 3754 3755 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); 3756 3757 /* 3758 * Create a new mount using a superblock configuration and request it 3759 * be added to the namespace tree. 
3760 */ 3761 static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, 3762 unsigned int mnt_flags) 3763 { 3764 struct super_block *sb; 3765 struct vfsmount *mnt __free(mntput) = fc_mount(fc); 3766 int error; 3767 3768 if (IS_ERR(mnt)) 3769 return PTR_ERR(mnt); 3770 3771 sb = fc->root->d_sb; 3772 error = security_sb_kern_mount(sb); 3773 if (unlikely(error)) 3774 return error; 3775 3776 if (unlikely(mount_too_revealing(sb, &mnt_flags))) { 3777 errorfcp(fc, "VFS", "Mount too revealing"); 3778 return -EPERM; 3779 } 3780 3781 mnt_warn_timestamp_expiry(mountpoint, mnt); 3782 3783 LOCK_MOUNT(mp, mountpoint); 3784 error = do_add_mount(real_mount(mnt), &mp, mnt_flags); 3785 if (!error) 3786 retain_and_null_ptr(mnt); // consumed on success 3787 return error; 3788 } 3789 3790 /* 3791 * create a new mount for userspace and request it to be added into the 3792 * namespace's tree 3793 */ 3794 static int do_new_mount(const struct path *path, const char *fstype, 3795 int sb_flags, int mnt_flags, 3796 const char *name, void *data) 3797 { 3798 struct file_system_type *type; 3799 struct fs_context *fc; 3800 const char *subtype = NULL; 3801 int err = 0; 3802 3803 if (!fstype) 3804 return -EINVAL; 3805 3806 type = get_fs_type(fstype); 3807 if (!type) 3808 return -ENODEV; 3809 3810 if (type->fs_flags & FS_HAS_SUBTYPE) { 3811 subtype = strchr(fstype, '.'); 3812 if (subtype) { 3813 subtype++; 3814 if (!*subtype) { 3815 put_filesystem(type); 3816 return -EINVAL; 3817 } 3818 } 3819 } 3820 3821 fc = fs_context_for_mount(type, sb_flags); 3822 put_filesystem(type); 3823 if (IS_ERR(fc)) 3824 return PTR_ERR(fc); 3825 3826 /* 3827 * Indicate to the filesystem that the mount request is coming 3828 * from the legacy mount system call. 
3829 */ 3830 fc->oldapi = true; 3831 3832 if (subtype) 3833 err = vfs_parse_fs_string(fc, "subtype", subtype); 3834 if (!err && name) 3835 err = vfs_parse_fs_string(fc, "source", name); 3836 if (!err) 3837 err = parse_monolithic_mount_data(fc, data); 3838 if (!err && !mount_capable(fc)) 3839 err = -EPERM; 3840 if (!err) 3841 err = do_new_mount_fc(fc, path, mnt_flags); 3842 3843 put_fs_context(fc); 3844 return err; 3845 } 3846 3847 static void lock_mount_exact(const struct path *path, 3848 struct pinned_mountpoint *mp, bool copy_mount, 3849 unsigned int copy_flags) 3850 { 3851 struct dentry *dentry = path->dentry; 3852 int err; 3853 3854 /* Assert that inode_lock() locked the correct inode. */ 3855 VFS_WARN_ON_ONCE(copy_mount && !path_mounted(path)); 3856 3857 inode_lock(dentry->d_inode); 3858 namespace_lock(); 3859 if (unlikely(cant_mount(dentry))) 3860 err = -ENOENT; 3861 else if (!copy_mount && path_overmounted(path)) 3862 err = -EBUSY; 3863 else 3864 err = get_mountpoint(dentry, mp); 3865 if (unlikely(err)) { 3866 namespace_unlock(); 3867 inode_unlock(dentry->d_inode); 3868 mp->parent = ERR_PTR(err); 3869 return; 3870 } 3871 3872 if (copy_mount) 3873 mp->parent = clone_mnt(real_mount(path->mnt), dentry, copy_flags); 3874 else 3875 mp->parent = real_mount(path->mnt); 3876 if (unlikely(IS_ERR(mp->parent))) 3877 __unlock_mount(mp); 3878 } 3879 3880 int finish_automount(struct vfsmount *__m, const struct path *path) 3881 { 3882 struct vfsmount *m __free(mntput) = __m; 3883 struct mount *mnt; 3884 int err; 3885 3886 if (!m) 3887 return 0; 3888 if (IS_ERR(m)) 3889 return PTR_ERR(m); 3890 3891 mnt = real_mount(m); 3892 3893 if (m->mnt_root == path->dentry) 3894 return -ELOOP; 3895 3896 /* 3897 * we don't want to use LOCK_MOUNT() - in this case finding something 3898 * that overmounts our mountpoint to be means "quitely drop what we've 3899 * got", not "try to mount it on top". 
3900 */ 3901 LOCK_MOUNT_EXACT(mp, path); 3902 if (mp.parent == ERR_PTR(-EBUSY)) 3903 return 0; 3904 3905 err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); 3906 if (likely(!err)) 3907 retain_and_null_ptr(m); 3908 return err; 3909 } 3910 3911 /** 3912 * mnt_set_expiry - Put a mount on an expiration list 3913 * @mnt: The mount to list. 3914 * @expiry_list: The list to add the mount to. 3915 */ 3916 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) 3917 { 3918 guard(mount_locked_reader)(); 3919 list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); 3920 } 3921 EXPORT_SYMBOL(mnt_set_expiry); 3922 3923 /* 3924 * process a list of expirable mountpoints with the intent of discarding any 3925 * mountpoints that aren't in use and haven't been touched since last we came 3926 * here 3927 */ 3928 void mark_mounts_for_expiry(struct list_head *mounts) 3929 { 3930 struct mount *mnt, *next; 3931 LIST_HEAD(graveyard); 3932 3933 if (list_empty(mounts)) 3934 return; 3935 3936 guard(namespace_excl)(); 3937 guard(mount_writer)(); 3938 3939 /* extract from the expiration list every vfsmount that matches the 3940 * following criteria: 3941 * - already mounted 3942 * - only referenced by its parent vfsmount 3943 * - still marked for expiry (marked on the last call here; marks are 3944 * cleared by mntput()) 3945 */ 3946 list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { 3947 if (!is_mounted(&mnt->mnt)) 3948 continue; 3949 if (!xchg(&mnt->mnt_expiry_mark, 1) || 3950 propagate_mount_busy(mnt, 1)) 3951 continue; 3952 list_move(&mnt->mnt_expire, &graveyard); 3953 } 3954 while (!list_empty(&graveyard)) { 3955 mnt = list_first_entry(&graveyard, struct mount, mnt_expire); 3956 touch_mnt_namespace(mnt->mnt_ns); 3957 umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); 3958 } 3959 } 3960 3961 EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); 3962 3963 /* 3964 * Ripoff of 'select_parent()' 3965 * 3966 * search the list of submounts for a given mountpoint, and 
move any 3967 * shrinkable submounts to the 'graveyard' list. 3968 */ 3969 static int select_submounts(struct mount *parent, struct list_head *graveyard) 3970 { 3971 struct mount *this_parent = parent; 3972 struct list_head *next; 3973 int found = 0; 3974 3975 repeat: 3976 next = this_parent->mnt_mounts.next; 3977 resume: 3978 while (next != &this_parent->mnt_mounts) { 3979 struct list_head *tmp = next; 3980 struct mount *mnt = list_entry(tmp, struct mount, mnt_child); 3981 3982 next = tmp->next; 3983 if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) 3984 continue; 3985 /* 3986 * Descend a level if the d_mounts list is non-empty. 3987 */ 3988 if (!list_empty(&mnt->mnt_mounts)) { 3989 this_parent = mnt; 3990 goto repeat; 3991 } 3992 3993 if (!propagate_mount_busy(mnt, 1)) { 3994 list_move_tail(&mnt->mnt_expire, graveyard); 3995 found++; 3996 } 3997 } 3998 /* 3999 * All done at this level ... ascend and resume the search 4000 */ 4001 if (this_parent != parent) { 4002 next = this_parent->mnt_child.next; 4003 this_parent = this_parent->mnt_parent; 4004 goto resume; 4005 } 4006 return found; 4007 } 4008 4009 /* 4010 * process a list of expirable mountpoints with the intent of discarding any 4011 * submounts of a specific parent mountpoint 4012 * 4013 * mount_lock must be held for write 4014 */ 4015 static void shrink_submounts(struct mount *mnt) 4016 { 4017 LIST_HEAD(graveyard); 4018 struct mount *m; 4019 4020 /* extract submounts of 'mountpoint' from the expiration list */ 4021 while (select_submounts(mnt, &graveyard)) { 4022 while (!list_empty(&graveyard)) { 4023 m = list_first_entry(&graveyard, struct mount, 4024 mnt_expire); 4025 touch_mnt_namespace(m->mnt_ns); 4026 umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); 4027 } 4028 } 4029 } 4030 4031 static void *copy_mount_options(const void __user * data) 4032 { 4033 char *copy; 4034 unsigned left, offset; 4035 4036 if (!data) 4037 return NULL; 4038 4039 copy = kmalloc(PAGE_SIZE, GFP_KERNEL); 4040 if (!copy) 4041 return 
ERR_PTR(-ENOMEM); 4042 4043 left = copy_from_user(copy, data, PAGE_SIZE); 4044 4045 /* 4046 * Not all architectures have an exact copy_from_user(). Resort to 4047 * byte at a time. 4048 */ 4049 offset = PAGE_SIZE - left; 4050 while (left) { 4051 char c; 4052 if (get_user(c, (const char __user *)data + offset)) 4053 break; 4054 copy[offset] = c; 4055 left--; 4056 offset++; 4057 } 4058 4059 if (left == PAGE_SIZE) { 4060 kfree(copy); 4061 return ERR_PTR(-EFAULT); 4062 } 4063 4064 return copy; 4065 } 4066 4067 static char *copy_mount_string(const void __user *data) 4068 { 4069 return data ? strndup_user(data, PATH_MAX) : NULL; 4070 } 4071 4072 /* 4073 * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to 4074 * be given to the mount() call (ie: read-only, no-dev, no-suid etc). 4075 * 4076 * data is a (void *) that can point to any structure up to 4077 * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent 4078 * information (or be NULL). 4079 * 4080 * Pre-0.97 versions of mount() didn't have a flags word. 4081 * When the flags word was introduced its top half was required 4082 * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. 4083 * Therefore, if this magic number is present, it carries no information 4084 * and must be discarded. 
4085 */ 4086 int path_mount(const char *dev_name, const struct path *path, 4087 const char *type_page, unsigned long flags, void *data_page) 4088 { 4089 unsigned int mnt_flags = 0, sb_flags; 4090 int ret; 4091 4092 /* Discard magic */ 4093 if ((flags & MS_MGC_MSK) == MS_MGC_VAL) 4094 flags &= ~MS_MGC_MSK; 4095 4096 /* Basic sanity checks */ 4097 if (data_page) 4098 ((char *)data_page)[PAGE_SIZE - 1] = 0; 4099 4100 if (flags & MS_NOUSER) 4101 return -EINVAL; 4102 4103 ret = security_sb_mount(dev_name, path, type_page, flags, data_page); 4104 if (ret) 4105 return ret; 4106 if (!may_mount()) 4107 return -EPERM; 4108 if (flags & SB_MANDLOCK) 4109 warn_mandlock(); 4110 4111 /* Default to relatime unless overriden */ 4112 if (!(flags & MS_NOATIME)) 4113 mnt_flags |= MNT_RELATIME; 4114 4115 /* Separate the per-mountpoint flags */ 4116 if (flags & MS_NOSUID) 4117 mnt_flags |= MNT_NOSUID; 4118 if (flags & MS_NODEV) 4119 mnt_flags |= MNT_NODEV; 4120 if (flags & MS_NOEXEC) 4121 mnt_flags |= MNT_NOEXEC; 4122 if (flags & MS_NOATIME) 4123 mnt_flags |= MNT_NOATIME; 4124 if (flags & MS_NODIRATIME) 4125 mnt_flags |= MNT_NODIRATIME; 4126 if (flags & MS_STRICTATIME) 4127 mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); 4128 if (flags & MS_RDONLY) 4129 mnt_flags |= MNT_READONLY; 4130 if (flags & MS_NOSYMFOLLOW) 4131 mnt_flags |= MNT_NOSYMFOLLOW; 4132 4133 /* The default atime for remount is preservation */ 4134 if ((flags & MS_REMOUNT) && 4135 ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | 4136 MS_STRICTATIME)) == 0)) { 4137 mnt_flags &= ~MNT_ATIME_MASK; 4138 mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; 4139 } 4140 4141 sb_flags = flags & (SB_RDONLY | 4142 SB_SYNCHRONOUS | 4143 SB_MANDLOCK | 4144 SB_DIRSYNC | 4145 SB_SILENT | 4146 SB_POSIXACL | 4147 SB_LAZYTIME | 4148 SB_I_VERSION); 4149 4150 if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) 4151 return do_reconfigure_mnt(path, mnt_flags); 4152 if (flags & MS_REMOUNT) 4153 return do_remount(path, sb_flags, 
mnt_flags, data_page); 4154 if (flags & MS_BIND) 4155 return do_loopback(path, dev_name, flags & MS_REC); 4156 if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) 4157 return do_change_type(path, flags); 4158 if (flags & MS_MOVE) 4159 return do_move_mount_old(path, dev_name); 4160 4161 return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, 4162 data_page); 4163 } 4164 4165 int do_mount(const char *dev_name, const char __user *dir_name, 4166 const char *type_page, unsigned long flags, void *data_page) 4167 { 4168 struct path path __free(path_put) = {}; 4169 int ret; 4170 4171 ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); 4172 if (ret) 4173 return ret; 4174 return path_mount(dev_name, &path, type_page, flags, data_page); 4175 } 4176 4177 static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) 4178 { 4179 return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); 4180 } 4181 4182 static void dec_mnt_namespaces(struct ucounts *ucounts) 4183 { 4184 dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); 4185 } 4186 4187 static void free_mnt_ns(struct mnt_namespace *ns) 4188 { 4189 if (!is_anon_ns(ns)) 4190 ns_common_free(ns); 4191 dec_mnt_namespaces(ns->ucounts); 4192 mnt_ns_tree_remove(ns); 4193 } 4194 4195 static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) 4196 { 4197 struct mnt_namespace *new_ns; 4198 struct ucounts *ucounts; 4199 int ret; 4200 4201 ucounts = inc_mnt_namespaces(user_ns); 4202 if (!ucounts) 4203 return ERR_PTR(-ENOSPC); 4204 4205 new_ns = kzalloc_obj(struct mnt_namespace, GFP_KERNEL_ACCOUNT); 4206 if (!new_ns) { 4207 dec_mnt_namespaces(ucounts); 4208 return ERR_PTR(-ENOMEM); 4209 } 4210 4211 if (anon) 4212 ret = ns_common_init_inum(new_ns, MNT_NS_ANON_INO); 4213 else 4214 ret = ns_common_init(new_ns); 4215 if (ret) { 4216 kfree(new_ns); 4217 dec_mnt_namespaces(ucounts); 4218 return ERR_PTR(ret); 4219 } 4220 ns_tree_gen_id(new_ns); 4221 4222 new_ns->is_anon = anon; 4223 
refcount_set(&new_ns->passive, 1); 4224 new_ns->mounts = RB_ROOT; 4225 init_waitqueue_head(&new_ns->poll); 4226 new_ns->user_ns = get_user_ns(user_ns); 4227 new_ns->ucounts = ucounts; 4228 return new_ns; 4229 } 4230 4231 __latent_entropy 4232 struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, 4233 struct user_namespace *user_ns, struct fs_struct *new_fs) 4234 { 4235 struct mnt_namespace *new_ns; 4236 struct path old_root __free(path_put) = {}; 4237 struct path old_pwd __free(path_put) = {}; 4238 struct mount *p, *q; 4239 struct mount *old; 4240 struct mount *new; 4241 int copy_flags; 4242 4243 BUG_ON(!ns); 4244 4245 if (likely(!(flags & CLONE_NEWNS))) { 4246 get_mnt_ns(ns); 4247 return ns; 4248 } 4249 4250 old = ns->root; 4251 4252 new_ns = alloc_mnt_ns(user_ns, false); 4253 if (IS_ERR(new_ns)) 4254 return new_ns; 4255 4256 guard(namespace_excl)(); 4257 4258 if (flags & CLONE_EMPTY_MNTNS) 4259 copy_flags = 0; 4260 else 4261 copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; 4262 if (user_ns != ns->user_ns) 4263 copy_flags |= CL_SLAVE; 4264 4265 if (flags & CLONE_EMPTY_MNTNS) 4266 new = clone_mnt(old, old->mnt.mnt_root, copy_flags); 4267 else 4268 new = copy_tree(old, old->mnt.mnt_root, copy_flags); 4269 if (IS_ERR(new)) { 4270 emptied_ns = new_ns; 4271 return ERR_CAST(new); 4272 } 4273 if (user_ns != ns->user_ns) { 4274 guard(mount_writer)(); 4275 lock_mnt_tree(new); 4276 } 4277 new_ns->root = new; 4278 4279 if (flags & CLONE_EMPTY_MNTNS) { 4280 /* 4281 * Empty mount namespace: only the root mount exists. 4282 * Reset root and pwd to the cloned mount's root dentry. 
4283 */ 4284 if (new_fs) { 4285 old_root = new_fs->root; 4286 old_pwd = new_fs->pwd; 4287 4288 new_fs->root.mnt = mntget(&new->mnt); 4289 new_fs->root.dentry = dget(new->mnt.mnt_root); 4290 4291 new_fs->pwd.mnt = mntget(&new->mnt); 4292 new_fs->pwd.dentry = dget(new->mnt.mnt_root); 4293 } 4294 mnt_add_to_ns(new_ns, new); 4295 new_ns->nr_mounts++; 4296 } else { 4297 /* 4298 * Full copy: walk old and new trees in parallel, switching 4299 * the tsk->fs->* elements and marking new vfsmounts as 4300 * belonging to new namespace. We have already acquired a 4301 * private fs_struct, so tsk->fs->lock is not needed. 4302 */ 4303 p = old; 4304 q = new; 4305 while (p) { 4306 mnt_add_to_ns(new_ns, q); 4307 new_ns->nr_mounts++; 4308 if (new_fs) { 4309 if (&p->mnt == new_fs->root.mnt) { 4310 old_root.mnt = new_fs->root.mnt; 4311 new_fs->root.mnt = mntget(&q->mnt); 4312 } 4313 if (&p->mnt == new_fs->pwd.mnt) { 4314 old_pwd.mnt = new_fs->pwd.mnt; 4315 new_fs->pwd.mnt = mntget(&q->mnt); 4316 } 4317 } 4318 p = next_mnt(p, old); 4319 q = next_mnt(q, new); 4320 if (!q) 4321 break; 4322 // an mntns binding we'd skipped? 
4323 while (p->mnt.mnt_root != q->mnt.mnt_root) 4324 p = next_mnt(skip_mnt_tree(p), old); 4325 } 4326 } 4327 ns_tree_add_raw(new_ns); 4328 return new_ns; 4329 } 4330 4331 struct dentry *mount_subtree(struct vfsmount *m, const char *name) 4332 { 4333 struct mount *mnt = real_mount(m); 4334 struct mnt_namespace *ns; 4335 struct super_block *s; 4336 struct path path; 4337 int err; 4338 4339 ns = alloc_mnt_ns(&init_user_ns, true); 4340 if (IS_ERR(ns)) { 4341 mntput(m); 4342 return ERR_CAST(ns); 4343 } 4344 ns->root = mnt; 4345 ns->nr_mounts++; 4346 mnt_add_to_ns(ns, mnt); 4347 4348 err = vfs_path_lookup(m->mnt_root, m, 4349 name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); 4350 4351 put_mnt_ns(ns); 4352 4353 if (err) 4354 return ERR_PTR(err); 4355 4356 /* trade a vfsmount reference for active sb one */ 4357 s = path.mnt->mnt_sb; 4358 atomic_inc(&s->s_active); 4359 mntput(path.mnt); 4360 /* lock the sucker */ 4361 down_write(&s->s_umount); 4362 /* ... and return the root of (sub)tree on it */ 4363 return path.dentry; 4364 } 4365 EXPORT_SYMBOL(mount_subtree); 4366 4367 SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, 4368 char __user *, type, unsigned long, flags, void __user *, data) 4369 { 4370 int ret; 4371 char *kernel_type; 4372 char *kernel_dev; 4373 void *options; 4374 4375 kernel_type = copy_mount_string(type); 4376 ret = PTR_ERR(kernel_type); 4377 if (IS_ERR(kernel_type)) 4378 goto out_type; 4379 4380 kernel_dev = copy_mount_string(dev_name); 4381 ret = PTR_ERR(kernel_dev); 4382 if (IS_ERR(kernel_dev)) 4383 goto out_dev; 4384 4385 options = copy_mount_options(data); 4386 ret = PTR_ERR(options); 4387 if (IS_ERR(options)) 4388 goto out_data; 4389 4390 ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); 4391 4392 kfree(options); 4393 out_data: 4394 kfree(kernel_dev); 4395 out_dev: 4396 kfree(kernel_type); 4397 out_type: 4398 return ret; 4399 } 4400 4401 #define FSMOUNT_VALID_FLAGS \ 4402 (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | 
MOUNT_ATTR_NODEV | \ 4403 MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME | \ 4404 MOUNT_ATTR_NOSYMFOLLOW) 4405 4406 #define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) 4407 4408 #define MOUNT_SETATTR_PROPAGATION_FLAGS \ 4409 (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) 4410 4411 static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) 4412 { 4413 unsigned int mnt_flags = 0; 4414 4415 if (attr_flags & MOUNT_ATTR_RDONLY) 4416 mnt_flags |= MNT_READONLY; 4417 if (attr_flags & MOUNT_ATTR_NOSUID) 4418 mnt_flags |= MNT_NOSUID; 4419 if (attr_flags & MOUNT_ATTR_NODEV) 4420 mnt_flags |= MNT_NODEV; 4421 if (attr_flags & MOUNT_ATTR_NOEXEC) 4422 mnt_flags |= MNT_NOEXEC; 4423 if (attr_flags & MOUNT_ATTR_NODIRATIME) 4424 mnt_flags |= MNT_NODIRATIME; 4425 if (attr_flags & MOUNT_ATTR_NOSYMFOLLOW) 4426 mnt_flags |= MNT_NOSYMFOLLOW; 4427 4428 return mnt_flags; 4429 } 4430 4431 /* 4432 * Create a kernel mount representation for a new, prepared superblock 4433 * (specified by fs_fd) and attach to an open_tree-like file descriptor. 
/*
 * Create a kernel mount representation for a new, prepared superblock
 * (specified by fs_fd) and attach to an open_tree-like file descriptor.
 *
 * @fs_fd:      descriptor whose file must use fscontext_fops
 * @flags:      FSMOUNT_CLOEXEC and/or FSMOUNT_NAMESPACE; any other bit is
 *              rejected with -EINVAL
 * @attr_flags: MOUNT_ATTR_* values, restricted to FSMOUNT_VALID_FLAGS
 *
 * Returns the value produced by FD_ADD()/fd_publish() on success and a
 * negative errno otherwise.
 */
SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
		unsigned int, attr_flags)
{
	struct path new_path __free(path_put) = {};
	struct mnt_namespace *ns;
	struct fs_context *fc;
	struct vfsmount *new_mnt;
	struct mount *mnt;
	unsigned int mnt_flags = 0;
	long ret;

	if ((flags & ~(FSMOUNT_CLOEXEC | FSMOUNT_NAMESPACE)) != 0)
		return -EINVAL;

	/*
	 * Two different permission gates: FSMOUNT_NAMESPACE needs
	 * CAP_SYS_ADMIN in the caller's user namespace, everything else
	 * goes through may_mount().
	 */
	if ((flags & FSMOUNT_NAMESPACE) &&
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	if (!(flags & FSMOUNT_NAMESPACE) && !may_mount())
		return -EPERM;

	if (attr_flags & ~FSMOUNT_VALID_FLAGS)
		return -EINVAL;

	mnt_flags = attr_flags_to_mnt_flags(attr_flags);

	/*
	 * The atime group is handled separately from the boolean bits
	 * above: only the three listed values are accepted, any other
	 * bit pattern within MOUNT_ATTR__ATIME is -EINVAL.
	 */
	switch (attr_flags & MOUNT_ATTR__ATIME) {
	case MOUNT_ATTR_STRICTATIME:
		break;
	case MOUNT_ATTR_NOATIME:
		mnt_flags |= MNT_NOATIME;
		break;
	case MOUNT_ATTR_RELATIME:
		mnt_flags |= MNT_RELATIME;
		break;
	default:
		return -EINVAL;
	}

	CLASS(fd, f)(fs_fd);
	if (fd_empty(f))
		return -EBADF;

	/* Only fscontext files may have private_data read as an fs_context */
	if (fd_file(f)->f_op != &fscontext_fops)
		return -EINVAL;

	fc = fd_file(f)->private_data;

	/* Interruptible acquisition of the context's uapi_mutex */
	ACQUIRE(mutex_intr, uapi_mutex)(&fc->uapi_mutex);
	ret = ACQUIRE_ERR(mutex_intr, &uapi_mutex);
	if (ret)
		return ret;

	/* There must be a valid superblock or we can't mount it */
	ret = -EINVAL;
	if (!fc->root)
		return ret;

	ret = -EPERM;
	if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) {
		errorfcp(fc, "VFS", "Mount too revealing");
		return ret;
	}

	ret = -EBUSY;
	if (fc->phase != FS_CONTEXT_AWAITING_MOUNT)
		return ret;

	if (fc->sb_flags & SB_MANDLOCK)
		warn_mandlock();

	new_mnt = vfs_create_mount(fc);
	if (IS_ERR(new_mnt))
		return PTR_ERR(new_mnt);
	/* Superblocks flagged SB_NOUSER are refused; drop the new mount */
	if (new_mnt->mnt_sb->s_flags & SB_NOUSER) {
		mntput(new_mnt);
		return -EINVAL;
	}
	new_mnt->mnt_flags = mnt_flags;

	/*
	 * From here on new_path holds the mount and a dentry reference;
	 * both are released by the __free(path_put) annotation when
	 * new_path goes out of scope.
	 */
	new_path.dentry = dget(fc->root);
	new_path.mnt = new_mnt;

	/* We've done the mount bit - now move the file context into more or
	 * less the same state as if we'd done an fspick().  We don't want to
	 * do any memory allocation or anything like that at this point as we
	 * don't want to have to handle any errors incurred.
	 */
	vfs_clean_context(fc);

	if (flags & FSMOUNT_NAMESPACE)
		return FD_ADD((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
			      open_new_namespace(&new_path, MOUNT_COPY_NEW));

	/* Otherwise the mount becomes the root of a newly allocated namespace */
	ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true);
	if (IS_ERR(ns))
		return PTR_ERR(ns);
	mnt = real_mount(new_path.mnt);
	ns->root = mnt;
	ns->nr_mounts = 1;
	mnt_add_to_ns(ns, mnt);
	/* Extra mount reference, taken in addition to the one in new_path */
	mntget(new_path.mnt);

	FD_PREPARE(fdf, (flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0,
		   dentry_open(&new_path, O_PATH, fc->cred));
	if (fdf.err) {
		dissolve_on_fput(new_path.mnt);
		return fdf.err;
	}

	/*
	 * Attach to an apparent O_PATH fd with a note that we
	 * need to unmount it, not just simply put it.
	 */
	fd_prepare_file(fdf)->f_mode |= FMODE_NEED_UNMOUNT;
	return fd_publish(fdf);
}
/*
 * Move a mount from one place to another.  In combination with
 * fsopen()/fsmount() this is used to install a new mount and in combination
 * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy
 * a mount subtree.
 *
 * Note the flags value is a combination of MOVE_MOUNT_* flags.
 *
 * The destination is resolved first, then the source.  For either side, when
 * no filename object was produced and the dfd is non-negative, the path of
 * the file behind that descriptor is used directly instead of a lookup.
 *
 * Returns 0 or a negative errno: -EPERM if may_mount() refuses, -EINVAL for
 * unknown flags or for MOVE_MOUNT_BENEATH combined with MOVE_MOUNT_SET_GROUP,
 * -EBADF for an empty descriptor, or whatever the lookup / vfs_move_mount()
 * reports.
 */
SYSCALL_DEFINE5(move_mount,
		int, from_dfd, const char __user *, from_pathname,
		int, to_dfd, const char __user *, to_pathname,
		unsigned int, flags)
{
	struct path to_path __free(path_put) = {};
	struct path from_path __free(path_put) = {};
	unsigned int lflags, uflags;
	enum mnt_tree_flags_t mflags = 0;
	int ret = 0;

	if (!may_mount())
		return -EPERM;

	if (flags & ~MOVE_MOUNT__MASK)
		return -EINVAL;

	/* BENEATH and SET_GROUP are mutually exclusive */
	if ((flags & (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) ==
	    (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
		return -EINVAL;

	/* Translate the uapi flags into the internal mnt_tree_flags_t bits */
	if (flags & MOVE_MOUNT_SET_GROUP)	mflags |= MNT_TREE_PROPAGATION;
	if (flags & MOVE_MOUNT_BENEATH)		mflags |= MNT_TREE_BENEATH;

	/* --- destination --- */
	uflags = 0;
	if (flags & MOVE_MOUNT_T_EMPTY_PATH)
		uflags = AT_EMPTY_PATH;

	CLASS(filename_maybe_null,to_name)(to_pathname, uflags);
	if (!to_name && to_dfd >= 0) {
		CLASS(fd_raw, f_to)(to_dfd);
		if (fd_empty(f_to))
			return -EBADF;

		/*
		 * Copy the file's path and take our own reference: f_to is
		 * scoped to this block, while to_path is used after it.
		 */
		to_path = fd_file(f_to)->f_path;
		path_get(&to_path);
	} else {
		lflags = 0;
		if (flags & MOVE_MOUNT_T_SYMLINKS)
			lflags |= LOOKUP_FOLLOW;
		if (flags & MOVE_MOUNT_T_AUTOMOUNTS)
			lflags |= LOOKUP_AUTOMOUNT;
		ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
		if (ret)
			return ret;
	}

	/* --- source --- */
	uflags = 0;
	if (flags & MOVE_MOUNT_F_EMPTY_PATH)
		uflags = AT_EMPTY_PATH;

	CLASS(filename_maybe_null,from_name)(from_pathname, uflags);
	if (!from_name && from_dfd >= 0) {
		CLASS(fd_raw, f_from)(from_dfd);
		if (fd_empty(f_from))
			return -EBADF;

		/*
		 * The file's own f_path is passed directly; f_from stays in
		 * scope across the call, so from_path is left untouched.
		 */
		return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
	}

	lflags = 0;
	if (flags & MOVE_MOUNT_F_SYMLINKS)
		lflags |= LOOKUP_FOLLOW;
	if (flags & MOVE_MOUNT_F_AUTOMOUNTS)
		lflags |= LOOKUP_AUTOMOUNT;
	ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
	if (ret)
		return ret;

	return vfs_move_mount(&from_path, &to_path, mflags);
}
4707 if (!path_mounted(&root)) 4708 return -EINVAL; /* not a mountpoint */ 4709 if (!mnt_has_parent(root_mnt)) 4710 return -EINVAL; /* absolute root */ 4711 if (!path_mounted(new)) 4712 return -EINVAL; /* not a mountpoint */ 4713 if (!mnt_has_parent(new_mnt)) 4714 return -EINVAL; /* absolute root */ 4715 /* make sure we can reach put_old from new_root */ 4716 if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, new)) 4717 return -EINVAL; 4718 /* make certain new is below the root */ 4719 if (!is_path_reachable(new_mnt, new->dentry, &root)) 4720 return -EINVAL; 4721 lock_mount_hash(); 4722 umount_mnt(new_mnt); 4723 if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { 4724 new_mnt->mnt.mnt_flags |= MNT_LOCKED; 4725 root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; 4726 } 4727 /* mount new_root on / */ 4728 attach_mnt(new_mnt, root_parent, root_mnt->mnt_mp); 4729 umount_mnt(root_mnt); 4730 /* mount old root on put_old */ 4731 attach_mnt(root_mnt, old_mnt, old_mp.mp); 4732 touch_mnt_namespace(current->nsproxy->mnt_ns); 4733 /* A moved mount should not expire automatically */ 4734 list_del_init(&new_mnt->mnt_expire); 4735 unlock_mount_hash(); 4736 mnt_notify_add(root_mnt); 4737 mnt_notify_add(new_mnt); 4738 chroot_fs_refs(&root, new); 4739 return 0; 4740 } 4741 4742 /* 4743 * pivot_root Semantics: 4744 * Moves the root file system of the current process to the directory put_old, 4745 * makes new_root as the new root file system of the current process, and sets 4746 * root/cwd of all processes which had them on the current root to new_root. 4747 * 4748 * Restrictions: 4749 * The new_root and put_old must be directories, and must not be on the 4750 * same file system as the current process root. The put_old must be 4751 * underneath new_root, i.e. adding a non-zero number of /.. to the string 4752 * pointed to by put_old must yield the same directory as new_root. No other 4753 * file system may be mounted on put_old. After all, new_root is a mountpoint. 
4754 * 4755 * The immutable nullfs filesystem is mounted as the true root of the VFS 4756 * hierarchy. The mutable rootfs (tmpfs/ramfs) is layered on top of this, 4757 * allowing pivot_root() to work normally from initramfs. 4758 * 4759 * Notes: 4760 * - we don't move root/cwd if they are not at the root (reason: if something 4761 * cared enough to change them, it's probably wrong to force them elsewhere) 4762 * - it's okay to pick a root that isn't the root of a file system, e.g. 4763 * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, 4764 * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root 4765 * first. 4766 */ 4767 SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, 4768 const char __user *, put_old) 4769 { 4770 struct path new __free(path_put) = {}; 4771 struct path old __free(path_put) = {}; 4772 int error; 4773 4774 error = user_path_at(AT_FDCWD, new_root, 4775 LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); 4776 if (error) 4777 return error; 4778 4779 error = user_path_at(AT_FDCWD, put_old, 4780 LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); 4781 if (error) 4782 return error; 4783 4784 return path_pivot_root(&new, &old); 4785 } 4786 4787 static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) 4788 { 4789 unsigned int flags = mnt->mnt.mnt_flags; 4790 4791 /* flags to clear */ 4792 flags &= ~kattr->attr_clr; 4793 /* flags to raise */ 4794 flags |= kattr->attr_set; 4795 4796 return flags; 4797 } 4798 4799 static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) 4800 { 4801 struct vfsmount *m = &mnt->mnt; 4802 struct user_namespace *fs_userns = m->mnt_sb->s_user_ns; 4803 4804 if (!kattr->mnt_idmap) 4805 return 0; 4806 4807 /* 4808 * Creating an idmapped mount with the filesystem wide idmapping 4809 * doesn't make sense so block that. We don't allow mushy semantics. 
4810 */ 4811 if (kattr->mnt_userns == m->mnt_sb->s_user_ns) 4812 return -EINVAL; 4813 4814 /* 4815 * We only allow an mount to change it's idmapping if it has 4816 * never been accessible to userspace. 4817 */ 4818 if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) 4819 return -EPERM; 4820 4821 /* The underlying filesystem doesn't support idmapped mounts yet. */ 4822 if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) 4823 return -EINVAL; 4824 4825 /* The filesystem has turned off idmapped mounts. */ 4826 if (m->mnt_sb->s_iflags & SB_I_NOIDMAP) 4827 return -EINVAL; 4828 4829 /* We're not controlling the superblock. */ 4830 if (!ns_capable(fs_userns, CAP_SYS_ADMIN)) 4831 return -EPERM; 4832 4833 /* Mount has already been visible in the filesystem hierarchy. */ 4834 if (!is_anon_ns(mnt->mnt_ns)) 4835 return -EINVAL; 4836 4837 return 0; 4838 } 4839 4840 /** 4841 * mnt_allow_writers() - check whether the attribute change allows writers 4842 * @kattr: the new mount attributes 4843 * @mnt: the mount to which @kattr will be applied 4844 * 4845 * Check whether thew new mount attributes in @kattr allow concurrent writers. 
4846 * 4847 * Return: true if writers need to be held, false if not 4848 */ 4849 static inline bool mnt_allow_writers(const struct mount_kattr *kattr, 4850 const struct mount *mnt) 4851 { 4852 return (!(kattr->attr_set & MNT_READONLY) || 4853 (mnt->mnt.mnt_flags & MNT_READONLY)) && 4854 !kattr->mnt_idmap; 4855 } 4856 4857 static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) 4858 { 4859 struct mount *m; 4860 int err; 4861 4862 for (m = mnt; m; m = next_mnt(m, mnt)) { 4863 if (!can_change_locked_flags(m, recalc_flags(kattr, m))) { 4864 err = -EPERM; 4865 break; 4866 } 4867 4868 err = can_idmap_mount(kattr, m); 4869 if (err) 4870 break; 4871 4872 if (!mnt_allow_writers(kattr, m)) { 4873 err = mnt_hold_writers(m); 4874 if (err) { 4875 m = next_mnt(m, mnt); 4876 break; 4877 } 4878 } 4879 4880 if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) 4881 return 0; 4882 } 4883 4884 if (err) { 4885 /* undo all mnt_hold_writers() we'd done */ 4886 for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt)) 4887 mnt_unhold_writers(p); 4888 } 4889 return err; 4890 } 4891 4892 static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) 4893 { 4894 struct mnt_idmap *old_idmap; 4895 4896 if (!kattr->mnt_idmap) 4897 return; 4898 4899 old_idmap = mnt_idmap(&mnt->mnt); 4900 4901 /* Pairs with smp_load_acquire() in mnt_idmap(). */ 4902 smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); 4903 mnt_idmap_put(old_idmap); 4904 } 4905 4906 static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) 4907 { 4908 struct mount *m; 4909 4910 for (m = mnt; m; m = next_mnt(m, mnt)) { 4911 unsigned int flags; 4912 4913 do_idmap_mount(kattr, m); 4914 flags = recalc_flags(kattr, m); 4915 WRITE_ONCE(m->mnt.mnt_flags, flags); 4916 4917 /* If we had to hold writers unblock them. 
/*
 * Apply the attribute change described by @kattr to the mount at @path.
 *
 * @path must refer to the root of a mount (path_mounted()), otherwise
 * -EINVAL is returned.  If @kattr carries a user namespace, an idmap is
 * allocated from it and stored in @kattr->mnt_idmap; this function does not
 * release it.
 *
 * Returns 0 on success or a negative errno.
 */
static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr)
{
	struct mount *mnt = real_mount(path->mnt);
	int err = 0;

	if (!path_mounted(path))
		return -EINVAL;

	if (kattr->mnt_userns) {
		struct mnt_idmap *mnt_idmap;

		mnt_idmap = alloc_mnt_idmap(kattr->mnt_userns);
		if (IS_ERR(mnt_idmap))
			return PTR_ERR(mnt_idmap);
		kattr->mnt_idmap = mnt_idmap;
	}

	if (kattr->propagation) {
		/*
		 * Only take namespace_lock() if we're actually changing
		 * propagation.
		 */
		namespace_lock();
		if (kattr->propagation == MS_SHARED) {
			/* group ids are set up before the mount hash lock is taken */
			err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
			if (err) {
				namespace_unlock();
				return err;
			}
		}
	}

	/* Default result if the mount fails the check below */
	err = -EINVAL;
	lock_mount_hash();

	/* Accept the root of an anonymous namespace or a mount passing check_mnt() */
	if (!anon_ns_root(mnt) && !check_mnt(mnt))
		goto out;

	/*
	 * First, we get the mount tree in a shape where we can change mount
	 * properties without failure. If we succeeded to do so we commit all
	 * changes and if we failed we clean up.
	 */
	err = mount_setattr_prepare(kattr, mnt);
	if (!err)
		mount_setattr_commit(kattr, mnt);

out:
	unlock_mount_hash();

	/* Mirror of the conditional namespace_lock() above */
	if (kattr->propagation) {
		if (err)
			cleanup_group_ids(mnt, NULL);
		namespace_unlock();
	}

	return err;
}
*/ 5041 if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) 5042 return -EPERM; 5043 5044 kattr->mnt_userns = get_user_ns(mnt_userns); 5045 return 0; 5046 } 5047 5048 static int build_mount_kattr(const struct mount_attr *attr, size_t usize, 5049 struct mount_kattr *kattr) 5050 { 5051 if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) 5052 return -EINVAL; 5053 if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) 5054 return -EINVAL; 5055 kattr->propagation = attr->propagation; 5056 5057 if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) 5058 return -EINVAL; 5059 5060 kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); 5061 kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); 5062 5063 /* 5064 * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, 5065 * users wanting to transition to a different atime setting cannot 5066 * simply specify the atime setting in @attr_set, but must also 5067 * specify MOUNT_ATTR__ATIME in the @attr_clr field. 5068 * So ensure that MOUNT_ATTR__ATIME can't be partially set in 5069 * @attr_clr and that @attr_set can't have any atime bits set if 5070 * MOUNT_ATTR__ATIME isn't set in @attr_clr. 5071 */ 5072 if (attr->attr_clr & MOUNT_ATTR__ATIME) { 5073 if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) 5074 return -EINVAL; 5075 5076 /* 5077 * Clear all previous time settings as they are mutually 5078 * exclusive. 
5079 */ 5080 kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; 5081 switch (attr->attr_set & MOUNT_ATTR__ATIME) { 5082 case MOUNT_ATTR_RELATIME: 5083 kattr->attr_set |= MNT_RELATIME; 5084 break; 5085 case MOUNT_ATTR_NOATIME: 5086 kattr->attr_set |= MNT_NOATIME; 5087 break; 5088 case MOUNT_ATTR_STRICTATIME: 5089 break; 5090 default: 5091 return -EINVAL; 5092 } 5093 } else { 5094 if (attr->attr_set & MOUNT_ATTR__ATIME) 5095 return -EINVAL; 5096 } 5097 5098 return build_mount_idmapped(attr, usize, kattr); 5099 } 5100 5101 static void finish_mount_kattr(struct mount_kattr *kattr) 5102 { 5103 if (kattr->mnt_userns) { 5104 put_user_ns(kattr->mnt_userns); 5105 kattr->mnt_userns = NULL; 5106 } 5107 5108 if (kattr->mnt_idmap) 5109 mnt_idmap_put(kattr->mnt_idmap); 5110 } 5111 5112 static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, 5113 struct mount_kattr *kattr) 5114 { 5115 int ret; 5116 struct mount_attr attr; 5117 5118 BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); 5119 5120 if (unlikely(usize > PAGE_SIZE)) 5121 return -E2BIG; 5122 if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) 5123 return -EINVAL; 5124 5125 if (!may_mount()) 5126 return -EPERM; 5127 5128 ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); 5129 if (ret) 5130 return ret; 5131 5132 /* Don't bother walking through the mounts if this is a nop. */ 5133 if (attr.attr_set == 0 && 5134 attr.attr_clr == 0 && 5135 attr.propagation == 0) 5136 return 0; /* Tell caller to not bother. 
*/ 5137 5138 ret = build_mount_kattr(&attr, usize, kattr); 5139 if (ret < 0) 5140 return ret; 5141 5142 return 1; 5143 } 5144 5145 SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, 5146 unsigned int, flags, struct mount_attr __user *, uattr, 5147 size_t, usize) 5148 { 5149 int err; 5150 struct path target; 5151 struct mount_kattr kattr; 5152 unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; 5153 5154 if (flags & ~(AT_EMPTY_PATH | 5155 AT_RECURSIVE | 5156 AT_SYMLINK_NOFOLLOW | 5157 AT_NO_AUTOMOUNT)) 5158 return -EINVAL; 5159 5160 if (flags & AT_NO_AUTOMOUNT) 5161 lookup_flags &= ~LOOKUP_AUTOMOUNT; 5162 if (flags & AT_SYMLINK_NOFOLLOW) 5163 lookup_flags &= ~LOOKUP_FOLLOW; 5164 5165 kattr = (struct mount_kattr) { 5166 .lookup_flags = lookup_flags, 5167 }; 5168 5169 if (flags & AT_RECURSIVE) 5170 kattr.kflags |= MOUNT_KATTR_RECURSE; 5171 5172 err = wants_mount_setattr(uattr, usize, &kattr); 5173 if (err <= 0) 5174 return err; 5175 5176 CLASS(filename_uflags, name)(path, flags); 5177 err = filename_lookup(dfd, name, kattr.lookup_flags, &target, NULL); 5178 if (!err) { 5179 err = do_mount_setattr(&target, &kattr); 5180 path_put(&target); 5181 } 5182 finish_mount_kattr(&kattr); 5183 return err; 5184 } 5185 5186 SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, 5187 unsigned, flags, struct mount_attr __user *, uattr, 5188 size_t, usize) 5189 { 5190 if (!uattr && usize) 5191 return -EINVAL; 5192 5193 FD_PREPARE(fdf, flags, vfs_open_tree(dfd, filename, flags)); 5194 if (fdf.err) 5195 return fdf.err; 5196 5197 if (uattr) { 5198 struct mount_kattr kattr = {}; 5199 struct file *file = fd_prepare_file(fdf); 5200 int ret; 5201 5202 if (flags & OPEN_TREE_CLONE) 5203 kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; 5204 if (flags & AT_RECURSIVE) 5205 kattr.kflags |= MOUNT_KATTR_RECURSE; 5206 5207 ret = wants_mount_setattr(uattr, usize, &kattr); 5208 if (ret > 0) { 5209 ret = do_mount_setattr(&file->f_path, &kattr); 5210 
finish_mount_kattr(&kattr); 5211 } 5212 if (ret) 5213 return ret; 5214 } 5215 5216 return fd_publish(fdf); 5217 } 5218 5219 int show_path(struct seq_file *m, struct dentry *root) 5220 { 5221 if (root->d_sb->s_op->show_path) 5222 return root->d_sb->s_op->show_path(m, root); 5223 5224 seq_dentry(m, root, " \t\n\\"); 5225 return 0; 5226 } 5227 5228 static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns) 5229 { 5230 struct mount *mnt = mnt_find_id_at(ns, id); 5231 5232 if (!mnt || mnt->mnt_id_unique != id) 5233 return NULL; 5234 5235 return &mnt->mnt; 5236 } 5237 5238 struct kstatmount { 5239 struct statmount __user *buf; 5240 size_t bufsize; 5241 struct vfsmount *mnt; 5242 struct mnt_idmap *idmap; 5243 u64 mask; 5244 struct path root; 5245 struct seq_file seq; 5246 5247 /* Must be last --ends in a flexible-array member. */ 5248 struct statmount sm; 5249 }; 5250 5251 static u64 mnt_to_attr_flags(struct vfsmount *mnt) 5252 { 5253 unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags); 5254 u64 attr_flags = 0; 5255 5256 if (mnt_flags & MNT_READONLY) 5257 attr_flags |= MOUNT_ATTR_RDONLY; 5258 if (mnt_flags & MNT_NOSUID) 5259 attr_flags |= MOUNT_ATTR_NOSUID; 5260 if (mnt_flags & MNT_NODEV) 5261 attr_flags |= MOUNT_ATTR_NODEV; 5262 if (mnt_flags & MNT_NOEXEC) 5263 attr_flags |= MOUNT_ATTR_NOEXEC; 5264 if (mnt_flags & MNT_NODIRATIME) 5265 attr_flags |= MOUNT_ATTR_NODIRATIME; 5266 if (mnt_flags & MNT_NOSYMFOLLOW) 5267 attr_flags |= MOUNT_ATTR_NOSYMFOLLOW; 5268 5269 if (mnt_flags & MNT_NOATIME) 5270 attr_flags |= MOUNT_ATTR_NOATIME; 5271 else if (mnt_flags & MNT_RELATIME) 5272 attr_flags |= MOUNT_ATTR_RELATIME; 5273 else 5274 attr_flags |= MOUNT_ATTR_STRICTATIME; 5275 5276 if (is_idmapped_mnt(mnt)) 5277 attr_flags |= MOUNT_ATTR_IDMAP; 5278 5279 return attr_flags; 5280 } 5281 5282 static u64 mnt_to_propagation_flags(struct mount *m) 5283 { 5284 u64 propagation = 0; 5285 5286 if (IS_MNT_SHARED(m)) 5287 propagation |= MS_SHARED; 5288 if (IS_MNT_SLAVE(m)) 5289 
propagation |= MS_SLAVE; 5290 if (IS_MNT_UNBINDABLE(m)) 5291 propagation |= MS_UNBINDABLE; 5292 if (!propagation) 5293 propagation |= MS_PRIVATE; 5294 5295 return propagation; 5296 } 5297 5298 u64 vfsmount_to_propagation_flags(struct vfsmount *mnt) 5299 { 5300 return mnt_to_propagation_flags(real_mount(mnt)); 5301 } 5302 EXPORT_SYMBOL_GPL(vfsmount_to_propagation_flags); 5303 5304 static void statmount_sb_basic(struct kstatmount *s) 5305 { 5306 struct super_block *sb = s->mnt->mnt_sb; 5307 5308 s->sm.mask |= STATMOUNT_SB_BASIC; 5309 s->sm.sb_dev_major = MAJOR(sb->s_dev); 5310 s->sm.sb_dev_minor = MINOR(sb->s_dev); 5311 s->sm.sb_magic = sb->s_magic; 5312 s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME); 5313 } 5314 5315 static void statmount_mnt_basic(struct kstatmount *s) 5316 { 5317 struct mount *m = real_mount(s->mnt); 5318 5319 s->sm.mask |= STATMOUNT_MNT_BASIC; 5320 s->sm.mnt_id = m->mnt_id_unique; 5321 s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique; 5322 s->sm.mnt_id_old = m->mnt_id; 5323 s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id; 5324 s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt); 5325 s->sm.mnt_propagation = mnt_to_propagation_flags(m); 5326 s->sm.mnt_peer_group = m->mnt_group_id; 5327 s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0; 5328 } 5329 5330 static void statmount_propagate_from(struct kstatmount *s) 5331 { 5332 struct mount *m = real_mount(s->mnt); 5333 5334 s->sm.mask |= STATMOUNT_PROPAGATE_FROM; 5335 if (IS_MNT_SLAVE(m)) 5336 s->sm.propagate_from = get_dominating_id(m, ¤t->fs->root); 5337 } 5338 5339 static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq) 5340 { 5341 int ret; 5342 size_t start = seq->count; 5343 5344 ret = show_path(seq, s->mnt->mnt_root); 5345 if (ret) 5346 return ret; 5347 5348 if (unlikely(seq_has_overflowed(seq))) 5349 return -EAGAIN; 5350 5351 /* 5352 * Unescape the result. 
It would be better if supplied string was not 5353 * escaped in the first place, but that's a pretty invasive change. 5354 */ 5355 seq->buf[seq->count] = '\0'; 5356 seq->count = start; 5357 seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); 5358 return 0; 5359 } 5360 5361 static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq) 5362 { 5363 struct vfsmount *mnt = s->mnt; 5364 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 5365 int err; 5366 5367 err = seq_path_root(seq, &mnt_path, &s->root, ""); 5368 return err == SEQ_SKIP ? 0 : err; 5369 } 5370 5371 static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq) 5372 { 5373 struct super_block *sb = s->mnt->mnt_sb; 5374 5375 seq_puts(seq, sb->s_type->name); 5376 return 0; 5377 } 5378 5379 static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq) 5380 { 5381 struct super_block *sb = s->mnt->mnt_sb; 5382 5383 if (sb->s_subtype) 5384 seq_puts(seq, sb->s_subtype); 5385 } 5386 5387 static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) 5388 { 5389 struct super_block *sb = s->mnt->mnt_sb; 5390 struct mount *r = real_mount(s->mnt); 5391 5392 if (sb->s_op->show_devname) { 5393 size_t start = seq->count; 5394 int ret; 5395 5396 ret = sb->s_op->show_devname(seq, s->mnt->mnt_root); 5397 if (ret) 5398 return ret; 5399 5400 if (unlikely(seq_has_overflowed(seq))) 5401 return -EAGAIN; 5402 5403 /* Unescape the result */ 5404 seq->buf[seq->count] = '\0'; 5405 seq->count = start; 5406 seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); 5407 } else { 5408 seq_puts(seq, r->mnt_devname); 5409 } 5410 return 0; 5411 } 5412 5413 static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns) 5414 { 5415 s->sm.mask |= STATMOUNT_MNT_NS_ID; 5416 s->sm.mnt_ns_id = ns->ns.ns_id; 5417 } 5418 5419 static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq) 5420 { 5421 struct vfsmount 
*mnt = s->mnt; 5422 struct super_block *sb = mnt->mnt_sb; 5423 size_t start = seq->count; 5424 int err; 5425 5426 err = security_sb_show_options(seq, sb); 5427 if (err) 5428 return err; 5429 5430 if (sb->s_op->show_options) { 5431 err = sb->s_op->show_options(seq, mnt->mnt_root); 5432 if (err) 5433 return err; 5434 } 5435 5436 if (unlikely(seq_has_overflowed(seq))) 5437 return -EAGAIN; 5438 5439 if (seq->count == start) 5440 return 0; 5441 5442 /* skip leading comma */ 5443 memmove(seq->buf + start, seq->buf + start + 1, 5444 seq->count - start - 1); 5445 seq->count--; 5446 5447 return 0; 5448 } 5449 5450 static inline int statmount_opt_process(struct seq_file *seq, size_t start) 5451 { 5452 char *buf_end, *opt_end, *src, *dst; 5453 int count = 0; 5454 5455 if (unlikely(seq_has_overflowed(seq))) 5456 return -EAGAIN; 5457 5458 buf_end = seq->buf + seq->count; 5459 dst = seq->buf + start; 5460 src = dst + 1; /* skip initial comma */ 5461 5462 if (src >= buf_end) { 5463 seq->count = start; 5464 return 0; 5465 } 5466 5467 *buf_end = '\0'; 5468 for (; src < buf_end; src = opt_end + 1) { 5469 opt_end = strchrnul(src, ','); 5470 *opt_end = '\0'; 5471 dst += string_unescape(src, dst, 0, UNESCAPE_OCTAL) + 1; 5472 if (WARN_ON_ONCE(++count == INT_MAX)) 5473 return -EOVERFLOW; 5474 } 5475 seq->count = dst - 1 - seq->buf; 5476 return count; 5477 } 5478 5479 static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq) 5480 { 5481 struct vfsmount *mnt = s->mnt; 5482 struct super_block *sb = mnt->mnt_sb; 5483 size_t start = seq->count; 5484 int err; 5485 5486 if (!sb->s_op->show_options) 5487 return 0; 5488 5489 err = sb->s_op->show_options(seq, mnt->mnt_root); 5490 if (err) 5491 return err; 5492 5493 err = statmount_opt_process(seq, start); 5494 if (err < 0) 5495 return err; 5496 5497 s->sm.opt_num = err; 5498 return 0; 5499 } 5500 5501 static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) 5502 { 5503 struct vfsmount *mnt = s->mnt; 5504 
struct super_block *sb = mnt->mnt_sb; 5505 size_t start = seq->count; 5506 int err; 5507 5508 err = security_sb_show_options(seq, sb); 5509 if (err) 5510 return err; 5511 5512 err = statmount_opt_process(seq, start); 5513 if (err < 0) 5514 return err; 5515 5516 s->sm.opt_sec_num = err; 5517 return 0; 5518 } 5519 5520 static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) 5521 { 5522 int ret; 5523 5524 ret = statmount_mnt_idmap(s->idmap, seq, true); 5525 if (ret < 0) 5526 return ret; 5527 5528 s->sm.mnt_uidmap_num = ret; 5529 /* 5530 * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid 5531 * mappings. This allows userspace to distinguish between a 5532 * non-idmapped mount and an idmapped mount where none of the 5533 * individual mappings are valid in the caller's idmapping. 5534 */ 5535 if (is_valid_mnt_idmap(s->idmap)) 5536 s->sm.mask |= STATMOUNT_MNT_UIDMAP; 5537 return 0; 5538 } 5539 5540 static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) 5541 { 5542 int ret; 5543 5544 ret = statmount_mnt_idmap(s->idmap, seq, false); 5545 if (ret < 0) 5546 return ret; 5547 5548 s->sm.mnt_gidmap_num = ret; 5549 /* 5550 * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid 5551 * mappings. This allows userspace to distinguish between a 5552 * non-idmapped mount and an idmapped mount where none of the 5553 * individual mappings are valid in the caller's idmapping. 
5554 */ 5555 if (is_valid_mnt_idmap(s->idmap)) 5556 s->sm.mask |= STATMOUNT_MNT_GIDMAP; 5557 return 0; 5558 } 5559 5560 static int statmount_string(struct kstatmount *s, u64 flag) 5561 { 5562 int ret = 0; 5563 size_t kbufsize; 5564 struct seq_file *seq = &s->seq; 5565 struct statmount *sm = &s->sm; 5566 u32 start, *offp; 5567 5568 /* Reserve an empty string at the beginning for any unset offsets */ 5569 if (!seq->count) 5570 seq_putc(seq, 0); 5571 5572 start = seq->count; 5573 5574 switch (flag) { 5575 case STATMOUNT_FS_TYPE: 5576 offp = &sm->fs_type; 5577 ret = statmount_fs_type(s, seq); 5578 break; 5579 case STATMOUNT_MNT_ROOT: 5580 offp = &sm->mnt_root; 5581 ret = statmount_mnt_root(s, seq); 5582 break; 5583 case STATMOUNT_MNT_POINT: 5584 offp = &sm->mnt_point; 5585 ret = statmount_mnt_point(s, seq); 5586 break; 5587 case STATMOUNT_MNT_OPTS: 5588 offp = &sm->mnt_opts; 5589 ret = statmount_mnt_opts(s, seq); 5590 break; 5591 case STATMOUNT_OPT_ARRAY: 5592 offp = &sm->opt_array; 5593 ret = statmount_opt_array(s, seq); 5594 break; 5595 case STATMOUNT_OPT_SEC_ARRAY: 5596 offp = &sm->opt_sec_array; 5597 ret = statmount_opt_sec_array(s, seq); 5598 break; 5599 case STATMOUNT_FS_SUBTYPE: 5600 offp = &sm->fs_subtype; 5601 statmount_fs_subtype(s, seq); 5602 break; 5603 case STATMOUNT_SB_SOURCE: 5604 offp = &sm->sb_source; 5605 ret = statmount_sb_source(s, seq); 5606 break; 5607 case STATMOUNT_MNT_UIDMAP: 5608 offp = &sm->mnt_uidmap; 5609 ret = statmount_mnt_uidmap(s, seq); 5610 break; 5611 case STATMOUNT_MNT_GIDMAP: 5612 offp = &sm->mnt_gidmap; 5613 ret = statmount_mnt_gidmap(s, seq); 5614 break; 5615 default: 5616 WARN_ON_ONCE(true); 5617 return -EINVAL; 5618 } 5619 5620 /* 5621 * If nothing was emitted, return to avoid setting the flag 5622 * and terminating the buffer. 
5623 */ 5624 if (seq->count == start) 5625 return ret; 5626 if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize))) 5627 return -EOVERFLOW; 5628 if (kbufsize >= s->bufsize) 5629 return -EOVERFLOW; 5630 5631 /* signal a retry */ 5632 if (unlikely(seq_has_overflowed(seq))) 5633 return -EAGAIN; 5634 5635 if (ret) 5636 return ret; 5637 5638 seq->buf[seq->count++] = '\0'; 5639 sm->mask |= flag; 5640 *offp = start; 5641 return 0; 5642 } 5643 5644 static int copy_statmount_to_user(struct kstatmount *s) 5645 { 5646 struct statmount *sm = &s->sm; 5647 struct seq_file *seq = &s->seq; 5648 char __user *str = ((char __user *)s->buf) + sizeof(*sm); 5649 size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm)); 5650 5651 if (seq->count && copy_to_user(str, seq->buf, seq->count)) 5652 return -EFAULT; 5653 5654 /* Return the number of bytes copied to the buffer */ 5655 sm->size = copysize + seq->count; 5656 if (copy_to_user(s->buf, sm, copysize)) 5657 return -EFAULT; 5658 5659 return 0; 5660 } 5661 5662 static struct mount *listmnt_next(struct mount *curr, bool reverse) 5663 { 5664 struct rb_node *node; 5665 5666 if (reverse) 5667 node = rb_prev(&curr->mnt_node); 5668 else 5669 node = rb_next(&curr->mnt_node); 5670 5671 return node_to_mount(node); 5672 } 5673 5674 static int grab_requested_root(struct mnt_namespace *ns, struct path *root) 5675 { 5676 struct mount *first, *child; 5677 5678 rwsem_assert_held(&namespace_sem); 5679 5680 /* We're looking at our own ns, just use get_fs_root. */ 5681 if (ns == current->nsproxy->mnt_ns) { 5682 get_fs_root(current->fs, root); 5683 return 0; 5684 } 5685 5686 /* 5687 * We have to find the first mount in our ns and use that, however it 5688 * may not exist, so handle that properly. 
5689 */ 5690 if (mnt_ns_empty(ns)) 5691 return -ENOENT; 5692 5693 first = ns->root; 5694 for (child = node_to_mount(ns->mnt_first_node); child; 5695 child = listmnt_next(child, false)) { 5696 if (child != first && child->mnt_parent == first) 5697 break; 5698 } 5699 if (!child) 5700 return -ENOENT; 5701 5702 root->mnt = mntget(&child->mnt); 5703 root->dentry = dget(root->mnt->mnt_root); 5704 return 0; 5705 } 5706 5707 /* This must be updated whenever a new flag is added */ 5708 #define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ 5709 STATMOUNT_MNT_BASIC | \ 5710 STATMOUNT_PROPAGATE_FROM | \ 5711 STATMOUNT_MNT_ROOT | \ 5712 STATMOUNT_MNT_POINT | \ 5713 STATMOUNT_FS_TYPE | \ 5714 STATMOUNT_MNT_NS_ID | \ 5715 STATMOUNT_MNT_OPTS | \ 5716 STATMOUNT_FS_SUBTYPE | \ 5717 STATMOUNT_SB_SOURCE | \ 5718 STATMOUNT_OPT_ARRAY | \ 5719 STATMOUNT_OPT_SEC_ARRAY | \ 5720 STATMOUNT_SUPPORTED_MASK | \ 5721 STATMOUNT_MNT_UIDMAP | \ 5722 STATMOUNT_MNT_GIDMAP) 5723 5724 /* locks: namespace_shared */ 5725 static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, 5726 struct file *mnt_file, struct mnt_namespace *ns) 5727 { 5728 int err; 5729 5730 if (mnt_file) { 5731 WARN_ON_ONCE(ns != NULL); 5732 5733 s->mnt = mnt_file->f_path.mnt; 5734 ns = real_mount(s->mnt)->mnt_ns; 5735 if (IS_ERR(ns)) 5736 return PTR_ERR(ns); 5737 if (!ns) 5738 /* 5739 * We can't set mount point and mnt_ns_id since we don't have a 5740 * ns for the mount. This can happen if the mount is unmounted 5741 * with MNT_DETACH. 5742 */ 5743 s->mask &= ~(STATMOUNT_MNT_POINT | STATMOUNT_MNT_NS_ID); 5744 } else { 5745 /* Has the namespace already been emptied? */ 5746 if (mnt_ns_id && mnt_ns_empty(ns)) 5747 return -ENOENT; 5748 5749 s->mnt = lookup_mnt_in_ns(mnt_id, ns); 5750 if (!s->mnt) 5751 return -ENOENT; 5752 } 5753 5754 if (ns) { 5755 err = grab_requested_root(ns, &s->root); 5756 if (err) 5757 return err; 5758 5759 if (!mnt_file) { 5760 struct mount *m; 5761 /* 5762 * Don't trigger audit denials. 
We just want to determine what 5763 * mounts to show users. 5764 */ 5765 m = real_mount(s->mnt); 5766 if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && 5767 !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) 5768 return -EPERM; 5769 } 5770 } 5771 5772 err = security_sb_statfs(s->mnt->mnt_root); 5773 if (err) 5774 return err; 5775 5776 /* 5777 * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap 5778 * can change concurrently as we only hold the read-side of the 5779 * namespace semaphore and mount properties may change with only 5780 * the mount lock held. 5781 * 5782 * We could sample the mount lock sequence counter to detect 5783 * those changes and retry. But it's not worth it. Worst that 5784 * happens is that the mnt->mnt_idmap pointer is already changed 5785 * while mnt->mnt_flags isn't or vica versa. So what. 5786 * 5787 * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved 5788 * via READ_ONCE()/WRITE_ONCE() and guard against theoretical 5789 * torn read/write. That's all we care about right now. 
5790 */ 5791 s->idmap = mnt_idmap(s->mnt); 5792 if (s->mask & STATMOUNT_MNT_BASIC) 5793 statmount_mnt_basic(s); 5794 5795 if (s->mask & STATMOUNT_SB_BASIC) 5796 statmount_sb_basic(s); 5797 5798 if (s->mask & STATMOUNT_PROPAGATE_FROM) 5799 statmount_propagate_from(s); 5800 5801 if (s->mask & STATMOUNT_FS_TYPE) 5802 err = statmount_string(s, STATMOUNT_FS_TYPE); 5803 5804 if (!err && s->mask & STATMOUNT_MNT_ROOT) 5805 err = statmount_string(s, STATMOUNT_MNT_ROOT); 5806 5807 if (!err && s->mask & STATMOUNT_MNT_POINT) 5808 err = statmount_string(s, STATMOUNT_MNT_POINT); 5809 5810 if (!err && s->mask & STATMOUNT_MNT_OPTS) 5811 err = statmount_string(s, STATMOUNT_MNT_OPTS); 5812 5813 if (!err && s->mask & STATMOUNT_OPT_ARRAY) 5814 err = statmount_string(s, STATMOUNT_OPT_ARRAY); 5815 5816 if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY) 5817 err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY); 5818 5819 if (!err && s->mask & STATMOUNT_FS_SUBTYPE) 5820 err = statmount_string(s, STATMOUNT_FS_SUBTYPE); 5821 5822 if (!err && s->mask & STATMOUNT_SB_SOURCE) 5823 err = statmount_string(s, STATMOUNT_SB_SOURCE); 5824 5825 if (!err && s->mask & STATMOUNT_MNT_UIDMAP) 5826 err = statmount_string(s, STATMOUNT_MNT_UIDMAP); 5827 5828 if (!err && s->mask & STATMOUNT_MNT_GIDMAP) 5829 err = statmount_string(s, STATMOUNT_MNT_GIDMAP); 5830 5831 if (!err && s->mask & STATMOUNT_MNT_NS_ID) 5832 statmount_mnt_ns_id(s, ns); 5833 5834 if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { 5835 s->sm.mask |= STATMOUNT_SUPPORTED_MASK; 5836 s->sm.supported_mask = STATMOUNT_SUPPORTED; 5837 } 5838 5839 if (err) 5840 return err; 5841 5842 /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? 
*/ 5843 WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); 5844 5845 return 0; 5846 } 5847 5848 static inline bool retry_statmount(const long ret, size_t *seq_size) 5849 { 5850 if (likely(ret != -EAGAIN)) 5851 return false; 5852 if (unlikely(check_mul_overflow(*seq_size, 2, seq_size))) 5853 return false; 5854 if (unlikely(*seq_size > MAX_RW_COUNT)) 5855 return false; 5856 return true; 5857 } 5858 5859 #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ 5860 STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ 5861 STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ 5862 STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ 5863 STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) 5864 5865 static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, 5866 struct statmount __user *buf, size_t bufsize, 5867 size_t seq_size) 5868 { 5869 if (!access_ok(buf, bufsize)) 5870 return -EFAULT; 5871 5872 memset(ks, 0, sizeof(*ks)); 5873 ks->mask = kreq->param; 5874 ks->buf = buf; 5875 ks->bufsize = bufsize; 5876 5877 if (ks->mask & STATMOUNT_STRING_REQ) { 5878 if (bufsize == sizeof(ks->sm)) 5879 return -EOVERFLOW; 5880 5881 ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT); 5882 if (!ks->seq.buf) 5883 return -ENOMEM; 5884 5885 ks->seq.size = seq_size; 5886 } 5887 5888 return 0; 5889 } 5890 5891 static int copy_mnt_id_req(const struct mnt_id_req __user *req, 5892 struct mnt_id_req *kreq, unsigned int flags) 5893 { 5894 int ret; 5895 size_t usize; 5896 5897 BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER1); 5898 5899 ret = get_user(usize, &req->size); 5900 if (ret) 5901 return -EFAULT; 5902 if (unlikely(usize > PAGE_SIZE)) 5903 return -E2BIG; 5904 if (unlikely(usize < MNT_ID_REQ_SIZE_VER0)) 5905 return -EINVAL; 5906 memset(kreq, 0, sizeof(*kreq)); 5907 ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); 5908 if (ret) 5909 return ret; 5910 5911 if (flags & STATMOUNT_BY_FD) { 5912 if (kreq->mnt_id || kreq->mnt_ns_id) 5913 return -EINVAL; 5914 
} else { 5915 if (kreq->mnt_ns_fd != 0 && kreq->mnt_ns_id) 5916 return -EINVAL; 5917 /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ 5918 if (kreq->mnt_id <= MNT_UNIQUE_ID_OFFSET) 5919 return -EINVAL; 5920 } 5921 return 0; 5922 } 5923 5924 /* 5925 * If the user requested a specific mount namespace id, look that up and return 5926 * that, or if not simply grab a passive reference on our mount namespace and 5927 * return that. 5928 */ 5929 static struct mnt_namespace *grab_requested_mnt_ns(const struct mnt_id_req *kreq) 5930 { 5931 struct mnt_namespace *mnt_ns; 5932 5933 if (kreq->mnt_ns_id) { 5934 mnt_ns = lookup_mnt_ns(kreq->mnt_ns_id); 5935 if (!mnt_ns) 5936 return ERR_PTR(-ENOENT); 5937 } else if (kreq->mnt_ns_fd) { 5938 struct ns_common *ns; 5939 5940 CLASS(fd, f)(kreq->mnt_ns_fd); 5941 if (fd_empty(f)) 5942 return ERR_PTR(-EBADF); 5943 5944 if (!proc_ns_file(fd_file(f))) 5945 return ERR_PTR(-EINVAL); 5946 5947 ns = get_proc_ns(file_inode(fd_file(f))); 5948 if (ns->ns_type != CLONE_NEWNS) 5949 return ERR_PTR(-EINVAL); 5950 5951 mnt_ns = to_mnt_ns(ns); 5952 refcount_inc(&mnt_ns->passive); 5953 } else { 5954 mnt_ns = current->nsproxy->mnt_ns; 5955 refcount_inc(&mnt_ns->passive); 5956 } 5957 5958 return mnt_ns; 5959 } 5960 5961 SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req, 5962 struct statmount __user *, buf, size_t, bufsize, 5963 unsigned int, flags) 5964 { 5965 struct mnt_namespace *ns __free(mnt_ns_release) = NULL; 5966 struct kstatmount *ks __free(kfree) = NULL; 5967 struct file *mnt_file __free(fput) = NULL; 5968 struct mnt_id_req kreq; 5969 /* We currently support retrieval of 3 strings. 
*/ 5970 size_t seq_size = 3 * PATH_MAX; 5971 int ret; 5972 5973 if (flags & ~STATMOUNT_BY_FD) 5974 return -EINVAL; 5975 5976 ret = copy_mnt_id_req(req, &kreq, flags); 5977 if (ret) 5978 return ret; 5979 5980 if (flags & STATMOUNT_BY_FD) { 5981 mnt_file = fget_raw(kreq.mnt_fd); 5982 if (!mnt_file) 5983 return -EBADF; 5984 /* do_statmount sets ns in case of STATMOUNT_BY_FD */ 5985 } else { 5986 ns = grab_requested_mnt_ns(&kreq); 5987 if (IS_ERR(ns)) 5988 return PTR_ERR(ns); 5989 5990 if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && 5991 !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) 5992 return -EPERM; 5993 } 5994 5995 ks = kmalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT); 5996 if (!ks) 5997 return -ENOMEM; 5998 5999 retry: 6000 ret = prepare_kstatmount(ks, &kreq, buf, bufsize, seq_size); 6001 if (ret) 6002 return ret; 6003 6004 scoped_guard(namespace_shared) 6005 ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, mnt_file, ns); 6006 6007 if (!ret) 6008 ret = copy_statmount_to_user(ks); 6009 kvfree(ks->seq.buf); 6010 path_put(&ks->root); 6011 if (retry_statmount(ret, &seq_size)) 6012 goto retry; 6013 return ret; 6014 } 6015 6016 struct klistmount { 6017 u64 last_mnt_id; 6018 u64 mnt_parent_id; 6019 u64 *kmnt_ids; 6020 u32 nr_mnt_ids; 6021 struct mnt_namespace *ns; 6022 struct path root; 6023 }; 6024 6025 /* locks: namespace_shared */ 6026 static ssize_t do_listmount(struct klistmount *kls, bool reverse) 6027 { 6028 struct mnt_namespace *ns = kls->ns; 6029 u64 mnt_parent_id = kls->mnt_parent_id; 6030 u64 last_mnt_id = kls->last_mnt_id; 6031 u64 *mnt_ids = kls->kmnt_ids; 6032 size_t nr_mnt_ids = kls->nr_mnt_ids; 6033 struct path orig; 6034 struct mount *r, *first; 6035 ssize_t ret; 6036 6037 rwsem_assert_held(&namespace_sem); 6038 6039 ret = grab_requested_root(ns, &kls->root); 6040 if (ret) 6041 return ret; 6042 6043 if (mnt_parent_id == LSMT_ROOT) { 6044 orig = kls->root; 6045 } else { 6046 orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); 6047 if (!orig.mnt) 6048 
return -ENOENT; 6049 orig.dentry = orig.mnt->mnt_root; 6050 } 6051 6052 /* 6053 * Don't trigger audit denials. We just want to determine what 6054 * mounts to show users. 6055 */ 6056 if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && 6057 !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) 6058 return -EPERM; 6059 6060 ret = security_sb_statfs(orig.dentry); 6061 if (ret) 6062 return ret; 6063 6064 if (!last_mnt_id) { 6065 if (reverse) 6066 first = node_to_mount(ns->mnt_last_node); 6067 else 6068 first = node_to_mount(ns->mnt_first_node); 6069 } else { 6070 if (reverse) 6071 first = mnt_find_id_at_reverse(ns, last_mnt_id - 1); 6072 else 6073 first = mnt_find_id_at(ns, last_mnt_id + 1); 6074 } 6075 6076 for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r, reverse)) { 6077 if (r->mnt_id_unique == mnt_parent_id) 6078 continue; 6079 if (!is_path_reachable(r, r->mnt.mnt_root, &orig)) 6080 continue; 6081 *mnt_ids = r->mnt_id_unique; 6082 mnt_ids++; 6083 nr_mnt_ids--; 6084 ret++; 6085 } 6086 return ret; 6087 } 6088 6089 static void __free_klistmount_free(const struct klistmount *kls) 6090 { 6091 path_put(&kls->root); 6092 kvfree(kls->kmnt_ids); 6093 mnt_ns_release(kls->ns); 6094 } 6095 6096 static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, 6097 size_t nr_mnt_ids) 6098 { 6099 u64 last_mnt_id = kreq->param; 6100 struct mnt_namespace *ns; 6101 6102 /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. 
*/ 6103 if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) 6104 return -EINVAL; 6105 6106 kls->last_mnt_id = last_mnt_id; 6107 6108 kls->nr_mnt_ids = nr_mnt_ids; 6109 kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), 6110 GFP_KERNEL_ACCOUNT); 6111 if (!kls->kmnt_ids) 6112 return -ENOMEM; 6113 6114 ns = grab_requested_mnt_ns(kreq); 6115 if (IS_ERR(ns)) 6116 return PTR_ERR(ns); 6117 kls->ns = ns; 6118 6119 kls->mnt_parent_id = kreq->mnt_id; 6120 return 0; 6121 } 6122 6123 SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, 6124 u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) 6125 { 6126 struct klistmount kls __free(klistmount_free) = {}; 6127 const size_t maxcount = 1000000; 6128 struct mnt_id_req kreq; 6129 ssize_t ret; 6130 6131 if (flags & ~LISTMOUNT_REVERSE) 6132 return -EINVAL; 6133 6134 /* 6135 * If the mount namespace really has more than 1 million mounts the 6136 * caller must iterate over the mount namespace (and reconsider their 6137 * system design...). 6138 */ 6139 if (unlikely(nr_mnt_ids > maxcount)) 6140 return -EOVERFLOW; 6141 6142 if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids))) 6143 return -EFAULT; 6144 6145 ret = copy_mnt_id_req(req, &kreq, 0); 6146 if (ret) 6147 return ret; 6148 6149 ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); 6150 if (ret) 6151 return ret; 6152 6153 if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && 6154 !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) 6155 return -ENOENT; 6156 6157 /* 6158 * We only need to guard against mount topology changes as 6159 * listmount() doesn't care about any mount properties. 
6160 */ 6161 scoped_guard(namespace_shared) 6162 ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); 6163 if (ret <= 0) 6164 return ret; 6165 6166 if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) 6167 return -EFAULT; 6168 6169 return ret; 6170 } 6171 6172 struct mnt_namespace init_mnt_ns = { 6173 .ns = NS_COMMON_INIT(init_mnt_ns), 6174 .user_ns = &init_user_ns, 6175 .passive = REFCOUNT_INIT(1), 6176 .mounts = RB_ROOT, 6177 .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll), 6178 }; 6179 6180 static void __init init_mount_tree(void) 6181 { 6182 struct vfsmount *mnt, *nullfs_mnt; 6183 struct mount *mnt_root; 6184 struct path root; 6185 6186 /* 6187 * We create two mounts: 6188 * 6189 * (1) nullfs with mount id 1 6190 * (2) mutable rootfs with mount id 2 6191 * 6192 * with (2) mounted on top of (1). 6193 */ 6194 nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); 6195 if (IS_ERR(nullfs_mnt)) 6196 panic("VFS: Failed to create nullfs"); 6197 6198 mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); 6199 if (IS_ERR(mnt)) 6200 panic("Can't create rootfs"); 6201 6202 VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); 6203 VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); 6204 6205 /* The namespace root is the nullfs mnt. */ 6206 mnt_root = real_mount(nullfs_mnt); 6207 init_mnt_ns.root = mnt_root; 6208 6209 /* Mount mutable rootfs on top of nullfs. */ 6210 root.mnt = nullfs_mnt; 6211 root.dentry = nullfs_mnt->mnt_root; 6212 6213 LOCK_MOUNT_EXACT(mp, &root); 6214 if (unlikely(IS_ERR(mp.parent))) 6215 panic("VFS: Failed to mount rootfs on nullfs"); 6216 scoped_guard(mount_writer) 6217 attach_mnt(real_mount(mnt), mp.parent, mp.mp); 6218 6219 pr_info("VFS: Finished mounting rootfs on nullfs\n"); 6220 6221 /* 6222 * We've dropped all locks here but that's fine. 
Not just are we 6223 * the only task that's running, there's no other mount 6224 * namespace in existence and the initial mount namespace is 6225 * completely empty until we add the mounts we just created. 6226 */ 6227 for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { 6228 mnt_add_to_ns(&init_mnt_ns, p); 6229 init_mnt_ns.nr_mounts++; 6230 } 6231 6232 init_task.nsproxy->mnt_ns = &init_mnt_ns; 6233 get_mnt_ns(&init_mnt_ns); 6234 6235 /* The root and pwd always point to the mutable rootfs. */ 6236 root.mnt = mnt; 6237 root.dentry = mnt->mnt_root; 6238 set_fs_pwd(current->fs, &root); 6239 set_fs_root(current->fs, &root); 6240 6241 ns_tree_add(&init_mnt_ns); 6242 } 6243 6244 void __init mnt_init(void) 6245 { 6246 int err; 6247 6248 mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 6249 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); 6250 6251 mount_hashtable = alloc_large_system_hash("Mount-cache", 6252 sizeof(struct hlist_head), 6253 mhash_entries, 19, 6254 HASH_ZERO, 6255 &m_hash_shift, &m_hash_mask, 0, 0); 6256 mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", 6257 sizeof(struct hlist_head), 6258 mphash_entries, 19, 6259 HASH_ZERO, 6260 &mp_hash_shift, &mp_hash_mask, 0, 0); 6261 6262 if (!mount_hashtable || !mountpoint_hashtable) 6263 panic("Failed to allocate mount hash table\n"); 6264 6265 kernfs_init(); 6266 6267 err = sysfs_init(); 6268 if (err) 6269 printk(KERN_WARNING "%s: sysfs_init error: %d\n", 6270 __func__, err); 6271 fs_kobj = kobject_create_and_add("fs", NULL); 6272 if (!fs_kobj) 6273 printk(KERN_WARNING "%s: kobj create error\n", __func__); 6274 shmem_init(); 6275 init_rootfs(); 6276 init_mount_tree(); 6277 } 6278 6279 void put_mnt_ns(struct mnt_namespace *ns) 6280 { 6281 if (!ns_ref_put(ns)) 6282 return; 6283 guard(namespace_excl)(); 6284 emptied_ns = ns; 6285 guard(mount_writer)(); 6286 umount_tree(ns->root, 0); 6287 } 6288 6289 struct vfsmount *kern_mount(struct file_system_type *type) 6290 { 6291 
struct vfsmount *mnt; 6292 mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); 6293 if (!IS_ERR(mnt)) { 6294 /* 6295 * it is a longterm mount, don't release mnt until 6296 * we unmount before file sys is unregistered 6297 */ 6298 real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; 6299 } 6300 return mnt; 6301 } 6302 EXPORT_SYMBOL_GPL(kern_mount); 6303 6304 void kern_unmount(struct vfsmount *mnt) 6305 { 6306 /* release long term mount so mount point can be released */ 6307 if (!IS_ERR(mnt)) { 6308 mnt_make_shortterm(mnt); 6309 synchronize_rcu(); /* yecchhh... */ 6310 mntput(mnt); 6311 } 6312 } 6313 EXPORT_SYMBOL(kern_unmount); 6314 6315 void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) 6316 { 6317 unsigned int i; 6318 6319 for (i = 0; i < num; i++) 6320 mnt_make_shortterm(mnt[i]); 6321 synchronize_rcu_expedited(); 6322 for (i = 0; i < num; i++) 6323 mntput(mnt[i]); 6324 } 6325 EXPORT_SYMBOL(kern_unmount_array); 6326 6327 bool our_mnt(struct vfsmount *mnt) 6328 { 6329 return check_mnt(real_mount(mnt)); 6330 } 6331 6332 bool current_chrooted(void) 6333 { 6334 /* Does the current process have a non-standard root */ 6335 struct path fs_root __free(path_put) = {}; 6336 struct mount *root; 6337 6338 get_fs_root(current->fs, &fs_root); 6339 6340 /* Find the namespace root */ 6341 6342 guard(mount_locked_reader)(); 6343 6344 root = topmost_overmount(current->nsproxy->mnt_ns->root); 6345 6346 return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); 6347 } 6348 6349 static bool mnt_already_visible(struct mnt_namespace *ns, 6350 const struct super_block *sb, 6351 int *new_mnt_flags) 6352 { 6353 int new_flags = *new_mnt_flags; 6354 struct mount *mnt; 6355 6356 /* Don't acquire namespace semaphore without a good reason. 
*/ 6357 if (hlist_empty(&ns->mnt_visible_mounts)) 6358 return false; 6359 6360 guard(namespace_shared)(); 6361 hlist_for_each_entry(mnt, &ns->mnt_visible_mounts, mnt_ns_visible) { 6362 const struct super_block *sb_visible = mnt->mnt.mnt_sb; 6363 struct mount *child; 6364 int mnt_flags; 6365 6366 if (sb_visible->s_type != sb->s_type) 6367 continue; 6368 6369 /* 6370 * Restricted variants are not compatible with anything, even 6371 * other restricted variants. 6372 */ 6373 if (sb_visible->s_iflags & SB_I_RESTRICTED_VARIANT) 6374 continue; 6375 6376 /* A local view of the mount flags */ 6377 mnt_flags = mnt->mnt.mnt_flags; 6378 6379 /* Don't miss readonly hidden in the superblock flags */ 6380 if (sb_rdonly(mnt->mnt.mnt_sb)) 6381 mnt_flags |= MNT_LOCK_READONLY; 6382 6383 /* Verify the mount flags are equal to or more permissive 6384 * than the proposed new mount. 6385 */ 6386 if ((mnt_flags & MNT_LOCK_READONLY) && 6387 !(new_flags & MNT_READONLY)) 6388 continue; 6389 if ((mnt_flags & MNT_LOCK_ATIME) && 6390 ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) 6391 continue; 6392 6393 /* This mount is not fully visible if there are any 6394 * locked child mounts that cover anything except for 6395 * empty directories. 6396 */ 6397 list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { 6398 struct inode *inode = child->mnt_mountpoint->d_inode; 6399 /* Only worry about locked mounts */ 6400 if (!(child->mnt.mnt_flags & MNT_LOCKED)) 6401 continue; 6402 /* Is the directory permanently empty? 
*/ 6403 if (!is_empty_dir_inode(inode)) 6404 goto next; 6405 } 6406 /* Preserve the locked attributes */ 6407 *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ 6408 MNT_LOCK_ATIME); 6409 return true; 6410 next: ; 6411 } 6412 return false; 6413 } 6414 6415 static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) 6416 { 6417 const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; 6418 struct mnt_namespace *ns = current->nsproxy->mnt_ns; 6419 unsigned long s_iflags; 6420 6421 if (ns->user_ns == &init_user_ns) 6422 return false; 6423 6424 /* Can this filesystem be too revealing? */ 6425 if (!(sb->s_type->fs_flags & FS_USERNS_MOUNT_RESTRICTED)) 6426 return false; 6427 6428 s_iflags = sb->s_iflags; 6429 if ((s_iflags & required_iflags) != required_iflags) { 6430 WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", 6431 required_iflags); 6432 return true; 6433 } 6434 6435 /* 6436 * Restricted variants don't need an already visible mount because they 6437 * don't expose the full filesystem view. 6438 */ 6439 if (s_iflags & SB_I_RESTRICTED_VARIANT) 6440 return false; 6441 6442 return !mnt_already_visible(ns, sb, new_mnt_flags); 6443 } 6444 6445 bool mnt_may_suid(struct vfsmount *mnt) 6446 { 6447 /* 6448 * Foreign mounts (accessed via fchdir or through /proc 6449 * symlinks) are always treated as if they are nosuid. This 6450 * prevents namespaces from trusting potentially unsafe 6451 * suid/sgid bits, file caps, or security labels that originate 6452 * in other namespaces. 
6453 */ 6454 return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && 6455 current_in_userns(mnt->mnt_sb->s_user_ns); 6456 } 6457 6458 static struct ns_common *mntns_get(struct task_struct *task) 6459 { 6460 struct ns_common *ns = NULL; 6461 struct nsproxy *nsproxy; 6462 6463 task_lock(task); 6464 nsproxy = task->nsproxy; 6465 if (nsproxy) { 6466 ns = &nsproxy->mnt_ns->ns; 6467 get_mnt_ns(to_mnt_ns(ns)); 6468 } 6469 task_unlock(task); 6470 6471 return ns; 6472 } 6473 6474 static void mntns_put(struct ns_common *ns) 6475 { 6476 put_mnt_ns(to_mnt_ns(ns)); 6477 } 6478 6479 static int mntns_install(struct nsset *nsset, struct ns_common *ns) 6480 { 6481 struct nsproxy *nsproxy = nsset->nsproxy; 6482 struct fs_struct *fs = nsset->fs; 6483 struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; 6484 struct user_namespace *user_ns = nsset->cred->user_ns; 6485 struct path root; 6486 int err; 6487 6488 if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || 6489 !ns_capable(user_ns, CAP_SYS_CHROOT) || 6490 !ns_capable(user_ns, CAP_SYS_ADMIN)) 6491 return -EPERM; 6492 6493 if (is_anon_ns(mnt_ns)) 6494 return -EINVAL; 6495 6496 if (fs->users != 1) 6497 return -EINVAL; 6498 6499 get_mnt_ns(mnt_ns); 6500 old_mnt_ns = nsproxy->mnt_ns; 6501 nsproxy->mnt_ns = mnt_ns; 6502 6503 /* Find the root */ 6504 err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, 6505 "/", LOOKUP_DOWN, &root); 6506 if (err) { 6507 /* revert to old namespace */ 6508 nsproxy->mnt_ns = old_mnt_ns; 6509 put_mnt_ns(mnt_ns); 6510 return err; 6511 } 6512 6513 put_mnt_ns(old_mnt_ns); 6514 6515 /* Update the pwd and root */ 6516 set_fs_pwd(fs, &root); 6517 set_fs_root(fs, &root); 6518 6519 path_put(&root); 6520 return 0; 6521 } 6522 6523 static struct user_namespace *mntns_owner(struct ns_common *ns) 6524 { 6525 return to_mnt_ns(ns)->user_ns; 6526 } 6527 6528 const struct proc_ns_operations mntns_operations = { 6529 .name = "mnt", 6530 .get = mntns_get, 6531 .put = mntns_put, 6532 
#ifdef CONFIG_SYSCTL
/*
 * fs.mount-max: upper bound on the number of mounts in a mount namespace.
 *
 * sysctl_mount_max is an unsigned int, so it is exposed through the
 * unsigned handler, and maxlen is derived from the variable itself so the
 * two cannot drift apart. extra1 keeps the lower bound at 1.
 */
static const struct ctl_table fs_namespace_sysctls[] = {
	{
		.procname	= "mount-max",
		.data		= &sysctl_mount_max,
		.maxlen		= sizeof(sysctl_mount_max),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= SYSCTL_ONE,
	},
};

/* Register the table above under "fs" during fs_initcall. */
static int __init init_fs_namespace_sysctls(void)
{
	register_sysctl_init("fs", fs_namespace_sysctls);
	return 0;
}
fs_initcall(init_fs_namespace_sysctls);

#endif /* CONFIG_SYSCTL */